Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Code/BertToken.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,8 @@ def main():
parser.add_argument("--max_seq_length", default=128, type=int, help="max seq length after tokenization")

args = parser.parse_args()
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
#device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
device = 'cpu'
args.device = device

# Set up logging
Expand Down
4 changes: 2 additions & 2 deletions Code/train_token.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,12 @@ else
OUT=$TASK
fi

python $PWD/Code/BertToken.py \
python3 $PWD/Code/BertToken.py \
--data_dir $DATA_DIR/$TASK \
--output_dir $OUT_DIR/$OUT \
--model_type $MODEL_TYPE \
--model_name $MODEL \
--num_train_epochs $EPOCH \
--train_batch_size $BATCH_SIZE \
--max_seq_length $MAX_SEQ \
--save_steps -1
--save_steps -1
1 change: 1 addition & 0 deletions Data/Original_Data/QA_EN_HI/code_mixed_qa_train.json

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions Data/Preprocess_Scripts/preprocess_pos_en_hi_ud.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@ def scrape_tweets(original_path):
outfile.write(i)

#scraping tweets
call(shlex.split('python crawl_tweets_copy.py -i tweet_ids_train.txt -a train-annot.json -o tweets_train.conll'))
call(shlex.split('python crawl_tweets_copy.py -i tweet_ids_dev.txt -a dev-annot.json -o tweets_dev.conll'))
call(shlex.split('python crawl_tweets_copy.py -i tweet_ids_test.txt -a test-annot.json -o tweets_test.conll'))
call(shlex.split('python3 crawl_tweets_copy.py -i tweet_ids_train.txt -a train-annot.json -o tweets_train.conll'))
call(shlex.split('python3 crawl_tweets_copy.py -i tweet_ids_dev.txt -a dev-annot.json -o tweets_dev.conll'))
call(shlex.split('python3 crawl_tweets_copy.py -i tweet_ids_test.txt -a test-annot.json -o tweets_test.conll'))

def make_files(original_path,new_path):

Expand Down Expand Up @@ -176,4 +176,4 @@ def main():
open(new_path+'Devanagari/all.txt', 'a').writelines([l for l in open(new_path+'Devanagari/validation.txt').readlines() ])

if __name__=="__main__":
main()
main()
12 changes: 6 additions & 6 deletions Data/Preprocess_Scripts/preprocess_qa.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,17 @@ PART1=`dirname "$INP_FILE"`
PART2=`basename "$INP_FILE"`

#preprocess for DrQA
python $PREPROCESS_DIR/preprocess_drqa.py --data_dir $ORIGINAL_DATA_DIR
python3 $PREPROCESS_DIR/preprocess_drqa.py --data_dir $ORIGINAL_DATA_DIR

#run DrQA
git clone https://github.com/facebookresearch/DrQA.git
cd DrQA
git checkout 96f343c
pip install elasticsearch==7.8.0 nltk==3.5 scipy==1.5.0 prettytable==0.7.2 tqdm==4.46.1 regex==2020.6.8 termcolor==1.1.0 scikit-learn==0.23.1 numpy==1.18.5 torch==1.4.0
python setup.py develop
python3 setup.py develop
pip install spacy==2.3.0
python -m spacy download xx_ent_wiki_sm
python -c "import nltk;nltk.download(['punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words'])"
python3 -m spacy download xx_ent_wiki_sm
python3 -c "import nltk;nltk.download(['punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words'])"
./download.sh
sed -i 's/np.load(filename)/np.load(filename, allow_pickle=True)/g' drqa/retriever/utils.py
sed -i 's/\[\x27tokenizer_class\x27\], {},/\[\x27tokenizer_class\x27\], {\x27model\x27: \x27xx_ent_wiki_sm\x27},/g' scripts/distant/generate.py
Expand All @@ -30,8 +30,8 @@ patch scripts/distant/generate.py <<EOF
263a264
> random.seed(0)
EOF
python scripts/distant/generate.py $PART1 $PART2 $PREPROCESS_DIR --tokenizer spacy --dev-split 0.2 --n-docs 1 --workers 1
python3 scripts/distant/generate.py $PART1 $PART2 $PREPROCESS_DIR --tokenizer spacy --dev-split 0.2 --n-docs 1 --workers 1

cd ./..
# Squad format processor
python $PREPROCESS_DIR/preprocess_qa_en_hi.py --output_dir $PROCESSED_DIR
python3 $PREPROCESS_DIR/preprocess_qa_en_hi.py --output_dir $PROCESSED_DIR
2 changes: 1 addition & 1 deletion Data/Preprocess_Scripts/preprocess_sent_en_es.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,4 +215,4 @@ def main():
os.unlink('sentiment_annotated.txt')

if __name__=='__main__':
main()
main()
Loading