-
Notifications
You must be signed in to change notification settings - Fork 2
/
UDA-SelfTraining-SC.sh
40 lines (31 loc) · 2.43 KB
/
UDA-SelfTraining-SC.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
#!/usr/bin/env bash
#
# Self-training pipeline with self-consistency (SC) filtering for the QG and IR
# models: generate synthetic data, filter it, retrain both models, evaluate.
#
# Usage: run from the directory that contains QG/, IR/ and data/.
# Environment:
#   PYTHON  optional; interpreter used for every step (default: python)
#
# Every stage consumes files produced by an earlier one, so the script aborts
# on the first failing command instead of continuing with stale or missing
# intermediate data.

set -euo pipefail

die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# All directory changes are anchored here, so a step never depends on the
# working directory left behind by the previous step.
readonly ROOT_DIR="${PWD}"
readonly PYTHON_BIN="${PYTHON:-python}"

[[ -d "${ROOT_DIR}/QG" && -d "${ROOT_DIR}/IR" ]] \
  || die "run this script from the directory containing QG/ and IR/"

# 1. QG model generates questions from 50k passages and prepares synthetic
#    data for self-training QG.
# NOTE(review): no output path is passed here; step 6 reads
# NQ-checkpoint/QG-predictions-50K.tsv, presumably written by this command -
# confirm.
cd "${ROOT_DIR}/QG" || die "cannot cd to ${ROOT_DIR}/QG"
"${PYTHON_BIN}" generate.py \
  --checkpoint NQ-checkpoint/ \
  --file ../data/passages_unaligned.tsv

# 2. IR model generates embeddings for 50k passages.
cd "${ROOT_DIR}/IR" || die "cannot cd to ${ROOT_DIR}/IR"
"${PYTHON_BIN}" generate_dense_embeddings.py \
  --model_file NQ-checkpoint/bert-base-encoder.cp \
  --ctx_file ../data/passages_unaligned.tsv \
  --out_file NQ-checkpoint/embeddings_50k

# 3. IR model retrieves passages from questions and prepares synthetic data
#    for self-training IR.
# NOTE(review): reads embeddings_50k_0.pkl while step 2 was given the prefix
# embeddings_50k; presumably the "_0.pkl" suffix is appended by step 2 -
# confirm.
"${PYTHON_BIN}" generate.py \
  --model_file NQ-checkpoint/bert-base-encoder.cp \
  --embeddings NQ-checkpoint/embeddings_50k_0.pkl \
  --out_file outputs/ST.tsv

# 4. Filter IR model data using IR model (self-consistency).
"${PYTHON_BIN}" consistency.py \
  --model_file NQ-checkpoint/bert-base-encoder.cp \
  --embeddings_file NQ-checkpoint/embeddings_50k_0.pkl \
  --input_file outputs/ST.tsv \
  --output_file outputs/ST-sc.tsv \
  --threshold 78.24

# 5. Generated data is converted to json format for training IR model.
"${PYTHON_BIN}" gen_dpr_data.py \
  --input_file outputs/ST-sc.tsv \
  --out_file outputs/ST.json

# 6. Filter QG model data using QG model (self-consistency).
cd "${ROOT_DIR}/QG" || die "cannot cd to ${ROOT_DIR}/QG"
"${PYTHON_BIN}" compute_loss.py \
  --file NQ-checkpoint/QG-predictions-50K.tsv \
  --checkpoint NQ-checkpoint/ \
  --save_to NQ-checkpoint/losses.txt
"${PYTHON_BIN}" consistency.py \
  --input_file NQ-checkpoint/QG-predictions-50K.tsv \
  --threshold_file NQ-checkpoint/losses.txt \
  --output_file NQ-checkpoint/QG-predictions-50K-sc.tsv \
  --threshold 1.19

# 7. Train QG model on synthetic self-training data (still inside QG/).
"${PYTHON_BIN}" train.py \
  --epochs 5 \
  --train_file NQ-checkpoint/QG-predictions-50K-sc.tsv \
  --checkpoint NQ-checkpoint/

# 8. Train IR model on synthetic self-training data.
cd "${ROOT_DIR}/IR" || die "cannot cd to ${ROOT_DIR}/IR"
"${PYTHON_BIN}" train_dense_encoder.py \
  --encoder_model_type hf_bert \
  --pretrained_model_cfg bert-base-uncased \
  --train_file outputs/ST.json \
  --num_train_epochs 6 \
  --model_file NQ-checkpoint/bert-base-encoder.cp \
  --output_dir outputs/ \
  --batch_size 32 \
  --dev_file ../data/dev.json

# 9. Evaluate QG model on test data.
# NOTE(review): step 7 is given --checkpoint NQ-checkpoint/ but evaluation
# reads outputs/; confirm that train.py saves the fine-tuned model there.
cd "${ROOT_DIR}/QG" || die "cannot cd to ${ROOT_DIR}/QG"
"${PYTHON_BIN}" eval.py \
  --checkpoint outputs/ \
  --eval_file ../data/test.tsv

# 10. Evaluate IR model on test data.
# NOTE(review): the checkpoint name dpr_biencoder.5.1106 is hard-coded;
# presumably it is the file step 8 writes into outputs/ for its last epoch -
# confirm after training.
# a. Generate embeddings of 11k test passages.
cd "${ROOT_DIR}/IR" || die "cannot cd to ${ROOT_DIR}/IR"
"${PYTHON_BIN}" generate_dense_embeddings.py \
  --model_file outputs/dpr_biencoder.5.1106 \
  --ctx_file ../data/test_passages.tsv \
  --out_file outputs/embeddings_11k

# b. Evaluate top-k retrieval accuracy on test data.
"${PYTHON_BIN}" eval_retriever.py \
  --model_file outputs/dpr_biencoder.5.1106 \
  --embeddings outputs/embeddings_11k_0.pkl \
  --eval_file ../data/test.tsv