forked from facebookresearch/flores
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathflores_move.py
73 lines (55 loc) · 6.94 KB
/
flores_move.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import glob
import os
import subprocess
# Stage the FLORES "dev" split for open-source release.
#
# Pass 1: for every expected language code, copy "<code>.dev" from the
#         flores200_final directory when a matching file exists there.
# Pass 2: for codes not found in pass 1, copy the matching file from the
#         flores101_beta directory instead.
#
# Copies are done with "cp -L" so that symbolic links are dereferenced and the
# destination receives regular files.

# Directories keep their trailing slash because they are also used as the
# literal prefix stripped from glob results below.
FLORES200_DEV_DIR = "/large_experiments/nllb/mmt/flores200_final/dev/"
FLORES101_DEV_DIR = "/large_experiments/nllb/mmt/flores101_beta/dev/"
OUTPUT_DEV_DIR = "/private/home/angelafan/flores_opensource_links/dev/"

# Expected language codes ("<lang>_<Script>"); order is preserved because it
# determines the order of the printed list of missing codes.
all_filenames = [
    "ace_Arab", "ace_Latn", "acm_Arab", "acq_Arab", "aeb_Arab", "afr_Latn",
    "ajp_Arab", "aka_Latn", "amh_Ethi", "apc_Arab", "arb_Arab", "arb_Latn",
    "ars_Arab", "ary_Arab", "arz_Arab", "asm_Beng", "ast_Latn", "awa_Deva",
    "ayr_Latn", "azb_Arab", "azj_Latn", "bak_Cyrl", "bam_Latn", "ban_Latn",
    "bel_Cyrl", "bem_Latn", "ben_Beng", "bho_Deva", "bjn_Arab", "bjn_Latn",
    "bod_Tibt", "bos_Latn", "bug_Latn", "bul_Cyrl", "cat_Latn", "ceb_Latn",
    "ces_Latn", "cjk_Latn", "ckb_Arab", "crh_Latn", "cym_Latn", "dan_Latn",
    "deu_Latn", "dik_Latn", "dyu_Latn", "dzo_Tibt", "ell_Grek", "eng_Latn",
    "epo_Latn", "est_Latn", "eus_Latn", "ewe_Latn", "fao_Latn", "pes_Arab",
    "fij_Latn", "fin_Latn", "fon_Latn", "fra_Latn", "fur_Latn", "fuv_Latn",
    "gla_Latn", "gle_Latn", "glg_Latn", "grn_Latn", "guj_Gujr", "hat_Latn",
    "hau_Latn", "heb_Hebr", "hin_Deva", "hne_Deva", "hrv_Latn", "hun_Latn",
    "hye_Armn", "ibo_Latn", "ilo_Latn", "ind_Latn", "isl_Latn", "ita_Latn",
    "jav_Latn", "jpn_Jpan", "kab_Latn", "kac_Latn", "kam_Latn", "kan_Knda",
    "kas_Arab", "kas_Deva", "kat_Geor", "knc_Arab", "knc_Latn", "kaz_Cyrl",
    "kbp_Latn", "kea_Latn", "khm_Khmr", "kik_Latn", "kin_Latn", "kir_Cyrl",
    "kmb_Latn", "kon_Latn", "kor_Hang", "kmr_Latn", "lao_Laoo", "lvs_Latn",
    "lij_Latn", "lim_Latn", "lin_Latn", "lit_Latn", "lmo_Latn", "ltg_Latn",
    "ltz_Latn", "lua_Latn", "lug_Latn", "luo_Latn", "lus_Latn", "mag_Deva",
    "mai_Deva", "mal_Mlym", "mar_Deva", "min_Arab", "min_Latn", "mkd_Cyrl",
    "plt_Latn", "mlt_Latn", "mni_Beng", "khk_Cyrl", "mos_Latn", "mri_Latn",
    "zsm_Latn", "mya_Mymr", "nld_Latn", "nno_Latn", "nob_Latn", "npi_Deva",
    "nso_Latn", "nus_Latn", "nya_Latn", "oci_Latn", "gaz_Latn", "ory_Orya",
    "pag_Latn", "pan_Guru", "pap_Latn", "pol_Latn", "por_Latn", "prs_Arab",
    "pbt_Arab", "quy_Latn", "ron_Latn", "run_Latn", "rus_Cyrl", "sag_Latn",
    "san_Deva", "sat_Beng", "scn_Latn", "shn_Mymr", "sin_Sinh", "slk_Latn",
    "slv_Latn", "smo_Latn", "sna_Latn", "snd_Arab", "som_Latn", "sot_Latn",
    "spa_Latn", "als_Latn", "srd_Latn", "srp_Cyrl", "ssw_Latn", "sun_Latn",
    "swe_Latn", "swh_Latn", "szl_Latn", "tam_Taml", "tat_Cyrl", "tel_Telu",
    "tgk_Cyrl", "tgl_Latn", "tha_Thai", "tir_Ethi", "taq_Latn", "taq_Tfng",
    "tpi_Latn", "tsn_Latn", "tso_Latn", "tuk_Latn", "tum_Latn", "tur_Latn",
    "twi_Latn", "tzm_Tfng", "uig_Arab", "ukr_Cyrl", "umb_Latn", "urd_Arab",
    "uzn_Latn", "vec_Latn", "vie_Latn", "war_Latn", "wol_Latn", "xho_Latn",
    "ydd_Hebr", "yor_Latn", "yue_Hant", "zho_Hans", "zho_Hant", "zul_Latn",
]

# Explicit check instead of `assert`, which is removed when Python runs with -O.
if len(all_filenames) != 204:
    raise ValueError(
        f"expected 204 language codes, found {len(all_filenames)}"
    )


def _copy_dereferenced(source_path, target_dir):
    """Run ``cp -L source_path target_dir`` and return the CompletedProcess.

    The command is passed as an argument list (no shell), so paths containing
    spaces or shell metacharacters are handed to ``cp`` verbatim. A failing
    copy does not raise; ``cp`` reports its own error on stderr and the
    caller may inspect ``returncode`` if needed.
    """
    return subprocess.run(["cp", "-L", source_path, target_dir])


# ---- Pass 1: codes present in the flores200_final dev directory ------------
filenames = glob.glob(FLORES200_DEV_DIR + "*")
counter = set()
for filename in filenames:
    langcode = filename.replace(FLORES200_DEV_DIR, "").replace(".dev", "")
    # Only names containing an underscore are treated as language codes.
    if "_" in langcode:
        counter.add(langcode)

missing_codes = []
for code in all_filenames:
    if code in counter:
        _copy_dereferenced(f"{FLORES200_DEV_DIR}{code}.dev", OUTPUT_DEV_DIR)
    else:
        missing_codes.append(code)

# ---- Pass 2: fall back to the flores101_beta dev directory -----------------
filenames = glob.glob(FLORES101_DEV_DIR + "*")
print(missing_codes)
_missing_lookup = set(missing_codes)  # O(1) membership; the list keeps order
for filename in filenames:
    langcode = filename.replace(FLORES101_DEV_DIR, "").replace(".dev", "")
    if langcode in _missing_lookup:
        print(langcode, filename)
        _copy_dereferenced(filename, OUTPUT_DEV_DIR)
# import glob
# import os
# import subprocess
# all_filenames = ["ace_Arab","ace_Latn","acm_Arab","acq_Arab","aeb_Arab","afr_Latn","ajp_Arab","aka_Latn","amh_Ethi","apc_Arab","arb_Arab","arb_Latn","ars_Arab","ary_Arab","arz_Arab","asm_Beng","ast_Latn","awa_Deva","ayr_Latn","azb_Arab","azj_Latn","bak_Cyrl","bam_Latn","ban_Latn","bel_Cyrl","bem_Latn","ben_Beng","bho_Deva","bjn_Arab","bjn_Latn","bod_Tibt","bos_Latn","bug_Latn","bul_Cyrl","cat_Latn","ceb_Latn","ces_Latn","cjk_Latn","ckb_Arab","crh_Latn","cym_Latn","dan_Latn","deu_Latn","dik_Latn","dyu_Latn","dzo_Tibt","ell_Grek","eng_Latn","epo_Latn","est_Latn","eus_Latn","ewe_Latn","fao_Latn","pes_Arab","fij_Latn","fin_Latn","fon_Latn","fra_Latn","fur_Latn","fuv_Latn","gla_Latn","gle_Latn","glg_Latn","grn_Latn","guj_Gujr","hat_Latn","hau_Latn","heb_Hebr","hin_Deva","hne_Deva","hrv_Latn","hun_Latn","hye_Armn","ibo_Latn","ilo_Latn","ind_Latn","isl_Latn","ita_Latn","jav_Latn","jpn_Jpan","kab_Latn","kac_Latn","kam_Latn","kan_Knda","kas_Arab","kas_Deva","kat_Geor","knc_Arab","knc_Latn","kaz_Cyrl","kbp_Latn","kea_Latn","khm_Khmr","kik_Latn","kin_Latn","kir_Cyrl","kmb_Latn","kon_Latn","kor_Hang","kmr_Latn","lao_Laoo","lvs_Latn","lij_Latn","lim_Latn","lin_Latn","lit_Latn","lmo_Latn","ltg_Latn","ltz_Latn","lua_Latn","lug_Latn","luo_Latn","lus_Latn","mag_Deva","mai_Deva","mal_Mlym","mar_Deva","min_Arab","min_Latn","mkd_Cyrl","plt_Latn","mlt_Latn","mni_Beng","khk_Cyrl","mos_Latn","mri_Latn","zsm_Latn","mya_Mymr","nld_Latn","nno_Latn","nob_Latn","npi_Deva","nso_Latn","nus_Latn","nya_Latn","oci_Latn","gaz_Latn","ory_Orya","pag_Latn","pan_Guru","pap_Latn","pol_Latn","por_Latn","prs_Arab","pbt_Arab","quy_Latn","ron_Latn","run_Latn","rus_Cyrl","sag_Latn","san_Deva","sat_Beng","scn_Latn","shn_Mymr","sin_Sinh","slk_Latn","slv_Latn","smo_Latn","sna_Latn","snd_Arab","som_Latn","sot_Latn","spa_Latn","als_Latn","srd_Latn","srp_Cyrl","ssw_Latn","sun_Latn","swe_Latn","swh_Latn","szl_Latn","tam_Taml","tat_Cyrl","tel_Telu","tgk_Cyrl","tgl_Latn","tha_Thai","tir_Ethi","taq_Latn","taq_Tfng","
# tpi_Latn","tsn_Latn","tso_Latn","tuk_Latn","tum_Latn","tur_Latn","twi_Latn","tzm_Tfng","uig_Arab","ukr_Cyrl","umb_Latn","urd_Arab","uzn_Latn","vec_Latn","vie_Latn","war_Latn","wol_Latn","xho_Latn","ydd_Hebr","yor_Latn","yue_Hant","zho_Hans","zho_Hant",
# "zul_Latn"]
# assert(len(all_filenames) == 204)
# filenames = glob.glob("/large_experiments/nllb/mmt/flores200_final/devtest/*")
# counter = []
# for filename in filenames:
# langcode = filename.replace("/large_experiments/nllb/mmt/flores200_final/devtest/", "").replace(".devtest", "")
# if "_" in langcode:
# counter.append(langcode)
# missing_codes = []
# counter = set(counter)
# for i in all_filenames:
# if i not in counter:
# missing_codes.append(i)
# else:
# subprocess.run(f"cp -L /large_experiments/nllb/mmt/flores200_final/devtest/{i}.devtest /private/home/angelafan/flores_opensource_links/devtest/", shell=True)
# # print(f"cp -L /large_experiments/nllb/mmt/flores200_final/devtest/{i}.devtest /private/home/angelafan/flores_opensource_links/dev/")
# filenames = glob.glob("/large_experiments/nllb/mmt/flores101_beta/devtest/*")
# print(missing_codes)
# for filename in filenames:
# langcode = filename.replace("/large_experiments/nllb/mmt/flores101_beta/devtest/", "").replace(".devtest", "")
# if langcode in missing_codes:
# print(langcode, filename)
# subprocess.run(f"cp -L {filename} /private/home/angelafan/flores_opensource_links/devtest/", shell=True)
# # print(f"cp -L {filename} /private/home/angelafan/flores_opensource_links/devtest/")