-
Notifications
You must be signed in to change notification settings - Fork 5
/
tok_bpe_sample.py
35 lines (27 loc) · 1.17 KB
/
tok_bpe_sample.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import io
import os
import subprocess

from nltk.tokenize import word_tokenize
# Input root: every sub-directory name is used as a language code.
dir_path = "/projects/tir2/users/cmalaviy/bible-corpus/combined_test/"
# Output root: one sub-directory per language code is created here on demand.
dir_tok_path = "/projects/tir2/users/cmalaviy/bible-corpus/combined_tok/"


def _tokenize_file(src_path, dst_path, prefix=None):
    """Write a tokenized, lowercased copy of ``src_path`` to ``dst_path``.

    Each input line is split with NLTK's ``word_tokenize``; the tokens are
    joined with single spaces, lowercased and written as one output line, so
    the output has exactly as many lines as the input.

    Args:
        src_path: Path of the UTF-8 text file to read.
        dst_path: Path of the UTF-8 text file to create or overwrite.
        prefix: Optional tag emitted as the first token of every output
            line (it is lowercased together with the rest of the line).
            ``None`` means no tag.

    Raises:
        IOError/OSError: If either file cannot be opened.
        UnicodeDecodeError: If ``src_path`` is not valid UTF-8.
    """
    # Decode/encode once at the file boundary instead of calling .decode()
    # per line: that call only exists on Python 2 byte strings, and writing
    # the resulting unicode through a plain file object fell back to ASCII.
    # newline="\n" disables newline translation so that lines are delimited
    # by "\n" only, exactly like the previous doc.split("\n").
    with io.open(src_path, encoding="utf-8", newline="\n") as src, \
            io.open(dst_path, "w", encoding="utf-8", newline="\n") as dst:
        for raw_line in src:
            tokens = word_tokenize(raw_line.rstrip("\n"))
            if prefix is not None:
                # Keep the tag as a separate token; plain concatenation
                # would glue it onto the first word of the sentence.
                tokens.insert(0, prefix)
            # One "\n" per input line keeps both sides of the parallel
            # corpus at the same line count (no extra tag-only last line).
            dst.write(u" ".join(tokens).lower() + u"\n")


for lang in os.listdir(dir_path):
    lang_dir = os.path.join(dir_path, lang)
    if not os.path.isdir(lang_dir):
        # Stray files next to the language directories are not corpora.
        continue

    # Tag prepended to every line of the non-English side.
    lang_tok = "_opt_tgt_" + lang

    out_dir = os.path.join(dir_tok_path, lang)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    filename_lang = "train_" + lang + "_en." + lang + ".txt"
    filename_en = "train_" + lang + "_en.en.txt"

    # "<name>.txt" -> "<name>.tok.txt" for both sides of the pair.
    _tokenize_file(
        os.path.join(lang_dir, filename_lang),
        os.path.join(out_dir, filename_lang[:-4] + ".tok.txt"),
        prefix=lang_tok,
    )
    _tokenize_file(
        os.path.join(lang_dir, filename_en),
        os.path.join(out_dir, filename_en[:-4] + ".tok.txt"),
    )