-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_vocab.py
51 lines (43 loc) · 1.08 KB
/
create_vocab.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
'''
----------------------------------------------
CREATE VOCAB FROM TRAIN ENGLISH AND HINDI DATA
----------------------------------------------
'''
import pandas as pd
import os
import collections
import codecs
# Make sure the output directory for the vocabulary files exists.
if not os.path.exists('vocab'):
    os.makedirs('vocab')

# Load the training table and pull the 'id', 'ENG' and 'HIN' columns out as
# plain Python lists (same order as the rows of the CSV).
train = pd.read_csv('dl2019pa3/train.csv')
ids, eng_words, hindi_words = (
    train[column].tolist() for column in ('id', 'ENG', 'HIN')
)
# Build the English vocabulary: every distinct token that appears in
# eng_words, where a token is one piece of an entry split on single spaces.
# NOTE(review): split(' ') yields '' for leading/trailing/repeated spaces, so
# an empty token can end up in the vocabulary -- confirm the data never
# contains those, or that downstream code expects it.
eng_vocab = set()
for entry in eng_words:
    eng_vocab.update(entry.split(' '))
# Sort so the line order of the output file (and any token ids derived from
# line position) is identical on every run; iteration order of a set of
# strings changes between interpreter runs because of hash randomization.
eng_vocab = sorted(eng_vocab)

# Write one token per line, special tokens first. The encoding is pinned to
# UTF-8 so the output does not depend on the platform's locale default.
with open(os.path.join('vocab', 'eng.txt'), 'w', encoding='utf-8') as f:
    f.writelines(token + '\n' for token in ['<unk>', '<pad>'] + eng_vocab)
# Build the Hindi vocabulary: every distinct token that appears in
# hindi_words, where a token is one piece of an entry split on single spaces.
# NOTE(review): split(' ') yields '' for leading/trailing/repeated spaces, so
# an empty token can end up in the vocabulary -- confirm the data never
# contains those, or that downstream code expects it.
hindi_vocab = set()
for entry in hindi_words:
    hindi_vocab.update(entry.split(' '))
# Sort so the line order of the output file (and any token ids derived from
# line position) is identical on every run; iteration order of a set of
# strings changes between interpreter runs because of hash randomization.
hindi_vocab = sorted(hindi_vocab)

# Write one token per line, special tokens first. The encoding is pinned to
# UTF-8: without it the locale default is used, and writing non-ASCII tokens
# raises UnicodeEncodeError on platforms whose default is not UTF-8.
with open(os.path.join('vocab', 'hindi.txt'), 'w', encoding='utf-8') as f:
    f.writelines(token + '\n' for token in ['<unk>', '<pad>'] + hindi_vocab)