-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess.py
68 lines (50 loc) · 1.5 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import csv
import re
import nltk
import numpy as np
import pickle
sentence_detector = nltk.data.load("tokenizers/punkt/english.pickle")
def preprocess(s):
s = s.decode(encoding='UTF-8').strip()
tokenized = []
for string in sentence_detector.tokenize(s):
string = re.sub(r"`", " ", string)
string = re.sub(r"/", " / ", string)
string = re.sub(r"=", " = ", string)
string = re.sub(r"~", " ~ ", string)
string = re.sub(r"~(\s+~)+", " ~~~ ", string)
string = re.sub(r"\.{3,}", "...", string)
tokenized.extend(nltk.tokenize.word_tokenize(string))
result = " ".join(tokenized)
return result.lower()
f1 = open("raw_train.csv",'rb')
f2 = open("raw_test.csv",'rb')
s1 = []
s2 = []
label = []
reader = csv.reader(f1)
for row in reader:
s1 = s1 + [preprocess(row[3])]
s2 = s2 + [preprocess(row[4])]
label = label + [row[5]]
s1 = s1[1:]
s2 = s2[1:]
label = np.array(label[1:])
label = label.astype(int)
one_hot_label = np.zeros((len(s1),2))
one_hot_label[np.arange(len(s1)), label] = 1
pickle.dump((s1,s2,one_hot_label), open("preprocessed_train.pickle",'wb'))
index = 0
s1 = []
s2 = []
reader = csv.reader(f2)
reader.next()
for row in reader:
s1 = s1 + [preprocess(row[1])]
s2 = s2 + [preprocess(row[2])]
index = index + 1
if index % 200000 == 0:
pickle.dump((s1,s2), open("preprocessed_test%d.pickle" % (index/200000 - 1),'wb'))
s1 = []
s2 = []
pickle.dump((s1,s2), open("preprocessed_test%d.pickle" % (index/200000),'wb'))