-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpreprocess.py
38 lines (25 loc) · 1015 Bytes
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# split by new lines and spaces
news_split = all_text.split('\n\n')
all_text = ' '.join(news_split)
# create a list of words
words = all_text.split()
# feel free to use this import
from collections import Counter
## Build a dictionary that maps words to integers
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}
int_to_vocab = {word: ii for ii, word in vocab_to_int.items()}
woc_int = []
for w in words:
woc_int.append([vocab_to_int[word] for word in w.split()])
def pad_features(woc_ints, seq_length):
''' Return features of review_ints, where each review is padded with 0's
or truncated to the input seq_length.
'''
# getting the correct rows x cols shape
features = np.zeros((len(woc_ints), seq_length), dtype=int)
# for each review, I grab that review and
for i, row in enumerate(woc_ints):
features[i, -len(row):] = np.array(row)[:seq_length]
return features