-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathProcessed.py
145 lines (115 loc) · 3.95 KB
/
Processed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# Standard-library + NLTK imports used throughout this script.
import pickle, string, nltk, os
from nltk.corpus import stopwords
# Fetch the stopword corpus at import time so final_words() can build its
# stopword set; NLTK caches the download after the first run.
nltk.download('stopwords')
def open_file(filename):
    """Load and return the pickled object stored in *filename*.

    filename: path to a pickle file (here, a list of tweet strings)
    returns: the unpickled object
    """
    # NOTE(review): pickle.load on untrusted files can execute arbitrary
    # code — only use on files this project wrote itself.
    # 'with' guarantees the handle is closed even if unpickling raises.
    with open(filename, 'rb') as input_file:
        return pickle.load(input_file)
def process_file(filename):
"""Makes a histogram that contains the words from a file.
filename: string
returns: frequency of words in a dictionary
"""
hist = {}
if filename[-7:] == '.pickle':
data = open_file(filename)
for line in data:
line = line.replace('-', ' ')
strippables = string.punctuation + string.whitespace
# remove punctuation and convert to lowercase
for word in line.split():
word = word.strip(strippables)
word = word.lower()
hist[word] = hist.get(word, 0) + 1
return hist
else:
fp = open(filename, encoding='utf8')
for line in fp:
line = line.replace('-', ' ')
strippables = string.punctuation + string.whitespace
for word in line.split():
word = word.strip(strippables)
word = word.lower()
hist[word] = hist.get(word, 0) + 1
return hist
def similar(d1, d2):
    """Return a dictionary with all keys that appear in both d1 and d2.

    d1: histogram dict (word -> count)
    d2: container of valid words (membership-tested with `in`)
    returns: dict carrying d1's counts for the shared keys
    """
    return {word: count for word, count in d1.items() if word in d2}
def final_words(dic):
    """Remove English stopwords from a word histogram.

    dic: dict mapping word -> count
    returns: new dict with all stopword keys dropped
    """
    # build the stopword set once, then keep only non-stopword entries
    stop_set = set(stopwords.words('english'))
    return {word: count for word, count in dic.items() if word not in stop_set}
def most_common(hist, num):
    """List the most frequent words as (frequency, word) pairs.

    hist: dict mapping word -> frequency
    num: maximum number of pairs to return
    returns: list of up to num (frequency, word) pairs, most frequent first
    """
    # sorting the (freq, word) tuples descending is equivalent to the
    # ascending-sort-then-reverse of the original
    pairs = [(freq, word) for word, freq in hist.items()]
    pairs.sort(reverse=True)
    return pairs[:num]
def write_file(hist, filename):
    """Write a dictionary to a pickle file, asking before overwriting.

    hist: dictionary to save
    filename: target path WITHOUT the '.pickle' extension
    """
    filename += '.pickle'
    if os.path.exists(filename):
        # confirm before clobbering an existing file; anything other than
        # 'yes' aborts (the original silently did nothing on unexpected input)
        response = input("File {} already exists. Replace existing? (Yes/No): " .format(filename))
        if response.lower() != 'yes':
            print('Action aborted.')
            return
        verb = 'replaced'
    else:
        verb = 'created'
    # 'with' guarantees the handle is closed even if pickling fails
    with open(filename, 'wb') as f:
        pickle.dump(hist, f)
    print('File {} as {}.'.format(verb, filename))
def main():
    """Build, print, compare, and save word histograms for both tweet sets."""
    # dictionary of valid English words, used to filter out noise tokens;
    # loaded once instead of once per tweet set
    words = process_file('words.txt')

    # NOTE(review): the original also opened each tweet pickle here,
    # pickle.load-ed it, discarded the result, and never closed the handle;
    # process_file already does the loading, so that redundant work is dropped.
    hist1 = process_file('elontweets.pickle')
    completed_words1 = final_words(similar(hist1, words))
    print("The processed words in the Elon tweets are:")
    print(completed_words1)

    hist2 = process_file('trumptweets.pickle')
    completed_words2 = final_words(similar(hist2, words))
    print("The processed words in the Trump tweets are:")
    print(completed_words2)

    print('Most common words in Elon tweets are:')
    print(most_common(completed_words1, 20))
    print('Most common words in Trump tweets are:')
    print(most_common(completed_words2, 20))

    write_file(completed_words1, 'processedElon')
    write_file(completed_words2, 'processedTrump')


if __name__ == '__main__':
    main()