sentiment.py
import numpy


def get_sentiment_dict():
    """Build a word -> [avg_pos, avg_neg, count] dictionary from a SentiWordNet-style file."""
    word_sent_dict = {}
    with open('../external_software/sentiment/swn.txt', 'r') as f:
        line = f.readline()
        while line != '':
            parts = line.split('\t')
            # Skip comment/header lines.
            if '#' in parts[0]:
                line = f.readline()
                continue
            words = parts[4].split()
            pos = float(parts[2])
            neg = float(parts[3])
            for word in words:
                # Strip the sense suffix, e.g. "good#1" -> "good".
                w = word.split('#')[0]
                if w not in word_sent_dict:
                    word_sent_dict[w] = [0.0, 0.0, 0]  # pos, neg, count
                prev_pos, prev_neg, prev_count = word_sent_dict[w]
                # Keep a running average of the positive and negative scores
                # across all senses of the word seen so far.
                word_sent_dict[w][0] = (prev_pos * prev_count + pos) / (prev_count + 1)
                word_sent_dict[w][1] = (prev_neg * prev_count + neg) / (prev_count + 1)
                word_sent_dict[w][2] += 1
            line = f.readline()
    return word_sent_dict


def get_sentiment_counts(train_file):
    """Return a (num_reviews, 2) array of length-normalised positive/negative scores."""
    features = []
    sent_dict = get_sentiment_dict()
    not_found = 0.0
    found = 0.0
    with open(train_file, 'r') as f:
        line = f.readline()
        while line != '':
            # Iterate through the file, scoring each review line.
            if 'review/text: ' in line:
                features.append([0.0] * 2)  # positive, negative
                line = line[len('review/text: '):]
                words = line.split()
                for word in words:
                    # Check whether the word appears in the sentiment dictionary.
                    if word in sent_dict:
                        pos, neg, count = sent_dict[word]
                        features[-1][0] += pos / len(words)
                        features[-1][1] += neg / len(words)
                        found += 1.0
                    else:
                        not_found += 1.0
            line = f.readline()
    return numpy.array(features)


if __name__ == "__main__":
    print(get_sentiment_counts('../dataset/small_train.txt'))
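For context on the expected inputs: get_sentiment_dict assumes swn.txt is a SentiWordNet-style tab-separated file whose comment lines start with '#' and whose data rows carry the positive score, negative score, and the 'word#sense' terms in the third to fifth columns, while get_sentiment_counts expects the training file to contain review lines prefixed with 'review/text: '. The snippet below is a minimal usage sketch, assuming sentiment.py is importable and the dataset path used in the module above exists; it only inspects the returned feature matrix.

# Minimal usage sketch (assumes sentiment.py is on the import path and
# '../dataset/small_train.txt' contains at least one 'review/text: ' line).
import numpy

from sentiment import get_sentiment_counts

# Each row is one review; column 0 is its averaged positive score, column 1 its negative score.
features = get_sentiment_counts('../dataset/small_train.txt')
print('feature matrix shape:', features.shape)
print('mean positive score:', numpy.mean(features[:, 0]))
print('mean negative score:', numpy.mean(features[:, 1]))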