-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathget_sentiment.py
133 lines (107 loc) · 4.55 KB
/
get_sentiment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# -*- coding: utf-8 -*-
'''Code to assign sentiment to Arabic tweets by counting terms.
Takes ~22.5 minutes to process 400k tweets on a MacBook Pro.'''
#################
import csv,re,sys
import numpy as np
import matplotlib.pyplot as plt
import argparse
import codecs
###########
def getWordLists(stem):
###########
    '''Loads the term lists that tweets are matched against.

    @stem is the path prefix (e.g. 'terms/') prepended to each of the
    six expected file names: pos_words.txt, neg_words_all.txt,
    stop_words.txt, negation_words.txt, pos_emojis.txt, neg_emojis.txt

    Every file is read as UTF-8 with one term per line; only the text
    before the first tab of a line is used and empty entries are skipped.

    Returns the tuple
    (posWords,negWords,stopWords,negationWords,posEmojis,negEmojis)
    of lists of unicode strings. The two emoji lists are passed through
    re.escape so each entry can be used directly as a regex pattern.

    Raises IOError/OSError if a file cannot be opened and
    UnicodeDecodeError if a file is not valid UTF-8.'''
    fileNames=('pos_words.txt','neg_words_all.txt','stop_words.txt',
               'negation_words.txt','pos_emojis.txt','neg_emojis.txt')
    (posWords,negWords,stopWords,
     negationWords,posEmojis,negEmojis)=[_readTerms(stem+name) for name in fileNames]
    # Emojis/emoticons such as ':-)' contain regex metacharacters
    posEmojis=[re.escape(e) for e in posEmojis]
    negEmojis=[re.escape(e) for e in negEmojis]
    return posWords,negWords,stopWords,negationWords,posEmojis,negEmojis
###########
def _readTerms(path):
###########
    '''Returns the first tab-separated field of every line of the UTF-8
    file @path as a list of unicode strings, skipping empty entries.'''
    # io.open decodes while reading on both Python 2 and 3, and the
    # with-block guarantees the handle is closed again
    import io
    terms=[]
    with io.open(path,'r',encoding='utf-8') as termFile:
        for line in termFile:
            term=line.rstrip('\n').split('\t')[0]
            # An empty term is never a real word, and as an emoji
            # pattern it would match every token
            if term:
                terms.append(term)
    return terms
###########
def main():
###########
    '''Command line entry point: counts sentiment markers per tweet.

    Reads the UTF-8 file given as inFilePath (one tweet per line),
    splits every tweet on single spaces and counts, per tweet, the
    stop words, negation words and tokens containing a positive or
    negative emoji. The term lists are loaded with getWordLists(stem).

    Writes one "positive<TAB>negative" row per tweet to
    <stem>sentiments.txt, prints the percentage of tweets with neither
    a positive nor a negative hit and, with -p/--plot, shows the joint
    and per-class histograms of the counts. -v prints every token and
    what it matched.'''
    parser = argparse.ArgumentParser()
    parser.add_argument('inFilePath',help='Specify input file',type=str)
    parser.add_argument('--stem',help='Path to files',type=str,default='terms/')
    parser.add_argument('-p','--plot',help='Plot sentiments',action='store_true',default=False)
    parser.add_argument('-v',help='Set verbose output',action='store_true',default=False)
    args = parser.parse_args()
    stem=args.stem
    v=args.v

    with codecs.open(args.inFilePath,'r',encoding='utf-8') as inFile:
        tweets=inFile.read().split('\n')
    # Drop only the empty string produced by a trailing newline. Slicing
    # off the last element unconditionally loses the final tweet of a
    # file that does not end in a newline.
    if tweets and tweets[-1]=='':
        tweets.pop()

    posWords,negWords,stopWords,negationWords,posEmojis,negEmojis=getWordLists(stem)
    # NOTE(review): posWords/negWords are loaded but unused. Word-level
    # positive/negative matching was disabled in the original, so only
    # emojis contribute to the pos/neg counts - confirm this is intended.

    # Sets give constant-time membership tests inside the hot loop
    stopWords=set(stopWords)
    negationWords=set(negationWords)
    # The emoji entries are already regex-escaped, so one alternation
    # matches exactly when any single pattern would. An empty list must
    # stay "never matches" (an empty pattern would match everything).
    posEmojiRe=re.compile('|'.join(posEmojis),re.U) if posEmojis else None
    negEmojiRe=re.compile('|'.join(negEmojis),re.U) if negEmojis else None
    ########################
    positives=np.zeros(shape=len(tweets))
    negatives=np.zeros(shape=len(tweets))
    for t,tweet in enumerate(tweets):
        if (t+1)%100000==0:print('%d Processed....' % (t+1))
        posCount=negCount=stopCount=negationCount=0
        for word in tweet.split(' '):
            if v:print('Word: %s' % word)
            if word in stopWords:
                # Don't do RE match as single/double letter
                # combos included in stop words
                stopCount+=1
                if v:print(' => STOP')
            if word in negationWords:
                negationCount+=1
                if v:print(' => NEGATION')
            # A token counts at most once per class, however many
            # emojis it contains
            if posEmojiRe is not None and posEmojiRe.search(word):
                posCount+=1
                if v:print(' => POS EMOJI')
            if negEmojiRe is not None and negEmojiRe.search(word):
                negCount+=1
                if v:print(' => NEG EMOJI')
        if v:print('')
        if v:print('(pos,neg,stop,negation) =  %s' % ((posCount,negCount,stopCount,negationCount),))
        positives[t]=posCount
        negatives[t]=negCount

    combined=np.vstack((positives,negatives)).T
    np.savetxt(stem+'sentiments.txt',combined,fmt="%d",delimiter='\t')

    # Count the (0,0) tweets directly. The first cell of a 10x10
    # hist2d is not that number once a count reaches 10 or when all
    # values are identical.
    zeroCount=np.count_nonzero((positives==0)&(negatives==0))
    zeroShare=100.0*zeroCount/len(tweets) if tweets else 0.0
    print('%2.2f HAVE ZERO SENTIMENT' % zeroShare)

    if args.plot:
        # Plot distribution of sentiments
        if tweets:
            # Joint distribution of positive vs negative counts
            plt.hist2d(positives,negatives)
        fig=plt.figure()
        panels=((211,positives,'Positive Sentiment'),
                (212,negatives,'Negative Sentiment'))
        for position,counts,label in panels:
            ax=fig.add_subplot(position)
            top=int(counts.max()) if counts.size else 0
            # Edges 0..top+1 give every integer count its own unit-wide
            # bar; stopping at top merges the two highest counts
            ax.hist(counts,bins=np.arange(0,top+2))
            # Centre each label under its bar
            ax.set_xticks([0.5+i for i in range(top+1)])
            ax.set_xticklabels([str(i) for i in range(top+1)])
            ax.set_ylabel(label)
        plt.show()
# Run the command line tool only when this file is executed as a
# script, not when it is imported as a module
if __name__ == '__main__':
    main()