-
Notifications
You must be signed in to change notification settings - Fork 0
/
word_freq.py
67 lines (54 loc) · 1.7 KB
/
word_freq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#-*- coding: utf-8 -*-
import re
import morfeusz
import csv
import nltk
from collections import defaultdict
from sys import stderr
from sys import argv
from time import time
####################
def get_word_counts(text):
    """Count the base forms (lemmas) of the words found in *text*.

    The text is split with ``nltk.wordpunct_tokenize``.  In each token a
    fixed set of punctuation characters (and the underscore) is replaced
    by spaces; tokens that are still longer than one character are passed
    to ``morfeusz.analyse``.  The value at ``[0][2][1]`` of the analysis
    result is used as the base form and counted.  Tokens whose result is
    too short to index, or whose base form is ``None``, are skipped.

    Returns a ``defaultdict(int)`` mapping base form -> occurrence count.
    """
    separators = re.compile(r'[_+=:;"\'\?/>.<,\\]')
    counts = defaultdict(int)
    for token in nltk.wordpunct_tokenize(text):
        cleaned = separators.sub(' ', token)
        if len(cleaned) <= 1:
            continue
        # Morfeusz is a morphological analyser for Polish.
        analysis = morfeusz.analyse(cleaned, expand_tags=False, dag=True)
        try:
            lemma = analysis[0][2][1]
        except IndexError:
            # Nothing usable came back for this token.
            continue
        if lemma is not None:
            counts[lemma] += 1
    return counts
#################
def unicode_csv_reader(utf8_data, dialect=csv.excel, **kwargs):
    """Yield the rows of a UTF-8 CSV source as lists of text cells.

    utf8_data -- iterable of lines, handed straight to ``csv.reader``
    dialect   -- CSV dialect, ``csv.excel`` by default
    kwargs    -- extra formatting options forwarded to ``csv.reader``
                 (for example ``delimiter`` or ``quotechar``)

    Byte-string cells (what ``csv.reader`` produces on Python 2) are
    decoded as UTF-8.  Cells that are already text (Python 3) are yielded
    unchanged, so the function does not rely on the Python-2-only
    ``unicode`` builtin.

    Raises UnicodeDecodeError when a byte-string cell is not valid UTF-8.
    """
    for row in csv.reader(utf8_data, dialect=dialect, **kwargs):
        yield [
            cell.decode('utf-8') if isinstance(cell, bytes) else cell
            for cell in row
        ]
#############################################################3
# Script body.  Usage: <script> <csv file> <column index>
#
# Reads a ';'-separated, '"'-quoted CSV file row by row.  For every row it
# counts the lemmas in the chosen column and prints one line per lemma:
# the cells preceding that column, the lemma and its count, each wrapped
# in double quotes and separated by ';', encoded as UTF-8.


def _escape(field):
    """Double embedded double quotes so *field* stays one quoted cell."""
    return field.replace(u'"', u'""')


try:
    _, fname, column = argv
    column = int(column)
except ValueError:
    # Wrong number of arguments, or the column index is not an integer.
    raise SystemExit("Err: pass a filename and an integer column index.")

with open(fname, "r") as csvfile:
    reader = unicode_csv_reader(csvfile, delimiter=';', quotechar='"')
    start = time()
    for i, row in enumerate(reader):
        if i % 1000 == 0:
            # Progress goes to stderr so it never mixes with the output.
            stderr.write("%d rows read in %f\n" % (i, time() - start))
        # Quoted cells preceding the analysed column, built once per row.
        # With column == 0 this is a single empty quoted cell, which keeps
        # the output layout the script has always produced.
        leading = u'";"'.join(_escape(cell) for cell in row[:column])
        prefix = u'"' + leading + u'";'
        for lemma, count in get_word_counts(row[column]).items():
            out = (prefix + u'"' + _escape(lemma) + u'";"'
                   + (u'%d' % count) + u'"')
            # Emit UTF-8 bytes (the script targets Python 2, where print
            # writes a byte string as-is).
            print(out.encode('utf8'))