-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcount.py
124 lines (96 loc) · 3.27 KB
/
count.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import re
from collections import Counter

import numpy as np
iDebug = 2  # debug level: >2 traces every counted pair, >3 also dumps intermediate data
filename = "happiness_seg.txt"
if iDebug > 3:
    filename = "happiness.txt"  # for test

# How many of the most frequent word pairs are reported.
TOP_PAIR_COUNT = 10

# One or more separator characters in a row.
# * The quantifier is "+" rather than "*": a pattern that can match the empty
#   string makes re.split() cut between every character on Python 3.7+.
# * The hyphen is escaped: an unescaped ".-―" inside the class is a range from
#   "." to "―", which would also treat ASCII letters and digits as separators.
WORD_SEPARATORS = re.compile(r'[:。;,“”()、?《》\s.\-―]+')


def split_words(text):
    """Return the non-empty tokens of *text*, in order.

    The text is cut at every run of separator characters. Empty strings that
    re.split() yields at the edges (text starting or ending with a separator)
    are dropped.
    """
    return [token for token in WORD_SEPARATORS.split(text) if token]


def count_bigrams(words, trace_index=None):
    """Count how often each ordered pair of adjacent words occurs.

    Args:
        words: sequence of tokens as returned by split_words().
        trace_index: optional mapping word -> integer id. When given, every
            increment is printed together with the ids of both words.

    Returns:
        A Counter keyed by ``(current_word, next_word)`` tuples.
    """
    counts = Counter()
    for current, following in zip(words, words[1:]):
        counts[current, following] += 1
        if trace_index is not None:
            print("(%8d,%8d)=%4d:%s %s" % (
                trace_index[current], trace_index[following],
                counts[current, following], current, following))
    return counts


def top_pairs(counts, limit=TOP_PAIR_COUNT):
    """Return up to *limit* ``((word, next_word), count)`` items, most frequent first."""
    return counts.most_common(limit)


def main():
    """Read ``filename``, count adjacent word pairs and print the most frequent ones."""
    ## load file
    print("-" * 20)
    print("Loading file...")
    with open(filename, 'r', encoding="UTF-8") as file_object:
        file_context = file_object.read()
    print("=" * 80)
    print("-" * 20)
    print("File length:%d" % len(file_context))

    ## split input into words
    print("-" * 40)
    print("split input to wordsSplited...")
    words = split_words(file_context)
    if iDebug > 3:
        print("text splited:")
        print(words)
    print("-" * 40)
    print("Total wordsSplited: %d" % len(words))

    # Distinct words in first-seen order; a dict keeps the de-duplication linear.
    vocabulary = list(dict.fromkeys(words))
    print("Total words: %d" % len(vocabulary))
    if iDebug > 3:
        print("words array:")
        print(vocabulary)

    ## count the adjacent pairs
    print("-" * 40)
    print("Create the array...")
    # Word ids are only needed for the debug output.
    word_index = {word: position for position, word in enumerate(vocabulary)}
    counts = count_bigrams(words, word_index if iDebug > 2 else None)
    best = top_pairs(counts)
    total = sum(counts.values())

    # print the final results
    print("-" * 20)
    print("Final top words:")
    # `best` is empty whenever `total` is 0, so the division below is safe.
    for (first, second), cnt in best:
        print("%s %s\t:%.2f%%(%d of %d)" % (first, second, 100 * cnt / total, cnt, total))
    if iDebug > 3:
        print(counts)
        print(max(counts.values(), default=0))
        print(best)
        for (first, second), cnt in best:
            print("(%d,%d)=%d/%d:%s %s" % (
                word_index[first], word_index[second], cnt, total, first, second))


if __name__ == "__main__":
    main()