-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSummarize.py
65 lines (50 loc) · 1.35 KB
/
Summarize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import math
import re

import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
def distance(a, b):
    """Return the Euclidean distance between two points.

    Args:
        a: First point. May be a 1-D sequence or array, a single-row 2-D
            array, or a sparse row (any object exposing ``toarray()``).
        b: Second point, in any of the same forms; it must have the same
            number of components as ``a`` once flattened.

    Returns:
        The Euclidean (L2) distance between ``a`` and ``b`` as a float.
    """
    def _flat(point):
        # Sparse rows are densified first; everything is then flattened so
        # that (1, n) rows and (n,) vectors can be mixed freely.
        if hasattr(point, "toarray"):
            point = point.toarray()
        return np.asarray(point, dtype=float).ravel()

    diff = _flat(a) - _flat(b)
    # dot(diff, diff) is the sum of squared component differences.
    return math.sqrt(float(np.dot(diff, diff)))
# Read the text file line by line and reduce every line to its word tokens.
# Tokens are re-joined with single spaces so the vectorizer still sees
# separate words; lines that contain no tokens at all are skipped.
lineAry = []
with open("data.txt") as file:
    for line in file:
        processedLine = " ".join(re.findall(r"[\w']+", line))
        if processedLine:
            lineAry.append(processedLine)
print(lineAry)

# Nothing to cluster when the file had no usable lines.
if lineAry:
    # Count-vectorize the lines, allowing n-grams up to length 2.
    vectorizer = CountVectorizer(ngram_range=(1, 2))
    X = vectorizer.fit_transform(lineAry)
    print(X.toarray())

    # Re-weight the counts with TF-IDF so trivial (uninteresting) terms
    # carry less weight.
    X = TfidfTransformer().fit_transform(X)
    rows = X.toarray()  # dense copy, reused for the distance computation
    print(rows)

    # Cluster the lines; never request more clusters than there are lines.
    # fit() (not fit_transform()) is required to get the fitted estimator
    # with its labels_ and cluster_centers_ attributes.
    k = KMeans(n_clusters=min(len(lineAry), 4), random_state=0).fit(X)

    # Mean point of every cluster.
    centers = k.cluster_centers_

    # For each cluster, remember the line that lies closest to its mean.
    minIndex = {}
    minDist = {}
    for i, val in enumerate(k.labels_):
        d = distance(rows[i], centers[val])
        if val not in minDist or d < minDist[val]:
            minDist[val] = d
            minIndex[val] = i

    # Print one representative line per cluster.
    for key in minIndex:
        print(lineAry[minIndex[key]])