TF_IDF_baseline.py
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import num2words
import math
import re

# Global corpus: a list of tokenized documents, filled in below.
D = []
porter_stemmer = PorterStemmer()
# Tokenize each line, lowercase it, drop stop words and punctuation,
# stem the remaining tokens, and spell out purely numeric tokens as words.
def word_tokenize_all(texts):
    punctuations = ['+', ',', '?', '!', '/', '@', '#', '$']
    stop_words = set(stopwords.words('english') + punctuations)
    tokens = []
    for line in texts:
        all_words = [w.lower() for w in word_tokenize(line)]
        filtered_words = [word for word in all_words if word not in stop_words]
        stemmed_words = [porter_stemmer.stem(word) for word in filtered_words]
        with_numbers = [num2words.num2words(int(word)) if word.isnumeric() else word
                        for word in stemmed_words]
        tokens.append(with_numbers)
    return tokens
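
# A quick illustration of the pipeline above (a sketch; the exact tokens
# depend on the NLTK tokenizer and stemmer versions installed):
#   word_tokenize_all(['The batsman hit 4 runs'])
#   -> [['batsman', 'hit', 'four', 'run']]
#   ('The' is a stop word, 'runs' is stemmed to 'run', and the numeric
#    token '4' is spelled out as 'four' by num2words.)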
# Load the corpus: one document per line, with commas treated as spaces.
with open('./data/train.pair_tok.tsv', 'r') as fp:
    raw_lines = [line.replace(',', ' ') for line in fp.readlines()]
D = word_tokenize_all(raw_lines)
def score(q, d, op_type):
    # 'and' queries (exact match) score a document only if every query
    # term appears in it; 'or' queries score every document.
    if op_type == 'and' and len(set(q).intersection(d)) != len(set(q)):
        return 0
    ret = 0
    for t in q:
        ret += tf(t, d) * idf(t)
    return ret
def tf(t, d):
    # Term frequency: the fraction of tokens in document d equal to t.
    return float(d.count(t)) / len(d)
def idf(t):
    # Inverse document frequency: log(|D| / number of documents containing t).
    count_in_document = 0
    for d in D:
        if t in d:
            count_in_document += 1
    return 0 if count_in_document == 0 else math.log(len(D) / float(count_in_document))
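
# Worked example of the scoring arithmetic (a hypothetical toy corpus, not
# the TSV loaded above):
#   D = [['cat', 'sat'], ['cat'], ['dog']]
#   tf('cat', ['cat', 'sat'])  -> 1/2 = 0.5
#   idf('cat')                 -> log(3/2) ~= 0.405
#   score(['cat'], ['cat', 'sat'], 'or') -> 0.5 * 0.405 ~= 0.203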
while True:
    # scores maps a document index to its tf-idf score for the current query.
    scores = {}
    query = input('enter the query string, or q to quit\n')
    print('**************************************************************\n')
    if query == 'q' or query == 'quit':
        break
    # A query that starts with a double quote requests an exact (AND) match.
    if query.startswith('"'):
        op = 'and'
    else:
        op = 'or'
    # Domain-specific query expansions, then strip punctuation.
    query = re.sub(r'boundary', 'boundary 4 6', query)
    query = re.sub(r'couple', 'couple 2', query)
    query = re.sub(r'[#+?!"/@$,]', ' ', query)
    query = word_tokenize_all([query])
    for i, d in enumerate(D):
        # Score each document and keep it only if the score is positive.
        sc = score(query[0], d, op)
        if sc != 0:
            scores[i] = sc
    if len(scores) == 0:
        print('no match found')
    else:
        # Report the top 5 documents ranked by score.
        top_docs = sorted(scores, key=scores.get, reverse=True)[:5]
        for rank, doc_id in enumerate(top_docs, start=1):
            print('rank:', rank, 'score', scores[doc_id])
            print(raw_lines[doc_id].strip())
            print('-----------------------------------------------')
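
# Usage sketch (assumes ./data/train.pair_tok.tsv exists and that the NLTK
# data packages used above have been downloaded, e.g.
# nltk.download('punkt') and nltk.download('stopwords')):
#   $ python TF_IDF_baseline.py
#   enter the query string, or q to quit
#   "boundary line        <- the leading double quote forces an exact match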