from __future__ import print_function
from math import log, sqrt
import sys
import pickle

import nltk

nos_of_documents = 0
vects_for_docs = []  # one vector per document; each vector is a dict of word: weight pairs
document_freq_vect = {}  # maps each word to the number of documents it occurs in
word_line_array = []  # one dict per document, mapping each word to the line it occurs on
with (open("/home/godfather/Documents/text-search-engine-master/db1.txt", "r")) as openfile:
while True:
try:
vects_for_docs.append(pickle.load(openfile))
except EOFError:
break
with (open("/home/godfather/Documents/text-search-engine-master/db2.txt", "r")) as openfile1:
while True:
try:
word_line_array.append(pickle.load(openfile1))
except EOFError:
break
def main(arg):
    global nos_of_documents
    nos_of_documents = int(arg) + 1


if __name__ == "__main__":
    # the number of indexed documents is passed as a command-line argument
    main(sys.argv[1])
# creates a vector from a query given as a list of tokens (l1);
# the vector is a dict of word: frequency pairs
def create_vector_from_query(l1):
vect = {}
for token in l1:
if token in vect:
vect[token] += 1.0
else:
vect[token] = 1.0
return vect
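
# for illustration: create_vector_from_query(['run', 'dog', 'run'])
# returns {'run': 2.0, 'dog': 1.0}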
# note: even though the query vector does not strictly need to be a unit
# vector, it is normalized here so that all dot products are <= 1
# as the name suggests, this function converts a given query vector
# (word: frequency pairs) into a tf-idf unit vector (word: tf-idf pairs)
def get_tf_idf_from_query_vect(query_vector1):
    vect_length = 0.0
    for word1 in query_vector1:
        word_freq = query_vector1[word1]
        if word1 in document_freq_vect:
            query_vector1[word1] = calc_tf_idf(word1, word_freq)
        else:
            # a term that occurs in no document gets idf = log(nos_of_documents);
            # this ensures that two queries -- one whose words all occur in some
            # document, and one with an extra word that occurs in none -- do not
            # end up with the same dot product against every document
            query_vector1[word1] = log(1 + word_freq) * log(nos_of_documents)
        vect_length += query_vector1[word1] ** 2
    vect_length = sqrt(vect_length)
    if vect_length != 0:
        for word1 in query_vector1:
            query_vector1[word1] /= vect_length
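
# a quick sketch of the normalization step, with made-up weights: a vector
# {'a': 3.0, 'b': 4.0} has length sqrt(9 + 16) = 5 and so becomes
# {'a': 0.6, 'b': 0.8}, a unit vector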
# precondition: word1 is in document_freq_vect
# calculates the tf-idf score for a given word: log-scaled term frequency
# times inverse document frequency
def calc_tf_idf(word1, word_freq):
return log(1 + word_freq) * log(nos_of_documents / document_freq_vect[word1])
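
# worked example with illustrative numbers: if nos_of_documents = 100 and
# document_freq_vect['cat'] = 10, then calc_tf_idf('cat', 3) is
# log(1 + 3) * log(100 / 10) ~= 1.386 * 2.303 ~= 3.19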
# this function returns a list of tokenized and stemmed words for any text
def get_tokenized_and_normalized_list(doc_text):
    tokens = nltk.word_tokenize(doc_text)
    ps = nltk.stem.PorterStemmer()
    stemmed = []
    for word in tokens:
        stemmed.append(ps.stem(word))
    return stemmed
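
# for example, get_tokenized_and_normalized_list("running dogs") should
# yield ['run', 'dog'] under the Porter stemmer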
# this function returns the dot product of vector1 and vector2
def get_dot_product(vector1, vector2):
    if len(vector1) > len(vector2):  # iterate over the shorter vector
        vector1, vector2 = vector2, vector1
    dot_prod = 0
    for word in vector1:
        if word in vector2:
            dot_prod += vector1[word] * vector2[word]
    return dot_prod
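
# e.g. get_dot_product({'a': 0.5, 'b': 0.5}, {'a': 0.2, 'c': 0.4}) returns
# 0.5 * 0.2 = 0.1; only words present in both vectors contribute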
# this function packs the line numbers of every query word that occurs in
# the document into one string of 3-digit groups, prefixed with "000" so
# the result is never empty
# note: unlike the dot product this is not symmetric, so vector1 must be
# the query vector and vector2 the document's word: line-number map
def get_line_info(vector1, vector2):
    line_info = "000"
    for word in vector1:
        if word in vector2:
            line_info += str(vector2[word]).zfill(3)
    return line_info
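
# e.g. if the query contains 'cat' and 'dog' and the document's map places
# them on lines 12 and 7, the result is "000" + "012" + "007" == "000012007"
# (group order follows the query vector's iteration order)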
# this function takes the dot product of the query with every document and
# returns a list of (docId, cosine score, line info) triples, sorted by
# score in ascending order
def get_result_from_query_vect(query_vector1):
    parsed_list = []
    for i in range(nos_of_documents - 1):  # nos_of_documents is the document count + 1
        dot_prod = get_dot_product(query_vector1, vects_for_docs[i])
        line_info = get_line_info(query_vector1, word_line_array[i])
        parsed_list.append((i, dot_prod, line_info))
    parsed_list = sorted(parsed_list, key=lambda x: x[1])
    return parsed_list
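
# an illustrative (made-up) return value:
# [(3, 0.0, '000'), (0, 0.42, '000012007'), (7, 0.91, '000004')]
# the best match comes last, hence the reversed() in the loop below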
print()
while True:
    query = input("Please enter your query....")  # use raw_input under Python 2
    if len(query) == 0:
        break
    query_list = get_tokenized_and_normalized_list(query)
    query_vector = create_vector_from_query(query_list)
    get_tf_idf_from_query_vect(query_vector)
    result_set = get_result_from_query_vect(query_vector)
    for tup in reversed(result_set):  # highest-scoring documents first
        if tup[1] != 0:
            print("The docid is " + str(tup[0]).zfill(4) + " and the weight is " + str(tup[1]))
            temp = int(tup[2])
            print("Line number : ", end=' ')
            # peel the packed 3-digit line numbers off the end of the
            # integer, one group at a time; e.g. 12007 prints "7 12 "
            while temp != 0:
                print(str(temp % 1000) + " ", end='')
                temp //= 1000  # integer division so the loop terminates
            print()
            print()
    print()