-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsearchengine.py
123 lines (108 loc) · 4.21 KB
/
searchengine.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import json
import os
from collections import defaultdict
from flask import Flask, request
from diskindex import *
from normalize import stem
from query import QueryProcessor
# Flask application object. The route handlers below are registered on it, and
# its ``config`` mapping is also used to keep the loaded index state
# (query processor, vocabulary, file contents) between requests.
app = Flask(__name__)
@app.route('/test', methods=['GET', 'POST'])
def home():
    """Reply with a fixed JSON greeting so a client can check the API is up."""
    payload = {'response': "hello"}
    return json.dumps(payload)
@app.route('/buildindex', methods=['GET', 'POST'])
def buildindex():
    """Load the JSON corpus under ``corpus_dir`` and prepare it for querying.

    Form fields (POST):
        corpus_dir: directory that is walked recursively for document files.
            Every file found is parsed as JSON and must provide the keys
            ``body``, ``title`` and ``url``.
        build: the string ``'true'`` to (re)build the index from the loaded
            document bodies; any other value skips the build step.

    Side effects:
        Stores ``queryprocessor``, ``diskindex``, ``vocab``,
        ``file_contents`` and ``doc_id_files`` in ``app.config`` for the
        other routes to read.

    Returns:
        A JSON string with ``files`` (document file names in doc-id order),
        ``doc_count``, ``terms`` (the vocabulary) and ``term_count``.
        Requests that are not POST fall through and return ``None``.
    """
    if request.method != 'POST':
        return None

    docs_dir = request.form['corpus_dir']
    build = request.form['build'] == 'true'

    docs = []              # document bodies; list position is the doc id
    file_contents = {}     # file name -> {'body', 'title', 'url'}
    doc_id_files = {}      # doc id -> file name
    corpus_files = []      # every file name seen, in doc-id order

    for root, _dirs, names in os.walk(docs_dir):
        for name in sorted(names):
            # Join against the directory currently being walked, not the
            # corpus root, so files inside subdirectories can be opened.
            with open(os.path.join(root, name), 'r') as json_data:
                content = json.load(json_data)
            doc_id_files[len(docs)] = name
            docs.append(content['body'])
            # NOTE(review): keyed by bare file name, so two files with the
            # same name in different subdirectories overwrite each other.
            file_contents[name] = {
                'body': content['body'],
                'title': content['title'],
                'url': content['url'],
            }
            corpus_files.append(name)

    if build:
        # IndexWriter presumably comes from ``from diskindex import *``.
        indexwriter = IndexWriter()
        indexwriter.build_index(docs)

    # Count every document that was loaded. The walk's loop variable would
    # only reflect the last directory visited and would be unbound if the
    # walk yielded nothing.
    doc_count = len(corpus_files)
    queryprocessor = QueryProcessor(num_docs=doc_count)
    diskindex = queryprocessor.disk_index
    vocab = diskindex.get_vocab()

    app.config['queryprocessor'] = queryprocessor
    app.config['diskindex'] = diskindex
    app.config['vocab'] = vocab
    app.config['file_contents'] = file_contents
    app.config['doc_id_files'] = doc_id_files

    return json.dumps({
        'files': corpus_files,
        'doc_count': doc_count,
        'terms': vocab,
        'term_count': len(vocab)
    })
@app.route('/showterms', methods=['GET', 'POST'])
def showterms():
    """Return the stored vocabulary grouped by each term's first character.

    Reads the vocabulary from ``app.config['vocab']``, skips empty strings,
    and returns a JSON string of the form ``{'vocab': {char: [terms...]}}``.
    Requests that are not POST fall through and return ``None``.
    """
    if request.method != 'POST':
        return None

    grouped = {}
    for word in app.config['vocab']:
        if word == "":
            # An empty term has no first character to group under.
            continue
        grouped.setdefault(word[0], []).append(word)

    return json.dumps({'vocab': grouped})
@app.route('/query', methods=['GET', 'POST'])
def query():
    """Run the submitted query and return the matching documents as JSON.

    Form fields (POST):
        rankedRetrieval: the string ``'true'`` selects ranked retrieval;
            any other value selects unranked retrieval.
        query: the query text passed to the stored query processor.

    The index state is read from ``app.config`` (``doc_id_files``,
    ``file_contents``, ``queryprocessor``, ``vocab``).

    Returns:
        A JSON string with ``doc_ids`` (the raw results), ``files``,
        ``contents``, ``ranked``, ``scores`` (filled only in ranked mode) and
        ``spell_corrected``. Requests that are not POST fall through and
        return ``None``.
    """
    if request.method != 'POST':
        return None

    id_to_file = app.config['doc_id_files']
    contents_by_file = app.config['file_contents']
    processor = app.config['queryprocessor']
    known_terms = app.config['vocab']

    use_ranking = request.form['rankedRetrieval'] == 'true'
    user_query = request.form['query']

    hits = processor.query(user_query, use_ranking)
    suggestion = processor.check_spelling(user_query, known_terms, use_ranking)

    matched_files = []
    matched_contents = {}
    hit_scores = []
    for hit in hits:
        if not use_ranking:
            # Unranked results are used directly as document ids.
            doc_id = hit
        else:
            # Ranked results are indexed as (doc id, score).
            doc_id = hit[0]
            hit_scores.append(hit[1])
        name = id_to_file[doc_id]
        matched_files.append(name)
        matched_contents[name] = contents_by_file[name]

    response = {
        'doc_ids': hits,
        'files': matched_files,
        'contents': matched_contents,
        'ranked': use_ranking,
        'scores': hit_scores,
        'spell_corrected': suggestion
    }
    return json.dumps(response)
@app.route('/stem', methods=['GET', 'POST'])
def stem():
    """Return the submitted term together with its stem.

    Form fields (POST):
        term: the word to stem.

    Returns:
        A JSON string ``{'term': <input>, 'stemmed_term': <stem>}``.
        Requests that are not POST fall through and return ``None``.
    """
    if request.method != 'POST':
        return None

    # This view is itself named ``stem``, so it rebinds the module-level
    # ``stem`` imported from ``normalize``. Calling ``stem(term)`` here would
    # invoke this zero-argument view and raise TypeError. Import the stemmer
    # under a distinct local name instead.
    from normalize import stem as stem_term

    term = request.form['term']
    return json.dumps({
        'term': term,
        'stemmed_term': stem_term(term)
    })
if __name__ == "__main__":
    # Start the Flask server with debug mode off when run as a script.
    app.run(debug=False)