search_index.py

# encoding=utf-8
from __future__ import unicode_literals
import os
import re

from lxml import etree
from whoosh import highlight
from whoosh.analysis import CharsetFilter, RegexTokenizer
from whoosh.fields import Schema, ID, NUMERIC, TEXT
from whoosh.index import create_in, open_dir
from whoosh.query import And, Term
from whoosh.qparser import QueryParser
from whoosh.support.charset import accent_map

# Fold accents so that e.g. a search for "cafe" also matches "café"
analyzer = RegexTokenizer() | CharsetFilter(accent_map)

schema = Schema(
    bookname=ID(stored=True),
    pagenum=NUMERIC(stored=True),
    content=TEXT(analyzer=analyzer, stored=True)
)

# os.path.join(..., '') ensures a trailing path separator
BOOK_PATH = os.path.join(os.path.expanduser('~/books'), '')
INDEX_PATH = os.path.join(BOOK_PATH, '.index')
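
# Assumed on-disk layout (inferred from the paths above): each book lives in
# its own directory under BOOK_PATH, e.g. ~/books/<bookname>/<bookname>.hocr,
# and the Whoosh index is kept alongside the books in ~/books/.index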


class StringFormatter(highlight.Formatter):
    """Highlight formatter that wraps every matched token in fixed markers."""

    def __init__(self, begin_str, end_str):
        self.begin_str = begin_str
        self.end_str = end_str

    def format_token(self, text, token, replace=False):
        tokentext = highlight.get_text(text, token, replace)
        return "{0}{1}{2}".format(self.begin_str, tokentext, self.end_str)


def _get_index():
    """Open the Whoosh index, creating it on first use."""
    if not os.path.exists(INDEX_PATH):
        os.mkdir(INDEX_PATH)
        ix = create_in(INDEX_PATH, schema)
    else:
        ix = open_dir(INDEX_PATH)
    return ix


def index_book(bookname):
    """(Re-)index all pages of a book from its hOCR file."""
    # TODO: Index by paragraph, not page
    idx = _get_index()
    writer = idx.writer()
    # Remove any stale documents for this book before re-indexing
    writer.delete_by_term('bookname', unicode(bookname))
    path = os.path.join(BOOK_PATH, bookname, "{0}.hocr".format(bookname))
    bookname = unicode(os.path.splitext(os.path.basename(path))[0])
    booktree = etree.parse(path)
    for page in booktree.xpath('//div[@class="ocr_page"]'):
        # Get cleaned-up text for the page, one line per ocr_line span
        text = "\n".join("".join(x.itertext()).strip()
                         for x in page.xpath('.//span[@class="ocr_line"]'))
        # Page ids look like "page_<num>"; strip the "page_" prefix
        pagenum = int(page.get('id')[5:])
        writer.add_document(bookname=bookname, pagenum=pagenum, content=text)
    writer.commit()


def search(term, bookname=None, limit=None):
    """Search the index, optionally restricted to a single book.

    Returns a list of dicts with the book name, page number, a highlighted
    snippet and the bounding boxes of the matched words on the page.
    """
    out_list = []
    with _get_index().searcher() as searcher:
        parser = QueryParser("content", schema=schema)
        query = parser.parse(term)
        if bookname:
            query = And([query, Term("bookname", unicode(bookname))])
        results = searcher.search(query, limit=limit)
        # charlimit=None lifts Whoosh's default cap on how much of the
        # stored text is scanned for fragments, so long pages still
        # produce snippets
        results.fragmenter.charlimit = None
        results.fragmenter.maxchars = 300
        results.fragmenter.surround = 50
        # Wrap matches in {{{...}}} so _get_highlights can find them again
        results.formatter = StringFormatter('{{{', '}}}')
        for hit in results:
            out_list.append({
                'bookname': hit['bookname'],
                'pagenum': hit['pagenum'],
                'snippet': hit.highlights("content"),
                'highlights': _get_highlights(hit)
            })
    return out_list


def _get_highlights(result):
    """Return the bounding boxes of all highlighted words on the hit's page."""
    # FIXME: This is f*****ing slow...
    highlights = []
    fname = os.path.join(BOOK_PATH, result['bookname'],
                         "{0}.hocr".format(result['bookname']))
    tree = etree.parse(fname)
    page = tree.xpath('//div[@id="page_{0}"]'.format(result['pagenum']))[0]
    # Collect the terms that the formatter wrapped in {{{...}}} markers
    hl_tokens = set(re.findall(r'{{{([^{}]+)}}}',
                               result.highlights("content")))
    for token in hl_tokens:
        occurrences = [x for x in page.xpath('.//span[@class="ocrx_word"]')
                       if "".join(x.itertext())
                       and token.lower() in "".join(x.itertext()).lower()]
        for hit in occurrences:
            # hOCR stores word coordinates in the title attribute,
            # e.g. "bbox x0 y0 x1 y1"
            highlights.append(tuple(hit.get('title').replace('bbox ', '')
                                    .split(' ')))
    return tuple(highlights)
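

# A minimal usage sketch (the book name and query term are hypothetical;
# assumes ~/books/moby_dick/moby_dick.hocr exists):
#
#     index_book('moby_dick')
#     for hit in search('whale', bookname='moby_dick', limit=10):
#         print hit['bookname'], hit['pagenum'], hit['snippet']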