-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindexer.py
94 lines (78 loc) · 3.45 KB
/
indexer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import xapian
try:
import json
except ImportError:
import simplejson as json
import helper
class PluginIndexer(object):
    """Adds plugin records to a writable Xapian database."""

    def __init__(self, db_path):
        """Open the Xapian database at ``db_path`` for writing.

        The database is opened with ``xapian.DB_CREATE_OR_OPEN``.
        """
        self.db = xapian.WritableDatabase(db_path, xapian.DB_CREATE_OR_OPEN)

    def index(self, name, author, author_email, description):
        """Build one Xapian document for a plugin and add it to the database.

        The original (case-preserved) field values are stored as a JSON
        payload on the document; lower-cased copies are used for the search
        terms and for the value slots.

        Every call adds a new document; nothing here checks whether a
        document for the same plugin is already present.
        """
        document = xapian.Document()

        # Snapshot the fields before they are lower-cased so the payload
        # keeps the caller's original spelling.
        record = {'name': name,
                  'author': author,
                  'author_email': author_email,
                  'description': description}

        name, author, author_email, description = [
            field.lower()
            for field in (name, author, author_email, description)
        ]

        # The document data is an arbitrary string that is handed back with
        # each search hit.  It can hold as much or as little as you like:
        # just the name (then look the rest up in the master DB), or, as
        # here, a cached copy of the whole record.
        document.set_data(json.dumps(record))

        # Two term generators feed the same document: one applies the
        # English stemmer, the other has no stemmer configured.
        stemmed_terms = xapian.TermGenerator()
        english_stemmer = xapian.Stem("english")
        stemmed_terms.set_stemmer(english_stemmer)
        plain_terms = xapian.TermGenerator()
        stemmed_terms.set_document(document)
        plain_terms.set_document(document)

        plain_terms.index_text(name)
        stemmed_terms.index_text(description)

        # Weight the name: add the whole lower-cased name as a term with an
        # increment of 10.
        document.add_term(name, 10)

        for text in (author, author_email, description):
            plain_terms.index_text(text)

        # Store the lower-cased fields in the value slots defined by helper.
        slot_values = (
            (helper.NAME_SLOT, name),
            (helper.AUTHOR_SLOT, author),
            (helper.AUTHOR_EMAIL_SLOT, author_email),
        )
        for slot, value in slot_values:
            document.add_value(slot, value)

        # Add the finished document to the database.
        self.db.add_document(document)
def populate_test_data(db_path='xapian_test'):
    """Index a handful of sample plugin records into a Xapian database.

    ``db_path`` is the database location handed to ``PluginIndexer``; it
    defaults to ``'xapian_test'``.

    Remember that if the database already exists, running this again adds
    duplicate documents.  There are two strategies for updating index data:

    1. Always build a fresh database, then erase the old one and move the
       new one into its place.
    2. Check whether a document already exists (this needs a boolean match
       on the primary id, which is the name in this case) and update that
       document instead of adding a new one.
    """
    indexer = PluginIndexer(db_path)

    # NOTE(review): the author_email values below look like placeholder or
    # obfuscated text rather than real addresses -- confirm before relying
    # on them in searches.
    test_data = [{'name': 'test1_plugin',
                  'author': 'John (J5) Palmieri',
                  'author_email': '[email protected]',
                  'description': 'This is a test plugin. The first one in fact. I hope you like it!'
                  },
                 {'name': 'test2_plugin',
                  'author': 'John (J5) Palmieri',
                  'author_email': '[email protected]',
                  'description': 'This is a test plugin. The second one in fact. I hope you like it!'
                  },
                 {'name': 'Foo',
                  'author': 'Jasper St. Pierre',
                  'author_email': '[email protected]',
                  'description': 'FooBar? YouBar! How do you like them apples?'
                  },
                 {'name': 'Bar',
                  'author': 'Jasper St. Pierre',
                  'author_email': '[email protected]',
                  'description': 'Apples? I like oranges!'
                  }]

    for data in test_data:
        # A single parenthesised argument prints the same text whether
        # ``print`` is the Python 2 statement or the Python 3 function.
        print("Indexing %s" % data['name'])
        indexer.index(**data)
# Script entry point: populate the sample index only when this file is run
# directly, not when it is imported as a module.
if __name__ == "__main__":
    populate_test_data()