diff --git a/moggie/search/engine.py b/moggie/search/engine.py index 149b92eac..9b0dd7f47 100644 --- a/moggie/search/engine.py +++ b/moggie/search/engine.py @@ -50,7 +50,7 @@ def explain_ops(ops): class PostingListBucket: """ A PostingListBucket is an unsorted sequence of binary packed - (keyword, IntSet) pairs. + (keyword, comment, IntSet) tuples. """ def __init__(self, blob, deleted=None, compress=None): self.blob = blob @@ -60,40 +60,57 @@ def __init__(self, blob, deleted=None, compress=None): def __iter__(self): beg = 0 while beg < len(self.blob): - kw_len, iset_len = struct.unpack('II', self.blob[beg:beg+8]) - end = beg + 8 + kw_len + iset_len - kw = self.blob[beg+8:beg+8+kw_len] + kw_ln, c_ln, iset_ln = struct.unpack('= self.l2_begin: - raise KeyError('Mutations not supported in l2') - + changes = [] with self.lock: - for op, idx in op_idx_kw_list: - iset = self.records[idx] - self.records[idx] = op(iset, mset) + for op, kw, idx in op_idx_kw_list: + plb = PostingListBucket(self.records.get(idx) or b'') + + iset = plb.get(kw) + oset = op(iset, mset) + + plb.set(kw, oset) + self.records[idx] = plb.blob + changes.append((idx, iset, oset)) - return {'mutations': len(op_idx_kw_list)} + return {'mutations': len(op_idx_kw_list), 'changes': changes} def profile_updates(self, which, t0, t1, t2, t3): p1 = int((t1 - t0) * 1000) @@ -406,24 +490,11 @@ def del_results(self, results, tag_namespace=''): t1 = time.time() for idx, kw in sorted(kw_idx_list): with self.lock: - if idx < self.l2_begin: - # These are instances of IntSet, de/serialization is done - # automatically by dumbcode. 
- try: - iset = self.records[idx] - iset -= keywords[kw] - iset -= self.deleted - self.records[idx] = iset - except: - logging.exception('Ugh, kw=%s idx=%s iset=%s' % (kw, idx, iset)) - raise - else: - # These are instances of PostingList - plb = PostingListBucket(self.records.get(idx) or b'') - plb.deleted = IntSet(copy=self.deleted) - plb.deleted |= keywords[kw] - plb.add(kw) - self.records[idx] = plb.blob + plb = PostingListBucket(self.records.get(idx) or b'') + plb.deleted = IntSet(copy=self.deleted) + plb.deleted |= keywords[kw] + plb.add(kw, []) + self.records[idx] = plb.blob t2 = time.time() self.update_terms(keywords) self.profile_updates('-%d' % len(kw_idx_list), t0, t1, t2, time.time()) @@ -436,35 +507,20 @@ def add_results(self, results, prefer_l1=None, tag_namespace=''): t1 = time.time() for idx, kw in sorted(kw_idx_list): with self.lock: - if idx < self.l2_begin: - try: - # These are instances of IntSet, de/serialization is done - # automatically by dumbcode. - iset = self.records[idx] - iset |= keywords[kw] - iset -= self.deleted - self.records[idx] = iset - except: - logging.exception('Ugh, kw=%s idx=%s iset=%s' % (kw, idx, iset)) - raise - else: - # These are instances of PostingList - plb = PostingListBucket(self.records.get(idx) or b'') - plb.deleted = self.deleted - plb.add(kw, *keywords[kw]) - self.records[idx] = plb.blob + # These are instances of PostingList + plb = PostingListBucket(self.records.get(idx) or b'') + plb.deleted = self.deleted + plb.add(kw, keywords[kw]) + self.records[idx] = plb.blob t2 = time.time() - self.update_terms(keywords) + self.part_spaces[1] |= set(keywords.keys()) self.profile_updates('+%d' % len(kw_idx_list), t0, t1, t2, time.time()) return {'keywords': len(keywords), 'hits': hits} def __getitem__(self, keyword): idx = self.keyword_index(keyword) - if idx < self.l2_begin: - return self.records.get(idx) or IntSet() - else: - plb = PostingListBucket(self.records.get(idx) or b'') - return plb.get(keyword) or IntSet() + 
plb = PostingListBucket(self.records.get(idx) or b'') + return plb.get(keyword) or IntSet() def _search(self, term, tag_ns): if isinstance(term, tuple): @@ -521,6 +577,26 @@ def search(self, terms, tag_namespace='', mask_deleted=True, explain=False): rv = (tag_namespace, ops, rv) return rv + def search_tags(self, search_set, tag_namespace=''): + """ + Search for tags that match a search (terms or tuple) or result set + (IntSet or list of ints). + + Returns a dictionary of (tag => (comment, IntSet)) mappings. + """ + if isinstance(search_set, (tuple, str)): + search_set = self.search(search_set, tag_namespace=tag_namespace) + if not isinstance(search_set, IntSet): + iset = IntSet() + iset |= search_set + search_set = iset + results = {} + for tag, (bcom, iset) in self.iter_tags(tag_namespace=tag_namespace): + iset &= search_set + if iset: + results[tag] = (bcom, iset) + return results + def magic_terms(self, term): what = term.split(':')[0].lower() magic = self.magic_term_map.get(what) @@ -549,12 +625,12 @@ def magic_candidates(self, term): if __name__ == '__main__': try: pl = PostingListBucket(b'', compress=128) - pl.add('hello', 1, 2, 3, 4) + pl.add('hello', [1, 2, 3, 4]) assert(isinstance(pl.get('hello'), IntSet)) assert(pl.get('floop') is None) assert(1 in pl.get('hello')) assert(5 not in pl.get('hello')) - pl.add('hello', 5) + pl.add('hello', [5]) assert(1 in pl.get('hello')) assert(5 in pl.get('hello')) pl.remove('hello') @@ -584,15 +660,28 @@ def mk_se(): se.deleted |= 0 assert(list(se.search(IntSet.All)) == [1, 2, 3, 4, 5]) + assert(3 in se.search('please')) + assert(5 in se.search('please')) + assert(5 in se.search('please', tag_namespace='work')) + assert(3 not in se.search('please', tag_namespace='work')) + # Make sure tags go to l1, others to l2. 
assert(se.keyword_index('in:bjarni') < se.l2_begin) assert(se.keyword_index('in:inbox') < se.l2_begin) assert(se.keyword_index('please') >= se.l2_begin) - assert(3 in se.search('please')) - assert(5 in se.search('please')) - assert(5 in se.search('please', tag_namespace='work')) - assert(3 not in se.search('please', tag_namespace='work')) + # We can enumerate our tags and set metadata on them! + se.set_tag_comment('in:bjarni', 'Hello world') + assert(se.get_tag('in:bjarni')[0] == b'Hello world') + assert('in:bjarni' in dict(se.iter_tags())) + assert('in:inbox@work' in dict(se.iter_tags())) + assert('in:bjarni' not in dict(se.iter_tags(tag_namespace='work'))) + assert('in:inbox' in dict(se.iter_tags(tag_namespace='work'))) + assert('in:inbox@work' not in dict(se.iter_tags(tag_namespace='work'))) + assert(not se.search_tags([55])) + assert('in:inbox' in se.search_tags([4, 55])) + assert('in:inbox' not in se.search_tags('please')) + assert('in:inbox' in se.search_tags('please', tag_namespace='work')) assert(3 in se.search('remove')) se.del_results([(3, ['please'])]) @@ -604,19 +693,18 @@ def mk_se(): assert(4 not in se.search('all:mail', tag_namespace='work')) assert(3 not in se.search('in:inbox')) assert(4 in se.search('in:testing')) - se.mutate(IntSet([4, 3]), [(IntSet.Sub, 'in:testing'), (IntSet.Or, 'in:inbox')]) + + mr = se.mutate(IntSet([4, 3]), + [(IntSet.Sub, 'in:testing'), (IntSet.Or, 'in:inbox')]) + assert(mr['mutations'] == 2) + assert([4] == list(mr['changes'][0][1])) assert(4 not in se.search('in:testing')) assert(3 in se.search('in:inbox')) assert(4 in se.search('in:inbox')) assert(4 not in se.search('in:inbox', tag_namespace='work')) - se.rename_l1('in:inbox', 'in:outbox') + se.rename_tag('in:inbox', 'in:outbox') assert(4 in se.search('in:outbox')) assert(4 not in se.search('in:inbox')) - try: - se.mutate(IntSet([4, 3]), [(IntSet.Sub, 'hello'), (IntSet.Or, 'world')]) - assert(not 'reached') - except KeyError: - pass # Test reducing a set to empty and 
then adding back to it se.del_results([(4, ['in:testempty'])]) diff --git a/moggie/search/extractor.py b/moggie/search/extractor.py index 1e30bc7fc..295658e98 100644 --- a/moggie/search/extractor.py +++ b/moggie/search/extractor.py @@ -86,10 +86,16 @@ def words(self, txt, strip_urls=True, url_domains=None): if url_domains: txt += '\n' + '\n'.join(url_domains) + ltxt = txt.lower() + wordlist = WORD_REGEXP.findall(ltxt) words = set( - w for w in WORD_REGEXP.findall(txt.lower()) + w for w in wordlist if self.min_word_length <= len(w) <= self.max_word_length) + for i in range(0, len(wordlist) - 1): + if (len(wordlist[i]) <= 3) or (len(wordlist[i+1]) <= 3): + words.add('%s %s' % (wordlist[i], wordlist[i+1])) + return (url_domains | words) - self.stoplist def body_text_keywords(self, parsed_email): @@ -264,6 +270,10 @@ def extract_email_keywords(self, metadata, parsed_email): assert('bre@example.org' in keywords) assert('bre2@example.org' in keywords) assert('www.example.org' in keywords) + assert('er auðvitað' in keywords) + assert('þetta er' in keywords) + assert('og svo' in keywords) + assert('svo er' in keywords) print('Tests passed OK') except: print('Keywords:\n\t%s' % '\n\t'.join(sorted(list(keywords)))) diff --git a/moggie/search/parse_greedy.py b/moggie/search/parse_greedy.py index 900fd90d5..b9ad08554 100644 --- a/moggie/search/parse_greedy.py +++ b/moggie/search/parse_greedy.py @@ -26,6 +26,9 @@ Note that the search engine itself will then handle globbing of individual keywords, so searching for "hell* world" might become equivalent to "(hell OR hello OR hellsite) AND world". + +Small words (<4 letters) are considered to be part of both the preceding +and following words: "hello my world" becomes "hello my" AND "my world". 
""" import re @@ -45,6 +48,24 @@ def _flat(search): else: return tuple(search) + def _make_pairs(srch): + op = srch[0] + if (len(srch) < 3) or (op not in (IntSet.And, IntSet.Or)): + return srch + for i in reversed(range(1, len(srch) - 1)): + if (isinstance(srch[i], str) and isinstance(srch[i+1], str) + and ('*' not in srch[i]) + and ('*' not in srch[i+1]) + and (':' not in srch[i]) + and (':' not in srch[i+1]) + and (' ' not in srch[i+1]) + and ((len(srch[i]) < 4) or (len(srch[i+1]) < 4))): + if ((i == 1) or len(srch[i]) >= 4) and (op == IntSet.And): + srch[i:i+2] = ['%s %s' % (srch[i], srch[i+1])] + else: + srch[i:i+2] = [srch[i], '%s %s' % (srch[i], srch[i+1])] + return srch + search_stack = [[IntSet.And]] changed = False for term in terms: @@ -57,7 +78,7 @@ def _flat(search): if len(search_stack) > 1: changed = True done = search_stack.pop(-1) - search_stack[-1].append(tuple(done)) + search_stack[-1].append(tuple(_make_pairs(done))) elif term in ('*', 'ALL'): search_stack[-1].append(IntSet.All) @@ -99,14 +120,18 @@ def _flat(search): # Close all dangling parens while len(search_stack) > 1: done = search_stack.pop(-1) - search_stack[-1].append(tuple(done)) + search_stack[-1].append(tuple(_make_pairs(done))) - return _flat(search_stack[-1]) + return _flat(_make_pairs(search_stack[-1])) if __name__ == '__main__': + import sys + if sys.argv[1:]: + print('%s' % (greedy_parse_terms(' '.join(sys.argv[1:])),)) + assert(greedy_parse_terms('yes hello world') - == (IntSet.And, 'yes', 'hello', 'world')) + == (IntSet.And, 'yes hello', 'world')) assert(greedy_parse_terms('And AND hello +world +iceland') == (IntSet.Or, (IntSet.And, 'and', 'hello'), 'world', 'iceland')) @@ -138,6 +163,3 @@ def swapper_many(kw): == (IntSet.And, 'yes', (IntSet.Or, 'lo:hel', 'hel:lo'), 'world')) print('Tests passed OK') - import sys - if sys.argv[1:]: - print('%s' % (greedy_parse_terms(' '.join(sys.argv[1:])),)) diff --git a/moggie/util/intset.py b/moggie/util/intset.py index fea3b55f9..b775dccc2 
100644 --- a/moggie/util/intset.py +++ b/moggie/util/intset.py @@ -224,6 +224,13 @@ def __iter__(self): if (u64 & (1 << j)): yield (i * self.bits) + j + def __bool__(self): + for i in range(0, len(self.npa)): + u64 = int(self.npa[i]) + if u64: + return True + return False + def count(self): return sum(1 for hit in self) @@ -258,6 +265,7 @@ def count(self): assert(len(is1.tobytes()) == (is1.DEF_INIT * is1.bits // 8)) a100 = IntSet.All(100) + assert(bool(a100)) assert(99 in a100) assert(100 not in a100) assert(len(list(a100)) == 100)