From 477da77b463baa9c2326c763911ecf8b46e1d84b Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Wed, 2 Dec 2015 11:38:51 +0000 Subject: [PATCH 1/4] Search: Add prefix matching support --- synapse/storage/search.py | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/synapse/storage/search.py b/synapse/storage/search.py index 20a62d07ffd3..0dfd7b9fb5a6 100644 --- a/synapse/storage/search.py +++ b/synapse/storage/search.py @@ -140,7 +140,10 @@ def search_msgs(self, room_ids, search_term, keys): list of dicts """ clauses = [] - args = [] + if isinstance(self.database_engine, PostgresEngine): + args = [_postgres_parse_query(search_term)] + else: + args = [_sqlite_parse_query(search_term)] # Make sure we don't explode because the person is in too many rooms. # We filter the results below regardless. @@ -162,7 +165,7 @@ def search_msgs(self, room_ids, search_term, keys): if isinstance(self.database_engine, PostgresEngine): sql = ( "SELECT ts_rank_cd(vector, query) AS rank, room_id, event_id" - " FROM plainto_tsquery('english', ?) as query, event_search" + " FROM to_tsquery('english', ?) as query, event_search" " WHERE vector @@ query" ) elif isinstance(self.database_engine, Sqlite3Engine): @@ -183,7 +186,7 @@ def search_msgs(self, room_ids, search_term, keys): sql += " ORDER BY rank DESC LIMIT 500" results = yield self._execute( - "search_msgs", self.cursor_to_dict, sql, *([search_term] + args) + "search_msgs", self.cursor_to_dict, sql, *args ) results = filter(lambda row: row["room_id"] in room_ids, results) @@ -226,7 +229,11 @@ def search_rooms(self, room_ids, search_term, keys, limit, pagination_token=None list of dicts """ clauses = [] - args = [search_term] + + if isinstance(self.database_engine, PostgresEngine): + args = [_postgres_parse_query(search_term)] + else: + args = [_sqlite_parse_query(search_term)] # Make sure we don't explode because the person is in too many rooms. 
# We filter the results below regardless. @@ -263,7 +270,7 @@ def search_rooms(self, room_ids, search_term, keys, limit, pagination_token=None sql = ( "SELECT ts_rank_cd(vector, query) as rank," " origin_server_ts, stream_ordering, room_id, event_id" - " FROM plainto_tsquery('english', ?) as query, event_search" + " FROM to_tsquery('english', ?) as query, event_search" " NATURAL JOIN events" " WHERE vector @@ query AND " ) @@ -399,3 +406,23 @@ def _to_postgres_options(options_dict): return "'%s'" % ( ",".join("%s=%s" % (k, v) for k, v in options_dict.items()), ) + + +def _postgres_parse_query(search_term): + """Takes a plain unicode string from the user and converts it into a form + that can be passed to `to_tsquery(..)` postgres func. We use this so that + we can add prefix matching, which isn't something `plainto_tsquery` supports. + """ + results = re.findall(r"([\w\-]+)", search_term, re.UNICODE) + + return " & ".join(result + ":*" for result in results) + + +def _sqlite_parse_query(search_term): + """Takes a plain unicode string from the user and converts it into a form + that can be passed to sqlite `MATCH`. We use this so that we can do prefix + matching. + """ + results = re.findall(r"([\w\-]+)", search_term, re.UNICODE) + + return " & ".join(result + "*" for result in results) From 7dd6e5efca99fc17fa13225ed3d235931da315c9 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Wed, 2 Dec 2015 13:09:37 +0000 Subject: [PATCH 2/4] Remove deduplication. Add comment about regex. 
--- synapse/storage/search.py | 32 +++++++++++--------------------- 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/synapse/storage/search.py b/synapse/storage/search.py index 0dfd7b9fb5a6..4738bdd503c0 100644 --- a/synapse/storage/search.py +++ b/synapse/storage/search.py @@ -140,10 +140,7 @@ def search_msgs(self, room_ids, search_term, keys): list of dicts """ clauses = [] - if isinstance(self.database_engine, PostgresEngine): - args = [_postgres_parse_query(search_term)] - else: - args = [_sqlite_parse_query(search_term)] + args = [_parse_query(self.database_engine, search_term)] # Make sure we don't explode because the person is in too many rooms. # We filter the results below regardless. @@ -230,10 +227,7 @@ def search_rooms(self, room_ids, search_term, keys, limit, pagination_token=None """ clauses = [] - if isinstance(self.database_engine, PostgresEngine): - args = [_postgres_parse_query(search_term)] - else: - args = [_sqlite_parse_query(search_term)] + args = [_parse_query(self.database_engine, search_term)] # Make sure we don't explode because the person is in too many rooms. # We filter the results below regardless. @@ -408,21 +402,17 @@ def _to_postgres_options(options_dict): ) -def _postgres_parse_query(search_term): +def _parse_query(database_engine, search_term): """Takes a plain unicode string from the user and converts it into a form - that can be passed to `to_tsquery(..)` postgres func. We use this so that - we can add prefix matching, which isn't something `plainto_tsquery` supports. + that can be passed to database. + We use this so that we can add prefix matching, which isn't something + that is supported by default. """ - results = re.findall(r"([\w\-]+)", search_term, re.UNICODE) - return " & ".join(result + ":*" for result in results) - - -def _sqlite_parse_query(search_term): - """Takes a plain unicode string from the user and converts it into a form - that can be passed to sqlite `MATCH`. 
We use this so that we can do prefix - matching. - """ + # Pull out the individual words, discarding any non-word characters. results = re.findall(r"([\w\-]+)", search_term, re.UNICODE) - return " & ".join(result + "*" for result in results) + if isinstance(database_engine, PostgresEngine): + return " & ".join(result + ":*" for result in results) + else: + return " & ".join(result + "*" for result in results) From b9acef53015fb112c8b888d3d184388f9d030b01 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Wed, 2 Dec 2015 13:28:13 +0000 Subject: [PATCH 3/4] Fix so highlight matching works again --- synapse/storage/search.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/synapse/storage/search.py b/synapse/storage/search.py index 4738bdd503c0..fd7b688cf5c9 100644 --- a/synapse/storage/search.py +++ b/synapse/storage/search.py @@ -140,7 +140,10 @@ def search_msgs(self, room_ids, search_term, keys): list of dicts """ clauses = [] - args = [_parse_query(self.database_engine, search_term)] + + search_query = search_query = _parse_query(self.database_engine, search_term) + + args = [search_query] # Make sure we don't explode because the person is in too many rooms. # We filter the results below regardless. @@ -197,7 +200,7 @@ def search_msgs(self, room_ids, search_term, keys): highlights = None if isinstance(self.database_engine, PostgresEngine): - highlights = yield self._find_highlights_in_postgres(search_term, events) + highlights = yield self._find_highlights_in_postgres(search_query, events) defer.returnValue({ "results": [ @@ -227,7 +230,9 @@ def search_rooms(self, room_ids, search_term, keys, limit, pagination_token=None """ clauses = [] - args = [_parse_query(self.database_engine, search_term)] + search_query = search_query = _parse_query(self.database_engine, search_term) + + args = [search_query] # Make sure we don't explode because the person is in too many rooms. # We filter the results below regardless. 
@@ -314,7 +319,7 @@ def search_rooms(self, room_ids, search_term, keys, limit, pagination_token=None highlights = None if isinstance(self.database_engine, PostgresEngine): - highlights = yield self._find_highlights_in_postgres(search_term, events) + highlights = yield self._find_highlights_in_postgres(search_query, events) defer.returnValue({ "results": [ @@ -331,7 +336,7 @@ def search_rooms(self, room_ids, search_term, keys, limit, pagination_token=None "highlights": highlights, }) - def _find_highlights_in_postgres(self, search_term, events): + def _find_highlights_in_postgres(self, search_query, events): """Given a list of events and a search term, return a list of words that match from the content of the event. @@ -339,7 +344,7 @@ def _find_highlights_in_postgres(self, search_term, events): highlight the matching parts. Args: - search_term (str) + search_query (str) events (list): A list of events Returns: @@ -371,14 +376,14 @@ def f(txn): while stop_sel in value: stop_sel += ">" - query = "SELECT ts_headline(?, plainto_tsquery('english', ?), %s)" % ( + query = "SELECT ts_headline(?, to_tsquery('english', ?), %s)" % ( _to_postgres_options({ "StartSel": start_sel, "StopSel": stop_sel, "MaxFragments": "50", }) ) - txn.execute(query, (value, search_term,)) + txn.execute(query, (value, search_query,)) headline, = txn.fetchall()[0] # Now we need to pick the possible highlights out of the haedline From 976cb5aaa8e052a27b1dc249798e12c80e8aa329 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Wed, 2 Dec 2015 13:50:43 +0000 Subject: [PATCH 4/4] Throw if unrecognized DB type --- synapse/storage/search.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/synapse/storage/search.py b/synapse/storage/search.py index fd7b688cf5c9..39f600f53ce9 100644 --- a/synapse/storage/search.py +++ b/synapse/storage/search.py @@ -419,5 +419,8 @@ def _parse_query(database_engine, search_term): if isinstance(database_engine, PostgresEngine): return " & ".join(result + 
":*" for result in results) - else: + elif isinstance(database_engine, Sqlite3Engine): return " & ".join(result + "*" for result in results) + else: + # This should be unreachable. + raise Exception("Unrecognized database engine")