From 220af1df54c3b4e5f1d7ec8fb62b375c99a2bbe2 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Wed, 2 Nov 2022 12:28:24 +0000 Subject: [PATCH 1/4] Revert "Switch search SQL to triple-quote strings. (#14311)" This reverts commit 81815e0561eea91dbf0c29731589fac2e6f98a40. --- synapse/storage/databases/main/search.py | 188 +++++++++++------------ 1 file changed, 89 insertions(+), 99 deletions(-) diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index e9588d175518..594b935614f7 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -80,11 +80,11 @@ def store_search_entries_txn( if not self.hs.config.server.enable_search: return if isinstance(self.database_engine, PostgresEngine): - sql = """ - INSERT INTO event_search - (event_id, room_id, key, vector, stream_ordering, origin_server_ts) - VALUES (?,?,?,to_tsvector('english', ?),?,?) - """ + sql = ( + "INSERT INTO event_search" + " (event_id, room_id, key, vector, stream_ordering, origin_server_ts)" + " VALUES (?,?,?,to_tsvector('english', ?),?,?)" + ) args1 = ( ( @@ -101,20 +101,20 @@ def store_search_entries_txn( txn.execute_batch(sql, args1) elif isinstance(self.database_engine, Sqlite3Engine): - self.db_pool.simple_insert_many_txn( - txn, - table="event_search", - keys=("event_id", "room_id", "key", "value"), - values=( - ( - entry.event_id, - entry.room_id, - entry.key, - _clean_value_for_search(entry.value), - ) - for entry in entries - ), + sql = ( + "INSERT INTO event_search (event_id, room_id, key, value)" + " VALUES (?,?,?,?)" + ) + args2 = ( + ( + entry.event_id, + entry.room_id, + entry.key, + _clean_value_for_search(entry.value), + ) + for entry in entries ) + txn.execute_batch(sql, args2) else: # This should be unreachable. @@ -162,17 +162,15 @@ async def _background_reindex_search( TYPES = ["m.room.name", "m.room.message", "m.room.topic"] def reindex_search_txn(txn: LoggingTransaction) -> int: - sql = """ - SELECT stream_ordering, event_id, room_id, type, json, origin_server_ts - FROM events - JOIN event_json USING (room_id, event_id) - WHERE ? <= stream_ordering AND stream_ordering < ? - AND (%s) - ORDER BY stream_ordering DESC - LIMIT ? - """ % ( - " OR ".join("type = '%s'" % (t,) for t in TYPES), - ) + sql = ( + "SELECT stream_ordering, event_id, room_id, type, json, " + " origin_server_ts FROM events" + " JOIN event_json USING (room_id, event_id)" + " WHERE ? <= stream_ordering AND stream_ordering < ?" + " AND (%s)" + " ORDER BY stream_ordering DESC" + " LIMIT ?" + ) % (" OR ".join("type = '%s'" % (t,) for t in TYPES),) txn.execute(sql, (target_min_stream_id, max_stream_id, batch_size)) @@ -286,10 +284,8 @@ def create_index(conn: LoggingDatabaseConnection) -> None: try: c.execute( - """ - CREATE INDEX CONCURRENTLY event_search_fts_idx - ON event_search USING GIN (vector) - """ + "CREATE INDEX CONCURRENTLY event_search_fts_idx" + " ON event_search USING GIN (vector)" ) except psycopg2.ProgrammingError as e: logger.warning( @@ -327,16 +323,12 @@ def create_index(conn: LoggingDatabaseConnection) -> None: # We create with NULLS FIRST so that when we search *backwards* # we get the ones with non null origin_server_ts *first* c.execute( - """ - CREATE INDEX CONCURRENTLY event_search_room_order - ON event_search(room_id, origin_server_ts NULLS FIRST, stream_ordering NULLS FIRST) - """ + "CREATE INDEX CONCURRENTLY event_search_room_order ON event_search(" + "room_id, origin_server_ts NULLS FIRST, stream_ordering NULLS FIRST)" ) c.execute( - """ - CREATE INDEX CONCURRENTLY event_search_order - ON event_search(origin_server_ts NULLS FIRST, stream_ordering NULLS FIRST) - """ + "CREATE INDEX CONCURRENTLY event_search_order ON event_search(" + "origin_server_ts NULLS FIRST, stream_ordering NULLS FIRST)" ) conn.set_session(autocommit=False) @@ -353,14 +345,14 @@ def create_index(conn: LoggingDatabaseConnection) -> None: ) def reindex_search_txn(txn: LoggingTransaction) -> Tuple[int, bool]: - sql = """ - UPDATE event_search AS es - SET stream_ordering = e.stream_ordering, origin_server_ts = e.origin_server_ts - FROM events AS e - WHERE e.event_id = es.event_id - AND ? <= e.stream_ordering AND e.stream_ordering < ? - RETURNING es.stream_ordering - """ + sql = ( + "UPDATE event_search AS es SET stream_ordering = e.stream_ordering," + " origin_server_ts = e.origin_server_ts" + " FROM events AS e" + " WHERE e.event_id = es.event_id" + " AND ? <= e.stream_ordering AND e.stream_ordering < ?" + " RETURNING es.stream_ordering" + ) min_stream_id = max_stream_id - batch_size txn.execute(sql, (min_stream_id, max_stream_id)) @@ -464,33 +456,33 @@ async def search_msgs( if isinstance(self.database_engine, PostgresEngine): search_query = search_term tsquery_func = self.database_engine.tsquery_func - sql = f""" - SELECT ts_rank_cd(vector, {tsquery_func}('english', ?)) AS rank, - room_id, event_id - FROM event_search - WHERE vector @@ {tsquery_func}('english', ?) - """ + sql = ( + f"SELECT ts_rank_cd(vector, {tsquery_func}('english', ?)) AS rank," + " room_id, event_id" + " FROM event_search" + f" WHERE vector @@ {tsquery_func}('english', ?)" + ) args = [search_query, search_query] + args - count_sql = f""" - SELECT room_id, count(*) as count FROM event_search - WHERE vector @@ {tsquery_func}('english', ?) - """ + count_sql = ( + "SELECT room_id, count(*) as count FROM event_search" + f" WHERE vector @@ {tsquery_func}('english', ?)" + ) count_args = [search_query] + count_args elif isinstance(self.database_engine, Sqlite3Engine): search_query = _parse_query_for_sqlite(search_term) - sql = """ - SELECT rank(matchinfo(event_search)) as rank, room_id, event_id - FROM event_search - WHERE value MATCH ? - """ + sql = ( + "SELECT rank(matchinfo(event_search)) as rank, room_id, event_id" + " FROM event_search" + " WHERE value MATCH ?" + ) args = [search_query] + args - count_sql = """ - SELECT room_id, count(*) as count FROM event_search - WHERE value MATCH ? - """ + count_sql = ( + "SELECT room_id, count(*) as count FROM event_search" + " WHERE value MATCH ?" + ) count_args = [search_query] + count_args else: # This should be unreachable. @@ -596,27 +588,26 @@ async def search_rooms( raise SynapseError(400, "Invalid pagination token") clauses.append( - """ - (origin_server_ts < ? OR (origin_server_ts = ? AND stream_ordering < ?)) - """ + "(origin_server_ts < ?" + " OR (origin_server_ts = ? AND stream_ordering < ?))" ) args.extend([origin_server_ts, origin_server_ts, stream]) if isinstance(self.database_engine, PostgresEngine): search_query = search_term tsquery_func = self.database_engine.tsquery_func - sql = f""" - SELECT ts_rank_cd(vector, {tsquery_func}('english', ?)) as rank, - origin_server_ts, stream_ordering, room_id, event_id - FROM event_search - WHERE vector @@ {tsquery_func}('english', ?) AND - """ + sql = ( + f"SELECT ts_rank_cd(vector, {tsquery_func}('english', ?)) as rank," + " origin_server_ts, stream_ordering, room_id, event_id" + " FROM event_search" + f" WHERE vector @@ {tsquery_func}('english', ?) AND " + ) args = [search_query, search_query] + args - count_sql = f""" - SELECT room_id, count(*) as count FROM event_search - WHERE vector @@ {tsquery_func}('english', ?) AND - """ + count_sql = ( + "SELECT room_id, count(*) as count FROM event_search" + f" WHERE vector @@ {tsquery_func}('english', ?) AND " + ) count_args = [search_query] + count_args elif isinstance(self.database_engine, Sqlite3Engine): @@ -628,24 +619,23 @@ async def search_rooms( # in the events table to get the topological ordering. We need # to use the indexes in this order because sqlite refuses to # MATCH unless it uses the full text search index - sql = """ - SELECT - rank(matchinfo) as rank, room_id, event_id, origin_server_ts, stream_ordering - FROM ( - SELECT key, event_id, matchinfo(event_search) as matchinfo - FROM event_search - WHERE value MATCH ? + sql = ( + "SELECT rank(matchinfo) as rank, room_id, event_id," + " origin_server_ts, stream_ordering" + " FROM (SELECT key, event_id, matchinfo(event_search) as matchinfo" + " FROM event_search" + " WHERE value MATCH ?" + " )" + " CROSS JOIN events USING (event_id)" + " WHERE " ) - CROSS JOIN events USING (event_id) - WHERE - """ search_query = _parse_query_for_sqlite(search_term) args = [search_query] + args - count_sql = """ - SELECT room_id, count(*) as count FROM event_search - WHERE value MATCH ? AND - """ + count_sql = ( + "SELECT room_id, count(*) as count FROM event_search" + " WHERE value MATCH ? AND " + ) count_args = [search_query] + count_args else: # This should be unreachable. @@ -657,10 +647,10 @@ async def search_rooms( # We add an arbitrary limit here to ensure we don't try to pull the # entire table from the database. if isinstance(self.database_engine, PostgresEngine): - sql += """ - ORDER BY origin_server_ts DESC NULLS LAST, stream_ordering DESC NULLS LAST - LIMIT ? - """ + sql += ( + " ORDER BY origin_server_ts DESC NULLS LAST," + " stream_ordering DESC NULLS LAST LIMIT ?" + ) elif isinstance(self.database_engine, Sqlite3Engine): sql += " ORDER BY origin_server_ts DESC, stream_ordering DESC LIMIT ?" else: From 7f77f1386c94c753755b79d7d7025e958b228282 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Wed, 2 Nov 2022 12:28:47 +0000 Subject: [PATCH 2/4] Revert "Fix tests for change in PostgreSQL 14 behavior change. (#14310)" This reverts commit 67583281e3f8ea923eedbc56a4c85c7ba75d1582. --- synapse/storage/databases/main/search.py | 5 +++-- tests/storage/test_room_search.py | 16 ++++------------ 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index 594b935614f7..a89fc54c2cbe 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -824,8 +824,9 @@ def _tokenize_query(query: str) -> TokenList: in_phrase = False parts = deque(query.split('"')) for i, part in enumerate(parts): - # The contents inside double quotes is treated as a phrase. - in_phrase = bool(i % 2) + # The contents inside double quotes is treated as a phrase, a trailing + # double quote is not implied. + in_phrase = bool(i % 2) and i != (len(parts) - 1) # Pull out the individual words, discarding any non-word characters. words = deque(re.findall(r"([\w\-]+)", part, re.UNICODE)) diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py index 868b5bee8485..9ddc19900afc 100644 --- a/tests/storage/test_room_search.py +++ b/tests/storage/test_room_search.py @@ -239,6 +239,7 @@ class MessageSearchTest(HomeserverTestCase): ("fox -nope", (True, False)), ("fox -brown", (False, True)), ('"fox" quick', True), + ('"fox quick', True), ('"quick brown', True), ('" quick "', True), ('" nope"', False), @@ -268,15 +269,6 @@ def prepare( response = self.helper.send(self.room_id, self.PHRASE, tok=self.access_token) self.assertIn("event_id", response) - # The behaviour of a missing trailing double quote changed in PostgreSQL 14 - # from ignoring the initial double quote to treating it as a phrase. - main_store = homeserver.get_datastores().main - found = False - if isinstance(main_store.database_engine, PostgresEngine): - assert main_store.database_engine._version is not None - found = main_store.database_engine._version < 140000 - self.COMMON_CASES.append(('"fox quick', (found, True))) - def test_tokenize_query(self) -> None: """Test the custom logic to tokenize a user's query.""" cases = ( @@ -288,9 +280,9 @@ def test_tokenize_query(self) -> None: ("fox -brown", ["fox", SearchToken.Not, "brown"]), ("- fox", [SearchToken.Not, "fox"]), ('"fox" quick', [Phrase(["fox"]), SearchToken.And, "quick"]), - # No trailing double quote. - ('"fox quick', [Phrase(["fox", "quick"])]), - ('"-fox quick', [Phrase(["-fox", "quick"])]), + # No trailing double quoe. + ('"fox quick', ["fox", SearchToken.And, "quick"]), + ('"-fox quick', [SearchToken.Not, "fox", SearchToken.And, "quick"]), ('" quick "', [Phrase(["quick"])]), ( 'q"uick brow"n', From 7e0dd52782806592df61a1a517660b694515dc27 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Wed, 2 Nov 2022 12:29:34 +0000 Subject: [PATCH 3/4] Revert "Unified search query syntax using the full-text search capabilities of the underlying DB. (#11635)" This reverts commit d902181de98399d90c46c4e4e2cf631064757941. --- synapse/storage/databases/main/search.py | 197 +++------------- synapse/storage/engines/postgres.py | 16 -- .../73/10_update_sqlite_fts4_tokenizer.py | 62 ----- tests/storage/test_room_search.py | 213 ------------------ 4 files changed, 35 insertions(+), 453 deletions(-) delete mode 100644 synapse/storage/schema/main/delta/73/10_update_sqlite_fts4_tokenizer.py diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index a89fc54c2cbe..1b79acf95559 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -11,22 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import enum + import logging import re -from collections import deque -from dataclasses import dataclass -from typing import ( - TYPE_CHECKING, - Any, - Collection, - Iterable, - List, - Optional, - Set, - Tuple, - Union, -) +from typing import TYPE_CHECKING, Any, Collection, Iterable, List, Optional, Set, Tuple import attr @@ -39,7 +27,7 @@ LoggingTransaction, ) from synapse.storage.databases.main.events_worker import EventRedactBehaviour -from synapse.storage.engines import PostgresEngine, Sqlite3Engine +from synapse.storage.engines import BaseDatabaseEngine, PostgresEngine, Sqlite3Engine from synapse.types import JsonDict if TYPE_CHECKING: @@ -433,6 +421,8 @@ async def search_msgs( """ clauses = [] + search_query = _parse_query(self.database_engine, search_term) + args: List[Any] = [] # Make sure we don't explode because the person is in too many rooms. @@ -454,24 +444,20 @@ async def search_msgs( count_clauses = clauses if isinstance(self.database_engine, PostgresEngine): - search_query = search_term - tsquery_func = self.database_engine.tsquery_func sql = ( - f"SELECT ts_rank_cd(vector, {tsquery_func}('english', ?)) AS rank," + "SELECT ts_rank_cd(vector, to_tsquery('english', ?)) AS rank," " room_id, event_id" " FROM event_search" - f" WHERE vector @@ {tsquery_func}('english', ?)" + " WHERE vector @@ to_tsquery('english', ?)" ) args = [search_query, search_query] + args count_sql = ( "SELECT room_id, count(*) as count FROM event_search" - f" WHERE vector @@ {tsquery_func}('english', ?)" + " WHERE vector @@ to_tsquery('english', ?)" ) count_args = [search_query] + count_args elif isinstance(self.database_engine, Sqlite3Engine): - search_query = _parse_query_for_sqlite(search_term) - sql = ( "SELECT rank(matchinfo(event_search)) as rank, room_id, event_id" " FROM event_search" @@ -483,7 +469,7 @@ async def search_msgs( "SELECT room_id, count(*) as count FROM event_search" " WHERE value MATCH ?" ) - count_args = [search_query] + count_args + count_args = [search_term] + count_args else: # This should be unreachable. raise Exception("Unrecognized database engine") @@ -515,9 +501,7 @@ async def search_msgs( highlights = None if isinstance(self.database_engine, PostgresEngine): - highlights = await self._find_highlights_in_postgres( - search_query, events, tsquery_func - ) + highlights = await self._find_highlights_in_postgres(search_query, events) count_sql += " GROUP BY room_id" @@ -526,6 +510,7 @@ async def search_msgs( ) count = sum(row["count"] for row in count_results if row["room_id"] in room_ids) + return { "results": [ {"event": event_map[r["event_id"]], "rank": r["rank"]} @@ -557,6 +542,9 @@ async def search_rooms( Each match as a dictionary. """ clauses = [] + + search_query = _parse_query(self.database_engine, search_term) + args: List[Any] = [] # Make sure we don't explode because the person is in too many rooms. @@ -594,23 +582,20 @@ async def search_rooms( args.extend([origin_server_ts, origin_server_ts, stream]) if isinstance(self.database_engine, PostgresEngine): - search_query = search_term - tsquery_func = self.database_engine.tsquery_func sql = ( - f"SELECT ts_rank_cd(vector, {tsquery_func}('english', ?)) as rank," + "SELECT ts_rank_cd(vector, to_tsquery('english', ?)) as rank," " origin_server_ts, stream_ordering, room_id, event_id" " FROM event_search" - f" WHERE vector @@ {tsquery_func}('english', ?) AND " + " WHERE vector @@ to_tsquery('english', ?) AND " ) args = [search_query, search_query] + args count_sql = ( "SELECT room_id, count(*) as count FROM event_search" - f" WHERE vector @@ {tsquery_func}('english', ?) AND " + " WHERE vector @@ to_tsquery('english', ?) AND " ) count_args = [search_query] + count_args elif isinstance(self.database_engine, Sqlite3Engine): - # We use CROSS JOIN here to ensure we use the right indexes. # https://sqlite.org/optoverview.html#crossjoin # @@ -629,14 +614,13 @@ async def search_rooms( " CROSS JOIN events USING (event_id)" " WHERE " ) - search_query = _parse_query_for_sqlite(search_term) args = [search_query] + args count_sql = ( "SELECT room_id, count(*) as count FROM event_search" " WHERE value MATCH ? AND " ) - count_args = [search_query] + count_args + count_args = [search_term] + count_args else: # This should be unreachable. raise Exception("Unrecognized database engine") @@ -676,9 +660,7 @@ async def search_rooms( highlights = None if isinstance(self.database_engine, PostgresEngine): - highlights = await self._find_highlights_in_postgres( - search_query, events, tsquery_func - ) + highlights = await self._find_highlights_in_postgres(search_query, events) count_sql += " GROUP BY room_id" @@ -704,7 +686,7 @@ async def search_rooms( } async def _find_highlights_in_postgres( - self, search_query: str, events: List[EventBase], tsquery_func: str + self, search_query: str, events: List[EventBase] ) -> Set[str]: """Given a list of events and a search term, return a list of words that match from the content of the event. @@ -715,7 +697,6 @@ async def _find_highlights_in_postgres( Args: search_query events: A list of events - tsquery_func: The tsquery_* function to use when making queries Returns: A set of strings. @@ -748,7 +729,7 @@ def f(txn: LoggingTransaction) -> Set[str]: while stop_sel in value: stop_sel += ">" - query = f"SELECT ts_headline(?, {tsquery_func}('english', ?), %s)" % ( + query = "SELECT ts_headline(?, to_tsquery('english', ?), %s)" % ( _to_postgres_options( { "StartSel": start_sel, @@ -779,128 +760,20 @@ def _to_postgres_options(options_dict: JsonDict) -> str: return "'%s'" % (",".join("%s=%s" % (k, v) for k, v in options_dict.items()),) -@dataclass -class Phrase: - phrase: List[str] - - -class SearchToken(enum.Enum): - Not = enum.auto() - Or = enum.auto() - And = enum.auto() - - -Token = Union[str, Phrase, SearchToken] -TokenList = List[Token] - - -def _is_stop_word(word: str) -> bool: - # TODO Pull these out of the dictionary: - # https://github.com/postgres/postgres/blob/master/src/backend/snowball/stopwords/english.stop - return word in {"the", "a", "you", "me", "and", "but"} - - -def _tokenize_query(query: str) -> TokenList: - """ - Convert the user-supplied `query` into a TokenList, which can be translated into - some DB-specific syntax. - - The following constructs are supported: - - - phrase queries using "double quotes" - - case-insensitive `or` and `and` operators - - negation of a keyword via unary `-` - - unary hyphen to denote NOT e.g. 'include -exclude' - - The following differs from websearch_to_tsquery: - - - Stop words are not removed. - - Unclosed phrases are treated differently. - - """ - tokens: TokenList = [] - - # Find phrases. - in_phrase = False - parts = deque(query.split('"')) - for i, part in enumerate(parts): - # The contents inside double quotes is treated as a phrase, a trailing - # double quote is not implied. - in_phrase = bool(i % 2) and i != (len(parts) - 1) - - # Pull out the individual words, discarding any non-word characters. - words = deque(re.findall(r"([\w\-]+)", part, re.UNICODE)) - - # Phrases have simplified handling of words. - if in_phrase: - # Skip stop words. - phrase = [word for word in words if not _is_stop_word(word)] - - # Consecutive words are implicitly ANDed together. - if tokens and tokens[-1] not in (SearchToken.Not, SearchToken.Or): - tokens.append(SearchToken.And) - - # Add the phrase. - tokens.append(Phrase(phrase)) - continue - - # Otherwise, not in a phrase. - while words: - word = words.popleft() - - if word.startswith("-"): - tokens.append(SearchToken.Not) - - # If there's more word, put it back to be processed again. - word = word[1:] - if word: - words.appendleft(word) - elif word.lower() == "or": - tokens.append(SearchToken.Or) - else: - # Skip stop words. - if _is_stop_word(word): - continue - - # Consecutive words are implicitly ANDed together. - if tokens and tokens[-1] not in (SearchToken.Not, SearchToken.Or): - tokens.append(SearchToken.And) - - # Add the search term. - tokens.append(word) - - return tokens - - -def _tokens_to_sqlite_match_query(tokens: TokenList) -> str: - """ - Convert the list of tokens to a string suitable for passing to sqlite's MATCH. - Assume sqlite was compiled with enhanced query syntax. - - Ref: https://www.sqlite.org/fts3.html#full_text_index_queries +def _parse_query(database_engine: BaseDatabaseEngine, search_term: str) -> str: + """Takes a plain unicode string from the user and converts it into a form + that can be passed to database. + We use this so that we can add prefix matching, which isn't something + that is supported by default. """ - match_query = [] - for token in tokens: - if isinstance(token, str): - match_query.append(token) - elif isinstance(token, Phrase): - match_query.append('"' + " ".join(token.phrase) + '"') - elif token == SearchToken.Not: - # TODO: SQLite treats NOT as a *binary* operator. Hopefully a search - # term has already been added before this. - match_query.append(" NOT ") - elif token == SearchToken.Or: - match_query.append(" OR ") - elif token == SearchToken.And: - match_query.append(" AND ") - else: - raise ValueError(f"unknown token {token}") - - return "".join(match_query) + # Pull out the individual words, discarding any non-word characters. + results = re.findall(r"([\w\-]+)", search_term, re.UNICODE) -def _parse_query_for_sqlite(search_term: str) -> str: - """Takes a plain unicode string from the user and converts it into a form - that can be passed to sqllite's matchinfo(). - """ - return _tokens_to_sqlite_match_query(_tokenize_query(search_term)) + if isinstance(database_engine, PostgresEngine): + return " & ".join(result + ":*" for result in results) + elif isinstance(database_engine, Sqlite3Engine): + return " & ".join(result + "*" for result in results) + else: + # This should be unreachable. + raise Exception("Unrecognized database engine") diff --git a/synapse/storage/engines/postgres.py b/synapse/storage/engines/postgres.py index 9bf74bbf5920..d8c0f64d9a44 100644 --- a/synapse/storage/engines/postgres.py +++ b/synapse/storage/engines/postgres.py @@ -170,22 +170,6 @@ def supports_returning(self) -> bool: """Do we support the `RETURNING` clause in insert/update/delete?""" return True - @property - def tsquery_func(self) -> str: - """ - Selects a tsquery_* func to use. - - Ref: https://www.postgresql.org/docs/current/textsearch-controls.html - - Returns: - The function name. - """ - # Postgres 11 added support for websearch_to_tsquery. - assert self._version is not None - if self._version >= 110000: - return "websearch_to_tsquery" - return "plainto_tsquery" - def is_deadlock(self, error: Exception) -> bool: if isinstance(error, psycopg2.DatabaseError): # https://www.postgresql.org/docs/current/static/errcodes-appendix.html diff --git a/synapse/storage/schema/main/delta/73/10_update_sqlite_fts4_tokenizer.py b/synapse/storage/schema/main/delta/73/10_update_sqlite_fts4_tokenizer.py deleted file mode 100644 index 3de0a709eba7..000000000000 --- a/synapse/storage/schema/main/delta/73/10_update_sqlite_fts4_tokenizer.py +++ /dev/null @@ -1,62 +0,0 @@ -# Copyright 2022 The Matrix.org Foundation C.I.C. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import json - -from synapse.storage.engines import BaseDatabaseEngine, Sqlite3Engine -from synapse.storage.types import Cursor - - -def run_create(cur: Cursor, database_engine: BaseDatabaseEngine) -> None: - """ - Upgrade the event_search table to use the porter tokenizer if it isn't already - - Applies only for sqlite. - """ - if not isinstance(database_engine, Sqlite3Engine): - return - - # Rebuild the table event_search table with tokenize=porter configured. - cur.execute("DROP TABLE event_search") - cur.execute( - """ - CREATE VIRTUAL TABLE event_search - USING fts4 (tokenize=porter, event_id, room_id, sender, key, value ) - """ - ) - - # Re-run the background job to re-populate the event_search table. - cur.execute("SELECT MIN(stream_ordering) FROM events") - row = cur.fetchone() - min_stream_id = row[0] - - # If there are not any events, nothing to do. - if min_stream_id is None: - return - - cur.execute("SELECT MAX(stream_ordering) FROM events") - row = cur.fetchone() - max_stream_id = row[0] - - progress = { - "target_min_stream_id_inclusive": min_stream_id, - "max_stream_id_exclusive": max_stream_id + 1, - } - progress_json = json.dumps(progress) - - sql = """ - INSERT into background_updates (ordering, update_name, progress_json) - VALUES (?, ?, ?) - """ - - cur.execute(sql, (7310, "event_search", progress_json)) diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py index 9ddc19900afc..e747c6b50eb9 100644 --- a/tests/storage/test_room_search.py +++ b/tests/storage/test_room_search.py @@ -12,22 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Tuple, Union -from unittest.case import SkipTest -from unittest.mock import PropertyMock, patch - -from twisted.test.proto_helpers import MemoryReactor - import synapse.rest.admin from synapse.api.constants import EventTypes from synapse.api.errors import StoreError from synapse.rest.client import login, room -from synapse.server import HomeServer -from synapse.storage.databases.main import DataStore -from synapse.storage.databases.main.search import Phrase, SearchToken, _tokenize_query from synapse.storage.engines import PostgresEngine -from synapse.storage.engines.sqlite import Sqlite3Engine -from synapse.util import Clock from tests.unittest import HomeserverTestCase, skip_unless from tests.utils import USE_POSTGRES_FOR_TESTS @@ -198,205 +187,3 @@ def test_sqlite_non_string_deletion_background_update(self): ), ) self.assertCountEqual(values, ["hi", "2"]) - - -class MessageSearchTest(HomeserverTestCase): - """ - Check message search. - - A powerful way to check the behaviour is to run the following in Postgres >= 11: - - # SELECT websearch_to_tsquery('english', ); - - The result can be compared to the tokenized version for SQLite and Postgres < 11. - - """ - - servlets = [ - synapse.rest.admin.register_servlets_for_client_rest_resource, - login.register_servlets, - room.register_servlets, - ] - - PHRASE = "the quick brown fox jumps over the lazy dog" - - # Each entry is a search query, followed by either a boolean of whether it is - # in the phrase OR a tuple of booleans: whether it matches using websearch - # and using plain search. - COMMON_CASES: List[Tuple[str, Union[bool, Tuple[bool, bool]]]] = [ - ("nope", False), - ("brown", True), - ("quick brown", True), - ("brown quick", True), - ("quick \t brown", True), - ("jump", True), - ("brown nope", False), - ('"brown quick"', (False, True)), - ('"jumps over"', True), - ('"quick fox"', (False, True)), - ("nope OR doublenope", False), - ("furphy OR fox", (True, False)), - ("fox -nope", (True, False)), - ("fox -brown", (False, True)), - ('"fox" quick', True), - ('"fox quick', True), - ('"quick brown', True), - ('" quick "', True), - ('" nope"', False), - ] - # TODO Test non-ASCII cases. - - # Case that fail on SQLite. - POSTGRES_CASES: List[Tuple[str, Union[bool, Tuple[bool, bool]]]] = [ - # SQLite treats NOT as a binary operator. - ("- fox", (False, True)), - ("- nope", (True, False)), - ('"-fox quick', (False, True)), - # PostgreSQL skips stop words. - ('"the quick brown"', True), - ('"over lazy"', True), - ] - - def prepare( - self, reactor: MemoryReactor, clock: Clock, homeserver: HomeServer - ) -> None: - # Register a user and create a room, create some messages - self.register_user("alice", "password") - self.access_token = self.login("alice", "password") - self.room_id = self.helper.create_room_as("alice", tok=self.access_token) - - # Send the phrase as a message and check it was created - response = self.helper.send(self.room_id, self.PHRASE, tok=self.access_token) - self.assertIn("event_id", response) - - def test_tokenize_query(self) -> None: - """Test the custom logic to tokenize a user's query.""" - cases = ( - ("brown", ["brown"]), - ("quick brown", ["quick", SearchToken.And, "brown"]), - ("quick \t brown", ["quick", SearchToken.And, "brown"]), - ('"brown quick"', [Phrase(["brown", "quick"])]), - ("furphy OR fox", ["furphy", SearchToken.Or, "fox"]), - ("fox -brown", ["fox", SearchToken.Not, "brown"]), - ("- fox", [SearchToken.Not, "fox"]), - ('"fox" quick', [Phrase(["fox"]), SearchToken.And, "quick"]), - # No trailing double quoe. - ('"fox quick', ["fox", SearchToken.And, "quick"]), - ('"-fox quick', [SearchToken.Not, "fox", SearchToken.And, "quick"]), - ('" quick "', [Phrase(["quick"])]), - ( - 'q"uick brow"n', - [ - "q", - SearchToken.And, - Phrase(["uick", "brow"]), - SearchToken.And, - "n", - ], - ), - ( - '-"quick brown"', - [SearchToken.Not, Phrase(["quick", "brown"])], - ), - ) - - for query, expected in cases: - tokenized = _tokenize_query(query) - self.assertEqual( - tokenized, expected, f"{tokenized} != {expected} for {query}" - ) - - def _check_test_cases( - self, - store: DataStore, - cases: List[Tuple[str, Union[bool, Tuple[bool, bool]]]], - index=0, - ) -> None: - # Run all the test cases versus search_msgs - for query, expect_to_contain in cases: - if isinstance(expect_to_contain, tuple): - expect_to_contain = expect_to_contain[index] - - result = self.get_success( - store.search_msgs([self.room_id], query, ["content.body"]) - ) - self.assertEquals( - result["count"], - 1 if expect_to_contain else 0, - f"expected '{query}' to match '{self.PHRASE}'" - if expect_to_contain - else f"'{query}' unexpectedly matched '{self.PHRASE}'", - ) - self.assertEquals( - len(result["results"]), - 1 if expect_to_contain else 0, - "results array length should match count", - ) - - # Run them again versus search_rooms - for query, expect_to_contain in cases: - if isinstance(expect_to_contain, tuple): - expect_to_contain = expect_to_contain[index] - - result = self.get_success( - store.search_rooms([self.room_id], query, ["content.body"], 10) - ) - self.assertEquals( - result["count"], - 1 if expect_to_contain else 0, - f"expected '{query}' to match '{self.PHRASE}'" - if expect_to_contain - else f"'{query}' unexpectedly matched '{self.PHRASE}'", - ) - self.assertEquals( - len(result["results"]), - 1 if expect_to_contain else 0, - "results array length should match count", - ) - - def test_postgres_web_search_for_phrase(self): - """ - Test searching for phrases using typical web search syntax, as per postgres' websearch_to_tsquery. - This test is skipped unless the postgres instance supports websearch_to_tsquery. - """ - - store = self.hs.get_datastores().main - if not isinstance(store.database_engine, PostgresEngine): - raise SkipTest("Test only applies when postgres is used as the database") - - if store.database_engine.tsquery_func != "websearch_to_tsquery": - raise SkipTest( - "Test only applies when postgres supporting websearch_to_tsquery is used as the database" - ) - - self._check_test_cases(store, self.COMMON_CASES + self.POSTGRES_CASES, index=0) - - def test_postgres_non_web_search_for_phrase(self): - """ - Test postgres searching for phrases without using web search, which is used when websearch_to_tsquery isn't - supported by the current postgres version. - """ - - store = self.hs.get_datastores().main - if not isinstance(store.database_engine, PostgresEngine): - raise SkipTest("Test only applies when postgres is used as the database") - - # Patch supports_websearch_to_tsquery to always return False to ensure we're testing the plainto_tsquery path. - with patch( - "synapse.storage.engines.postgres.PostgresEngine.tsquery_func", - new_callable=PropertyMock, - ) as supports_websearch_to_tsquery: - supports_websearch_to_tsquery.return_value = "plainto_tsquery" - self._check_test_cases( - store, self.COMMON_CASES + self.POSTGRES_CASES, index=1 - ) - - def test_sqlite_search(self): - """ - Test sqlite searching for phrases. - """ - store = self.hs.get_datastores().main - if not isinstance(store.database_engine, Sqlite3Engine): - raise SkipTest("Test only applies when sqlite is used as the database") - - self._check_test_cases(store, self.COMMON_CASES, index=0) From 24409c7c158c108fa31433dd20ee33d423f5cb04 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Wed, 14 Feb 2018 13:47:14 +0000 Subject: [PATCH 4/4] Disable auto search for prefixes in event search --- synapse/storage/databases/main/search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index 1b79acf95559..ddcda6f1a745 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -771,7 +771,7 @@ def _parse_query(database_engine: BaseDatabaseEngine, search_term: str) -> str: results = re.findall(r"([\w\-]+)", search_term, re.UNICODE) if isinstance(database_engine, PostgresEngine): - return " & ".join(result + ":*" for result in results) + return " & ".join(result for result in results) elif isinstance(database_engine, Sqlite3Engine): return " & ".join(result + "*" for result in results) else: