From 41cf4c2cf6432336cc7477f130a2847449cff99a Mon Sep 17 00:00:00 2001 From: Sean Quah <8349537+squahtx@users.noreply.github.com> Date: Thu, 24 Feb 2022 11:52:28 +0000 Subject: [PATCH] Fix non-strings in the `event_search` table (#12037) Don't attempt to add non-string `value`s to `event_search` and add a background update to clear out bad rows from `event_search` when using sqlite. Signed-off-by: Sean Quah --- changelog.d/12037.bugfix | 1 + synapse/storage/databases/main/events.py | 18 +-- synapse/storage/databases/main/search.py | 26 ++++ ...e_non_strings_from_event_search.sql.sqlite | 22 ++++ tests/storage/test_room_search.py | 117 +++++++++++++++++- 5 files changed, 173 insertions(+), 11 deletions(-) create mode 100644 changelog.d/12037.bugfix create mode 100644 synapse/storage/schema/main/delta/68/05_delete_non_strings_from_event_search.sql.sqlite diff --git a/changelog.d/12037.bugfix b/changelog.d/12037.bugfix new file mode 100644 index 000000000000..9295cb4dc0d9 --- /dev/null +++ b/changelog.d/12037.bugfix @@ -0,0 +1 @@ +Properly fix a long-standing bug where wrong data could be inserted in the `event_search` table when using sqlite. This could block running `synapse_port_db` with an "argument of type 'int' is not iterable" error. This bug was partially fixed by a change in Synapse 1.44.0. diff --git a/synapse/storage/databases/main/events.py b/synapse/storage/databases/main/events.py index a1d7a9b41300..e53e84054ad0 100644 --- a/synapse/storage/databases/main/events.py +++ b/synapse/storage/databases/main/events.py @@ -1473,10 +1473,10 @@ def _store_rejected_events_txn(self, txn, events_and_contexts): def _update_metadata_tables_txn( self, - txn, + txn: LoggingTransaction, *, - events_and_contexts, - all_events_and_contexts, + events_and_contexts: List[Tuple[EventBase, EventContext]], + all_events_and_contexts: List[Tuple[EventBase, EventContext]], inhibit_local_membership_updates: bool = False, ): """Update all the miscellaneous tables for new events @@ -1953,20 +1953,20 @@ def _handle_redaction(self, txn, redacted_event_id): txn, table="event_relations", keyvalues={"event_id": redacted_event_id} ) - def _store_room_topic_txn(self, txn, event): - if hasattr(event, "content") and "topic" in event.content: + def _store_room_topic_txn(self, txn: LoggingTransaction, event: EventBase): + if isinstance(event.content.get("topic"), str): self.store_event_search_txn( txn, event, "content.topic", event.content["topic"] ) - def _store_room_name_txn(self, txn, event): - if hasattr(event, "content") and "name" in event.content: + def _store_room_name_txn(self, txn: LoggingTransaction, event: EventBase): + if isinstance(event.content.get("name"), str): self.store_event_search_txn( txn, event, "content.name", event.content["name"] ) - def _store_room_message_txn(self, txn, event): - if hasattr(event, "content") and "body" in event.content: + def _store_room_message_txn(self, txn: LoggingTransaction, event: EventBase): + if isinstance(event.content.get("body"), str): self.store_event_search_txn( txn, event, "content.body", event.content["body"] ) diff --git a/synapse/storage/databases/main/search.py b/synapse/storage/databases/main/search.py index acea300ed343..e23b1190726b 100644 --- a/synapse/storage/databases/main/search.py +++ b/synapse/storage/databases/main/search.py @@ -115,6 +115,7 @@ class SearchBackgroundUpdateStore(SearchWorkerStore): EVENT_SEARCH_ORDER_UPDATE_NAME = "event_search_order" EVENT_SEARCH_USE_GIST_POSTGRES_NAME = "event_search_postgres_gist" EVENT_SEARCH_USE_GIN_POSTGRES_NAME = "event_search_postgres_gin" + EVENT_SEARCH_DELETE_NON_STRINGS = "event_search_sqlite_delete_non_strings" def __init__( self, @@ -147,6 +148,10 @@ def __init__( self.EVENT_SEARCH_USE_GIN_POSTGRES_NAME, self._background_reindex_gin_search ) + self.db_pool.updates.register_background_update_handler( + self.EVENT_SEARCH_DELETE_NON_STRINGS, self._background_delete_non_strings + ) + async def _background_reindex_search(self, progress, batch_size): # we work through the events table from highest stream id to lowest target_min_stream_id = progress["target_min_stream_id_inclusive"] @@ -372,6 +377,27 @@ def reindex_search_txn(txn): return num_rows + async def _background_delete_non_strings( + self, progress: JsonDict, batch_size: int + ) -> int: + """Deletes rows with non-string `value`s from `event_search` if using sqlite. + + Prior to Synapse 1.44.0, malformed events received over federation could cause integers + to be inserted into the `event_search` table when using sqlite. + """ + + def delete_non_strings_txn(txn: LoggingTransaction) -> None: + txn.execute("DELETE FROM event_search WHERE typeof(value) != 'text'") + + await self.db_pool.runInteraction( + self.EVENT_SEARCH_DELETE_NON_STRINGS, delete_non_strings_txn + ) + + await self.db_pool.updates._end_background_update( + self.EVENT_SEARCH_DELETE_NON_STRINGS + ) + return 1 + class SearchStore(SearchBackgroundUpdateStore): def __init__( diff --git a/synapse/storage/schema/main/delta/68/05_delete_non_strings_from_event_search.sql.sqlite b/synapse/storage/schema/main/delta/68/05_delete_non_strings_from_event_search.sql.sqlite new file mode 100644 index 000000000000..140df652642d --- /dev/null +++ b/synapse/storage/schema/main/delta/68/05_delete_non_strings_from_event_search.sql.sqlite @@ -0,0 +1,22 @@ +/* Copyright 2022 The Matrix.org Foundation C.I.C + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +-- Delete rows with non-string `value`s from `event_search` if using sqlite. +-- +-- Prior to Synapse 1.44.0, malformed events received over federation could +-- cause integers to be inserted into the `event_search` table. +INSERT INTO background_updates (ordering, update_name, progress_json) VALUES + (6805, 'event_search_sqlite_delete_non_strings', '{}'); diff --git a/tests/storage/test_room_search.py b/tests/storage/test_room_search.py index befaa0fcee08..d62e01726cba 100644 --- a/tests/storage/test_room_search.py +++ b/tests/storage/test_room_search.py @@ -13,13 +13,16 @@ # limitations under the License. import synapse.rest.admin +from synapse.api.constants import EventTypes +from synapse.api.errors import StoreError from synapse.rest.client import login, room from synapse.storage.engines import PostgresEngine -from tests.unittest import HomeserverTestCase +from tests.unittest import HomeserverTestCase, skip_unless +from tests.utils import USE_POSTGRES_FOR_TESTS -class NullByteInsertionTest(HomeserverTestCase): +class EventSearchInsertionTest(HomeserverTestCase): servlets = [ synapse.rest.admin.register_servlets_for_client_rest_resource, login.register_servlets, @@ -72,3 +75,113 @@ def test_null_byte(self): ) if isinstance(store.database_engine, PostgresEngine): self.assertIn("alice", result.get("highlights")) + + def test_non_string(self): + """Test that non-string `value`s are not inserted into `event_search`. + + This is particularly important when using sqlite, since a sqlite column can hold + both strings and integers. When using Postgres, integers are automatically + converted to strings. + + Regression test for #11918. + """ + store = self.hs.get_datastores().main + + # Register a user and create a room + user_id = self.register_user("alice", "password") + access_token = self.login("alice", "password") + room_id = self.helper.create_room_as("alice", tok=access_token) + room_version = self.get_success(store.get_room_version(room_id)) + + # Construct a message with a numeric body to be received over federation + # The message can't be sent using the client API, since Synapse's event + # validation will reject it. + prev_event_ids = self.get_success(store.get_prev_events_for_room(room_id)) + prev_event = self.get_success(store.get_event(prev_event_ids[0])) + prev_state_map = self.get_success( + self.hs.get_storage().state.get_state_ids_for_event(prev_event_ids[0]) + ) + + event_dict = { + "type": EventTypes.Message, + "content": {"msgtype": "m.text", "body": 2}, + "room_id": room_id, + "sender": user_id, + "depth": prev_event.depth + 1, + "prev_events": prev_event_ids, + "origin_server_ts": self.clock.time_msec(), + } + builder = self.hs.get_event_builder_factory().for_room_version( + room_version, event_dict + ) + event = self.get_success( + builder.build( + prev_event_ids=prev_event_ids, + auth_event_ids=self.hs.get_event_auth_handler().compute_auth_events( + builder, + prev_state_map, + for_verification=False, + ), + depth=event_dict["depth"], + ) + ) + + # Receive the event + self.get_success( + self.hs.get_federation_event_handler().on_receive_pdu( + self.hs.hostname, event + ) + ) + + # The event should not have an entry in the `event_search` table + f = self.get_failure( + store.db_pool.simple_select_one_onecol( + "event_search", + {"room_id": room_id, "event_id": event.event_id}, + "event_id", + ), + StoreError, + ) + self.assertEqual(f.value.code, 404) + + @skip_unless(not USE_POSTGRES_FOR_TESTS, "requires sqlite") + def test_sqlite_non_string_deletion_background_update(self): + """Test the background update to delete bad rows from `event_search`.""" + store = self.hs.get_datastores().main + + # Populate `event_search` with dummy data + self.get_success( + store.db_pool.simple_insert_many( + "event_search", + keys=["event_id", "room_id", "key", "value"], + values=[ + ("event1", "room_id", "content.body", "hi"), + ("event2", "room_id", "content.body", "2"), + ("event3", "room_id", "content.body", 3), + ], + desc="populate_event_search", + ) + ) + + # Run the background update + store.db_pool.updates._all_done = False + self.get_success( + store.db_pool.simple_insert( + "background_updates", + { + "update_name": "event_search_sqlite_delete_non_strings", + "progress_json": "{}", + }, + ) + ) + self.wait_for_background_updates() + + # The non-string `value`s ought to be gone now. + values = self.get_success( + store.db_pool.simple_select_onecol( + "event_search", + {"room_id": "room_id"}, + "value", + ), + ) + self.assertCountEqual(values, ["hi", "2"])