From 7c8c97e635811609c5a7ae4c0bb94e6573c30753 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Wed, 30 Oct 2019 15:12:49 +0000 Subject: [PATCH 01/11] Split purge API into events vs state --- synapse/handlers/pagination.py | 7 +- synapse/storage/__init__.py | 2 + synapse/storage/data_stores/main/events.py | 328 ++++++++++----------- synapse/storage/data_stores/main/state.py | 23 ++ synapse/storage/purge_events.py | 117 ++++++++ tests/storage/test_purge.py | 15 +- 6 files changed, 308 insertions(+), 184 deletions(-) create mode 100644 synapse/storage/purge_events.py diff --git a/synapse/handlers/pagination.py b/synapse/handlers/pagination.py index 5744f4579d21..9088ba14cd15 100644 --- a/synapse/handlers/pagination.py +++ b/synapse/handlers/pagination.py @@ -69,6 +69,7 @@ def __init__(self, hs): self.hs = hs self.auth = hs.get_auth() self.store = hs.get_datastore() + self.storage = hs.get_storage() self.clock = hs.get_clock() self._server_name = hs.hostname @@ -125,7 +126,9 @@ def _purge_history(self, purge_id, room_id, token, delete_local_events): self._purges_in_progress_by_room.add(room_id) try: with (yield self.pagination_lock.write(room_id)): - yield self.store.purge_history(room_id, token, delete_local_events) + yield self.storage.purge_events.purge_history( + room_id, token, delete_local_events + ) logger.info("[purge] complete") self._purges_by_id[purge_id].status = PurgeStatus.STATUS_COMPLETE except Exception: @@ -168,7 +171,7 @@ async def purge_room(self, room_id): if joined: raise SynapseError(400, "Users are still joined to this room") - await self.store.purge_room(room_id) + await self.storage.purge_events.purge_room(room_id) @defer.inlineCallbacks def get_messages( diff --git a/synapse/storage/__init__.py b/synapse/storage/__init__.py index a6429d17ed61..3646ebd00752 100644 --- a/synapse/storage/__init__.py +++ b/synapse/storage/__init__.py @@ -30,6 +30,7 @@ from synapse.storage.data_stores import DataStores from synapse.storage.data_stores.main import DataStore from synapse.storage.persist_events import EventsPersistenceStorage +from synapse.storage.purge_events import PurgeEventsStorage __all__ = ["DataStores", "DataStore"] @@ -45,6 +46,7 @@ def __init__(self, hs, stores: DataStores): self.main = stores.main self.persistence = EventsPersistenceStorage(hs, stores) + self.purge_events = PurgeEventsStorage(hs, stores) def are_all_users_on_domain(txn, database_engine, domain): diff --git a/synapse/storage/data_stores/main/events.py b/synapse/storage/data_stores/main/events.py index 7c3607f30838..4eacba8058e3 100644 --- a/synapse/storage/data_stores/main/events.py +++ b/synapse/storage/data_stores/main/events.py @@ -1368,6 +1368,10 @@ def purge_history(self, room_id, token, delete_local_events): if True, we will delete local events as well as remote ones (instead of just marking them as outliers and deleting their state groups). + + Returns: + Deferred[set[int]]: The set of state groups that reference deleted + events. """ return self.runInteraction( @@ -1521,60 +1525,6 @@ def _purge_history_txn(self, txn, room_id, token_str, delete_local_events): "[purge] found %i referenced state groups", len(referenced_state_groups) ) - logger.info("[purge] finding state groups that can be deleted") - - _ = self._find_unreferenced_groups_during_purge(txn, referenced_state_groups) - state_groups_to_delete, remaining_state_groups = _ - - logger.info( - "[purge] found %i state groups to delete", len(state_groups_to_delete) - ) - - logger.info( - "[purge] de-delta-ing %i remaining state groups", - len(remaining_state_groups), - ) - - # Now we turn the state groups that reference to-be-deleted state - # groups to non delta versions. - for sg in remaining_state_groups: - logger.info("[purge] de-delta-ing remaining state group %s", sg) - curr_state = self._get_state_groups_from_groups_txn(txn, [sg]) - curr_state = curr_state[sg] - - self._simple_delete_txn( - txn, table="state_groups_state", keyvalues={"state_group": sg} - ) - - self._simple_delete_txn( - txn, table="state_group_edges", keyvalues={"state_group": sg} - ) - - self._simple_insert_many_txn( - txn, - table="state_groups_state", - values=[ - { - "state_group": sg, - "room_id": room_id, - "type": key[0], - "state_key": key[1], - "event_id": state_id, - } - for key, state_id in iteritems(curr_state) - ], - ) - - logger.info("[purge] removing redundant state groups") - txn.executemany( - "DELETE FROM state_groups_state WHERE state_group = ?", - ((sg,) for sg in state_groups_to_delete), - ) - txn.executemany( - "DELETE FROM state_groups WHERE id = ?", - ((sg,) for sg in state_groups_to_delete), - ) - logger.info("[purge] removing events from event_to_state_groups") txn.execute( "DELETE FROM event_to_state_groups " @@ -1661,87 +1611,7 @@ def _purge_history_txn(self, txn, room_id, token_str, delete_local_events): logger.info("[purge] done") - def _find_unreferenced_groups_during_purge(self, txn, state_groups): - """Used when purging history to figure out which state groups can be - deleted and which need to be de-delta'ed (due to one of its prev groups - being scheduled for deletion). - - Args: - txn - state_groups (set[int]): Set of state groups referenced by events - that are going to be deleted. - - Returns: - tuple[set[int], set[int]]: The set of state groups that can be - deleted and the set of state groups that need to be de-delta'ed - """ - # Graph of state group -> previous group - graph = {} - - # Set of events that we have found to be referenced by events - referenced_groups = set() - - # Set of state groups we've already seen - state_groups_seen = set(state_groups) - - # Set of state groups to handle next. - next_to_search = set(state_groups) - while next_to_search: - # We bound size of groups we're looking up at once, to stop the - # SQL query getting too big - if len(next_to_search) < 100: - current_search = next_to_search - next_to_search = set() - else: - current_search = set(itertools.islice(next_to_search, 100)) - next_to_search -= current_search - - # Check if state groups are referenced - sql = """ - SELECT DISTINCT state_group FROM event_to_state_groups - LEFT JOIN events_to_purge AS ep USING (event_id) - WHERE ep.event_id IS NULL AND - """ - clause, args = make_in_list_sql_clause( - txn.database_engine, "state_group", current_search - ) - txn.execute(sql + clause, list(args)) - - referenced = set(sg for sg, in txn) - referenced_groups |= referenced - - # We don't continue iterating up the state group graphs for state - # groups that are referenced. - current_search -= referenced - - rows = self._simple_select_many_txn( - txn, - table="state_group_edges", - column="prev_state_group", - iterable=current_search, - keyvalues={}, - retcols=("prev_state_group", "state_group"), - ) - - prevs = set(row["state_group"] for row in rows) - # We don't bother re-handling groups we've already seen - prevs -= state_groups_seen - next_to_search |= prevs - state_groups_seen |= prevs - - for row in rows: - # Note: Each state group can have at most one prev group - graph[row["state_group"]] = row["prev_state_group"] - - to_delete = state_groups_seen - referenced_groups - - to_dedelta = set() - for sg in referenced_groups: - prev_sg = graph.get(sg) - if prev_sg and prev_sg in to_delete: - to_dedelta.add(sg) - - return to_delete, to_dedelta + return referenced_state_groups def purge_room(self, room_id): """Deletes all record of a room @@ -1753,46 +1623,7 @@ def purge_room(self, room_id): return self.runInteraction("purge_room", self._purge_room_txn, room_id) def _purge_room_txn(self, txn, room_id): - # first we have to delete the state groups states - logger.info("[purge] removing %s from state_groups_state", room_id) - - txn.execute( - """ - DELETE FROM state_groups_state WHERE state_group IN ( - SELECT state_group FROM events JOIN event_to_state_groups USING(event_id) - WHERE events.room_id=? - ) - """, - (room_id,), - ) - - # ... and the state group edges - logger.info("[purge] removing %s from state_group_edges", room_id) - - txn.execute( - """ - DELETE FROM state_group_edges WHERE state_group IN ( - SELECT state_group FROM events JOIN event_to_state_groups USING(event_id) - WHERE events.room_id=? - ) - """, - (room_id,), - ) - - # ... and the state groups - logger.info("[purge] removing %s from state_groups", room_id) - - txn.execute( - """ - DELETE FROM state_groups WHERE id IN ( - SELECT state_group FROM events JOIN event_to_state_groups USING(event_id) - WHERE events.room_id=? - ) - """, - (room_id,), - ) - - # and then tables which lack an index on room_id but have one on event_id + # First delete tables which lack an index on room_id but have one on event_id for table in ( "event_auth", "event_edges", @@ -1881,6 +1712,153 @@ def _purge_room_txn(self, txn, room_id): logger.info("[purge] done") + def purge_unreferenced_state_groups(self, room_id, state_groups_to_delete): + """Deletes no longer referenced state groups and de-deltas any state + groups that reference them. + """ + + return self.runInteraction( + "purge_unreferenced_state_groups", + self._purge_unreferenced_state_groups, + room_id, + state_groups_to_delete, + ) + + def _purge_unreferenced_state_groups(self, txn, room_id, state_groups_to_delete): + logger.info( + "[purge] found %i state groups to delete", len(state_groups_to_delete) + ) + + rows = self._simple_select_many_txn( + txn, + table="state_group_edges", + column="prev_state_group", + iterable=state_groups_to_delete, + keyvalues={}, + retcols=("state_group",), + ) + + remaining_state_groups = set( + row["state_group"] + for row in rows + if row["state_group"] not in state_groups_to_delete + ) + + logger.info( + "[purge] de-delta-ing %i remaining state groups", + len(remaining_state_groups), + ) + + # Now we turn the state groups that reference to-be-deleted state + # groups to non delta versions. + for sg in remaining_state_groups: + logger.info("[purge] de-delta-ing remaining state group %s", sg) + curr_state = self._get_state_groups_from_groups_txn(txn, [sg]) + curr_state = curr_state[sg] + + self._simple_delete_txn( + txn, table="state_groups_state", keyvalues={"state_group": sg} + ) + + self._simple_delete_txn( + txn, table="state_group_edges", keyvalues={"state_group": sg} + ) + + self._simple_insert_many_txn( + txn, + table="state_groups_state", + values=[ + { + "state_group": sg, + "room_id": room_id, + "type": key[0], + "state_key": key[1], + "event_id": state_id, + } + for key, state_id in iteritems(curr_state) + ], + ) + + logger.info("[purge] removing redundant state groups") + txn.executemany( + "DELETE FROM state_groups_state WHERE state_group = ?", + ((sg,) for sg in state_groups_to_delete), + ) + txn.executemany( + "DELETE FROM state_groups WHERE id = ?", + ((sg,) for sg in state_groups_to_delete), + ) + + @defer.inlineCallbacks + def get_previous_state_groups(self, state_groups): + """Fetch the previous groups of the given state groups. + + Args: + state_groups (Iterable[int]) + + Returns: + Deferred[dict[int, int]]: mapping from state group to previous + state group. + """ + + rows = yield self._simple_select_many_batch( + table="state_group_edges", + column="prev_state_group", + iterable=state_groups, + keyvalues={}, + retcols=("prev_state_group", "state_group"), + desc="get_previous_state_groups", + ) + + return {row["state_group"]: row["prev_state_group"] for row in rows} + + def purge_room_state(self, room_id): + """Deletes all record of a room from state tables + + Args: + room_id (str): + """ + + return self.runInteraction( + "purge_room_state", self._purge_room_state_txn, room_id + ) + + def _purge_room_state_txn(self, txn, room_id): + # first we have to delete the state groups states + logger.info("[purge] removing %s from state_groups_state", room_id) + + txn.execute( + """ + DELETE FROM state_groups_state + INNER JOIN state_groups USING (event_id) + WHEREE state_groups.room_id = ? + """, + (room_id,), + ) + + # ... and the state group edges + logger.info("[purge] removing %s from state_group_edges", room_id) + + txn.execute( + """ + DELETE FROM state_group_edges + INNER JOIN state_groups USING (event_id) + WHEREE state_groups.room_id = ? + ) + """, + (room_id,), + ) + + # ... and the state groups + logger.info("[purge] removing %s from state_groups", room_id) + + txn.execute( + """ + DELETE FROM state_groups WHEREE room_id = ? + """, + (room_id,), + ) + async def is_event_after(self, event_id1, event_id2): """Returns True if event_id1 is after event_id2 in the stream """ diff --git a/synapse/storage/data_stores/main/state.py b/synapse/storage/data_stores/main/state.py index 9b2207075b49..36be8f0a9df9 100644 --- a/synapse/storage/data_stores/main/state.py +++ b/synapse/storage/data_stores/main/state.py @@ -989,6 +989,29 @@ def _store_state_group_txn(txn): return self.runInteraction("store_state_group", _store_state_group_txn) + @defer.inlineCallbacks + def get_referenced_state_groups(self, state_groups): + """Check if the state groups are referenced by events. + + Args: + state_groups (Iterable[int]) + + Returns: + Deferred[set[int]]: The subset of state groups that are + referenced. + """ + + rows = yield self._simple_select_many_batch( + table="event_to_state_groups", + column="state_group", + iterable=state_groups, + keyvalues={}, + retcols=("DISTINCT state_group",), + desc="get_referenced_state_groups", + ) + + return set(row["state_group"] for row in rows) + class StateBackgroundUpdateStore( StateGroupBackgroundUpdateStore, BackgroundUpdateStore diff --git a/synapse/storage/purge_events.py b/synapse/storage/purge_events.py new file mode 100644 index 000000000000..dd45df0c881d --- /dev/null +++ b/synapse/storage/purge_events.py @@ -0,0 +1,117 @@ +# -*- coding: utf-8 -*- +# Copyright 2019 The Matrix.org Foundation C.I.C. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +import logging + +from twisted.internet import defer + +logger = logging.getLogger(__name__) + + +class PurgeEventsStorage(object): + """High level interface for purging rooms and event history. + """ + + def __init__(self, hs, stores): + self.stores = stores + + @defer.inlineCallbacks + def purge_room(self, room_id: str): + """Deletes all record of a room + """ + + yield self.stores.main.purge_room(room_id) + yield self.stores.main.purge_room_state(room_id) + + @defer.inlineCallbacks + def purge_history(self, room_id, token, delete_local_events): + """Deletes room history before a certain point + + Args: + room_id (str): + + token (str): A topological token to delete events before + + delete_local_events (bool): + if True, we will delete local events as well as remote ones + (instead of just marking them as outliers and deleting their + state groups). + """ + state_groups = yield self.stores.main.purge_history( + room_id, token, delete_local_events + ) + + logger.info("[purge] finding state groups that can be deleted") + + sg_to_delete = yield self._find_unreferenced_groups(state_groups) + + yield self.stores.main.purge_unreferenced_state_groups(room_id, sg_to_delete) + + @defer.inlineCallbacks + def _find_unreferenced_groups(self, state_groups): + """Used when purging history to figure out which state groups can be + deleted. + + Args: + state_groups (set[int]): Set of state groups referenced by events + that are going to be deleted. + + Returns: + Deferred[set[int]] The set of state groups that can be deleted. + """ + # Graph of state group -> previous group + graph = {} + + # Set of events that we have found to be referenced by events + referenced_groups = set() + + # Set of state groups we've already seen + state_groups_seen = set(state_groups) + + # Set of state groups to handle next. + next_to_search = set(state_groups) + while next_to_search: + # We bound size of groups we're looking up at once, to stop the + # SQL query getting too big + if len(next_to_search) < 100: + current_search = next_to_search + next_to_search = set() + else: + current_search = set(itertools.islice(next_to_search, 100)) + next_to_search -= current_search + + referenced = yield self.stores.main.get_referenced_state_groups( + current_search + ) + referenced_groups |= referenced + + # We don't continue iterating up the state group graphs for state + # groups that are referenced. + current_search -= referenced + + edges = yield self.stores.main.get_previous_state_groups(current_search) + + prevs = set(edges.values()) + # We don't bother re-handling groups we've already seen + prevs -= state_groups_seen + next_to_search |= prevs + state_groups_seen |= prevs + + graph.update(edges) + + to_delete = state_groups_seen - referenced_groups + + return to_delete diff --git a/tests/storage/test_purge.py b/tests/storage/test_purge.py index f671599cb89a..b9fafaa1a667 100644 --- a/tests/storage/test_purge.py +++ b/tests/storage/test_purge.py @@ -40,23 +40,24 @@ def test_purge(self): third = self.helper.send(self.room_id, body="test3") last = self.helper.send(self.room_id, body="test4") - storage = self.hs.get_datastore() + store = self.hs.get_datastore() + storage = self.hs.get_storage() # Get the topological token - event = storage.get_topological_token_for_event(last["event_id"]) + event = store.get_topological_token_for_event(last["event_id"]) self.pump() event = self.successResultOf(event) # Purge everything before this topological token - purge = storage.purge_history(self.room_id, event, True) + purge = storage.purge_events.purge_history(self.room_id, event, True) self.pump() self.assertEqual(self.successResultOf(purge), None) # Try and get the events - get_first = storage.get_event(first["event_id"]) - get_second = storage.get_event(second["event_id"]) - get_third = storage.get_event(third["event_id"]) - get_last = storage.get_event(last["event_id"]) + get_first = store.get_event(first["event_id"]) + get_second = store.get_event(second["event_id"]) + get_third = store.get_event(third["event_id"]) + get_last = store.get_event(last["event_id"]) self.pump() # 1-3 should fail and last will succeed, meaning that 1-3 are deleted From ecfba89a784db12b48074ade3a44092267ba9cf7 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Wed, 30 Oct 2019 15:14:29 +0000 Subject: [PATCH 02/11] Newsfile --- changelog.d/6295.misc | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/6295.misc diff --git a/changelog.d/6295.misc b/changelog.d/6295.misc new file mode 100644 index 000000000000..a3e6b8296e6c --- /dev/null +++ b/changelog.d/6295.misc @@ -0,0 +1 @@ +Split out state storage into separate data store. From 8f5bbdb987c92ee3e9b4170cc8ce296d87b1555d Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Thu, 31 Oct 2019 15:22:08 +0000 Subject: [PATCH 03/11] Fix purge room API --- synapse/storage/data_stores/main/events.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/synapse/storage/data_stores/main/events.py b/synapse/storage/data_stores/main/events.py index 63b09a09e85d..a904a7157039 100644 --- a/synapse/storage/data_stores/main/events.py +++ b/synapse/storage/data_stores/main/events.py @@ -1829,8 +1829,10 @@ def _purge_room_state_txn(self, txn, room_id): txn.execute( """ DELETE FROM state_groups_state - INNER JOIN state_groups USING (event_id) - WHEREE state_groups.room_id = ? + WHERE state_group IN ( + SELECT state_group FROM state_groups + WHERE room_id = ? + ) """, (room_id,), ) @@ -1841,8 +1843,9 @@ def _purge_room_state_txn(self, txn, room_id): txn.execute( """ DELETE FROM state_group_edges - INNER JOIN state_groups USING (event_id) - WHEREE state_groups.room_id = ? + WHERE state_group IN ( + SELECT state_group FROM state_groups + WHERE room_id = ? ) """, (room_id,), @@ -1853,7 +1856,7 @@ def _purge_room_state_txn(self, txn, room_id): txn.execute( """ - DELETE FROM state_groups WHEREE room_id = ? + DELETE FROM state_groups WHERE room_id = ? """, (room_id,), ) From f91f2a1f92ee2eb5758184ef0df66490592587d1 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Thu, 31 Oct 2019 15:25:21 +0000 Subject: [PATCH 04/11] Docstrings --- synapse/storage/data_stores/main/events.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/synapse/storage/data_stores/main/events.py b/synapse/storage/data_stores/main/events.py index a904a7157039..108b2e8bd5ea 100644 --- a/synapse/storage/data_stores/main/events.py +++ b/synapse/storage/data_stores/main/events.py @@ -19,6 +19,7 @@ import logging from collections import Counter as c_counter, OrderedDict, namedtuple from functools import wraps +from typing import Set from six import iteritems, text_type from six.moves import range @@ -1370,8 +1371,8 @@ def purge_history(self, room_id, token, delete_local_events): state groups). Returns: - Deferred[set[int]]: The set of state groups that reference deleted - events. + Deferred[set[int]]: The set of state groups that are referenced by + deleted events. """ return self.runInteraction( @@ -1711,9 +1712,16 @@ def _purge_room_txn(self, txn, room_id): logger.info("[purge] done") - def purge_unreferenced_state_groups(self, room_id, state_groups_to_delete): + def purge_unreferenced_state_groups( + self, room_id: str, state_groups_to_delete: Set[int] + ) -> defer.Deferred: """Deletes no longer referenced state groups and de-deltas any state groups that reference them. + + Args: + room_id: The room the state groups belong to (must all be in the + same room). + state_groups_to_delete: Set of all state groups to delete. """ return self.runInteraction( From 61be1a29262a9a3553537f257a4187ef355684f5 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Thu, 31 Oct 2019 15:39:26 +0000 Subject: [PATCH 05/11] Add state_groups.room_id index --- .../schema/delta/56/state_group_room_idx.sql | 17 +++++++++++++++++ synapse/storage/data_stores/main/state.py | 7 +++++++ 2 files changed, 24 insertions(+) create mode 100644 synapse/storage/data_stores/main/schema/delta/56/state_group_room_idx.sql diff --git a/synapse/storage/data_stores/main/schema/delta/56/state_group_room_idx.sql b/synapse/storage/data_stores/main/schema/delta/56/state_group_room_idx.sql new file mode 100644 index 000000000000..7916ef18b244 --- /dev/null +++ b/synapse/storage/data_stores/main/schema/delta/56/state_group_room_idx.sql @@ -0,0 +1,17 @@ +/* Copyright 2019 The Matrix.org Foundation C.I.C. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +INSERT INTO background_updates (update_name, progress_json) VALUES + ('state_groups_room_id_idx', '{}'); diff --git a/synapse/storage/data_stores/main/state.py b/synapse/storage/data_stores/main/state.py index 36be8f0a9df9..bf6de4ca22ae 100644 --- a/synapse/storage/data_stores/main/state.py +++ b/synapse/storage/data_stores/main/state.py @@ -1021,6 +1021,7 @@ class StateBackgroundUpdateStore( STATE_GROUP_INDEX_UPDATE_NAME = "state_group_state_type_index" CURRENT_STATE_INDEX_UPDATE_NAME = "current_state_members_idx" EVENT_STATE_GROUP_INDEX_UPDATE_NAME = "event_to_state_groups_sg_index" + STATE_GROUPS_ROOM_INDEX_UPDATE_NAME = "state_groups_room_id_idx" def __init__(self, db_conn, hs): super(StateBackgroundUpdateStore, self).__init__(db_conn, hs) @@ -1044,6 +1045,12 @@ def __init__(self, db_conn, hs): table="event_to_state_groups", columns=["state_group"], ) + self.register_background_index_update( + self.STATE_GROUPS_ROOM_INDEX_UPDATE_NAME, + index_name="state_groups_room_id_idx", + table="state_groups", + columns=["room_id"], + ) @defer.inlineCallbacks def _background_deduplicate_state(self, progress, batch_size): From fb1a6914cf953ed237235274a9aab62aafec5b4f Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Thu, 31 Oct 2019 15:45:48 +0000 Subject: [PATCH 06/11] Update log line to lie a little less --- synapse/storage/data_stores/main/events.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/synapse/storage/data_stores/main/events.py b/synapse/storage/data_stores/main/events.py index 108b2e8bd5ea..3a9b6bed45b2 100644 --- a/synapse/storage/data_stores/main/events.py +++ b/synapse/storage/data_stores/main/events.py @@ -1509,7 +1509,7 @@ def _purge_history_txn(self, txn, room_id, token_str, delete_local_events): [(room_id, event_id) for event_id, in new_backwards_extrems], ) - logger.info("[purge] finding redundant state groups") + logger.info("[purge] finding state groups referenced by deleted events") # Get all state groups that are referenced by events that are to be # deleted. We then go and check if they are referenced by other events From 669b6cbda3e545f277def545954948090aec8980 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Fri, 1 Nov 2019 11:32:20 +0000 Subject: [PATCH 07/11] Fix up comment --- synapse/storage/data_stores/main/events.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/synapse/storage/data_stores/main/events.py b/synapse/storage/data_stores/main/events.py index 3a9b6bed45b2..a71d7346d2b1 100644 --- a/synapse/storage/data_stores/main/events.py +++ b/synapse/storage/data_stores/main/events.py @@ -1512,8 +1512,7 @@ def _purge_history_txn(self, txn, room_id, token_str, delete_local_events): logger.info("[purge] finding state groups referenced by deleted events") # Get all state groups that are referenced by events that are to be - # deleted. We then go and check if they are referenced by other events - # or state groups, and if not we delete them. + # deleted. txn.execute( """ SELECT DISTINCT state_group FROM events_to_purge From 7134ca7daa7ead8104e3c51ba4bc730d99d098e3 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Mon, 4 Nov 2019 13:36:57 +0000 Subject: [PATCH 08/11] Change to not require a state_groups.room_id index. This does mean that we won't clean up orphaned state groups (i.e. state groups that were persisted but the associated event wasn't). --- synapse/storage/data_stores/main/events.py | 70 ++++++++++++------- .../schema/delta/56/state_group_room_idx.sql | 17 ----- synapse/storage/data_stores/main/state.py | 7 -- synapse/storage/purge_events.py | 4 +- 4 files changed, 45 insertions(+), 53 deletions(-) delete mode 100644 synapse/storage/data_stores/main/schema/delta/56/state_group_room_idx.sql diff --git a/synapse/storage/data_stores/main/events.py b/synapse/storage/data_stores/main/events.py index 68f27078c459..3049a21dc553 100644 --- a/synapse/storage/data_stores/main/events.py +++ b/synapse/storage/data_stores/main/events.py @@ -1624,7 +1624,10 @@ def purge_room(self, room_id): """Deletes all record of a room Args: - room_id (str): + room_id (str) + + Returns: + Deferred[List[int]]: The list of state groups to delete. """ return self.runInteraction("purge_room", self._purge_room_txn, room_id) @@ -1714,10 +1717,24 @@ def _purge_room_txn(self, txn, room_id): # index on them. In any case we should be clearing out 'stream' tables # periodically anyway (#5888) + # Now we fetch all the state groups that should be deleted. + txn.execute( + """ + SELECT DISTINCT state_group FROM events + INNER JOIN event_to_state_groups USING(event_id) + WHERE events.room_id = ? + """, + (room_id,), + ) + + state_groups = [row[0] for row in txn] + # TODO: we could probably usefully do a bunch of cache invalidation here logger.info("[purge] done") + return state_groups + def purge_unreferenced_state_groups( self, room_id: str, state_groups_to_delete: Set[int] ) -> defer.Deferred: @@ -1825,54 +1842,53 @@ def get_previous_state_groups(self, state_groups): return {row["state_group"]: row["prev_state_group"] for row in rows} - def purge_room_state(self, room_id): + def purge_room_state(self, room_id, state_groups_to_delete): """Deletes all record of a room from state tables Args: room_id (str): + state_groups_to_delete (list[int]): State groups to delete """ return self.runInteraction( - "purge_room_state", self._purge_room_state_txn, room_id + "purge_room_state", + self._purge_room_state_txn, + room_id, + state_groups_to_delete, ) - def _purge_room_state_txn(self, txn, room_id): + def _purge_room_state_txn(self, txn, room_id, state_groups_to_delete): # first we have to delete the state groups states logger.info("[purge] removing %s from state_groups_state", room_id) - txn.execute( - """ - DELETE FROM state_groups_state - WHERE state_group IN ( - SELECT state_group FROM state_groups - WHERE room_id = ? - ) - """, - (room_id,), + self._simple_delete_many_txn( + txn, + table="state_groups_state", + column="state_group", + iterable=state_groups_to_delete, + keyvalues={}, ) # ... and the state group edges logger.info("[purge] removing %s from state_group_edges", room_id) - txn.execute( - """ - DELETE FROM state_group_edges - WHERE state_group IN ( - SELECT state_group FROM state_groups - WHERE room_id = ? - ) - """, - (room_id,), + self._simple_delete_many_txn( + txn, + table="state_group_edges", + column="state_group", + iterable=state_groups_to_delete, + keyvalues={}, ) # ... and the state groups logger.info("[purge] removing %s from state_groups", room_id) - txn.execute( - """ - DELETE FROM state_groups WHERE room_id = ? - """, - (room_id,), + self._simple_delete_many_txn( + txn, + table="state_groups", + column="id", + iterable=state_groups_to_delete, + keyvalues={}, ) async def is_event_after(self, event_id1, event_id2): diff --git a/synapse/storage/data_stores/main/schema/delta/56/state_group_room_idx.sql b/synapse/storage/data_stores/main/schema/delta/56/state_group_room_idx.sql deleted file mode 100644 index 7916ef18b244..000000000000 --- a/synapse/storage/data_stores/main/schema/delta/56/state_group_room_idx.sql +++ /dev/null @@ -1,17 +0,0 @@ -/* Copyright 2019 The Matrix.org Foundation C.I.C. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -INSERT INTO background_updates (update_name, progress_json) VALUES - ('state_groups_room_id_idx', '{}'); diff --git a/synapse/storage/data_stores/main/state.py b/synapse/storage/data_stores/main/state.py index e1d3041c7cbf..5c5b15840ed5 100644 --- a/synapse/storage/data_stores/main/state.py +++ b/synapse/storage/data_stores/main/state.py @@ -1023,7 +1023,6 @@ class StateBackgroundUpdateStore( STATE_GROUP_INDEX_UPDATE_NAME = "state_group_state_type_index" CURRENT_STATE_INDEX_UPDATE_NAME = "current_state_members_idx" EVENT_STATE_GROUP_INDEX_UPDATE_NAME = "event_to_state_groups_sg_index" - STATE_GROUPS_ROOM_INDEX_UPDATE_NAME = "state_groups_room_id_idx" def __init__(self, db_conn, hs): super(StateBackgroundUpdateStore, self).__init__(db_conn, hs) @@ -1047,12 +1046,6 @@ def __init__(self, db_conn, hs): table="event_to_state_groups", columns=["state_group"], ) - self.register_background_index_update( - self.STATE_GROUPS_ROOM_INDEX_UPDATE_NAME, - index_name="state_groups_room_id_idx", - table="state_groups", - columns=["room_id"], - ) @defer.inlineCallbacks def _background_deduplicate_state(self, progress, batch_size): diff --git a/synapse/storage/purge_events.py b/synapse/storage/purge_events.py index dd45df0c881d..a36818203454 100644 --- a/synapse/storage/purge_events.py +++ b/synapse/storage/purge_events.py @@ -33,8 +33,8 @@ def purge_room(self, room_id: str): """Deletes all record of a room """ - yield self.stores.main.purge_room(room_id) - yield self.stores.main.purge_room_state(room_id) + state_groups_to_delete = yield self.stores.main.purge_room(room_id) + yield self.stores.main.purge_room_state(room_id, state_groups_to_delete) @defer.inlineCallbacks def purge_history(self, room_id, token, delete_local_events): From 71f3bd734fe9f8f903c5a496720e7dcf926f2f4a Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Wed, 6 Nov 2019 17:00:18 +0000 Subject: [PATCH 09/11] Use correct type annotation --- synapse/storage/data_stores/main/events.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/synapse/storage/data_stores/main/events.py b/synapse/storage/data_stores/main/events.py index 3049a21dc553..d69c59f5a14e 100644 --- a/synapse/storage/data_stores/main/events.py +++ b/synapse/storage/data_stores/main/events.py @@ -19,7 +19,7 @@ import logging from collections import Counter as c_counter, OrderedDict, namedtuple from functools import wraps -from typing import Set +from typing import Collection from six import iteritems, text_type from six.moves import range @@ -1736,7 +1736,7 @@ def _purge_room_txn(self, txn, room_id): return state_groups def purge_unreferenced_state_groups( - self, room_id: str, state_groups_to_delete: Set[int] + self, room_id: str, state_groups_to_delete: Collection[int] ) -> defer.Deferred: """Deletes no longer referenced state groups and de-deltas any state groups that reference them. From 5c3363233cca7044a333b7e19ba239eaf5587ff8 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Wed, 6 Nov 2019 17:02:05 +0000 Subject: [PATCH 10/11] Fix deleting state groups during room purge. And fix the tests to actually test that things got deleted. --- synapse/storage/data_stores/main/events.py | 27 +++++++++++----------- tests/rest/admin/test_admin.py | 4 +++- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/synapse/storage/data_stores/main/events.py b/synapse/storage/data_stores/main/events.py index d69c59f5a14e..946823876a31 100644 --- a/synapse/storage/data_stores/main/events.py +++ b/synapse/storage/data_stores/main/events.py @@ -1633,7 +1633,20 @@ def purge_room(self, room_id): return self.runInteraction("purge_room", self._purge_room_txn, room_id) def _purge_room_txn(self, txn, room_id): - # First delete tables which lack an index on room_id but have one on event_id + # First we fetch all the state groups that should be deleted, before + # we delete that information. + txn.execute( + """ + SELECT DISTINCT state_group FROM events + INNER JOIN event_to_state_groups USING(event_id) + WHERE events.room_id = ? + """, + (room_id,), + ) + + state_groups = [row[0] for row in txn] + + # Now we delete tables which lack an index on room_id but have one on event_id for table in ( "event_auth", "event_edges", @@ -1717,18 +1730,6 @@ def _purge_room_txn(self, txn, room_id): # index on them. In any case we should be clearing out 'stream' tables # periodically anyway (#5888) - # Now we fetch all the state groups that should be deleted. - txn.execute( - """ - SELECT DISTINCT state_group FROM events - INNER JOIN event_to_state_groups USING(event_id) - WHERE events.room_id = ? - """, - (room_id,), - ) - - state_groups = [row[0] for row in txn] - # TODO: we could probably usefully do a bunch of cache invalidation here logger.info("[purge] done") diff --git a/tests/rest/admin/test_admin.py b/tests/rest/admin/test_admin.py index 8e1ca8b738dc..d9f1b95cb009 100644 --- a/tests/rest/admin/test_admin.py +++ b/tests/rest/admin/test_admin.py @@ -628,10 +628,12 @@ def test_purge_room(self): "local_invites", "room_account_data", "room_tags", + "state_groups", + "state_groups_state", ): count = self.get_success( self.store._simple_select_one_onecol( - table="events", + table=table, keyvalues={"room_id": room_id}, retcol="COUNT(*)", desc="test_purge_room", From e4ec82ce0fdd17f912fad76defd53ec99f782528 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Fri, 8 Nov 2019 09:50:48 +0000 Subject: [PATCH 11/11] Move type annotation into docstring --- synapse/storage/data_stores/main/events.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/synapse/storage/data_stores/main/events.py b/synapse/storage/data_stores/main/events.py index 946823876a31..878f7568a63d 100644 --- a/synapse/storage/data_stores/main/events.py +++ b/synapse/storage/data_stores/main/events.py @@ -19,7 +19,6 @@ import logging from collections import Counter as c_counter, OrderedDict, namedtuple from functools import wraps -from typing import Collection from six import iteritems, text_type from six.moves import range @@ -1737,7 +1736,7 @@ def _purge_room_txn(self, txn, room_id): return state_groups def purge_unreferenced_state_groups( - self, room_id: str, state_groups_to_delete: Collection[int] + self, room_id: str, state_groups_to_delete ) -> defer.Deferred: """Deletes no longer referenced state groups and de-deltas any state groups that reference them. @@ -1745,7 +1744,8 @@ def purge_unreferenced_state_groups( Args: room_id: The room the state groups belong to (must all be in the same room). - state_groups_to_delete: Set of all state groups to delete. + state_groups_to_delete (Collection[int]): Set of all state groups + to delete. """ return self.runInteraction(