Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Add rooms.room_version column #6729

Merged
merged 13 commits into from
Jan 27, 2020
Merged
73 changes: 71 additions & 2 deletions synapse/storage/data_stores/main/room.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,10 @@

from synapse.api.constants import EventTypes
from synapse.api.errors import StoreError
from synapse.api.room_versions import RoomVersion
from synapse.api.room_versions import RoomVersion, RoomVersions
from synapse.storage._base import SQLBaseStore
from synapse.storage.data_stores.main.search import SearchStore
from synapse.storage.database import Database
from synapse.storage.database import Database, LoggingTransaction
from synapse.types import ThirdPartyInstanceID
from synapse.util.caches.descriptors import cached, cachedInlineCallbacks

Expand Down Expand Up @@ -612,6 +612,7 @@ def _quarantine_media_txn(

class RoomBackgroundUpdateStore(SQLBaseStore):
REMOVE_TOMESTONED_ROOMS_BG_UPDATE = "remove_tombstoned_rooms_from_directory"
ADD_ROOMS_ROOM_VERSION_COLUMN = "add_rooms_room_version_column"

def __init__(self, database: Database, db_conn, hs):
super(RoomBackgroundUpdateStore, self).__init__(database, db_conn, hs)
Expand All @@ -627,6 +628,11 @@ def __init__(self, database: Database, db_conn, hs):
self._remove_tombstoned_rooms_from_directory,
)

self.db.updates.register_background_update_handler(
self.ADD_ROOMS_ROOM_VERSION_COLUMN,
self._background_add_rooms_room_version_column,
)

@defer.inlineCallbacks
def _background_insert_retention(self, progress, batch_size):
"""Retrieves a list of all rooms within a range and inserts an entry for each of
Expand Down Expand Up @@ -695,6 +701,69 @@ def _background_insert_retention_txn(txn):

defer.returnValue(batch_size)

async def _background_add_rooms_room_version_column(
self, progress: dict, batch_size: int
):
"""Background update to go and add room version inforamtion to `rooms`
table from `current_state_events` table.
"""

last_room_id = progress.get("room_id", "")

def _background_add_rooms_room_version_column_txn(txn: LoggingTransaction):
sql = """
SELECT room_id, json FROM current_state_events
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this going to table-scan CSE? shouldn't we base it on rooms instead?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's a unique index on (room_id, type, state_key) so it just walks the index.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

right, but it's going to need to walk the entire index? Or am I misunderstanding how btree indexes work?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, it'll iterate over each room and then look up the create event for each within that, AIUI.

The query plan for it on jki.re seems to show it doing that:

 Limit  (cost=1.11..12920.09 rows=1 width=1090)
   ->  Nested Loop  (cost=1.11..12920.09 rows=1 width=1090)
         ->  Index Scan using current_state_events_room_id_type_state_key_key on current_state_events  (cost=0.55..12846.65 rows=16 width=64)
               Index Cond: ((room_id > ''::text) AND (type = 'm.room.create'::text) AND (state_key = ''::text))
         ->  Index Scan using event_json_event_id_key on event_json  (cost=0.56..4.58 rows=1 width=1125)
               Index Cond: (event_id = current_state_events.event_id)
               Filter: (current_state_events.room_id = room_id)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

right, my concern is the index scan over that entire index, which is presumably huge on big servers, and it seems like a bunch of unnecessary I/O when we don't care about all the other events in that index.

this isn't a blocking comment; if you think it's going to be ok then that's fine. it was just a thought.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm reasonably sure it'll be fine. It will efficiently jump straight to the correct room ID, and jumping to the next room ID is fast (it won't have to iterate over all the state in the room, as it uses the tree structure to do it), so it should just be looking up the next room ID in the btree index, using the index to pull out the create event ID, repeating.

INNER JOIN event_json USING (room_id, event_id)
WHERE room_id > ? AND type = 'm.room.create' AND state_key = ''
ORDER BY room_id
LIMIT ?
"""

txn.execute(sql, (last_room_id, batch_size))

updates = []
for room_id, event_json in txn:
event_dict = json.loads(event_json)
room_version_id = event_dict.get("content", {}).get(
"room_version", RoomVersions.V1.identifier
)

creator = event_dict.get("content").get("creator")

updates.append((room_id, creator, room_version_id))

if not updates:
return True

new_last_room_id = ""
for room_id, creator, room_version_id in updates:
self.db.simple_upsert_txn(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

... we're expecting to find rooms which aren't currently in rooms here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nooooooooooooooooo not really. But equally I don't fancy betting on that fact, given we don't insert it in the same places we insert current_state_events. If the rooms column doesn't exist when there is an entry in current state events then that could end up leading to badness when handling events related to it so I'm inclined to be a bit paranoid.

I could add a comment

txn,
table="rooms",
keyvalues={"room_id": room_id},
values={"room_version": room_version_id},
insertion_values={"is_public": False, "creator": creator},
)
new_last_room_id = room_id

self.db.updates._background_update_progress_txn(
txn, self.ADD_ROOMS_ROOM_VERSION_COLUMN, {"room_id": new_last_room_id}
)

return False

end = await self.db.runInteraction(
"_background_add_rooms_room_version_column",
_background_add_rooms_room_version_column_txn,
)

if end:
await self.db.updates._end_background_update(
self.ADD_ROOMS_ROOM_VERSION_COLUMN
)

return batch_size

async def _remove_tombstoned_rooms_from_directory(
self, progress, batch_size
) -> int:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,7 @@
-- `current_state_events` so that we can delete stale entries from it without
-- losing the information.
ALTER TABLE rooms ADD COLUMN room_version TEXT;


INSERT into background_updates (update_name, progress_json)
VALUES ('add_rooms_room_version_column', '{}');