diff --git a/changelog.d/13545.misc b/changelog.d/13545.misc new file mode 100644 index 000000000000..1cdbef179e94 --- /dev/null +++ b/changelog.d/13545.misc @@ -0,0 +1 @@ +Update metrics to track `/messages` response time by room size. diff --git a/synapse/rest/client/room.py b/synapse/rest/client/room.py index d29417fafc63..13bc9482c535 100644 --- a/synapse/rest/client/room.py +++ b/synapse/rest/client/room.py @@ -16,6 +16,7 @@ """ This module contains REST servlets to do with rooms: /rooms/ """ import logging import re +from enum import Enum from typing import TYPE_CHECKING, Awaitable, Dict, List, Optional, Tuple from urllib import parse as urlparse @@ -48,6 +49,7 @@ parse_strings_from_args, ) from synapse.http.site import SynapseRequest +from synapse.logging.context import make_deferred_yieldable, run_in_background from synapse.logging.opentracing import set_tag from synapse.rest.client._base import client_patterns from synapse.rest.client.transactions import HttpTransactionCache @@ -62,6 +64,33 @@ logger = logging.getLogger(__name__) + +class _RoomSize(Enum): + """ + Enum to differentiate sizes of rooms. This is a pretty good approximation + about how hard it will be to get events in the room. We could also look at + room "complexity". + """ + + # This doesn't necessarily mean the room is a DM, just that there is a DM + # amount of people there. + DM_SIZE = "direct_message_size" + SMALL = "small" + SUBSTANTIAL = "substantial" + LARGE = "large" + + @staticmethod + def from_member_count(member_count: int) -> "_RoomSize": + if member_count <= 2: + return _RoomSize.DM_SIZE + elif member_count < 100: + return _RoomSize.SMALL + elif member_count < 1000: + return _RoomSize.SUBSTANTIAL + else: + return _RoomSize.LARGE + + # This is an extra metric on top of `synapse_http_server_response_time_seconds` # which times the same sort of thing but this one allows us to see values # greater than 10s. We use a separate dedicated histogram with its own buckets @@ -70,7 +99,11 @@ messsages_response_timer = Histogram( "synapse_room_message_list_rest_servlet_response_time_seconds", "sec", - [], + # We have a label for room size so we can try to see a more realistic + # picture of /messages response time for bigger rooms. We don't want the + # tiny rooms that can always respond fast skewing our results when we're trying + # to optimize the bigger cases. + ["room_size"], buckets=( 0.005, 0.01, @@ -587,14 +620,26 @@ class RoomMessageListRestServlet(RestServlet): def __init__(self, hs: "HomeServer"): super().__init__() self._hs = hs + self.clock = hs.get_clock() self.pagination_handler = hs.get_pagination_handler() self.auth = hs.get_auth() self.store = hs.get_datastores().main - @messsages_response_timer.time() async def on_GET( self, request: SynapseRequest, room_id: str ) -> Tuple[int, JsonDict]: + processing_start_time = self.clock.time_msec() + # Fire off and hope that we get a result by the end. + # + # We're using the mypy type ignore comment because the `@cached` + # decorator on `get_number_joined_users_in_room` doesn't play well with + # the type system. Maybe in the future, it can use some ParamSpec + # wizardry to fix it up. + room_member_count_deferred = run_in_background( # type: ignore[call-arg] + self.store.get_number_joined_users_in_room, + room_id, # type: ignore[arg-type] + ) + requester = await self.auth.get_user_by_req(request, allow_guest=True) pagination_config = await PaginationConfig.from_request( self.store, request, default_limit=10 @@ -625,6 +670,12 @@ async def on_GET( event_filter=event_filter, ) + processing_end_time = self.clock.time_msec() + room_member_count = await make_deferred_yieldable(room_member_count_deferred) + messsages_response_timer.labels( + room_size=_RoomSize.from_member_count(room_member_count) + ).observe((processing_start_time - processing_end_time) / 1000) + return 200, msgs