From ffcd5fb0492c2f857603620c008bee30284c5fd7 Mon Sep 17 00:00:00 2001 From: Denis Kasak Date: Tue, 1 Feb 2022 17:22:31 +0100 Subject: [PATCH 01/12] Bail on generating URL previews for audio and video content types. --- synapse/http/client.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/synapse/http/client.py b/synapse/http/client.py index 743a7ffcb159..a70a79abc303 100644 --- a/synapse/http/client.py +++ b/synapse/http/client.py @@ -726,6 +726,14 @@ async def get_file( HTTPStatus.BAD_GATEWAY, "Got error %d" % (response.code,), Codes.UNKNOWN ) + if b"Content-Type" in resp_headers: + content_type = resp_headers[b"Content-Type"][0] + if _is_av_media(content_type): + raise SynapseError( + HTTPStatus.BAD_GATEWAY, + ("Unsupported content type for URL previews: %r" % content_type), + ) + # TODO: if our Content-Type is HTML or something, just read the first # N bytes into RAM rather than saving it all to disk only to read it # straight back in again @@ -762,6 +770,12 @@ async def get_file( ) +def _is_av_media(content_type: bytes) -> bool: + return content_type.lower().startswith( + b"video/" + ) or content_type.lower().startswith(b"audio/") + + def _timeout_to_request_timed_out_error(f: Failure): if f.check(twisted_error.TimeoutError, twisted_error.ConnectingCancelledError): # The TCP connection has its own timeout (set by the 'connectTimeout' param From ac49e720006ed1771c1ebe8f3e27f0d28d5d0d4e Mon Sep 17 00:00:00 2001 From: Denis Kasak Date: Tue, 1 Feb 2022 17:23:18 +0100 Subject: [PATCH 02/12] Add test. --- tests/rest/media/v1/test_url_preview.py | 72 +++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py index 53f618621305..c060211497d4 100644 --- a/tests/rest/media/v1/test_url_preview.py +++ b/tests/rest/media/v1/test_url_preview.py @@ -243,6 +243,78 @@ def test_non_ascii_preview_httpequiv(self): self.assertEqual(channel.code, 200) self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430") + def test_video_rejected(self): + self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")] + + end_content = b"anything" + + channel = self.make_request( + "GET", + "preview_url?url=http://matrix.org", + shorthand=False, + await_result=False, + ) + self.pump() + + client = self.reactor.tcpClients[0][2].buildProtocol(None) + server = AccumulatingProtocol() + server.makeConnection(FakeTransport(client, self.reactor)) + client.makeConnection(FakeTransport(server, self.reactor)) + client.dataReceived( + ( + b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n" + b"Content-Type: video/mp4\r\n\r\n" + ) + % (len(end_content)) + + end_content + ) + + self.pump() + self.assertEqual(channel.code, 502) + self.assertEqual( + channel.json_body, + { + "errcode": "M_UNKNOWN", + "error": "Unsupported content type for URL previews: b'video/mp4'", + }, + ) + + def test_audio_rejected(self): + self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")] + + end_content = b"anything" + + channel = self.make_request( + "GET", + "preview_url?url=http://matrix.org", + shorthand=False, + await_result=False, + ) + self.pump() + + client = self.reactor.tcpClients[0][2].buildProtocol(None) + server = AccumulatingProtocol() + server.makeConnection(FakeTransport(client, self.reactor)) + client.makeConnection(FakeTransport(server, self.reactor)) + client.dataReceived( + ( + b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n" + b"Content-Type: audio/aac\r\n\r\n" + ) + % (len(end_content)) + + end_content + ) + + self.pump() + self.assertEqual(channel.code, 502) + self.assertEqual( + channel.json_body, + { + "errcode": "M_UNKNOWN", + "error": "Unsupported content type for URL previews: b'audio/aac'", + }, + ) + def test_non_ascii_preview_content_type(self): self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")] From 59b39cea8c037218e1c762854761c912ae374aec Mon Sep 17 00:00:00 2001 From: Denis Kasak Date: Tue, 8 Feb 2022 12:15:39 +0100 Subject: [PATCH 03/12] Add changelog. --- changelog.d/11936.bugfix | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/11936.bugfix diff --git a/changelog.d/11936.bugfix b/changelog.d/11936.bugfix new file mode 100644 index 000000000000..e468c756d0bb --- /dev/null +++ b/changelog.d/11936.bugfix @@ -0,0 +1 @@ +Don't attempt URL previews for URLs with audio/* or video/* content types. This prevents Synapse from making useless longer-lived connections to streaming media servers. From 86902770d6cdbcfcfcb462025ae5794f2113f603 Mon Sep 17 00:00:00 2001 From: Denis Kasak Date: Tue, 8 Feb 2022 13:44:03 +0000 Subject: [PATCH 04/12] Fix changelog style per review. Co-authored-by: Patrick Cloke --- changelog.d/11936.bugfix | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changelog.d/11936.bugfix b/changelog.d/11936.bugfix index e468c756d0bb..2f8efa960494 100644 --- a/changelog.d/11936.bugfix +++ b/changelog.d/11936.bugfix @@ -1 +1 @@ -Don't attempt URL previews for URLs with audio/* or video/* content types. This prevents Synapse from making useless longer-lived connections to streaming media servers. +Don't attempt URL previews for URLs with `audio/*` or `video/*` content types. This prevents Synapse from making useless longer-lived connections to streaming media servers. From fa58bccc036fc6fb586782163631c6473ed74004 Mon Sep 17 00:00:00 2001 From: Denis Kasak Date: Tue, 8 Feb 2022 16:10:55 +0100 Subject: [PATCH 05/12] Specify allowed content types for get_file via a predicate. --- synapse/http/client.py | 17 +++++++++-------- synapse/rest/media/v1/preview_url_resource.py | 7 +++++++ 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/synapse/http/client.py b/synapse/http/client.py index a70a79abc303..b430c372fe4c 100644 --- a/synapse/http/client.py +++ b/synapse/http/client.py @@ -20,6 +20,7 @@ TYPE_CHECKING, Any, BinaryIO, + Callable, Dict, Iterable, List, @@ -693,12 +694,18 @@ async def get_file( output_stream: BinaryIO, max_size: Optional[int] = None, headers: Optional[RawHeaders] = None, + is_allowed_content_type: Optional[Callable[[bytes], bool]] = None, ) -> Tuple[int, Dict[bytes, List[bytes]], str, int]: """GETs a file from a given URL Args: url: The URL to GET output_stream: File to write the response body to. headers: A map from header name to a list of values for that header + is_allowed_content_type: A predicate to determine whether the + content type of the file we're downloading is allowed. If set and + it evaluates to False when called with the content type, the + request will be terminated before completing the download by + raising SynapseError. Returns: A tuple of the file length, dict of the response headers, absolute URI of the response and HTTP response code. @@ -726,9 +733,9 @@ async def get_file( HTTPStatus.BAD_GATEWAY, "Got error %d" % (response.code,), Codes.UNKNOWN ) - if b"Content-Type" in resp_headers: + if is_allowed_content_type and b"Content-Type" in resp_headers: content_type = resp_headers[b"Content-Type"][0] - if _is_av_media(content_type): + if not is_allowed_content_type(content_type): raise SynapseError( HTTPStatus.BAD_GATEWAY, ("Unsupported content type for URL previews: %r" % content_type), @@ -770,12 +777,6 @@ async def get_file( ) -def _is_av_media(content_type: bytes) -> bool: - return content_type.lower().startswith( - b"video/" - ) or content_type.lower().startswith(b"audio/") - - def _timeout_to_request_timed_out_error(f: Failure): if f.check(twisted_error.TimeoutError, twisted_error.ConnectingCancelledError): # The TCP connection has its own timeout (set by the 'connectTimeout' param diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index efd84ced8f54..65137ca87400 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -403,6 +403,7 @@ async def _download_url(self, url: str, output_stream: BinaryIO) -> DownloadResu output_stream=output_stream, max_size=self.max_spider_size, headers={"Accept-Language": self.url_preview_accept_language}, + is_allowed_content_type=_is_not_av_media, ) except SynapseError: # Pass SynapseErrors through directly, so that the servlet @@ -761,3 +762,9 @@ def _is_html(content_type: str) -> bool: def _is_json(content_type: str) -> bool: return content_type.lower().startswith("application/json") + + +def _is_not_av_media(content_type: bytes) -> bool: + return not content_type.lower().startswith( + b"video/" + ) and not content_type.lower().startswith(b"audio/") From 5a97a0839ce75e5591d7b35c434c0f2f5ccb3c69 Mon Sep 17 00:00:00 2001 From: Denis Kasak Date: Wed, 9 Feb 2022 10:19:23 +0000 Subject: [PATCH 06/12] Improve readability of `_is_not_av_media`. Co-authored-by: Patrick Cloke --- synapse/rest/media/v1/preview_url_resource.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 65137ca87400..cab0962eb63c 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -765,6 +765,8 @@ def _is_json(content_type: str) -> bool: def _is_not_av_media(content_type: bytes) -> bool: - return not content_type.lower().startswith( + """Returns False if the content type is audio or video.""" + content_type = content_type.lower() + return not content_type.startswith( b"video/" - ) and not content_type.lower().startswith(b"audio/") + ) and not content_type.startswith(b"audio/") From b035155505d3d4e76d21affcce618cd4f67a6bf3 Mon Sep 17 00:00:00 2001 From: Denis Kasak Date: Wed, 9 Feb 2022 11:48:23 +0100 Subject: [PATCH 07/12] Make error message more general. --- synapse/http/client.py | 5 ++++- tests/rest/media/v1/test_url_preview.py | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/synapse/http/client.py b/synapse/http/client.py index b430c372fe4c..49a2f3fb39ed 100644 --- a/synapse/http/client.py +++ b/synapse/http/client.py @@ -738,7 +738,10 @@ async def get_file( if not is_allowed_content_type(content_type): raise SynapseError( HTTPStatus.BAD_GATEWAY, - ("Unsupported content type for URL previews: %r" % content_type), + ( + "Requested file's content type not allowed for this operation: %s" + % content_type.decode(errors="ignore") + ), ) # TODO: if our Content-Type is HTML or something, just read the first diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py index c060211497d4..da2c53326019 100644 --- a/tests/rest/media/v1/test_url_preview.py +++ b/tests/rest/media/v1/test_url_preview.py @@ -275,7 +275,7 @@ def test_video_rejected(self): channel.json_body, { "errcode": "M_UNKNOWN", - "error": "Unsupported content type for URL previews: b'video/mp4'", + "error": "Requested file's content type not allowed for this operation: video/mp4", }, ) @@ -311,7 +311,7 @@ def test_audio_rejected(self): channel.json_body, { "errcode": "M_UNKNOWN", - "error": "Unsupported content type for URL previews: b'audio/aac'", + "error": "Requested file's content type not allowed for this operation: audio/aac", }, ) From 65dd4f1f0cc937bdc7c6637865896bebb02c4221 Mon Sep 17 00:00:00 2001 From: Denis Kasak Date: Wed, 9 Feb 2022 11:55:57 +0100 Subject: [PATCH 08/12] Formatting. --- synapse/rest/media/v1/preview_url_resource.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index cab0962eb63c..b52e92305c60 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -767,6 +767,6 @@ def _is_json(content_type: str) -> bool: def _is_not_av_media(content_type: bytes) -> bool: """Returns False if the content type is audio or video.""" content_type = content_type.lower() - return not content_type.startswith( - b"video/" - ) and not content_type.startswith(b"audio/") + return not content_type.startswith(b"video/") and not content_type.startswith( + b"audio/" + ) From 2f3bb1f55aab57c48e8bce97b170fcf9615f9df8 Mon Sep 17 00:00:00 2001 From: Denis Kasak Date: Thu, 10 Feb 2022 11:53:00 +0100 Subject: [PATCH 09/12] Switch to using an allow list for URL previewable content types. --- synapse/rest/media/v1/preview_url_resource.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index b52e92305c60..9dd7425929f6 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -403,7 +403,7 @@ async def _download_url(self, url: str, output_stream: BinaryIO) -> DownloadResu output_stream=output_stream, max_size=self.max_spider_size, headers={"Accept-Language": self.url_preview_accept_language}, - is_allowed_content_type=_is_not_av_media, + is_allowed_content_type=_is_previewable, ) except SynapseError: # Pass SynapseErrors through directly, so that the servlet @@ -764,9 +764,14 @@ def _is_json(content_type: str) -> bool: return content_type.lower().startswith("application/json") -def _is_not_av_media(content_type: bytes) -> bool: - """Returns False if the content type is audio or video.""" +def _is_previewable(content_type: bytes) -> bool: + """Returns True for content types for which we will perform URL preview and False + otherwise.""" + content_type = content_type.lower() - return not content_type.startswith(b"video/") and not content_type.startswith( - b"audio/" + return ( + content_type.startswith(b"text/html") + or content_type.startswith(b"application/xhtml") + or content_type.startswith(b"image/") + or content_type.startswith(b"application/json") ) From 4376a705f3b4fbd1c637b9e1ab8743e311926197 Mon Sep 17 00:00:00 2001 From: Denis Kasak Date: Thu, 10 Feb 2022 14:57:36 +0100 Subject: [PATCH 10/12] Reuse existing content type helpers. --- synapse/http/client.py | 4 ++-- synapse/rest/media/v1/preview_url_resource.py | 10 ++-------- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/synapse/http/client.py b/synapse/http/client.py index 49a2f3fb39ed..abe827a85cf4 100644 --- a/synapse/http/client.py +++ b/synapse/http/client.py @@ -694,7 +694,7 @@ async def get_file( output_stream: BinaryIO, max_size: Optional[int] = None, headers: Optional[RawHeaders] = None, - is_allowed_content_type: Optional[Callable[[bytes], bool]] = None, + is_allowed_content_type: Optional[Callable[[str], bool]] = None, ) -> Tuple[int, Dict[bytes, List[bytes]], str, int]: """GETs a file from a given URL Args: @@ -735,7 +735,7 @@ async def get_file( if is_allowed_content_type and b"Content-Type" in resp_headers: content_type = resp_headers[b"Content-Type"][0] - if not is_allowed_content_type(content_type): + if not is_allowed_content_type(content_type.decode("ascii")): raise SynapseError( HTTPStatus.BAD_GATEWAY, ( diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 9dd7425929f6..8d3d1e54dc9f 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -764,14 +764,8 @@ def _is_json(content_type: str) -> bool: return content_type.lower().startswith("application/json") -def _is_previewable(content_type: bytes) -> bool: +def _is_previewable(content_type: str) -> bool: """Returns True for content types for which we will perform URL preview and False otherwise.""" - content_type = content_type.lower() - return ( - content_type.startswith(b"text/html") - or content_type.startswith(b"application/xhtml") - or content_type.startswith(b"image/") - or content_type.startswith(b"application/json") - ) + return _is_html(content_type) or _is_media(content_type) or _is_json(content_type) From 1d031242eb7a7ac466dd6f4b8a2677abb6511621 Mon Sep 17 00:00:00 2001 From: Denis Kasak Date: Thu, 10 Feb 2022 15:04:26 +0100 Subject: [PATCH 11/12] Reword changelog to reflect change of approach. --- changelog.d/11936.bugfix | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changelog.d/11936.bugfix b/changelog.d/11936.bugfix index 2f8efa960494..bc149f280106 100644 --- a/changelog.d/11936.bugfix +++ b/changelog.d/11936.bugfix @@ -1 +1 @@ -Don't attempt URL previews for URLs with `audio/*` or `video/*` content types. This prevents Synapse from making useless longer-lived connections to streaming media servers. +Implement an allow list of content types for which we will attempt to preview a URL. This prevents Synapse from making useless longer-lived connections to streaming media servers. From d6edc9bc584906ccb236462d282be2e8294b68e0 Mon Sep 17 00:00:00 2001 From: Denis Kasak Date: Thu, 10 Feb 2022 15:18:28 +0100 Subject: [PATCH 12/12] Small refactoring. --- synapse/http/client.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/synapse/http/client.py b/synapse/http/client.py index abe827a85cf4..d617055617a6 100644 --- a/synapse/http/client.py +++ b/synapse/http/client.py @@ -734,13 +734,13 @@ async def get_file( ) if is_allowed_content_type and b"Content-Type" in resp_headers: - content_type = resp_headers[b"Content-Type"][0] - if not is_allowed_content_type(content_type.decode("ascii")): + content_type = resp_headers[b"Content-Type"][0].decode("ascii") + if not is_allowed_content_type(content_type): raise SynapseError( HTTPStatus.BAD_GATEWAY, ( "Requested file's content type not allowed for this operation: %s" - % content_type.decode(errors="ignore") + % content_type ), )