From 915cd6ab9900e64f63962ec881d7cae7e62fe52e Mon Sep 17 00:00:00 2001 From: Andrew Ryan Date: Mon, 14 Feb 2022 10:47:21 -0800 Subject: [PATCH 1/6] Use bot user agent for openGraph queries We found that some websites return different Open Graph metadata depending on the User-Agent header of the request. To address this, including "(bot)" in the User-Agent string may help such websites return the expected metadata. When testing Twitter, we found that the correct metadata is returned for the (bot) User-Agent but not for the default User-Agent. This change also removes the Twitter status URL schemes from providers.json, because with the User-Agent fix Twitter now returns the expected Open Graph metadata. Signed-off-by: Andrew Ryan --- changelog.d/11985.misc | 1 + synapse/res/providers.json | 4 +--- synapse/rest/media/v1/preview_url_resource.py | 7 ++++++- 3 files changed, 8 insertions(+), 4 deletions(-) create mode 100644 changelog.d/11985.misc diff --git a/changelog.d/11985.misc b/changelog.d/11985.misc new file mode 100644 index 000000000000..1364a2b85421 --- /dev/null +++ b/changelog.d/11985.misc @@ -0,0 +1 @@ +Use bot user agent for openGraph queries. 
diff --git a/synapse/res/providers.json b/synapse/res/providers.json index f1838f955901..7b9958e45464 100644 --- a/synapse/res/providers.json +++ b/synapse/res/providers.json @@ -5,8 +5,6 @@ "endpoints": [ { "schemes": [ - "https://twitter.com/*/status/*", - "https://*.twitter.com/*/status/*", "https://twitter.com/*/moments/*", "https://*.twitter.com/*/moments/*" ], @@ -14,4 +12,4 @@ } ] } -] \ No newline at end of file +] diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 8d3d1e54dc9f..ec9a0380bb7b 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -402,7 +402,12 @@ async def _download_url(self, url: str, output_stream: BinaryIO) -> DownloadResu url, output_stream=output_stream, max_size=self.max_spider_size, - headers={"Accept-Language": self.url_preview_accept_language}, + headers={ + b"Accept-Language": self.url_preview_accept_language, + b"User-Agent": [ + "Synapse (bot; +https://github.com/matrix-org/synapse)" + ], + }, is_allowed_content_type=_is_previewable, ) except SynapseError: From 391c58b002af83d111140149cb15dbb1d97c6816 Mon Sep 17 00:00:00 2001 From: Andrew Ryan Date: Tue, 15 Feb 2022 08:26:24 -0800 Subject: [PATCH 2/6] update comment --- synapse/rest/media/v1/preview_url_resource.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index ec9a0380bb7b..507509bf0eb3 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -404,6 +404,9 @@ async def _download_url(self, url: str, output_stream: BinaryIO) -> DownloadResu max_size=self.max_spider_size, headers={ b"Accept-Language": self.url_preview_accept_language, + # Use a custom user agent for the preview because some sites will only return + # openGraph metadata to crawler user agents. 
We specifically omit the version + # string to avoid leaking the this information. b"User-Agent": [ "Synapse (bot; +https://github.com/matrix-org/synapse)" ], From d970531912c15881026f6398d228a6a02a9ce3e2 Mon Sep 17 00:00:00 2001 From: AndrewRyanChama <89478935+AndrewRyanChama@users.noreply.github.com> Date: Thu, 17 Feb 2022 11:29:17 -0800 Subject: [PATCH 3/6] Update changelog.d/11985.misc Co-authored-by: Patrick Cloke --- changelog.d/11985.misc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changelog.d/11985.misc b/changelog.d/11985.misc index 1364a2b85421..3615c21caa5e 100644 --- a/changelog.d/11985.misc +++ b/changelog.d/11985.misc @@ -1 +1 @@ -Use bot user agent for openGraph queries. +Use a bot User-Agent for URL preview queries. From da93219b499fadc258258e1d63c529c7e9d05e3f Mon Sep 17 00:00:00 2001 From: AndrewRyanChama <89478935+AndrewRyanChama@users.noreply.github.com> Date: Thu, 17 Feb 2022 11:31:20 -0800 Subject: [PATCH 4/6] Update synapse/rest/media/v1/preview_url_resource.py update comment wording Co-authored-by: Patrick Cloke --- synapse/rest/media/v1/preview_url_resource.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 507509bf0eb3..c08b60d10a09 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -405,8 +405,8 @@ async def _download_url(self, url: str, output_stream: BinaryIO) -> DownloadResu headers={ b"Accept-Language": self.url_preview_accept_language, # Use a custom user agent for the preview because some sites will only return - # openGraph metadata to crawler user agents. We specifically omit the version - # string to avoid leaking the this information. + # Open Graph metadata to crawler user agents. Omit the Synapse version + # string to avoid leaking information. 
b"User-Agent": [ "Synapse (bot; +https://github.com/matrix-org/synapse)" ], From 2c7f8d3a09dc61fb1243d8cb12b3f899c3e1d582 Mon Sep 17 00:00:00 2001 From: Andrew Ryan Date: Fri, 18 Feb 2022 19:26:27 -0800 Subject: [PATCH 5/6] update changelog --- changelog.d/11985.feature | 1 + changelog.d/11985.misc | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 changelog.d/11985.feature delete mode 100644 changelog.d/11985.misc diff --git a/changelog.d/11985.feature b/changelog.d/11985.feature new file mode 100644 index 000000000000..e2770d22a2bb --- /dev/null +++ b/changelog.d/11985.feature @@ -0,0 +1 @@ +Fetch images when previewing Twitter URLs. Contributed by @AndrewRyanChama diff --git a/changelog.d/11985.misc b/changelog.d/11985.misc deleted file mode 100644 index 3615c21caa5e..000000000000 --- a/changelog.d/11985.misc +++ /dev/null @@ -1 +0,0 @@ -Use a bot User-Agent for URL preview queries. From 4da66526e88bdb5b540f821acd83e3302f37dd0c Mon Sep 17 00:00:00 2001 From: Andrew Ryan Date: Mon, 21 Feb 2022 05:45:55 -0800 Subject: [PATCH 6/6] update changelog --- changelog.d/11985.feature | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changelog.d/11985.feature b/changelog.d/11985.feature index e2770d22a2bb..120d888a4914 100644 --- a/changelog.d/11985.feature +++ b/changelog.d/11985.feature @@ -1 +1 @@ -Fetch images when previewing Twitter URLs. Contributed by @AndrewRyanChama +Fetch images when previewing Twitter URLs. Contributed by @AndrewRyanChama.