From 947bae27903c6efd0942a40bbdff46702cf829e7 Mon Sep 17 00:00:00 2001 From: Andrew Ryan Date: Mon, 14 Feb 2022 07:39:04 -0800 Subject: [PATCH] Use Facebook user agent for openGraph queries We found that some websites return Open Graph information based on the user agent. Since Facebook is the creator of Open Graph, using the Facebook user agent when requesting the Open Graph metadata should work in the widest variety of situations. https://developers.facebook.com/docs/sharing/webmasters/#user-agent Signed-off-by: Andrew Ryan --- changelog.d/11985.misc | 1 + synapse/res/providers.json | 4 +--- synapse/rest/media/v1/preview_url_resource.py | 10 +++++++--- 3 files changed, 9 insertions(+), 6 deletions(-) create mode 100644 changelog.d/11985.misc diff --git a/changelog.d/11985.misc b/changelog.d/11985.misc new file mode 100644 index 000000000000..dc39eeda1826 --- /dev/null +++ b/changelog.d/11985.misc @@ -0,0 +1 @@ +Use Facebook user agent for openGraph queries. diff --git a/synapse/res/providers.json b/synapse/res/providers.json index f1838f955901..7b9958e45464 100644 --- a/synapse/res/providers.json +++ b/synapse/res/providers.json @@ -5,8 +5,6 @@ "endpoints": [ { "schemes": [ - "https://twitter.com/*/status/*", - "https://*.twitter.com/*/status/*", "https://twitter.com/*/moments/*", "https://*.twitter.com/*/moments/*" ], @@ -14,4 +12,4 @@ } ] } -] \ No newline at end of file +] diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 8d3d1e54dc9f..1ead89df1b0a 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -326,8 +326,9 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes: # Compile the Open Graph response by using the scraped # information from the HTML and overlaying any information - # from the oEmbed response. - og = {**og_from_html, **og_from_oembed} + # from the oEmbed response. 
og tags from the original html + # have priority over oEmbed data. + og = {**og_from_oembed, **og_from_html} await self._precache_image_url(user, media_info, og) else: @@ -402,7 +403,10 @@ async def _download_url(self, url: str, output_stream: BinaryIO) -> DownloadResu url, output_stream=output_stream, max_size=self.max_spider_size, - headers={"Accept-Language": self.url_preview_accept_language}, + headers={ + "Accept-Language": self.url_preview_accept_language, + b"User-Agent": ["Synapse (bot)"], + }, is_allowed_content_type=_is_previewable, ) except SynapseError: