From 84c4a530a8007698b677771cbfaa4ae25d7cdb95 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Wed, 8 Sep 2021 14:45:43 -0400 Subject: [PATCH 01/19] Factor out calculating a description from an HTML tree. --- synapse/rest/media/v1/preview_url_resource.py | 99 ++++++++++++------- 1 file changed, 66 insertions(+), 33 deletions(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index f108da05db55..a7b58b1556c8 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -668,7 +668,18 @@ def _attempt_calc_og(body_attempt: Union[bytes, str]) -> Dict[str, Optional[str] def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]: - # suck our tree into lxml and define our OG response. + """ + Calculate metadata for an HTML document. + + This uses lxml to search the HTML document for Open Graph data. + + Args: + tree: The parsed HTML document. + media_url: The URI used to download the body. + + Returns: + The Open Graph response as a dictionary. + """ # if we see any image URLs in the OG response, then spider them # (although the client could choose to do this by asking for previews of those @@ -742,35 +753,7 @@ def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]: if meta_description: og["og:description"] = meta_description[0] else: - # grab any text nodes which are inside the tag... - # unless they are within an HTML5 semantic markup tag... - #
,