From 84c4a530a8007698b677771cbfaa4ae25d7cdb95 Mon Sep 17 00:00:00 2001
From: Patrick Cloke
Date: Wed, 8 Sep 2021 14:45:43 -0400
Subject: [PATCH 01/19] Factor out calculating a description from an HTML tree.
---
synapse/rest/media/v1/preview_url_resource.py | 99 ++++++++++++-------
1 file changed, 66 insertions(+), 33 deletions(-)
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index f108da05db55..a7b58b1556c8 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -668,7 +668,18 @@ def _attempt_calc_og(body_attempt: Union[bytes, str]) -> Dict[str, Optional[str]
def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
- # suck our tree into lxml and define our OG response.
+ """
+ Calculate metadata for an HTML document.
+
+ This uses lxml to search the HTML document for Open Graph data.
+
+ Args:
+ tree: The parsed HTML document.
+ media_uri: The URI used to download the body.
+
+ Returns:
+ The Open Graph response as a dictionary.
+ """
# if we see any image URLs in the OG response, then spider them
# (although the client could choose to do this by asking for previews of those
@@ -742,35 +753,7 @@ def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
if meta_description:
og["og:description"] = meta_description[0]
else:
- # grab any text nodes which are inside the <body/> tag...
- # unless they are within an HTML5 semantic markup tag...
- # <header/>, <nav/>, <aside/>, <footer/>
- # ...or if they are within a <script/> or <style/> tag.
- # This is a very very very coarse approximation to a plain text
- # render of the page.
-
- # We don't just use XPATH here as that is slow on some machines.
-
- from lxml import etree
-
- TAGS_TO_REMOVE = (
- "header",
- "nav",
- "aside",
- "footer",
- "script",
- "noscript",
- "style",
- etree.Comment,
- )
-
- # Split all the text nodes into paragraphs (by splitting on new
- # lines)
- text_nodes = (
- re.sub(r"\s+", "\n", el).strip()
- for el in _iterate_over_text(tree.find("body"), *TAGS_TO_REMOVE)
- )
- og["og:description"] = summarize_paragraphs(text_nodes)
+ og["og:description"] = _calc_description(tree)
elif og["og:description"]:
# This must be a non-empty string at this point.
assert isinstance(og["og:description"], str)
@@ -781,6 +764,46 @@ def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
return og
+def _calc_description(tree: "etree.Element") -> Optional[str]:
+ """
+ Calculate a text description based on an HTML document.
+
+ Grabs any text nodes which are inside the