
Commit

Check missing links for engage pages
carkod committed Feb 12, 2021
1 parent 2bca046 commit 874ad2f
Showing 1 changed file with 90 additions and 9 deletions.
99 changes: 90 additions & 9 deletions canonicalwebteam/discourse/parsers/engage.py
@@ -4,6 +4,7 @@
# Packages
import dateutil.parser
import humanize
import logging
from bs4 import BeautifulSoup

# Local
@@ -36,15 +37,16 @@ def parse(self):
raw_index_soup, self.url_prefix, self.index_topic_id, "Metadata"
)

        # Avoid a markdown error breaking the site
try:
# Parse list of topics
self.metadata = self._parse_metadata(raw_index_soup, "Metadata")
self.takeovers = self._parse_metadata(raw_index_soup, "Takeovers")
except IndexError:
self.metadata = []
self.takeovers = []
self.warnings.append("Failed to parse metadata correctly")
engage_metadata, engage_errors = self._parse_engage_metadata(
raw_index_soup, "Metadata"
)
self.metadata = engage_metadata
logging.error(engage_errors)
takeovers_metadata, takeovers_errors = self._parse_engage_metadata(
raw_index_soup, "Takeovers"
)
self.takeovers = takeovers_metadata
logging.error(takeovers_errors)

if index_topic["id"] != self.index_topic_id:
# Get body and navigation HTML
@@ -155,3 +157,82 @@ def _parse_related(self, tags):
"""
index_list = [item for item in self.metadata if item["tags"] in tags]
return index_list

def _parse_engage_metadata(self, index_soup, section_name):
"""
        Given the HTML soup of an index topic,
        extract the metadata from the section designated
        by section_name
This section_name section should contain a table
(extra markup around this table doesn't matter)
e.g.:
<h1>Metadata</h1>
<details>
<summary>Mapping table</summary>
<table>
<tr><th>Column 1</th><th>Column 2</th></tr>
<tr>
<td>data 1</td>
<td>data 2</td>
</tr>
<tr>
<td>data 3</td>
<td>data 4</td>
</tr>
</table>
</details>
This will typically be generated in Discourse from Markdown similar to
the following:
        # Metadata
[details=Mapping table]
| Column 1| Column 2|
| -- | -- |
| data 1 | data 2 |
| data 3 | data 4 |
        The function will return the list of dictionaries below, together
        with a list of error messages for rows containing invalid links:
        [
            {"column_1": "data 1", "column_2": "data 2"},
            {"column_1": "data 3", "column_2": "data 4"},
        ]
"""
metadata_soup = self._get_section(index_soup, section_name)

topics_metadata = []
metadata_errors = []
if metadata_soup:
titles = [
title_soup.text.lower().replace(" ", "_").replace("-", "_")
for title_soup in metadata_soup.select("th")
]
for row in metadata_soup.select("tr:has(td)"):
row_dict = {}
for index, value in enumerate(row.select("td")):
if value.find("a"):

row_dict["topic_name"] = value.find("a").text

# Only engage pages need a link
if value.findAll("a", href=True):
if value.find("a")["href"] == value.find("a").text:
value.contents[0] = value.find("a").text

else:
metadata_errors.append(
f"{row_dict['topic_name']} contains an error"
)
row_dict = None
break

row_dict[titles[index]] = "".join(
str(content) for content in value.contents
)
if row_dict:
topics_metadata.append(row_dict)

return topics_metadata, metadata_errors
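
For illustration, here is a minimal, self-contained sketch (not part of the commit) of the table-to-dictionary conversion described in the docstring above, assuming BeautifulSoup 4 with its bundled soupsieve backend for the tr:has(td) selector; the link validation and error collection are left out:

from bs4 import BeautifulSoup

table_html = """
<h1>Metadata</h1>
<table>
  <tr><th>Topic name</th><th>Path</th></tr>
  <tr><td><a href="/engage/demo">Demo page</a></td><td>/engage/demo</td></tr>
</table>
"""

soup = BeautifulSoup(table_html, "html.parser")

# Header cells become dictionary keys: lower-cased, with spaces and
# hyphens turned into underscores ("Topic name" -> "topic_name").
titles = [
    th.text.lower().replace(" ", "_").replace("-", "_")
    for th in soup.select("th")
]

# Each data row becomes one dictionary keyed by column title; the raw
# inner HTML of every cell is kept, as in _parse_engage_metadata above.
rows = [
    {
        titles[index]: "".join(str(content) for content in cell.contents)
        for index, cell in enumerate(row.select("td"))
    }
    for row in soup.select("tr:has(td)")
]

print(rows)
# [{'topic_name': '<a href="/engage/demo">Demo page</a>',
#   'path': '/engage/demo'}]

In the committed method, the same loop additionally records an error message and skips the row whenever a cell's link is missing its href, and those messages are what parse() logs via logging.error.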
