Skip to content

Commit

Permalink
Check missing links for engage pages
Browse files Browse the repository at this point in the history
  • Loading branch information
carkod committed Jun 22, 2021
1 parent ce59b55 commit dc5de23
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 19 deletions.
60 changes: 52 additions & 8 deletions canonicalwebteam/discourse/parsers/base_parser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Standard library
import os
import re
import flask
from urllib.parse import urlparse, urlunparse

# Packages
Expand All @@ -21,6 +22,18 @@
)


class ParsingError(Exception):
    """Base class for the errors this parser module reports while parsing."""


class MissingLinkError(ParsingError):
    """A metadata row cell contains no anchor with an ``href``.

    ``_parse_metadata`` builds this error, reports it to Sentry, stores it
    in ``metadata_errors`` and drops the affected row.
    """


class MissingTitleError(ParsingError):
    """A metadata row has an empty ``path`` or ``type`` value.

    ``_parse_metadata`` builds this error, reports it to Sentry and stores
    it in ``metadata_errors``; the row itself is still kept.
    """


class BaseParser:
"""
Parsers used commonly by Tutorials and Engage pages
Expand All @@ -35,6 +48,7 @@ def __init__(self, api, index_topic_id, url_prefix):
self.warnings = []
self.url_map = {}
self.redirect_map = {}
self.metadata_errors = []

def parse_topic(self, topic):
"""
Expand Down Expand Up @@ -286,18 +300,48 @@ def _parse_metadata(self, index_soup, section_name):
if value.find("a"):
row_dict["topic_name"] = value.find("a").text

# Beautiful soup renders URLs as anchors
# Avoid that default behaviour
if value.find("a") and (
value.find("a")["href"] == value.find("a").text
):
value.contents[0] = value.find("a").text
# Only engage pages need a link
if value.findAll("a", href=True):
if value.find("a")["href"] == value.find("a").text:
value.contents[0] = value.find("a").text

else:
error = MissingLinkError(
f"Warning: Link not found when parsing row {index} \"{row_dict['topic_name']}\"\
{titles[index]}. \
This Engage page has been skipped."
)
flask.current_app.extensions[
"sentry"
].captureMessage(error)
self.metadata_errors.append(error)
row_dict = None
break

# Missing path will cause the engage item in index to not
# link to the corresponding page
# Missing type will cause resource_name to be empty in
# thank-you pages
# This error does not need breaking, because it does not
# break the page
if (
(titles[index] == "path") or (titles[index] == "type")
) and ((value.text == "") or (value.text is None)):
error = MissingTitleError(
f"Warning: row {index} \"{row_dict['topic_name']}\"\
{titles[index]} is missing. This Engage page has \
been skipped."
)
flask.current_app.extensions["sentry"].captureMessage(
error
)
self.metadata_errors.append(error)

row_dict[titles[index]] = "".join(
str(content) for content in value.contents
)

topics_metadata.append(row_dict)
if row_dict:
topics_metadata.append(row_dict)

return topics_metadata

Expand Down
12 changes: 2 additions & 10 deletions canonicalwebteam/discourse/parsers/engage.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,15 +36,8 @@ def parse(self):
raw_index_soup, self.url_prefix, self.index_topic_id, "Metadata"
)

# Avoid markdown error to break site
try:
# Parse list of topics
self.metadata = self._parse_metadata(raw_index_soup, "Metadata")
self.takeovers = self._parse_metadata(raw_index_soup, "Takeovers")
except IndexError:
self.metadata = []
self.takeovers = []
self.warnings.append("Failed to parse metadata correctly")
self.metadata = self._parse_metadata(raw_index_soup, "Metadata")
self.takeovers = self._parse_metadata(raw_index_soup, "Takeovers")

if index_topic["id"] != self.index_topic_id:
# Get body and navigation HTML
Expand Down Expand Up @@ -120,7 +113,6 @@ def parse_topic(self, topic):
),
"related": current_topic_related,
"topic_path": topic_path,
"errors": warnings,
}

def resolve_path(self, relative_path):
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

setup(
name="canonicalwebteam.discourse",
version="4.0.3",
version="4.0.4",
author="Canonical webteam",
author_email="[email protected]",
url="https://github.com/canonical-webteam/canonicalwebteam.docs",
Expand Down

0 comments on commit dc5de23

Please sign in to comment.