Skip to content

Commit

Permalink
Refactor HTML page filtering
Browse files Browse the repository at this point in the history
1. As mentioned in review comments, `test_page_should_be_valid_html` no
longer needs to test whether a page should be validated, since its input
 is now guaranteed to be filtered.

2. `should_validate()` now performs both filtering checks: a page is not
validated if it is blacklisted, and it is not validated if its content
type is not HTML.

3. With the above changes, `crawl_only_html()` can now be refactored to
   a one-liner.
  • Loading branch information
lunkwill42 committed May 26, 2023
1 parent 77d6978 commit 40ed7e3
Showing 1 changed file with 6 additions and 11 deletions.
17 changes: 6 additions & 11 deletions tests/integration/web/crawler_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,11 +117,7 @@ def crawl_only_html(self):
"""Only yields crawled pages that have a content-type of html and are not
blacklisted.
"""
for page in self.crawl():
if not page.content_type or 'html' not in page.content_type.lower():
continue
if should_validate(page.url):
yield page
yield from filter(should_validate, self.crawl())

def _visit_with_error_handling(self, url):
try:
Expand Down Expand Up @@ -270,10 +266,6 @@ def _content_as_string(content):
def test_page_should_be_valid_html(page):
if page.response != 200:
pytest.skip("not validating non-reachable page")
if not page.content_type or 'html' not in page.content_type.lower():
pytest.skip("not attempting to validate non-html page")
if not should_validate(page.url):
pytest.skip("skip validation of blacklisted page")
if not page.content:
pytest.skip("page has no content")

Expand All @@ -283,8 +275,11 @@ def test_page_should_be_valid_html(page):
assert not errors, "Found following validation errors:\n" + errors


def should_validate(url):
path = normalize_path(url)
def should_validate(page: Page):
"""Returns True if page is eligible for HTML validation, False if not"""
if not page.content_type or 'html' not in page.content_type.lower():
return False
path = normalize_path(page.url)
for blacklisted_path in TIDY_BLACKLIST:
if path.startswith(blacklisted_path):
return False
Expand Down

0 comments on commit 40ed7e3

Please sign in to comment.