From 77d697810467a05656cb9de9ddd51bed1e40ca3d Mon Sep 17 00:00:00 2001
From: Morten Brekkevold
Date: Wed, 24 May 2023 20:39:37 +0200
Subject: [PATCH 1/2] Stop making validation tests for non-HTML content

The web crawler would generate HTML validation tests for all reachable
pages and mark them as *skipped* if the page is blacklisted or the
retrieved page has a content-type that doesn't appear to be
HTML-related.

Skipped tests should be an indicator of some unusual situation or
missing condition that prevents the test from running.  However,
non-HTML pages can never be validated as such, and the *normal*
situation is that these tests are useless.

Since a lot of the crawled content isn't actually HTML, a large number
of SKIP results is produced in test reports.  These results only serve
to hide the results of tests that were skipped for *factually*
unexpected or extraordinary reasons.

This change will filter crawled pages based on blacklist status and
content-type before the result is used to parametrize the
`test_page_should_be_valid_html` test.
---
 tests/integration/web/crawler_test.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/tests/integration/web/crawler_test.py b/tests/integration/web/crawler_test.py
index 3373a451f2..4c5da24a25 100644
--- a/tests/integration/web/crawler_test.py
+++ b/tests/integration/web/crawler_test.py
@@ -113,6 +113,16 @@ def crawl(self):
             if page:
                 yield page
 
+    def crawl_only_html(self):
+        """Only yields crawled pages that have a content-type of html and is not
+        blacklisted.
+        """
+        for page in self.crawl():
+            if not page.content_type or 'html' not in page.content_type.lower():
+                continue
+            if should_validate(page.url):
+                yield page
+
     def _visit_with_error_handling(self, url):
         try:
             page = self._visit(url)
@@ -256,7 +266,7 @@ def _content_as_string(content):
     "(ADMINUSERNAME, ADMINPASSWORD) , skipping crawler "
     "tests!",
 )
-@pytest.mark.parametrize("page", crawler.crawl(), ids=page_id)
+@pytest.mark.parametrize("page", crawler.crawl_only_html(), ids=page_id)
 def test_page_should_be_valid_html(page):
     if page.response != 200:
         pytest.skip("not validating non-reachable page")

From 40ed7e3b406e535e64857c1a485a177487791990 Mon Sep 17 00:00:00 2001
From: Morten Brekkevold
Date: Fri, 26 May 2023 14:38:33 +0200
Subject: [PATCH 2/2] Refactor HTML page filtering

1. As mentioned in review comments, `test_page_should_be_valid_html` no
   longer needs to test whether a page should be validated, since its
   input is now guaranteed to be filtered.

2. `should_validate()` now performs both filtering checks: a
   blacklisted page should not be validated, and a non-HTML page should
   not be validated.

3. With the above changes, `crawl_only_html()` can now be refactored to
   a one-liner.
---
 tests/integration/web/crawler_test.py | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/tests/integration/web/crawler_test.py b/tests/integration/web/crawler_test.py
index 4c5da24a25..13cc39e830 100644
--- a/tests/integration/web/crawler_test.py
+++ b/tests/integration/web/crawler_test.py
@@ -117,11 +117,7 @@ def crawl_only_html(self):
         """Only yields crawled pages that have a content-type of html and is not
         blacklisted.
         """
-        for page in self.crawl():
-            if not page.content_type or 'html' not in page.content_type.lower():
-                continue
-            if should_validate(page.url):
-                yield page
+        yield from filter(should_validate, self.crawl())
 
     def _visit_with_error_handling(self, url):
         try:
             page = self._visit(url)
@@ -270,10 +266,6 @@ def _content_as_string(content):
 def test_page_should_be_valid_html(page):
     if page.response != 200:
         pytest.skip("not validating non-reachable page")
-    if not page.content_type or 'html' not in page.content_type.lower():
-        pytest.skip("not attempting to validate non-html page")
-    if not should_validate(page.url):
-        pytest.skip("skip validation of blacklisted page")
     if not page.content:
         pytest.skip("page has no content")
 
@@ -283,8 +275,11 @@ def _content_as_string(content):
     assert not errors, "Found following validation errors:\n" + errors
 
 
-def should_validate(url):
-    path = normalize_path(url)
+def should_validate(page: Page):
+    """Returns True if page is eligible for HTML validation, False if not"""
+    if not page.content_type or 'html' not in page.content_type.lower():
+        return False
+    path = normalize_path(page.url)
     for blacklisted_path in TIDY_BLACKLIST:
         if path.startswith(blacklisted_path):
             return False