From 77d697810467a05656cb9de9ddd51bed1e40ca3d Mon Sep 17 00:00:00 2001
From: Morten Brekkevold
Date: Wed, 24 May 2023 20:39:37 +0200
Subject: [PATCH 1/2] Stop making validation tests for non-HTML content

The web crawler would generate HTML validation tests for all reachable
pages and mark them as *skipped* if the page is blacklisted or the
retrieved page has a content-type that doesn't appear to be
HTML-related.

Skipped tests should be an indicator of some unusual situation or
missing condition that prevents the test from running.  However,
non-HTML pages can never be validated as such, and the *normal*
situation is that these tests are useless.

Since a lot of the crawled content isn't actually HTML, a large number
of SKIP results is produced in test reports.  These results only serve
to hide the results of tests that were skipped for *factually*
unexpected or extraordinary reasons.

This change will filter crawled pages based on blacklist status and
content-type before the result is used to parametrize the
`test_page_should_be_valid_html` test.
---
 tests/integration/web/crawler_test.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/tests/integration/web/crawler_test.py b/tests/integration/web/crawler_test.py
index 3373a451f2..4c5da24a25 100644
--- a/tests/integration/web/crawler_test.py
+++ b/tests/integration/web/crawler_test.py
@@ -113,6 +113,16 @@ def crawl(self):
             if page:
                 yield page
 
+    def crawl_only_html(self):
+        """Only yields crawled pages that have a content-type of html and is not
+        blacklisted.
+        """
+        for page in self.crawl():
+            if not page.content_type or 'html' not in page.content_type.lower():
+                continue
+            if should_validate(page.url):
+                yield page
+
     def _visit_with_error_handling(self, url):
         try:
             page = self._visit(url)
@@ -256,7 +266,7 @@ def _content_as_string(content):
     "(ADMINUSERNAME, ADMINPASSWORD) , skipping crawler "
     "tests!",
 )
-@pytest.mark.parametrize("page", crawler.crawl(), ids=page_id)
+@pytest.mark.parametrize("page", crawler.crawl_only_html(), ids=page_id)
 def test_page_should_be_valid_html(page):
     if page.response != 200:
         pytest.skip("not validating non-reachable page")

From 40ed7e3b406e535e64857c1a485a177487791990 Mon Sep 17 00:00:00 2001
From: Morten Brekkevold
Date: Fri, 26 May 2023 14:38:33 +0200
Subject: [PATCH 2/2] Refactor HTML page filtering

1. As mentioned in review comments, `test_page_should_be_valid_html` no
   longer needs to test whether a page should be validated, since its
   input is now guaranteed to be filtered.

2. `should_validate()` now performs both filtering checks: a
   blacklisted page should not be validated, and a non-HTML page should
   not be validated.

3. With the above changes, `crawl_only_html()` can now be refactored to
   a one-liner.
---
 tests/integration/web/crawler_test.py | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/tests/integration/web/crawler_test.py b/tests/integration/web/crawler_test.py
index 4c5da24a25..13cc39e830 100644
--- a/tests/integration/web/crawler_test.py
+++ b/tests/integration/web/crawler_test.py
@@ -117,11 +117,7 @@ def crawl_only_html(self):
         """Only yields crawled pages that have a content-type of html and is not
         blacklisted.
         """
-        for page in self.crawl():
-            if not page.content_type or 'html' not in page.content_type.lower():
-                continue
-            if should_validate(page.url):
-                yield page
+        yield from filter(should_validate, self.crawl())
 
     def _visit_with_error_handling(self, url):
         try:
             page = self._visit(url)
@@ -270,10 +266,6 @@ def _content_as_string(content):
 def test_page_should_be_valid_html(page):
     if page.response != 200:
         pytest.skip("not validating non-reachable page")
-    if not page.content_type or 'html' not in page.content_type.lower():
-        pytest.skip("not attempting to validate non-html page")
-    if not should_validate(page.url):
-        pytest.skip("skip validation of blacklisted page")
     if not page.content:
         pytest.skip("page has no content")
 
@@ -283,8 +275,11 @@ def _content_as_string(content):
     assert not errors, "Found following validation errors:\n" + errors
 
 
-def should_validate(url):
-    path = normalize_path(url)
+def should_validate(page: Page):
+    """Returns True if page is eligible for HTML validation, False if not"""
+    if not page.content_type or 'html' not in page.content_type.lower():
+        return False
+    path = normalize_path(page.url)
     for blacklisted_path in TIDY_BLACKLIST:
         if path.startswith(blacklisted_path):
             return False