Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Stop making validation tests for non-HTML content #2623

Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 12 additions & 7 deletions tests/integration/web/crawler_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,12 @@ def crawl(self):
if page:
yield page

def crawl_only_html(self):
    """Yield only the crawled pages that are eligible for HTML validation.

    Each page produced by ``self.crawl()`` is checked with
    ``should_validate``; pages it rejects (no HTML content type, or a
    blacklisted path) are dropped.
    """
    for page in self.crawl():
        if should_validate(page):
            yield page

def _visit_with_error_handling(self, url):
try:
page = self._visit(url)
Expand Down Expand Up @@ -256,14 +262,10 @@ def _content_as_string(content):
"(ADMINUSERNAME, ADMINPASSWORD) , skipping crawler "
"tests!",
)
@pytest.mark.parametrize("page", crawler.crawl(), ids=page_id)
@pytest.mark.parametrize("page", crawler.crawl_only_html(), ids=page_id)
def test_page_should_be_valid_html(page):
if page.response != 200:
pytest.skip("not validating non-reachable page")
if not page.content_type or 'html' not in page.content_type.lower():
pytest.skip("not attempting to validate non-html page")
if not should_validate(page.url):
pytest.skip("skip validation of blacklisted page")
if not page.content:
pytest.skip("page has no content")

Expand All @@ -273,8 +275,11 @@ def test_page_should_be_valid_html(page):
assert not errors, "Found following validation errors:\n" + errors


def should_validate(url):
path = normalize_path(url)
def should_validate(page: Page):
"""Returns True if page is eligible for HTML validation, False if not"""
if not page.content_type or 'html' not in page.content_type.lower():
return False
path = normalize_path(page.url)
for blacklisted_path in TIDY_BLACKLIST:
if path.startswith(blacklisted_path):
return False
Expand Down