From 846c40fdd1841829914fb2b7a7f622fd5bff228d Mon Sep 17 00:00:00 2001 From: DoomTay Date: Tue, 19 Dec 2017 21:01:45 -0500 Subject: [PATCH 1/2] Tweaked background URL regex --- wpull/document/css.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wpull/document/css.py b/wpull/document/css.py index 2ee8d118..e81cf60b 100644 --- a/wpull/document/css.py +++ b/wpull/document/css.py @@ -12,7 +12,7 @@ class CSSReader(BaseDocumentDetector, BaseTextStreamReader): '''Cascading Stylesheet Document Reader.''' - URL_PATTERN = r'''url\(\s*(['"]?)(.{1,500}?)(?:\1)\s*\)''' + URL_PATTERN = r'''url\(\s*(['"]?)(.*?)(?:\1)\s*\)''' IMPORT_URL_PATTERN = r'''@import\s*(?:url\()?['"]?([^\s'")]{1,500}).*?;''' URL_REGEX = re.compile(r'{}|{}'.format(URL_PATTERN, IMPORT_URL_PATTERN)) BUFFER_SIZE = 1048576 From fbb91160a3b14917efd2eeb69baf30ca765fb1e5 Mon Sep 17 00:00:00 2001 From: DoomTay Date: Tue, 19 Dec 2017 21:50:24 -0500 Subject: [PATCH 2/2] Appended unit tests --- wpull/scraper/html_test.py | 1 + wpull/testing/samples/many_urls.html | 3 +++ 2 files changed, 4 insertions(+) diff --git a/wpull/scraper/html_test.py b/wpull/scraper/html_test.py index 175a8349..14556522 100644 --- a/wpull/scraper/html_test.py +++ b/wpull/scraper/html_test.py @@ -52,6 +52,7 @@ def test_html_scraper_links(self): 'http://example.com/style_import_quote_url.css', 'http://example.com/style_single_quote_import.css', 'http://example.com/style_double_quote_import.css', + 'http://example.com/bg.png', 'http://example.com/link_href.css', 'http://example.com/script.js', 'http://example.com/body_background.png', diff --git a/wpull/testing/samples/many_urls.html b/wpull/testing/samples/many_urls.html index 08a9a624..db050d0e 100644 --- a/wpull/testing/samples/many_urls.html +++ b/wpull/testing/samples/many_urls.html @@ -9,6 +9,9 @@ @import 'style_single_quote_import.css'; @import "style_double_quote_import.css"; +