From a9a24f754f136e786d756a047514b9ceb3b948c7 Mon Sep 17 00:00:00 2001 From: Lucie Anglade Date: Thu, 10 Sep 2020 00:51:03 +0200 Subject: [PATCH 1/8] Use download attribute in to define filename for attachment --- weasyprint/document.py | 16 ++++++--- weasyprint/html.py | 1 + weasyprint/pdf.py | 12 ++++--- weasyprint/tests/test_api.py | 62 +++++++++++++++++++---------------- weasyprint/tools/navigator.py | 2 +- 5 files changed, 53 insertions(+), 40 deletions(-) diff --git a/weasyprint/document.py b/weasyprint/document.py index d6ff46da2..2ee90d994 100644 --- a/weasyprint/document.py +++ b/weasyprint/document.py @@ -120,6 +120,9 @@ def _gather_links_and_bookmarks(box, bookmarks, links, anchors, matrix): # In case of duplicate IDs, only the first is an anchor. has_anchor = anchor_name and anchor_name not in anchors is_attachment = hasattr(box, 'is_attachment') and box.is_attachment + download_name = ( + box.attachment_download + if hasattr(box, 'attachment_download') else None) if has_bookmark or has_link or has_anchor: pos_x, pos_y, width, height = box.hit_area() @@ -133,9 +136,11 @@ def _gather_links_and_bookmarks(box, bookmarks, links, anchors, matrix): if matrix: link = ( link_type, target, rectangle_aabb( - matrix, pos_x, pos_y, width, height)) + matrix, pos_x, pos_y, width, height), download_name) else: - link = (link_type, target, (pos_x, pos_y, width, height)) + link = ( + link_type, target, (pos_x, pos_y, width, height), + download_name) links.append(link) if matrix and (has_bookmark or has_anchor): pos_x, pos_y = matrix.transform_point(pos_x, pos_y) @@ -487,14 +492,15 @@ def resolve_links(self): for page in self.pages: page_links = [] for link in page.links: - link_type, anchor_name, rectangle = link + link_type, anchor_name, rectangle, _ = link if link_type == 'internal': if anchor_name not in anchors: LOGGER.error( 'No anchor #%s for internal URI reference', anchor_name) else: - page_links.append((link_type, anchor_name, rectangle)) + page_links.append( + (link_type, anchor_name, rectangle, None)) else: # External link page_links.append(link) @@ -565,7 +571,7 @@ def add_hyperlinks(self, links, anchors, context, scale): # defined by cairo when drawing targets. This would give a feeling # similiar to what browsers do with links that span multiple lines. for link in links: - link_type, link_target, rectangle = link + link_type, link_target, rectangle, _ = link if link_type == 'external': attributes = "rect=[{} {} {} {}] uri='{}'".format(*( [int(round(i * scale)) for i in rectangle] + diff --git a/weasyprint/html.py b/weasyprint/html.py index 5959a98fc..f1c5169fd 100644 --- a/weasyprint/html.py +++ b/weasyprint/html.py @@ -249,6 +249,7 @@ def handle_td(element, box, _get_image_from_uri, _base_url): def handle_a(element, box, _get_image_from_uri, base_url): """Handle the ``rel`` attribute.""" box.is_attachment = element_has_link_type(element, 'attachment') + box.attachment_download = element.get('download') return [box] diff --git a/weasyprint/pdf.py b/weasyprint/pdf.py index 336aa5228..d740b3f57 100644 --- a/weasyprint/pdf.py +++ b/weasyprint/pdf.py @@ -438,7 +438,7 @@ def _write_pdf_embedded_files(pdf, attachments, url_fetcher): return pdf.write_new_object(b''.join(content)) -def _write_pdf_attachment(pdf, attachment, url_fetcher): +def _write_pdf_attachment(pdf, attachment, url_fetcher, download_name=None): """Write an attachment to the PDF stream. :return: @@ -466,7 +466,9 @@ def _write_pdf_attachment(pdf, attachment, url_fetcher): # TODO: Use the result object from a URL fetch operation to provide more # details on the possible filename - filename = _get_filename_from_result(url, None) + filename = ( + download_name + if download_name else _get_filename_from_result(url, None)) return pdf.write_new_object(pdf_format( '<< /Type /Filespec /F () /UF {0!P} /EF << /F {1} 0 R >> ' @@ -509,11 +511,11 @@ def write_pdf_metadata(fileobj, scale, url_fetcher, attachments, # because two links might have the same href, but different titles. annot_files = {} for page_links in attachment_links: - for link_type, target, rectangle in page_links: + for link_type, target, rectangle, download_name in page_links: if link_type == 'attachment' and target not in annot_files: # TODO: use the title attribute as description annot_files[target] = _write_pdf_attachment( - pdf, (target, None), url_fetcher) + pdf, (target, None), url_fetcher, download_name) for pdf_page, document_page, page_links in zip( pdf.pages, pages, attachment_links): @@ -555,7 +557,7 @@ def write_pdf_metadata(fileobj, scale, url_fetcher, attachments, # would give a feeling similiar to what browsers do with links that # span multiple lines. annotations = [] - for link_type, target, rectangle in page_links: + for link_type, target, rectangle, _ in page_links: if link_type == 'attachment' and annot_files[target] is not None: matrix = cairo.Matrix( xx=scale, yy=-scale, y0=document_page.height * scale) diff --git a/weasyprint/tests/test_api.py b/weasyprint/tests/test_api.py index 4669802db..06e42749e 100644 --- a/weasyprint/tests/test_api.py +++ b/weasyprint/tests/test_api.py @@ -121,10 +121,10 @@ def _round_meta(pages): anchors[anchor_name] = round(pos_x, 6), round(pos_y, 6) links = page.links for i, link in enumerate(links): - link_type, target, (pos_x, pos_y, width, height) = link + link_type, target, (pos_x, pos_y, width, height), dl_name = link link = ( link_type, target, (round(pos_x, 6), round(pos_y, 6), - round(width, 6), round(height, 6))) + round(width, 6), round(height, 6)),dl_name) links[i] = link bookmarks = page.bookmarks for i, (level, label, (pos_x, pos_y), state) in enumerate(bookmarks): @@ -720,28 +720,28 @@ def assert_links(html, expected_links_by_page, expected_anchors_by_page,

''', [ [ - ('external', 'http://weasyprint.org', (0, 0, 30, 20)), - ('external', 'http://weasyprint.org', (0, 0, 30, 30)), - ('internal', 'lipsum', (10, 100, 32, 20)), - ('internal', 'lipsum', (10, 100, 32, 32)) + ('external', 'http://weasyprint.org', (0, 0, 30, 20), None), + ('external', 'http://weasyprint.org', (0, 0, 30, 30), None), + ('internal', 'lipsum', (10, 100, 32, 20), None), + ('internal', 'lipsum', (10, 100, 32, 32), None) ], - [('internal', 'hello', (0, 0, 200, 30))], + [('internal', 'hello', (0, 0, 200, 30), None)], ], [ {'hello': (0, 200)}, {'lipsum': (0, 0)} ], [ ( [ - ('external', 'http://weasyprint.org', (0, 0, 30, 20)), - ('external', 'http://weasyprint.org', (0, 0, 30, 30)), - ('internal', 'lipsum', (10, 100, 32, 20)), - ('internal', 'lipsum', (10, 100, 32, 32)) + ('external', 'http://weasyprint.org', (0, 0, 30, 20), None), + ('external', 'http://weasyprint.org', (0, 0, 30, 30), None), + ('internal', 'lipsum', (10, 100, 32, 20), None), + ('internal', 'lipsum', (10, 100, 32, 32), None) ], [('hello', 0, 200)], ), ( [ - ('internal', 'hello', (0, 0, 200, 30)) + ('internal', 'hello', (0, 0, 200, 30), None) ], [('lipsum', 0, 0)]), ]) @@ -751,9 +751,9 @@ def assert_links(html, expected_links_by_page, expected_anchors_by_page,
''', [[('external', 'http://weasyprint.org/foo/lipsum/%C3%A9_%E9', - (5, 10, 190, 0))]], + (5, 10, 190, 0), None)]], [{}], [([('external', 'http://weasyprint.org/foo/lipsum/%C3%A9_%E9', - (5, 10, 190, 0))], [])], + (5, 10, 190, 0), None)], [])], base_url='http://weasyprint.org/foo/bar/') assert_links( ''' @@ -761,9 +761,9 @@ def assert_links(html, expected_links_by_page, expected_anchors_by_page,
''', [[('external', 'http://weasyprint.org/foo/lipsum/%C3%A9_%E9', - (5, 10, 190, 0))]], + (5, 10, 190, 0), None)]], [{}], [([('external', 'http://weasyprint.org/foo/lipsum/%C3%A9_%E9', - (5, 10, 190, 0))], [])], + (5, 10, 190, 0), None)], [])], base_url='http://weasyprint.org/foo/bar/') # Relative URI reference without a base URI: allowed for links @@ -771,8 +771,9 @@ def assert_links(html, expected_links_by_page, expected_anchors_by_page, ''' - ''', [[('external', '../lipsum', (5, 10, 190, 0))]], [{}], - [([('external', '../lipsum', (5, 10, 190, 0))], [])], base_url=None) + ''', [[('external', '../lipsum', (5, 10, 190, 0), None)]], [{}], + [([('external', '../lipsum', (5, 10, 190, 0), None)], [])], + base_url=None) # Relative URI reference without a base URI: not supported for -weasy-link assert_links( @@ -791,11 +792,11 @@ def assert_links(html, expected_links_by_page, expected_anchors_by_page, - ''', [[('internal', 'lipsum', (5, 10, 190, 0)), - ('external', 'http://weasyprint.org/', (0, 10, 200, 0))]], + ''', [[('internal', 'lipsum', (5, 10, 190, 0), None), + ('external', 'http://weasyprint.org/', (0, 10, 200, 0), None)]], [{'lipsum': (5, 10)}], - [([('internal', 'lipsum', (5, 10, 190, 0)), - ('external', 'http://weasyprint.org/', (0, 10, 200, 0))], + [([('internal', 'lipsum', (5, 10, 190, 0), None), + ('external', 'http://weasyprint.org/', (0, 10, 200, 0), None)], [('lipsum', 5, 10)])], base_url=None) @@ -805,9 +806,10 @@ def assert_links(html, expected_links_by_page, expected_anchors_by_page,
''', - [[('internal', 'lipsum', (5, 10, 190, 0))]], + [[('internal', 'lipsum', (5, 10, 190, 0), None)]], [{'lipsum': (5, 10)}], - [([('internal', 'lipsum', (5, 10, 190, 0))], [('lipsum', 5, 10)])], + [([('internal', 'lipsum', (5, 10, 190, 0), None)], + [('lipsum', 5, 10)])], base_url=None) assert_links( @@ -817,10 +819,11 @@ def assert_links(html, expected_links_by_page, expected_anchors_by_page, ''', - [[('internal', 'lipsum', (0, 0, 200, 15)), - ('internal', 'missing', (0, 15, 200, 15))]], + [[('internal', 'lipsum', (0, 0, 200, 15), None), + ('internal', 'missing', (0, 15, 200, 15), None)]], [{'lipsum': (0, 15)}], - [([('internal', 'lipsum', (0, 0, 200, 15))], [('lipsum', 0, 15)])], + [([('internal', 'lipsum', (0, 0, 200, 15), None)], + [('lipsum', 0, 15)])], base_url=None, warnings=[ 'ERROR: No anchor #missing for internal URI reference']) @@ -831,9 +834,10 @@ def assert_links(html, expected_links_by_page, expected_anchors_by_page, ''', - [[('internal', 'lipsum', (30, 10, 40, 200))]], + [[('internal', 'lipsum', (30, 10, 40, 200), None)]], [{'lipsum': (70, 10)}], - [([('internal', 'lipsum', (30, 10, 40, 200))], [('lipsum', 70, 10)])], + [([('internal', 'lipsum', (30, 10, 40, 200), None)], + [('lipsum', 70, 10)])], round=True) diff --git a/weasyprint/tools/navigator.py b/weasyprint/tools/navigator.py index 3f5e339a6..f870864f2 100644 --- a/weasyprint/tools/navigator.py +++ b/weasyprint/tools/navigator.py @@ -65,7 +65,7 @@ def render_template(url): for width, height, data_url, links, anchors in get_pages(html): write('
\n' ' \n'.format(width, height, data_url)) - for link_type, target, (pos_x, pos_y, width, height) in links: + for link_type, target, (pos_x, pos_y, width, height), _ in links: href = ('#' + target if link_type == 'internal' else '/view/' + target) write(' @@ -841,6 +843,18 @@ def assert_links(html, expected_links_by_page, expected_anchors_by_page, [('lipsum', 70, 10)])], round=True) + # Download for attachment + assert_links( + ''' + + + ''', [[('attachment', 'pattern.png', + (5, 10, 190, 0), 'wow.png')]], + [{}], [([('attachment', 'pattern.png', + (5, 10, 190, 0), 'wow.png')], [])], + base_url=None) + # Make relative URL references work with our custom URL scheme. uses_relative.append('weasyprint-custom') From c5a06dbeaed64650ee8b632f17535864bf14ff45 Mon Sep 17 00:00:00 2001 From: Lucie Anglade Date: Thu, 10 Sep 2020 11:58:19 +0200 Subject: [PATCH 6/8] Clean code --- weasyprint/document.py | 6 ++---- weasyprint/pdf.py | 39 +++------------------------------------ 2 files changed, 5 insertions(+), 40 deletions(-) diff --git a/weasyprint/document.py b/weasyprint/document.py index 2ee90d994..ffa6d83ec 100644 --- a/weasyprint/document.py +++ b/weasyprint/document.py @@ -119,10 +119,8 @@ def _gather_links_and_bookmarks(box, bookmarks, links, anchors, matrix): has_link = link and not isinstance(box, boxes.TextBox) # In case of duplicate IDs, only the first is an anchor. has_anchor = anchor_name and anchor_name not in anchors - is_attachment = hasattr(box, 'is_attachment') and box.is_attachment - download_name = ( - box.attachment_download - if hasattr(box, 'attachment_download') else None) + is_attachment = getattr(box, 'is_attachment', box.is_attachment) + download_name = getattr(box, 'attachment_download', None) if has_bookmark or has_link or has_anchor: pos_x, pos_y, width, height = box.hit_area() diff --git a/weasyprint/pdf.py b/weasyprint/pdf.py index d740b3f57..c0fbbed74 100644 --- a/weasyprint/pdf.py +++ b/weasyprint/pdf.py @@ -361,7 +361,7 @@ def _write_compressed_file_object(pdf, file): return object_number -def _get_filename_from_result(url, result): +def _get_filename_from_url(url): """Derive a filename from a fetched resource. This is either the filename returned by the URL fetcher, the last URL path @@ -371,12 +371,6 @@ def _get_filename_from_result(url, result): filename = None - # A given filename will always take precedence - if result: - filename = result.get('filename') - if filename: - return filename - # The URL path likely contains a filename, which is a good second guess if url: split = urlsplit(url) @@ -385,32 +379,7 @@ def _get_filename_from_result(url, result): if filename == '': filename = None - if filename is None: - # The URL lacks a path altogether. Use a synthetic name. - - # Using guess_extension is a great idea, but sadly the extension is - # probably random, depending on the alignment of the stars, which car - # you're driving and which software has been installed on your machine. - # - # Unfortuneatly this isn't even imdepodent on one machine, because the - # extension can depend on PYTHONHASHSEED if mimetypes has multiple - # extensions to offer - extension = None - if result: - mime_type = result.get('mime_type') - if mime_type == 'text/plain': - # text/plain has a phletora of extensions - all garbage - extension = '.txt' - else: - extension = mimetypes.guess_extension(mime_type) or '.bin' - else: - extension = '.bin' - - filename = 'attachment' + extension - else: - filename = unquote(filename) - - return filename + return 'attachment.bin' if filename is None else unquote(filename) def _write_pdf_embedded_files(pdf, attachments, url_fetcher): @@ -466,9 +435,7 @@ def _write_pdf_attachment(pdf, attachment, url_fetcher, download_name=None): # TODO: Use the result object from a URL fetch operation to provide more # details on the possible filename - filename = ( - download_name - if download_name else _get_filename_from_result(url, None)) + filename = download_name or _get_filename_from_url(url) return pdf.write_new_object(pdf_format( '<< /Type /Filespec /F () /UF {0!P} /EF << /F {1} 0 R >> ' From 325c9cb7b4c63c35058c53a401992d06f45c82b3 Mon Sep 17 00:00:00 2001 From: Lucie Anglade Date: Thu, 10 Sep 2020 12:04:13 +0200 Subject: [PATCH 7/8] Fix test --- weasyprint/document.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/weasyprint/document.py b/weasyprint/document.py index ffa6d83ec..08215e66f 100644 --- a/weasyprint/document.py +++ b/weasyprint/document.py @@ -119,7 +119,7 @@ def _gather_links_and_bookmarks(box, bookmarks, links, anchors, matrix): has_link = link and not isinstance(box, boxes.TextBox) # In case of duplicate IDs, only the first is an anchor. has_anchor = anchor_name and anchor_name not in anchors - is_attachment = getattr(box, 'is_attachment', box.is_attachment) + is_attachment = getattr(box, 'is_attachment', False) download_name = getattr(box, 'attachment_download', None) if has_bookmark or has_link or has_anchor: From a9822fbb83830f8aaa6dfca070780be6a0d6e683 Mon Sep 17 00:00:00 2001 From: Lucie Anglade Date: Thu, 10 Sep 2020 12:07:10 +0200 Subject: [PATCH 8/8] Remove useless import --- weasyprint/pdf.py | 1 - 1 file changed, 1 deletion(-) diff --git a/weasyprint/pdf.py b/weasyprint/pdf.py index c0fbbed74..622c77eeb 100644 --- a/weasyprint/pdf.py +++ b/weasyprint/pdf.py @@ -26,7 +26,6 @@ import hashlib import io -import mimetypes import os import re import string