From 2656b4e4dc917bb246232b2b1466e0e7af58ee96 Mon Sep 17 00:00:00 2001 From: Konstantin Weddige <konstantin@skathi.net> Date: Fri, 22 Jul 2022 10:06:54 +0200 Subject: [PATCH 1/2] Add language to pdf --- weasyprint/document.py | 5 ++++- weasyprint/html.py | 3 ++- weasyprint/pdf/__init__.py | 2 ++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/weasyprint/document.py b/weasyprint/document.py index 9f7580346..923757d05 100644 --- a/weasyprint/document.py +++ b/weasyprint/document.py @@ -104,7 +104,7 @@ class DocumentMetadata: """ def __init__(self, title=None, authors=None, description=None, keywords=None, generator=None, created=None, modified=None, - attachments=None, custom=None): + attachments=None, lang=None, custom=None): #: The title of the document, as a string or :obj:`None`. #: Extracted from the ``<title>`` element in HTML #: and written to the ``/Title`` info field in PDF. @@ -145,6 +145,9 @@ def __init__(self, title=None, authors=None, description=None, #: Extracted from the ``<link rel=attachment>`` elements in HTML #: and written to the ``/EmbeddedFiles`` dictionary in PDF. self.attachments = attachments or [] + #: Document language as BCP 47 language tags. + #: Extracted from ``<html lang=lang>`` in HTML. + self.lang = lang #: Custom metadata, as a dict whose keys are the metadata names and #: values are the metadata values. self.custom = custom or {} diff --git a/weasyprint/html.py b/weasyprint/html.py index cedf94765..3af11751a 100644 --- a/weasyprint/html.py +++ b/weasyprint/html.py @@ -267,6 +267,7 @@ def get_html_metadata(html): modified = None attachments = [] custom = {} + lang = html.etree_element.attrib.get('lang', None) for element in html.wrapper_element.query_all('title', 'meta', 'link'): element = element.etree_element if element.tag == 'title' and title is None: @@ -305,7 +306,7 @@ def get_html_metadata(html): return dict(title=title, description=description, generator=generator, keywords=keywords, authors=authors, created=created, modified=modified, - attachments=attachments, custom=custom) + attachments=attachments, lang=lang, custom=custom) def strip_whitespace(string): diff --git a/weasyprint/pdf/__init__.py b/weasyprint/pdf/__init__.py index 9e40d1c85..99f25c75e 100644 --- a/weasyprint/pdf/__init__.py +++ b/weasyprint/pdf/__init__.py @@ -417,6 +417,8 @@ def generate_pdf(pages, url_fetcher, metadata, fonts, target, zoom, if metadata.modified: pdf.info['ModDate'] = pydyf.String( _w3c_date_to_pdf(metadata.modified, 'modified')) + if metadata.lang: + pdf.catalog['Lang'] = pydyf.String(metadata.lang) if custom_metadata: for key, value in metadata.custom.items(): key = ''.join(char for char in key if char.isalnum()) From 42db8bdb033bcc111db549cafffefaafdeced398 Mon Sep 17 00:00:00 2001 From: Konstantin Weddige <konstantin@skathi.net> Date: Fri, 22 Jul 2022 12:14:46 +0200 Subject: [PATCH 2/2] Update tests --- tests/test_api.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_api.py b/tests/test_api.py index 83c2ddd5f..65a5d3ab8 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -998,6 +998,7 @@ def assert_meta(html, **meta): meta.setdefault('created', None) meta.setdefault('modified', None) meta.setdefault('attachments', []) + meta.setdefault('lang', None) meta.setdefault('custom', {}) assert vars(FakeHTML(string=html).render().metadata) == meta @@ -1011,6 +1012,7 @@ def test_html_meta_1(): def test_html_meta_2(): assert_meta( ''' + <html lang="en"><head> <meta name=author content="I Me & Myself"> <meta name=author content="Smith, John"> <title>Test document</title> @@ -1027,6 +1029,7 @@ def test_html_meta_2(): <meta name=dcterms.modified content=2013> <meta name=keywords content="Python; pydyf"> <meta name=description content="Blah… "> + </head></html> ''', authors=['I Me & Myself', 'Smith, John'], title='Test document', @@ -1035,6 +1038,7 @@ def test_html_meta_2(): description="Blah… ", created='2011-04', modified='2013', + lang='en', custom={'dummy': 'ignored'})