Put PDF fonts management in a separate module

Kozea · Jun 26, 2022 · 5486875 · 5486875
1 parent 70f9b62
commit 5486875
Show file tree

Hide file tree

Showing 2 changed files with 300 additions and 298 deletions.
diff --git a/weasyprint/pdf/__init__.py b/weasyprint/pdf/__init__.py
@@ -2,7 +2,6 @@
 
 import hashlib
 import io
-import math
 import zlib
 from os.path import basename
 from urllib.parse import unquote, urlsplit
@@ -16,6 +15,7 @@
 from ..matrix import Matrix
 from ..urls import URLFetchingError
 from . import pdfa
+from .fonts import build_fonts_dictionary
 from .stream import Stream
 
 VARIANTS = {
@@ -149,8 +149,7 @@ def _use_references(pdf, resources, images):
         # Resources
         if 'Resources' in x_object.extra:
             x_object.extra['Resources'] = _reference_resources(
-                pdf, x_object.extra['Resources'], images,
-                resources['Font'])
+                pdf, x_object.extra['Resources'], images, resources['Font'])
 
     # Patterns
     for key, pattern in resources.get('Pattern', {}).items():
@@ -209,8 +208,7 @@ def _create_bookmarks(bookmarks, pdf, parent=None):
     outlines = []
     for title, (page, x, y), children, state in bookmarks:
         destination = pydyf.Array((
-            pdf.objects[pdf.pages['Kids'][page * 3]].reference,
-            '/XYZ', x, y, 0))
+            pdf.objects[pdf.pages['Kids'][page*3]].reference, '/XYZ', x, y, 0))
         outline = pydyf.Dictionary({
             'Title': pydyf.String(title), 'Dest': destination})
         pdf.add_object(outline)
@@ -410,8 +408,7 @@ def generate_pdf(pages, url_fetcher, metadata, fonts, target, zoom,
     if metadata.description:
         pdf.info['Subject'] = pydyf.String(metadata.description)
     if metadata.keywords:
-        pdf.info['Keywords'] = pydyf.String(
-            ', '.join(metadata.keywords))
+        pdf.info['Keywords'] = pydyf.String(', '.join(metadata.keywords))
     if metadata.generator:
         pdf.info['Creator'] = pydyf.String(metadata.generator)
     if metadata.created:
@@ -444,297 +441,8 @@ def generate_pdf(pages, url_fetcher, metadata, fonts, target, zoom,
             pdf.catalog['Names'] = pydyf.Dictionary()
         pdf.catalog['Names']['EmbeddedFiles'] = content.reference
 
-    # Embeded fonts
-    pdf_fonts = pydyf.Dictionary()
-    fonts_by_file_hash = {}
-    for font in fonts.values():
-        fonts_by_file_hash.setdefault(font.hash, []).append(font)
-    font_references_by_file_hash = {}
-    for file_hash, file_fonts in fonts_by_file_hash.items():
-        # TODO: find why we can have multiple fonts for one font file
-        font = file_fonts[0]
-        if font.bitmap:
-            continue
-
-        # Clean font, optimize and handle emojis
-        cmap = {}
-        if 'fonts' in optimize_size:
-            for file_font in file_fonts:
-                cmap = {**cmap, **file_font.cmap}
-        font.clean(cmap)
-
-        # Include font
-        if font.type == 'otf':
-            font_extra = pydyf.Dictionary({'Subtype': '/OpenType'})
-        else:
-            font_extra = pydyf.Dictionary(
-                {'Length1': len(font.file_content)})
-        font_stream = pydyf.Stream(
-            [font.file_content], font_extra, compress=True)
-        pdf.add_object(font_stream)
-        font_references_by_file_hash[file_hash] = font_stream.reference
-
-    for font in fonts.values():
-        widths = pydyf.Array()
-        for i in sorted(font.widths):
-            if i - 1 not in font.widths:
-                widths.append(i)
-                current_widths = pydyf.Array()
-                widths.append(current_widths)
-            current_widths.append(font.widths[i])
-        font_file = f'FontFile{3 if font.type == "otf" else 2}'
-        to_unicode = pydyf.Stream([
-            b'/CIDInit /ProcSet findresource begin',
-            b'12 dict begin',
-            b'begincmap',
-            b'/CIDSystemInfo',
-            b'<< /Registry (Adobe)',
-            b'/Ordering (UCS)',
-            b'/Supplement 0',
-            b'>> def',
-            b'/CMapName /Adobe-Identity-UCS def',
-            b'/CMapType 2 def',
-            b'1 begincodespacerange',
-            b'<0000> <ffff>',
-            b'endcodespacerange',
-            f'{len(font.cmap)} beginbfchar'.encode()])
-        for glyph, text in font.cmap.items():
-            unicode_codepoints = ''.join(
-                f'{letter.encode("utf-16-be").hex()}' for letter in text)
-            to_unicode.stream.append(
-                f'<{glyph:04x}> <{unicode_codepoints}>'.encode())
-        to_unicode.stream.extend([
-            b'endbfchar',
-            b'endcmap',
-            b'CMapName currentdict /CMap defineresource pop',
-            b'end',
-            b'end'])
-        pdf.add_object(to_unicode)
-        font_dictionary = pydyf.Dictionary({
-            'Type': '/Font',
-            'Subtype': f'/Type{3 if font.bitmap else 0}',
-            'BaseFont': font.name,
-            'ToUnicode': to_unicode.reference,
-        })
-
-        if font.bitmap:
-            # https://docs.microsoft.com/typography/opentype/spec/ebdt
-            font_dictionary['FontBBox'] = pydyf.Array([0, 0, 1, 1])
-            font_dictionary['FontMatrix'] = pydyf.Array([1, 0, 0, 1, 0, 0])
-            if 'fonts' in optimize_size:
-                chars = tuple(sorted(font.cmap))
-            else:
-                chars = tuple(range(256))
-            first, last = chars[0], chars[-1]
-            font_dictionary['FirstChar'] = first
-            font_dictionary['LastChar'] = last
-            differences = []
-            for index, index_widths in zip(widths[::2], widths[1::2]):
-                differences.append(index)
-                for i in range(len(index_widths)):
-                    if i + index in chars:
-                        differences.append(f'/{i + index}')
-            font_dictionary['Encoding'] = pydyf.Dictionary({
-                'Type': '/Encoding',
-                'Differences': pydyf.Array(differences),
-            })
-            char_procs = pydyf.Dictionary({})
-            font_glyphs = font.ttfont['EBDT'].strikeData[0]
-            widths = [0] * (last - first + 1)
-            glyphs_info = {}
-            for key, glyph in font_glyphs.items():
-                glyph_format = glyph.getFormat()
-                glyph_id = font.ttfont.getGlyphID(key)
-
-                # Get and store glyph metrics
-                if glyph_format == 5:
-                    data = glyph.data
-                    subtables = font.ttfont['EBLC'].strikes[0].indexSubTables
-                    for subtable in subtables:
-                        first_index = subtable.firstGlyphIndex
-                        last_index = subtable.lastGlyphIndex
-                        if first_index <= glyph_id <= last_index:
-                            height = subtable.metrics.height
-                            advance = width = subtable.metrics.width
-                            bearing_x = subtable.metrics.horiBearingX
-                            bearing_y = subtable.metrics.horiBearingY
-                            break
-                    else:
-                        LOGGER.warning(
-                            f'Unknown bitmap metrics for glyph: {glyph_id}')
-                        continue
-                else:
-                    data_start = 5 if glyph_format in (1, 2, 8) else 8
-                    data = glyph.data[data_start:]
-                    height, width = glyph.data[0:2]
-                    bearing_x = int.from_bytes(
-                        glyph.data[2:3], 'big', signed=True)
-                    bearing_y = int.from_bytes(
-                        glyph.data[3:4], 'big', signed=True)
-                    advance = glyph.data[4]
-                position_y = bearing_y - height
-                if glyph_id in chars:
-                    widths[glyph_id - first] = advance
-                stride = math.ceil(width / 8)
-                glyph_info = glyphs_info[glyph_id] = {
-                    'width': width,
-                    'height': height,
-                    'x': bearing_x,
-                    'y': position_y,
-                    'stride': stride,
-                    'bitmap': None,
-                    'subglyphs': None,
-                }
-
-                # Decode bitmaps
-                if glyph_format in (1, 6):
-                    glyph_info['bitmap'] = data
-                elif glyph_format in (2, 5, 7):
-                    padding = (8 - (width % 8)) % 8
-                    bits = bin(int(data.hex(), 16))[2:]
-                    bits = bits.zfill(8 * len(data))
-                    bitmap_bits = ''.join(
-                        bits[i * width:(i + 1) * width] + padding * '0'
-                        for i in range(height))
-                    glyph_info['bitmap'] = int(bitmap_bits, 2).to_bytes(
-                        height * stride, 'big')
-                elif glyph_format in (8, 9):
-                    subglyphs = glyph_info['subglyphs'] = []
-                    i = 0 if glyph_format == 9 else 1
-                    number_of_components = int.from_bytes(
-                        data[i:i+2], 'big')
-                    for j in range(number_of_components):
-                        index = (i + 2) + (j * 4)
-                        subglyph_id = int.from_bytes(
-                            data[index:index+2], 'big')
-                        x = int.from_bytes(
-                            data[index+2:index+3], 'big', signed=True)
-                        y = int.from_bytes(
-                            data[index+3:index+4], 'big', signed=True)
-                        subglyphs.append(
-                            {'id': subglyph_id, 'x': x, 'y': y})
-                else:  # pragma: no cover
-                    LOGGER.warning(
-                        f'Unsupported bitmap glyph format: {glyph_format}')
-                    glyph_info['bitmap'] = bytes(height * stride)
-
-            for glyph_id, glyph_info in glyphs_info.items():
-                # Don’t store glyph not in cmap
-                if glyph_id not in chars:
-                    continue
-
-                # Draw glyph
-                stride = glyph_info['stride']
-                width = glyph_info['width']
-                height = glyph_info['height']
-                x = glyph_info['x']
-                y = glyph_info['y']
-                if glyph_info['bitmap'] is None:
-                    length = height * stride
-                    bitmap_int = int.from_bytes(bytes(length), 'big')
-                    for subglyph in glyph_info['subglyphs']:
-                        sub_x = subglyph['x']
-                        sub_y = subglyph['y']
-                        sub_id = subglyph['id']
-                        if sub_id not in glyphs_info:
-                            LOGGER.warning(f'Unknown subglyph: {sub_id}')
-                            continue
-                        subglyph = glyphs_info[sub_id]
-                        if subglyph['bitmap'] is None:
-                            # TODO: support subglyph in subglyph
-                            LOGGER.warning(
-                                'Unsupported subglyph in subglyph: '
-                                f'{sub_id}')
-                            continue
-                        for row_y in range(subglyph['height']):
-                            row_slice = slice(
-                                row_y * subglyph['stride'],
-                                (row_y + 1) * subglyph['stride'])
-                            row = subglyph['bitmap'][row_slice]
-                            row_int = int.from_bytes(row, 'big')
-                            shift = (
-                                stride * 8 * (height - sub_y - row_y - 1))
-                            stride_difference = stride - subglyph['stride']
-                            if stride_difference > 0:
-                                row_int <<= stride_difference * 8
-                            elif stride_difference < 0:
-                                row_int >>= -stride_difference * 8
-                            if sub_x > 0:
-                                row_int >>= sub_x
-                            elif sub_x < 0:
-                                row_int <<= -sub_x
-                            row_int %= 1 << stride * 8
-                            row_int <<= shift
-                            bitmap_int |= row_int
-                    bitmap = bitmap_int.to_bytes(length, 'big')
-                else:
-                    bitmap = glyph_info['bitmap']
-                bitmap_stream = pydyf.Stream([
-                    b'0 0 d0',
-                    f'{width} 0 0 {height} {x} {y} cm'.encode(),
-                    b'BI',
-                    b'/IM true',
-                    b'/W', width,
-                    b'/H', height,
-                    b'/BPC 1',
-                    b'/D [1 0]',
-                    b'ID', bitmap, b'EI'
-                ])
-                pdf.add_object(bitmap_stream)
-                char_procs[glyph_id] = bitmap_stream.reference
-
-            pdf.add_object(char_procs)
-            font_dictionary['Widths'] = pydyf.Array(widths)
-            font_dictionary['CharProcs'] = char_procs.reference
-
-        else:
-            font_descriptor = pydyf.Dictionary({
-                'Type': '/FontDescriptor',
-                'FontName': font.name,
-                'FontFamily': pydyf.String(font.family),
-                'Flags': font.flags,
-                'FontBBox': pydyf.Array(font.bbox),
-                'ItalicAngle': font.italic_angle,
-                'Ascent': font.ascent,
-                'Descent': font.descent,
-                'CapHeight': font.bbox[3],
-                'StemV': font.stemv,
-                'StemH': font.stemh,
-                font_file: font_references_by_file_hash[font.hash],
-            })
-            if pdf.version <= b'1.4':
-                cids = sorted(font.widths)
-                padded_width = int(math.ceil(cids[-1] / 8))
-                bits = ['0'] * padded_width * 8
-                for cid in cids:
-                    bits[cid] = '1'
-                stream = pydyf.Stream(
-                    (int(''.join(bits), 2).to_bytes(padded_width, 'big'),))
-                pdf.add_object(stream)
-                font_descriptor['CIDSet'] = stream.reference
-            if font.type == 'otf':
-                font_descriptor['Subtype'] = '/OpenType'
-            pdf.add_object(font_descriptor)
-            subfont_dictionary = pydyf.Dictionary({
-                'Type': '/Font',
-                'Subtype': f'/CIDFontType{0 if font.type == "otf" else 2}',
-                'BaseFont': font.name,
-                'CIDSystemInfo': pydyf.Dictionary({
-                    'Registry': pydyf.String('Adobe'),
-                    'Ordering': pydyf.String('Identity'),
-                    'Supplement': 0,
-                }),
-                'CIDToGIDMap': '/Identity',
-                'W': widths,
-                'FontDescriptor': font_descriptor.reference,
-            })
-            pdf.add_object(subfont_dictionary)
-            font_dictionary['Encoding'] = '/Identity-H'
-            font_dictionary['DescendantFonts'] = pydyf.Array(
-                [subfont_dictionary.reference])
-        pdf.add_object(font_dictionary)
-        pdf_fonts[font.hash] = font_dictionary.reference
-
+    # Embedded fonts
+    pdf_fonts = build_fonts_dictionary(pdf, fonts, optimize_size)
     pdf.add_object(pdf_fonts)
     resources['Font'] = pdf_fonts.reference
     _use_references(pdf, resources, images)