pymupdf · JorjMcKie · Apr 24, 2024 · Apr 23, 2024
diff --git a/changes.txt b/changes.txt
@@ -2,6 +2,23 @@ Change Log
 ==========
 
 
+**Changes in version 1.24.3 (2024-04-xx)**
+
+
+* Fixed issues:
+
+  * **Fixed** `3402 <https://github.com/pymupdf/PyMuPDF/issues/3402>`_: Cannot add Widgets containing inter-field-calculation JavaScript
+
+  * **Fixed** `3379 <https://github.com/pymupdf/PyMuPDF/issues/3379>`_: Documentation mismatch for get_text_blocks return value order.
+
+* Other:
+
+  * New/modified methods:
+
+    * `Page.remove_rotation()`: new, set page rotation to zero while keeping appearance.
+
+
+
 **Changes in version 1.24.2 (2024-04-17)**
 
 * Removed obsolete classic implementation from releases

diff --git a/docs/document.rst b/docs/document.rst
@@ -1146,9 +1146,16 @@ For details on **embedded files** refer to Appendix 3.
 
   .. method:: bake(*, annots=True, widgets=True)
 
-    PDF only: Convert annotations and / or widgets to become permanent parts of the pages. This will retain each page's appearance. When widgets (fields) are selected, the document will no longer be a "Form PDF".
+    PDF only: Convert annotations and / or widgets to become permanent parts of the pages. The PDF **will be changed** by this method. If `widgets` is `True`, the document will also no longer be a "Form PDF".
+
+    All pages will look the same, but will no longer have annotations or fields, respectively. The visible parts will be converted to standard text, vector graphics or images as required.
+
+    The method may thus be a viable **alternative for PDF-to-PDF conversions** using :meth:`Document.convert_to_pdf`.
+
+    Please consider that annotations are complex objects and may consist of more data "underneath" their visual appearance. Examples are "Text" and "FileAttachment" annotations. When "baking in" annotations / widgets with this method, all this underlying information (attached files, comments, associated PopUp annotations, etc.) will be lost and be removed on next garbage collection.
+
+    Use this feature for instance for methods :meth:`Document.insert_pdf` (which supports no copying of widgets) or :meth:`Page.show_pdf_page` (which supports neither annotations nor widgets) when the source pages should look exactly the same in the target.
 
-    Use this feature for instance in :meth:`Document.insert_pdf` (which supports no copying of widgets) or :meth:`Page.show_pdf_page` (which supports neither annotations nor widgets) when the same page appearance is desired.
 
     :arg bool annots: convert annotations.
     :arg bool widgets: convert fields / widgets. After execution, the document will no longer be a "Form PDF".
@@ -1806,19 +1813,28 @@ For details on **embedded files** refer to Appendix 3.
     :returns: *True* / *False*. As opposed to fields, which are also stored in a central place of a PDF document, the existence of links / annotations can only be detected by parsing each page. These methods are tuned to do this efficiently and will immediately return, if the answer is *True* for a page. For PDFs with many thousand pages however, an answer may take some time [#f6]_ if no link, resp. no annotation is found.
 
 
-  .. method:: subset_fonts()
+  .. method:: subset_fonts(verbose=False, fallback=False)
+
+    PDF only: Investigate eligible fonts for their use by text in the document. If a font is supported and a size reduction is possible, that font is replaced by a version with a subset of its characters.
 
-    * New in v1.18.7, changed in v1.18.9
+    Use this method immediately before saving the document.
 
-    PDF only: Investigate eligible fonts for their use by text in the document. If a font is supported and a size reduction is possible, that font is replaced by a version with a character subset.
+    :arg bool verbose: write various progress information to standard output. This currently only has an effect if `fallback` is `True`.
+    :arg bool fallback: if `True` use the deprecated algorithm that makes use of package `fontTools <https://pypi.org/project/fonttools/>`_ (which hence must be installed). If using the recommended value `False` (default), MuPDF's native function is used -- which is **very much faster** and can subset a broader range of font types. Package fontTools is not required then.
 
-    Use this method immediately before saving the document. The following features and restrictions apply for the time being:
+    The greatest benefit can be achieved when creating new PDFs using large fonts, as is typical for Asian scripts. When using the :ref:`Story` class or method :meth:`Page.insert_htmlbox`, multiple fonts may automatically be included -- without the programmer becoming aware of it.
+
+    In all these cases, the set of unicodes actually used is mostly very small compared to the number of glyphs available in the fonts used. Using this method can easily reduce the embedded font binaries by two orders of magnitude -- from several megabytes down to a low two-digit kilobyte amount.
 
-    * Package `fontTools <https://pypi.org/project/fonttools/>`_ **must be installed**. It is required for creating the font subsets. If not installed, the method raises an `ImportError` exception.
-    * Supported font types only include embedded OTF, TTF and WOFF that are **not already subsets**.
-    * **Changed in v1.18.9:** A subset font directly replaces its original -- text remains untouched and **is not rewritten.** It thus should retain all its properties, like spacing, hiddenness, control by Optional Content, etc.
+    Creating font subsets leaves behind a large number of large, now unused PDF objects ("ghosts"). Therefore, make sure to compress and garbage-collect when saving the file. We recommend using :meth:`Document.ez_save`.
+
+    |history_begin|
+
+    * New in v1.18.7
+    * Changed in v1.18.9
+    * Changed in v1.24.2: use native function of MuPDF.
 
-    The greatest benefit can be achieved when creating new PDFs using large fonts like is typical for Asian scripts. In these cases, the set of actually used unicodes mostly is small compared to the number of glyphs in the font. Using this feature can easily reduce the embedded font binary by two orders of magnitude -- from several megabytes to a low two-digit kilobyte amount.
+    |history_end|
 
 
   .. method:: journal_enable()

diff --git a/docs/page.rst b/docs/page.rst
@@ -106,6 +106,7 @@ In a nutshell, this is what you can do with PyMuPDF:
 :meth:`Page.load_widget`           PDF only: load a specific field
 :meth:`Page.load_links`            return the first link on a page
 :meth:`Page.new_shape`             PDF only: create a new :ref:`Shape`
+:meth:`Page.remove_rotation`       PDF only: set page rotation to 0
 :meth:`Page.replace_image`         PDF only: replace an image
 :meth:`Page.search_for`            search for a string
 :meth:`Page.set_artbox`            PDF only: modify `/ArtBox`
@@ -1908,6 +1909,14 @@ In a nutshell, this is what you can do with PyMuPDF:
 
       :arg int rotate: An integer specifying the required rotation in degrees. Must be an integer multiple of 90. Values will be converted to one of 0, 90, 180, 270.
 
+   .. method:: remove_rotation()
+
+      PDF only: Set page rotation to 0 while maintaining appearance and page content.
+
+      :returns: The inverted matrix used to achieve this change. If the page was not rotated (rotation 0), :ref:`Identity` is returned. The method automatically recomputes the rectangles of any annotations, links and widgets present on the page.
+
+         This method may come in handy when e.g. used with :meth:`Page.show_pdf_page`.
+
    .. index::
       pair: clip; show_pdf_page
       pair: keep_proportion; show_pdf_page

diff --git a/src/__init__.py b/src/__init__.py
@@ -8960,6 +8960,54 @@ def get_lineart(self) -> object:
             val = None
             return paths
 
+    def remove_rotation(self):
+        """Set page rotation to 0 while maintaining visual appearance.
+
+        A compensating matrix is prepended to the page contents, the
+        mediabox is transposed for rotations 90 / 270, and the rectangles
+        of annotations, links and widgets are recomputed.
+
+        Returns:
+            Identity if the page was not rotated, otherwise the inverse
+            of the matrix that was prepended to the page contents.
+        """
+        rotation = self.rotation  # normalized rotation value
+        if rotation == 0:
+            return Identity  # nothing to do
+
+        mb = self.mediabox  # current mediabox
+        if rotation == 90:
+            # before derotation, shift content horizontally
+            mat0 = Matrix(1, 0, 0, 1, mb.y1 - mb.x1 - mb.x0 - mb.y0, 0)
+        elif rotation == 270:
+            # before derotation, shift content vertically
+            mat0 = Matrix(1, 0, 0, 1, 0, mb.x1 - mb.y1 - mb.y0 - mb.x0)
+        else:  # rotation == 180
+            mat0 = Matrix(1, 0, 0, 1, -2 * mb.x0, -2 * mb.y0)
+
+        mat = mat0 * self.derotation_matrix  # prefix with derotation matrix
+        # "%g" may emit exponents / drop digits; PDF numbers need plain decimals
+        nums = " ".join(f"{v:.6f}".rstrip("0").rstrip(".") for v in tuple(mat))
+        TOOLS._insert_contents(self, b"%s cm " % nums.encode(), False)  # prepend
+
+        if rotation in (90, 270):  # swap x- and y-coordinates
+            mb.x0, mb.y0, mb.x1, mb.y1 = mb.y0, mb.x0, mb.y1, mb.x1
+            self.set_mediabox(mb)
+
+        self.set_rotation(0)
+        inv = ~mat  # inverse of the matrix prepended above
+        for annot in self.annots():  # modify rectangles of annotations
+            annot.set_rect(annot.rect * inv)
+        for link in self.get_links():  # modify 'from' rectangles of links
+            r = link["from"] * inv
+            self.delete_link(link)
+            link["from"] = r
+            self.insert_link(link)
+        for widget in self.widgets():  # modify field rectangles
+            widget.rect = widget.rect * inv
+            widget.update()
+        return inv  # the inverse of the generated derotation matrix
+
     def cluster_drawings(
         self, clip=None, drawings=None, x_tolerance: float = 3, y_tolerance: float = 3
     ) -> list:
@@ -20426,8 +20474,8 @@ def util_ensure_widget_calc(annot):
             PDF_NAME('AcroForm'),
             )
 
-    CO = mupdf.pdf_dict_get(acro, PDFNAME_CO)    # = AcroForm/CO
-    if not CO.this:
+    CO = mupdf.pdf_dict_get(acro, PDFNAME_CO)  # = AcroForm/CO
+    if not mupdf.pdf_is_array(CO):
         CO = mupdf.pdf_dict_put_array(acro, PDFNAME_CO, 2)
     n = mupdf.pdf_array_len(CO)
     found = 0

diff --git a/src/table.py b/src/table.py
@@ -1887,6 +1887,7 @@ def make_chars(page, clip=None):
     for block in blocks:
         for line in block["lines"]:
             ldir = line["dir"]  # = (cosine, sine) of angle
+            ldir = (round(ldir[0], 4), round(ldir[1], 4))
             matrix = Matrix(ldir[0], -ldir[1], ldir[1], ldir[0], 0, 0)
             if ldir[1] == 0:
                 upright = True

diff --git a/src/utils.py b/src/utils.py
@@ -480,16 +480,11 @@ def get_text_blocks(
         flags: (int) control the amount of data parsed into the textpage.
     Returns:
         A list of the blocks. Each item contains the containing rectangle
-        coordinates, text lines, block type and running block number.
+        coordinates, text lines, running block number and block type.
     """
     fitz.CheckParent(page)
     if flags is None:
-        flags = (
-                fitz.TEXT_PRESERVE_WHITESPACE
-                    | fitz.TEXT_PRESERVE_IMAGES
-                    | fitz.TEXT_PRESERVE_LIGATURES
-                    | fitz.TEXT_MEDIABOX_CLIP
-                )
+        flags = fitz.TEXTFLAGS_BLOCKS
     tp = textpage
     if tp is None:
         tp = page.get_textpage(clip=clip, flags=flags)

diff --git a/tests/gentle_compare.py b/tests/gentle_compare.py
@@ -0,0 +1,26 @@
+import fitz
+
+
+def gentle_compare(w0, w1):
+    """Tell whether two "words" extractions are approximately equal.
+
+    * the number of words must be the same in both lists
+    * corresponding items must carry identical word strings
+    * corresponding word rectangles must differ by a tiny amount at most
+    """
+    tolerance = 1e-3  # upper limit for the norm of the difference rectangle
+    word_count = len(w0)  # number of words in the first list
+    if word_count != len(w1):
+        print(f"different number of words: {word_count}/{len(w1)}")
+        return False
+    for i, (item0, item1) in enumerate(zip(w0, w1)):
+        if item0[4] != item1[4]:  # word strings must be the same
+            print(f"word {i} mismatch")
+            return False
+        bbox0 = fitz.Rect(item0[:4])  # rectangle of the word in the first list
+        bbox1 = fitz.Rect(item1[:4])  # rectangle of the word in the second list
+        delta = (bbox1 - bbox0).norm()  # norm of the difference rectangle
+        if delta > tolerance:
+            print(f"word {i}: rectangle mismatch {delta}")
+            return False
+    return True
diff --git a/tests/resources/test-2812.pdf b/tests/resources/test-2812.pdf
diff --git a/tests/test-remove-rotation.py b/tests/test-remove-rotation.py
@@ -0,0 +1,30 @@
+import os
+import fitz
+from gentle_compare import gentle_compare
+
+scriptdir = os.path.dirname(__file__)
+
+
+def test_remove_rotation():
+    """Remove rotation verifying identical appearance and text."""
+    filename = os.path.join(scriptdir, "resources", "test-2812.pdf")
+    doc = fitz.open(filename)
+
+    # We always create fresh pages to avoid false positives from cache content.
+    # Text on these pages consists of pairwise different strings, sorting by
+    # these strings must therefore yield identical bounding boxes.
+    for i in range(1, doc.page_count):  # page 0 is skipped - presumably unrotated; verify
+        assert doc[i].rotation  # must be a rotated page
+        pix0 = doc[i].get_pixmap()  # image before derotation
+        words0 = []  # word rectangles mapped by the rotation matrix, plus word string
+        for w in doc[i].get_text("words"):
+            words0.append(list(fitz.Rect(w[:4]) * doc[i].rotation_matrix) + [w[4]])
+        words0.sort(key=lambda w: w[4])  # sort by word strings
+        # derotate page and confirm nothing else has changed
+        doc[i].remove_rotation()
+        assert doc[i].rotation == 0
+        pix1 = doc[i].get_pixmap()  # image after derotation
+        words1 = doc[i].get_text("words")
+        words1.sort(key=lambda w: w[4])  # sort by word strings
+        assert pix1.digest == pix0.digest, f"{pix1.digest}/{pix0.digest}"
+        assert gentle_compare(words0, words1)
diff --git a/tests/test_mupdf_regressions.py b/tests/test_mupdf_regressions.py
@@ -1,34 +1,10 @@
 import fitz
 import os
+from gentle_compare import gentle_compare
 
 scriptdir = os.path.abspath(os.path.dirname(__file__))
 
 
-def gentle_compare(w0, w1):
-    """Check lists of "words" extractions for approximate equality.
-
-    * both lists must have same length
-    * word items must contain same word strings
-    * word rectangles must be approximately equal
-    """
-    tolerance = 1e-3  # maximum (Euclidean) norm of difference rectangle
-    word_count = len(w0)  # number of words
-    if word_count != len(w1):
-        print(f"different number of words: {word_count}/{len(w1)}")
-        return False
-    for i in range(word_count):
-        if w0[i][4] != w1[i][4]:  # word strings must be the same
-            print(f"word {i} mismatch")
-            return False
-        r0 = fitz.Rect(w0[i][:4])  # rect of first word
-        r1 = fitz.Rect(w1[i][:4])  # rect of second word
-        delta = (r1 - r0).norm()  # norm of difference rectangle
-        if delta > tolerance:
-            print(f"word {i}: rectangle mismatch {delta}")
-            return False
-    return True
-
-
 def test_707448():
     """Confirm page content cleaning does not destroy page appearance."""
     filename = os.path.join(scriptdir, "resources", "test-707448.pdf")
@@ -80,7 +56,7 @@ def test_707727():
     if fitz.mupdf_version_tuple <= (1, 24, 1):
         # We expect warnings.
         wt = fitz.TOOLS.mupdf_warnings()
-        print(f'{wt=}')
+        print(f"{wt=}")
         assert wt
 
 
@@ -90,7 +66,9 @@ def test_707721():
     MuPDF issue: https://bugs.ghostscript.com/show_bug.cgi?id=707721
     """
     if fitz.mupdf_version_tuple < (1, 24, 2):
-        print('test_707721(): not running because MuPDF-{fitz.mupdf_version} known to hang.')
+        print(
+            "test_707721(): not running because MuPDF-{fitz.mupdf_version} known to hang."
+        )
         return
     filename = os.path.join(scriptdir, "resources", "test_3357.pdf")
     doc = fitz.open(filename)