From 2aea3e97b89c4c7f4845b888a2dc1fc647db7576 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 13 Jun 2022 21:09:51 +0200 Subject: [PATCH] fix 'utf-16-be' codec can't decode bytes in position 0-1: unexpected end of data use surrogatepass in _cmap and _page --- PyPDF2/__init__.py | 2 +- PyPDF2/_cmap.py | 604 ++++---- PyPDF2/_page.py | 2945 +++++++++++++++++++-------------------- PyPDF2/_utils.py | 662 ++++----- tests/test_page.py | 1 + tests/test_reader.py | 1 + tests/test_workflows.py | 13 +- 7 files changed, 2123 insertions(+), 2105 deletions(-) diff --git a/PyPDF2/__init__.py b/PyPDF2/__init__.py index d9759231d..09076ba8d 100644 --- a/PyPDF2/__init__.py +++ b/PyPDF2/__init__.py @@ -1,5 +1,5 @@ from ._merger import PdfFileMerger, PdfMerger -from ._page import Transformation, PageObject +from ._page import PageObject, Transformation from ._reader import DocumentInformation, PdfFileReader, PdfReader from ._version import __version__ from ._writer import PdfFileWriter, PdfWriter diff --git a/PyPDF2/_cmap.py b/PyPDF2/_cmap.py index 32b9fcc41..bb9c477a1 100644 --- a/PyPDF2/_cmap.py +++ b/PyPDF2/_cmap.py @@ -1,294 +1,310 @@ -import warnings -from binascii import unhexlify -from typing import Any, Dict, List, Tuple, Union, cast - -from ._adobe_glyphs import adobe_glyphs -from .errors import PdfReadWarning -from .generic import DecodedStreamObject, DictionaryObject, charset_encoding - - -# code freely inspired from @twiggy ; see #711 -def build_char_map( - font_name: str, space_width: float, obj: DictionaryObject -) -> Tuple[ - str, float, Union[str, Dict[int, str]], Dict -]: # font_type,space_width /2, encoding, cmap - ft: DictionaryObject = obj["/Resources"]["/Font"][font_name] # type: ignore - font_type: str = cast(str, ft["/Subtype"]) - - space_code = 32 - encoding, space_code = parse_encoding(ft, space_code) - map_dict, space_code, int_entry = parse_to_unicode(ft, space_code) - - # encoding can be either a string for decode (on 1,2 or a variable number of bytes) of a char table (for 1 byte only for me) - # if empty string, it means it is than encoding field is not present and we have to select the good encoding from cmap input data - if encoding == "": - if -1 not in map_dict or map_dict[-1] == 1: - # I have not been able to find any rule for no /Encoding nor /ToUnicode - # One example shows /Symbol,bold I consider 8 bits encoding default - encoding = "charmap" - else: - encoding = "utf-16-be" - # apply rule from PDF ref 1.7 §5.9.1, 1st bullet : if cmap not empty encoding should be discarded (here transformed into identity for those characters) - # if encoding is an str it is expected to be a identity translation - elif isinstance(encoding, dict): - for x in int_entry: - if x <= 255: - encoding[x] = chr(x) - if font_name in _default_fonts_space_width: - # override space_width with new params - space_width = _default_fonts_space_width[font_name] - sp_width = compute_space_width(ft, space_code, space_width) - - return ( - font_type, - float(sp_width / 2), - encoding, - # https://github.com/python/mypy/issues/4374 - map_dict, # type: ignore - ) # type: ignore - - -# used when missing data, e.g. font def missing -unknown_char_map : Tuple[str, float, Union[str, Dict[int, str]], Dict] = ( - "Unknown", 9999, dict(zip(range(256), ["�"] * 256)), {} -) - - -_predefined_cmap: Dict[str, str] = { - "/Identity-H": "utf-16-be", - "/Identity-V": "utf-16-be", - "/GB-EUC-H": "gbk", # TBC - "/GB-EUC-V": "gbk", # TBC - "/GBpc-EUC-H": "gb2312", # TBC - "/GBpc-EUC-V": "gb2312", # TBC -} - - -# manually extracted from http://mirrors.ctan.org/fonts/adobe/afm/Adobe-Core35_AFMs-229.tar.gz -_default_fonts_space_width: Dict[str, int] = { - "/Courrier": 600, - "/Courier-Bold": 600, - "/Courier-BoldOblique": 600, - "/Courier-Oblique": 600, - "/Helvetica": 278, - "/Helvetica-Bold": 278, - "/Helvetica-BoldOblique": 278, - "/Helvetica-Oblique": 278, - "/Helvetica-Narrow": 228, - "/Helvetica-NarrowBold": 228, - "/Helvetica-NarrowBoldOblique": 228, - "/Helvetica-NarrowOblique": 228, - "/Times-Roman": 250, - "/Times-Bold": 250, - "/Times-BoldItalic": 250, - "/Times-Italic": 250, - "/Symbol": 250, - "/ZapfDingbats": 278, -} - - -def parse_encoding( - ft: DictionaryObject, space_code: int -) -> Tuple[Union[str, Dict[int, str]], int]: - encoding: Union[str, List[str], Dict[int, str]] = [] - if "/Encoding" not in ft: - try: - if "/BaseFont" in ft and ft["/BaseFont"] in charset_encoding: - encoding = dict(zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])) - else: - encoding = "charmap" - return encoding, _default_fonts_space_width[cast(str, ft["/BaseFont"])] - except Exception: - if ft["/Subtype"] == "/Type1": - return "charmap", space_code - else: - return "", space_code - enc: Union(str, DictionaryObject) = ft["/Encoding"].get_object() # type: ignore - if isinstance(enc, str): - try: - if enc in charset_encoding: - encoding = charset_encoding[enc].copy() - elif enc in _predefined_cmap: - encoding = _predefined_cmap[enc] - else: - raise Exception("not found") - except Exception: - warnings.warn( - f"Advanced encoding {enc} not implemented yet", - PdfReadWarning, - ) - encoding = enc - elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc: - try: - encoding = charset_encoding[cast(str, enc["/BaseEncoding"])].copy() - except Exception: - warnings.warn( - f"Advanced encoding {encoding} not implemented yet", - PdfReadWarning, - ) - encoding = charset_encoding["/StandardCoding"].copy() - else: - encoding = charset_encoding["/StandardCoding"].copy() - if "/Differences" in enc: - x: int = 0 - o: Union[int, str] - for o in cast(DictionaryObject, cast(DictionaryObject, enc)["/Differences"]): - if isinstance(o, int): - x = o - else: # isinstance(o,str): - try: - encoding[x] = adobe_glyphs[o] # type: ignore - except Exception: - encoding[x] = o # type: ignore - if o == " ": - space_code = x - x += 1 - if isinstance(encoding, list): - encoding = dict(zip(range(256), encoding)) - return encoding, space_code - - -def parse_to_unicode(ft: DictionaryObject, space_code: int) -> Tuple[Dict, int, List[int]]: - map_dict: Dict[ - Any, Any - ] = ( - {} - ) # will store all translation code and map_dict[-1] we will have the number of bytes to convert - int_entry : List[int] = [] # will provide the list of cmap keys as int to correct encoding - if "/ToUnicode" not in ft: - return {}, space_code, [] - process_rg: bool = False - process_char: bool = False - cm: bytes = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data() - # we need to prepare cm before due to missing return line in pdf printed to pdf from word - cm = ( - cm.strip() - .replace(b"beginbfchar", b"\nbeginbfchar\n") - .replace(b"endbfchar", b"\nendbfchar\n") - .replace(b"beginbfrange", b"\nbeginbfrange\n") - .replace(b"endbfrange", b"\nendbfrange\n") - .replace(b"<<", b"\n{\n") # text between << and >> not used but - .replace(b">>", b"\n}\n") # some solution to find it back - ) - ll = cm.split(b"<") - for i in range(len(ll)): - j = ll[i].find(b">") - if j >= 0: - ll[i] = ll[i][:j].replace(b" ", b"") + b" " + ll[i][j + 1 :] - cm = (b" ".join(ll)).replace(b"[", b" [ ").replace(b"]", b" ]\n ").replace(b"\r", b"\n") - - for l in cm.split(b"\n"): - if l in (b"", b" "): - continue - if b"beginbfrange" in l: - process_rg = True - elif b"endbfrange" in l: - process_rg = False - elif b"beginbfchar" in l: - process_char = True - elif b"endbfchar" in l: - process_char = False - elif process_rg: - lst = [x for x in l.split(b" ") if x] - a = int(lst[0], 16) - b = int(lst[1], 16) - nbi = len(lst[0]) - map_dict[-1] = nbi // 2 - fmt = b"%%0%dX" % nbi - if lst[2] == b"[": - for sq in lst[3:]: - if sq == b"]": - break - map_dict[ - unhexlify(fmt % a).decode( - "charmap" if map_dict[-1] == 1 else "utf-16-be" - ) - ] = unhexlify(sq).decode("utf-16-be") - int_entry.append(a) - a += 1 - assert a > b - else: - c = int(lst[2], 16) - fmt2 = b"%%0%dX" % len(lst[2]) - while a <= b: - map_dict[ - unhexlify(fmt % a).decode( - "charmap" if map_dict[-1] == 1 else "utf-16-be" - ) - ] = unhexlify(fmt2 % c).decode("utf-16-be") - int_entry.append(a) - a += 1 - c += 1 - elif process_char: - lst = [x for x in l.split(b" ") if x] - map_dict[-1] = len(lst[0]) // 2 - while len(lst) > 0: - map_dict[ - unhexlify(lst[0]).decode( - "charmap" if map_dict[-1] == 1 else "utf-16-be" - ) - ] = unhexlify(lst[1]).decode( - "utf-16-be" - ) # join is here as some cases where the code was split - int_entry.append(int(lst[0], 16)) - lst = lst[2:] - for a in map_dict: - if map_dict[a] == " ": - space_code = a - return map_dict, space_code, int_entry - - -def compute_space_width( - ft: DictionaryObject, space_code: int, space_width: float -) -> float: - sp_width: float = space_width * 2 # default value - w = [] - st: int = 0 - if "/W" in ft: - if "/DW" in ft: - sp_width = cast(float, ft["/DW"]) - w = list(ft["/W"]) # type: ignore - while len(w) > 0: - st = w[0] - second = w[1] - if isinstance(int, second): - if st <= space_code and space_code <= second: - sp_width = w[2] - break - w = w[3:] - if isinstance(list, second): - if st <= space_code and space_code <= st + len(second) - 1: - sp_width = second[space_code - st] - w = w[2:] - else: - warnings.warn( - "unknown widths : \n" + (ft["/W"]).__repr__(), - PdfReadWarning, - ) - break - if "/Widths" in ft: - w = list(ft["/Widths"]) # type: ignore - try: - st = cast(int, ft["/FirstChar"]) - en: int = cast(int, ft["/LastChar"]) - if st > space_code or en < space_code: - raise Exception("Not in range") - if w[space_code - st] == 0: - raise Exception("null width") - sp_width = w[space_code - st] - except Exception: - if "/FontDescriptor" in ft and "/MissingWidth" in cast( - DictionaryObject, ft["/FontDescriptor"] - ): - sp_width = ft["/FontDescriptor"]["/MissingWidth"] # type: ignore - else: - # will consider width of char as avg(width)/2 - m = 0 - cpt = 0 - for x in w: - if x > 0: - m += x - cpt += 1 - sp_width = m / max(1, cpt) / 2 - return sp_width +import warnings +from binascii import unhexlify +from typing import Any, Dict, List, Tuple, Union, cast + +from ._adobe_glyphs import adobe_glyphs +from .errors import PdfReadWarning +from .generic import DecodedStreamObject, DictionaryObject, charset_encoding + + +# code freely inspired from @twiggy ; see #711 +def build_char_map( + font_name: str, space_width: float, obj: DictionaryObject +) -> Tuple[ + str, float, Union[str, Dict[int, str]], Dict +]: # font_type,space_width /2, encoding, cmap + ft: DictionaryObject = obj["/Resources"]["/Font"][font_name] # type: ignore + font_type: str = cast(str, ft["/Subtype"]) + + space_code = 32 + encoding, space_code = parse_encoding(ft, space_code) + map_dict, space_code, int_entry = parse_to_unicode(ft, space_code) + + # encoding can be either a string for decode (on 1,2 or a variable number of bytes) of a char table (for 1 byte only for me) + # if empty string, it means it is than encoding field is not present and we have to select the good encoding from cmap input data + if encoding == "": + if -1 not in map_dict or map_dict[-1] == 1: + # I have not been able to find any rule for no /Encoding nor /ToUnicode + # One example shows /Symbol,bold I consider 8 bits encoding default + encoding = "charmap" + else: + encoding = "utf-16-be" + # apply rule from PDF ref 1.7 §5.9.1, 1st bullet : if cmap not empty encoding should be discarded (here transformed into identity for those characters) + # if encoding is an str it is expected to be a identity translation + elif isinstance(encoding, dict): + for x in int_entry: + if x <= 255: + encoding[x] = chr(x) + if font_name in _default_fonts_space_width: + # override space_width with new params + space_width = _default_fonts_space_width[font_name] + sp_width = compute_space_width(ft, space_code, space_width) + + return ( + font_type, + float(sp_width / 2), + encoding, + # https://github.com/python/mypy/issues/4374 + map_dict, # type: ignore + ) # type: ignore + + +# used when missing data, e.g. font def missing +unknown_char_map: Tuple[str, float, Union[str, Dict[int, str]], Dict] = ( + "Unknown", + 9999, + dict(zip(range(256), ["�"] * 256)), + {}, +) + + +_predefined_cmap: Dict[str, str] = { + "/Identity-H": "utf-16-be", + "/Identity-V": "utf-16-be", + "/GB-EUC-H": "gbk", # TBC + "/GB-EUC-V": "gbk", # TBC + "/GBpc-EUC-H": "gb2312", # TBC + "/GBpc-EUC-V": "gb2312", # TBC +} + + +# manually extracted from http://mirrors.ctan.org/fonts/adobe/afm/Adobe-Core35_AFMs-229.tar.gz +_default_fonts_space_width: Dict[str, int] = { + "/Courrier": 600, + "/Courier-Bold": 600, + "/Courier-BoldOblique": 600, + "/Courier-Oblique": 600, + "/Helvetica": 278, + "/Helvetica-Bold": 278, + "/Helvetica-BoldOblique": 278, + "/Helvetica-Oblique": 278, + "/Helvetica-Narrow": 228, + "/Helvetica-NarrowBold": 228, + "/Helvetica-NarrowBoldOblique": 228, + "/Helvetica-NarrowOblique": 228, + "/Times-Roman": 250, + "/Times-Bold": 250, + "/Times-BoldItalic": 250, + "/Times-Italic": 250, + "/Symbol": 250, + "/ZapfDingbats": 278, +} + + +def parse_encoding( + ft: DictionaryObject, space_code: int +) -> Tuple[Union[str, Dict[int, str]], int]: + encoding: Union[str, List[str], Dict[int, str]] = [] + if "/Encoding" not in ft: + try: + if "/BaseFont" in ft and ft["/BaseFont"] in charset_encoding: + encoding = dict( + zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])]) + ) + else: + encoding = "charmap" + return encoding, _default_fonts_space_width[cast(str, ft["/BaseFont"])] + except Exception: + if ft["/Subtype"] == "/Type1": + return "charmap", space_code + else: + return "", space_code + enc: Union(str, DictionaryObject) = ft["/Encoding"].get_object() # type: ignore + if isinstance(enc, str): + try: + if enc in charset_encoding: + encoding = charset_encoding[enc].copy() + elif enc in _predefined_cmap: + encoding = _predefined_cmap[enc] + else: + raise Exception("not found") + except Exception: + warnings.warn( + f"Advanced encoding {enc} not implemented yet", + PdfReadWarning, + ) + encoding = enc + elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc: + try: + encoding = charset_encoding[cast(str, enc["/BaseEncoding"])].copy() + except Exception: + warnings.warn( + f"Advanced encoding {encoding} not implemented yet", + PdfReadWarning, + ) + encoding = charset_encoding["/StandardCoding"].copy() + else: + encoding = charset_encoding["/StandardCoding"].copy() + if "/Differences" in enc: + x: int = 0 + o: Union[int, str] + for o in cast(DictionaryObject, cast(DictionaryObject, enc)["/Differences"]): + if isinstance(o, int): + x = o + else: # isinstance(o,str): + try: + encoding[x] = adobe_glyphs[o] # type: ignore + except Exception: + encoding[x] = o # type: ignore + if o == " ": + space_code = x + x += 1 + if isinstance(encoding, list): + encoding = dict(zip(range(256), encoding)) + return encoding, space_code + + +def parse_to_unicode( + ft: DictionaryObject, space_code: int +) -> Tuple[Dict, int, List[int]]: + map_dict: Dict[ + Any, Any + ] = ( + {} + ) # will store all translation code and map_dict[-1] we will have the number of bytes to convert + int_entry: List[ + int + ] = [] # will provide the list of cmap keys as int to correct encoding + if "/ToUnicode" not in ft: + return {}, space_code, [] + process_rg: bool = False + process_char: bool = False + cm: bytes = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data() + # we need to prepare cm before due to missing return line in pdf printed to pdf from word + cm = ( + cm.strip() + .replace(b"beginbfchar", b"\nbeginbfchar\n") + .replace(b"endbfchar", b"\nendbfchar\n") + .replace(b"beginbfrange", b"\nbeginbfrange\n") + .replace(b"endbfrange", b"\nendbfrange\n") + .replace(b"<<", b"\n{\n") # text between << and >> not used but + .replace(b">>", b"\n}\n") # some solution to find it back + ) + ll = cm.split(b"<") + for i in range(len(ll)): + j = ll[i].find(b">") + if j >= 0: + ll[i] = ll[i][:j].replace(b" ", b"") + b" " + ll[i][j + 1 :] + cm = ( + (b" ".join(ll)) + .replace(b"[", b" [ ") + .replace(b"]", b" ]\n ") + .replace(b"\r", b"\n") + ) + + for l in cm.split(b"\n"): + if l in (b"", b" "): + continue + if b"beginbfrange" in l: + process_rg = True + elif b"endbfrange" in l: + process_rg = False + elif b"beginbfchar" in l: + process_char = True + elif b"endbfchar" in l: + process_char = False + elif process_rg: + lst = [x for x in l.split(b" ") if x] + a = int(lst[0], 16) + b = int(lst[1], 16) + nbi = len(lst[0]) + map_dict[-1] = nbi // 2 + fmt = b"%%0%dX" % nbi + if lst[2] == b"[": + for sq in lst[3:]: + if sq == b"]": + break + map_dict[ + unhexlify(fmt % a).decode( + "charmap" if map_dict[-1] == 1 else "utf-16-be", + "surrogatepass", + ) + ] = unhexlify(sq).decode("utf-16-be", "surrogatepass") + int_entry.append(a) + a += 1 + assert a > b + else: + c = int(lst[2], 16) + fmt2 = b"%%0%dX" % len(lst[2]) + while a <= b: + map_dict[ + unhexlify(fmt % a).decode( + "charmap" if map_dict[-1] == 1 else "utf-16-be", + "surrogatepass", + ) + ] = unhexlify(fmt2 % c).decode("utf-16-be", "surrogatepass") + int_entry.append(a) + a += 1 + c += 1 + elif process_char: + lst = [x for x in l.split(b" ") if x] + map_dict[-1] = len(lst[0]) // 2 + while len(lst) > 0: + map_dict[ + unhexlify(lst[0]).decode( + "charmap" if map_dict[-1] == 1 else "utf-16-be", "surrogatepass" + ) + ] = unhexlify(lst[1]).decode( + "utf-16-be", "surrogatepass" + ) # join is here as some cases where the code was split + int_entry.append(int(lst[0], 16)) + lst = lst[2:] + for a in map_dict: + if map_dict[a] == " ": + space_code = a + return map_dict, space_code, int_entry + + +def compute_space_width( + ft: DictionaryObject, space_code: int, space_width: float +) -> float: + sp_width: float = space_width * 2 # default value + w = [] + st: int = 0 + if "/W" in ft: + if "/DW" in ft: + sp_width = cast(float, ft["/DW"]) + w = list(ft["/W"]) # type: ignore + while len(w) > 0: + st = w[0] + second = w[1] + if isinstance(int, second): + if st <= space_code and space_code <= second: + sp_width = w[2] + break + w = w[3:] + if isinstance(list, second): + if st <= space_code and space_code <= st + len(second) - 1: + sp_width = second[space_code - st] + w = w[2:] + else: + warnings.warn( + "unknown widths : \n" + (ft["/W"]).__repr__(), + PdfReadWarning, + ) + break + if "/Widths" in ft: + w = list(ft["/Widths"]) # type: ignore + try: + st = cast(int, ft["/FirstChar"]) + en: int = cast(int, ft["/LastChar"]) + if st > space_code or en < space_code: + raise Exception("Not in range") + if w[space_code - st] == 0: + raise Exception("null width") + sp_width = w[space_code - st] + except Exception: + if "/FontDescriptor" in ft and "/MissingWidth" in cast( + DictionaryObject, ft["/FontDescriptor"] + ): + sp_width = ft["/FontDescriptor"]["/MissingWidth"] # type: ignore + else: + # will consider width of char as avg(width)/2 + m = 0 + cpt = 0 + for x in w: + if x > 0: + m += x + cpt += 1 + sp_width = m / max(1, cpt) / 2 + return sp_width diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py index 1570439ca..031aa1471 100644 --- a/PyPDF2/_page.py +++ b/PyPDF2/_page.py @@ -1,1473 +1,1472 @@ -# Copyright (c) 2006, Mathieu Fenniak -# Copyright (c) 2007, Ashish Kulkarni -# -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# * The name of the author may not be used to endorse or promote products -# derived from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. - -import math -import uuid -import warnings -from decimal import Decimal -# from math import sqrt -from typing import ( - Any, - Callable, - Dict, - Iterable, - Iterator, - List, - Optional, - Tuple, - Union, - cast, -) - -from ._cmap import build_char_map, unknown_char_map -from ._utils import ( - CompressedTransformationMatrix, - TransformationMatrixType, - deprecate_no_replacement, - deprecate_with_replacement, - matrix_multiply, -) -from .constants import PageAttributes as PG -from .constants import Ressources as RES -from .errors import PageSizeNotDefinedError, PdfReadWarning -from .generic import ( - ArrayObject, - ContentStream, - DictionaryObject, - EncodedStreamObject, - FloatObject, - IndirectObject, - NameObject, - NullObject, - NumberObject, - RectangleObject, - TextStringObject, - encode_pdfdocencoding, -) - - -def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject: - retval: Union[None, RectangleObject, IndirectObject] = self.get(name) - if isinstance(retval, RectangleObject): - return retval - if retval is None: - for d in defaults: - retval = self.get(d) - if retval is not None: - break - if isinstance(retval, IndirectObject): - retval = self.pdf.get_object(retval) - retval = RectangleObject(retval) # type: ignore - _set_rectangle(self, name, retval) - return retval - - -def getRectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject: - deprecate_no_replacement("getRectangle") - return _get_rectangle(self, name, defaults) - - -def _set_rectangle(self: Any, name: str, value: Union[RectangleObject, float]) -> None: - if not isinstance(name, NameObject): - name = NameObject(name) - self[name] = value - - -def setRectangle(self: Any, name: str, value: Union[RectangleObject, float]) -> None: - deprecate_no_replacement("setRectangle") - _set_rectangle(self, name, value) - - -def _delete_rectangle(self: Any, name: str) -> None: - del self[name] - - -def deleteRectangle(self: Any, name: str) -> None: - deprecate_no_replacement("deleteRectangle") - del self[name] - - -def _create_rectangle_accessor(name: str, fallback: Iterable[str]) -> property: - return property( - lambda self: _get_rectangle(self, name, fallback), - lambda self, value: _set_rectangle(self, name, value), - lambda self: _delete_rectangle(self, name), - ) - - -def createRectangleAccessor(name: str, fallback: Iterable[str]) -> property: - deprecate_no_replacement("createRectangleAccessor") - return _create_rectangle_accessor(name, fallback) - - -class Transformation: - """ - Specify a 2D transformation. - - The transformation between two coordinate systems is represented by a 3-by-3 - transformation matrix written as follows:: - - a b 0 - c d 0 - e f 1 - - Because a transformation matrix has only six elements that can be changed, - it is usually specified in PDF as the six-element array [ a b c d e f ]. - - Coordinate transformations are expressed as matrix multiplications:: - - a b 0 - [ x′ y′ 1 ] = [ x y 1 ] × c d 0 - e f 1 - - - Usage - ----- - - >>> from PyPDF2 import Transformation - >>> op = Transformation().scale(sx=2, sy=3).translate(tx=10, ty=20) - >>> page.add_transformation(op) - """ - - # 9.5.4 Coordinate Systems for 3D - # 4.2.2 Common Transformations - def __init__(self, ctm: CompressedTransformationMatrix = (1, 0, 0, 1, 0, 0)): - self.ctm = ctm - - @property - def matrix(self) -> TransformationMatrixType: - return ( - (self.ctm[0], self.ctm[1], 0), - (self.ctm[2], self.ctm[3], 0), - (self.ctm[4], self.ctm[5], 1), - ) - - @staticmethod - def compress(matrix: TransformationMatrixType) -> CompressedTransformationMatrix: - return ( - matrix[0][0], - matrix[0][1], - matrix[1][0], - matrix[1][1], - matrix[0][2], - matrix[1][2], - ) - - def translate(self, tx: float = 0, ty: float = 0) -> "Transformation": - m = self.ctm - return Transformation(ctm=(m[0], m[1], m[2], m[3], m[4] + tx, m[5] + ty)) - - def scale( - self, sx: Optional[float] = None, sy: Optional[float] = None - ) -> "Transformation": - if sx is None and sy is None: - raise ValueError("Either sx or sy must be specified") - if sx is None: - sx = sy - if sy is None: - sy = sx - assert sx is not None - assert sy is not None - op: TransformationMatrixType = ((sx, 0, 0), (0, sy, 0), (0, 0, 1)) - ctm = Transformation.compress(matrix_multiply(self.matrix, op)) - return Transformation(ctm) - - def rotate(self, rotation: float) -> "Transformation": - rotation = math.radians(rotation) - op: TransformationMatrixType = ( - (math.cos(rotation), math.sin(rotation), 0), - (-math.sin(rotation), math.cos(rotation), 0), - (0, 0, 1), - ) - ctm = Transformation.compress(matrix_multiply(self.matrix, op)) - return Transformation(ctm) - - def __repr__(self) -> str: - return f"Transformation(ctm={self.ctm})" - - -class PageObject(DictionaryObject): - """ - PageObject represents a single page within a PDF file. - - Typically this object will be created by accessing the - :meth:`get_page()` method of the - :class:`PdfReader` class, but it is - also possible to create an empty page with the - :meth:`create_blank_page()` static method. - - :param pdf: PDF file the page belongs to. - :param indirect_ref: Stores the original indirect reference to - this object in its source PDF - """ - - def __init__( - self, - pdf: Optional[Any] = None, # PdfReader - indirect_ref: Optional[IndirectObject] = None, - ) -> None: - from ._reader import PdfReader - - DictionaryObject.__init__(self) - self.pdf: Optional[PdfReader] = pdf - self.indirect_ref = indirect_ref - - @staticmethod - def create_blank_page( - pdf: Optional[Any] = None, # PdfReader - width: Union[float, Decimal, None] = None, - height: Union[float, Decimal, None] = None, - ) -> "PageObject": - """ - Return a new blank page. - - If ``width`` or ``height`` is ``None``, try to get the page size - from the last page of *pdf*. - - :param pdf: PDF file the page belongs to - :param float width: The width of the new page expressed in default user - space units. - :param float height: The height of the new page expressed in default user - space units. - :return: the new blank page: - :rtype: :class:`PageObject` - :raises PageSizeNotDefinedError: if ``pdf`` is ``None`` or contains - no page - """ - page = PageObject(pdf) - - # Creates a new page (cf PDF Reference 7.7.3.3) - page.__setitem__(NameObject(PG.TYPE), NameObject("/Page")) - page.__setitem__(NameObject(PG.PARENT), NullObject()) - page.__setitem__(NameObject(PG.RESOURCES), DictionaryObject()) - if width is None or height is None: - if pdf is not None and len(pdf.pages) > 0: - lastpage = pdf.pages[len(pdf.pages) - 1] - width = lastpage.mediabox.width - height = lastpage.mediabox.height - else: - raise PageSizeNotDefinedError - page.__setitem__( - NameObject(PG.MEDIABOX), RectangleObject((0, 0, width, height)) # type: ignore - ) - - return page - - @staticmethod - def createBlankPage( - pdf: Optional[Any] = None, # PdfReader - width: Union[float, Decimal, None] = None, - height: Union[float, Decimal, None] = None, - ) -> "PageObject": # pragma: no cover - """ - .. deprecated:: 1.28.0 - - Use :meth:`create_blank_page` instead. - """ - deprecate_with_replacement("createBlankPage", "create_blank_page") - return PageObject.create_blank_page(pdf, width, height) - - def rotate(self, angle: float) -> "PageObject": - """ - Rotate a page clockwise by increments of 90 degrees. - - :param int angle: Angle to rotate the page. Must be an increment - of 90 deg. - """ - if angle % 90 != 0: - raise ValueError("Rotation angle must be a multiple of 90") - rotate_obj = self.get(PG.ROTATE, 0) - current_angle = ( - rotate_obj if isinstance(rotate_obj, int) else rotate_obj.get_object() - ) - self[NameObject(PG.ROTATE)] = NumberObject(current_angle + angle) - return self - - def rotate_clockwise(self, angle: float) -> "PageObject": # pragma: no cover - deprecate_with_replacement("rotate_clockwise", "rotate") - return self.rotate(angle) - - def rotateClockwise(self, angle: float) -> "PageObject": # pragma: no cover - """ - .. deprecated:: 1.28.0 - - Use :meth:`rotate_clockwise` instead. - """ - deprecate_with_replacement("rotateClockwise", "rotate") - return self.rotate(angle) - - def rotateCounterClockwise(self, angle: float) -> "PageObject": # pragma: no cover - """ - .. deprecated:: 1.28.0 - - Use :meth:`rotate_clockwise` with a negative argument instead. - """ - deprecate_with_replacement("rotateCounterClockwise", "rotate") - return self.rotate(-angle) - - @staticmethod - def _merge_resources( - res1: DictionaryObject, res2: DictionaryObject, resource: Any - ) -> Tuple[Dict[str, Any], Dict[str, Any]]: - new_res = DictionaryObject() - new_res.update(res1.get(resource, DictionaryObject()).get_object()) - page2res = cast( - DictionaryObject, res2.get(resource, DictionaryObject()).get_object() - ) - rename_res = {} - for key in list(page2res.keys()): - if key in new_res and new_res.raw_get(key) != page2res.raw_get(key): - newname = NameObject(key + str(uuid.uuid4())) - rename_res[key] = newname - new_res[newname] = page2res[key] - elif key not in new_res: - new_res[key] = page2res.raw_get(key) - return new_res, rename_res - - @staticmethod - def _content_stream_rename( - stream: ContentStream, rename: Dict[Any, Any], pdf: Any # PdfReader - ) -> ContentStream: - if not rename: - return stream - stream = ContentStream(stream, pdf) - for operands, _operator in stream.operations: - if isinstance(operands, list): - for i in range(len(operands)): - op = operands[i] - if isinstance(op, NameObject): - operands[i] = rename.get(op, op) - elif isinstance(operands, dict): - for i in operands: - op = operands[i] - if isinstance(op, NameObject): - operands[i] = rename.get(op, op) - else: - raise KeyError("type of operands is %s" % type(operands)) - return stream - - @staticmethod - def _push_pop_gs(contents: Any, pdf: Any) -> ContentStream: # PdfReader - # adds a graphics state "push" and "pop" to the beginning and end - # of a content stream. This isolates it from changes such as - # transformation matricies. - stream = ContentStream(contents, pdf) - stream.operations.insert(0, ([], "q")) - stream.operations.append(([], "Q")) - return stream - - @staticmethod - def _add_transformation_matrix( - contents: Any, pdf: Any, ctm: CompressedTransformationMatrix - ) -> ContentStream: # PdfReader - # adds transformation matrix at the beginning of the given - # contents stream. - a, b, c, d, e, f = ctm - contents = ContentStream(contents, pdf) - contents.operations.insert( - 0, - [ - [ - FloatObject(a), - FloatObject(b), - FloatObject(c), - FloatObject(d), - FloatObject(e), - FloatObject(f), - ], - " cm", - ], - ) - return contents - - def get_contents(self) -> Optional[ContentStream]: - """ - Access the page contents. - - :return: the ``/Contents`` object, or ``None`` if it doesn't exist. - ``/Contents`` is optional, as described in PDF Reference 7.7.3.3 - """ - if PG.CONTENTS in self: - return self[PG.CONTENTS].get_object() # type: ignore - else: - return None - - def getContents(self) -> Optional[ContentStream]: # pragma: no cover - """ - .. deprecated:: 1.28.0 - - Use :meth:`get_contents` instead. - """ - deprecate_with_replacement("getContents", "get_contents") - return self.get_contents() - - def merge_page(self, page2: "PageObject", expand: bool = False) -> None: - """ - Merge the content streams of two pages into one. - - Resource references - (i.e. fonts) are maintained from both pages. The mediabox/cropbox/etc - of this page are not altered. The parameter page's content stream will - be added to the end of this page's content stream, meaning that it will - be drawn after, or "on top" of this page. - - :param PageObject page2: The page to be merged into this one. Should be - an instance of :class:`PageObject`. - :param bool expand: If true, the current page dimensions will be - expanded to accommodate the dimensions of the page to be merged. - """ - self._merge_page(page2, expand=expand) - - def mergePage(self, page2: "PageObject") -> None: # pragma: no cover - """ - .. deprecated:: 1.28.0 - - Use :meth:`merge_page` instead. - """ - deprecate_with_replacement("mergePage", "merge_page") - return self.merge_page(page2) - - def _merge_page( - self, - page2: "PageObject", - page2transformation: Optional[Callable[[Any], ContentStream]] = None, - ctm: Optional[CompressedTransformationMatrix] = None, - expand: bool = False, - ) -> None: - # First we work on merging the resource dictionaries. This allows us - # to find out what symbols in the content streams we might need to - # rename. - - new_resources = DictionaryObject() - rename = {} - original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object()) - page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object()) - new_annots = ArrayObject() - - for page in (self, page2): - if PG.ANNOTS in page: - annots = page[PG.ANNOTS] - if isinstance(annots, ArrayObject): - for ref in annots: - new_annots.append(ref) - - for res in ( - RES.EXT_G_STATE, - RES.FONT, - RES.XOBJECT, - RES.COLOR_SPACE, - RES.PATTERN, - RES.SHADING, - RES.PROPERTIES, - ): - new, newrename = PageObject._merge_resources( - original_resources, page2resources, res - ) - if new: - new_resources[NameObject(res)] = new - rename.update(newrename) - - # Combine /ProcSet sets. - new_resources[NameObject(RES.PROC_SET)] = ArrayObject( - frozenset( - original_resources.get(RES.PROC_SET, ArrayObject()).get_object() # type: ignore - ).union( - frozenset(page2resources.get(RES.PROC_SET, ArrayObject()).get_object()) # type: ignore - ) - ) - - new_content_array = ArrayObject() - - original_content = self.get_contents() - if original_content is not None: - new_content_array.append( - PageObject._push_pop_gs(original_content, self.pdf) - ) - - page2content = page2.get_contents() - if page2content is not None: - page2content = ContentStream(page2content, self.pdf) - page2content.operations.insert( - 0, - ( - map( - FloatObject, - [ - page2.trimbox.left, - page2.trimbox.bottom, - page2.trimbox.width, - page2.trimbox.height, - ], - ), - "re", - ), - ) - page2content.operations.insert(1, ([], "W")) - page2content.operations.insert(2, ([], "n")) - if page2transformation is not None: - page2content = page2transformation(page2content) - page2content = PageObject._content_stream_rename( - page2content, rename, self.pdf - ) - page2content = PageObject._push_pop_gs(page2content, self.pdf) - new_content_array.append(page2content) - - # if expanding the page to fit a new page, calculate the new media box size - if expand: - self._expand_mediabox(page2, ctm) - - self[NameObject(PG.CONTENTS)] = ContentStream(new_content_array, self.pdf) - self[NameObject(PG.RESOURCES)] = new_resources - self[NameObject(PG.ANNOTS)] = new_annots - - def _expand_mediabox( - self, page2: "PageObject", ctm: Optional[CompressedTransformationMatrix] - ) -> None: - corners1 = ( - self.mediabox.left.as_numeric(), - self.mediabox.bottom.as_numeric(), - self.mediabox.right.as_numeric(), - self.mediabox.top.as_numeric(), - ) - corners2 = ( - page2.mediabox.left.as_numeric(), - page2.mediabox.bottom.as_numeric(), - page2.mediabox.left.as_numeric(), - page2.mediabox.top.as_numeric(), - page2.mediabox.right.as_numeric(), - page2.mediabox.top.as_numeric(), - page2.mediabox.right.as_numeric(), - page2.mediabox.bottom.as_numeric(), - ) - if ctm is not None: - ctm = tuple(float(x) for x in ctm) # type: ignore[assignment] - new_x = tuple( - ctm[0] * corners2[i] + ctm[2] * corners2[i + 1] + ctm[4] - for i in range(0, 8, 2) - ) - new_y = tuple( - ctm[1] * corners2[i] + ctm[3] * corners2[i + 1] + ctm[5] - for i in range(0, 8, 2) - ) - else: - new_x = corners2[0:8:2] - new_y = corners2[1:8:2] - lowerleft = (min(new_x), min(new_y)) - upperright = (max(new_x), max(new_y)) - lowerleft = (min(corners1[0], lowerleft[0]), min(corners1[1], lowerleft[1])) - upperright = ( - max(corners1[2], upperright[0]), - max(corners1[3], upperright[1]), - ) - - self.mediabox.lower_left = lowerleft - self.mediabox.upper_right = upperright - - def mergeTransformedPage( - self, - page2: "PageObject", - ctm: Union[CompressedTransformationMatrix, Transformation], - expand: bool = False, - ) -> None: # pragma: no cover - """ - mergeTransformedPage is similar to merge_page, but a transformation - matrix is applied to the merged stream. - - :param PageObject page2: The page to be merged into this one. Should be - an instance of :class:`PageObject`. - :param tuple ctm: a 6-element tuple containing the operands of the - transformation matrix - :param bool expand: Whether the page should be expanded to fit the dimensions - of the page to be merged. - - .. deprecated:: 1.28.0 - - Use :meth:`add_transformation` and :meth:`merge_page` instead. - """ - deprecate_with_replacement( - "page.mergeTransformedPage(page2, ctm)", - "page2.add_transformation(ctm); page.merge_page(page2)", - ) - if isinstance(ctm, Transformation): - ctm = ctm.ctm - ctm = cast(CompressedTransformationMatrix, ctm) - self._merge_page( - page2, - lambda page2Content: PageObject._add_transformation_matrix( - page2Content, page2.pdf, ctm # type: ignore[arg-type] - ), - ctm, - expand, - ) - - def mergeScaledPage( - self, page2: "PageObject", scale: float, expand: bool = False - ) -> None: # pragma: no cover - """ - mergeScaledPage is similar to merge_page, but the stream to be merged - is scaled by appling a transformation matrix. - - :param PageObject page2: The page to be merged into this one. Should be - an instance of :class:`PageObject`. - :param float scale: The scaling factor - :param bool expand: Whether the page should be expanded to fit the - dimensions of the page to be merged. - - .. deprecated:: 1.28.0 - - Use :meth:`add_transformation` and :meth:`merge_page` instead. - """ - deprecate_with_replacement( - "page.mergeScaledPage(page2, scale, expand)", - "page2.add_transformation(Transformation().scale(scale)); page.merge_page(page2, expand)", - ) - op = Transformation().scale(scale, scale) - self.mergeTransformedPage(page2, op, expand) - - def mergeRotatedPage( - self, page2: "PageObject", rotation: float, expand: bool = False - ) -> None: # pragma: no cover - """ - mergeRotatedPage is similar to merge_page, but the stream to be merged - is rotated by appling a transformation matrix. - - :param PageObject page2: the page to be merged into this one. Should be - an instance of :class:`PageObject`. - :param float rotation: The angle of the rotation, in degrees - :param bool expand: Whether the page should be expanded to fit the - dimensions of the page to be merged. - - .. deprecated:: 1.28.0 - - Use :meth:`add_transformation` and :meth:`merge_page` instead. - """ - deprecate_with_replacement( - "page.mergeRotatedPage(page2, rotation, expand)", - "page2.add_transformation(Transformation().rotate(rotation)); page.merge_page(page2, expand)", - ) - op = Transformation().rotate(rotation) - self.mergeTransformedPage(page2, op, expand) - - def mergeTranslatedPage( - self, page2: "PageObject", tx: float, ty: float, expand: bool = False - ) -> None: # pragma: no cover - """ - mergeTranslatedPage is similar to merge_page, but the stream to be - merged is translated by appling a transformation matrix. - - :param PageObject page2: the page to be merged into this one. Should be - an instance of :class:`PageObject`. - :param float tx: The translation on X axis - :param float ty: The translation on Y axis - :param bool expand: Whether the page should be expanded to fit the - dimensions of the page to be merged. - - .. deprecated:: 1.28.0 - - Use :meth:`add_transformation` and :meth:`merge_page` instead. - """ - deprecate_with_replacement( - "page.mergeTranslatedPage(page2, tx, ty, expand)", - "page2.add_transformation(Transformation().translate(tx, ty)); page.merge_page(page2, expand)", - ) - op = Transformation().translate(tx, ty) - self.mergeTransformedPage(page2, op, expand) - - def mergeRotatedTranslatedPage( - self, - page2: "PageObject", - rotation: float, - tx: float, - ty: float, - expand: bool = False, - ) -> None: # pragma: no cover - """ - mergeRotatedTranslatedPage is similar to merge_page, but the stream to - be merged is rotated and translated by appling a transformation matrix. - - :param PageObject page2: the page to be merged into this one. Should be - an instance of :class:`PageObject`. - :param float tx: The translation on X axis - :param float ty: The translation on Y axis - :param float rotation: The angle of the rotation, in degrees - :param bool expand: Whether the page should be expanded to fit the - dimensions of the page to be merged. - - .. deprecated:: 1.28.0 - - Use :meth:`add_transformation` and :meth:`merge_page` instead. - """ - deprecate_with_replacement( - "page.mergeRotatedTranslatedPage(page2, rotation, tx, ty, expand)", - "page2.add_transformation(Transformation().rotate(rotation).translate(tx, ty)); page.merge_page(page2, expand)", - ) - op = Transformation().translate(-tx, -ty).rotate(rotation).translate(tx, ty) - return self.mergeTransformedPage(page2, op, expand) - - def mergeRotatedScaledPage( - self, page2: "PageObject", rotation: float, scale: float, expand: bool = False - ) -> None: # pragma: no cover - """ - mergeRotatedScaledPage is similar to merge_page, but the stream to be - merged is rotated and scaled by appling a transformation matrix. - - :param PageObject page2: the page to be merged into this one. Should be - an instance of :class:`PageObject`. - :param float rotation: The angle of the rotation, in degrees - :param float scale: The scaling factor - :param bool expand: Whether the page should be expanded to fit the - dimensions of the page to be merged. - - .. deprecated:: 1.28.0 - - Use :meth:`add_transformation` and :meth:`merge_page` instead. - """ - deprecate_with_replacement( - "page.mergeRotatedScaledPage(page2, rotation, scale, expand)", - "page2.add_transformation(Transformation().rotate(rotation).scale(scale)); page.merge_page(page2, expand)", - ) - op = Transformation().rotate(rotation).scale(scale, scale) - self.mergeTransformedPage(page2, op, expand) - - def mergeScaledTranslatedPage( - self, - page2: "PageObject", - scale: float, - tx: float, - ty: float, - expand: bool = False, - ) -> None: # pragma: no cover - """ - mergeScaledTranslatedPage is similar to merge_page, but the stream to be - merged is translated and scaled by appling a transformation matrix. - - :param PageObject page2: the page to be merged into this one. Should be - an instance of :class:`PageObject`. - :param float scale: The scaling factor - :param float tx: The translation on X axis - :param float ty: The translation on Y axis - :param bool expand: Whether the page should be expanded to fit the - dimensions of the page to be merged. - - .. deprecated:: 1.28.0 - - Use :meth:`add_transformation` and :meth:`merge_page` instead. - """ - deprecate_with_replacement( - "page.mergeScaledTranslatedPage(page2, scale, tx, ty, expand)", - "page2.add_transformation(Transformation().scale(scale).translate(tx, ty)); page.merge_page(page2, expand)", - ) - op = Transformation().scale(scale, scale).translate(tx, ty) - return self.mergeTransformedPage(page2, op, expand) - - def mergeRotatedScaledTranslatedPage( - self, - page2: "PageObject", - rotation: float, - scale: float, - tx: float, - ty: float, - expand: bool = False, - ) -> None: # pragma: no cover - """ - mergeRotatedScaledTranslatedPage is similar to merge_page, but the - stream to be merged is translated, rotated and scaled by appling a - transformation matrix. - - :param PageObject page2: the page to be merged into this one. Should be - an instance of :class:`PageObject`. - :param float tx: The translation on X axis - :param float ty: The translation on Y axis - :param float rotation: The angle of the rotation, in degrees - :param float scale: The scaling factor - :param bool expand: Whether the page should be expanded to fit the - dimensions of the page to be merged. - - .. deprecated:: 1.28.0 - - Use :meth:`add_transformation` and :meth:`merge_page` instead. - """ - deprecate_with_replacement( - "page.mergeRotatedScaledTranslatedPage(page2, rotation, tx, ty, expand)", - "page2.add_transformation(Transformation().rotate(rotation).scale(scale)); page.merge_page(page2, expand)", - ) - op = Transformation().rotate(rotation).scale(scale, scale).translate(tx, ty) - self.mergeTransformedPage(page2, op, expand) - - def add_transformation( - self, - ctm: Union[Transformation, CompressedTransformationMatrix], - expand: bool = False, - ) -> None: - """ - Apply a transformation matrix to the page. - - :param tuple ctm: A 6-element tuple containing the operands of the - transformation matrix. Alternatively, a - :py:class:`Transformation` - object can be passed. - - See :doc:`/user/cropping-and-transforming`. - """ - if isinstance(ctm, Transformation): - ctm = ctm.ctm - content = self.get_contents() - if content is not None: - content = PageObject._add_transformation_matrix(content, self.pdf, ctm) - content = PageObject._push_pop_gs(content, self.pdf) - self[NameObject(PG.CONTENTS)] = content - # if expanding the page to fit a new page, calculate the new media box size - if expand: - corners = [ - self.mediabox.left.as_numeric(), - self.mediabox.bottom.as_numeric(), - self.mediabox.left.as_numeric(), - self.mediabox.top.as_numeric(), - self.mediabox.right.as_numeric(), - self.mediabox.top.as_numeric(), - self.mediabox.right.as_numeric(), - self.mediabox.bottom.as_numeric(), - ] - - ctm = tuple(float(x) for x in ctm) # type: ignore[assignment] - new_x = [ - ctm[0] * corners[i] + ctm[2] * corners[i + 1] + ctm[4] - for i in range(0, 8, 2) - ] - new_y = [ - ctm[1] * corners[i] + ctm[3] * corners[i + 1] + ctm[5] - for i in range(0, 8, 2) - ] - - lowerleft = (min(new_x), min(new_y)) - upperright = (max(new_x), max(new_y)) - lowerleft = (min(corners[0], lowerleft[0]), min(corners[1], lowerleft[1])) - upperright = ( - max(corners[2], upperright[0]), - max(corners[3], upperright[1]), - ) - - self.mediabox.lower_left = lowerleft - self.mediabox.upper_right = upperright - - def addTransformation( - self, ctm: CompressedTransformationMatrix - ) -> None: # pragma: no cover - """ - .. deprecated:: 1.28.0 - - Use :meth:`add_transformation` instead. - """ - deprecate_with_replacement("addTransformation", "add_transformation") - self.add_transformation(ctm) - - def scale(self, sx: float, sy: float) -> None: - """ - Scale a page by the given factors by appling a transformation - matrix to its content and updating the page size. - - :param float sx: The scaling factor on horizontal axis. - :param float sy: The scaling factor on vertical axis. - """ - self.add_transformation((sx, 0, 0, sy, 0, 0)) - self.mediabox = RectangleObject( - ( - float(self.mediabox.left) * sx, - float(self.mediabox.bottom) * sy, - float(self.mediabox.right) * sx, - float(self.mediabox.top) * sy, - ) - ) - if PG.VP in self: - viewport = self[PG.VP] - if isinstance(viewport, ArrayObject): - bbox = viewport[0]["/BBox"] - else: - bbox = viewport["/BBox"] # type: ignore - scaled_bbox = RectangleObject( - ( - float(bbox[0]) * sx, - float(bbox[1]) * sy, - float(bbox[2]) * sx, - float(bbox[3]) * sy, - ) - ) - if isinstance(viewport, ArrayObject): - self[NameObject(PG.VP)][NumberObject(0)][ # type: ignore - NameObject("/BBox") - ] = scaled_bbox - else: - self[NameObject(PG.VP)][NameObject("/BBox")] = scaled_bbox # type: ignore - - def scale_by(self, factor: float) -> None: - """ - Scale a page by the given factor by appling a transformation - matrix to its content and updating the page size. - - :param float factor: The scaling factor (for both X and Y axis). - """ - self.scale(factor, factor) - - def scaleBy(self, factor: float) -> None: # pragma: no cover - """ - .. deprecated:: 1.28.0 - - Use :meth:`scale_by` instead. - """ - deprecate_with_replacement("scaleBy", "scale_by") - self.scale(factor, factor) - - def scale_to(self, width: float, height: float) -> None: - """ - Scale a page to the specified dimentions by appling a - transformation matrix to its content and updating the page size. - - :param float width: The new width. - :param float height: The new heigth. - """ - sx = width / float(self.mediabox.width) - sy = height / float(self.mediabox.height) - self.scale(sx, sy) - - def scaleTo(self, width: float, height: float) -> None: # pragma: no cover - """ - .. deprecated:: 1.28.0 - - Use :meth:`scale_to` instead. - """ - deprecate_with_replacement("scaleTo", "scale_to") - self.scale_to(width, height) - - def compress_content_streams(self) -> None: - """ - Compress the size of this page by joining all content streams and - applying a FlateDecode filter. - - However, it is possible that this function will perform no action if - content stream compression becomes "automatic" for some reason. - """ - content = self.get_contents() - if content is not None: - if not isinstance(content, ContentStream): - content = ContentStream(content, self.pdf) - self[NameObject(PG.CONTENTS)] = content.flate_encode() - - def compressContentStreams(self) -> None: # pragma: no cover - """ - .. deprecated:: 1.28.0 - - Use :meth:`compress_content_streams` instead. - """ - deprecate_with_replacement("compressContentStreams", "compress_content_streams") - self.compress_content_streams() - - def _extract_text_old( - self, Tj_sep: str = "", TJ_sep: str = "" - ) -> str: # pragma: no cover - """ - Locate all text drawing commands, in the order they are provided in the - content stream, and extract the text. This works well for some PDF - files, but poorly for others, depending on the generator used. This will - be refined in the future. Do not rely on the order of text coming out of - this function, as it will change if this function is made more - sophisticated. - - :return: a string object. - """ - text = "" - content = self[PG.CONTENTS].get_object() - if not isinstance(content, ContentStream): - content = ContentStream(content, self.pdf) - # Note: we check all strings are TextStringObjects. ByteStringObjects - # are strings where the byte->string encoding was unknown, so adding - # them to the text here would be gibberish. - - space_scale = 1.0 - - for operands, operator in content.operations: - # Missing operators: - # Tf: text font - # Tfs: text font size - # Tc: '5.2.1 Character Spacing' - # Th: '5.2.3 Horizontal Scaling' - # Tl: '5.2.4 Leading' - # Tmode: '5.2.5 Text Rendering Mode' - # Trise: '5.2.6 Text Rise' - - if operator in [b"Tf", b"Tfs", b"Tc", b"Th", b"Tl", b"Tmode"]: - pass - elif operator == b"Tw": # word spacing - # See '5.2.2 Word Spacing' - space_scale = 1.0 + float(operands[0]) - elif operator == b"Tj": - # See 'TABLE 5.6 Text-showing operators' - _text = operands[0] - if isinstance(_text, TextStringObject): - text += Tj_sep - text += _text - text += "\n" - elif operator == b"T*": - # See 'TABLE 5.5 Text-positioning operators' - text += "\n" - elif operator == b"'": - # See 'TABLE 5.6 Text-showing operators' - text += "\n" - _text = operands[0] - if isinstance(_text, TextStringObject): - text += operands[0] - elif operator == b'"': - # See 'TABLE 5.6 Text-showing operators' - _text = operands[2] - if isinstance(_text, TextStringObject): - text += "\n" - text += _text - elif operator == b"TJ": - # See 'TABLE 5.6 Text-showing operators' - for i in operands[0]: - if isinstance(i, TextStringObject): - text += TJ_sep - text += i - elif isinstance(i, (NumberObject, FloatObject)): - # a positive value decreases and the negative value increases - # space - if int(i) < -space_scale * 250: - if len(text) == 0 or text[-1] != " ": - text += " " - else: - if len(text) > 1 and text[-1] == " ": - text = text[:-1] - text += "\n" - return text - - def _debug_for_extract(self) -> str: - out = "" - for ope, op in ContentStream( - self["/Contents"].getObject(), self.pdf, "bytes" - ).operations: - if op == b"TJ": - s = [x for x in ope[0] if isinstance(x, str)] - else: - s = [] - out += op.decode("utf-8") + " " + "".join(s) + ope.__repr__() + "\n" - out += "\n=============================\n" - try: - for fo in self["/Resources"]["/Font"]: # type:ignore - out += fo + "\n" - out += self["/Resources"]["/Font"][fo].__repr__() + "\n" # type:ignore - try: - enc_repr = self["/Resources"]["/Font"][fo][ # type:ignore - "/Encoding" - ].__repr__() - out += enc_repr + "\n" - except Exception: - pass - except KeyError: - out += "No Font\n" - return out - - def _extract_text( - self, - obj: Any, - pdf: Any, - space_width: float = 200.0, - content_key: Optional[str] = PG.CONTENTS, - ) -> str: - """ - Locate all text drawing commands, in the order they are provided in the - content stream, and extract the text. This works well for some PDF - files, but poorly for others, depending on the generator used. This will - be refined in the future. Do not rely on the order of text coming out of - this function, as it will change if this function is made more - sophisticated. - - :param float space_width: force default space width - (if not extracted from font (default 200) - :param Optional[str] content_key: indicate the default key where to extract data - None = the opbject; this allow to reuse the function on XObject - default = "/Content" - :return: a string object. - """ - - text: str = "" - output: str = "" - cmaps: Dict[ - str, Tuple[str, float, Union[str, Dict[int, str]], Dict[str, str]] - ] = {} - resources_dict = cast(DictionaryObject, obj["/Resources"]) - if "/Font" in resources_dict: - for f in cast(DictionaryObject, resources_dict["/Font"]): - cmaps[f] = build_char_map(f, space_width, obj) - cmap: Tuple[ - Union[str, Dict[int, str]], Dict[str, str], str - ] # (encoding,CMAP,font_name) - try: - content = ( - obj[content_key].get_object() if isinstance(content_key, str) else obj - ) - if not isinstance(content, ContentStream): - content = ContentStream(content, pdf, "bytes") - except KeyError: # it means no content can be extracted(certainly empty page) - return "" - # Note: we check all strings are TextStringObjects. ByteStringObjects - # are strings where the byte->string encoding was unknown, so adding - # them to the text here would be gibberish. - - tm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] - tm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] - char_scale = 1.0 - space_scale = 1.0 - _space_width: float = 500.0 # will be set correctly at first Tf - TL = 0.0 - font_size = 12.0 # init just in case of - - # tm_matrix: Tuple = tm_matrix, output: str = output, text: str = text, - # char_scale: float = char_scale,space_scale : float = space_scale, _space_width: float = _space_width, - # TL: float = TL, font_size: float = font_size, cmap = cmap - - def process_operation(operator: bytes, operands: List) -> None: - nonlocal tm_matrix, tm_prev, output, text, char_scale, space_scale, _space_width, TL, font_size, cmap - if tm_matrix[4] != 0 and tm_matrix[5] != 0: # o reuse of the - tm_prev = list(tm_matrix) - # Table 5.4 page 405 - if operator == b"BT": - tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] - # tm_prev = tm_matrix - output += text - # based - # if output != "" and output[-1]!="\n": - # output += "\n" - text = "" - return None - elif operator == b"ET": - output += text - text = "" - # Table 5.2 page 398 - elif operator == b"Tz": - char_scale = float(operands[0]) / 100.0 - elif operator == b"Tw": - space_scale = 1.0 + float(operands[0]) - elif operator == b"TL": - TL = float(operands[0]) - elif operator == b"Tf": - if text != "": - output += text # .translate(cmap) - text = "" - try: - _space_width = cmaps[operands[0]][1] - cmap = ( - cmaps[operands[0]][2], - cmaps[operands[0]][3], - operands[0], - ) # type:ignore - except KeyError: # font not found - _space_width = unknown_char_map[1] - cmap = ( - unknown_char_map[2], - unknown_char_map[3], - "???" + operands[0], - ) - try: - font_size = float(operands[1]) - except Exception: - pass # keep previous size - # Table 5.5 page 406 - elif operator == b"Td": - tm_matrix[5] += float(operands[1]) - tm_matrix[4] += float(operands[0]) - elif operator == b"Tm": - tm_matrix = [ - float(operands[0]), - float(operands[1]), - float(operands[2]), - float(operands[3]), - float(operands[4]), - float(operands[5]), - ] - elif operator == b"T*": - tm_matrix[5] -= TL - elif operator == b"Tj": - t: str = "" - tt: bytes = ( - encode_pdfdocencoding(operands[0]) - if isinstance(operands[0], str) - else operands[0] - ) - if isinstance(cmap[0], str): - t = tt.decode(cmap[0]) # apply str encoding - else: # apply dict encoding - t = "".join( - [ - cmap[0][x] if x in cmap[0] else bytes((x,)).decode() - for x in tt - ] - ) - - text += "".join([cmap[1][x] if x in cmap[1] else x for x in t]) - else: - return None - # process text changes due to positionchange: " " - if tm_matrix[5] <= ( - tm_prev[5] - - font_size # remove scaling * sqrt(tm_matrix[2] ** 2 + tm_matrix[3] ** 2) - ): # it means that we are moving down by one line - output += text + "\n" # .translate(cmap) + "\n" - text = "" - elif tm_matrix[4] >= ( - tm_prev[4] + space_scale * _space_width * char_scale - ): # it means that we are moving down by one line - text += " " - return None - # for clarity Operator in (b"g",b"G") : nothing to do - # end of process_operation ###### - - for operands, operator in content.operations: - # multiple operators are defined in here #### - if operator == b"'": - process_operation(b"T*", []) - process_operation(b"Tj", operands) - elif operator == b'"': - process_operation(b"T*", []) - process_operation(b"TJ", operands) - elif operator == b"TD": - process_operation(b"TL", [-operands[1]]) - process_operation(b"Td", operands) - elif operator == b"TJ": - for op in operands[0]: - if isinstance(op, (str, bytes)): - process_operation(b"Tj", [op]) - if isinstance(op, (int, float, NumberObject, FloatObject)): - process_operation(b"Td", [-op, 0.0]) - elif operator == b"Do": - output += text - if output != "": - output += "\n" - try: - xobj = resources_dict["/XObject"] # type: ignore - if xobj[operands[0]]["/Subtype"] != "/Image": # type: ignore - output += text - text = self.extract_xform_text(xobj[operands[0]], space_width) # type: ignore - output += text - except Exception: - warnings.warn( - f" impossible to decode XFormObject {operands[0]}", - PdfReadWarning, - ) - finally: - text = "" - else: - process_operation(operator, operands) - output += text # just in case of - return output - - def extract_text( - self, Tj_sep: str = "", TJ_sep: str = "", space_width: float = 200.0 - ) -> str: - """ - Locate all text drawing commands, in the order they are provided in the - content stream, and extract the text. This works well for some PDF - files, but poorly for others, depending on the generator used. This will - be refined in the future. Do not rely on the order of text coming out of - this function, as it will change if this function is made more - sophisticated. - space_width : float = force default space width (if not extracted from font (default 200) - - :return: a string object. - """ - return self._extract_text(self, self.pdf, space_width, PG.CONTENTS) - - def extract_xform_text( - self, xform: EncodedStreamObject, space_width: float = 200.0 - ) -> str: - """ - Extraction tet from an XObject. - space_width : float = force default space width (if not extracted from font (default 200) - - :return: a string object. - """ - return self._extract_text(xform, self.pdf, space_width, None) - - def extractText( - self, Tj_sep: str = "", TJ_sep: str = "" - ) -> str: # pragma: no cover - """ - .. deprecated:: 1.28.0 - - Use :meth:`extract_text` instead. - """ - deprecate_with_replacement("extractText", "extract_text") - return self.extract_text(Tj_sep=Tj_sep, TJ_sep=TJ_sep) - - mediabox = _create_rectangle_accessor(PG.MEDIABOX, ()) - """ - A :class:`RectangleObject`, expressed in default user space units, - defining the boundaries of the physical medium on which the page is - intended to be displayed or printed. - """ - - @property - def mediaBox(self) -> RectangleObject: # pragma: no cover - """ - .. deprecated:: 1.28.0 - - Use :py:attr:`mediabox` instead. - """ - deprecate_with_replacement("mediaBox", "mediabox") - return self.mediabox - - @mediaBox.setter - def mediaBox(self, value: RectangleObject) -> None: # pragma: no cover - """ - .. deprecated:: 1.28.0 - - Use :py:attr:`mediabox` instead. - """ - deprecate_with_replacement("mediaBox", "mediabox") - self.mediabox = value - - cropbox = _create_rectangle_accessor("/CropBox", (PG.MEDIABOX,)) - """ - A :class:`RectangleObject`, expressed in default user space units, - defining the visible region of default user space. When the page is - displayed or printed, its contents are to be clipped (cropped) to this - rectangle and then imposed on the output medium in some - implementation-defined manner. Default value: same as :attr:`mediabox`. - """ - - @property - def cropBox(self) -> RectangleObject: # pragma: no cover - """ - .. deprecated:: 1.28.0 - - Use :py:attr:`cropbox` instead. - """ - deprecate_with_replacement("cropBox", "cropbox") - return self.cropbox - - @cropBox.setter - def cropBox(self, value: RectangleObject) -> None: # pragma: no cover - deprecate_with_replacement("cropBox", "cropbox") - self.cropbox = value - - bleedbox = _create_rectangle_accessor("/BleedBox", ("/CropBox", PG.MEDIABOX)) - """ - A :class:`RectangleObject`, expressed in default user space units, - defining the region to which the contents of the page should be clipped - when output in a production enviroment. - """ - - @property - def bleedBox(self) -> RectangleObject: # pragma: no cover - """ - .. deprecated:: 1.28.0 - - Use :py:attr:`bleedbox` instead. - """ - deprecate_with_replacement("bleedBox", "bleedbox") - return self.bleedbox - - @bleedBox.setter - def bleedBox(self, value: RectangleObject) -> None: # pragma: no cover - deprecate_with_replacement("bleedBox", "bleedbox") - self.bleedbox = value - - trimbox = _create_rectangle_accessor("/TrimBox", ("/CropBox", PG.MEDIABOX)) - """ - A :class:`RectangleObject`, expressed in default user space units, - defining the intended dimensions of the finished page after trimming. - """ - - @property - def trimBox(self) -> RectangleObject: # pragma: no cover - """ - .. deprecated:: 1.28.0 - - Use :py:attr:`trimbox` instead. - """ - deprecate_with_replacement("trimBox", "trimbox") - return self.trimbox - - @trimBox.setter - def trimBox(self, value: RectangleObject) -> None: # pragma: no cover - deprecate_with_replacement("trimBox", "trimbox") - self.trimbox = value - - artbox = _create_rectangle_accessor("/ArtBox", ("/CropBox", PG.MEDIABOX)) - """ - A :class:`RectangleObject`, expressed in default user space units, - defining the extent of the page's meaningful content as intended by the - page's creator. - """ - - @property - def artBox(self) -> RectangleObject: # pragma: no cover - """ - .. deprecated:: 1.28.0 - - Use :py:attr:`artbox` instead. - """ - deprecate_with_replacement("artBox", "artbox") - return self.artbox - - @artBox.setter - def artBox(self, value: RectangleObject) -> None: # pragma: no cover - deprecate_with_replacement("artBox", "artbox") - self.artbox = value - - -class _VirtualList: - def __init__( - self, - length_function: Callable[[], int], - get_function: Callable[[int], PageObject], - ) -> None: - self.length_function = length_function - self.get_function = get_function - self.current = -1 - - def __len__(self) -> int: - return self.length_function() - - def __getitem__(self, index: int) -> PageObject: - if isinstance(index, slice): - indices = range(*index.indices(len(self))) - cls = type(self) - return cls(indices.__len__, lambda idx: self[indices[idx]]) - if not isinstance(index, int): - raise TypeError("sequence indices must be integers") - len_self = len(self) - if index < 0: - # support negative indexes - index = len_self + index - if index < 0 or index >= len_self: - raise IndexError("sequence index out of range") - return self.get_function(index) - - def __iter__(self) -> Iterator[PageObject]: - for i in range(len(self)): - yield self[i] +# Copyright (c) 2006, Mathieu Fenniak +# Copyright (c) 2007, Ashish Kulkarni +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +import math +import uuid +import warnings +from decimal import Decimal +from typing import ( + Any, + Callable, + Dict, + Iterable, + Iterator, + List, + Optional, + Tuple, + Union, + cast, +) + +from ._cmap import build_char_map, unknown_char_map +from ._utils import ( + CompressedTransformationMatrix, + TransformationMatrixType, + deprecate_no_replacement, + deprecate_with_replacement, + matrix_multiply, +) +from .constants import PageAttributes as PG +from .constants import Ressources as RES +from .errors import PageSizeNotDefinedError, PdfReadWarning +from .generic import ( + ArrayObject, + ContentStream, + DictionaryObject, + EncodedStreamObject, + FloatObject, + IndirectObject, + NameObject, + NullObject, + NumberObject, + RectangleObject, + TextStringObject, + encode_pdfdocencoding, +) + + +def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject: + retval: Union[None, RectangleObject, IndirectObject] = self.get(name) + if isinstance(retval, RectangleObject): + return retval + if retval is None: + for d in defaults: + retval = self.get(d) + if retval is not None: + break + if isinstance(retval, IndirectObject): + retval = self.pdf.get_object(retval) + retval = RectangleObject(retval) # type: ignore + _set_rectangle(self, name, retval) + return retval + + +def getRectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject: + deprecate_no_replacement("getRectangle") + return _get_rectangle(self, name, defaults) + + +def _set_rectangle(self: Any, name: str, value: Union[RectangleObject, float]) -> None: + if not isinstance(name, NameObject): + name = NameObject(name) + self[name] = value + + +def setRectangle(self: Any, name: str, value: Union[RectangleObject, float]) -> None: + deprecate_no_replacement("setRectangle") + _set_rectangle(self, name, value) + + +def _delete_rectangle(self: Any, name: str) -> None: + del self[name] + + +def deleteRectangle(self: Any, name: str) -> None: + deprecate_no_replacement("deleteRectangle") + del self[name] + + +def _create_rectangle_accessor(name: str, fallback: Iterable[str]) -> property: + return property( + lambda self: _get_rectangle(self, name, fallback), + lambda self, value: _set_rectangle(self, name, value), + lambda self: _delete_rectangle(self, name), + ) + + +def createRectangleAccessor(name: str, fallback: Iterable[str]) -> property: + deprecate_no_replacement("createRectangleAccessor") + return _create_rectangle_accessor(name, fallback) + + +class Transformation: + """ + Specify a 2D transformation. + + The transformation between two coordinate systems is represented by a 3-by-3 + transformation matrix written as follows:: + + a b 0 + c d 0 + e f 1 + + Because a transformation matrix has only six elements that can be changed, + it is usually specified in PDF as the six-element array [ a b c d e f ]. + + Coordinate transformations are expressed as matrix multiplications:: + + a b 0 + [ x′ y′ 1 ] = [ x y 1 ] × c d 0 + e f 1 + + + Usage + ----- + + >>> from PyPDF2 import Transformation + >>> op = Transformation().scale(sx=2, sy=3).translate(tx=10, ty=20) + >>> page.add_transformation(op) + """ + + # 9.5.4 Coordinate Systems for 3D + # 4.2.2 Common Transformations + def __init__(self, ctm: CompressedTransformationMatrix = (1, 0, 0, 1, 0, 0)): + self.ctm = ctm + + @property + def matrix(self) -> TransformationMatrixType: + return ( + (self.ctm[0], self.ctm[1], 0), + (self.ctm[2], self.ctm[3], 0), + (self.ctm[4], self.ctm[5], 1), + ) + + @staticmethod + def compress(matrix: TransformationMatrixType) -> CompressedTransformationMatrix: + return ( + matrix[0][0], + matrix[0][1], + matrix[1][0], + matrix[1][1], + matrix[0][2], + matrix[1][2], + ) + + def translate(self, tx: float = 0, ty: float = 0) -> "Transformation": + m = self.ctm + return Transformation(ctm=(m[0], m[1], m[2], m[3], m[4] + tx, m[5] + ty)) + + def scale( + self, sx: Optional[float] = None, sy: Optional[float] = None + ) -> "Transformation": + if sx is None and sy is None: + raise ValueError("Either sx or sy must be specified") + if sx is None: + sx = sy + if sy is None: + sy = sx + assert sx is not None + assert sy is not None + op: TransformationMatrixType = ((sx, 0, 0), (0, sy, 0), (0, 0, 1)) + ctm = Transformation.compress(matrix_multiply(self.matrix, op)) + return Transformation(ctm) + + def rotate(self, rotation: float) -> "Transformation": + rotation = math.radians(rotation) + op: TransformationMatrixType = ( + (math.cos(rotation), math.sin(rotation), 0), + (-math.sin(rotation), math.cos(rotation), 0), + (0, 0, 1), + ) + ctm = Transformation.compress(matrix_multiply(self.matrix, op)) + return Transformation(ctm) + + def __repr__(self) -> str: + return f"Transformation(ctm={self.ctm})" + + +class PageObject(DictionaryObject): + """ + PageObject represents a single page within a PDF file. + + Typically this object will be created by accessing the + :meth:`get_page()` method of the + :class:`PdfReader` class, but it is + also possible to create an empty page with the + :meth:`create_blank_page()` static method. + + :param pdf: PDF file the page belongs to. + :param indirect_ref: Stores the original indirect reference to + this object in its source PDF + """ + + def __init__( + self, + pdf: Optional[Any] = None, # PdfReader + indirect_ref: Optional[IndirectObject] = None, + ) -> None: + from ._reader import PdfReader + + DictionaryObject.__init__(self) + self.pdf: Optional[PdfReader] = pdf + self.indirect_ref = indirect_ref + + @staticmethod + def create_blank_page( + pdf: Optional[Any] = None, # PdfReader + width: Union[float, Decimal, None] = None, + height: Union[float, Decimal, None] = None, + ) -> "PageObject": + """ + Return a new blank page. + + If ``width`` or ``height`` is ``None``, try to get the page size + from the last page of *pdf*. + + :param pdf: PDF file the page belongs to + :param float width: The width of the new page expressed in default user + space units. + :param float height: The height of the new page expressed in default user + space units. + :return: the new blank page: + :rtype: :class:`PageObject` + :raises PageSizeNotDefinedError: if ``pdf`` is ``None`` or contains + no page + """ + page = PageObject(pdf) + + # Creates a new page (cf PDF Reference 7.7.3.3) + page.__setitem__(NameObject(PG.TYPE), NameObject("/Page")) + page.__setitem__(NameObject(PG.PARENT), NullObject()) + page.__setitem__(NameObject(PG.RESOURCES), DictionaryObject()) + if width is None or height is None: + if pdf is not None and len(pdf.pages) > 0: + lastpage = pdf.pages[len(pdf.pages) - 1] + width = lastpage.mediabox.width + height = lastpage.mediabox.height + else: + raise PageSizeNotDefinedError + page.__setitem__( + NameObject(PG.MEDIABOX), RectangleObject((0, 0, width, height)) # type: ignore + ) + + return page + + @staticmethod + def createBlankPage( + pdf: Optional[Any] = None, # PdfReader + width: Union[float, Decimal, None] = None, + height: Union[float, Decimal, None] = None, + ) -> "PageObject": # pragma: no cover + """ + .. deprecated:: 1.28.0 + + Use :meth:`create_blank_page` instead. + """ + deprecate_with_replacement("createBlankPage", "create_blank_page") + return PageObject.create_blank_page(pdf, width, height) + + def rotate(self, angle: float) -> "PageObject": + """ + Rotate a page clockwise by increments of 90 degrees. + + :param int angle: Angle to rotate the page. Must be an increment + of 90 deg. + """ + if angle % 90 != 0: + raise ValueError("Rotation angle must be a multiple of 90") + rotate_obj = self.get(PG.ROTATE, 0) + current_angle = ( + rotate_obj if isinstance(rotate_obj, int) else rotate_obj.get_object() + ) + self[NameObject(PG.ROTATE)] = NumberObject(current_angle + angle) + return self + + def rotate_clockwise(self, angle: float) -> "PageObject": # pragma: no cover + deprecate_with_replacement("rotate_clockwise", "rotate") + return self.rotate(angle) + + def rotateClockwise(self, angle: float) -> "PageObject": # pragma: no cover + """ + .. deprecated:: 1.28.0 + + Use :meth:`rotate_clockwise` instead. + """ + deprecate_with_replacement("rotateClockwise", "rotate") + return self.rotate(angle) + + def rotateCounterClockwise(self, angle: float) -> "PageObject": # pragma: no cover + """ + .. deprecated:: 1.28.0 + + Use :meth:`rotate_clockwise` with a negative argument instead. + """ + deprecate_with_replacement("rotateCounterClockwise", "rotate") + return self.rotate(-angle) + + @staticmethod + def _merge_resources( + res1: DictionaryObject, res2: DictionaryObject, resource: Any + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: + new_res = DictionaryObject() + new_res.update(res1.get(resource, DictionaryObject()).get_object()) + page2res = cast( + DictionaryObject, res2.get(resource, DictionaryObject()).get_object() + ) + rename_res = {} + for key in list(page2res.keys()): + if key in new_res and new_res.raw_get(key) != page2res.raw_get(key): + newname = NameObject(key + str(uuid.uuid4())) + rename_res[key] = newname + new_res[newname] = page2res[key] + elif key not in new_res: + new_res[key] = page2res.raw_get(key) + return new_res, rename_res + + @staticmethod + def _content_stream_rename( + stream: ContentStream, rename: Dict[Any, Any], pdf: Any # PdfReader + ) -> ContentStream: + if not rename: + return stream + stream = ContentStream(stream, pdf) + for operands, _operator in stream.operations: + if isinstance(operands, list): + for i in range(len(operands)): + op = operands[i] + if isinstance(op, NameObject): + operands[i] = rename.get(op, op) + elif isinstance(operands, dict): + for i in operands: + op = operands[i] + if isinstance(op, NameObject): + operands[i] = rename.get(op, op) + else: + raise KeyError("type of operands is %s" % type(operands)) + return stream + + @staticmethod + def _push_pop_gs(contents: Any, pdf: Any) -> ContentStream: # PdfReader + # adds a graphics state "push" and "pop" to the beginning and end + # of a content stream. This isolates it from changes such as + # transformation matricies. + stream = ContentStream(contents, pdf) + stream.operations.insert(0, ([], "q")) + stream.operations.append(([], "Q")) + return stream + + @staticmethod + def _add_transformation_matrix( + contents: Any, pdf: Any, ctm: CompressedTransformationMatrix + ) -> ContentStream: # PdfReader + # adds transformation matrix at the beginning of the given + # contents stream. + a, b, c, d, e, f = ctm + contents = ContentStream(contents, pdf) + contents.operations.insert( + 0, + [ + [ + FloatObject(a), + FloatObject(b), + FloatObject(c), + FloatObject(d), + FloatObject(e), + FloatObject(f), + ], + " cm", + ], + ) + return contents + + def get_contents(self) -> Optional[ContentStream]: + """ + Access the page contents. + + :return: the ``/Contents`` object, or ``None`` if it doesn't exist. + ``/Contents`` is optional, as described in PDF Reference 7.7.3.3 + """ + if PG.CONTENTS in self: + return self[PG.CONTENTS].get_object() # type: ignore + else: + return None + + def getContents(self) -> Optional[ContentStream]: # pragma: no cover + """ + .. deprecated:: 1.28.0 + + Use :meth:`get_contents` instead. + """ + deprecate_with_replacement("getContents", "get_contents") + return self.get_contents() + + def merge_page(self, page2: "PageObject", expand: bool = False) -> None: + """ + Merge the content streams of two pages into one. + + Resource references + (i.e. fonts) are maintained from both pages. The mediabox/cropbox/etc + of this page are not altered. The parameter page's content stream will + be added to the end of this page's content stream, meaning that it will + be drawn after, or "on top" of this page. + + :param PageObject page2: The page to be merged into this one. Should be + an instance of :class:`PageObject`. + :param bool expand: If true, the current page dimensions will be + expanded to accommodate the dimensions of the page to be merged. + """ + self._merge_page(page2, expand=expand) + + def mergePage(self, page2: "PageObject") -> None: # pragma: no cover + """ + .. deprecated:: 1.28.0 + + Use :meth:`merge_page` instead. + """ + deprecate_with_replacement("mergePage", "merge_page") + return self.merge_page(page2) + + def _merge_page( + self, + page2: "PageObject", + page2transformation: Optional[Callable[[Any], ContentStream]] = None, + ctm: Optional[CompressedTransformationMatrix] = None, + expand: bool = False, + ) -> None: + # First we work on merging the resource dictionaries. This allows us + # to find out what symbols in the content streams we might need to + # rename. + + new_resources = DictionaryObject() + rename = {} + original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object()) + page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object()) + new_annots = ArrayObject() + + for page in (self, page2): + if PG.ANNOTS in page: + annots = page[PG.ANNOTS] + if isinstance(annots, ArrayObject): + for ref in annots: + new_annots.append(ref) + + for res in ( + RES.EXT_G_STATE, + RES.FONT, + RES.XOBJECT, + RES.COLOR_SPACE, + RES.PATTERN, + RES.SHADING, + RES.PROPERTIES, + ): + new, newrename = PageObject._merge_resources( + original_resources, page2resources, res + ) + if new: + new_resources[NameObject(res)] = new + rename.update(newrename) + + # Combine /ProcSet sets. + new_resources[NameObject(RES.PROC_SET)] = ArrayObject( + frozenset( + original_resources.get(RES.PROC_SET, ArrayObject()).get_object() # type: ignore + ).union( + frozenset(page2resources.get(RES.PROC_SET, ArrayObject()).get_object()) # type: ignore + ) + ) + + new_content_array = ArrayObject() + + original_content = self.get_contents() + if original_content is not None: + new_content_array.append( + PageObject._push_pop_gs(original_content, self.pdf) + ) + + page2content = page2.get_contents() + if page2content is not None: + page2content = ContentStream(page2content, self.pdf) + page2content.operations.insert( + 0, + ( + map( + FloatObject, + [ + page2.trimbox.left, + page2.trimbox.bottom, + page2.trimbox.width, + page2.trimbox.height, + ], + ), + "re", + ), + ) + page2content.operations.insert(1, ([], "W")) + page2content.operations.insert(2, ([], "n")) + if page2transformation is not None: + page2content = page2transformation(page2content) + page2content = PageObject._content_stream_rename( + page2content, rename, self.pdf + ) + page2content = PageObject._push_pop_gs(page2content, self.pdf) + new_content_array.append(page2content) + + # if expanding the page to fit a new page, calculate the new media box size + if expand: + self._expand_mediabox(page2, ctm) + + self[NameObject(PG.CONTENTS)] = ContentStream(new_content_array, self.pdf) + self[NameObject(PG.RESOURCES)] = new_resources + self[NameObject(PG.ANNOTS)] = new_annots + + def _expand_mediabox( + self, page2: "PageObject", ctm: Optional[CompressedTransformationMatrix] + ) -> None: + corners1 = ( + self.mediabox.left.as_numeric(), + self.mediabox.bottom.as_numeric(), + self.mediabox.right.as_numeric(), + self.mediabox.top.as_numeric(), + ) + corners2 = ( + page2.mediabox.left.as_numeric(), + page2.mediabox.bottom.as_numeric(), + page2.mediabox.left.as_numeric(), + page2.mediabox.top.as_numeric(), + page2.mediabox.right.as_numeric(), + page2.mediabox.top.as_numeric(), + page2.mediabox.right.as_numeric(), + page2.mediabox.bottom.as_numeric(), + ) + if ctm is not None: + ctm = tuple(float(x) for x in ctm) # type: ignore[assignment] + new_x = tuple( + ctm[0] * corners2[i] + ctm[2] * corners2[i + 1] + ctm[4] + for i in range(0, 8, 2) + ) + new_y = tuple( + ctm[1] * corners2[i] + ctm[3] * corners2[i + 1] + ctm[5] + for i in range(0, 8, 2) + ) + else: + new_x = corners2[0:8:2] + new_y = corners2[1:8:2] + lowerleft = (min(new_x), min(new_y)) + upperright = (max(new_x), max(new_y)) + lowerleft = (min(corners1[0], lowerleft[0]), min(corners1[1], lowerleft[1])) + upperright = ( + max(corners1[2], upperright[0]), + max(corners1[3], upperright[1]), + ) + + self.mediabox.lower_left = lowerleft + self.mediabox.upper_right = upperright + + def mergeTransformedPage( + self, + page2: "PageObject", + ctm: Union[CompressedTransformationMatrix, Transformation], + expand: bool = False, + ) -> None: # pragma: no cover + """ + mergeTransformedPage is similar to merge_page, but a transformation + matrix is applied to the merged stream. + + :param PageObject page2: The page to be merged into this one. Should be + an instance of :class:`PageObject`. + :param tuple ctm: a 6-element tuple containing the operands of the + transformation matrix + :param bool expand: Whether the page should be expanded to fit the dimensions + of the page to be merged. + + .. deprecated:: 1.28.0 + + Use :meth:`add_transformation` and :meth:`merge_page` instead. + """ + deprecate_with_replacement( + "page.mergeTransformedPage(page2, ctm)", + "page2.add_transformation(ctm); page.merge_page(page2)", + ) + if isinstance(ctm, Transformation): + ctm = ctm.ctm + ctm = cast(CompressedTransformationMatrix, ctm) + self._merge_page( + page2, + lambda page2Content: PageObject._add_transformation_matrix( + page2Content, page2.pdf, ctm # type: ignore[arg-type] + ), + ctm, + expand, + ) + + def mergeScaledPage( + self, page2: "PageObject", scale: float, expand: bool = False + ) -> None: # pragma: no cover + """ + mergeScaledPage is similar to merge_page, but the stream to be merged + is scaled by appling a transformation matrix. + + :param PageObject page2: The page to be merged into this one. Should be + an instance of :class:`PageObject`. + :param float scale: The scaling factor + :param bool expand: Whether the page should be expanded to fit the + dimensions of the page to be merged. + + .. deprecated:: 1.28.0 + + Use :meth:`add_transformation` and :meth:`merge_page` instead. + """ + deprecate_with_replacement( + "page.mergeScaledPage(page2, scale, expand)", + "page2.add_transformation(Transformation().scale(scale)); page.merge_page(page2, expand)", + ) + op = Transformation().scale(scale, scale) + self.mergeTransformedPage(page2, op, expand) + + def mergeRotatedPage( + self, page2: "PageObject", rotation: float, expand: bool = False + ) -> None: # pragma: no cover + """ + mergeRotatedPage is similar to merge_page, but the stream to be merged + is rotated by appling a transformation matrix. + + :param PageObject page2: the page to be merged into this one. Should be + an instance of :class:`PageObject`. + :param float rotation: The angle of the rotation, in degrees + :param bool expand: Whether the page should be expanded to fit the + dimensions of the page to be merged. + + .. deprecated:: 1.28.0 + + Use :meth:`add_transformation` and :meth:`merge_page` instead. + """ + deprecate_with_replacement( + "page.mergeRotatedPage(page2, rotation, expand)", + "page2.add_transformation(Transformation().rotate(rotation)); page.merge_page(page2, expand)", + ) + op = Transformation().rotate(rotation) + self.mergeTransformedPage(page2, op, expand) + + def mergeTranslatedPage( + self, page2: "PageObject", tx: float, ty: float, expand: bool = False + ) -> None: # pragma: no cover + """ + mergeTranslatedPage is similar to merge_page, but the stream to be + merged is translated by appling a transformation matrix. + + :param PageObject page2: the page to be merged into this one. Should be + an instance of :class:`PageObject`. + :param float tx: The translation on X axis + :param float ty: The translation on Y axis + :param bool expand: Whether the page should be expanded to fit the + dimensions of the page to be merged. + + .. deprecated:: 1.28.0 + + Use :meth:`add_transformation` and :meth:`merge_page` instead. + """ + deprecate_with_replacement( + "page.mergeTranslatedPage(page2, tx, ty, expand)", + "page2.add_transformation(Transformation().translate(tx, ty)); page.merge_page(page2, expand)", + ) + op = Transformation().translate(tx, ty) + self.mergeTransformedPage(page2, op, expand) + + def mergeRotatedTranslatedPage( + self, + page2: "PageObject", + rotation: float, + tx: float, + ty: float, + expand: bool = False, + ) -> None: # pragma: no cover + """ + mergeRotatedTranslatedPage is similar to merge_page, but the stream to + be merged is rotated and translated by appling a transformation matrix. + + :param PageObject page2: the page to be merged into this one. Should be + an instance of :class:`PageObject`. + :param float tx: The translation on X axis + :param float ty: The translation on Y axis + :param float rotation: The angle of the rotation, in degrees + :param bool expand: Whether the page should be expanded to fit the + dimensions of the page to be merged. + + .. deprecated:: 1.28.0 + + Use :meth:`add_transformation` and :meth:`merge_page` instead. + """ + deprecate_with_replacement( + "page.mergeRotatedTranslatedPage(page2, rotation, tx, ty, expand)", + "page2.add_transformation(Transformation().rotate(rotation).translate(tx, ty)); page.merge_page(page2, expand)", + ) + op = Transformation().translate(-tx, -ty).rotate(rotation).translate(tx, ty) + return self.mergeTransformedPage(page2, op, expand) + + def mergeRotatedScaledPage( + self, page2: "PageObject", rotation: float, scale: float, expand: bool = False + ) -> None: # pragma: no cover + """ + mergeRotatedScaledPage is similar to merge_page, but the stream to be + merged is rotated and scaled by appling a transformation matrix. + + :param PageObject page2: the page to be merged into this one. Should be + an instance of :class:`PageObject`. + :param float rotation: The angle of the rotation, in degrees + :param float scale: The scaling factor + :param bool expand: Whether the page should be expanded to fit the + dimensions of the page to be merged. + + .. deprecated:: 1.28.0 + + Use :meth:`add_transformation` and :meth:`merge_page` instead. + """ + deprecate_with_replacement( + "page.mergeRotatedScaledPage(page2, rotation, scale, expand)", + "page2.add_transformation(Transformation().rotate(rotation).scale(scale)); page.merge_page(page2, expand)", + ) + op = Transformation().rotate(rotation).scale(scale, scale) + self.mergeTransformedPage(page2, op, expand) + + def mergeScaledTranslatedPage( + self, + page2: "PageObject", + scale: float, + tx: float, + ty: float, + expand: bool = False, + ) -> None: # pragma: no cover + """ + mergeScaledTranslatedPage is similar to merge_page, but the stream to be + merged is translated and scaled by appling a transformation matrix. + + :param PageObject page2: the page to be merged into this one. Should be + an instance of :class:`PageObject`. + :param float scale: The scaling factor + :param float tx: The translation on X axis + :param float ty: The translation on Y axis + :param bool expand: Whether the page should be expanded to fit the + dimensions of the page to be merged. + + .. deprecated:: 1.28.0 + + Use :meth:`add_transformation` and :meth:`merge_page` instead. + """ + deprecate_with_replacement( + "page.mergeScaledTranslatedPage(page2, scale, tx, ty, expand)", + "page2.add_transformation(Transformation().scale(scale).translate(tx, ty)); page.merge_page(page2, expand)", + ) + op = Transformation().scale(scale, scale).translate(tx, ty) + return self.mergeTransformedPage(page2, op, expand) + + def mergeRotatedScaledTranslatedPage( + self, + page2: "PageObject", + rotation: float, + scale: float, + tx: float, + ty: float, + expand: bool = False, + ) -> None: # pragma: no cover + """ + mergeRotatedScaledTranslatedPage is similar to merge_page, but the + stream to be merged is translated, rotated and scaled by appling a + transformation matrix. + + :param PageObject page2: the page to be merged into this one. Should be + an instance of :class:`PageObject`. + :param float tx: The translation on X axis + :param float ty: The translation on Y axis + :param float rotation: The angle of the rotation, in degrees + :param float scale: The scaling factor + :param bool expand: Whether the page should be expanded to fit the + dimensions of the page to be merged. + + .. deprecated:: 1.28.0 + + Use :meth:`add_transformation` and :meth:`merge_page` instead. + """ + deprecate_with_replacement( + "page.mergeRotatedScaledTranslatedPage(page2, rotation, tx, ty, expand)", + "page2.add_transformation(Transformation().rotate(rotation).scale(scale)); page.merge_page(page2, expand)", + ) + op = Transformation().rotate(rotation).scale(scale, scale).translate(tx, ty) + self.mergeTransformedPage(page2, op, expand) + + def add_transformation( + self, + ctm: Union[Transformation, CompressedTransformationMatrix], + expand: bool = False, + ) -> None: + """ + Apply a transformation matrix to the page. + + :param tuple ctm: A 6-element tuple containing the operands of the + transformation matrix. Alternatively, a + :py:class:`Transformation` + object can be passed. + + See :doc:`/user/cropping-and-transforming`. + """ + if isinstance(ctm, Transformation): + ctm = ctm.ctm + content = self.get_contents() + if content is not None: + content = PageObject._add_transformation_matrix(content, self.pdf, ctm) + content = PageObject._push_pop_gs(content, self.pdf) + self[NameObject(PG.CONTENTS)] = content + # if expanding the page to fit a new page, calculate the new media box size + if expand: + corners = [ + self.mediabox.left.as_numeric(), + self.mediabox.bottom.as_numeric(), + self.mediabox.left.as_numeric(), + self.mediabox.top.as_numeric(), + self.mediabox.right.as_numeric(), + self.mediabox.top.as_numeric(), + self.mediabox.right.as_numeric(), + self.mediabox.bottom.as_numeric(), + ] + + ctm = tuple(float(x) for x in ctm) # type: ignore[assignment] + new_x = [ + ctm[0] * corners[i] + ctm[2] * corners[i + 1] + ctm[4] + for i in range(0, 8, 2) + ] + new_y = [ + ctm[1] * corners[i] + ctm[3] * corners[i + 1] + ctm[5] + for i in range(0, 8, 2) + ] + + lowerleft = (min(new_x), min(new_y)) + upperright = (max(new_x), max(new_y)) + lowerleft = (min(corners[0], lowerleft[0]), min(corners[1], lowerleft[1])) + upperright = ( + max(corners[2], upperright[0]), + max(corners[3], upperright[1]), + ) + + self.mediabox.lower_left = lowerleft + self.mediabox.upper_right = upperright + + def addTransformation( + self, ctm: CompressedTransformationMatrix + ) -> None: # pragma: no cover + """ + .. deprecated:: 1.28.0 + + Use :meth:`add_transformation` instead. + """ + deprecate_with_replacement("addTransformation", "add_transformation") + self.add_transformation(ctm) + + def scale(self, sx: float, sy: float) -> None: + """ + Scale a page by the given factors by appling a transformation + matrix to its content and updating the page size. + + :param float sx: The scaling factor on horizontal axis. + :param float sy: The scaling factor on vertical axis. + """ + self.add_transformation((sx, 0, 0, sy, 0, 0)) + self.mediabox = RectangleObject( + ( + float(self.mediabox.left) * sx, + float(self.mediabox.bottom) * sy, + float(self.mediabox.right) * sx, + float(self.mediabox.top) * sy, + ) + ) + if PG.VP in self: + viewport = self[PG.VP] + if isinstance(viewport, ArrayObject): + bbox = viewport[0]["/BBox"] + else: + bbox = viewport["/BBox"] # type: ignore + scaled_bbox = RectangleObject( + ( + float(bbox[0]) * sx, + float(bbox[1]) * sy, + float(bbox[2]) * sx, + float(bbox[3]) * sy, + ) + ) + if isinstance(viewport, ArrayObject): + self[NameObject(PG.VP)][NumberObject(0)][ # type: ignore + NameObject("/BBox") + ] = scaled_bbox + else: + self[NameObject(PG.VP)][NameObject("/BBox")] = scaled_bbox # type: ignore + + def scale_by(self, factor: float) -> None: + """ + Scale a page by the given factor by appling a transformation + matrix to its content and updating the page size. + + :param float factor: The scaling factor (for both X and Y axis). + """ + self.scale(factor, factor) + + def scaleBy(self, factor: float) -> None: # pragma: no cover + """ + .. deprecated:: 1.28.0 + + Use :meth:`scale_by` instead. + """ + deprecate_with_replacement("scaleBy", "scale_by") + self.scale(factor, factor) + + def scale_to(self, width: float, height: float) -> None: + """ + Scale a page to the specified dimentions by appling a + transformation matrix to its content and updating the page size. + + :param float width: The new width. + :param float height: The new heigth. + """ + sx = width / float(self.mediabox.width) + sy = height / float(self.mediabox.height) + self.scale(sx, sy) + + def scaleTo(self, width: float, height: float) -> None: # pragma: no cover + """ + .. deprecated:: 1.28.0 + + Use :meth:`scale_to` instead. + """ + deprecate_with_replacement("scaleTo", "scale_to") + self.scale_to(width, height) + + def compress_content_streams(self) -> None: + """ + Compress the size of this page by joining all content streams and + applying a FlateDecode filter. + + However, it is possible that this function will perform no action if + content stream compression becomes "automatic" for some reason. + """ + content = self.get_contents() + if content is not None: + if not isinstance(content, ContentStream): + content = ContentStream(content, self.pdf) + self[NameObject(PG.CONTENTS)] = content.flate_encode() + + def compressContentStreams(self) -> None: # pragma: no cover + """ + .. deprecated:: 1.28.0 + + Use :meth:`compress_content_streams` instead. + """ + deprecate_with_replacement("compressContentStreams", "compress_content_streams") + self.compress_content_streams() + + def _extract_text_old( + self, Tj_sep: str = "", TJ_sep: str = "" + ) -> str: # pragma: no cover + """ + Locate all text drawing commands, in the order they are provided in the + content stream, and extract the text. This works well for some PDF + files, but poorly for others, depending on the generator used. This will + be refined in the future. Do not rely on the order of text coming out of + this function, as it will change if this function is made more + sophisticated. + + :return: a string object. + """ + text = "" + content = self[PG.CONTENTS].get_object() + if not isinstance(content, ContentStream): + content = ContentStream(content, self.pdf) + # Note: we check all strings are TextStringObjects. ByteStringObjects + # are strings where the byte->string encoding was unknown, so adding + # them to the text here would be gibberish. + + space_scale = 1.0 + + for operands, operator in content.operations: + # Missing operators: + # Tf: text font + # Tfs: text font size + # Tc: '5.2.1 Character Spacing' + # Th: '5.2.3 Horizontal Scaling' + # Tl: '5.2.4 Leading' + # Tmode: '5.2.5 Text Rendering Mode' + # Trise: '5.2.6 Text Rise' + + if operator in [b"Tf", b"Tfs", b"Tc", b"Th", b"Tl", b"Tmode"]: + pass + elif operator == b"Tw": # word spacing + # See '5.2.2 Word Spacing' + space_scale = 1.0 + float(operands[0]) + elif operator == b"Tj": + # See 'TABLE 5.6 Text-showing operators' + _text = operands[0] + if isinstance(_text, TextStringObject): + text += Tj_sep + text += _text + text += "\n" + elif operator == b"T*": + # See 'TABLE 5.5 Text-positioning operators' + text += "\n" + elif operator == b"'": + # See 'TABLE 5.6 Text-showing operators' + text += "\n" + _text = operands[0] + if isinstance(_text, TextStringObject): + text += operands[0] + elif operator == b'"': + # See 'TABLE 5.6 Text-showing operators' + _text = operands[2] + if isinstance(_text, TextStringObject): + text += "\n" + text += _text + elif operator == b"TJ": + # See 'TABLE 5.6 Text-showing operators' + for i in operands[0]: + if isinstance(i, TextStringObject): + text += TJ_sep + text += i + elif isinstance(i, (NumberObject, FloatObject)): + # a positive value decreases and the negative value increases + # space + if int(i) < -space_scale * 250: + if len(text) == 0 or text[-1] != " ": + text += " " + else: + if len(text) > 1 and text[-1] == " ": + text = text[:-1] + text += "\n" + return text + + def _debug_for_extract(self) -> str: + out = "" + for ope, op in ContentStream( + self["/Contents"].getObject(), self.pdf, "bytes" + ).operations: + if op == b"TJ": + s = [x for x in ope[0] if isinstance(x, str)] + else: + s = [] + out += op.decode("utf-8") + " " + "".join(s) + ope.__repr__() + "\n" + out += "\n=============================\n" + try: + for fo in self["/Resources"]["/Font"]: # type:ignore + out += fo + "\n" + out += self["/Resources"]["/Font"][fo].__repr__() + "\n" # type:ignore + try: + enc_repr = self["/Resources"]["/Font"][fo][ # type:ignore + "/Encoding" + ].__repr__() + out += enc_repr + "\n" + except Exception: + pass + except KeyError: + out += "No Font\n" + return out + + def _extract_text( + self, + obj: Any, + pdf: Any, + space_width: float = 200.0, + content_key: Optional[str] = PG.CONTENTS, + ) -> str: + """ + Locate all text drawing commands, in the order they are provided in the + content stream, and extract the text. This works well for some PDF + files, but poorly for others, depending on the generator used. This will + be refined in the future. Do not rely on the order of text coming out of + this function, as it will change if this function is made more + sophisticated. + + :param float space_width: force default space width + (if not extracted from font (default 200) + :param Optional[str] content_key: indicate the default key where to extract data + None = the opbject; this allow to reuse the function on XObject + default = "/Content" + :return: a string object. + """ + + text: str = "" + output: str = "" + cmaps: Dict[ + str, Tuple[str, float, Union[str, Dict[int, str]], Dict[str, str]] + ] = {} + resources_dict = cast(DictionaryObject, obj["/Resources"]) + if "/Font" in resources_dict: + for f in cast(DictionaryObject, resources_dict["/Font"]): + cmaps[f] = build_char_map(f, space_width, obj) + cmap: Tuple[ + Union[str, Dict[int, str]], Dict[str, str], str + ] # (encoding,CMAP,font_name) + try: + content = ( + obj[content_key].get_object() if isinstance(content_key, str) else obj + ) + if not isinstance(content, ContentStream): + content = ContentStream(content, pdf, "bytes") + except KeyError: # it means no content can be extracted(certainly empty page) + return "" + # Note: we check all strings are TextStringObjects. ByteStringObjects + # are strings where the byte->string encoding was unknown, so adding + # them to the text here would be gibberish. + + tm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] + tm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] + char_scale = 1.0 + space_scale = 1.0 + _space_width: float = 500.0 # will be set correctly at first Tf + TL = 0.0 + font_size = 12.0 # init just in case of + + # tm_matrix: Tuple = tm_matrix, output: str = output, text: str = text, + # char_scale: float = char_scale,space_scale : float = space_scale, _space_width: float = _space_width, + # TL: float = TL, font_size: float = font_size, cmap = cmap + + def process_operation(operator: bytes, operands: List) -> None: + nonlocal tm_matrix, tm_prev, output, text, char_scale, space_scale, _space_width, TL, font_size, cmap + if tm_matrix[4] != 0 and tm_matrix[5] != 0: # o reuse of the + tm_prev = list(tm_matrix) + # Table 5.4 page 405 + if operator == b"BT": + tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] + # tm_prev = tm_matrix + output += text + # based + # if output != "" and output[-1]!="\n": + # output += "\n" + text = "" + return None + elif operator == b"ET": + output += text + text = "" + # Table 5.2 page 398 + elif operator == b"Tz": + char_scale = float(operands[0]) / 100.0 + elif operator == b"Tw": + space_scale = 1.0 + float(operands[0]) + elif operator == b"TL": + TL = float(operands[0]) + elif operator == b"Tf": + if text != "": + output += text # .translate(cmap) + text = "" + try: + _space_width = cmaps[operands[0]][1] + cmap = ( + cmaps[operands[0]][2], + cmaps[operands[0]][3], + operands[0], + ) # type:ignore + except KeyError: # font not found + _space_width = unknown_char_map[1] + cmap = ( + unknown_char_map[2], + unknown_char_map[3], + "???" + operands[0], + ) + try: + font_size = float(operands[1]) + except Exception: + pass # keep previous size + # Table 5.5 page 406 + elif operator == b"Td": + tm_matrix[5] += float(operands[1]) + tm_matrix[4] += float(operands[0]) + elif operator == b"Tm": + tm_matrix = [ + float(operands[0]), + float(operands[1]), + float(operands[2]), + float(operands[3]), + float(operands[4]), + float(operands[5]), + ] + elif operator == b"T*": + tm_matrix[5] -= TL + elif operator == b"Tj": + t: str = "" + tt: bytes = ( + encode_pdfdocencoding(operands[0]) + if isinstance(operands[0], str) + else operands[0] + ) + if isinstance(cmap[0], str): + t = tt.decode(cmap[0], "surrogatepass") # apply str encoding + else: # apply dict encoding + t = "".join( + [ + cmap[0][x] if x in cmap[0] else bytes((x,)).decode() + for x in tt + ] + ) + + text += "".join([cmap[1][x] if x in cmap[1] else x for x in t]) + else: + return None + # process text changes due to positionchange: " " + if tm_matrix[5] <= ( + tm_prev[5] + - font_size # remove scaling * sqrt(tm_matrix[2] ** 2 + tm_matrix[3] ** 2) + ): # it means that we are moving down by one line + output += text + "\n" # .translate(cmap) + "\n" + text = "" + elif tm_matrix[4] >= ( + tm_prev[4] + space_scale * _space_width * char_scale + ): # it means that we are moving down by one line + text += " " + return None + # for clarity Operator in (b"g",b"G") : nothing to do + # end of process_operation ###### + + for operands, operator in content.operations: + # multiple operators are defined in here #### + if operator == b"'": + process_operation(b"T*", []) + process_operation(b"Tj", operands) + elif operator == b'"': + process_operation(b"T*", []) + process_operation(b"TJ", operands) + elif operator == b"TD": + process_operation(b"TL", [-operands[1]]) + process_operation(b"Td", operands) + elif operator == b"TJ": + for op in operands[0]: + if isinstance(op, (str, bytes)): + process_operation(b"Tj", [op]) + if isinstance(op, (int, float, NumberObject, FloatObject)): + process_operation(b"Td", [-op, 0.0]) + elif operator == b"Do": + output += text + if output != "": + output += "\n" + try: + xobj = resources_dict["/XObject"] # type: ignore + if xobj[operands[0]]["/Subtype"] != "/Image": # type: ignore + output += text + text = self.extract_xform_text(xobj[operands[0]], space_width) # type: ignore + output += text + except Exception: + warnings.warn( + f" impossible to decode XFormObject {operands[0]}", + PdfReadWarning, + ) + finally: + text = "" + else: + process_operation(operator, operands) + output += text # just in case of + return output + + def extract_text( + self, Tj_sep: str = "", TJ_sep: str = "", space_width: float = 200.0 + ) -> str: + """ + Locate all text drawing commands, in the order they are provided in the + content stream, and extract the text. This works well for some PDF + files, but poorly for others, depending on the generator used. This will + be refined in the future. Do not rely on the order of text coming out of + this function, as it will change if this function is made more + sophisticated. + space_width : float = force default space width (if not extracted from font (default 200) + + :return: a string object. + """ + return self._extract_text(self, self.pdf, space_width, PG.CONTENTS) + + def extract_xform_text( + self, xform: EncodedStreamObject, space_width: float = 200.0 + ) -> str: + """ + Extraction tet from an XObject. + space_width : float = force default space width (if not extracted from font (default 200) + + :return: a string object. + """ + return self._extract_text(xform, self.pdf, space_width, None) + + def extractText( + self, Tj_sep: str = "", TJ_sep: str = "" + ) -> str: # pragma: no cover + """ + .. deprecated:: 1.28.0 + + Use :meth:`extract_text` instead. + """ + deprecate_with_replacement("extractText", "extract_text") + return self.extract_text(Tj_sep=Tj_sep, TJ_sep=TJ_sep) + + mediabox = _create_rectangle_accessor(PG.MEDIABOX, ()) + """ + A :class:`RectangleObject`, expressed in default user space units, + defining the boundaries of the physical medium on which the page is + intended to be displayed or printed. + """ + + @property + def mediaBox(self) -> RectangleObject: # pragma: no cover + """ + .. deprecated:: 1.28.0 + + Use :py:attr:`mediabox` instead. + """ + deprecate_with_replacement("mediaBox", "mediabox") + return self.mediabox + + @mediaBox.setter + def mediaBox(self, value: RectangleObject) -> None: # pragma: no cover + """ + .. deprecated:: 1.28.0 + + Use :py:attr:`mediabox` instead. + """ + deprecate_with_replacement("mediaBox", "mediabox") + self.mediabox = value + + cropbox = _create_rectangle_accessor("/CropBox", (PG.MEDIABOX,)) + """ + A :class:`RectangleObject`, expressed in default user space units, + defining the visible region of default user space. When the page is + displayed or printed, its contents are to be clipped (cropped) to this + rectangle and then imposed on the output medium in some + implementation-defined manner. Default value: same as :attr:`mediabox`. + """ + + @property + def cropBox(self) -> RectangleObject: # pragma: no cover + """ + .. deprecated:: 1.28.0 + + Use :py:attr:`cropbox` instead. + """ + deprecate_with_replacement("cropBox", "cropbox") + return self.cropbox + + @cropBox.setter + def cropBox(self, value: RectangleObject) -> None: # pragma: no cover + deprecate_with_replacement("cropBox", "cropbox") + self.cropbox = value + + bleedbox = _create_rectangle_accessor("/BleedBox", ("/CropBox", PG.MEDIABOX)) + """ + A :class:`RectangleObject`, expressed in default user space units, + defining the region to which the contents of the page should be clipped + when output in a production enviroment. + """ + + @property + def bleedBox(self) -> RectangleObject: # pragma: no cover + """ + .. deprecated:: 1.28.0 + + Use :py:attr:`bleedbox` instead. + """ + deprecate_with_replacement("bleedBox", "bleedbox") + return self.bleedbox + + @bleedBox.setter + def bleedBox(self, value: RectangleObject) -> None: # pragma: no cover + deprecate_with_replacement("bleedBox", "bleedbox") + self.bleedbox = value + + trimbox = _create_rectangle_accessor("/TrimBox", ("/CropBox", PG.MEDIABOX)) + """ + A :class:`RectangleObject`, expressed in default user space units, + defining the intended dimensions of the finished page after trimming. + """ + + @property + def trimBox(self) -> RectangleObject: # pragma: no cover + """ + .. deprecated:: 1.28.0 + + Use :py:attr:`trimbox` instead. + """ + deprecate_with_replacement("trimBox", "trimbox") + return self.trimbox + + @trimBox.setter + def trimBox(self, value: RectangleObject) -> None: # pragma: no cover + deprecate_with_replacement("trimBox", "trimbox") + self.trimbox = value + + artbox = _create_rectangle_accessor("/ArtBox", ("/CropBox", PG.MEDIABOX)) + """ + A :class:`RectangleObject`, expressed in default user space units, + defining the extent of the page's meaningful content as intended by the + page's creator. + """ + + @property + def artBox(self) -> RectangleObject: # pragma: no cover + """ + .. deprecated:: 1.28.0 + + Use :py:attr:`artbox` instead. + """ + deprecate_with_replacement("artBox", "artbox") + return self.artbox + + @artBox.setter + def artBox(self, value: RectangleObject) -> None: # pragma: no cover + deprecate_with_replacement("artBox", "artbox") + self.artbox = value + + +class _VirtualList: + def __init__( + self, + length_function: Callable[[], int], + get_function: Callable[[int], PageObject], + ) -> None: + self.length_function = length_function + self.get_function = get_function + self.current = -1 + + def __len__(self) -> int: + return self.length_function() + + def __getitem__(self, index: int) -> PageObject: + if isinstance(index, slice): + indices = range(*index.indices(len(self))) + cls = type(self) + return cls(indices.__len__, lambda idx: self[indices[idx]]) + if not isinstance(index, int): + raise TypeError("sequence indices must be integers") + len_self = len(self) + if index < 0: + # support negative indexes + index = len_self + index + if index < 0 or index >= len_self: + raise IndexError("sequence index out of range") + return self.get_function(index) + + def __iter__(self) -> Iterator[PageObject]: + for i in range(len(self)): + yield self[i] diff --git a/PyPDF2/_utils.py b/PyPDF2/_utils.py index 910bb6327..24ab68caa 100644 --- a/PyPDF2/_utils.py +++ b/PyPDF2/_utils.py @@ -1,331 +1,331 @@ -# Copyright (c) 2006, Mathieu Fenniak -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# * The name of the author may not be used to endorse or promote products -# derived from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. - -""" -Utility functions for PDF library. -""" -__author__ = "Mathieu Fenniak" -__author_email__ = "biziqe@mathieu.fenniak.net" - -import os -import warnings -from codecs import getencoder -from io import ( - DEFAULT_BUFFER_SIZE, - BufferedReader, - BufferedWriter, - BytesIO, - FileIO, -) -from typing import Any, Dict, Optional, Tuple, Union, overload - -try: - # Python 3.10+: https://www.python.org/dev/peps/pep-0484/ - from typing import TypeAlias # type: ignore[attr-defined] -except ImportError: - from typing_extensions import TypeAlias # type: ignore[misc] - -from .errors import STREAM_TRUNCATED_PREMATURELY, PdfStreamError - -TransformationMatrixType: TypeAlias = Tuple[ - Tuple[float, float, float], Tuple[float, float, float], Tuple[float, float, float] -] -CompressedTransformationMatrix: TypeAlias = Tuple[ - float, float, float, float, float, float -] - -bytes_type = type(bytes()) # Works the same in Python 2.X and 3.X -StreamType = Union[BytesIO, BufferedReader, BufferedWriter, FileIO] -StrByteType = Union[str, StreamType] - -DEPR_MSG_NO_REPLACEMENT = "{} is deprecated and will be removed in PyPDF2 {}." -DEPR_MSG = "{} is deprecated and will be removed in PyPDF2 3.0.0. Use {} instead." - - -def read_until_whitespace(stream: StreamType, maxchars: Optional[int] = None) -> bytes: - """ - Reads non-whitespace characters and returns them. - Stops upon encountering whitespace or when maxchars is reached. - """ - txt = b_("") - while True: - tok = stream.read(1) - if tok.isspace() or not tok: - break - txt += tok - if len(txt) == maxchars: - break - return txt - - -def read_non_whitespace(stream: StreamType) -> bytes: - """ - Finds and reads the next non-whitespace character (ignores whitespace). - """ - tok = WHITESPACES[0] - while tok in WHITESPACES: - tok = stream.read(1) - return tok - - -def skip_over_whitespace(stream: StreamType) -> bool: - """ - Similar to readNonWhitespace, but returns a Boolean if more than - one whitespace character was read. - """ - tok = WHITESPACES[0] - cnt = 0 - while tok in WHITESPACES: - tok = stream.read(1) - cnt += 1 - return cnt > 1 - - -def skip_over_comment(stream: StreamType) -> None: - tok = stream.read(1) - stream.seek(-1, 1) - if tok == b_("%"): - while tok not in (b_("\n"), b_("\r")): - tok = stream.read(1) - - -def read_until_regex(stream: StreamType, regex: Any, ignore_eof: bool = False) -> bytes: - """ - Reads until the regular expression pattern matched (ignore the match) - :raises PdfStreamError: on premature end-of-file - :param bool ignore_eof: If true, ignore end-of-line and return immediately - :param regex: re.Pattern - """ - name = b_("") - while True: - tok = stream.read(16) - if not tok: - # stream has truncated prematurely - if ignore_eof: - return name - else: - raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY) - m = regex.search(tok) - if m is not None: - name += tok[: m.start()] - stream.seek(m.start() - len(tok), 1) - break - name += tok - return name - - -CRLF = b"\r\n" - - -def read_block_backwards(stream: StreamType, to_read: int) -> bytes: - """Given a stream at position X, read a block of size - to_read ending at position X. - The stream's position should be unchanged. - """ - if stream.tell() < to_read: - raise PdfStreamError("Could not read malformed PDF file") - # Seek to the start of the block we want to read. - stream.seek(-to_read, os.SEEK_CUR) - read = stream.read(to_read) - # Seek to the start of the block we read after reading it. - stream.seek(-to_read, os.SEEK_CUR) - if len(read) != to_read: - raise PdfStreamError(f"EOF: read {len(read)}, expected {to_read}?") - return read - - -def read_previous_line(stream: StreamType) -> bytes: - """Given a byte stream with current position X, return the previous - line - all characters between the first CR/LF byte found before X - (or, the start of the file, if no such byte is found) and position X - After this call, the stream will be positioned one byte after the - first non-CRLF character found beyond the first CR/LF byte before X, - or, if no such byte is found, at the beginning of the stream. - """ - line_content = [] - found_crlf = False - if stream.tell() == 0: - raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY) - while True: - to_read = min(DEFAULT_BUFFER_SIZE, stream.tell()) - if to_read == 0: - break - # Read the block. After this, our stream will be one - # beyond the initial position. - block = read_block_backwards(stream, to_read) - idx = len(block) - 1 - if not found_crlf: - # We haven't found our first CR/LF yet. - # Read off characters until we hit one. - while idx >= 0 and block[idx] not in CRLF: - idx -= 1 - if idx >= 0: - found_crlf = True - if found_crlf: - # We found our first CR/LF already (on this block or - # a previous one). - # Our combined line is the remainder of the block - # plus any previously read blocks. - line_content.append(block[idx + 1 :]) - # Continue to read off any more CRLF characters. - while idx >= 0 and block[idx] in CRLF: - idx -= 1 - else: - # Didn't find CR/LF yet - add this block to our - # previously read blocks and continue. - line_content.append(block) - if idx >= 0: - # We found the next non-CRLF character. - # Set the stream position correctly, then break - stream.seek(idx + 1, os.SEEK_CUR) - break - # Join all the blocks in the line (which are in reverse order) - return b"".join(line_content[::-1]) - - -def matrix_multiply( - a: TransformationMatrixType, b: TransformationMatrixType -) -> TransformationMatrixType: - return tuple( # type: ignore[return-value] - tuple(sum(float(i) * float(j) for i, j in zip(row, col)) for col in zip(*b)) - for row in a - ) - - -def mark_location(stream: StreamType) -> None: - """Creates text file showing current location in context.""" - # Mainly for debugging - radius = 5000 - stream.seek(-radius, 1) - with open("PyPDF2_pdfLocation.txt", "wb") as output_fh: - output_fh.write(stream.read(radius)) - output_fh.write(b"HERE") - output_fh.write(stream.read(radius)) - stream.seek(-radius, 1) - - -B_CACHE: Dict[Union[str, bytes], bytes] = {} - - -def b_(s: Union[str, bytes]) -> bytes: - bc = B_CACHE - if s in bc: - return bc[s] - if isinstance(s, bytes): - return s - else: - try: - r = s.encode("latin-1") - if len(s) < 2: - bc[s] = r - return r - except Exception: - r = s.encode("utf-8") - if len(s) < 2: - bc[s] = r - return r - - -@overload -def str_(b: str) -> str: - ... - - -@overload -def str_(b: bytes) -> str: - ... - - -def str_(b: Union[str, bytes]) -> str: - if isinstance(b, bytes): - return b.decode("latin-1") - else: - return b - - -@overload -def ord_(b: str) -> int: - ... - - -@overload -def ord_(b: bytes) -> bytes: - ... - - -@overload -def ord_(b: int) -> int: - ... - - -def ord_(b: Union[int, str, bytes]) -> Union[int, bytes]: - if isinstance(b, str): - return ord(b) - else: - return b - - -def hexencode(b: bytes) -> bytes: - - coder = getencoder("hex_codec") - coded = coder(b) # type: ignore - return coded[0] - - -def hex_str(num: int) -> str: - return hex(num).replace("L", "") - - -WHITESPACES = [b_(x) for x in [" ", "\n", "\r", "\t", "\x00"]] - - -def paeth_predictor(left: int, up: int, up_left: int) -> int: - p = left + up - up_left - dist_left = abs(p - left) - dist_up = abs(p - up) - dist_up_left = abs(p - up_left) - - if dist_left <= dist_up and dist_left <= dist_up_left: - return left - elif dist_up <= dist_up_left: - return up - else: - return up_left - - -def deprecate(msg: str, stacklevel: int = 3) -> None: - warnings.warn(msg, PendingDeprecationWarning, stacklevel=stacklevel) - - -def deprecate_with_replacement( - old_name: str, new_name: str, removed_in: str = "3.0.0" -) -> None: - deprecate(DEPR_MSG.format(old_name, new_name, removed_in), 4) - - -def deprecate_no_replacement(name: str, removed_in: str = "3.0.0") -> None: - deprecate(DEPR_MSG_NO_REPLACEMENT.format(name, removed_in), 4) +# Copyright (c) 2006, Mathieu Fenniak +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +""" +Utility functions for PDF library. +""" +__author__ = "Mathieu Fenniak" +__author_email__ = "biziqe@mathieu.fenniak.net" + +import os +import warnings +from codecs import getencoder +from io import ( + DEFAULT_BUFFER_SIZE, + BufferedReader, + BufferedWriter, + BytesIO, + FileIO, +) +from typing import Any, Dict, Optional, Tuple, Union, overload + +try: + # Python 3.10+: https://www.python.org/dev/peps/pep-0484/ + from typing import TypeAlias # type: ignore[attr-defined] +except ImportError: + from typing_extensions import TypeAlias # type: ignore[misc] + +from .errors import STREAM_TRUNCATED_PREMATURELY, PdfStreamError + +TransformationMatrixType: TypeAlias = Tuple[ + Tuple[float, float, float], Tuple[float, float, float], Tuple[float, float, float] +] +CompressedTransformationMatrix: TypeAlias = Tuple[ + float, float, float, float, float, float +] + +bytes_type = bytes # Works the same in Python 2.X and 3.X +StreamType = Union[BytesIO, BufferedReader, BufferedWriter, FileIO] +StrByteType = Union[str, StreamType] + +DEPR_MSG_NO_REPLACEMENT = "{} is deprecated and will be removed in PyPDF2 {}." +DEPR_MSG = "{} is deprecated and will be removed in PyPDF2 3.0.0. Use {} instead." + + +def read_until_whitespace(stream: StreamType, maxchars: Optional[int] = None) -> bytes: + """ + Reads non-whitespace characters and returns them. + Stops upon encountering whitespace or when maxchars is reached. + """ + txt = b_("") + while True: + tok = stream.read(1) + if tok.isspace() or not tok: + break + txt += tok + if len(txt) == maxchars: + break + return txt + + +def read_non_whitespace(stream: StreamType) -> bytes: + """ + Finds and reads the next non-whitespace character (ignores whitespace). + """ + tok = WHITESPACES[0] + while tok in WHITESPACES: + tok = stream.read(1) + return tok + + +def skip_over_whitespace(stream: StreamType) -> bool: + """ + Similar to readNonWhitespace, but returns a Boolean if more than + one whitespace character was read. + """ + tok = WHITESPACES[0] + cnt = 0 + while tok in WHITESPACES: + tok = stream.read(1) + cnt += 1 + return cnt > 1 + + +def skip_over_comment(stream: StreamType) -> None: + tok = stream.read(1) + stream.seek(-1, 1) + if tok == b_("%"): + while tok not in (b_("\n"), b_("\r")): + tok = stream.read(1) + + +def read_until_regex(stream: StreamType, regex: Any, ignore_eof: bool = False) -> bytes: + """ + Reads until the regular expression pattern matched (ignore the match) + :raises PdfStreamError: on premature end-of-file + :param bool ignore_eof: If true, ignore end-of-line and return immediately + :param regex: re.Pattern + """ + name = b_("") + while True: + tok = stream.read(16) + if not tok: + # stream has truncated prematurely + if ignore_eof: + return name + else: + raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY) + m = regex.search(tok) + if m is not None: + name += tok[: m.start()] + stream.seek(m.start() - len(tok), 1) + break + name += tok + return name + + +CRLF = b"\r\n" + + +def read_block_backwards(stream: StreamType, to_read: int) -> bytes: + """Given a stream at position X, read a block of size + to_read ending at position X. + The stream's position should be unchanged. + """ + if stream.tell() < to_read: + raise PdfStreamError("Could not read malformed PDF file") + # Seek to the start of the block we want to read. + stream.seek(-to_read, os.SEEK_CUR) + read = stream.read(to_read) + # Seek to the start of the block we read after reading it. + stream.seek(-to_read, os.SEEK_CUR) + if len(read) != to_read: + raise PdfStreamError(f"EOF: read {len(read)}, expected {to_read}?") + return read + + +def read_previous_line(stream: StreamType) -> bytes: + """Given a byte stream with current position X, return the previous + line - all characters between the first CR/LF byte found before X + (or, the start of the file, if no such byte is found) and position X + After this call, the stream will be positioned one byte after the + first non-CRLF character found beyond the first CR/LF byte before X, + or, if no such byte is found, at the beginning of the stream. + """ + line_content = [] + found_crlf = False + if stream.tell() == 0: + raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY) + while True: + to_read = min(DEFAULT_BUFFER_SIZE, stream.tell()) + if to_read == 0: + break + # Read the block. After this, our stream will be one + # beyond the initial position. + block = read_block_backwards(stream, to_read) + idx = len(block) - 1 + if not found_crlf: + # We haven't found our first CR/LF yet. + # Read off characters until we hit one. + while idx >= 0 and block[idx] not in CRLF: + idx -= 1 + if idx >= 0: + found_crlf = True + if found_crlf: + # We found our first CR/LF already (on this block or + # a previous one). + # Our combined line is the remainder of the block + # plus any previously read blocks. + line_content.append(block[idx + 1 :]) + # Continue to read off any more CRLF characters. + while idx >= 0 and block[idx] in CRLF: + idx -= 1 + else: + # Didn't find CR/LF yet - add this block to our + # previously read blocks and continue. + line_content.append(block) + if idx >= 0: + # We found the next non-CRLF character. + # Set the stream position correctly, then break + stream.seek(idx + 1, os.SEEK_CUR) + break + # Join all the blocks in the line (which are in reverse order) + return b"".join(line_content[::-1]) + + +def matrix_multiply( + a: TransformationMatrixType, b: TransformationMatrixType +) -> TransformationMatrixType: + return tuple( # type: ignore[return-value] + tuple(sum(float(i) * float(j) for i, j in zip(row, col)) for col in zip(*b)) + for row in a + ) + + +def mark_location(stream: StreamType) -> None: + """Creates text file showing current location in context.""" + # Mainly for debugging + radius = 5000 + stream.seek(-radius, 1) + with open("PyPDF2_pdfLocation.txt", "wb") as output_fh: + output_fh.write(stream.read(radius)) + output_fh.write(b"HERE") + output_fh.write(stream.read(radius)) + stream.seek(-radius, 1) + + +B_CACHE: Dict[Union[str, bytes], bytes] = {} + + +def b_(s: Union[str, bytes]) -> bytes: + bc = B_CACHE + if s in bc: + return bc[s] + if isinstance(s, bytes): + return s + else: + try: + r = s.encode("latin-1") + if len(s) < 2: + bc[s] = r + return r + except Exception: + r = s.encode("utf-8") + if len(s) < 2: + bc[s] = r + return r + + +@overload +def str_(b: str) -> str: + ... + + +@overload +def str_(b: bytes) -> str: + ... + + +def str_(b: Union[str, bytes]) -> str: + if isinstance(b, bytes): + return b.decode("latin-1") + else: + return b + + +@overload +def ord_(b: str) -> int: + ... + + +@overload +def ord_(b: bytes) -> bytes: + ... + + +@overload +def ord_(b: int) -> int: + ... + + +def ord_(b: Union[int, str, bytes]) -> Union[int, bytes]: + if isinstance(b, str): + return ord(b) + else: + return b + + +def hexencode(b: bytes) -> bytes: + + coder = getencoder("hex_codec") + coded = coder(b) # type: ignore + return coded[0] + + +def hex_str(num: int) -> str: + return hex(num).replace("L", "") + + +WHITESPACES = [b_(x) for x in [" ", "\n", "\r", "\t", "\x00"]] + + +def paeth_predictor(left: int, up: int, up_left: int) -> int: + p = left + up - up_left + dist_left = abs(p - left) + dist_up = abs(p - up) + dist_up_left = abs(p - up_left) + + if dist_left <= dist_up and dist_left <= dist_up_left: + return left + elif dist_up <= dist_up_left: + return up + else: + return up_left + + +def deprecate(msg: str, stacklevel: int = 3) -> None: + warnings.warn(msg, PendingDeprecationWarning, stacklevel=stacklevel) + + +def deprecate_with_replacement( + old_name: str, new_name: str, removed_in: str = "3.0.0" +) -> None: + deprecate(DEPR_MSG.format(old_name, new_name, removed_in), 4) + + +def deprecate_no_replacement(name: str, removed_in: str = "3.0.0") -> None: + deprecate(DEPR_MSG_NO_REPLACEMENT.format(name, removed_in), 4) diff --git a/tests/test_page.py b/tests/test_page.py index d56ef724b..2aee95103 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -9,6 +9,7 @@ from PyPDF2._page import PageObject from PyPDF2.constants import PageAttributes as PG from PyPDF2.generic import DictionaryObject, NameObject, RectangleObject + from . import get_pdf_from_url TESTS_ROOT = os.path.abspath(os.path.dirname(__file__)) diff --git a/tests/test_reader.py b/tests/test_reader.py index 53b4a6ca5..09797b15f 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -17,6 +17,7 @@ PdfReadWarning, ) from PyPDF2.filters import _xobj_to_image + from . import get_pdf_from_url TESTS_ROOT = os.path.abspath(os.path.dirname(__file__)) diff --git a/tests/test_workflows.py b/tests/test_workflows.py index c7dc4034a..b928b858a 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -7,6 +7,7 @@ from PyPDF2 import PdfReader from PyPDF2.constants import PageAttributes as PG + from . import get_pdf_from_url TESTS_ROOT = os.path.abspath(os.path.dirname(__file__)) @@ -144,7 +145,11 @@ def test_rotate_45(): "https://github.com/py-pdf/PyPDF2/files/3796761/17343_2008_Order_09-Jan-2019.pdf", [0, 1], ), - (True, "https://github.com/py-pdf/PyPDF2/files/8884471/ssi_manwaring.pdf", [0, 1]), + ( + True, + "https://github.com/py-pdf/PyPDF2/files/8884471/ssi_manwaring.pdf", + [0, 1], + ), (True, "https://github.com/py-pdf/PyPDF2/files/8884469/999092.pdf", [0, 1]), ( True, @@ -156,11 +161,7 @@ def test_rotate_45(): "https://github.com/py-pdf/PyPDF2/files/8884470/fdocuments.in_sweet-fundamentals-of-crystallography.pdf", [0, 1, 34, 35, 36, 118, 119, 120, 121], ), - ( - True, - "https://github.com/py-pdf/PyPDF2/files/8884493/998167.pdf", - [0] - ), + (True, "https://github.com/py-pdf/PyPDF2/files/8884493/998167.pdf", [0]), ], ) def test_extract_textbench(enable, url, pages, print_result=False):