diff --git a/.flake8 b/.flake8 index fbc73fdc2..95a1289f4 100644 --- a/.flake8 +++ b/.flake8 @@ -1,5 +1,7 @@ [flake8] # The flake8 config should work well with black, # see https://black.readthedocs.io/en/stable/guides/using_black_with_other_tools.html#flake8 -ignore = E203,E501,E741,W503,W604 +ignore = E203,E501,E741,W503,W604,N817,N814,VNE001,VNE002,VNE003,N802,SIM105,P101 exclude = build,sample-files +per-file-ignores = + tests/*: ASS001,PT011 diff --git a/.gitignore b/.gitignore index 9a3837b31..48a4771fa 100644 --- a/.gitignore +++ b/.gitignore @@ -29,3 +29,4 @@ Image9.png PyPDF2_pdfLocation.txt .python-version +tests/pdf_cache/ diff --git a/PyPDF2/_cmap.py b/PyPDF2/_cmap.py new file mode 100644 index 000000000..2d25f9e8d --- /dev/null +++ b/PyPDF2/_cmap.py @@ -0,0 +1,184 @@ +import warnings +from binascii import unhexlify +from typing import Any, Dict, List, Tuple, Union, cast + +from ._adobe_glyphs import adobe_glyphs +from .errors import PdfReadWarning +from .generic import DecodedStreamObject, DictionaryObject, charset_encoding + + +# code freely inspired from @twiggy ; see #711 +def build_char_map( + font_name: str, space_width: float, obj: DictionaryObject +) -> Tuple[str, float, Dict[int, str], Dict]: + ft: DictionaryObject = obj["/Resources"]["/Font"][font_name] # type: ignore + font_type: str = cast(str, ft["/Subtype"]) + + space_code = 32 + encoding, space_code = parse_encoding(ft, space_code) + map_dict, space_code = parse_to_unicode(ft, space_code) + sp_width = compute_space_width(ft, space_code, space_width) + + return ( + font_type, + float(sp_width / 2), + dict(zip(range(256), encoding)), + # https://github.com/python/mypy/issues/4374 + "".maketrans(map_dict), # type: ignore + ) + + +def parse_encoding(ft: DictionaryObject, space_code: int) -> Tuple[List[str], int]: + encoding: List[str] = [] + if "/Encoding" not in ft: + return encoding, space_code + enc: Union(str, DictionaryObject) = ft["/Encoding"].get_object() # type: ignore + if isinstance(enc, str): + try: + if enc in ("/Identity-H", "/Identity-V"): + encoding = [] + else: + encoding = charset_encoding[enc].copy() + except Exception: + warnings.warn( + f"Advanced encoding {encoding} not implemented yet", + PdfReadWarning, + ) + encoding = charset_encoding["/StandardCoding"].copy() + elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc: + try: + encoding = charset_encoding[cast(str, enc["/BaseEncoding"])].copy() + except Exception: + warnings.warn( + f"Advanced encoding {encoding} not implemented yet", + PdfReadWarning, + ) + encoding = charset_encoding["/StandardCoding"].copy() + else: + encoding = charset_encoding["/StandardCoding"].copy() + if "/Differences" in enc: + x = 0 + for o in cast(DictionaryObject, cast(DictionaryObject, enc)["/Differences"]): + if isinstance(o, int): + x = o + else: + try: + encoding[x] = adobe_glyphs[o] + except Exception: + encoding[x] = o + if o == " ": + space_code = x + x += 1 + return encoding, space_code + + +def parse_to_unicode(ft: DictionaryObject, space_code: int) -> Tuple[Dict, int]: + map_dict: Dict[Any, Any] = {} + if "/ToUnicode" not in ft: + return map_dict, space_code + process_rg: bool = False + process_char: bool = False + cm: str = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data().decode("utf-8") + for l in ( + cm.strip() + .replace("<", " ") + .replace(">", "") + .replace("[", " [ ") + .replace("]", " ] ") + .split("\n") + ): + if l == "": + continue + if "beginbfrange" in l: + process_rg = True + elif "endbfrange" in l: + process_rg = 
False + elif "beginbfchar" in l: + process_char = True + elif "endbfchar" in l: + process_char = False + elif process_rg: + lst = [x for x in l.split(" ") if x] + a = int(lst[0], 16) + b = int(lst[1], 16) + if lst[2] == "[": + for sq in lst[3:]: + if "]": + break + map_dict[a] = unhexlify(sq).decode("utf-16-be") + a += 1 + assert a > b + else: + c = int(lst[2], 16) + fmt = b"%%0%dX" % len(lst[2]) + while a <= b: + map_dict[a] = unhexlify(fmt % c).decode("utf-16-be") + a += 1 + c += 1 + elif process_char: + lst = [x for x in l.split(" ") if x] + a = int(lst[0], 16) + map_dict[a] = unhexlify("".join(lst[1:])).decode( + "utf-16-be" + ) # join is here as some cases where the code was split + + # get + for a in map_dict: + if map_dict[a] == " ": + space_code = a + return map_dict, space_code + + +def compute_space_width( + ft: DictionaryObject, space_code: int, space_width: float +) -> float: + sp_width: float = space_width * 2 # default value + w = [] + st: int = 0 + if "/W" in ft: + if "/DW" in ft: + sp_width = cast(float, ft["/DW"]) + w = list(ft["/W"]) # type: ignore + while len(w) > 0: + st = w[0] + second = w[1] + if isinstance(int, second): + if st <= space_code and space_code <= second: + sp_width = w[2] + break + w = w[3:] + if isinstance(list, second): + if st <= space_code and space_code <= st + len(second) - 1: + sp_width = second[space_code - st] + w = w[2:] + else: + warnings.warn( + "unknown widths : \n" + (ft["/W"]).__repr__(), + PdfReadWarning, + ) + break + if "/Widths" in ft: + w = list(ft["/Widths"]) # type: ignore + try: + st = cast(int, ft["/FirstChar"]) + en: int = cast(int, ft["/LastChar"]) + if st > space_code or en < space_code: + raise Exception("Not in range") + if w[space_code - st] == 0: + raise Exception("null width") + sp_width = w[space_code - st] + except Exception: + if "/FontDescriptor" in ft and "/MissingWidth" in cast( + DictionaryObject, ft["/FontDescriptor"] + ): + sp_width = ft["/FontDescriptor"]["/MissingWidth"] # type: ignore + else: + # will consider width of char as avg(width)/2 + m = 0 + cpt = 0 + for x in w: + if x > 0: + m += x + cpt += 1 + sp_width = m / max(1, cpt) / 2 + return sp_width diff --git a/PyPDF2/_merger.py b/PyPDF2/_merger.py index 64b53c001..abf56d86b 100644 --- a/PyPDF2/_merger.py +++ b/PyPDF2/_merger.py @@ -703,13 +703,13 @@ def add_bookmark( {NameObject("/C"): ArrayObject([FloatObject(c) for c in color])} ) - format = 0 + format_flag = 0 if italic: - format += 1 + format_flag += 1 if bold: - format += 2 - if format: - bookmark.update({NameObject("/F"): NumberObject(format)}) + format_flag += 2 + if format_flag: + bookmark.update({NameObject("/F"): NumberObject(format_flag)}) bookmark_ref = self.output._add_object(bookmark) parent = cast(Bookmark, parent.get_object()) diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py index 8b3367e1b..ae44506e6 100644 --- a/PyPDF2/_page.py +++ b/PyPDF2/_page.py @@ -30,7 +30,6 @@ import math import uuid import warnings -from binascii import unhexlify from decimal import Decimal from math import sqrt from typing import ( @@ -46,7 +45,7 @@ cast, ) -from ._adobe_glyphs import adobe_glyphs +from ._cmap import build_char_map from ._utils import ( CompressedTransformationMatrix, TransformationMatrixType, @@ -60,7 +59,6 @@ from .generic import ( ArrayObject, ContentStream, - DecodedStreamObject, DictionaryObject, EncodedStreamObject, FloatObject, @@ -70,7 +68,6 @@ NumberObject, RectangleObject, TextStringObject, - charset_encoding, ) @@ -274,7 +271,7 @@ def create_blank_page( width = lastpage.mediabox.width 
height = lastpage.mediabox.height else: - raise PageSizeNotDefinedError() + raise PageSizeNotDefinedError page.__setitem__( NameObject(PG.MEDIABOX), RectangleObject((0, 0, width, height)) # type: ignore ) @@ -542,50 +539,55 @@ def _merge_page( # if expanding the page to fit a new page, calculate the new media box size if expand: - corners1 = [ - self.mediabox.left.as_numeric(), - self.mediabox.bottom.as_numeric(), - self.mediabox.right.as_numeric(), - self.mediabox.top.as_numeric(), - ] - corners2 = [ - page2.mediabox.left.as_numeric(), - page2.mediabox.bottom.as_numeric(), - page2.mediabox.left.as_numeric(), - page2.mediabox.top.as_numeric(), - page2.mediabox.right.as_numeric(), - page2.mediabox.top.as_numeric(), - page2.mediabox.right.as_numeric(), - page2.mediabox.bottom.as_numeric(), - ] - if ctm is not None: - ctm = tuple(float(x) for x in ctm) # type: ignore[assignment] - new_x = [ - ctm[0] * corners2[i] + ctm[2] * corners2[i + 1] + ctm[4] - for i in range(0, 8, 2) - ] - new_y = [ - ctm[1] * corners2[i] + ctm[3] * corners2[i + 1] + ctm[5] - for i in range(0, 8, 2) - ] - else: - new_x = corners2[0:8:2] - new_y = corners2[1:8:2] - lowerleft = (min(new_x), min(new_y)) - upperright = (max(new_x), max(new_y)) - lowerleft = (min(corners1[0], lowerleft[0]), min(corners1[1], lowerleft[1])) - upperright = ( - max(corners1[2], upperright[0]), - max(corners1[3], upperright[1]), - ) - - self.mediabox.lower_left = lowerleft - self.mediabox.upper_right = upperright + self._expand_mediabox(page2, ctm) self[NameObject(PG.CONTENTS)] = ContentStream(new_content_array, self.pdf) self[NameObject(PG.RESOURCES)] = new_resources self[NameObject(PG.ANNOTS)] = new_annots + def _expand_mediabox( + self, page2: "PageObject", ctm: Optional[CompressedTransformationMatrix] + ) -> None: + corners1 = [ + self.mediabox.left.as_numeric(), + self.mediabox.bottom.as_numeric(), + self.mediabox.right.as_numeric(), + self.mediabox.top.as_numeric(), + ] + corners2 = [ + page2.mediabox.left.as_numeric(), + page2.mediabox.bottom.as_numeric(), + page2.mediabox.left.as_numeric(), + page2.mediabox.top.as_numeric(), + page2.mediabox.right.as_numeric(), + page2.mediabox.top.as_numeric(), + page2.mediabox.right.as_numeric(), + page2.mediabox.bottom.as_numeric(), + ] + if ctm is not None: + ctm = tuple(float(x) for x in ctm) # type: ignore[assignment] + new_x = [ + ctm[0] * corners2[i] + ctm[2] * corners2[i + 1] + ctm[4] + for i in range(0, 8, 2) + ] + new_y = [ + ctm[1] * corners2[i] + ctm[3] * corners2[i + 1] + ctm[5] + for i in range(0, 8, 2) + ] + else: + new_x = corners2[0:8:2] + new_y = corners2[1:8:2] + lowerleft = (min(new_x), min(new_y)) + upperright = (max(new_x), max(new_y)) + lowerleft = (min(corners1[0], lowerleft[0]), min(corners1[1], lowerleft[1])) + upperright = ( + max(corners1[2], upperright[0]), + max(corners1[3], upperright[1]), + ) + + self.mediabox.lower_left = lowerleft + self.mediabox.upper_right = upperright + def mergeTransformedPage( self, page2: "PageObject", @@ -984,7 +986,9 @@ def compressContentStreams(self) -> None: # pragma: no cover deprecate_with_replacement("compressContentStreams", "compress_content_streams") self.compress_content_streams() - def _extract_text_old(self, Tj_sep: str = "", TJ_sep: str = "") -> str: + def _extract_text_old( + self, Tj_sep: str = "", TJ_sep: str = "" + ) -> str: # pragma: no cover """ Locate all text drawing commands, in the order they are provided in the content stream, and extract the text. 
This works well for some PDF @@ -1006,28 +1010,20 @@ def _extract_text_old(self, Tj_sep: str = "", TJ_sep: str = "") -> str: space_scale = 1.0 for operands, operator in content.operations: - if operator == b"Tf": # text font - pass - elif operator == b"Tfs": # text font size - pass - elif operator == b"Tc": # character spacing - # See '5.2.1 Character Spacing' + # Missing operators: + # Tf: text font + # Tfs: text font size + # Tc: '5.2.1 Character Spacing' + # Th: '5.2.3 Horizontal Scaling' + # Tl: '5.2.4 Leading' + # Tmode: '5.2.5 Text Rendering Mode' + # Trise: '5.2.6 Text Rise' + + if operator in [b"Tf", b"Tfs", b"Tc", b"Th", b"Tl", b"Tmode"]: pass elif operator == b"Tw": # word spacing # See '5.2.2 Word Spacing' space_scale = 1.0 + float(operands[0]) - elif operator == b"Th": # horizontal scaling - # See '5.2.3 Horizontal Scaling' - pass - elif operator == b"Tl": # leading - # See '5.2.4 Leading' - pass - elif operator == b"Tmode": # text rendering mode - # See '5.2.5 Text Rendering Mode' - pass - elif operator == b"Trise": # text rise - # See '5.2.6 Text Rise' - pass elif operator == b"Tj": # See 'TABLE 5.6 Text-showing operators' _text = operands[0] @@ -1116,170 +1112,6 @@ def _extract_text( default = "/Content" :return: a string object. """ - # code freely inspired from @twiggy ; see #711 - def buildCharMap(font_name: str) -> Tuple[str, float, Dict, Dict]: - map_dict: Any = {} - process_rg: bool = False - process_char: bool = False - encoding: List[str] = [] - ft: DictionaryObject = obj["/Resources"]["/Font"][font_name] # type: ignore - font_type: str = cast(str, ft["/Subtype"]) - sp_width: float = space_width * 2 # default value - w = [] - # encoding - space_code = 32 - if "/Encoding" in ft: - enc: Union(str, DictionaryObject) = ft["/Encoding"].get_object() # type: ignore - if isinstance(enc, str): - try: - if enc in ("/Identity-H", "/Identity-V"): - encoding = [] - else: - encoding = charset_encoding[enc].copy() - except Exception: - warnings.warn( - f"Advanced encoding {encoding} not implemented yet", - PdfReadWarning, - ) - encoding = charset_encoding["/StandardCoding"].copy() - elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc: - try: - encoding = charset_encoding[ - cast(str, enc["/BaseEncoding"]) - ].copy() - except Exception: - warnings.warn( - f"Advanced encoding {encoding} not implemented yet", - PdfReadWarning, - ) - encoding = charset_encoding["/StandardCoding"].copy() - else: - encoding = charset_encoding["/StandardCoding"].copy() - if "/Differences" in enc: - x = 0 - for o in cast( - DictionaryObject, cast(DictionaryObject, enc)["/Differences"] - ): - if isinstance(o, int): - x = o - else: - try: - encoding[x] = adobe_glyphs[o] - except Exception: - encoding[x] = o - if o == " ": - space_code = x - x += 1 - if "/ToUnicode" in ft: - cm: str = ( - cast(DecodedStreamObject, ft["/ToUnicode"]) - .get_data() - .decode("utf-8") - ) - for l in ( - cm.strip() - .replace("<", " ") - .replace(">", "") - .replace("[", " [ ") - .replace("]", " ] ") - .split("\n") - ): - if l == "": - continue - if "beginbfrange" in l: - process_rg = True - elif "endbfrange" in l: - process_rg = False - elif "beginbfchar" in l: - process_char = True - elif "endbfchar" in l: - process_char = False - elif process_rg: - lst = [x for x in l.split(" ") if x] - a = int(lst[0], 16) - b = int(lst[1], 16) - if lst[2] == "[": - # lst = lst[3:].trim(' []').split(' ') - for sq in lst[3:]: - if "]": - break - map_dict[a] = unhexlify(sq).decode("utf-16-be") - a += 1 - assert a > b - else: - c = 
int(lst[2], 16) - fmt = b"%%0%dX" % len(lst[2]) - while a <= b: - map_dict[a] = unhexlify(fmt % c).decode("utf-16-be") - a += 1 - c += 1 - elif process_char: - lst = [x for x in l.split(" ") if x] - a = int(lst[0], 16) - map_dict[a] = unhexlify("".join(lst[1:])).decode( - "utf-16-be" - ) # join is here as some cases where the code was split - - # get - for a in map_dict: - if map_dict[a] == " ": - space_code = a - - # compute space width - st: int = 0 # declaration for mypy - if "/W" in ft: - if "/DW" in ft: - sp_width = cast(float, ft["/DW"]) - w = [x for x in ft["/W"]] # type: ignore - while len(w) > 0: - st = w[0] - second = w[1] - if isinstance(int, second): - if st <= space_code and space_code <= second: - sp_width = w[2] - break - w = w[3:] - if isinstance(list, second): - if st <= space_code and space_code <= st + len(second) - 1: - sp_width = second[space_code - st] - w = w[2:] - else: - warnings.warn( - "unknown widths : \n" + (ft["/W"]).__repr__(), - PdfReadWarning, - ) - break - if "/Widths" in ft: - w = [x for x in ft["/Widths"]] # type: ignore - try: - st = cast(int, ft["/FirstChar"]) - en: int = cast(int, ft["/LastChar"]) - if st > space_code or en < space_code: - raise Exception("Not in range") - if w[space_code - st] == 0: - raise Exception("null width") - sp_width = w[space_code - st] - except Exception: - if "/FontDescriptor" in ft and "/MissingWidth" in cast( - DictionaryObject, ft["/FontDescriptor"] - ): - sp_width = ft["/FontDescriptor"]["/MissingWidth"] # type: ignore - else: - # will consider width of char as avg(width)/2 - m = 0 - cpt = 0 - for x in w: - if x > 0: - m += x - cpt += 1 - sp_width = m / max(1, cpt) / 2 - - return ( - font_type, - float(sp_width / 2), - dict(zip(range(256), encoding)), - "".maketrans(map_dict), - ) text: str = "" output: str = "" @@ -1287,7 +1119,7 @@ def buildCharMap(font_name: str) -> Tuple[str, float, Dict, Dict]: resources_dict = cast(DictionaryObject, obj["/Resources"]) if "/Font" in resources_dict: for f in cast(DictionaryObject, resources_dict["/Font"]): - cmaps[f] = buildCharMap(f) + cmaps[f] = build_char_map(f, space_width, obj) cmap: Union[str, Dict[int, str]] = {} content = obj[content_key].get_object() if isinstance(content_key, str) else obj if not isinstance(content, ContentStream): diff --git a/PyPDF2/_reader.py b/PyPDF2/_reader.py index e82d8b0e3..45173c415 100644 --- a/PyPDF2/_reader.py +++ b/PyPDF2/_reader.py @@ -393,7 +393,7 @@ def _get_page(self, page_number: int) -> PageObject: """ Retrieves a page by number from this PDF file. - :param int pageNumber: The page number to retrieve + :param int page_number: The page number to retrieve (pages begin at zero) :return: a :class:`PageObject` instance. 
:rtype: :class:`PageObject` @@ -498,7 +498,7 @@ def _build_field( field: Union[TreeObject, DictionaryObject], retval: Dict[Any, Any], fileobj: Any, - fieldAttributes: Any, + field_attributes: Any, ) -> None: self._check_kids(field, retval, fileobj) try: @@ -510,7 +510,7 @@ def _build_field( # Ignore no-name field for now return if fileobj: - self._write_field(fileobj, field, fieldAttributes) + self._write_field(fileobj, field, field_attributes) fileobj.write("\n") retval[key] = Field(field) @@ -913,7 +913,7 @@ def _flatten( inherit: Optional[Dict[str, Any]] = None, indirect_ref: Optional[IndirectObject] = None, ) -> None: - inheritablePageAttributes = ( + inheritable_page_attributes = ( NameObject(PG.RESOURCES), NameObject(PG.MEDIABOX), NameObject(PG.CROPBOX), @@ -933,7 +933,7 @@ def _flatten( t = pages[PA.TYPE] # type: ignore if t == "/Pages": - for attr in inheritablePageAttributes: + for attr in inheritable_page_attributes: if attr in pages: inherit[attr] = pages[attr] for page in pages[PA.KIDS]: # type: ignore @@ -980,8 +980,11 @@ def _get_object_from_stream( if self.strict and idx != i: raise PdfReadError("Object is in wrong index.") stream_data.seek(int(obj_stm["/First"] + offset), 0) # type: ignore - read_non_whitespace(stream_data) # to cope with some case where the 'pointer' is on a white space + + # to cope with some case where the 'pointer' is on a white space + read_non_whitespace(stream_data) stream_data.seek(-1, 1) + try: obj = read_object(stream_data, self) except PdfStreamError as exc: @@ -1212,10 +1215,10 @@ def read(self, stream: StreamType) -> None: ) ) stream.seek(-1, 2) - last1M = stream.tell() - 1024 * 1024 + 1 # offset of last MB of stream + last_mb = stream.tell() - 1024 * 1024 + 1 # offset of last MB of stream line = b_("") while line[:5] != b_("%%EOF"): - if stream.tell() < last1M: + if stream.tell() < last_mb: raise PdfReadError("EOF marker not found") line = self.read_next_end_line(stream) diff --git a/PyPDF2/_utils.py b/PyPDF2/_utils.py index 9ae1bb582..61e864697 100644 --- a/PyPDF2/_utils.py +++ b/PyPDF2/_utils.py @@ -31,10 +31,10 @@ __author__ = "Mathieu Fenniak" __author_email__ = "biziqe@mathieu.fenniak.net" +import warnings from codecs import getencoder from io import BufferedReader, BufferedWriter, BytesIO, FileIO from typing import Any, Dict, Optional, Tuple, Union, overload -import warnings try: # Python 3.10+: https://www.python.org/dev/peps/pep-0484/ diff --git a/PyPDF2/_writer.py b/PyPDF2/_writer.py index 95648d9fa..e38af4d74 100644 --- a/PyPDF2/_writer.py +++ b/PyPDF2/_writer.py @@ -215,7 +215,7 @@ def insertPage(self, page: PageObject, index: int = 0) -> None: # pragma: no co deprecate_with_replacement("insertPage", "insert_page") self.insert_page(page, index) - def get_page(self, pageNumber: int) -> PageObject: + def get_page(self, pageNumber: int) -> PageObject: # TODO: PEP8 """ Retrieve a page by number from this PDF file. 
@@ -453,8 +453,8 @@ def add_attachment(self, filename: str, data: Union[str, bytes]) -> None: endobj """ - embeddedFilesNamesDictionary = DictionaryObject() - embeddedFilesNamesDictionary.update( + embedded_files_names_dictionary = DictionaryObject() + embedded_files_names_dictionary.update( { NameObject(CA.NAMES): ArrayObject( [createStringObject(filename), filespec] @@ -462,12 +462,12 @@ def add_attachment(self, filename: str, data: Union[str, bytes]) -> None: } ) - embeddedFilesDictionary = DictionaryObject() - embeddedFilesDictionary.update( - {NameObject("/EmbeddedFiles"): embeddedFilesNamesDictionary} + embedded_files_dictionary = DictionaryObject() + embedded_files_dictionary.update( + {NameObject("/EmbeddedFiles"): embedded_files_names_dictionary} ) # Update the root - self._root_object.update({NameObject(CA.NAMES): embeddedFilesDictionary}) + self._root_object.update({NameObject(CA.NAMES): embedded_files_dictionary}) def addAttachment( self, fname: str, fdata: Union[str, bytes] @@ -1097,13 +1097,13 @@ def add_bookmark( {NameObject("/C"): ArrayObject([FloatObject(c) for c in color])} ) - format = 0 + format_flag = 0 if italic: - format += 1 + format_flag += 1 if bold: - format += 2 - if format: - bookmark.update({NameObject("/F"): NumberObject(format)}) + format_flag += 2 + if format_flag: + bookmark.update({NameObject("/F"): NumberObject(format_flag)}) bookmark_ref = self._add_object(bookmark) @@ -1248,9 +1248,10 @@ def remove_images(self, ignore_byte_string_object: bool = False) -> None: for operands, operator in content.operations: if operator in [b_("Tj"), b_("'")]: text = operands[0] - if ignore_byte_string_object: - if not isinstance(text, TextStringObject): - operands[0] = TextStringObject() + if ignore_byte_string_object and not isinstance( + text, TextStringObject + ): + operands[0] = TextStringObject() elif operator == b_('"'): text = operands[2] if ignore_byte_string_object and not isinstance( diff --git a/PyPDF2/filters.py b/PyPDF2/filters.py index 6bf4619c3..ca21f4aec 100644 --- a/PyPDF2/filters.py +++ b/PyPDF2/filters.py @@ -76,7 +76,9 @@ def compress(data: bytes) -> bytes: class FlateDecode: @staticmethod def decode( - data: bytes, decodeParms: Union[None, ArrayObject, DictionaryObject] + # TODO: PEP8 + data: bytes, + decodeParms: Union[None, ArrayObject, DictionaryObject], ) -> bytes: """ :param data: flate-encoded data. 
@@ -90,9 +92,9 @@ def decode( if decodeParms: try: if isinstance(decodeParms, ArrayObject): - for decodeParm in decodeParms: - if "/Predictor" in decodeParm: - predictor = decodeParm["/Predictor"] + for decode_parm in decodeParms: + if "/Predictor" in decode_parm: + predictor = decode_parm["/Predictor"] else: predictor = decodeParms.get("/Predictor", 1) except AttributeError: @@ -103,9 +105,9 @@ def decode( # §7.4.4.3 LZWDecode and FlateDecode Parameters, Table 8 if isinstance(decodeParms, ArrayObject): columns = 1 - for decodeParm in decodeParms: - if "/Columns" in decodeParm: - columns = decodeParm["/Columns"] + for decode_parm in decodeParms: + if "/Columns" in decode_parm: + columns = decode_parm["/Columns"] else: columns = 1 if decodeParms is None else decodeParms.get(LZW.COLUMNS, 1) @@ -169,7 +171,9 @@ class ASCIIHexDecode: @staticmethod def decode( - data: str, decodeParms: Union[None, ArrayObject, DictionaryObject] = None + # TODO: PEP8 + data: str, + decodeParms: Union[None, ArrayObject, DictionaryObject] = None, ) -> str: """ :param data: a str sequence of hexadecimal-encoded values to be @@ -285,7 +289,9 @@ def decode(self) -> str: @staticmethod def decode( - data: bytes, decodeParms: Union[None, ArrayObject, DictionaryObject] = None + # TODO: PEP8 + data: bytes, + decodeParms: Union[None, ArrayObject, DictionaryObject] = None, ) -> str: """ :param data: ``bytes`` or ``str`` text to decode. @@ -384,11 +390,11 @@ def _get_parameters( columns = 0 if parameters: if isinstance(parameters, ArrayObject): - for decodeParm in parameters: - if CCITT.COLUMNS in decodeParm: - columns = decodeParm[CCITT.COLUMNS] - if CCITT.K in decodeParm: - k = decodeParm[CCITT.K] + for decode_parm in parameters: + if CCITT.COLUMNS in decode_parm: + columns = decode_parm[CCITT.COLUMNS] + if CCITT.K in decode_parm: + k = decode_parm[CCITT.K] else: columns = parameters[CCITT.COLUMNS] # type: ignore k = parameters[CCITT.K] # type: ignore @@ -398,6 +404,7 @@ def _get_parameters( @staticmethod def decode( data: bytes, + # TODO: PEP8 decodeParms: Union[None, ArrayObject, DictionaryObject] = None, height: int = 0, ) -> bytes: @@ -460,25 +467,25 @@ def decodeStreamData(stream: Any) -> Union[str, bytes]: # utils.StreamObject data: bytes = stream._data # If there is not data to decode we should not try to decode the data. 
if data: - for filterType in filters: - if filterType == FT.FLATE_DECODE or filterType == FTA.FL: + for filter_type in filters: + if filter_type == FT.FLATE_DECODE or filter_type == FTA.FL: data = FlateDecode.decode(data, stream.get(SA.DECODE_PARMS)) - elif filterType == FT.ASCII_HEX_DECODE or filterType == FTA.AHx: + elif filter_type == FT.ASCII_HEX_DECODE or filter_type == FTA.AHx: data = ASCIIHexDecode.decode(data) # type: ignore - elif filterType == FT.LZW_DECODE or filterType == FTA.LZW: + elif filter_type == FT.LZW_DECODE or filter_type == FTA.LZW: data = LZWDecode.decode(data, stream.get(SA.DECODE_PARMS)) # type: ignore - elif filterType == FT.ASCII_85_DECODE or filterType == FTA.A85: + elif filter_type == FT.ASCII_85_DECODE or filter_type == FTA.A85: data = ASCII85Decode.decode(data) - elif filterType == FT.DCT_DECODE: + elif filter_type == FT.DCT_DECODE: data = DCTDecode.decode(data) - elif filterType == "/JPXDecode": + elif filter_type == "/JPXDecode": data = JPXDecode.decode(data) - elif filterType == FT.CCITT_FAX_DECODE: + elif filter_type == FT.CCITT_FAX_DECODE: height = stream.get(IA.HEIGHT, ()) data = CCITTFaxDecode.decode(data, stream.get(SA.DECODE_PARMS), height) - elif filterType == "/Crypt": - decodeParms = stream.get(SA.DECODE_PARMS, {}) - if "/Name" not in decodeParms and "/Type" not in decodeParms: + elif filter_type == "/Crypt": + decode_parms = stream.get(SA.DECODE_PARMS, {}) + if "/Name" not in decode_parms and "/Type" not in decode_parms: pass else: raise NotImplementedError( @@ -486,7 +493,7 @@ def decodeStreamData(stream: Any) -> Union[str, bytes]: # utils.StreamObject ) else: # Unsupported filter - raise NotImplementedError("unsupported filter %s" % filterType) + raise NotImplementedError("unsupported filter %s" % filter_type) return data diff --git a/PyPDF2/generic.py b/PyPDF2/generic.py index c81e6da5f..667f60136 100644 --- a/PyPDF2/generic.py +++ b/PyPDF2/generic.py @@ -84,7 +84,7 @@ def getObject(self) -> Optional["PdfObject"]: # pragma: no cover def write_to_stream( self, stream: StreamType, encryption_key: Union[None, str, bytes] ) -> None: - raise NotImplementedError() + raise NotImplementedError class NullObject(PdfObject): @@ -555,7 +555,7 @@ def writeToStream( class NameObject(str, PdfObject): - delimiterPattern = re.compile(b_(r"\s+|[\(\)<>\[\]{}/%]")) + delimiter_pattern = re.compile(b_(r"\s+|[\(\)<>\[\]{}/%]")) surfix = b_("/") def write_to_stream( @@ -574,21 +574,21 @@ def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject": # PdfReader name = stream.read(1) if name != NameObject.surfix: raise PdfReadError("name read error") - name += read_until_regex(stream, NameObject.delimiterPattern, ignore_eof=True) + name += read_until_regex(stream, NameObject.delimiter_pattern, ignore_eof=True) try: try: ret = name.decode("utf-8") except (UnicodeEncodeError, UnicodeDecodeError): ret = name.decode("gbk") return NameObject(ret) - except (UnicodeEncodeError, UnicodeDecodeError): + except (UnicodeEncodeError, UnicodeDecodeError) as e: # Name objects should represent irregular characters # with a '#' followed by the symbol's hex number if not pdf.strict: warnings.warn("Illegal character in Name Object", PdfReadWarning) return NameObject(name) else: - raise PdfReadError("Illegal character in Name Object") + raise PdfReadError("Illegal character in Name Object") from e @staticmethod def readFromStream( @@ -687,22 +687,20 @@ def read_from_stream( forced_encoding: Union[None, str, List[str], Dict[int, str]] = None, ) -> "DictionaryObject": def 
get_next_obj_pos( - p: int, p1: int, remGens: List[int], pdf: Any + p: int, p1: int, rem_gens: List[int], pdf: Any ) -> int: # PdfReader - l = pdf.xref[remGens[0]] + l = pdf.xref[rem_gens[0]] for o in l: if p1 > l[o] and p < l[o]: p1 = l[o] - if len(remGens) == 1: + if len(rem_gens) == 1: return p1 else: - return get_next_obj_pos(p, p1, remGens[1:], pdf) + return get_next_obj_pos(p, p1, rem_gens[1:], pdf) def read_unsized_from_steam(stream: StreamType, pdf: Any) -> bytes: # PdfReader # we are just pointing at beginning of the stream - eon = ( - get_next_obj_pos(stream.tell(), 2**32, [g for g in pdf.xref], pdf) - 1 - ) + eon = get_next_obj_pos(stream.tell(), 2**32, list(pdf.xref), pdf) - 1 curr = stream.tell() rw = stream.read(eon - stream.tell()) p = rw.find(b_("endstream")) @@ -895,8 +893,8 @@ def remove_child(self, child: Any) -> None: if NameObject("/Next") in cur: # Removing first tree node next_ref = cur[NameObject("/Next")] - next = next_ref.get_object() - del next[NameObject("/Prev")] + next_obj = next_ref.get_object() + del next_obj[NameObject("/Prev")] self[NameObject("/First")] = next_ref self[NameObject("/Count")] -= 1 # type: ignore @@ -911,8 +909,8 @@ def remove_child(self, child: Any) -> None: if NameObject("/Next") in cur: # Removing middle tree node next_ref = cur[NameObject("/Next")] - next = next_ref.get_object() - next[NameObject("/Prev")] = prev_ref + next_obj = next_ref.get_object() + next_obj[NameObject("/Prev")] = prev_ref prev[NameObject("/Next")] = next_ref self[NameObject("/Count")] -= 1 else: @@ -1129,7 +1127,7 @@ def __parseContentStream(self, stream: StreamType) -> None: break stream.seek(-1, 1) if peek.isalpha() or peek == b_("'") or peek == b_('"'): - operator = read_until_regex(stream, NameObject.delimiterPattern, True) + operator = read_until_regex(stream, NameObject.delimiter_pattern, True) if operator == b_("BI"): # begin inline image - a completely different parsing # mechanism is required, of course... thanks buddy... diff --git a/PyPDF2/pagerange.py b/PyPDF2/pagerange.py index cd5ab5660..c9090d0f3 100644 --- a/PyPDF2/pagerange.py +++ b/PyPDF2/pagerange.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """ Representation and utils for ranges of PDF file pages. diff --git a/PyPDF2/xmp.py b/PyPDF2/xmp.py index 2364cac25..f2deef1e1 100644 --- a/PyPDF2/xmp.py +++ b/PyPDF2/xmp.py @@ -370,6 +370,7 @@ def _get_text(self, element: XmlElement) -> str: The name of the tool that created the PDF document. """ + # TODO: PEP8 xmp_createDate = property( _getter_single(XMP_NAMESPACE, "CreateDate", _converter_date) ) @@ -378,6 +379,7 @@ def _get_text(self, element: XmlElement) -> str: time are returned as a UTC datetime.datetime object. """ + # TODO: PEP8 xmp_modifyDate = property( _getter_single(XMP_NAMESPACE, "ModifyDate", _converter_date) ) @@ -386,6 +388,7 @@ def _get_text(self, element: XmlElement) -> str: are returned as a UTC datetime.datetime object. """ + # TODO: PEP8 xmp_metadataDate = property( _getter_single(XMP_NAMESPACE, "MetadataDate", _converter_date) ) @@ -395,16 +398,19 @@ def _get_text(self, element: XmlElement) -> str: object. """ + # TODO: PEP8 xmp_creatorTool = property(_getter_single(XMP_NAMESPACE, "CreatorTool")) """ The name of the first known tool used to create the resource. """ + # TODO: PEP8 xmpmm_documentId = property(_getter_single(XMPMM_NAMESPACE, "DocumentID")) """ The common identifier for all versions and renditions of this resource. 
""" + # TODO: PEP8 xmpmm_instanceId = property(_getter_single(XMPMM_NAMESPACE, "InstanceID")) """ An identifier for a specific incarnation of a document, updated each diff --git a/setup.py b/setup.py old mode 100644 new mode 100755 diff --git a/tests/__init__.py b/tests/__init__.py index e69de29bb..3ec9b8785 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -0,0 +1,27 @@ +import os +import urllib.request + + +def get_pdf_from_url(url: str, name: str) -> bytes: + """ + Download a PDF from a URL and return its contents. + + This function makes sure the PDF is not downloaded too often. + This function is a last resort for PDF files where we are uncertain if + we may add it for testing purposes to https://github.com/py-pdf/sample-files + + :param str url: location of the PDF file + :param str name: unique name accross all files + """ + cache_dir = os.path.join(os.path.dirname(__file__), "pdf_cache") + if not os.path.exists(cache_dir): + os.mkdir(cache_dir) + cache_path = os.path.join(cache_dir, name) + if not os.path.exists(cache_path): + with urllib.request.urlopen(url) as response, open( + cache_path, "wb" + ) as out_file: + out_file.write(response.read()) + with open(cache_path, "rb") as fp: + data = fp.read() + return data diff --git a/tests/bench.py b/tests/bench.py index d10fb767c..d8f526ed9 100644 --- a/tests/bench.py +++ b/tests/bench.py @@ -129,5 +129,5 @@ def text_extraction(pdf_path): @pytest.mark.filterwarnings("ignore::PyPDF2.errors.PdfReadWarning") def test_text_extraction(benchmark): - file = os.path.join(SAMPLE_ROOT, "009-pdflatex-geotopo/GeoTopo.pdf") - benchmark(text_extraction, file) + file_path = os.path.join(SAMPLE_ROOT, "009-pdflatex-geotopo/GeoTopo.pdf") + benchmark(text_extraction, file_path) diff --git a/tests/test_basic_features.py b/tests/test_basic_features.py index 541334a6c..5a6d23bfd 100644 --- a/tests/test_basic_features.py +++ b/tests/test_basic_features.py @@ -1,7 +1,5 @@ import os -import pytest - from PyPDF2 import PdfReader, PdfWriter TESTS_ROOT = os.path.abspath(os.path.dirname(__file__)) @@ -22,26 +20,20 @@ def test_basic_features(): # add page 2 from input1, but rotated clockwise 90 degrees writer.add_page(reader.pages[0].rotate(90)) - # add page 3 from input1, rotated the other way: - with pytest.warns(PendingDeprecationWarning): - rotated = reader.pages[0].rotateCounterClockwise(90) - writer.add_page(rotated) - # alt: output.addPage(input1.pages[0].rotate(270)) - - # add page 4 from input1, but first add a watermark from another PDF: - page4 = reader.pages[0] + # add page 3 from input1, but first add a watermark from another PDF: + page3 = reader.pages[0] watermark_pdf = pdf_path watermark = PdfReader(watermark_pdf) - page4.merge_page(watermark.pages[0]) - writer.add_page(page4) + page3.merge_page(watermark.pages[0]) + writer.add_page(page3) - # add page 5 from input1, but crop it to half size: - page5 = reader.pages[0] - page5.mediabox.upper_right = ( - page5.mediabox.right / 2, - page5.mediabox.top / 2, + # add page 4 from input1, but crop it to half size: + page4 = reader.pages[0] + page4.mediabox.upper_right = ( + page4.mediabox.right / 2, + page4.mediabox.top / 2, ) - writer.add_page(page5) + writer.add_page(page4) # add some Javascript to launch the print window on opening this PDF. 
# the password dialog may prevent the print dialog from being shown, diff --git a/tests/test_filters.py b/tests/test_filters.py index db9f8078a..b2b6e1be0 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -54,7 +54,7 @@ def test_FlateDecode_unsupported_predictor(): @pytest.mark.parametrize( - ("input", "expected"), + ("data", "expected"), [ (">", ""), ( @@ -88,7 +88,7 @@ def test_FlateDecode_unsupported_predictor(): "whitespace", ], ) -def test_ASCIIHexDecode(input, expected): +def test_ASCIIHexDecode(data, expected): """ Feeds a bunch of values to ASCIIHexDecode.decode() and ensures the correct output is returned. @@ -97,7 +97,7 @@ def test_ASCIIHexDecode(input, expected): is currently raised.) """ - assert ASCIIHexDecode.decode(input) == expected + assert ASCIIHexDecode.decode(data) == expected def test_ASCIIHexDecode_no_eod(): diff --git a/tests/test_generic.py b/tests/test_generic.py index a56972690..2bb36bb4b 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -137,7 +137,6 @@ def test_readStringFromStream_not_in_escapedict_no_digit(): with pytest.raises(PdfReadError) as exc: readStringFromStream(stream) assert exc.value.args[0] == "Stream has ended unexpectedly" - # "Unexpected escaped string: y" def test_readStringFromStream_multichar_eol(): @@ -243,8 +242,6 @@ def test_DictionaryObject_key_is_no_pdfobject(): def test_DictionaryObject_xmp_meta(): do = DictionaryObject({NameObject("/S"): NameObject("/GoTo")}) assert do.xmp_metadata is None - with pytest.warns(PendingDeprecationWarning): - assert do.xmpMetadata is None def test_DictionaryObject_value_is_no_pdfobject(): @@ -311,10 +308,10 @@ def test_DictionaryObject_read_from_stream_stream_no_newline(): def test_DictionaryObject_read_from_stream_stream_no_stream_length(strict): stream = BytesIO(b"<< /S /GoTo >>stream\n") - class tst: # to replace pdf + class Tst: # to replace pdf strict = False - pdf = tst() + pdf = Tst() pdf.strict = strict with pytest.raises(PdfReadError) as exc: DictionaryObject.read_from_stream(stream, pdf) @@ -322,7 +319,7 @@ class tst: # to replace pdf @pytest.mark.parametrize( - ("strict", "length", "shouldFail"), + ("strict", "length", "should_fail"), [ (True, 6, False), (True, 10, False), @@ -332,14 +329,14 @@ class tst: # to replace pdf ], ) def test_DictionaryObject_read_from_stream_stream_stream_valid( - strict, length, shouldFail + strict, length, should_fail ): stream = BytesIO(b"<< /S /GoTo /Length %d >>stream\nBT /F1\nendstream\n" % length) - class tst: # to replace pdf + class Tst: # to replace pdf strict = True - pdf = tst() + pdf = Tst() pdf.strict = strict with pytest.raises(PdfReadError) as exc: do = DictionaryObject.read_from_stream(stream, pdf) @@ -349,7 +346,7 @@ class tst: # to replace pdf assert b"BT /F1" in do._StreamObject__data raise PdfReadError("__ALLGOOD__") print(exc.value) - assert shouldFail ^ (exc.value.args[0] == "__ALLGOOD__") + assert should_fail ^ (exc.value.args[0] == "__ALLGOOD__") def test_RectangleObject(): @@ -399,13 +396,8 @@ def test_remove_child_in_tree(): reader = PdfReader(pdf) writer = PdfWriter() writer.add_page(reader.pages[0]) - writer.add_bookmark("foo", 0) + writer.add_bookmark("foo", pagenum=0) obj = writer._objects[-1] - # print(dict) - # print(type(dict)) - # for obj in writer._objects: - # print(obj) - # print(type(obj)) tree.add_child(obj, writer) tree.remove_child(obj) tree.add_child(obj, writer) diff --git a/tests/test_javascript.py b/tests/test_javascript.py index 3376fd862..83e08ff21 100644 --- a/tests/test_javascript.py +++ 
b/tests/test_javascript.py @@ -15,7 +15,7 @@ def pdf_file_writer(): reader = PdfReader(os.path.join(RESOURCE_ROOT, "crazyones.pdf")) writer = PdfWriter() writer.append_pages_from_reader(reader) - yield writer + return writer def test_add_js(pdf_file_writer): diff --git a/tests/test_merger.py b/tests/test_merger.py index 2d137fcad..12cba53c4 100644 --- a/tests/test_merger.py +++ b/tests/test_merger.py @@ -38,7 +38,7 @@ def test_merge(): merger.append(fh) bookmark = merger.add_bookmark("A bookmark", 0) - merger.add_bookmark("deeper", 0, parent=bookmark) + merger.add_bookmark("deeper", 0, parent=bookmark, italic=True, bold=True) merger.add_metadata({"author": "Martin Thoma"}) merger.add_named_destination("title", 0) merger.set_page_layout("/SinglePage") diff --git a/tests/test_page.py b/tests/test_page.py index cd53d796e..6ac28c762 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -1,6 +1,5 @@ import json import os -import urllib.request from copy import deepcopy from io import BytesIO @@ -10,6 +9,7 @@ from PyPDF2._page import PageObject from PyPDF2.constants import PageAttributes as PG from PyPDF2.generic import DictionaryObject, NameObject, RectangleObject +from tests import get_pdf_from_url TESTS_ROOT = os.path.abspath(os.path.dirname(__file__)) PROJECT_ROOT = os.path.dirname(TESTS_ROOT) @@ -45,7 +45,6 @@ def test_read(meta): [ ("crazyones.pdf", None), ("attachment.pdf", None), - # ("side-by-side-subfig.pdf", None), ( "libreoffice-writer-password.pdf", "openpassword", @@ -64,7 +63,7 @@ def test_page_operations(pdf_path, password): output is as expected. """ if pdf_path.startswith("http"): - pdf_path = BytesIO(urllib.request.urlopen(pdf_path).read()) + pdf_path = BytesIO(get_pdf_from_url(pdf_path, pdf_path.split("/")[-1])) else: pdf_path = os.path.join(RESOURCE_ROOT, pdf_path) reader = PdfReader(pdf_path) @@ -73,22 +72,18 @@ def test_page_operations(pdf_path, password): reader.decrypt(password) page: PageObject = reader.pages[0] - with pytest.warns(PendingDeprecationWarning): - page.mergeRotatedScaledTranslatedPage( - page, 90, scale=1, tx=1, ty=1, expand=True - ) + + transformation = Transformation().rotate(90).scale(1).translate(1, 1) + page.add_transformation(transformation, expand=True) page.add_transformation((1, 0, 0, 0, 0, 0)) page.scale(2, 2) page.scale_by(0.5) page.scale_to(100, 100) page.compress_content_streams() page.extract_text() - with pytest.warns(PendingDeprecationWarning): - page.scaleBy(0.5) - with pytest.warns(PendingDeprecationWarning): - page.scaleTo(100, 100) - with pytest.warns(PendingDeprecationWarning): - page.extractText() + page.scale_by(0.5) + page.scale_to(100, 100) + page.extract_text() def test_transformation_equivalence(): diff --git a/tests/test_reader.py b/tests/test_reader.py index 397331918..6270dabef 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -2,7 +2,6 @@ import os import time from io import BytesIO -import urllib.request import pytest @@ -13,6 +12,7 @@ from PyPDF2.constants import Ressources as RES from PyPDF2.errors import PdfReadError, PdfReadWarning from PyPDF2.filters import _xobj_to_image +from tests import get_pdf_from_url TESTS_ROOT = os.path.abspath(os.path.dirname(__file__)) PROJECT_ROOT = os.path.dirname(TESTS_ROOT) @@ -65,8 +65,7 @@ def test_get_num_pages(src, num_pages): def test_read_metadata(pdf_path, expected): with open(pdf_path, "rb") as inputfile: reader = PdfReader(inputfile) - with pytest.warns(PendingDeprecationWarning): - docinfo = reader.documentInfo + docinfo = reader.metadata assert docinfo is not 
None metadict = dict(docinfo) assert metadict == expected @@ -161,11 +160,11 @@ def test_get_images(src, nb_images): images_extracted = [] if RES.XOBJECT in page[PG.RESOURCES]: - xObject = page[PG.RESOURCES][RES.XOBJECT].get_object() + x_object = page[PG.RESOURCES][RES.XOBJECT].get_object() - for obj in xObject: - if xObject[obj][IA.SUBTYPE] == "/Image": - extension, byte_stream = _xobj_to_image(xObject[obj]) + for obj in x_object: + if x_object[obj][IA.SUBTYPE] == "/Image": + extension, byte_stream = _xobj_to_image(x_object[obj]) if extension is not None: filename = obj[1:] + ".png" with open(filename, "wb") as img: @@ -230,9 +229,8 @@ def test_get_images_raw(strict, with_prev_0, startx_correction, should_fail): ) pdf_stream = io.BytesIO(pdf_data) if should_fail: - with pytest.raises(PdfReadError) as exc: - with pytest.warns(PdfReadWarning): - PdfReader(pdf_stream, strict=strict) + with pytest.raises(PdfReadError) as exc, pytest.warns(PdfReadWarning): + PdfReader(pdf_stream, strict=strict) assert exc.type == PdfReadError if startx_correction == -1: assert ( @@ -246,9 +244,8 @@ def test_get_images_raw(strict, with_prev_0, startx_correction, should_fail): def test_issue297(): path = os.path.join(RESOURCE_ROOT, "issue-297.pdf") - with pytest.raises(PdfReadError) as exc: - with pytest.warns(PdfReadWarning): - reader = PdfReader(path, strict=True) + with pytest.raises(PdfReadError) as exc, pytest.warns(PdfReadWarning): + reader = PdfReader(path, strict=True) assert "Broken xref table" in exc.value.args[0] with pytest.warns(PdfReadWarning): reader = PdfReader(path, strict=False) @@ -438,9 +435,8 @@ def test_read_prev_0_trailer(): pdf_data.find(b"xref") - 1, ) pdf_stream = io.BytesIO(pdf_data) - with pytest.raises(PdfReadError) as exc: - with pytest.warns(PdfReadWarning): - PdfReader(pdf_stream, strict=True) + with pytest.raises(PdfReadError) as exc, pytest.warns(PdfReadWarning): + PdfReader(pdf_stream, strict=True) assert exc.value.args[0] == "/Prev=0 in the trailer (try opening with strict=False)" @@ -462,7 +458,7 @@ def test_read_missing_startxref(): b"%010d 00000 n\n" b"%010d 00000 n\n" b"trailer << /Root 5 0 R /Size 6 >>\n" - # b"startxref %d\n" + # Removed for this test: b"startxref %d\n" b"%%%%EOF" ) pdf_data = pdf_data % ( @@ -471,7 +467,7 @@ def test_read_missing_startxref(): pdf_data.find(b"3 0 obj"), pdf_data.find(b"4 0 obj"), pdf_data.find(b"5 0 obj"), - # pdf_data.find(b"xref") - 1, + # Removed for this test: pdf_data.find(b"xref") - 1, ) pdf_stream = io.BytesIO(pdf_data) with pytest.raises(PdfReadError) as exc: @@ -512,16 +508,14 @@ def test_read_unknown_zero_pages(): pdf_stream = io.BytesIO(pdf_data) with pytest.warns(PdfReadWarning): reader = PdfReader(pdf_stream, strict=True) - with pytest.raises(PdfReadError) as exc: - with pytest.warns(PdfReadWarning): - len(reader.pages) + with pytest.raises(PdfReadError) as exc, pytest.warns(PdfReadWarning): + len(reader.pages) assert exc.value.args[0] == "Could not find object." 
with pytest.warns(PdfReadWarning): reader = PdfReader(pdf_stream, strict=False) - with pytest.raises(AttributeError) as exc: - with pytest.warns(PdfReadWarning): - len(reader.pages) + with pytest.raises(AttributeError) as exc, pytest.warns(PdfReadWarning): + len(reader.pages) assert exc.value.args[0] == "'NoneType' object has no attribute 'get_object'" @@ -555,7 +549,7 @@ def test_do_not_get_stuck_on_large_files_without_start_xref(): assert parse_duration < 60 -def test_PdfReaderDecryptWhenNoID(): +def test_decrypt_when_no_id(): """ Decrypt an encrypted file that's missing the 'ID' value in its trailer. @@ -589,10 +583,9 @@ def test_issue604(strict): pdf = None bookmarks = None if strict: - with pytest.raises(PdfReadError) as exc: - pdf = PdfReader(f, strict=strict) - with pytest.warns(PdfReadWarning): - bookmarks = pdf._get_outlines() + pdf = PdfReader(f, strict=strict) + with pytest.raises(PdfReadError) as exc, pytest.warns(PdfReadWarning): + bookmarks = pdf._get_outlines() if "Unknown Destination" not in exc.value.args[0]: raise Exception("Expected exception not raised") return # bookmarks not correct @@ -602,7 +595,6 @@ def test_issue604(strict): bookmarks = pdf._get_outlines() def get_dest_pages(x): - # print(x) if isinstance(x, list): r = [get_dest_pages(y) for y in x] return r @@ -614,7 +606,6 @@ def get_dest_pages(x): b ) in bookmarks: # b can be destination or a list:preferred to just print them out.append(get_dest_pages(b)) - # print(out) def test_decode_permissions(): @@ -639,7 +630,7 @@ def test_decode_permissions(): assert reader.decode_permissions(8) == modify -def test_VirtualList(): +def test_pages_attribute(): pdf_path = os.path.join(RESOURCE_ROOT, "crazyones.pdf") reader = PdfReader(pdf_path) @@ -670,8 +661,8 @@ def test_convertToInt_deprecated(): def test_iss925(): - reader = PdfReader(BytesIO(urllib.request.urlopen( - "https://github.com/py-pdf/PyPDF2/files/8796328/1.pdf").read())) + url = "https://github.com/py-pdf/PyPDF2/files/8796328/1.pdf" + reader = PdfReader(BytesIO(get_pdf_from_url(url, name="iss925.pdf"))) for page_sliced in reader.pages: page_object = page_sliced.get_object() diff --git a/tests/test_utils.py b/tests/test_utils.py index 5fb1a49d0..321883e2f 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -4,6 +4,14 @@ import pytest import PyPDF2._utils +from PyPDF2._utils import ( + mark_location, + matrix_multiply, + read_until_regex, + read_until_whitespace, + skip_over_comment, + skip_over_whitespace, +) from PyPDF2.errors import PdfStreamError TESTS_ROOT = os.path.abspath(os.path.dirname(__file__)) @@ -22,12 +30,12 @@ (io.BytesIO(b" \n"), True), ], ) -def test_skipOverWhitespace(stream, expected): - assert PyPDF2._utils.skip_over_whitespace(stream) == expected +def test_skip_over_whitespace(stream, expected): + assert skip_over_whitespace(stream) == expected -def test_readUntilWhitespace(): - assert PyPDF2._utils.read_until_whitespace(io.BytesIO(b"foo"), maxchars=1) == b"f" +def test_read_until_whitespace(): + assert read_until_whitespace(io.BytesIO(b"foo"), maxchars=1) == b"f" @pytest.mark.parametrize( @@ -39,27 +47,25 @@ def test_readUntilWhitespace(): (io.BytesIO(b"% foo%\nbar"), b"bar"), ], ) -def test_skipOverComment(stream, remainder): - PyPDF2._utils.skip_over_comment(stream) +def test_skip_over_comment(stream, remainder): + skip_over_comment(stream) assert stream.read() == remainder -def test_readUntilRegex_premature_ending_raise(): +def test_read_until_regex_premature_ending_raise(): import re stream = io.BytesIO(b"") 
with pytest.raises(PdfStreamError) as exc: - PyPDF2._utils.read_until_regex(stream, re.compile(b".")) + read_until_regex(stream, re.compile(b".")) assert exc.value.args[0] == "Stream has ended unexpectedly" -def test_readUntilRegex_premature_ending_name(): +def test_read_until_regex_premature_ending_name(): import re stream = io.BytesIO(b"") - assert ( - PyPDF2._utils.read_until_regex(stream, re.compile(b"."), ignore_eof=True) == b"" - ) + assert read_until_regex(stream, re.compile(b"."), ignore_eof=True) == b"" @pytest.mark.parametrize( @@ -70,17 +76,17 @@ def test_readUntilRegex_premature_ending_name(): (((3,), (7,)), ((5, 13),), ((3 * 5, 3 * 13), (7 * 5, 7 * 13))), ], ) -def test_matrixMultiply(a, b, expected): - assert PyPDF2._utils.matrix_multiply(a, b) == expected +def test_matrix_multiply(a, b, expected): + assert matrix_multiply(a, b) == expected -def test_markLocation(): +def test_mark_location(): stream = io.BytesIO(b"abde" * 6000) - PyPDF2._utils.mark_location(stream) + mark_location(stream) os.remove("PyPDF2_pdfLocation.txt") # cleanup -def test_hexStr(): +def test_hex_str(): assert PyPDF2._utils.hex_str(10) == "0xa" @@ -94,4 +100,5 @@ def test_b(): def test_deprecate_no_replacement(): with pytest.raises(PendingDeprecationWarning) as exc: PyPDF2._utils.deprecate_no_replacement("foo") - assert exc.value.args[0] == "foo is deprecated and will be removed in PyPDF2 3.0.0." + error_msg = "foo is deprecated and will be removed in PyPDF2 3.0.0." + assert exc.value.args[0] == error_msg diff --git a/tests/test_workflows.py b/tests/test_workflows.py index 296d65a53..680a00af3 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -1,13 +1,13 @@ import binascii import os import sys -import urllib.request from io import BytesIO import pytest from PyPDF2 import PdfReader from PyPDF2.constants import PageAttributes as PG +from tests import get_pdf_from_url TESTS_ROOT = os.path.abspath(os.path.dirname(__file__)) PROJECT_ROOT = os.path.dirname(TESTS_ROOT) @@ -83,8 +83,20 @@ def test_decrypt(): "/Creator": "Writer", "/Producer": "LibreOffice 6.4", } - # Is extract_text() broken for encrypted files? 
- # assert reader.pages[0].extract_text().replace('\n', '') == "\n˘\n\u02c7\u02c6˙\n\n\n˘\u02c7\u02c6˙\n\n" + + +def test_text_extraction_encrypted(): + inputfile = os.path.join(RESOURCE_ROOT, "libreoffice-writer-password.pdf") + reader = PdfReader(inputfile) + assert reader.is_encrypted is True + reader.decrypt("openpassword") + assert ( + reader.pages[0] + .extract_text() + .replace("\n", "") + .strip() + .startswith("Lorem ipsum dolor sit amet") + ) @pytest.mark.parametrize("degree", [0, 90, 180, 270, 360, -90]) @@ -92,8 +104,7 @@ def test_rotate(degree): with open(os.path.join(RESOURCE_ROOT, "crazyones.pdf"), "rb") as inputfile: reader = PdfReader(inputfile) page = reader.pages[0] - with pytest.warns(PendingDeprecationWarning): - page.rotateCounterClockwise(degree) + page.rotate(degree) def test_rotate_45(): @@ -101,8 +112,7 @@ def test_rotate_45(): reader = PdfReader(inputfile) page = reader.pages[0] with pytest.raises(ValueError) as exc: - with pytest.warns(PendingDeprecationWarning): - page.rotateCounterClockwise(45) + page.rotate(45) assert exc.value.args[0] == "Rotation angle must be a multiple of 90" @@ -134,7 +144,7 @@ def test_rotate_45(): def test_extract_textbench(enable, url, pages, print_result=False): if not enable: return - reader = PdfReader(BytesIO(urllib.request.urlopen(url).read())) + reader = PdfReader(BytesIO(get_pdf_from_url(url, url.split("/")[-1]))) for page_number in pages: if print_result: print(f"**************** {url} / page {page_number} ****************") diff --git a/tests/test_xmp.py b/tests/test_xmp.py index 152e2900f..31f8741c2 100644 --- a/tests/test_xmp.py +++ b/tests/test_xmp.py @@ -32,12 +32,11 @@ def test_read_xmp(src, has_xmp): assert xmp.dc_contributor == [] -def get_all_tiff(xmp): +def get_all_tiff(xmp: PyPDF2.xmp.XmpInformation): data = {} - with pytest.warns(PendingDeprecationWarning): - tiff_ns = xmp.getNodesInNamespace( - aboutUri="", namespace="http://ns.adobe.com/tiff/1.0/" - ) + tiff_ns = xmp.get_nodes_in_namespace( + about_uri="", namespace="http://ns.adobe.com/tiff/1.0/" + ) for tag in tiff_ns: contents = [] for content in tag.childNodes: