diff --git a/PyPDF2/_reader.py b/PyPDF2/_reader.py index 680356feb..70a032fa3 100644 --- a/PyPDF2/_reader.py +++ b/PyPDF2/_reader.py @@ -100,7 +100,7 @@ def convert_to_int(d: bytes, size: int) -> Union[int, Tuple[Any, ...]]: if size > 8: raise PdfReadError("invalid size in convert_to_int") - d = b_("\x00\x00\x00\x00\x00\x00\x00\x00") + b_(d) + d = b"\x00\x00\x00\x00\x00\x00\x00\x00" + d d = d[-8:] return struct.unpack(">q", d)[0] @@ -254,7 +254,7 @@ def __init__( ) if isinstance(stream, (str, Path)): with open(stream, "rb") as fh: - stream = BytesIO(b_(fh.read())) + stream = BytesIO(fh.read()) self.read(stream) self.stream = stream @@ -1219,7 +1219,7 @@ def read(self, stream: StreamType) -> None: stream.seek(0, os.SEEK_END) last_mb = stream.tell() - 1024 * 1024 + 1 # offset of last MB of stream line = b"" - while line[:5] != b_("%%EOF"): + while line[:5] != b"%%EOF": if stream.tell() < last_mb: raise PdfReadError("EOF marker not found") line = read_previous_line(stream) @@ -1379,7 +1379,7 @@ def _read_standard_xref_table(self, stream: StreamType) -> None: # 21-byte entries (or more) due to the use of \r\n # (CRLF) EOL's. Detect that case, and adjust the line # until it does not begin with a \r (CR) or \n (LF). 
- while line[0] in b_("\x0D\x0A"): + while line[0] in b"\x0D\x0A": stream.seek(-20 + 1, 1) line = stream.read(20) @@ -1392,7 +1392,7 @@ def _read_standard_xref_table(self, stream: StreamType) -> None: if line[-1] in b"0123456789t": stream.seek(-1, 1) - offset_b, generation_b = line[:16].split(b_(" ")) + offset_b, generation_b = line[:16].split(b" ") offset, generation = int(offset_b), int(generation_b) if generation not in self.xref: self.xref[generation] = {} @@ -1485,7 +1485,7 @@ def _rebuild_xref_table(self, stream: StreamType) -> None: stream.seek(0, 0) f_ = stream.read(-1) - for m in re.finditer(b_(r"[\r\n \t][ \t]*(\d+)[ \t]+(\d+)[ \t]+obj"), f_): + for m in re.finditer(rb"[\r\n \t][ \t]*(\d+)[ \t]+(\d+)[ \t]+obj", f_): idnum = int(m.group(1)) generation = int(m.group(2)) if generation not in self.xref: @@ -1566,11 +1566,11 @@ def read_next_end_line(self, stream: StreamType, limit_offset: int = 0) -> bytes if stream.tell() < 2: raise PdfReadError("EOL marker not found") stream.seek(-2, 1) - if x == b_("\n") or x == b_("\r"): # \n = LF; \r = CR + if x == b"\n" or x == b"\r": # \n = LF; \r = CR crlf = False - while x == b_("\n") or x == b_("\r"): + while x == b"\n" or x == b"\r": x = stream.read(1) - if x == b_("\n") or x == b_("\r"): # account for CR+LF + if x == b"\n" or x == b"\r": # account for CR+LF stream.seek(-1, 1) crlf = True if stream.tell() < 2: diff --git a/PyPDF2/_security.py b/PyPDF2/_security.py index d80a6b5a5..041c0e624 100644 --- a/PyPDF2/_security.py +++ b/PyPDF2/_security.py @@ -38,9 +38,9 @@ # ref: pdf1.8 spec section 3.5.2 algorithm 3.2 _encryption_padding = ( - b_("\x28\xbf\x4e\x5e\x4e\x75\x8a\x41\x64\x00\x4e\x56") - + b_("\xff\xfa\x01\x08\x2e\x2e\x00\xb6\xd0\x68\x3e\x80\x2f\x0c") - + b_("\xa9\xfe\x64\x53\x69\x7a") + b"\x28\xbf\x4e\x5e\x4e\x75\x8a\x41\x64\x00\x4e\x56" + b"\xff\xfa\x01\x08\x2e\x2e\x00\xb6\xd0\x68\x3e\x80\x2f\x0c" + b"\xa9\xfe\x64\x53\x69\x7a" ) @@ -79,7 +79,7 @@ def _alg32( # 6. 
(Revision 3 or greater) If document metadata is not being encrypted, # pass 4 bytes with the value 0xFFFFFFFF to the MD5 hash function. if rev >= 3 and not metadata_encrypt: - m.update(b_("\xff\xff\xff\xff")) + m.update(b"\xff\xff\xff\xff") # 7. Finish the hash. md5_hash = m.digest() # 8. (Revision 3 or greater) Do the following 50 times: Take the output @@ -222,7 +222,7 @@ def _alg35( # (implementator note: I don't know what "arbitrary padding" is supposed to # mean, so I have used null bytes. This seems to match a few other # people's implementations) - return val + (b_("\x00") * 16), key + return val + (b"\x00" * 16), key def RC4_encrypt(key: Union[str, bytes], plaintext: bytes) -> bytes: @@ -239,4 +239,4 @@ def RC4_encrypt(key: Union[str, bytes], plaintext: bytes) -> bytes: S[i], S[j] = S[j], S[i] t = S[(S[i] + S[j]) % 256] retval.append(b_(chr(ord_(plaintext[x]) ^ t))) - return b_("").join(retval) + return b"".join(retval) diff --git a/PyPDF2/_utils.py b/PyPDF2/_utils.py index 07b4917a0..f67e29713 100644 --- a/PyPDF2/_utils.py +++ b/PyPDF2/_utils.py @@ -41,7 +41,7 @@ BytesIO, FileIO, ) -from typing import Any, Dict, Optional, Tuple, Union, overload +from typing import Dict, Optional, Pattern, Tuple, Union, overload try: # Python 3.10+: https://www.python.org/dev/peps/pep-0484/ @@ -86,7 +86,7 @@ def read_non_whitespace(stream: StreamType) -> bytes: """ Finds and reads the next non-whitespace character (ignores whitespace). 
""" - tok = WHITESPACES[0] + tok = stream.read(1) while tok in WHITESPACES: tok = stream.read(1) return tok @@ -109,11 +109,13 @@ def skip_over_comment(stream: StreamType) -> None: tok = stream.read(1) stream.seek(-1, 1) if tok == b"%": - while tok not in (b_("\n"), b_("\r")): + while tok not in (b"\n", b"\r"): tok = stream.read(1) -def read_until_regex(stream: StreamType, regex: Any, ignore_eof: bool = False) -> bytes: +def read_until_regex( + stream: StreamType, regex: Pattern, ignore_eof: bool = False +) -> bytes: """ Reads until the regular expression pattern matched (ignore the match) :raises PdfStreamError: on premature end-of-file @@ -298,7 +300,7 @@ def hex_str(num: int) -> str: return hex(num).replace("L", "") -WHITESPACES = [b_(x) for x in [" ", "\n", "\r", "\t", "\x00"]] +WHITESPACES = [b" ", b"\n", b"\r", b"\t", b"\x00"] def paeth_predictor(left: int, up: int, up_left: int) -> int: diff --git a/PyPDF2/_writer.py b/PyPDF2/_writer.py index c0528eeaa..9cc5bc012 100644 --- a/PyPDF2/_writer.py +++ b/PyPDF2/_writer.py @@ -89,7 +89,7 @@ class (typically :class:`PdfReader`). """ def __init__(self) -> None: - self._header = b_("%PDF-1.3") + self._header = b"%PDF-1.3" self._objects: List[Optional[PdfObject]] = [] # array of indirect objects # The root of our page tree node. 
@@ -735,15 +735,15 @@ def write(self, stream: StreamType) -> None: def _write_header(self, stream: StreamType) -> List[int]: object_positions = [] - stream.write(self._header + b_("\n")) - stream.write(b_("%\xE2\xE3\xCF\xD3\n")) + stream.write(self._header + b"\n") + stream.write(b"%\xE2\xE3\xCF\xD3\n") for i in range(len(self._objects)): obj = self._objects[i] # If the obj is None we can't write anything if obj is not None: idnum = i + 1 object_positions.append(stream.tell()) - stream.write(b_(str(idnum) + " 0 obj\n")) + stream.write(b_(str(idnum)) + b" 0 obj\n") key = None if hasattr(self, "_encrypt") and idnum != self._encrypt.idnum: pack1 = struct.pack("<i", i + 1)[:3] @@ -753,12 +753,12 @@ def _write_header(self, stream: StreamType) -> List[int]: md5_hash = md5(key).digest() key = md5_hash[: min(16, len(self._encrypt_key) + 5)] obj.write_to_stream(stream, key) - stream.write(b_("\nendobj\n")) + stream.write(b"\nendobj\n") return object_positions def _write_xref_table(self, stream: StreamType, object_positions: List[int]) -> int: xref_location = stream.tell() - stream.write(b_("xref\n")) + stream.write(b"xref\n") stream.write(b_("0 %s\n" % (len(self._objects) + 1))) stream.write(b_("%010d %05d f \n" % (0, 65535))) for offset in object_positions: @@ -766,7 +766,7 @@ def _write_xref_table(self, stream: StreamType, object_positions: List[int]) -> return xref_location def _write_trailer(self, stream: StreamType) -> None: - stream.write(b_("trailer\n")) + stream.write(b"trailer\n") trailer = DictionaryObject() trailer.update( { @@ -1210,31 +1210,31 @@ def remove_images(self, ignore_byte_string_object: bool = False) -> None: pg_dict = cast(DictionaryObject, self.get_object(self._pages)) pages = cast(ArrayObject, pg_dict[PA.KIDS]) jump_operators = ( - b_("cm"), - b_("w"), - b_("J"), - b_("j"), - b_("M"), - b_("d"), - b_("ri"), - b_("i"), - b_("gs"), - b_("W"), - b_("b"), - b_("s"), - b_("S"), - b_("f"), - b_("F"), - b_("n"), - b_("m"), - b_("l"), - b_("c"), - b_("v"), - b_("y"), - b_("h"), - b_("B"), - b_("Do"), - b_("sh"), + b"cm", + b"w", + b"J", +
b"j", + b"M", + b"d", + b"ri", + b"i", + b"gs", + b"W", + b"b", + b"s", + b"S", + b"f", + b"F", + b"n", + b"m", + b"l", + b"c", + b"v", + b"y", + b"h", + b"B", + b"Do", + b"sh", ) for j in range(len(pages)): page = pages[j] @@ -1246,32 +1246,32 @@ def remove_images(self, ignore_byte_string_object: bool = False) -> None: _operations = [] seq_graphics = False for operands, operator in content.operations: - if operator in [b_("Tj"), b_("'")]: + if operator in [b"Tj", b"'"]: text = operands[0] if ignore_byte_string_object and not isinstance( text, TextStringObject ): operands[0] = TextStringObject() - elif operator == b_('"'): + elif operator == b'"': text = operands[2] if ignore_byte_string_object and not isinstance( text, TextStringObject ): operands[2] = TextStringObject() - elif operator == b_("TJ"): + elif operator == b"TJ": for i in range(len(operands[0])): if ignore_byte_string_object and not isinstance( operands[0][i], TextStringObject ): operands[0][i] = TextStringObject() - if operator == b_("q"): + if operator == b"q": seq_graphics = True - if operator == b_("Q"): + if operator == b"Q": seq_graphics = False if seq_graphics and operator in jump_operators: continue - if operator == b_("re"): + if operator == b"re": continue _operations.append((operands, operator)) @@ -1305,7 +1305,7 @@ def remove_text(self, ignore_byte_string_object: bool = False) -> None: if not isinstance(content, ContentStream): content = ContentStream(content, page_ref) for operands, operator in content.operations: - if operator in [b_("Tj"), b_("'")]: + if operator in [b"Tj", b"'"]: text = operands[0] if not ignore_byte_string_object: if isinstance(text, TextStringObject): @@ -1313,7 +1313,7 @@ def remove_text(self, ignore_byte_string_object: bool = False) -> None: else: if isinstance(text, (TextStringObject, ByteStringObject)): operands[0] = TextStringObject() - elif operator == b_('"'): + elif operator == b'"': text = operands[2] if not ignore_byte_string_object: if isinstance(text, 
TextStringObject): @@ -1321,7 +1321,7 @@ def remove_text(self, ignore_byte_string_object: bool = False) -> None: else: if isinstance(text, (TextStringObject, ByteStringObject)): operands[2] = TextStringObject() - elif operator == b_("TJ"): + elif operator == b"TJ": for i in range(len(operands[0])): if not ignore_byte_string_object: if isinstance(operands[0][i], TextStringObject): diff --git a/PyPDF2/generic.py b/PyPDF2/generic.py index b285d8b24..7b0c692c7 100644 --- a/PyPDF2/generic.py +++ b/PyPDF2/generic.py @@ -67,9 +67,9 @@ ) logger = logging.getLogger(__name__) -ObjectPrefix = b_("/<[tf(n%") +ObjectPrefix = b"/<[tf(n%" NumberSigns = b"+-" -IndirectPattern = re.compile(b_(r"[+-]?(\d+)\s+(\d+)\s+R[^a-zA-Z]")) +IndirectPattern = re.compile(rb"[+-]?(\d+)\s+(\d+)\s+R[^a-zA-Z]") class PdfObject: @@ -403,27 +403,27 @@ def readStringFromStream( # TODO: PEP8 elif tok == b"\\": tok = stream.read(1) escape_dict = { - b_("n"): b_("\n"), - b_("r"): b_("\r"), - b_("t"): b_("\t"), - b_("b"): b_("\b"), - b_("f"): b_("\f"), - b_("c"): b_(r"\c"), - b_("("): b_("("), - b_(")"): b_(")"), - b_("/"): b_("/"), - b_("\\"): b_("\\"), - b_(" "): b_(" "), - b_("/"): b_("/"), - b_("%"): b_("%"), - b_("<"): b_("<"), - b_(">"): b_(">"), - b_("["): b_("["), - b_("]"): b_("]"), - b_("#"): b_("#"), - b_("_"): b_("_"), - b_("&"): b_("&"), - b_("$"): b_("$"), + b"n": b"\n", + b"r": b"\r", + b"t": b"\t", + b"b": b"\b", + b"f": b"\f", + b"c": rb"\c", + b"(": b"(", + b")": b")", + b"/": b"/", + b"\\": b"\\", + b" ": b" ", + b"/": b"/", + b"%": b"%", + b"<": b"<", + b">": b">", + b"[": b"[", + b"]": b"]", + b"#": b"#", + b"_": b"_", + b"&": b"&", + b"$": b"$", } try: tok = escape_dict[tok] @@ -441,16 +441,16 @@ def readStringFromStream( # TODO: PEP8 else: break tok = b_(chr(int(tok, base=8))) - elif tok in b_("\n\r"): + elif tok in b"\n\r": # This case is hit when a backslash followed by a line # break occurs. 
If it's a multi-char EOL, consume the # second character: tok = stream.read(1) - if tok not in b_("\n\r"): + if tok not in b"\n\r": stream.seek(-1, 1) # Then don't add anything to the actual string, since this # line break was escaped: - tok = b_("") + tok = b"" else: msg = r"Unexpected escaped string: {}".format(tok.decode("utf8")) # if.strict: PdfReadError(msg) @@ -480,9 +480,9 @@ def write_to_stream( from ._security import RC4_encrypt bytearr = RC4_encrypt(encryption_key, bytearr) # type: ignore - stream.write(b_("<")) + stream.write(b"<") stream.write(hexencode(bytearr)) - stream.write(b_(">")) + stream.write(b">") def writeToStream( self, stream: StreamType, encryption_key: Union[None, str, bytes] @@ -542,13 +542,13 @@ def write_to_stream( obj = ByteStringObject(bytearr) obj.write_to_stream(stream, None) else: - stream.write(b_("(")) + stream.write(b"(") for c in bytearr: - if not chr(c).isalnum() and c != b_(" "): + if not chr(c).isalnum() and c != b" ": stream.write(b_("\\%03o" % ord_(c))) else: stream.write(b_(chr(c))) - stream.write(b_(")")) + stream.write(b")") def writeToStream( self, stream: StreamType, encryption_key: Union[None, str, bytes] @@ -558,8 +558,8 @@ def writeToStream( class NameObject(str, PdfObject): - delimiter_pattern = re.compile(b_(r"\s+|[\(\)<>\[\]{}/%]")) - surfix = b_("/") + delimiter_pattern = re.compile(rb"\s+|[\(\)<>\[\]{}/%]") + surfix = b"/" def write_to_stream( self, stream: StreamType, encryption_key: Union[None, str, bytes] @@ -669,13 +669,13 @@ def xmpMetadata(self) -> Optional[PdfObject]: # pragma: no cover def write_to_stream( self, stream: StreamType, encryption_key: Union[None, str, bytes] ) -> None: - stream.write(b_("<<\n")) + stream.write(b"<<\n") for key, value in list(self.items()): key.write_to_stream(stream, encryption_key) - stream.write(b_(" ")) + stream.write(b" ") value.write_to_stream(stream, encryption_key) - stream.write(b_("\n")) - stream.write(b_(">>")) + stream.write(b"\n") + stream.write(b">>") def 
writeToStream( self, stream: StreamType, encryption_key: Union[None, str, bytes] @@ -706,7 +706,7 @@ def read_unsized_from_steam(stream: StreamType, pdf: Any) -> bytes: # PdfReader eon = get_next_obj_pos(stream.tell(), 2**32, list(pdf.xref), pdf) - 1 curr = stream.tell() rw = stream.read(eon - stream.tell()) - p = rw.find(b_("endstream")) + p = rw.find(b"endstream") if p < 0: raise PdfReadError( f"Unable to find 'endstream' marker for obj starting at {curr}." @@ -715,7 +715,7 @@ def read_unsized_from_steam(stream: StreamType, pdf: Any) -> bytes: # PdfReader return rw[: p - 1] tmp = stream.read(2) - if tmp != b_("<<"): + if tmp != b"<<": raise PdfReadError( "Dictionary read error at byte %s: stream must begin with '<<'" % hex_str(stream.tell()) @@ -723,16 +723,16 @@ def read_unsized_from_steam(stream: StreamType, pdf: Any) -> bytes: # PdfReader data: Dict[Any, Any] = {} while True: tok = read_non_whitespace(stream) - if tok == b_("\x00"): + if tok == b"\x00": continue - elif tok == b_("%"): + elif tok == b"%": stream.seek(-1, 1) skip_over_comment(stream) continue if not tok: raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY) - if tok == b_(">"): + if tok == b">": stream.read(1) break stream.seek(-1, 1) @@ -757,17 +757,17 @@ def read_unsized_from_steam(stream: StreamType, pdf: Any) -> bytes: # PdfReader pos = stream.tell() s = read_non_whitespace(stream) - if s == b_("s") and stream.read(5) == b_("tream"): + if s == b"s" and stream.read(5) == b"tream": eol = stream.read(1) # odd PDF file output has spaces after 'stream' keyword but before EOL. 
# patch provided by Danial Sandler - while eol == b_(" "): + while eol == b" ": eol = stream.read(1) - if eol not in (b_("\n"), b_("\r")): + if eol not in (b"\n", b"\r"): raise PdfStreamError("Stream data must be followed by a newline") - if eol == b_("\r"): + if eol == b"\r": # read \n after - if stream.read(1) != b_("\n"): + if stream.read(1) != b"\n": stream.seek(-1, 1) # this is a stream object, not a dictionary if SA.LENGTH not in data: @@ -781,7 +781,7 @@ def read_unsized_from_steam(stream: StreamType, pdf: Any) -> bytes: # PdfReader data["__streamdata__"] = stream.read(length) e = read_non_whitespace(stream) ndstream = stream.read(8) - if (e + ndstream) != b_("endstream"): + if (e + ndstream) != b"endstream": # (sigh) - the odd PDF file has a length that is too long, so # we need to read backwards to find the "endstream" ending. # ReportLab (unknown version) generates files with this bug, @@ -791,7 +791,7 @@ def read_unsized_from_steam(stream: StreamType, pdf: Any) -> bytes: # PdfReader pos = stream.tell() stream.seek(-10, 1) end = stream.read(9) - if end == b_("endstream"): + if end == b"endstream": # we found it by looking back one character further. data["__streamdata__"] = data["__streamdata__"][:-1] elif not pdf.strict: @@ -990,14 +990,14 @@ def write_to_stream( self[NameObject(SA.LENGTH)] = NumberObject(len(self._data)) DictionaryObject.write_to_stream(self, stream, encryption_key) del self[SA.LENGTH] - stream.write(b_("\nstream\n")) + stream.write(b"\nstream\n") data = self._data if encryption_key: from ._security import RC4_encrypt data = RC4_encrypt(encryption_key, data) stream.write(data) - stream.write(b_("\nendstream")) + stream.write(b"\nendstream") @staticmethod def initializeFromDictionary( @@ -1107,10 +1107,10 @@ def __init__( # multiple StreamObjects to be cat'd together. 
stream = stream.get_object() if isinstance(stream, ArrayObject): - data = b_("") + data = b"" for s in stream: data += b_(s.get_object().get_data()) - stream_bytes = BytesIO(b_(data)) + stream_bytes = BytesIO(data) else: stream_data = stream.get_data() assert stream_data is not None @@ -1265,7 +1265,7 @@ def read_object( return NullObject.read_from_stream(stream) elif idx == 7: # comment - while tok not in (b_("\r"), b_("\n")): + while tok not in (b"\r", b"\n"): tok = stream.read(1) # Prevents an infinite loop by raising an error if the stream is at # the EOF @@ -1748,21 +1748,21 @@ def getDestArray(self) -> ArrayObject: # pragma: no cover def write_to_stream( self, stream: StreamType, encryption_key: Union[None, str, bytes] ) -> None: - stream.write(b_("<<\n")) + stream.write(b"<<\n") key = NameObject("/D") key.write_to_stream(stream, encryption_key) - stream.write(b_(" ")) + stream.write(b" ") value = self.dest_array value.write_to_stream(stream, encryption_key) key = NameObject("/S") key.write_to_stream(stream, encryption_key) - stream.write(b_(" ")) + stream.write(b" ") value_s = NameObject("/GoTo") value_s.write_to_stream(stream, encryption_key) - stream.write(b_("\n")) - stream.write(b_(">>")) + stream.write(b"\n") + stream.write(b">>") @property def title(self) -> Optional[str]: @@ -1841,24 +1841,24 @@ class Bookmark(Destination): def write_to_stream( self, stream: StreamType, encryption_key: Union[None, str, bytes] ) -> None: - stream.write(b_("<<\n")) + stream.write(b"<<\n") for key in [ NameObject(x) for x in ["/Title", "/Parent", "/First", "/Last", "/Next", "/Prev"] if x in self ]: key.write_to_stream(stream, encryption_key) - stream.write(b_(" ")) + stream.write(b" ") value = self.raw_get(key) value.write_to_stream(stream, encryption_key) - stream.write(b_("\n")) + stream.write(b"\n") key = NameObject("/Dest") key.write_to_stream(stream, encryption_key) - stream.write(b_(" ")) + stream.write(b" ") value = self.dest_array value.write_to_stream(stream, 
encryption_key) - stream.write(b_("\n")) - stream.write(b_(">>")) + stream.write(b"\n") + stream.write(b">>") def createStringObject( @@ -1910,7 +1910,7 @@ def createStringObject( def encode_pdfdocencoding(unicode_string: str) -> bytes: - retval = b_("") + retval = b"" for c in unicode_string: try: retval += b_(chr(_pdfdoc_encoding_rev[c]))