From 8ebd311a4088da81dbd59c7ecc7de13c4e86f595 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sat, 14 Sep 2024 16:02:58 +0200
Subject: [PATCH] MAINT: Simplify test with None and NullObject (#2829)

---
 pypdf/_cmap.py                    |  7 ++++---
 pypdf/_doc_common.py              |  5 +++--
 pypdf/_page.py                    | 21 +++++++++++++--------
 pypdf/_page_labels.py             | 19 ++++++++++++-------
 pypdf/_reader.py                  | 10 ++++++----
 pypdf/_writer.py                  | 14 ++++++++------
 pypdf/filters.py                  |  4 +---
 pypdf/generic/__init__.py         |  2 ++
 pypdf/generic/_base.py            | 10 ++++++++++
 pypdf/generic/_data_structures.py | 11 +++++++----
 pypdf/generic/_fit.py             |  5 +++--
 pypdf/generic/_viewerpref.py      |  6 +++---
 tests/test_generic.py             | 16 ++++++++++++++++
 13 files changed, 88 insertions(+), 42 deletions(-)

diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
index 6c5996703..dcf3678bd 100644
--- a/pypdf/_cmap.py
+++ b/pypdf/_cmap.py
@@ -7,8 +7,8 @@
 from .generic import (
     DecodedStreamObject,
     DictionaryObject,
-    NullObject,
     StreamObject,
+    is_null_or_none,
 )
 
 
@@ -468,7 +468,7 @@ def compute_space_width(
                         cpt += 1
                 sp_width = m / max(1, cpt) / 2
 
-    if sp_width is None or isinstance(sp_width, NullObject):
+    if is_null_or_none(sp_width):
         sp_width = 0.0
     return sp_width
 
@@ -482,8 +482,9 @@ def type1_alternative(
     if "/FontDescriptor" not in ft:
         return map_dict, space_code, int_entry
     ft_desc = cast(DictionaryObject, ft["/FontDescriptor"]).get("/FontFile")
-    if ft_desc is None:
+    if is_null_or_none(ft_desc):
         return map_dict, space_code, int_entry
+    assert ft_desc is not None, "mypy"
     txt = ft_desc.get_object().get_data()
     txt = txt.split(b"eexec\n")[0]  # only clear part
     txt = txt.split(b"/Encoding")[1]  # to get the encoding part
diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py
index 8d07098b4..55c6aad67 100644
--- a/pypdf/_doc_common.py
+++ b/pypdf/_doc_common.py
@@ -85,6 +85,7 @@
     TreeObject,
     ViewerPreferences,
     create_string_object,
+    is_null_or_none,
 )
 from .types import OutlineType, PagemodeType
 from .xmp import XmpInformation
@@ -761,7 +762,7 @@ def _get_inherited(obj: DictionaryObject, key: str) -> Any:
             field = cast(DictionaryObject, field.indirect_reference.get_object())  # type: ignore
         except Exception as exc:
             raise ValueError("field type is invalid") from exc
-        if _get_inherited(field, "/FT") is None:
+        if is_null_or_none(_get_inherited(field, "/FT")):
             raise ValueError("field is not valid")
         ret = []
         if field.get("/Subtype", "") == "/Widget":
@@ -852,7 +853,7 @@ def _get_outline(
                     return outline
 
                 # §12.3.3 Document outline, entries in the outline dictionary
-                if lines is not None and "/First" in lines:
+                if not is_null_or_none(lines) and "/First" in lines:
                     node = cast(DictionaryObject, lines["/First"])
             self._namedDests = self._get_named_destinations()
 
diff --git a/pypdf/_page.py b/pypdf/_page.py
index 471256eec..e4ec053c8 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -84,6 +84,7 @@
     PdfObject,
     RectangleObject,
     StreamObject,
+    is_null_or_none,
 )
 
 try:
@@ -101,7 +102,7 @@ def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleOb
     retval: Union[None, RectangleObject, IndirectObject] = self.get(name)
     if isinstance(retval, RectangleObject):
         return retval
-    if retval is None:
+    if is_null_or_none(retval):
         for d in defaults:
             retval = self.get(d)
             if retval is not None:
@@ -492,7 +493,8 @@ def __init__(
         self.inline_images: Optional[Dict[str, ImageFile]] = None
         # below Union for mypy but actually Optional[List[str]]
         self.indirect_reference = indirect_reference
-        if indirect_reference is not None:
+        if not is_null_or_none(indirect_reference):
+            assert indirect_reference is not None, "mypy"
             self.update(cast(DictionaryObject, indirect_reference.get_object()))
 
     def hash_bin(self) -> int:
@@ -731,9 +733,10 @@ def _get_inline_images(self) -> Dict[str, ImageFile]:
         entries will be identified as ~1~
         """
         content = self.get_contents()
-        if content is None:
+        if is_null_or_none(content):
             return {}
         imgs_data = []
+        assert content is not None, "mypy"
         for param, ope in content.operations:
             if ope == b"INLINE IMAGE":
                 imgs_data.append(
@@ -1063,7 +1066,7 @@ def replace_contents(
             for i in range(len(content)):
                 content[i] = self.indirect_reference.pdf._add_object(content[i])
 
-        if content is None:
+        if is_null_or_none(content):
             if PG.CONTENTS not in self:
                 return
             else:
@@ -1084,6 +1087,7 @@ def replace_contents(
                 # this will be fixed with the _add_object
                 self[NameObject(PG.CONTENTS)] = content
         else:
+            assert content is not None, "mypy"
             content.indirect_reference = self[
                 PG.CONTENTS
             ].indirect_reference  # TODO: in a future may required generation management
@@ -2218,10 +2222,11 @@ def extract_text(
         if extraction_mode not in ["plain", "layout"]:
             raise ValueError(f"Invalid text extraction mode '{extraction_mode}'")
         if extraction_mode == "layout":
-            for visitor in ("visitor_operand_before",
-                            "visitor_operand_after",
-                            "visitor_text",
-                            ):
+            for visitor in (
+                "visitor_operand_before",
+                "visitor_operand_after",
+                "visitor_text",
+            ):
                 if locals()[visitor]:
                     logger_warning(
                         f"Argument {visitor} is ignored in layout mode",
diff --git a/pypdf/_page_labels.py b/pypdf/_page_labels.py
index b02527950..1bedc003a 100644
--- a/pypdf/_page_labels.py
+++ b/pypdf/_page_labels.py
@@ -62,7 +62,13 @@
 
 from ._protocols import PdfCommonDocProtocol
 from ._utils import logger_warning
-from .generic import ArrayObject, DictionaryObject, NullObject, NumberObject
+from .generic import (
+    ArrayObject,
+    DictionaryObject,
+    NullObject,
+    NumberObject,
+    is_null_or_none,
+)
 
 
 def number2uppercase_roman_numeral(num: int) -> str:
@@ -180,11 +186,13 @@ def index2label(reader: PdfCommonDocProtocol, index: int) -> str:
                 # kid = {'/Limits': [0, 63], '/Nums': [0, {'/P': 'C1'}, ...]}
                 limits = cast(List[int], kid["/Limits"])
                 if limits[0] <= index <= limits[1]:
-                    if kid.get("/Kids", None) is not None:
+                    if not is_null_or_none(kid.get("/Kids", None)):
                         # Recursive definition.
                         level += 1
                         if level == 100:  # pragma: no cover
-                            raise NotImplementedError("Too deep nesting is not supported.")
+                            raise NotImplementedError(
+                                "Too deep nesting is not supported."
+                            )
                         number_tree = kid
                         # Exit the inner `for` loop and continue at the next level with the
                         # next iteration of the `while` loop.
@@ -195,10 +203,7 @@ def index2label(reader: PdfCommonDocProtocol, index: int) -> str:
                 # and continue with the fallback.
                 break
 
-    logger_warning(
-        f"Could not reliably determine page label for {index}.",
-        __name__
-    )
+    logger_warning(f"Could not reliably determine page label for {index}.", __name__)
     return str(index + 1)  # Fallback if neither /Nums nor /Kids is in the number_tree
 
 
diff --git a/pypdf/_reader.py b/pypdf/_reader.py
index 58c160302..9948cbea3 100644
--- a/pypdf/_reader.py
+++ b/pypdf/_reader.py
@@ -79,6 +79,7 @@
     PdfObject,
     StreamObject,
     TextStringObject,
+    is_null_or_none,
     read_object,
 )
 from .xmp import XmpInformation
@@ -206,11 +207,11 @@ def _info(self) -> Optional[DictionaryObject]:
             /Info Dictionary; None if the entry does not exist
         """
         info = self.trailer.get(TK.INFO, None)
-        if info is None:
+        if is_null_or_none(info):
             return None
         else:
             info = info.get_object()
-            if info is None:
+            if info == None:  # noqa: E711
                 raise PdfReadError(
                     "Trailer not found or does not point to document information directory"
                 )
@@ -225,7 +226,7 @@ def _ID(self) -> Optional[ArrayObject]:
             /ID array; None if the entry does not exist
         """
         id = self.trailer.get(TK.ID, None)
-        return None if id is None else cast(ArrayObject, id.get_object())
+        return None if is_null_or_none(id) else cast(ArrayObject, id.get_object())
 
     def _repr_mimebundle_(
         self,
@@ -298,8 +299,9 @@ def _get_page_number_by_indirect(
                 x.indirect_reference.idnum: i for i, x in enumerate(self.pages)  # type: ignore
             }
 
-        if indirect_reference is None or isinstance(indirect_reference, NullObject):
+        if is_null_or_none(indirect_reference):
             return None
+        assert isinstance(indirect_reference, (int, IndirectObject)), "mypy"
         if isinstance(indirect_reference, int):
             idnum = indirect_reference
         else:
diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index 1e6cb9e26..4d4cca329 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -107,6 +107,7 @@
     ViewerPreferences,
     create_string_object,
     hex_to_rgb,
+    is_null_or_none,
 )
 from .pagerange import PageRange, PageRangeSpec
 from .types import (
@@ -499,7 +500,7 @@ def _add_page(
             cast(ArrayObject, node[PA.KIDS]).append(page.indirect_reference)
             self.flattened_pages.append(page)
         cpt = 1000
-        while node is not None:
+        while not is_null_or_none(node):
             node = cast(DictionaryObject, node.get_object())
             node[NameObject(PA.COUNT)] = NumberObject(cast(int, node[PA.COUNT]) + 1)
             node = node.get(PA.PARENT, None)
@@ -612,8 +613,9 @@ def _get_page_number_by_indirect(
             The page number or None
         """
         # to provide same function as in PdfReader
-        if indirect_reference is None or isinstance(indirect_reference, NullObject):
+        if is_null_or_none(indirect_reference):
             return None
+        assert indirect_reference is not None, "mypy"
         if isinstance(indirect_reference, int):
             indirect_reference = IndirectObject(indirect_reference, 0, self)
         obj = indirect_reference.get_object()
@@ -928,7 +930,7 @@ def _update_field_annotation(
             )
             dr = dr.get_object().get("/Font", DictionaryObject()).get_object()
         font_res = dr.get(font_name, None)
-        if font_res is not None:
+        if not is_null_or_none(font_res):
             font_res = cast(DictionaryObject, font_res.get_object())
             font_subtype, _, font_encoding, font_map = build_char_map_from_dict(
                 200, font_res
@@ -1566,9 +1568,9 @@ def metadata(self) -> Optional[DocumentInformation]:
         Retrieve/set the PDF file's document information dictionary, if it exists.
 
         Args:
-            value: Dictionary with the entries to set. If None, remove the /Info entry from the PDF.
+            value: Dictionary with the entries to set. If None, the /Info entry is removed from the PDF.
 
-        Note that some PDF files use (XMP) metadata streams instead of document
+        Note that some PDF files use XMP metadata streams instead of document
         information dictionaries, and these metadata streams will not be
         accessed by this function.
         """
@@ -2981,7 +2983,7 @@ def _get_filtered_outline(
         if node is None:
             node = NullObject()
         node = node.get_object()
-        if node is None or isinstance(node, NullObject):
+        if is_null_or_none(node):
             node = DictionaryObject()
         if node.get("/Type", "") == "/Outlines" or "/Title" not in node:
             node = node.get("/First", None)
diff --git a/pypdf/filters.py b/pypdf/filters.py
index 7589c8051..e2fdd0d8c 100644
--- a/pypdf/filters.py
+++ b/pypdf/filters.py
@@ -746,9 +746,7 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes,
     )
 
     # for error reporting
-    if (
-        hasattr(x_object_obj, "indirect_reference") and x_object_obj is None
-    ):  # pragma: no cover
+    if getattr(x_object_obj, "indirect_reference", None) is not None:
         obj_as_text = x_object_obj.indirect_reference.__repr__()
     else:
         obj_as_text = x_object_obj.__repr__()
diff --git a/pypdf/generic/__init__.py b/pypdf/generic/__init__.py
index 63ccf1bdc..d9b0ea488 100644
--- a/pypdf/generic/__init__.py
+++ b/pypdf/generic/__init__.py
@@ -46,6 +46,7 @@
     PdfObject,
     TextStringObject,
     encode_pdfdocencoding,
+    is_null_or_none,
 )
 from ._data_structures import (
     ArrayObject,
@@ -235,6 +236,7 @@ def link(
     "encode_pdfdocencoding",
     "decode_pdfdocencoding",
     "hex_to_rgb",
+    "is_null_or_none",
     "read_hex_string_from_stream",
     "read_string_from_stream",
 ]
diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py
index d02a79810..fd7d1a8ff 100644
--- a/pypdf/generic/_base.py
+++ b/pypdf/generic/_base.py
@@ -214,6 +214,16 @@ def __repr__(self) -> str:
         return "NullObject"
 
 
+def is_null_or_none(x: Any) -> bool:
+    """
+    Returns:
+        True if x is None, or a PdfObject whose get_object() is a NullObject.
+    """
+    return x is None or (
+        isinstance(x, PdfObject) and isinstance(x.get_object(), NullObject)
+    )
+
+
 class BooleanObject(PdfObject):
     def __init__(self, value: Any) -> None:
         self.value = value
diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py
index 215f2c75e..cc4b4a032 100644
--- a/pypdf/generic/_data_structures.py
+++ b/pypdf/generic/_data_structures.py
@@ -79,6 +79,7 @@
     NumberObject,
     PdfObject,
     TextStringObject,
+    is_null_or_none,
 )
 from ._fit import Fit
 from ._image_inline import (
@@ -451,7 +452,7 @@ def xmp_metadata(self) -> Optional[XmpInformationProtocol]:
         from ..xmp import XmpInformation
 
         metadata = self.get("/Metadata", None)
-        if metadata is None:
+        if is_null_or_none(metadata):
             return None
         metadata = metadata.get_object()
 
@@ -651,7 +652,7 @@ def children(self) -> Iterable[Any]:
             if child == self[NameObject("/Last")]:
                 return
             child_ref = child.get(NameObject("/Next"))  # type: ignore
-            if child_ref is None:
+            if is_null_or_none(child_ref):
                 return
             child = child_ref.get_object()
 
@@ -661,8 +662,9 @@ def add_child(self, child: Any, pdf: PdfWriterProtocol) -> None:
     def inc_parent_counter_default(
         self, parent: Union[None, IndirectObject, "TreeObject"], n: int
     ) -> None:
-        if parent is None:
+        if is_null_or_none(parent):
             return
+        assert parent is not None, "mypy"
         parent = cast("TreeObject", parent.get_object())
         if "/Count" in parent:
             parent[NameObject("/Count")] = NumberObject(
@@ -673,8 +675,9 @@ def inc_parent_counter_default(
     def inc_parent_counter_outline(
         self, parent: Union[None, IndirectObject, "TreeObject"], n: int
     ) -> None:
-        if parent is None:
+        if is_null_or_none(parent):
             return
+        assert parent is not None, "mypy"
         parent = cast("TreeObject", parent.get_object())
         #  BooleanObject requires comparison with == not is
         opn = parent.get("/%is_open%", True) == True  # noqa
diff --git a/pypdf/generic/_fit.py b/pypdf/generic/_fit.py
index 4132f4b71..c44d12b4c 100644
--- a/pypdf/generic/_fit.py
+++ b/pypdf/generic/_fit.py
@@ -1,5 +1,7 @@
 from typing import Any, Optional, Tuple, Union
 
+from ._base import is_null_or_none
+
 
 class Fit:
     def __init__(
@@ -9,8 +11,7 @@ def __init__(
 
         self.fit_type = NameObject(fit_type)
         self.fit_args = [
-            NullObject() if a is None or isinstance(a, NullObject) else FloatObject(a)
-            for a in fit_args
+            NullObject() if is_null_or_none(a) else FloatObject(a) for a in fit_args
         ]
 
     @classmethod
diff --git a/pypdf/generic/_viewerpref.py b/pypdf/generic/_viewerpref.py
index a12f2d446..72f89d9ae 100644
--- a/pypdf/generic/_viewerpref.py
+++ b/pypdf/generic/_viewerpref.py
@@ -32,7 +32,7 @@
     Optional,
 )
 
-from ._base import BooleanObject, NameObject, NumberObject
+from ._base import BooleanObject, NameObject, NumberObject, is_null_or_none
 from ._data_structures import ArrayObject, DictionaryObject
 
 f_obj = BooleanObject(False)
@@ -156,8 +156,8 @@ def _add_prop_int(key: str, deft: Optional[int]) -> property:
 
     def __init__(self, obj: Optional[DictionaryObject] = None) -> None:
         super().__init__(self)
-        if obj is not None:
-            self.update(obj.items())
+        if not is_null_or_none(obj):
+            self.update(obj.items())  # type: ignore
         try:
             self.indirect_reference = obj.indirect_reference  # type: ignore
         except AttributeError:
diff --git a/tests/test_generic.py b/tests/test_generic.py
index a13aa7b09..d5fad26d7 100644
--- a/tests/test_generic.py
+++ b/tests/test_generic.py
@@ -33,6 +33,7 @@
     TreeObject,
     create_string_object,
     encode_pdfdocencoding,
+    is_null_or_none,
     read_hex_string_from_stream,
     read_object,
     read_string_from_stream,
@@ -1139,3 +1140,18 @@ def test_missing_hashbin():
     assert NullObject().hash_bin() == hash((NullObject,))
     t = ByteStringObject(b"123")
     assert t.hash_bin() == hash((ByteStringObject, b"123"))
+
+
+def test_is_null_or_none():
+    assert is_null_or_none(NullObject())
+    assert not is_null_or_none(PdfObject())
+
+    reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf")
+    # key that does not exist, fetched with get()
+    assert is_null_or_none(reader.root_object.get("/do_no_exist"))
+    # object unknown...
+    assert is_null_or_none(IndirectObject(99999, 0, reader).get_object())
+    # ... or which has been replaced with NullObject
+    writer = PdfWriter(reader)
+    writer.pages[0]["/Contents"].append(writer._add_object(NullObject()))
+    assert is_null_or_none(writer.pages[0]["/Contents"][-1])