From bc299012f5253f04c3d990c8d1b3eec97c7d2006 Mon Sep 17 00:00:00 2001 From: Stefan <96178532+stefan6419846@users.noreply.github.com> Date: Wed, 3 Apr 2024 21:20:46 +0200 Subject: [PATCH] ENH: Add support for /Kids in page labels (#2562) * ENH: Add support for /Kids in page labels --------- Co-authored-by: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> --- pypdf/_page_labels.py | 112 +++++++++++++++++++++++--------------- tests/test_page_labels.py | 51 +++++++++++++++++ 2 files changed, 119 insertions(+), 44 deletions(-) diff --git a/pypdf/_page_labels.py b/pypdf/_page_labels.py index 2befe4b23..916296c07 100644 --- a/pypdf/_page_labels.py +++ b/pypdf/_page_labels.py @@ -57,11 +57,11 @@ aa to zz for the next 26, and so on) """ -from typing import Iterator, Optional, Tuple, cast +from typing import Iterator, List, Optional, Tuple, cast from ._protocols import PdfCommonDocProtocol from ._utils import logger_warning -from .generic import ArrayObject, DictionaryObject, NumberObject +from .generic import ArrayObject, DictionaryObject, NullObject, NumberObject def number2uppercase_roman_numeral(num: int) -> str: @@ -116,6 +116,42 @@ def number2lowercase_letter(number: int) -> str: return number2uppercase_letter(number).lower() +def get_label_from_nums(dictionary_object: DictionaryObject, index: int) -> str: + # [Nums] shall be an array of the form + # [ key 1 value 1 key 2 value 2 ... key n value n ] + # where each key_i is an integer and the corresponding + # value_i shall be the object associated with that key. + # The keys shall be sorted in numerical order, + # analogously to the arrangement of keys in a name tree + # as described in 7.9.6, "Name Trees." + nums = cast(ArrayObject, dictionary_object["/Nums"]) + i = 0 + value = None + start_index = 0 + while i < len(nums): + start_index = nums[i] + value = nums[i + 1].get_object() + if i + 2 == len(nums): + break + if nums[i + 2] > index: + break + i += 2 + m = { + None: lambda n: "", + "/D": lambda n: str(n), + "/R": number2uppercase_roman_numeral, + "/r": number2lowercase_roman_numeral, + "/A": number2uppercase_letter, + "/a": number2lowercase_letter, + } + # if /Nums array is not following the specification or if /Nums is empty + if not isinstance(value, dict): + return str(index + 1) # Fallback + start = value.get("/St", 1) + prefix = value.get("/P", "") + return prefix + m[value.get("/S")](index - start_index + start) + + def index2label(reader: PdfCommonDocProtocol, index: int) -> str: """ See 7.9.7 "Number Trees". @@ -132,49 +168,37 @@ def index2label(reader: PdfCommonDocProtocol, index: int) -> str: return str(index + 1) # Fallback number_tree = cast(DictionaryObject, root["/PageLabels"].get_object()) if "/Nums" in number_tree: - # [Nums] shall be an array of the form - # [ key 1 value 1 key 2 value 2 ... key n value n ] - # where each key_i is an integer and the corresponding - # value_i shall be the object associated with that key. - # The keys shall be sorted in numerical order, - # analogously to the arrangement of keys in a name tree - # as described in 7.9.6, "Name Trees." - nums = cast(ArrayObject, number_tree["/Nums"]) - i = 0 - value = None - start_index = 0 - while i < len(nums): - start_index = nums[i] - value = nums[i + 1].get_object() - if i + 2 == len(nums): + return get_label_from_nums(number_tree, index) + if "/Kids" in number_tree and not isinstance(number_tree["/Kids"], NullObject): + # number_tree = {'/Kids': [IndirectObject(7333, 0, 140132998195856), ...]} + # Limit maximum depth. + level = 0 + while level < 100: + kids = cast(List[DictionaryObject], number_tree["/Kids"]) + for kid in kids: + # kid = {'/Limits': [0, 63], '/Nums': [0, {'/P': 'C1'}, ...]} + limits = cast(List[int], kid["/Limits"]) + if limits[0] <= index <= limits[1]: + if kid.get("/Kids", None) is not None: + # Recursive definition. + level += 1 + if level == 100: # pragma: no cover + raise NotImplementedError("Too deep nesting is not supported.") + number_tree = kid + # Exit the inner `for` loop and continue at the next level with the + # next iteration of the `while` loop. + break + return get_label_from_nums(kid, index) + else: + # When there are no kids, make sure to exit the `while` loop directly + # and continue with the fallback. break - if nums[i + 2] > index: - break - i += 2 - m = { - None: lambda n: "", - "/D": lambda n: str(n), - "/R": number2uppercase_roman_numeral, - "/r": number2lowercase_roman_numeral, - "/A": number2uppercase_letter, - "/a": number2lowercase_letter, - } - # if /Nums array is not following the specification or if /Nums is empty - if not isinstance(value, dict): - return str(index + 1) # Fallback - start = value.get("/St", 1) - prefix = value.get("/P", "") - return prefix + m[value.get("/S")](index - start_index + start) - if "/Kids" in number_tree or "/Limits" in number_tree: - logger_warning( - ( - "/Kids or /Limits found in PageLabels. " - "This is not yet supported." - ), - __name__, - ) - # TODO: Implement /Kids and /Limits for number tree - return str(index + 1) # Fallback if /Nums is not in the number_tree + + logger_warning( + f"Could not reliably determine page label for {index}.", + __name__ + ) + return str(index + 1) # Fallback if neither /Nums nor /Kids is in the number_tree def nums_insert( diff --git a/tests/test_page_labels.py b/tests/test_page_labels.py index 1eb6f6aab..03f8e156a 100644 --- a/tests/test_page_labels.py +++ b/tests/test_page_labels.py @@ -1,10 +1,12 @@ """Test the pypdf._page_labels module.""" from io import BytesIO +from pathlib import Path import pytest from pypdf import PdfReader from pypdf._page_labels import ( + get_label_from_nums, index2label, number2lowercase_letter, number2lowercase_roman_numeral, @@ -15,6 +17,7 @@ nums_next, ) from pypdf.generic import ( + ArrayObject, DictionaryObject, NameObject, NullObject, @@ -23,6 +26,10 @@ from . import get_data_from_url +TESTS_ROOT = Path(__file__).parent.resolve() +PROJECT_ROOT = TESTS_ROOT.parent +RESOURCE_ROOT = PROJECT_ROOT / "resources" + @pytest.mark.parametrize( ("number", "expected"), @@ -103,3 +110,47 @@ def test_index2label(caplog): r.trailer["/Root"]["/PageLabels"][NameObject("/Kids")] = NullObject() assert index2label(r, 1) == "2" assert caplog.text != "" + + +@pytest.mark.enable_socket() +def test_index2label_kids(): + url = "https://www.bk.admin.ch/dam/bk/de/dokumente/terminologie/publikation_25_jahre_rtd.pdf.download.pdf/Terminologie_Epochen,%20Schwerpunkte,%20Umsetzungen.pdf" # noqa: E501 + r = PdfReader(BytesIO(get_data_from_url(url=url, name="index2label_kids.pdf"))) + expected = [ + "C1", + "I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X", + "XI", "XII", "XIII", "XIV", "XV", "XVI", "XVII", + ] + list(map(str, range(1, 284))) + for x in ["20", "44", "58", "82", "94", "116", "154", "166", "192", "224", "250"]: + # Some page labels are unused. Removing them is still easier than copying the + # whole list itself here. + expected.remove(x) + assert r.page_labels == expected + + +@pytest.mark.enable_socket() +def test_index2label_kids__recursive(caplog): + url = "https://github.com/py-pdf/pypdf/files/14842446/tt1.pdf" + r = PdfReader(BytesIO(get_data_from_url(url=url, name="index2label_kids_recursive.pdf"))) + expected = [ + "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", + "M", "N", "O", "P", "17", "18", "19" + ] + assert r.page_labels == expected + assert caplog.text != "" + + +def test_get_label_from_nums__empty_nums_list(): + dictionary_object = DictionaryObject() + dictionary_object[NameObject("/Nums")] = ArrayObject() + assert get_label_from_nums(dictionary_object, 13) == "14" + + +def test_index2label__empty_kids_list(): + reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf") + number_tree = DictionaryObject() + number_tree[NameObject("/Kids")] = ArrayObject() + root = reader.root_object + root[NameObject("/PageLabels")] = number_tree + + assert index2label(reader, 42) == "43"