From 76831f154b24f046cc526ef26a5ef1d80112e16a Mon Sep 17 00:00:00 2001
From: Christine Straub
Date: Fri, 17 May 2024 13:55:11 -0700
Subject: [PATCH] refactor: `partition_pdf()` pass `kwargs` through `fast` strategy pipeline (#3040)

This PR aims to pass `kwargs` through the `fast` strategy pipeline, which was left out of the previous PR - https://github.com/Unstructured-IO/unstructured/pull/3030. I also did some code refactoring in this PR, so I recommend reviewing it commit by commit.

### Summary
- pass `kwargs` through the `fast` strategy pipeline, which will allow users to specify additional params like `sort_mode` (see the usage sketches after the diff)
- refactor: code reorganization
- cut a release for `0.14.0`

### Testing
CI should pass.

--- CHANGELOG.md | 2 +- .../partition/pdf_image/test_image.py | 20 +- .../partition/pdf_image/test_pdf.py | 4 +- unstructured/__version__.py | 2 +- unstructured/partition/pdf.py | 545 ++++++++---------- .../partition/pdf_image/pdf_image_utils.py | 60 +- 6 files changed, 320 insertions(+), 313 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c82d914043..122afe416d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.14.0-dev15 +## 0.14.0 ### BREAKING CHANGES diff --git a/test_unstructured/partition/pdf_image/test_image.py b/test_unstructured/partition/pdf_image/test_image.py index 790be5f5e3..7e5b59ab05 100644 --- a/test_unstructured/partition/pdf_image/test_image.py +++ b/test_unstructured/partition/pdf_image/test_image.py @@ -326,7 +326,7 @@ def test_partition_image_metadata_date( ): mocked_last_modification_date = "2029-07-05T09:24:28" mocker.patch( - "unstructured.partition.pdf.get_last_modified_date", + "unstructured.partition.pdf_image.pdf_image_utils.get_last_modified_date", return_value=mocked_last_modification_date, ) elements = image.partition_image(filename=filename) @@ -340,7 +340,7 @@ def test_partition_image_with_hi_res_strategy_metadata_date( ): mocked_last_modification_date = "2029-07-05T09:24:28" mocker.patch( - "unstructured.partition.pdf.get_last_modified_date", + "unstructured.partition.pdf_image.pdf_image_utils.get_last_modified_date", return_value=mocked_last_modification_date, ) elements = image.partition_image(filename=filename, strategy=PartitionStrategy.HI_RES) @@ -356,7 +356,7 @@ def test_partition_image_metadata_date_custom_metadata_date( expected_last_modification_date = "2009-07-05T09:24:28" mocker.patch( - "unstructured.partition.pdf.get_last_modified_date", + "unstructured.partition.pdf_image.pdf_image_utils.get_last_modified_date", return_value=mocked_last_modification_date, ) elements = image.partition_image( @@ -375,7 +375,7 @@ def test_partition_image_with_hi_res_strategy_metadata_date_custom_metadata_date expected_last_modification_date = "2009-07-05T09:24:28" mocker.patch( - "unstructured.partition.pdf.get_last_modified_date", + "unstructured.partition.pdf_image.pdf_image_utils.get_last_modified_date", return_value=mocked_last_modification_date, ) elements = image.partition_image( @@ -393,7 +393,7 @@ def test_partition_image_from_file_metadata_date( ): mocked_last_modification_date = "2029-07-05T09:24:28" mocker.patch( - "unstructured.partition.pdf.get_last_modified_date_from_file", + "unstructured.partition.pdf_image.pdf_image_utils.get_last_modified_date_from_file", return_value=mocked_last_modification_date, ) with open(filename, "rb") as f: @@ -408,7 +408,7 @@ def test_partition_image_from_file_explicit_get_metadata_date( ): mocked_last_modification_date = "2029-07-05T09:24:28" mocker.patch( -
"unstructured.partition.pdf.get_last_modified_date_from_file", + "unstructured.partition.pdf_image.pdf_image_utils.get_last_modified_date_from_file", return_value=mocked_last_modification_date, ) with open(filename, "rb") as f: @@ -423,7 +423,7 @@ def test_partition_image_from_file_with_hi_res_strategy_metadata_date( ): mocked_last_modification_date = "2029-07-05T09:24:28" mocker.patch( - "unstructured.partition.pdf.get_last_modified_date_from_file", + "unstructured.partition.pdf_image.pdf_image_utils.get_last_modified_date_from_file", return_value=mocked_last_modification_date, ) @@ -439,7 +439,7 @@ def test_partition_image_from_file_with_hi_res_strategy_explicit_get_metadata_da ): mocked_last_modification_date = "2029-07-05T09:24:28" mocker.patch( - "unstructured.partition.pdf.get_last_modified_date_from_file", + "unstructured.partition.pdf_image.pdf_image_utils.get_last_modified_date_from_file", return_value=mocked_last_modification_date, ) @@ -459,7 +459,7 @@ def test_partition_image_from_file_metadata_date_custom_metadata_date( expected_last_modification_date = "2009-07-05T09:24:28" mocker.patch( - "unstructured.partition.pdf.get_last_modified_date_from_file", + "unstructured.partition.pdf_image.pdf_image_utils.get_last_modified_date_from_file", return_value=mocked_last_modification_date, ) with open(filename, "rb") as f: @@ -479,7 +479,7 @@ def test_partition_image_from_file_with_hi_res_strategy_metadata_date_custom_met expected_last_modification_date = "2009-07-05T09:24:28" mocker.patch( - "unstructured.partition.pdf.get_last_modified_date_from_file", + "unstructured.partition.pdf_image.pdf_image_utils.get_last_modified_date_from_file", return_value=mocked_last_modification_date, ) with open(filename, "rb") as f: diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index 48a7ffd91d..8ada03d078 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -715,11 +715,11 @@ def test_partition_pdf_metadata_date( expected_last_modification_date = None mocker.patch( - "unstructured.partition.pdf.get_last_modified_date_from_file", + "unstructured.partition.pdf_image.pdf_image_utils.get_last_modified_date_from_file", return_value=mocked_last_modification_date, ) mocker.patch( - "unstructured.partition.pdf.get_last_modified_date", + "unstructured.partition.pdf_image.pdf_image_utils.get_last_modified_date", return_value=mocked_last_modification_date, ) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 1a54bfd152..75aa56beb6 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.14.0-dev15" # pragma: no cover +__version__ = "0.14.0" # pragma: no cover diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 78001f9dd4..ff03d4a7c6 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -7,10 +7,9 @@ import re import warnings from pathlib import Path -from typing import IO, TYPE_CHECKING, Any, Iterator, Optional, cast +from typing import IO, TYPE_CHECKING, Any, Optional, cast import numpy as np -import pdf2image import wrapt from pdfminer import psparser from pdfminer.layout import LTChar, LTContainer, LTImage, LTItem, LTTextBox @@ -41,11 +40,8 @@ from unstructured.logger import logger, trace_logger from unstructured.nlp.patterns import PARAGRAPH_PATTERN from unstructured.partition.common import ( - convert_to_bytes, document_to_element_list, 
exactly_one, - get_last_modified_date, - get_last_modified_date_from_file, ocr_data_to_elements, spooled_to_bytes_io_if_needed, ) @@ -54,6 +50,8 @@ from unstructured.partition.pdf_image.pdf_image_utils import ( annotate_layout_elements, check_element_types_to_extract, + convert_pdf_to_images, + get_the_last_modification_date_pdf_or_img, save_elements, ) from unstructured.partition.pdf_image.pdfminer_processing import ( @@ -311,6 +309,7 @@ def partition_pdf_or_image( out_elements = _partition_pdf_with_pdfparser( extracted_elements=extracted_elements, include_page_breaks=include_page_breaks, + **kwargs, ) return out_elements @@ -353,19 +352,133 @@ def extractable_elements( ) -def get_the_last_modification_date_pdf_or_img( - file: Optional[bytes | IO[bytes]] = None, - filename: Optional[str] = "", - date_from_file_object: bool = False, -) -> str | None: - last_modification_date = None - if not file and filename: - last_modification_date = get_last_modified_date(filename=filename) - elif not filename and file: - last_modification_date = ( - get_last_modified_date_from_file(file) if date_from_file_object else None +def _partition_pdf_with_pdfminer( + filename: str, + file: Optional[IO[bytes]], + languages: list[str], + metadata_last_modified: Optional[str], + starting_page_number: int = 1, + **kwargs: Any, +) -> list[list[Element]]: + """Partitions a PDF using PDFMiner instead of using a layoutmodel. Used for faster + processing or detectron2 is not available. + + Implementation is based on the `extract_text` implemenation in pdfminer.six, but + modified to support tracking page numbers and working with file-like objects. + + ref: https://github.com/pdfminer/pdfminer.six/blob/master/pdfminer/high_level.py + """ + if languages is None: + languages = ["eng"] + + exactly_one(filename=filename, file=file) + if filename: + with open_filename(filename, "rb") as fp: + fp = cast(IO[bytes], fp) + elements = _process_pdfminer_pages( + fp=fp, + filename=filename, + languages=languages, + metadata_last_modified=metadata_last_modified, + starting_page_number=starting_page_number, + **kwargs, + ) + + elif file: + elements = _process_pdfminer_pages( + fp=file, + filename=filename, + languages=languages, + metadata_last_modified=metadata_last_modified, + starting_page_number=starting_page_number, + **kwargs, ) - return last_modification_date + + return elements + + +@requires_dependencies("pdfminer") +def _process_pdfminer_pages( + fp: IO[bytes], + filename: str, + languages: list[str], + metadata_last_modified: Optional[str], + annotation_threshold: Optional[float] = env_config.PDF_ANNOTATION_THRESHOLD, + starting_page_number: int = 1, + **kwargs, +) -> list[list[Element]]: + """Uses PDFMiner to split a document into pages and process them.""" + + elements = [] + + for page_number, (page, page_layout) in enumerate( + open_pdfminer_pages_generator(fp), start=starting_page_number + ): + width, height = page_layout.width, page_layout.height + + page_elements: list[Element] = [] + annotation_list = [] + + coordinate_system = PixelSpace( + width=width, + height=height, + ) + if page.annots: + annotation_list = get_uris(page.annots, height, coordinate_system, page_number) + + for obj in page_layout: + x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height) + bbox = (x1, y1, x2, y2) + + urls_metadata: list[dict[str, Any]] = [] + + if len(annotation_list) > 0 and isinstance(obj, LTTextBox): + annotations_within_element = check_annotations_within_element( + annotation_list, + bbox, + page_number, + 
annotation_threshold, + ) + _, words = get_word_bounding_box_from_element(obj, height) + for annot in annotations_within_element: + urls_metadata.append(map_bbox_and_index(words, annot)) + + if hasattr(obj, "get_text"): + _text_snippets: list[str] = [obj.get_text()] + else: + _text = _extract_text(obj) + _text_snippets = re.split(PARAGRAPH_PATTERN, _text) + + for _text in _text_snippets: + _text, moved_indices = clean_extra_whitespace_with_index_run(_text) + if _text.strip(): + points = ((x1, y1), (x1, y2), (x2, y2), (x2, y1)) + element = element_from_text( + _text, + coordinates=points, + coordinate_system=coordinate_system, + ) + coordinates_metadata = CoordinatesMetadata( + points=points, + system=coordinate_system, + ) + links = _get_links_from_urls_metadata(urls_metadata, moved_indices) + + element.metadata = ElementMetadata( + filename=filename, + page_number=page_number, + coordinates=coordinates_metadata, + last_modified=metadata_last_modified, + links=links, + languages=languages, + ) + element.metadata.detection_origin = "pdfminer" + page_elements.append(element) + + page_elements = _combine_list_elements(page_elements, coordinate_system) + elements.append(page_elements) + + return elements @requires_dependencies("unstructured_inference") @@ -600,6 +713,124 @@ def _partition_pdf_or_image_local( return out_elements +def _partition_pdf_with_pdfparser( + extracted_elements: list[list[Element]], + include_page_breaks: bool = False, + sort_mode: str = SORT_MODE_XY_CUT, + **kwargs, +): + """Partitions a PDF using pdfparser.""" + elements = [] + + for page_elements in extracted_elements: + # NOTE(crag, christine): always do the basic sort first for deterministic order across + # python versions. + sorted_page_elements = sort_page_elements(page_elements, SORT_MODE_BASIC) + if sort_mode != SORT_MODE_BASIC: + sorted_page_elements = sort_page_elements(sorted_page_elements, sort_mode) + + elements += sorted_page_elements + + if include_page_breaks: + elements.append(PageBreak(text="")) + + return elements + + +def _partition_pdf_or_image_with_ocr( + filename: str = "", + file: Optional[bytes | IO[bytes]] = None, + include_page_breaks: bool = False, + languages: Optional[list[str]] = ["eng"], + is_image: bool = False, + metadata_last_modified: Optional[str] = None, + starting_page_number: int = 1, + **kwargs: Any, +): + """Partitions an image or PDF using OCR. 
For PDFs, each page is converted + to an image prior to processing.""" + + elements = [] + if is_image: + images = [] + image = PILImage.open(file) if file is not None else PILImage.open(filename) + images.append(image) + + for page_number, image in enumerate(images, start=starting_page_number): + page_elements = _partition_pdf_or_image_with_ocr_from_image( + image=image, + languages=languages, + page_number=page_number, + include_page_breaks=include_page_breaks, + metadata_last_modified=metadata_last_modified, + **kwargs, + ) + elements.extend(page_elements) + else: + for page_number, image in enumerate( + convert_pdf_to_images(filename, file), start=starting_page_number + ): + page_elements = _partition_pdf_or_image_with_ocr_from_image( + image=image, + languages=languages, + page_number=page_number, + include_page_breaks=include_page_breaks, + metadata_last_modified=metadata_last_modified, + **kwargs, + ) + elements.extend(page_elements) + + return elements + + +def _partition_pdf_or_image_with_ocr_from_image( + image: PILImage.Image, + languages: Optional[list[str]] = None, + page_number: int = 1, + include_page_breaks: bool = False, + metadata_last_modified: Optional[str] = None, + sort_mode: str = SORT_MODE_XY_CUT, + **kwargs: Any, +) -> list[Element]: + """Extract `unstructured` elements from an image using OCR and perform partitioning.""" + + from unstructured.partition.utils.ocr_models.ocr_interface import OCRAgent + + ocr_agent = OCRAgent.get_agent() + ocr_languages = prepare_languages_for_tesseract(languages) + + # NOTE(christine): `unstructured_pytesseract.image_to_string()` returns sorted text + if ocr_agent.is_text_sorted(): + sort_mode = SORT_MODE_DONT + + ocr_data = ocr_agent.get_layout_elements_from_image( + image=image, + ocr_languages=ocr_languages, + ) + + metadata = ElementMetadata( + last_modified=metadata_last_modified, + filetype=image.format, + page_number=page_number, + languages=languages, + ) + + page_elements = ocr_data_to_elements( + ocr_data, + image_size=image.size, + common_metadata=metadata, + ) + + sorted_page_elements = page_elements + if sort_mode != SORT_MODE_DONT: + sorted_page_elements = sort_page_elements(page_elements, sort_mode) + + if include_page_breaks: + sorted_page_elements.append(PageBreak(text="")) + + return page_elements + + def _process_uncategorized_text_elements(elements: list[Element]): """Processes a list of elements, creating a new list where elements with the category `UncategorizedText` are replaced with corresponding @@ -617,51 +848,6 @@ def _process_uncategorized_text_elements(elements: list[Element]): return out_elements -def _partition_pdf_with_pdfminer( - filename: str, - file: Optional[IO[bytes]], - languages: list[str], - metadata_last_modified: Optional[str], - starting_page_number: int = 1, - **kwargs: Any, -) -> list[list[Element]]: - """Partitions a PDF using PDFMiner instead of using a layoutmodel. Used for faster - processing or detectron2 is not available. - - Implementation is based on the `extract_text` implemenation in pdfminer.six, but - modified to support tracking page numbers and working with file-like objects. 
- - ref: https://github.com/pdfminer/pdfminer.six/blob/master/pdfminer/high_level.py - """ - if languages is None: - languages = ["eng"] - - exactly_one(filename=filename, file=file) - if filename: - with open_filename(filename, "rb") as fp: - fp = cast(IO[bytes], fp) - elements = _process_pdfminer_pages( - fp=fp, - filename=filename, - languages=languages, - metadata_last_modified=metadata_last_modified, - starting_page_number=starting_page_number, - **kwargs, - ) - - elif file: - elements = _process_pdfminer_pages( - fp=file, - filename=filename, - languages=languages, - metadata_last_modified=metadata_last_modified, - starting_page_number=starting_page_number, - **kwargs, - ) - - return elements - - def _extract_text(item: LTItem) -> str: """Recursively extracts text from PDFMiner objects to account for scenarios where the text is in a sub-container.""" @@ -694,90 +880,6 @@ def pdfminer_interpreter_init_resources(wrapped, instance, args, kwargs): return wrapped(resources) -@requires_dependencies("pdfminer") -def _process_pdfminer_pages( - fp: IO[bytes], - filename: str, - languages: list[str], - metadata_last_modified: Optional[str], - annotation_threshold: Optional[float] = env_config.PDF_ANNOTATION_THRESHOLD, - starting_page_number: int = 1, - **kwargs, -) -> list[list[Element]]: - """Uses PDFMiner to split a document into pages and process them.""" - - elements = [] - - for page_number, (page, page_layout) in enumerate( - open_pdfminer_pages_generator(fp), start=starting_page_number - ): - width, height = page_layout.width, page_layout.height - - page_elements: list[Element] = [] - annotation_list = [] - - coordinate_system = PixelSpace( - width=width, - height=height, - ) - if page.annots: - annotation_list = get_uris(page.annots, height, coordinate_system, page_number) - - for obj in page_layout: - x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height) - bbox = (x1, y1, x2, y2) - - urls_metadata: list[dict[str, Any]] = [] - - if len(annotation_list) > 0 and isinstance(obj, LTTextBox): - annotations_within_element = check_annotations_within_element( - annotation_list, - bbox, - page_number, - annotation_threshold, - ) - _, words = get_word_bounding_box_from_element(obj, height) - for annot in annotations_within_element: - urls_metadata.append(map_bbox_and_index(words, annot)) - - if hasattr(obj, "get_text"): - _text_snippets: list[str] = [obj.get_text()] - else: - _text = _extract_text(obj) - _text_snippets = re.split(PARAGRAPH_PATTERN, _text) - - for _text in _text_snippets: - _text, moved_indices = clean_extra_whitespace_with_index_run(_text) - if _text.strip(): - points = ((x1, y1), (x1, y2), (x2, y2), (x2, y1)) - element = element_from_text( - _text, - coordinates=points, - coordinate_system=coordinate_system, - ) - coordinates_metadata = CoordinatesMetadata( - points=points, - system=coordinate_system, - ) - links = _get_links_from_urls_metadata(urls_metadata, moved_indices) - - element.metadata = ElementMetadata( - filename=filename, - page_number=page_number, - coordinates=coordinates_metadata, - last_modified=metadata_last_modified, - links=links, - languages=languages, - ) - element.metadata.detection_origin = "pdfminer" - page_elements.append(element) - - page_elements = _combine_list_elements(page_elements, coordinate_system) - elements.append(page_elements) - - return elements - - def _combine_list_elements( elements: list[Element], coordinate_system: PixelSpace | PointSpace ) -> list[Element]: @@ -854,157 +956,6 @@ def _combine_coordinates_into_element1( return 
copy.deepcopy(element1) -def _partition_pdf_with_pdfparser( - extracted_elements: list[list[Element]], - include_page_breaks: bool = False, - sort_mode: str = SORT_MODE_XY_CUT, -): - """Partitions a PDF using pdfparser.""" - elements = [] - - for page_elements in extracted_elements: - # NOTE(crag, christine): always do the basic sort first for deterministic order across - # python versions. - sorted_page_elements = sort_page_elements(page_elements, SORT_MODE_BASIC) - if sort_mode != SORT_MODE_BASIC: - sorted_page_elements = sort_page_elements(sorted_page_elements, sort_mode) - - elements += sorted_page_elements - - if include_page_breaks: - elements.append(PageBreak(text="")) - - return elements - - -def convert_pdf_to_images( - filename: str = "", - file: Optional[bytes | IO[bytes]] = None, - chunk_size: int = 10, -) -> Iterator[PILImage.Image]: - # Convert a PDF in small chunks of pages at a time (e.g. 1-10, 11-20... and so on) - exactly_one(filename=filename, file=file) - if file is not None: - f_bytes = convert_to_bytes(file) - info = pdf2image.pdfinfo_from_bytes(f_bytes) - else: - f_bytes = None - info = pdf2image.pdfinfo_from_path(filename) - - total_pages = info["Pages"] - for start_page in range(1, total_pages + 1, chunk_size): - end_page = min(start_page + chunk_size - 1, total_pages) - if f_bytes is not None: - chunk_images = pdf2image.convert_from_bytes( - f_bytes, - first_page=start_page, - last_page=end_page, - ) - else: - chunk_images = pdf2image.convert_from_path( - filename, - first_page=start_page, - last_page=end_page, - ) - - for image in chunk_images: - yield image - - -def _partition_pdf_or_image_with_ocr( - filename: str = "", - file: Optional[bytes | IO[bytes]] = None, - include_page_breaks: bool = False, - languages: Optional[list[str]] = ["eng"], - is_image: bool = False, - metadata_last_modified: Optional[str] = None, - starting_page_number: int = 1, - **kwargs: Any, -): - """Partitions an image or PDF using OCR. 
For PDFs, each page is converted - to an image prior to processing.""" - - elements = [] - if is_image: - images = [] - image = PILImage.open(file) if file is not None else PILImage.open(filename) - images.append(image) - - for page_number, image in enumerate(images, start=starting_page_number): - page_elements = _partition_pdf_or_image_with_ocr_from_image( - image=image, - languages=languages, - page_number=page_number, - include_page_breaks=include_page_breaks, - metadata_last_modified=metadata_last_modified, - **kwargs, - ) - elements.extend(page_elements) - else: - for page_number, image in enumerate( - convert_pdf_to_images(filename, file), start=starting_page_number - ): - page_elements = _partition_pdf_or_image_with_ocr_from_image( - image=image, - languages=languages, - page_number=page_number, - include_page_breaks=include_page_breaks, - metadata_last_modified=metadata_last_modified, - **kwargs, - ) - elements.extend(page_elements) - - return elements - - -def _partition_pdf_or_image_with_ocr_from_image( - image: PILImage.Image, - languages: Optional[list[str]] = None, - page_number: int = 1, - include_page_breaks: bool = False, - metadata_last_modified: Optional[str] = None, - sort_mode: str = SORT_MODE_XY_CUT, - **kwargs: Any, -) -> list[Element]: - """Extract `unstructured` elements from an image using OCR and perform partitioning.""" - - from unstructured.partition.utils.ocr_models.ocr_interface import OCRAgent - - ocr_agent = OCRAgent.get_agent() - ocr_languages = prepare_languages_for_tesseract(languages) - - # NOTE(christine): `unstructured_pytesseract.image_to_string()` returns sorted text - if ocr_agent.is_text_sorted(): - sort_mode = SORT_MODE_DONT - - ocr_data = ocr_agent.get_layout_elements_from_image( - image=image, - ocr_languages=ocr_languages, - ) - - metadata = ElementMetadata( - last_modified=metadata_last_modified, - filetype=image.format, - page_number=page_number, - languages=languages, - ) - - page_elements = ocr_data_to_elements( - ocr_data, - image_size=image.size, - common_metadata=metadata, - ) - - sorted_page_elements = page_elements - if sort_mode != SORT_MODE_DONT: - sorted_page_elements = sort_page_elements(page_elements, sort_mode) - - if include_page_breaks: - sorted_page_elements.append(PageBreak(text="")) - - return page_elements - - def check_coords_within_boundary( coordinates: CoordinatesMetadata, boundary: CoordinatesMetadata, diff --git a/unstructured/partition/pdf_image/pdf_image_utils.py b/unstructured/partition/pdf_image/pdf_image_utils.py index e57a97e6ea..891652deaf 100644 --- a/unstructured/partition/pdf_image/pdf_image_utils.py +++ b/unstructured/partition/pdf_image/pdf_image_utils.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import base64 import os import re @@ -5,7 +7,7 @@ from copy import deepcopy from io import BytesIO from pathlib import Path, PurePath -from typing import TYPE_CHECKING, BinaryIO, List, Optional, Tuple, Union, cast +from typing import IO, TYPE_CHECKING, BinaryIO, Iterator, List, Optional, Tuple, Union, cast import cv2 import numpy as np @@ -14,7 +16,12 @@ from unstructured.documents.elements import ElementType from unstructured.logger import logger -from unstructured.partition.common import convert_to_bytes +from unstructured.partition.common import ( + convert_to_bytes, + exactly_one, + get_last_modified_date, + get_last_modified_date_from_file, +) from unstructured.partition.utils.config import env_config if TYPE_CHECKING: @@ -364,3 +371,52 @@ def annotate_layout_elements( raise e else: raise 
FileNotFoundError(f'File "{filename}" not found!') from e + + +def convert_pdf_to_images( + filename: str = "", + file: Optional[bytes | IO[bytes]] = None, + chunk_size: int = 10, +) -> Iterator[Image.Image]: + # Convert a PDF in small chunks of pages at a time (e.g. 1-10, 11-20... and so on) + exactly_one(filename=filename, file=file) + if file is not None: + f_bytes = convert_to_bytes(file) + info = pdf2image.pdfinfo_from_bytes(f_bytes) + else: + f_bytes = None + info = pdf2image.pdfinfo_from_path(filename) + + total_pages = info["Pages"] + for start_page in range(1, total_pages + 1, chunk_size): + end_page = min(start_page + chunk_size - 1, total_pages) + if f_bytes is not None: + chunk_images = pdf2image.convert_from_bytes( + f_bytes, + first_page=start_page, + last_page=end_page, + ) + else: + chunk_images = pdf2image.convert_from_path( + filename, + first_page=start_page, + last_page=end_page, + ) + + for image in chunk_images: + yield image + + +def get_the_last_modification_date_pdf_or_img( + file: Optional[bytes | IO[bytes]] = None, + filename: Optional[str] = "", + date_from_file_object: bool = False, +) -> str | None: + last_modification_date = None + if not file and filename: + last_modification_date = get_last_modified_date(filename=filename) + elif not filename and file: + last_modification_date = ( + get_last_modified_date_from_file(file) if date_from_file_object else None + ) + return last_modification_date
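### Usage sketches

The first Summary bullet is the behavioral change in this patch: `partition_pdf_or_image()` now forwards `**kwargs` into `_partition_pdf_with_pdfparser()`, so extra parameters such as `sort_mode` reach the `fast` pipeline instead of being dropped. Below is a minimal usage sketch of what that enables; the sample path is a placeholder, and it assumes the sort-mode constants live in `unstructured.partition.utils.constants` as in the release this patch cuts.

```python
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.utils.constants import SORT_MODE_BASIC  # assumed location of the constant

# With this patch, `sort_mode` travels through the "fast" strategy pipeline
# (partition_pdf -> partition_pdf_or_image -> _partition_pdf_with_pdfparser),
# so page elements are ordered with the basic sort instead of the default
# xy-cut sort. "example.pdf" is a placeholder filename.
elements = partition_pdf(
    filename="example.pdf",
    strategy="fast",
    sort_mode=SORT_MODE_BASIC,
)

for element in elements[:5]:
    print(type(element).__name__, "-", element.text)
```

Before this change, the same call would silently ignore `sort_mode`, because `partition_pdf_or_image()` did not pass `**kwargs` on to `_partition_pdf_with_pdfparser()`.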
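The chunked PDF-to-image helper now lives in `pdf_image_utils.py` rather than `pdf.py`. It rasterizes pages in batches of `chunk_size` (default 10) and yields them one at a time, so a long PDF never has to be held in memory as images all at once. A sketch of calling it from its new location; the filename is a placeholder.

```python
from unstructured.partition.pdf_image.pdf_image_utils import convert_pdf_to_images

# convert_pdf_to_images() asks pdf2image for pages 1-10, 11-20, ... and yields
# each PIL image lazily; "report.pdf" is a placeholder path.
for page_number, image in enumerate(convert_pdf_to_images(filename="report.pdf"), start=1):
    print(f"page {page_number}: {image.size[0]}x{image.size[1]} px")
```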
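Because `get_the_last_modification_date_pdf_or_img()` (and with it the lookups of `get_last_modified_date` / `get_last_modified_date_from_file`) moved into `pdf_image_utils.py`, the metadata-date tests now patch `unstructured.partition.pdf_image.pdf_image_utils.*` instead of `unstructured.partition.pdf.*`. A pytest-mock sketch of the new patch target, mirroring the updated tests; the fixture path and assertion are illustrative, not taken from this diff.

```python
from unstructured.partition.pdf import partition_pdf


def test_partition_pdf_uses_relocated_last_modified_helper(mocker):
    # The date helper is now resolved in pdf_image_utils, so that module is the
    # one to patch. The fixture file path below is a placeholder.
    mocker.patch(
        "unstructured.partition.pdf_image.pdf_image_utils.get_last_modified_date",
        return_value="2029-07-05T09:24:28",
    )
    elements = partition_pdf(filename="example-docs/sample.pdf", strategy="fast")
    assert all(e.metadata.last_modified == "2029-07-05T09:24:28" for e in elements)
```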