diff --git a/docling/backend/abstract_backend.py b/docling/backend/abstract_backend.py index 2074b94c..36f61191 100644 --- a/docling/backend/abstract_backend.py +++ b/docling/backend/abstract_backend.py @@ -39,8 +39,9 @@ def unload(self): class PdfDocumentBackend(ABC): @abstractmethod - def __init__(self, path_or_stream: Union[BytesIO, Path]): - pass + def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str): + self.path_or_stream = path_or_stream + self.document_hash = document_hash @abstractmethod def load_page(self, page_no: int) -> PdfPageBackend: @@ -56,4 +57,7 @@ def is_valid(self) -> bool: @abstractmethod def unload(self): - pass + if isinstance(self.path_or_stream, BytesIO): + self.path_or_stream.close() + + self.path_or_stream = None diff --git a/docling/backend/docling_parse_backend.py b/docling/backend/docling_parse_backend.py index add8198e..18f6c69e 100644 --- a/docling/backend/docling_parse_backend.py +++ b/docling/backend/docling_parse_backend.py @@ -1,6 +1,5 @@ import logging import random -import time from io import BytesIO from pathlib import Path from typing import Iterable, Optional, Union @@ -17,11 +16,14 @@ class DoclingParsePageBackend(PdfPageBackend): - def __init__(self, page_obj: PdfPage, docling_page_obj): + def __init__( + self, parser: pdf_parser, document_hash: str, page_no: int, page_obj: PdfPage + ): super().__init__(page_obj) self._ppage = page_obj - self._dpage = docling_page_obj - self.text_page = None + + parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no) + self._dpage = parsed_page["pages"][0] def get_text_in_rect(self, bbox: BoundingBox) -> str: # Find intersecting cells on the page @@ -168,38 +170,39 @@ def get_size(self) -> PageSize: def unload(self): self._ppage = None self._dpage = None - self.text_page = None class DoclingParseDocumentBackend(PdfDocumentBackend): - def __init__(self, path_or_stream: Union[BytesIO, Path]): - super().__init__(path_or_stream) - self._pdoc = pdfium.PdfDocument(path_or_stream) - # Parsing cells with docling_parser call - parser = pdf_parser() + def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str): + super().__init__(path_or_stream, document_hash) - start_pb_time = time.time() + self._pdoc = pdfium.PdfDocument(path_or_stream) + self.parser = pdf_parser() + success = False if isinstance(path_or_stream, BytesIO): - self._parser_doc = parser.find_cells_from_bytesio(path_or_stream) - else: - self._parser_doc = parser.find_cells(str(path_or_stream)) + success = self.parser.load_document_from_bytesio( + document_hash, path_or_stream + ) + elif isinstance(path_or_stream, Path): + success = self.parser.load_document(document_hash, str(path_or_stream)) - end_pb_time = time.time() - start_pb_time - _log.info(f"Time to parse with docling-parse: time={end_pb_time:.3f}") + if not success: + raise RuntimeError("docling-parse could not load this document.") def page_count(self) -> int: - return len(self._parser_doc["pages"]) + return len(self._pdoc) # To be replaced with docling-parse API def load_page(self, page_no: int) -> DoclingParsePageBackend: return DoclingParsePageBackend( - self._pdoc[page_no], self._parser_doc["pages"][page_no] + self.parser, self.document_hash, page_no, self._pdoc[page_no] ) def is_valid(self) -> bool: return self.page_count() > 0 def unload(self): + super().unload() + self.parser.unload_document(self.document_hash) self._pdoc.close() self._pdoc = None - self._parser_doc = None diff --git a/docling/backend/pypdfium2_backend.py b/docling/backend/pypdfium2_backend.py index 2d0621bb..56758b1d 100644 --- a/docling/backend/pypdfium2_backend.py +++ b/docling/backend/pypdfium2_backend.py @@ -215,8 +215,8 @@ def unload(self): class PyPdfiumDocumentBackend(PdfDocumentBackend): - def __init__(self, path_or_stream: Union[BytesIO, Path]): - super().__init__(path_or_stream) + def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str): + super().__init__(path_or_stream, document_hash) self._pdoc = pdfium.PdfDocument(path_or_stream) def page_count(self) -> int: @@ -229,5 +229,6 @@ def is_valid(self) -> bool: return self.page_count() > 0 def unload(self): + super().unload() self._pdoc.close() self._pdoc = None diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index fe19afbc..5726b76d 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -79,7 +79,9 @@ def __init__( self.valid = False else: self.document_hash = create_file_hash(path_or_stream) - self._backend = pdf_backend(path_or_stream=path_or_stream) + self._backend = pdf_backend( + path_or_stream=path_or_stream, document_hash=self.document_hash + ) elif isinstance(path_or_stream, BytesIO): self.file = PurePath(filename) @@ -89,7 +91,9 @@ def __init__( self.valid = False else: self.document_hash = create_file_hash(path_or_stream) - self._backend = pdf_backend(path_or_stream=path_or_stream) + self._backend = pdf_backend( + path_or_stream=path_or_stream, document_hash=self.document_hash + ) if self.document_hash and self._backend.page_count() > 0: self.page_count = self._backend.page_count() diff --git a/docling/document_converter.py b/docling/document_converter.py index 71c42ee8..8b1b0e15 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -141,6 +141,8 @@ def process_document(self, in_doc: InputDocument) -> ConvertedDocument: start_doc_time = time.time() converted_doc = ConvertedDocument(input=in_doc) + _log.info(f"Processing document {in_doc.file.name}") + if not in_doc.valid: converted_doc.status = ConversionStatus.FAILURE return converted_doc diff --git a/examples/batch_convert.py b/examples/batch_convert.py index d3d0e282..76bbdcd4 100644 --- a/examples/batch_convert.py +++ b/examples/batch_convert.py @@ -1,10 +1,15 @@ import json import logging import time +from io import BytesIO from pathlib import Path from typing import Iterable -from docling.datamodel.base_models import ConversionStatus +from docling.datamodel.base_models import ( + ConversionStatus, + DocumentStream, + PipelineOptions, +) from docling.datamodel.document import ConvertedDocument, DocumentConversionInput from docling.document_converter import DocumentConverter @@ -52,7 +57,11 @@ def main(): Path("./test/data/redp5695.pdf"), ] - doc_converter = DocumentConverter() + # buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read()) + # docs = [DocumentStream(filename="my_doc.pdf", stream=buf)] + # input = DocumentConversionInput.from_streams(docs) + + doc_converter = DocumentConverter(pipeline_options=PipelineOptions(do_ocr=False)) input = DocumentConversionInput.from_paths(input_doc_paths) diff --git a/poetry.lock b/poetry.lock index 4b24f128..5aa8f20a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -822,35 +822,34 @@ tqdm = ">=4.64.0,<5.0.0" [[package]] name = "docling-parse" -version = "0.2.0" +version = "1.0.0" description = "Simple package to extract text with coordinates from programmatic PDFs" optional = false python-versions = "<4.0,>=3.9" files = [ - {file = "docling_parse-0.2.0-cp310-cp310-macosx_13_6_arm64.whl", hash = "sha256:3ec6458d36bd33862ae1ca38accbcd2ddc8a881fb5a3ab0aeb9e023bc20d8e04"}, - {file = "docling_parse-0.2.0-cp310-cp310-macosx_13_6_x86_64.whl", hash = "sha256:898ee83f1e6f97dd34362948fcc70753fa95c83f77eddf48de5e352db10402f7"}, - {file = "docling_parse-0.2.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:9247e6902f979d23860e4b819b0145a9f55be78b14cf2906ac98f8fb0e9627cd"}, - {file = "docling_parse-0.2.0-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:ebd0f091bdb106f1c3f72448aedfee52a904cb01e4de73827446e30fc3ac3b54"}, - {file = "docling_parse-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9846bd3347a41337d6e83d7fbfbc636274ed3863ac375f4ca5eac1ea0eb88b8f"}, - {file = "docling_parse-0.2.0-cp311-cp311-macosx_13_6_arm64.whl", hash = "sha256:b71b0f9bfe033f9c872eb8298cd1cf5420b5cad74708ae2008257202fe1218a6"}, - {file = "docling_parse-0.2.0-cp311-cp311-macosx_13_6_x86_64.whl", hash = "sha256:aa0e840a9007c673f9fededf04e2372b3d1bde7c6360ac7d1b49a78ad58145f8"}, - {file = "docling_parse-0.2.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:66e622564073fe5dce4b104b5c80cafea2ae1114efa886ef0bc0f1b1488163a9"}, - {file = "docling_parse-0.2.0-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:96e5c6b1d4f7df936b2461908e99eb5fe756486d6414de71bced8324f4ce2108"}, - {file = "docling_parse-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5aeaec873f8f3f8549a2511a321cfb3dc9958d9731f538e2c619fba41eea98c5"}, - {file = "docling_parse-0.2.0-cp312-cp312-macosx_13_6_arm64.whl", hash = "sha256:f3e917407a6eb4e71ce4b82ca7aefb9366e750d526011554f9aeae33fdfd53d5"}, - {file = "docling_parse-0.2.0-cp312-cp312-macosx_13_6_x86_64.whl", hash = "sha256:0e4dde0bcffe59c7e1b9f2146eac2789f6a350571f66de5f4c58e8bf031ad5f6"}, - {file = "docling_parse-0.2.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:12f393a0cba357016e8704e6836e553506b893d5ba16f19e47b0d201c8f6dc6d"}, - {file = "docling_parse-0.2.0-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:e07f6439fbb53c3898cd24d7d6628dcc514097314eac4832b095291dbd9c23e0"}, - {file = "docling_parse-0.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cea14f84e196d01f5ae77f59bc9640c488fde9a4eaf25433a7372794ca9433fc"}, - {file = "docling_parse-0.2.0-cp39-cp39-macosx_13_6_arm64.whl", hash = "sha256:1d7b7dc072d029869387c2ec8f2d816d066a62d79f18d5c6d037b19b1cda07c6"}, - {file = "docling_parse-0.2.0-cp39-cp39-macosx_13_6_x86_64.whl", hash = "sha256:acff58ac3ae9c1198956e9dd566949e4ea06c130f9e0050b2a88c7150716fd4f"}, - {file = "docling_parse-0.2.0-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:06c688993087b763e7aaa10a8282b2cbe615b6c68540f3538998a6bc85f944f0"}, - {file = "docling_parse-0.2.0-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:179595753f74d121ad21e4d422e4360a5e54a36c48def130d7d93886807fcdac"}, - {file = "docling_parse-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:08be7f229bbf4b89d2dba77a80939f6dbdc3a434a26342a6380dc40e25e69fcb"}, + {file = "docling_parse-1.0.0-cp310-cp310-macosx_13_6_arm64.whl", hash = "sha256:068db83a192b21783cc7bc66e9d3efb9072a57edeb8c07ef1a83a93353efcc36"}, + {file = "docling_parse-1.0.0-cp310-cp310-macosx_13_6_x86_64.whl", hash = "sha256:f57f9bba3ac6a81fc30c34bb08261d7308b0a780d90cbee903821aec2f5fbd88"}, + {file = "docling_parse-1.0.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:ae02643485eb28cb54bac8523243a536751c561dddd86846a8dd9b3804a3c491"}, + {file = "docling_parse-1.0.0-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:01cbb011a337bc4dcdddb281841378af36cbce0898bdf528543c7c54d66e6ecc"}, + {file = "docling_parse-1.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fdf142dea82f0a5f5e1bcaa74cc9feeda12899077589e3eb6c728d334b43cdda"}, + {file = "docling_parse-1.0.0-cp311-cp311-macosx_13_6_arm64.whl", hash = "sha256:8834a8387a55b4082c20da184e7d09f705c17558c465da9a5f35974b19013fe5"}, + {file = "docling_parse-1.0.0-cp311-cp311-macosx_13_6_x86_64.whl", hash = "sha256:4d1cfe98a7594fac3c7afd8fb08b28e4b1aba8b317e60cc64a85fb19043230b0"}, + {file = "docling_parse-1.0.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:f5da27cd03f1ba8859ebde525db388dd1d862be2712f38a13b6985f95061280c"}, + {file = "docling_parse-1.0.0-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:8aa6bdda40483af52591bdff11a578837eb4d6be51c12d44b4e489f520757ae6"}, + {file = "docling_parse-1.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a5c4b80a8d5e8f832910f32188501a9a6718a0223fb9921ee7cc5cfe62adb857"}, + {file = "docling_parse-1.0.0-cp312-cp312-macosx_13_6_arm64.whl", hash = "sha256:c86b263b4b089c3a71cde2a4fb8314614350dd76b3769b0950b371c2964e10d6"}, + {file = "docling_parse-1.0.0-cp312-cp312-macosx_13_6_x86_64.whl", hash = "sha256:93ef15628d663c036d48d466bf3de7c90a172cf52ba11883990640c758331720"}, + {file = "docling_parse-1.0.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:37218472773ed94b8ed07eeccfa68457f064227759350404fea5f45c311242a7"}, + {file = "docling_parse-1.0.0-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:9f863d9788c62dd34b2cdfd79480785e9a6bb382144b630ceb8b527aaee56351"}, + {file = "docling_parse-1.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0358eb13822ce2120362d6e7d63eb80a50d819b5bed5a2ccb7bd9beee4d83a61"}, + {file = "docling_parse-1.0.0-cp39-cp39-macosx_13_6_arm64.whl", hash = "sha256:5651185fbec4357b7638e1a39a0854a712a0cc74d6644518e64f066ce38ed976"}, + {file = "docling_parse-1.0.0-cp39-cp39-macosx_13_6_x86_64.whl", hash = "sha256:d5efedf361b4c58e372d355c0bb3fa5a20dcd3d002952ccbafb09580a924f426"}, + {file = "docling_parse-1.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d4a67df4699b4ffc2b01e77395ef35843ab23f40ac62bcdf593b6cc1f443eca6"}, ] [package.dependencies] cibuildwheel = ">=2.20.0,<3.0.0" +tabulate = ">=0.9.0,<1.0.0" [[package]] name = "docutils" @@ -5142,4 +5141,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "4b0af4695af17ce1cdbcd04b4c29360cacd866acc77b5a0529749651ee633323" +content-hash = "98d40c4d763018d5aa79b8c0ec00adac2fc06a036a9850b60f8ecce14db7cbcc" diff --git a/pyproject.toml b/pyproject.toml index 13a33b1b..d2974fd0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ pydantic-settings = "^2.3.0" huggingface_hub = ">=0.23,<1" requests = "^2.32.3" easyocr = "^1.7" -docling-parse = "^0.2.0" +docling-parse = "^1.0.0" certifi = ">=2024.7.4" rtree = "^1.3.0" scipy = "^1.14.1"