feat: Upgrade docling-parse PDF backend and interface to use page-by-page parsing (#44)

* Use docling-parse page-by-page

Signed-off-by: Christoph Auer <[email protected]>

* Propagate document_hash to PDF backends, use docling-parse 1.0.0

Signed-off-by: Christoph Auer <[email protected]>

* Upgrade lockfile

Signed-off-by: Christoph Auer <[email protected]>

* repin after more packages on pypi

Signed-off-by: Michele Dolfi <[email protected]>

---------

Signed-off-by: Christoph Auer <[email protected]>
Signed-off-by: Michele Dolfi <[email protected]>
Co-authored-by: Michele Dolfi <[email protected]>
DS4SD · Aug 22, 2024 · a8c6b29 · a8c6b29
1 parent f7c50c8
commit a8c6b29
Show file tree

Hide file tree

Showing 8 changed files with 73 additions and 51 deletions.
diff --git a/docling/backend/abstract_backend.py b/docling/backend/abstract_backend.py
@@ -39,8 +39,9 @@ def unload(self):
 
 class PdfDocumentBackend(ABC):
     @abstractmethod
-    def __init__(self, path_or_stream: Union[BytesIO, Path]):
-        pass
+    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
+        self.path_or_stream = path_or_stream
+        self.document_hash = document_hash
 
     @abstractmethod
     def load_page(self, page_no: int) -> PdfPageBackend:
@@ -56,4 +57,7 @@ def is_valid(self) -> bool:
 
     @abstractmethod
     def unload(self):
-        pass
+        if isinstance(self.path_or_stream, BytesIO):
+            self.path_or_stream.close()
+
+        self.path_or_stream = None
diff --git a/docling/backend/docling_parse_backend.py b/docling/backend/docling_parse_backend.py
@@ -1,6 +1,5 @@
 import logging
 import random
-import time
 from io import BytesIO
 from pathlib import Path
 from typing import Iterable, Optional, Union
@@ -17,11 +16,14 @@
 
 
 class DoclingParsePageBackend(PdfPageBackend):
-    def __init__(self, page_obj: PdfPage, docling_page_obj):
+    def __init__(
+        self, parser: pdf_parser, document_hash: str, page_no: int, page_obj: PdfPage
+    ):
         super().__init__(page_obj)
         self._ppage = page_obj
-        self._dpage = docling_page_obj
-        self.text_page = None
+
+        parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
+        self._dpage = parsed_page["pages"][0]
 
     def get_text_in_rect(self, bbox: BoundingBox) -> str:
         # Find intersecting cells on the page
@@ -168,38 +170,39 @@ def get_size(self) -> PageSize:
     def unload(self):
         self._ppage = None
         self._dpage = None
-        self.text_page = None
 
 
 class DoclingParseDocumentBackend(PdfDocumentBackend):
-    def __init__(self, path_or_stream: Union[BytesIO, Path]):
-        super().__init__(path_or_stream)
-        self._pdoc = pdfium.PdfDocument(path_or_stream)
-        # Parsing cells with docling_parser call
-        parser = pdf_parser()
+    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
+        super().__init__(path_or_stream, document_hash)
 
-        start_pb_time = time.time()
+        self._pdoc = pdfium.PdfDocument(path_or_stream)
+        self.parser = pdf_parser()
 
+        success = False
         if isinstance(path_or_stream, BytesIO):
-            self._parser_doc = parser.find_cells_from_bytesio(path_or_stream)
-        else:
-            self._parser_doc = parser.find_cells(str(path_or_stream))
+            success = self.parser.load_document_from_bytesio(
+                document_hash, path_or_stream
+            )
+        elif isinstance(path_or_stream, Path):
+            success = self.parser.load_document(document_hash, str(path_or_stream))
 
-        end_pb_time = time.time() - start_pb_time
-        _log.info(f"Time to parse with docling-parse: time={end_pb_time:.3f}")
+        if not success:
+            raise RuntimeError("docling-parse could not load this document.")
 
     def page_count(self) -> int:
-        return len(self._parser_doc["pages"])
+        return len(self._pdoc)  # To be replaced with docling-parse API
 
     def load_page(self, page_no: int) -> DoclingParsePageBackend:
         return DoclingParsePageBackend(
-            self._pdoc[page_no], self._parser_doc["pages"][page_no]
+            self.parser, self.document_hash, page_no, self._pdoc[page_no]
         )
 
     def is_valid(self) -> bool:
         return self.page_count() > 0
 
     def unload(self):
+        super().unload()
+        self.parser.unload_document(self.document_hash)
         self._pdoc.close()
         self._pdoc = None
-        self._parser_doc = None
diff --git a/docling/backend/pypdfium2_backend.py b/docling/backend/pypdfium2_backend.py
@@ -215,8 +215,8 @@ def unload(self):
 
 
 class PyPdfiumDocumentBackend(PdfDocumentBackend):
-    def __init__(self, path_or_stream: Union[BytesIO, Path]):
-        super().__init__(path_or_stream)
+    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
+        super().__init__(path_or_stream, document_hash)
         self._pdoc = pdfium.PdfDocument(path_or_stream)
 
     def page_count(self) -> int:
@@ -229,5 +229,6 @@ def is_valid(self) -> bool:
         return self.page_count() > 0
 
     def unload(self):
+        super().unload()
         self._pdoc.close()
         self._pdoc = None
diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
@@ -79,7 +79,9 @@ def __init__(
                     self.valid = False
                 else:
                     self.document_hash = create_file_hash(path_or_stream)
-                    self._backend = pdf_backend(path_or_stream=path_or_stream)
+                    self._backend = pdf_backend(
+                        path_or_stream=path_or_stream, document_hash=self.document_hash
+                    )
 
             elif isinstance(path_or_stream, BytesIO):
                 self.file = PurePath(filename)
@@ -89,7 +91,9 @@ def __init__(
                     self.valid = False
                 else:
                     self.document_hash = create_file_hash(path_or_stream)
-                    self._backend = pdf_backend(path_or_stream=path_or_stream)
+                    self._backend = pdf_backend(
+                        path_or_stream=path_or_stream, document_hash=self.document_hash
+                    )
 
             if self.document_hash and self._backend.page_count() > 0:
                 self.page_count = self._backend.page_count()

diff --git a/docling/document_converter.py b/docling/document_converter.py
@@ -141,6 +141,8 @@ def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
         start_doc_time = time.time()
         converted_doc = ConvertedDocument(input=in_doc)
 
+        _log.info(f"Processing document {in_doc.file.name}")
+
         if not in_doc.valid:
             converted_doc.status = ConversionStatus.FAILURE
             return converted_doc

diff --git a/examples/batch_convert.py b/examples/batch_convert.py
@@ -1,10 +1,15 @@
 import json
 import logging
 import time
+from io import BytesIO
 from pathlib import Path
 from typing import Iterable
 
-from docling.datamodel.base_models import ConversionStatus
+from docling.datamodel.base_models import (
+    ConversionStatus,
+    DocumentStream,
+    PipelineOptions,
+)
 from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
 from docling.document_converter import DocumentConverter
 
@@ -52,7 +57,11 @@ def main():
         Path("./test/data/redp5695.pdf"),
     ]
 
-    doc_converter = DocumentConverter()
+    # buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
+    # docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
+    # input = DocumentConversionInput.from_streams(docs)
+
+    doc_converter = DocumentConverter(pipeline_options=PipelineOptions(do_ocr=False))
 
     input = DocumentConversionInput.from_paths(input_doc_paths)
 

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -32,7 +32,7 @@ pydantic-settings = "^2.3.0"
 huggingface_hub = ">=0.23,<1"
 requests = "^2.32.3"
 easyocr = "^1.7"
-docling-parse = "^0.2.0"
+docling-parse = "^1.0.0"
 certifi = ">=2024.7.4"
 rtree = "^1.3.0"
 scipy = "^1.14.1"