Skip to content

Commit

Permalink
feat: Upgrade docling-parse PDF backend and interface to use page-by-page parsing (#44)
Browse files Browse the repository at this point in the history

* Use docling-parse page-by-page

Signed-off-by: Christoph Auer <[email protected]>

* Propagate document_hash to PDF backends, use docling-parse 1.0.0

Signed-off-by: Christoph Auer <[email protected]>

* Upgrade lockfile

Signed-off-by: Christoph Auer <[email protected]>

* repin after more packages on pypi

Signed-off-by: Michele Dolfi <[email protected]>

---------

Signed-off-by: Christoph Auer <[email protected]>
Signed-off-by: Michele Dolfi <[email protected]>
Co-authored-by: Michele Dolfi <[email protected]>
  • Loading branch information
cau-git and dolfim-ibm authored Aug 22, 2024
1 parent f7c50c8 commit a8c6b29
Show file tree
Hide file tree
Showing 8 changed files with 73 additions and 51 deletions.
10 changes: 7 additions & 3 deletions docling/backend/abstract_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,9 @@ def unload(self):

class PdfDocumentBackend(ABC):
@abstractmethod
def __init__(self, path_or_stream: Union[BytesIO, Path]):
pass
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
self.path_or_stream = path_or_stream
self.document_hash = document_hash

@abstractmethod
def load_page(self, page_no: int) -> PdfPageBackend:
Expand All @@ -56,4 +57,7 @@ def is_valid(self) -> bool:

@abstractmethod
def unload(self):
pass
if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.close()

self.path_or_stream = None
41 changes: 22 additions & 19 deletions docling/backend/docling_parse_backend.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import logging
import random
import time
from io import BytesIO
from pathlib import Path
from typing import Iterable, Optional, Union
Expand All @@ -17,11 +16,14 @@


class DoclingParsePageBackend(PdfPageBackend):
def __init__(self, page_obj: PdfPage, docling_page_obj):
def __init__(
self, parser: pdf_parser, document_hash: str, page_no: int, page_obj: PdfPage
):
super().__init__(page_obj)
self._ppage = page_obj
self._dpage = docling_page_obj
self.text_page = None

parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
self._dpage = parsed_page["pages"][0]

def get_text_in_rect(self, bbox: BoundingBox) -> str:
# Find intersecting cells on the page
Expand Down Expand Up @@ -168,38 +170,39 @@ def get_size(self) -> PageSize:
def unload(self):
self._ppage = None
self._dpage = None
self.text_page = None


class DoclingParseDocumentBackend(PdfDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path]):
super().__init__(path_or_stream)
self._pdoc = pdfium.PdfDocument(path_or_stream)
# Parsing cells with docling_parser call
parser = pdf_parser()
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
super().__init__(path_or_stream, document_hash)

start_pb_time = time.time()
self._pdoc = pdfium.PdfDocument(path_or_stream)
self.parser = pdf_parser()

success = False
if isinstance(path_or_stream, BytesIO):
self._parser_doc = parser.find_cells_from_bytesio(path_or_stream)
else:
self._parser_doc = parser.find_cells(str(path_or_stream))
success = self.parser.load_document_from_bytesio(
document_hash, path_or_stream
)
elif isinstance(path_or_stream, Path):
success = self.parser.load_document(document_hash, str(path_or_stream))

end_pb_time = time.time() - start_pb_time
_log.info(f"Time to parse with docling-parse: time={end_pb_time:.3f}")
if not success:
raise RuntimeError("docling-parse could not load this document.")

def page_count(self) -> int:
return len(self._parser_doc["pages"])
return len(self._pdoc) # To be replaced with docling-parse API

def load_page(self, page_no: int) -> DoclingParsePageBackend:
return DoclingParsePageBackend(
self._pdoc[page_no], self._parser_doc["pages"][page_no]
self.parser, self.document_hash, page_no, self._pdoc[page_no]
)

def is_valid(self) -> bool:
return self.page_count() > 0

def unload(self):
super().unload()
self.parser.unload_document(self.document_hash)
self._pdoc.close()
self._pdoc = None
self._parser_doc = None
5 changes: 3 additions & 2 deletions docling/backend/pypdfium2_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,8 +215,8 @@ def unload(self):


class PyPdfiumDocumentBackend(PdfDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path]):
super().__init__(path_or_stream)
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
super().__init__(path_or_stream, document_hash)
self._pdoc = pdfium.PdfDocument(path_or_stream)

def page_count(self) -> int:
Expand All @@ -229,5 +229,6 @@ def is_valid(self) -> bool:
return self.page_count() > 0

def unload(self):
super().unload()
self._pdoc.close()
self._pdoc = None
8 changes: 6 additions & 2 deletions docling/datamodel/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,9 @@ def __init__(
self.valid = False
else:
self.document_hash = create_file_hash(path_or_stream)
self._backend = pdf_backend(path_or_stream=path_or_stream)
self._backend = pdf_backend(
path_or_stream=path_or_stream, document_hash=self.document_hash
)

elif isinstance(path_or_stream, BytesIO):
self.file = PurePath(filename)
Expand All @@ -89,7 +91,9 @@ def __init__(
self.valid = False
else:
self.document_hash = create_file_hash(path_or_stream)
self._backend = pdf_backend(path_or_stream=path_or_stream)
self._backend = pdf_backend(
path_or_stream=path_or_stream, document_hash=self.document_hash
)

if self.document_hash and self._backend.page_count() > 0:
self.page_count = self._backend.page_count()
Expand Down
2 changes: 2 additions & 0 deletions docling/document_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,8 @@ def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
start_doc_time = time.time()
converted_doc = ConvertedDocument(input=in_doc)

_log.info(f"Processing document {in_doc.file.name}")

if not in_doc.valid:
converted_doc.status = ConversionStatus.FAILURE
return converted_doc
Expand Down
13 changes: 11 additions & 2 deletions examples/batch_convert.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
import json
import logging
import time
from io import BytesIO
from pathlib import Path
from typing import Iterable

from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.base_models import (
ConversionStatus,
DocumentStream,
PipelineOptions,
)
from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
from docling.document_converter import DocumentConverter

Expand Down Expand Up @@ -52,7 +57,11 @@ def main():
Path("./test/data/redp5695.pdf"),
]

doc_converter = DocumentConverter()
# buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
# docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
# input = DocumentConversionInput.from_streams(docs)

doc_converter = DocumentConverter(pipeline_options=PipelineOptions(do_ocr=False))

input = DocumentConversionInput.from_paths(input_doc_paths)

Expand Down
43 changes: 21 additions & 22 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ pydantic-settings = "^2.3.0"
huggingface_hub = ">=0.23,<1"
requests = "^2.32.3"
easyocr = "^1.7"
docling-parse = "^0.2.0"
docling-parse = "^1.0.0"
certifi = ">=2024.7.4"
rtree = "^1.3.0"
scipy = "^1.14.1"
Expand Down

0 comments on commit a8c6b29

Please sign in to comment.