Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Page-level error reporting from PDF backend #47

Merged
merged 6 commits into from
Aug 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions docling/backend/abstract_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@


class PdfPageBackend(ABC):
def __init__(self, page_obj: Any) -> object:
pass

@abstractmethod
def get_text_in_rect(self, bbox: "BoundingBox") -> str:
Expand All @@ -32,6 +30,10 @@ def get_page_image(
def get_size(self) -> "PageSize":
pass

@abstractmethod
def is_valid(self) -> bool:
    """Report whether this page backend holds a usable page.

    Abstract: every concrete page backend must provide its own
    implementation.

    Returns:
        True when the page can be used, False otherwise.
    """
    pass

@abstractmethod
def unload(self):
    """Unload this page backend.

    Abstract: every concrete page backend must provide its own
    implementation.
    """
    pass
Expand Down
21 changes: 12 additions & 9 deletions docling/backend/docling_parse_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,22 +19,23 @@ class DoclingParsePageBackend(PdfPageBackend):
def __init__(
self, parser: pdf_parser, document_hash: str, page_no: int, page_obj: PdfPage
):
super().__init__(page_obj)
self._ppage = page_obj

parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)

self._dpage = None
self.broken_page = "pages" not in parsed_page
if not self.broken_page:
self.valid = "pages" in parsed_page
if self.valid:
self._dpage = parsed_page["pages"][0]
else:
raise RuntimeError(
f"Page {page_no} of document {document_hash} could not be parsed."
_log.info(
f"An error occured when loading page {page_no} of document {document_hash}."
)

def is_valid(self) -> bool:
    """Return the ``valid`` flag recorded for this page.

    Returns:
        The value of ``self.valid``.
    """
    return self.valid

def get_text_in_rect(self, bbox: BoundingBox) -> str:
if self.broken_page:
if not self.valid:
return ""
# Find intersecting cells on the page
text_piece = ""
Expand Down Expand Up @@ -70,7 +71,7 @@ def get_text_cells(self) -> Iterable[Cell]:
cells = []
cell_counter = 0

if self.broken_page:
if not self.valid:
return cells

page_size = self.get_size()
Expand Down Expand Up @@ -201,7 +202,9 @@ def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
success = self.parser.load_document(document_hash, str(path_or_stream))

if not success:
raise RuntimeError("docling-parse could not load this document.")
raise RuntimeError(
f"docling-parse could not load document {document_hash}."
)

def page_count(self) -> int:
    """Return the number of pages in the document.

    Returns:
        The length of ``self._pdoc``.
    """
    return len(self._pdoc)  # To be replaced with docling-parse API
Expand Down
31 changes: 26 additions & 5 deletions docling/backend/pypdfium2_backend.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import logging
import random
from io import BytesIO
from pathlib import Path
Expand All @@ -7,17 +8,32 @@
import pypdfium2.raw as pdfium_c
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage
from pypdfium2._helpers.misc import PdfiumError

from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize

_log = logging.getLogger(__name__)


class PyPdfiumPageBackend(PdfPageBackend):
def __init__(self, page_obj: PdfPage):
super().__init__(page_obj)
self._ppage = page_obj
def __init__(
    self, pdfium_doc: pdfium.PdfDocument, document_hash: str, page_no: int
):
    """Load page ``page_no`` from ``pdfium_doc`` and record whether it worked.

    A ``PdfiumError`` raised while loading the page is not propagated; it is
    logged and the backend is marked invalid instead.

    Args:
        pdfium_doc: Open pypdfium document that is indexed with ``page_no``.
        document_hash: Identifier of the document; used only in the log
            message.
        page_no: Index of the page to load from ``pdfium_doc``.
    """
    # Initialise all state up front so every attribute exists whether or
    # not the page loads.
    self.valid = True  # No better way to tell from pypdfium.
    self.text_page = None
    self._ppage = None  # pdfium.PdfPage once loaded; stays None on failure
    try:
        self._ppage = pdfium_doc[page_no]
    except PdfiumError:
        _log.info(
            f"An exception occurred when loading page {page_no} of document {document_hash}.",
            exc_info=True,
        )
        self.valid = False

def is_valid(self) -> bool:
    """Return the ``valid`` flag recorded for this page.

    Returns:
        The value of ``self.valid``.
    """
    return self.valid

def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
AREA_THRESHOLD = 32 * 32
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
Expand Down Expand Up @@ -217,13 +233,18 @@ def unload(self):
class PyPdfiumDocumentBackend(PdfDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
super().__init__(path_or_stream, document_hash)
self._pdoc = pdfium.PdfDocument(path_or_stream)
try:
self._pdoc = pdfium.PdfDocument(path_or_stream)
except PdfiumError as e:
raise RuntimeError(
f"pypdfium could not load document {document_hash}"
) from e

def page_count(self) -> int:
    """Return the number of pages in the document.

    Returns:
        The length of ``self._pdoc``.
    """
    return len(self._pdoc)

def load_page(self, page_no: int) -> PyPdfiumPageBackend:
return PyPdfiumPageBackend(self._pdoc[page_no])
return PyPdfiumPageBackend(self._pdoc, self.document_hash, page_no)

def is_valid(self) -> bool:
    """Report whether the document has at least one page.

    Returns:
        True when ``self.page_count()`` is greater than zero.
    """
    return self.page_count() > 0
Expand Down
14 changes: 13 additions & 1 deletion docling/datamodel/base_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ class ConversionStatus(str, Enum):
STARTED = auto()
FAILURE = auto()
SUCCESS = auto()
SUCCESS_WITH_ERRORS = auto()
PARTIAL_SUCCESS = auto()


class DocInputType(str, Enum):
Expand All @@ -29,6 +29,18 @@ class CoordOrigin(str, Enum):
BOTTOMLEFT = auto()


class DoclingComponentType(str, Enum):
    """Kind of docling component that an error is attributed to."""

    PDF_BACKEND = auto()
    MODEL = auto()
    DOC_ASSEMBLER = auto()


class ErrorItem(BaseModel):
    """One error recorded for a document conversion."""

    # Kind of component the error is attributed to.
    component_type: DoclingComponentType
    # Name of the module or class that the error came from.
    module_name: str
    # Human-readable description of the error.
    error_message: str


class PageSize(BaseModel):
width: float = 0.0
height: float = 0.0
Expand Down
3 changes: 2 additions & 1 deletion docling/datamodel/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
AssembledUnit,
ConversionStatus,
DocumentStream,
ErrorItem,
FigureElement,
Page,
PageElement,
Expand Down Expand Up @@ -118,7 +119,7 @@ class ConvertedDocument(BaseModel):
input: InputDocument

status: ConversionStatus = ConversionStatus.PENDING # failure, success
errors: List[Dict] = [] # structure to keep errors
errors: List[ErrorItem] = [] # structure to keep errors

pages: List[Page] = []
assembled: Optional[AssembledUnit] = None
Expand Down
26 changes: 22 additions & 4 deletions docling/document_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
AssembledUnit,
AssembleOptions,
ConversionStatus,
DoclingComponentType,
ErrorItem,
Page,
PipelineOptions,
)
Expand Down Expand Up @@ -157,7 +159,6 @@ def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
for page_batch in chunkify(
converted_doc.pages, settings.perf.page_batch_size
):

start_pb_time = time.time()
# Pipeline

Expand Down Expand Up @@ -205,12 +206,27 @@ def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
converted_doc.pages = all_assembled_pages
self.assemble_doc(converted_doc)

converted_doc.status = ConversionStatus.SUCCESS
status = ConversionStatus.SUCCESS
for page in converted_doc.pages:
if not page._backend.is_valid():
converted_doc.errors.append(
ErrorItem(
component_type=DoclingComponentType.PDF_BACKEND,
module_name=type(page._backend).__name__,
error_message=f"Page {page.page_no} failed to parse.",
)
)
status = ConversionStatus.PARTIAL_SUCCESS

converted_doc.status = status

except Exception as e:
converted_doc.status = ConversionStatus.FAILURE
trace = "\n".join(traceback.format_exception(e))
_log.info(f"Encountered an error during conversion: {trace}")
_log.info(
f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
f"{trace}"
)

end_doc_time = time.time() - start_doc_time
_log.info(
Expand All @@ -230,7 +246,9 @@ def initialize_page(self, doc: InputDocument, page: Page) -> Page:
# Generate the page image and store it in the page object
def populate_page_images(self, doc: InputDocument, page: Page) -> Page:
# default scale
page.get_image(scale=1.0)
page.get_image(
scale=1.0
) # puts the page image on the image cache at default scale

# user requested scales
if self.assemble_options.images_scale is not None:
Expand Down
21 changes: 13 additions & 8 deletions examples/batch_convert.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,10 @@
import json
import logging
import time
from io import BytesIO
from pathlib import Path
from typing import Iterable

from docling.datamodel.base_models import (
ConversionStatus,
DocumentStream,
PipelineOptions,
)
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
from docling.document_converter import DocumentConverter

Expand All @@ -24,6 +19,7 @@ def export_documents(

success_count = 0
failure_count = 0
partial_success_count = 0

for doc in converted_docs:
if doc.status == ConversionStatus.SUCCESS:
Expand All @@ -37,12 +33,21 @@ def export_documents(
# Export Markdown format:
with (output_dir / f"{doc_filename}.md").open("w") as fp:
fp.write(doc.render_as_markdown())
elif doc.status == ConversionStatus.PARTIAL_SUCCESS:
_log.info(
f"Document {doc.input.file} was partially converted with the following errors:"
)
for item in doc.errors:
_log.info(f"\t{item.error_message}")
partial_success_count += 1
else:
_log.info(f"Document {doc.input.file} failed to convert.")
failure_count += 1

_log.info(
f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
f"Processed {success_count + partial_success_count + failure_count} docs, "
f"of which {failure_count} failed "
f"and {partial_success_count} were partially converted."
)


Expand All @@ -61,7 +66,7 @@ def main():
# docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
# input = DocumentConversionInput.from_streams(docs)

doc_converter = DocumentConverter(pipeline_options=PipelineOptions(do_ocr=False))
doc_converter = DocumentConverter()

input = DocumentConversionInput.from_paths(input_doc_paths)

Expand Down
Loading