Skip to content

Commit

Permalink
Introduce page-level error checks
Browse files Browse the repository at this point in the history
Signed-off-by: Christoph Auer <[email protected]>
  • Loading branch information
cau-git committed Aug 23, 2024
1 parent cae20ac commit 21f9775
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 5 deletions.
4 changes: 4 additions & 0 deletions docling/backend/abstract_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@ def get_page_image(
def get_size(self) -> "PageSize":
pass

@abstractmethod
def is_valid(self) -> bool:
    """Report whether this page backend is in a usable state.

    Abstract: every concrete page backend supplies its own check.

    Returns:
        bool: the backend's validity verdict for its page.
    """
    pass

@abstractmethod
def unload(self):
    """Abstract hook that concrete page backends must implement.

    NOTE(review): presumably releases whatever the backend holds for its
    page -- confirm against the concrete implementations.
    """
    pass
Expand Down
12 changes: 7 additions & 5 deletions docling/backend/docling_parse_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,18 @@ def __init__(
self, parser: pdf_parser, document_hash: str, page_no: int, page_obj: PdfPage
):
self._ppage = page_obj

parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)

self._dpage = None
self.broken_page = "pages" not in parsed_page
if not self.broken_page:
self.valid = "pages" in parsed_page
if self.valid:
self._dpage = parsed_page["pages"][0]

def is_valid(self) -> bool:
    """Return the page's validity flag.

    The flag is the ``valid`` attribute, which ``__init__`` sets to whether
    the parser output contained a ``"pages"`` entry.

    Returns:
        bool: the stored ``self.valid`` value.
    """
    return self.valid

def get_text_in_rect(self, bbox: BoundingBox) -> str:
if self.broken_page:
if not self.valid:
return ""
# Find intersecting cells on the page
text_piece = ""
Expand Down Expand Up @@ -65,7 +67,7 @@ def get_text_cells(self) -> Iterable[Cell]:
cells = []
cell_counter = 0

if self.broken_page:
if not self.valid:
return cells

page_size = self.get_size()
Expand Down
4 changes: 4 additions & 0 deletions docling/backend/pypdfium2_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ class PyPdfiumPageBackend(PdfPageBackend):
def __init__(self, pdfium_doc: pdfium.PdfDocument, page_no: int):
    """Bind this backend to one page of an already-opened pdfium document.

    Args:
        pdfium_doc: Document object that is indexed by ``page_no``.
        page_no: Index of the page to load from ``pdfium_doc``.
    """
    self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
    # No text page is held at construction time.
    # NOTE(review): presumably populated on demand elsewhere -- confirm.
    self.text_page = None
    # This backend unconditionally marks its page as valid here.
    self.valid = True

def is_valid(self) -> bool:
    """Return the page's validity flag.

    The flag is the ``valid`` attribute, which ``__init__`` sets to ``True``.

    Returns:
        bool: the stored ``self.valid`` value.
    """
    return self.valid

def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
AREA_THRESHOLD = 32 * 32
Expand Down

0 comments on commit 21f9775

Please sign in to comment.