From dc473efed030f6817de9ecf0a095ebfa749f1485 Mon Sep 17 00:00:00 2001
From: Michele Dolfi
Date: Wed, 23 Oct 2024 14:34:54 +0200
Subject: [PATCH 1/2] fix: set valid=false for invalid backends

Signed-off-by: Michele Dolfi
---
NOTE(review): the line structure of this patch was restored from a copy whose
newlines and indentation had been collapsed. The leading whitespace of the hunk
lines below is reconstructed -- confirm it against
docling/datamodel/document.py before applying.

 docling/datamodel/document.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
index e688af20..d21f6f97 100644
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -143,11 +143,13 @@ def __init__(
                         self.valid = False
 
         except (FileNotFoundError, OSError) as e:
+            self.valid = False
             _log.exception(
                 f"File {self.file.name} not found or cannot be opened.", exc_info=e
             )
             # raise
         except RuntimeError as e:
+            self.valid = False
             _log.exception(
                 f"An unexpected error occurred while opening the document {self.file.name}",
                 exc_info=e,
@@ -166,6 +168,8 @@ def _init_doc(
             )
 
         self._backend = backend(self, path_or_stream=path_or_stream)
+        if not self._backend.is_valid():
+            self.valid = False
 
 
 class DocumentFormat(str, Enum):

From 51c477cfc391cb9b43f48be5ce231824b45a1289 Mon Sep 17 00:00:00 2001
From: Christoph Auer
Date: Wed, 23 Oct 2024 14:46:05 +0200
Subject: [PATCH 2/2] Add test case for InputDocument

Signed-off-by: Christoph Auer
---
NOTE(review): the test module below was revised after this patch was generated
(file handle closed via read_bytes(), truthiness asserts, docstrings), so the
blob id on the "index" line is stale. The line count is unchanged at 58.

 tests/test_input_doc.py | 58 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)
 create mode 100644 tests/test_input_doc.py

diff --git a/tests/test_input_doc.py b/tests/test_input_doc.py
new file mode 100644
index 00000000..3f7dd0c1
--- /dev/null
+++ b/tests/test_input_doc.py
@@ -0,0 +1,58 @@
+from io import BytesIO
+from pathlib import Path
+
+from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+from docling.datamodel.base_models import DocumentStream, InputFormat
+from docling.datamodel.document import InputDocument
+
+
+def test_in_doc_from_valid_path():
+    """An existing PDF passed as a path must produce a valid document."""
+    test_doc_path = Path("./tests/data/2206.01062.pdf")
+    doc = _make_input_doc(test_doc_path)
+    assert doc.valid
+
+
+def test_in_doc_from_invalid_path():
+    """A path that cannot be opened must produce an invalid document."""
+    test_doc_path = Path("./tests/does/not/exist.pdf")
+    doc = _make_input_doc(test_doc_path)
+
+    assert not doc.valid
+
+
+def test_in_doc_from_valid_buf():
+    """An in-memory copy of a real PDF must produce a valid document."""
+    # read_bytes() opens and closes the file itself, so no handle is left open.
+    buf = BytesIO(Path("./tests/data/2206.01062.pdf").read_bytes())
+    stream = DocumentStream(name="my_doc.pdf", stream=buf)
+    doc = _make_input_doc_from_stream(stream)
+    assert doc.valid
+
+
+def test_in_doc_from_invalid_buf():
+    """An empty byte buffer must produce an invalid document."""
+    buf = BytesIO(b"")
+    stream = DocumentStream(name="my_doc.pdf", stream=buf)
+
+    doc = _make_input_doc_from_stream(stream)
+    assert not doc.valid
+
+
+def _make_input_doc(path):
+    """Build a PDF InputDocument from a path with the pypdfium2 backend."""
+    return InputDocument(
+        path_or_stream=path,
+        format=InputFormat.PDF,
+        backend=PyPdfiumDocumentBackend,
+    )
+
+
+def _make_input_doc_from_stream(doc_stream):
+    """Build a PDF InputDocument from a DocumentStream's buffer and name."""
+    return InputDocument(
+        path_or_stream=doc_stream.stream,
+        format=InputFormat.PDF,
+        filename=doc_stream.name,
+        backend=PyPdfiumDocumentBackend,
+    )