perf: prevent temp file leftovers, reuse core type (#487)

* chore: reuse DocumentStream from docling-core
  Signed-off-by: Panos Vagenas <[email protected]>
* update docling-core version
  Signed-off-by: Panos Vagenas <[email protected]>
* [skip ci] document import line
  Signed-off-by: Panos Vagenas <[email protected]>
* fix: use new resolve_source_to_x functions to avoid tempfile leftovers (#490)
  use new resolve_source_to_x functions
  Signed-off-by: Michele Dolfi <[email protected]>
---------
Signed-off-by: Panos Vagenas <[email protected]>
Signed-off-by: Michele Dolfi <[email protected]>
Co-authored-by: Michele Dolfi <[email protected]>
Signed-off-by: Christoph Auer <[email protected]>
DS4SD · Dec 3, 2024 · 418d815 · 418d815
1 parent 32e9b4a
commit 418d815
Show file tree

Hide file tree

Showing 5 changed files with 105 additions and 99 deletions.
diff --git a/docling/cli/main.py b/docling/cli/main.py
@@ -2,14 +2,15 @@
 import json
 import logging
 import re
+import tempfile
 import time
 import warnings
 from enum import Enum
 from pathlib import Path
 from typing import Annotated, Dict, Iterable, List, Optional, Type
 
 import typer
-from docling_core.utils.file import resolve_file_source
+from docling_core.utils.file import resolve_source_to_path
 
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
@@ -256,95 +257,98 @@ def convert(
     if from_formats is None:
         from_formats = [e for e in InputFormat]
 
-    input_doc_paths: List[Path] = []
-    for src in input_sources:
-        source = resolve_file_source(source=src)
-        if not source.exists():
-            err_console.print(
-                f"[red]Error: The input file {source} does not exist.[/red]"
-            )
-            raise typer.Abort()
-        elif source.is_dir():
-            for fmt in from_formats:
-                for ext in FormatToExtensions[fmt]:
-                    input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
-                    input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
+    with tempfile.TemporaryDirectory() as tempdir:
+        input_doc_paths: List[Path] = []
+        for src in input_sources:
+            source = resolve_source_to_path(source=src, workdir=Path(tempdir))
+            if not source.exists():
+                err_console.print(
+                    f"[red]Error: The input file {source} does not exist.[/red]"
+                )
+                raise typer.Abort()
+            elif source.is_dir():
+                for fmt in from_formats:
+                    for ext in FormatToExtensions[fmt]:
+                        input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
+                        input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
+            else:
+                input_doc_paths.append(source)
+
+        if to_formats is None:
+            to_formats = [OutputFormat.MARKDOWN]
+
+        export_json = OutputFormat.JSON in to_formats
+        export_md = OutputFormat.MARKDOWN in to_formats
+        export_txt = OutputFormat.TEXT in to_formats
+        export_doctags = OutputFormat.DOCTAGS in to_formats
+
+        if ocr_engine == OcrEngine.EASYOCR:
+            ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
+        elif ocr_engine == OcrEngine.TESSERACT_CLI:
+            ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
+        elif ocr_engine == OcrEngine.TESSERACT:
+            ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
+        elif ocr_engine == OcrEngine.OCRMAC:
+            ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
+        elif ocr_engine == OcrEngine.RAPIDOCR:
+            ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
         else:
-            input_doc_paths.append(source)
-
-    if to_formats is None:
-        to_formats = [OutputFormat.MARKDOWN]
-
-    export_json = OutputFormat.JSON in to_formats
-    export_md = OutputFormat.MARKDOWN in to_formats
-    export_txt = OutputFormat.TEXT in to_formats
-    export_doctags = OutputFormat.DOCTAGS in to_formats
-
-    if ocr_engine == OcrEngine.EASYOCR:
-        ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
-    elif ocr_engine == OcrEngine.TESSERACT_CLI:
-        ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
-    elif ocr_engine == OcrEngine.TESSERACT:
-        ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
-    elif ocr_engine == OcrEngine.OCRMAC:
-        ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
-    elif ocr_engine == OcrEngine.RAPIDOCR:
-        ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
-    else:
-        raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
-
-    ocr_lang_list = _split_list(ocr_lang)
-    if ocr_lang_list is not None:
-        ocr_options.lang = ocr_lang_list
-
-    pipeline_options = PdfPipelineOptions(
-        do_ocr=ocr,
-        ocr_options=ocr_options,
-        do_table_structure=True,
-    )
-    pipeline_options.table_structure_options.do_cell_matching = True  # do_cell_matching
-    pipeline_options.table_structure_options.mode = table_mode
-
-    if artifacts_path is not None:
-        pipeline_options.artifacts_path = artifacts_path
-
-    if pdf_backend == PdfBackend.DLPARSE_V1:
-        backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
-    elif pdf_backend == PdfBackend.DLPARSE_V2:
-        backend = DoclingParseV2DocumentBackend
-    elif pdf_backend == PdfBackend.PYPDFIUM2:
-        backend = PyPdfiumDocumentBackend
-    else:
-        raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
-
-    format_options: Dict[InputFormat, FormatOption] = {
-        InputFormat.PDF: PdfFormatOption(
-            pipeline_options=pipeline_options,
-            backend=backend,  # pdf_backend
+            raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
+
+        ocr_lang_list = _split_list(ocr_lang)
+        if ocr_lang_list is not None:
+            ocr_options.lang = ocr_lang_list
+
+        pipeline_options = PdfPipelineOptions(
+            do_ocr=ocr,
+            ocr_options=ocr_options,
+            do_table_structure=True,
         )
-    }
-    doc_converter = DocumentConverter(
-        allowed_formats=from_formats,
-        format_options=format_options,
-    )
+        pipeline_options.table_structure_options.do_cell_matching = (
+            True  # do_cell_matching
+        )
+        pipeline_options.table_structure_options.mode = table_mode
 
-    start_time = time.time()
+        if artifacts_path is not None:
+            pipeline_options.artifacts_path = artifacts_path
 
-    conv_results = doc_converter.convert_all(
-        input_doc_paths, raises_on_error=abort_on_error
-    )
+        if pdf_backend == PdfBackend.DLPARSE_V1:
+            backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
+        elif pdf_backend == PdfBackend.DLPARSE_V2:
+            backend = DoclingParseV2DocumentBackend
+        elif pdf_backend == PdfBackend.PYPDFIUM2:
+            backend = PyPdfiumDocumentBackend
+        else:
+            raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
 
-    output.mkdir(parents=True, exist_ok=True)
-    export_documents(
-        conv_results,
-        output_dir=output,
-        export_json=export_json,
-        export_md=export_md,
-        export_txt=export_txt,
-        export_doctags=export_doctags,
-    )
+        format_options: Dict[InputFormat, FormatOption] = {
+            InputFormat.PDF: PdfFormatOption(
+                pipeline_options=pipeline_options,
+                backend=backend,  # pdf_backend
+            )
+        }
+        doc_converter = DocumentConverter(
+            allowed_formats=from_formats,
+            format_options=format_options,
+        )
+
+        start_time = time.time()
+
+        conv_results = doc_converter.convert_all(
+            input_doc_paths, raises_on_error=abort_on_error
+        )
+
+        output.mkdir(parents=True, exist_ok=True)
+        export_documents(
+            conv_results,
+            output_dir=output,
+            export_json=export_json,
+            export_md=export_md,
+            export_txt=export_txt,
+            export_doctags=export_doctags,
+        )
 
-    end_time = time.time() - start_time
+        end_time = time.time() - start_time
 
     _log.info(f"All documents were converted in {end_time:.2f} seconds.")
 

diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py
@@ -1,5 +1,4 @@
 from enum import Enum, auto
-from io import BytesIO
 from typing import TYPE_CHECKING, Dict, List, Optional, Union
 
 from docling_core.types.doc import (
@@ -9,6 +8,9 @@
     Size,
     TableCell,
 )
+from docling_core.types.io import (  # DO NOT REMOVE; explicitly exposed from this location
+    DocumentStream,
+)
 from PIL.Image import Image
 from pydantic import BaseModel, ConfigDict
 
@@ -207,10 +209,3 @@ def get_image(self, scale: float = 1.0) -> Optional[Image]:
     @property
     def image(self) -> Optional[Image]:
         return self.get_image(scale=self._default_image_scale)
-
-
-class DocumentStream(BaseModel):
-    model_config = ConfigDict(arbitrary_types_allowed=True)
-
-    name: str
-    stream: BytesIO
diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
@@ -32,7 +32,7 @@
 )
 from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
 from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
-from docling_core.utils.file import resolve_file_source
+from docling_core.utils.file import resolve_source_to_stream
 from pydantic import BaseModel
 from typing_extensions import deprecated
 
@@ -459,7 +459,7 @@ def docs(
         self, format_options: Dict[InputFormat, "FormatOption"]
     ) -> Iterable[InputDocument]:
         for item in self.path_or_stream_iterator:
-            obj = resolve_file_source(item) if isinstance(item, str) else item
+            obj = resolve_source_to_stream(item) if isinstance(item, str) else item
             format = self._guess_format(obj)
             if format not in format_options.keys():
                 _log.info(

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -26,7 +26,7 @@ packages = [{include = "docling"}]
 ######################
 python = "^3.9"
 pydantic = ">=2.0.0,<2.10"
-docling-core = "^2.5.1"
+docling-core = "^2.6.1"
 docling-ibm-models = "^2.0.6"
 deepsearch-glm = "^0.26.1"
 filetype = "^1.2.0"