From 418d8159bd5b4d59c8a6d65945dd7a9eb1a59faa Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Tue, 3 Dec 2024 10:40:28 +0100 Subject: [PATCH] perf: prevent temp file leftovers, reuse core type (#487) * chore: reuse DocumentStream from docling-core Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> * update docling-core version Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> * [skip ci] document import line Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> * fix: use new resolve_source_to_x functions to avoid tempfile leftovers (#490) use new resolve_source_to_x functions Signed-off-by: Michele Dolfi --------- Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Signed-off-by: Michele Dolfi Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Signed-off-by: Christoph Auer --- docling/cli/main.py | 172 ++++++++++++++++--------------- docling/datamodel/base_models.py | 11 +- docling/datamodel/document.py | 4 +- poetry.lock | 15 ++- pyproject.toml | 2 +- 5 files changed, 105 insertions(+), 99 deletions(-) diff --git a/docling/cli/main.py b/docling/cli/main.py index ac8b19aa..ec2c0777 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -2,6 +2,7 @@ import json import logging import re +import tempfile import time import warnings from enum import Enum @@ -9,7 +10,7 @@ from typing import Annotated, Dict, Iterable, List, Optional, Type import typer -from docling_core.utils.file import resolve_file_source +from docling_core.utils.file import resolve_source_to_path from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend @@ -256,95 +257,98 @@ def convert( if from_formats is None: from_formats = [e for e in InputFormat] - input_doc_paths: List[Path] = [] - for src in input_sources: - source = 
resolve_file_source(source=src) - if not source.exists(): - err_console.print( - f"[red]Error: The input file {source} does not exist.[/red]" - ) - raise typer.Abort() - elif source.is_dir(): - for fmt in from_formats: - for ext in FormatToExtensions[fmt]: - input_doc_paths.extend(list(source.glob(f"**/*.{ext}"))) - input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}"))) + with tempfile.TemporaryDirectory() as tempdir: + input_doc_paths: List[Path] = [] + for src in input_sources: + source = resolve_source_to_path(source=src, workdir=Path(tempdir)) + if not source.exists(): + err_console.print( + f"[red]Error: The input file {source} does not exist.[/red]" + ) + raise typer.Abort() + elif source.is_dir(): + for fmt in from_formats: + for ext in FormatToExtensions[fmt]: + input_doc_paths.extend(list(source.glob(f"**/*.{ext}"))) + input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}"))) + else: + input_doc_paths.append(source) + + if to_formats is None: + to_formats = [OutputFormat.MARKDOWN] + + export_json = OutputFormat.JSON in to_formats + export_md = OutputFormat.MARKDOWN in to_formats + export_txt = OutputFormat.TEXT in to_formats + export_doctags = OutputFormat.DOCTAGS in to_formats + + if ocr_engine == OcrEngine.EASYOCR: + ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr) + elif ocr_engine == OcrEngine.TESSERACT_CLI: + ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr) + elif ocr_engine == OcrEngine.TESSERACT: + ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr) + elif ocr_engine == OcrEngine.OCRMAC: + ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr) + elif ocr_engine == OcrEngine.RAPIDOCR: + ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr) else: - input_doc_paths.append(source) - - if to_formats is None: - to_formats = [OutputFormat.MARKDOWN] - - export_json = OutputFormat.JSON in to_formats - export_md = OutputFormat.MARKDOWN in to_formats - export_txt = 
OutputFormat.TEXT in to_formats - export_doctags = OutputFormat.DOCTAGS in to_formats - - if ocr_engine == OcrEngine.EASYOCR: - ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr) - elif ocr_engine == OcrEngine.TESSERACT_CLI: - ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr) - elif ocr_engine == OcrEngine.TESSERACT: - ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr) - elif ocr_engine == OcrEngine.OCRMAC: - ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr) - elif ocr_engine == OcrEngine.RAPIDOCR: - ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr) - else: - raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}") - - ocr_lang_list = _split_list(ocr_lang) - if ocr_lang_list is not None: - ocr_options.lang = ocr_lang_list - - pipeline_options = PdfPipelineOptions( - do_ocr=ocr, - ocr_options=ocr_options, - do_table_structure=True, - ) - pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching - pipeline_options.table_structure_options.mode = table_mode - - if artifacts_path is not None: - pipeline_options.artifacts_path = artifacts_path - - if pdf_backend == PdfBackend.DLPARSE_V1: - backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend - elif pdf_backend == PdfBackend.DLPARSE_V2: - backend = DoclingParseV2DocumentBackend - elif pdf_backend == PdfBackend.PYPDFIUM2: - backend = PyPdfiumDocumentBackend - else: - raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}") - - format_options: Dict[InputFormat, FormatOption] = { - InputFormat.PDF: PdfFormatOption( - pipeline_options=pipeline_options, - backend=backend, # pdf_backend + raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}") + + ocr_lang_list = _split_list(ocr_lang) + if ocr_lang_list is not None: + ocr_options.lang = ocr_lang_list + + pipeline_options = PdfPipelineOptions( + do_ocr=ocr, + ocr_options=ocr_options, + do_table_structure=True, ) - } - doc_converter = 
DocumentConverter( - allowed_formats=from_formats, - format_options=format_options, - ) + pipeline_options.table_structure_options.do_cell_matching = ( + True # do_cell_matching + ) + pipeline_options.table_structure_options.mode = table_mode - start_time = time.time() + if artifacts_path is not None: + pipeline_options.artifacts_path = artifacts_path - conv_results = doc_converter.convert_all( - input_doc_paths, raises_on_error=abort_on_error - ) + if pdf_backend == PdfBackend.DLPARSE_V1: + backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend + elif pdf_backend == PdfBackend.DLPARSE_V2: + backend = DoclingParseV2DocumentBackend + elif pdf_backend == PdfBackend.PYPDFIUM2: + backend = PyPdfiumDocumentBackend + else: + raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}") - output.mkdir(parents=True, exist_ok=True) - export_documents( - conv_results, - output_dir=output, - export_json=export_json, - export_md=export_md, - export_txt=export_txt, - export_doctags=export_doctags, - ) + format_options: Dict[InputFormat, FormatOption] = { + InputFormat.PDF: PdfFormatOption( + pipeline_options=pipeline_options, + backend=backend, # pdf_backend + ) + } + doc_converter = DocumentConverter( + allowed_formats=from_formats, + format_options=format_options, + ) + + start_time = time.time() + + conv_results = doc_converter.convert_all( + input_doc_paths, raises_on_error=abort_on_error + ) + + output.mkdir(parents=True, exist_ok=True) + export_documents( + conv_results, + output_dir=output, + export_json=export_json, + export_md=export_md, + export_txt=export_txt, + export_doctags=export_doctags, + ) - end_time = time.time() - start_time + end_time = time.time() - start_time _log.info(f"All documents were converted in {end_time:.2f} seconds.") diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index 311d6d01..8e584f4b 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -1,5 +1,4 @@ from enum 
import Enum, auto -from io import BytesIO from typing import TYPE_CHECKING, Dict, List, Optional, Union from docling_core.types.doc import ( @@ -9,6 +8,9 @@ Size, TableCell, ) +from docling_core.types.io import ( # DO NOT REMOVE; explicitly exposed from this location + DocumentStream, +) from PIL.Image import Image from pydantic import BaseModel, ConfigDict @@ -207,10 +209,3 @@ def get_image(self, scale: float = 1.0) -> Optional[Image]: @property def image(self) -> Optional[Image]: return self.get_image(scale=self._default_image_scale) - - -class DocumentStream(BaseModel): - model_config = ConfigDict(arbitrary_types_allowed=True) - - name: str - stream: BytesIO diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index be4e9a12..2fadb7f9 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -32,7 +32,7 @@ ) from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument -from docling_core.utils.file import resolve_file_source +from docling_core.utils.file import resolve_source_to_stream from pydantic import BaseModel from typing_extensions import deprecated @@ -459,7 +459,7 @@ def docs( self, format_options: Dict[InputFormat, "FormatOption"] ) -> Iterable[InputDocument]: for item in self.path_or_stream_iterator: - obj = resolve_file_source(item) if isinstance(item, str) else item + obj = resolve_source_to_stream(item) if isinstance(item, str) else item format = self._guess_format(obj) if format not in format_options.keys(): _log.info( diff --git a/poetry.lock b/poetry.lock index 9e057ed0..92072569 100644 --- a/poetry.lock +++ b/poetry.lock @@ -896,13 +896,13 @@ files = [ [[package]] name = "docling-core" -version = "2.5.1" +version = "2.6.1" description = "A python library to define and validate data types in Docling." 
optional = false python-versions = "<4.0,>=3.9" files = [ - {file = "docling_core-2.5.1-py3-none-any.whl", hash = "sha256:e4a5626520714c25a5ec2f9f7495407e730485257c2272e8467faae7357435bf"}, - {file = "docling_core-2.5.1.tar.gz", hash = "sha256:e9a7c7c46f869b13747436a2ce42df3632af655e1a3af574dfcd114e71dcbb75"}, + {file = "docling_core-2.6.1-py3-none-any.whl", hash = "sha256:8e7a5bc0ce13289567738481949fed3ab580f2d8cea7525b246159233d81b26b"}, + {file = "docling_core-2.6.1.tar.gz", hash = "sha256:c8af45e0873611120cc24757d567d37e053a54e2ce060b7b5b44efd0d73f75e5"}, ] [package.dependencies] @@ -913,6 +913,7 @@ pillow = ">=10.3.0,<11.0.0" pydantic = ">=2.6.0,<2.10" pyyaml = ">=5.1,<7.0.0" tabulate = ">=0.9.0,<0.10.0" +typing-extensions = ">=4.12.2,<5.0.0" [[package]] name = "docling-ibm-models" @@ -3200,6 +3201,7 @@ files = [ {file = "nh3-0.2.19-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:00810cd5275f5c3f44b9eb0e521d1a841ee2f8023622de39ffc7d88bd533d8e0"}, {file = "nh3-0.2.19-cp38-abi3-win32.whl", hash = "sha256:7e98621856b0a911c21faa5eef8f8ea3e691526c2433f9afc2be713cb6fbdb48"}, {file = "nh3-0.2.19-cp38-abi3-win_amd64.whl", hash = "sha256:75c7cafb840f24430b009f7368945cb5ca88b2b54bb384ebfba495f16bc9c121"}, + {file = "nh3-0.2.19.tar.gz", hash = "sha256:790056b54c068ff8dceb443eaefb696b84beff58cca6c07afd754d17692a4804"}, ] [[package]] @@ -6028,6 +6030,11 @@ files = [ {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"}, {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"}, {file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"}, + {file = "scikit_learn-1.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = 
"sha256:e9a702e2de732bbb20d3bad29ebd77fc05a6b427dc49964300340e4c9328b3f5"}, + {file = "scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:b0768ad641981f5d3a198430a1d31c3e044ed2e8a6f22166b4d546a5116d7908"}, + {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178ddd0a5cb0044464fc1bfc4cca5b1833bfc7bb022d70b05db8530da4bb3dd3"}, + {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7284ade780084d94505632241bf78c44ab3b6f1e8ccab3d2af58e0e950f9c12"}, + {file = "scikit_learn-1.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:b7b0f9a0b1040830d38c39b91b3a44e1b643f4b36e36567b80b7c6bd2202a27f"}, {file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"}, {file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"}, {file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"}, @@ -7646,4 +7653,4 @@ tesserocr = ["tesserocr"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "2e7c27ffe32d556a66cc1008a7147a90c17f63b01d2a6cde3e7b941ba7e268d7" +content-hash = "ee3b3d938295f0057567c10fb808a0d95ed2fe9a32f459d489b4b29aacf710c8" diff --git a/pyproject.toml b/pyproject.toml index 4d4ceac9..ed030471 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ packages = [{include = "docling"}] ###################### python = "^3.9" pydantic = ">=2.0.0,<2.10" -docling-core = "^2.5.1" +docling-core = "^2.6.1" docling-ibm-models = "^2.0.6" deepsearch-glm = "^0.26.1" filetype = "^1.2.0"