Skip to content

Commit

Permalink
perf: prevent temp file leftovers, reuse core type (#487)
Browse files Browse the repository at this point in the history
* chore: reuse DocumentStream from docling-core

Signed-off-by: Panos Vagenas <[email protected]>

* update docling-core version

Signed-off-by: Panos Vagenas <[email protected]>

* [skip ci] document  import line

Signed-off-by: Panos Vagenas <[email protected]>

* fix: use new resolve_source_to_x functions to avoid tempfile leftovers (#490)

use new resolve_source_to_x functions

Signed-off-by: Michele Dolfi <[email protected]>

---------

Signed-off-by: Panos Vagenas <[email protected]>
Signed-off-by: Michele Dolfi <[email protected]>
Co-authored-by: Michele Dolfi <[email protected]>
Signed-off-by: Christoph Auer <[email protected]>
  • Loading branch information
vagenas and dolfim-ibm authored Dec 3, 2024
1 parent 32e9b4a commit 418d815
Show file tree
Hide file tree
Showing 5 changed files with 105 additions and 99 deletions.
172 changes: 88 additions & 84 deletions docling/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,15 @@
import json
import logging
import re
import tempfile
import time
import warnings
from enum import Enum
from pathlib import Path
from typing import Annotated, Dict, Iterable, List, Optional, Type

import typer
from docling_core.utils.file import resolve_file_source
from docling_core.utils.file import resolve_source_to_path

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
Expand Down Expand Up @@ -256,95 +257,98 @@ def convert(
if from_formats is None:
from_formats = [e for e in InputFormat]

input_doc_paths: List[Path] = []
for src in input_sources:
source = resolve_file_source(source=src)
if not source.exists():
err_console.print(
f"[red]Error: The input file {source} does not exist.[/red]"
)
raise typer.Abort()
elif source.is_dir():
for fmt in from_formats:
for ext in FormatToExtensions[fmt]:
input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
with tempfile.TemporaryDirectory() as tempdir:
input_doc_paths: List[Path] = []
for src in input_sources:
source = resolve_source_to_path(source=src, workdir=Path(tempdir))
if not source.exists():
err_console.print(
f"[red]Error: The input file {source} does not exist.[/red]"
)
raise typer.Abort()
elif source.is_dir():
for fmt in from_formats:
for ext in FormatToExtensions[fmt]:
input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
else:
input_doc_paths.append(source)

if to_formats is None:
to_formats = [OutputFormat.MARKDOWN]

export_json = OutputFormat.JSON in to_formats
export_md = OutputFormat.MARKDOWN in to_formats
export_txt = OutputFormat.TEXT in to_formats
export_doctags = OutputFormat.DOCTAGS in to_formats

if ocr_engine == OcrEngine.EASYOCR:
ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
elif ocr_engine == OcrEngine.TESSERACT_CLI:
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
elif ocr_engine == OcrEngine.TESSERACT:
ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
elif ocr_engine == OcrEngine.OCRMAC:
ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
elif ocr_engine == OcrEngine.RAPIDOCR:
ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
else:
input_doc_paths.append(source)

if to_formats is None:
to_formats = [OutputFormat.MARKDOWN]

export_json = OutputFormat.JSON in to_formats
export_md = OutputFormat.MARKDOWN in to_formats
export_txt = OutputFormat.TEXT in to_formats
export_doctags = OutputFormat.DOCTAGS in to_formats

if ocr_engine == OcrEngine.EASYOCR:
ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
elif ocr_engine == OcrEngine.TESSERACT_CLI:
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
elif ocr_engine == OcrEngine.TESSERACT:
ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
elif ocr_engine == OcrEngine.OCRMAC:
ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
elif ocr_engine == OcrEngine.RAPIDOCR:
ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
else:
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")

ocr_lang_list = _split_list(ocr_lang)
if ocr_lang_list is not None:
ocr_options.lang = ocr_lang_list

pipeline_options = PdfPipelineOptions(
do_ocr=ocr,
ocr_options=ocr_options,
do_table_structure=True,
)
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
pipeline_options.table_structure_options.mode = table_mode

if artifacts_path is not None:
pipeline_options.artifacts_path = artifacts_path

if pdf_backend == PdfBackend.DLPARSE_V1:
backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
elif pdf_backend == PdfBackend.DLPARSE_V2:
backend = DoclingParseV2DocumentBackend
elif pdf_backend == PdfBackend.PYPDFIUM2:
backend = PyPdfiumDocumentBackend
else:
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")

format_options: Dict[InputFormat, FormatOption] = {
InputFormat.PDF: PdfFormatOption(
pipeline_options=pipeline_options,
backend=backend, # pdf_backend
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")

ocr_lang_list = _split_list(ocr_lang)
if ocr_lang_list is not None:
ocr_options.lang = ocr_lang_list

pipeline_options = PdfPipelineOptions(
do_ocr=ocr,
ocr_options=ocr_options,
do_table_structure=True,
)
}
doc_converter = DocumentConverter(
allowed_formats=from_formats,
format_options=format_options,
)
pipeline_options.table_structure_options.do_cell_matching = (
True # do_cell_matching
)
pipeline_options.table_structure_options.mode = table_mode

start_time = time.time()
if artifacts_path is not None:
pipeline_options.artifacts_path = artifacts_path

conv_results = doc_converter.convert_all(
input_doc_paths, raises_on_error=abort_on_error
)
if pdf_backend == PdfBackend.DLPARSE_V1:
backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
elif pdf_backend == PdfBackend.DLPARSE_V2:
backend = DoclingParseV2DocumentBackend
elif pdf_backend == PdfBackend.PYPDFIUM2:
backend = PyPdfiumDocumentBackend
else:
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")

output.mkdir(parents=True, exist_ok=True)
export_documents(
conv_results,
output_dir=output,
export_json=export_json,
export_md=export_md,
export_txt=export_txt,
export_doctags=export_doctags,
)
format_options: Dict[InputFormat, FormatOption] = {
InputFormat.PDF: PdfFormatOption(
pipeline_options=pipeline_options,
backend=backend, # pdf_backend
)
}
doc_converter = DocumentConverter(
allowed_formats=from_formats,
format_options=format_options,
)

start_time = time.time()

conv_results = doc_converter.convert_all(
input_doc_paths, raises_on_error=abort_on_error
)

output.mkdir(parents=True, exist_ok=True)
export_documents(
conv_results,
output_dir=output,
export_json=export_json,
export_md=export_md,
export_txt=export_txt,
export_doctags=export_doctags,
)

end_time = time.time() - start_time
end_time = time.time() - start_time

_log.info(f"All documents were converted in {end_time:.2f} seconds.")

Expand Down
11 changes: 3 additions & 8 deletions docling/datamodel/base_models.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from enum import Enum, auto
from io import BytesIO
from typing import TYPE_CHECKING, Dict, List, Optional, Union

from docling_core.types.doc import (
Expand All @@ -9,6 +8,9 @@
Size,
TableCell,
)
from docling_core.types.io import ( # DO ΝΟΤ REMOVE; explicitly exposed from this location
DocumentStream,
)
from PIL.Image import Image
from pydantic import BaseModel, ConfigDict

Expand Down Expand Up @@ -207,10 +209,3 @@ def get_image(self, scale: float = 1.0) -> Optional[Image]:
@property
def image(self) -> Optional[Image]:
return self.get_image(scale=self._default_image_scale)


class DocumentStream(BaseModel):
model_config = ConfigDict(arbitrary_types_allowed=True)

name: str
stream: BytesIO
4 changes: 2 additions & 2 deletions docling/datamodel/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
)
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
from docling_core.utils.file import resolve_file_source
from docling_core.utils.file import resolve_source_to_stream
from pydantic import BaseModel
from typing_extensions import deprecated

Expand Down Expand Up @@ -459,7 +459,7 @@ def docs(
self, format_options: Dict[InputFormat, "FormatOption"]
) -> Iterable[InputDocument]:
for item in self.path_or_stream_iterator:
obj = resolve_file_source(item) if isinstance(item, str) else item
obj = resolve_source_to_stream(item) if isinstance(item, str) else item
format = self._guess_format(obj)
if format not in format_options.keys():
_log.info(
Expand Down
15 changes: 11 additions & 4 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ packages = [{include = "docling"}]
######################
python = "^3.9"
pydantic = ">=2.0.0,<2.10"
docling-core = "^2.5.1"
docling-core = "^2.6.1"
docling-ibm-models = "^2.0.6"
deepsearch-glm = "^0.26.1"
filetype = "^1.2.0"
Expand Down

0 comments on commit 418d815

Please sign in to comment.