From e125b9b24d41c122a74f9f68b3132ce096f07985 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Sun, 8 Dec 2024 18:32:08 +0100 Subject: [PATCH 1/4] fix: main: Introduce format options for Image with the same pdf pipeline_options. Add RapidOcrOptions to the Union of ocr_options for PdfPipelineOptions Signed-off-by: Nikos Livathinos --- docling/cli/main.py | 6 ++++-- docling/datamodel/pipeline_options.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/docling/cli/main.py b/docling/cli/main.py index 87a93d15..a2e6962e 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -342,11 +342,13 @@ def convert( else: raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}") - format_options: Dict[InputFormat, FormatOption] = { - InputFormat.PDF: PdfFormatOption( + pdf_format_option = PdfFormatOption( pipeline_options=pipeline_options, backend=backend, # pdf_backend ) + format_options: Dict[InputFormat, FormatOption] = { + InputFormat.PDF: pdf_format_option, + InputFormat.IMAGE: pdf_format_option, } doc_converter = DocumentConverter( allowed_formats=from_formats, diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 63e0d3c6..13560132 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -143,7 +143,7 @@ class PdfPipelineOptions(PipelineOptions): table_structure_options: TableStructureOptions = TableStructureOptions() ocr_options: Union[ - EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions + EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions, RapidOcrOptions ] = Field(EasyOcrOptions(), discriminator="kind") images_scale: float = 1.0 From 64c738288091e012a18ad9d1cfa120c6b1859c75 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Sun, 8 Dec 2024 18:35:06 +0100 Subject: [PATCH 2/4] fix: Silence the tqdm messages during the downloading of model files Signed-off-by: Nikos Livathinos --- docling/pipeline/standard_pdf_pipeline.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py index 40105a38..f733ced9 100644 --- a/docling/pipeline/standard_pdf_pipeline.py +++ b/docling/pipeline/standard_pdf_pipeline.py @@ -98,6 +98,11 @@ def download_models_hf( ) -> Path: from huggingface_hub import snapshot_download + # Disable tqdm prints used by HF + from tqdm import tqdm + from functools import partialmethod + tqdm.__init__ = partialmethod(tqdm.__init__, disable=True) + download_path = snapshot_download( repo_id="ds4sd/docling-models", force_download=force, From 04977aac9fe90009f41948744755907bea0f7c34 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Sun, 8 Dec 2024 22:14:48 +0100 Subject: [PATCH 3/4] fix: Code styling Signed-off-by: Nikos Livathinos --- docling/cli/main.py | 6 +++--- docling/datamodel/pipeline_options.py | 6 +++++- docling/pipeline/standard_pdf_pipeline.py | 4 +++- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/docling/cli/main.py b/docling/cli/main.py index a2e6962e..cb21365e 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -343,9 +343,9 @@ def convert( raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}") pdf_format_option = PdfFormatOption( - pipeline_options=pipeline_options, - backend=backend, # pdf_backend - ) + pipeline_options=pipeline_options, + backend=backend, # pdf_backend + ) format_options: Dict[InputFormat, FormatOption] = { InputFormat.PDF: pdf_format_option, InputFormat.IMAGE: pdf_format_option, diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 13560132..9be3ee82 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -143,7 +143,11 @@ class PdfPipelineOptions(PipelineOptions): table_structure_options: TableStructureOptions = TableStructureOptions() ocr_options: Union[ - EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions, RapidOcrOptions + EasyOcrOptions, + TesseractCliOcrOptions, + TesseractOcrOptions, + OcrMacOptions, + RapidOcrOptions, ] = Field(EasyOcrOptions(), discriminator="kind") images_scale: float = 1.0 diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py index f733ced9..42ce238f 100644 --- a/docling/pipeline/standard_pdf_pipeline.py +++ b/docling/pipeline/standard_pdf_pipeline.py @@ -96,11 +96,13 @@ def __init__(self, pipeline_options: PdfPipelineOptions): def download_models_hf( local_dir: Optional[Path] = None, force: bool = False ) -> Path: + from functools import partialmethod + from huggingface_hub import snapshot_download # Disable tqdm prints used by HF from tqdm import tqdm - from functools import partialmethod + tqdm.__init__ = partialmethod(tqdm.__init__, disable=True) download_path = snapshot_download( From bb83bc3f8d3506c06ffa886f39d51b463e339af8 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Mon, 9 Dec 2024 14:25:18 +0100 Subject: [PATCH 4/4] fix: Use the HF API to disable the tqdm progress bars Signed-off-by: Nikos Livathinos --- docling/pipeline/standard_pdf_pipeline.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py index 42ce238f..6f5e1542 100644 --- a/docling/pipeline/standard_pdf_pipeline.py +++ b/docling/pipeline/standard_pdf_pipeline.py @@ -96,15 +96,10 @@ def __init__(self, pipeline_options: PdfPipelineOptions): def download_models_hf( local_dir: Optional[Path] = None, force: bool = False ) -> Path: - from functools import partialmethod - from huggingface_hub import snapshot_download + from huggingface_hub.utils import disable_progress_bars - # Disable tqdm prints used by HF - from tqdm import tqdm - - tqdm.__init__ = partialmethod(tqdm.__init__, disable=True) - + disable_progress_bars() download_path = snapshot_download( repo_id="ds4sd/docling-models", force_download=force,