From 904d24d6005d113c50bde0ad398fdaafbbfb3027 Mon Sep 17 00:00:00 2001 From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Date: Wed, 30 Oct 2024 17:54:53 +0100 Subject: [PATCH] fix: allow explicit initialization of the pipeline (#189) * feat: allow explicit initialization of the pipeline Signed-off-by: Michele Dolfi * clean examples Signed-off-by: Michele Dolfi --------- Signed-off-by: Michele Dolfi --- docling/document_converter.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/docling/document_converter.py b/docling/document_converter.py index d6d4a630..f2d29e62 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -139,6 +139,10 @@ def __init__( self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {} + def initialize_pipeline(self, format: InputFormat): + """Initialize the conversion pipeline for the selected format.""" + self._get_pipeline(doc_format=format) + @validate_call(config=ConfigDict(strict=True)) def convert( self, @@ -219,13 +223,13 @@ def _convert( else: _log.info(f"Skipped a document. We lost {elapsed:.2f} sec.") - def _get_pipeline(self, doc: InputDocument) -> Optional[BasePipeline]: + def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]: assert self.format_to_options is not None - fopt = self.format_to_options.get(doc.format) + fopt = self.format_to_options.get(doc_format) if fopt is None: - raise RuntimeError(f"Could not get pipeline for document {doc.file}") + raise RuntimeError(f"Could not get pipeline for {doc_format}") else: pipeline_class = fopt.pipeline_cls pipeline_options = fopt.pipeline_options @@ -256,7 +260,7 @@ def _execute_pipeline( self, in_doc: InputDocument, raises_on_error: bool ) -> ConversionResult: if in_doc.valid: - pipeline = self._get_pipeline(in_doc) + pipeline = self._get_pipeline(in_doc.format) if pipeline is None: # Can't find a default pipeline. Should this raise? if raises_on_error: raise RuntimeError(