diff --git a/docling/cli/main.py b/docling/cli/main.py
index 60a3c296a..d68224b4b 100644
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -194,6 +194,13 @@ def convert(
             help="Show version information.",
         ),
     ] = None,
+    document_timeout: Annotated[
+        Optional[float],
+        typer.Option(
+            ...,
+            help="The timeout for processing each document, in seconds.",
+        ),
+    ] = None,
 ):
     logging.basicConfig(level=logging.INFO)
 
@@ -238,6 +245,7 @@ def convert(
         do_ocr=ocr,
         ocr_options=ocr_options,
         do_table_structure=True,
+        document_timeout=document_timeout,
     )
     pipeline_options.table_structure_options.do_cell_matching = True  # do_cell_matching
     pipeline_options.table_structure_options.mode = table_mode
diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index 2b9d228c5..f4295ceb9 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -82,3 +82,4 @@ class PdfPipelineOptions(PipelineOptions):
     generate_page_images: bool = False
     generate_picture_images: bool = False
     generate_table_images: bool = False
+    document_timeout: Optional[float] = None  # seconds; None disables the limit
diff --git a/docling/pipeline/base_pipeline.py b/docling/pipeline/base_pipeline.py
index 5013ad584..181588b0c 100644
--- a/docling/pipeline/base_pipeline.py
+++ b/docling/pipeline/base_pipeline.py
@@ -126,6 +126,7 @@ def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
             # conv_res.status = ConversionStatus.FAILURE
             # return conv_res
 
+        total_elapsed_time = 0.0  # seconds spent in all page batches so far
         with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
 
             for i in range(0, conv_res.input.page_count):
@@ -136,7 +137,7 @@ def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
                 for page_batch in chunkify(
                     conv_res.pages, settings.perf.page_batch_size
                 ):
-                    start_pb_time = time.time()
+                    start_batch_time = time.monotonic()
 
                     # 1. Initialise the page resources
                     init_pages = map(
@@ -149,8 +150,25 @@ def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
                     for p in pipeline_pages:  # Must exhaust!
                         pass
 
-                    end_pb_time = time.time() - start_pb_time
-                    _log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
+                    # Duration of this batch only (not a raw clock reading), so
+                    # the debug line keeps reporting elapsed seconds per batch.
+                    batch_elapsed_time = time.monotonic() - start_batch_time
+                    total_elapsed_time += batch_elapsed_time
+                    _log.debug(
+                        f"Finished converting page batch time={batch_elapsed_time:.3f}"
+                    )
+
+                    # The timeout is only checked between batches, so a batch in
+                    # flight always runs to completion before processing stops.
+                    # NOTE(review): the field is declared on PdfPipelineOptions;
+                    # getattr guards other options classes - confirm if needed.
+                    timeout = getattr(self.pipeline_options, "document_timeout", None)
+                    if timeout is not None and total_elapsed_time > timeout:
+                        _log.warning(
+                            f"Document processing time ({total_elapsed_time:.3f} seconds) exceeded the specified timeout of {timeout:.3f} seconds"
+                        )
+                        conv_res.status = ConversionStatus.PARTIAL_SUCCESS
+                        break
 
             except Exception as e:
                 conv_res.status = ConversionStatus.FAILURE