Skip to content

Commit

Permalink
enhancement: Add timeout limit to document parsing job. DS4SD#270
Browse files (browse the repository at this point in the history)
Testing:
(.venv) mario@Abhisheks-MacBook-Air docling % docling https://arxiv.org/pdf/2206.01062 --document-timeout=100.123
INFO:docling.document_converter:Going to convert document batch...
Fetching 9 files: 100%|█████████████████████████████████████████████| 9/9 [00:00<00:00, 27513.66it/s]
INFO:docling.pipeline.base_pipeline:Processing document 2206.01062v1.pdf
INFO:docling.document_converter:Finished converting document 2206.01062v1.pdf in 23.67 sec.
INFO:docling.cli.main:writing Markdown output to 2206.01062v1.md
INFO:docling.cli.main:Processed 1 docs, of which 0 failed
INFO:docling.cli.main:All documents were converted in 23.68 seconds.

(.venv) mario@Abhisheks-MacBook-Air docling % docling https://arxiv.org/pdf/2206.01062 --document-timeout=5.4567
INFO:docling.document_converter:Going to convert document batch...
Fetching 9 files: 100%|█████████████████████████████████████████████| 9/9 [00:00<00:00, 50805.84it/s]
INFO:docling.pipeline.base_pipeline:Processing document 2206.01062v1.pdf
WARNING:docling.pipeline.base_pipeline:Document processing time (6.477 seconds) exceeded the specified timeout of 5.457 seconds
INFO:docling.document_converter:Finished converting document 2206.01062v1.pdf in 10.65 sec.
WARNING:docling.cli.main:Document /var/folders/d7/dsfkllxs0xs8x2t4fcjknj4c0000gn/T/tmp9v8ng4n3/2206.01062v1.pdf failed to convert.
INFO:docling.cli.main:Processed 1 docs, of which 1 failed
INFO:docling.cli.main:All documents were converted in 10.65 seconds.

(.venv) mario@Abhisheks-MacBook-Air docling % docling https://arxiv.org/pdf/2206.01062
INFO:docling.document_converter:Going to convert document batch...
Fetching 9 files: 100%|█████████████████████████████████████████████| 9/9 [00:00<00:00, 85792.58it/s]
INFO:docling.pipeline.base_pipeline:Processing document 2206.01062v1.pdf
INFO:docling.document_converter:Finished converting document 2206.01062v1.pdf in 21.84 sec.
INFO:docling.cli.main:writing Markdown output to 2206.01062v1.md
INFO:docling.cli.main:Processed 1 docs, of which 0 failed
INFO:docling.cli.main:All documents were converted in 21.85 seconds.

(.venv) mario@Abhisheks-MacBook-Air docling % docling

 Usage: docling [OPTIONS] source

╭─ Arguments ───────────────────────────────────────────────────────────────────────────────────────╮
│ *    input_sources      source  PDF files to convert. Can be local file / directory paths or URL. │
│                                 [default: None]                                                   │
│                                 [required]                                                        │
╰───────────────────────────────────────────────────────────────────────────────────────────────────╯
╭─ Options ─────────────────────────────────────────────────────────────────────────────────────────╮
│ --from                                       [docx|pptx|html|image|pd  Specify input formats to   │
│                                              f|asciidoc|md]            convert from. Defaults to  │
│                                                                        all formats.               │
│                                                                        [default: None]            │
│ --to                                         [md|json|text|doctags]    Specify output formats.    │
│                                                                        Defaults to Markdown.      │
│                                                                        [default: None]            │
│ --ocr                 --no-ocr                                         If enabled, the bitmap     │
│                                                                        content will be processed  │
│                                                                        using OCR.                 │
│                                                                        [default: ocr]             │
│ --force-ocr           --no-force-ocr                                   Replace any existing text  │
│                                                                        with OCR generated text    │
│                                                                        over the full content.     │
│                                                                        [default: no-force-ocr]    │
│ --ocr-engine                                 [easyocr|tesseract_cli|t  The OCR engine to use.     │
│                                              esseract]                 [default: easyocr]         │
│ --pdf-backend                                [pypdfium2|dlparse_v1|dl  The PDF backend to use.    │
│                                              parse_v2]                 [default: dlparse_v1]      │
│ --table-mode                                 [fast|accurate]           The mode to use in the     │
│                                                                        table structure model.     │
│                                                                        [default: fast]            │
│ --artifacts-path                             PATH                      If provided, the location  │
│                                                                        of the model artifacts.    │
│                                                                        [default: None]            │
│ --abort-on-error      --no-abort-on-error                              If enabled, the bitmap     │
│                                                                        content will be processed  │
│                                                                        using OCR.                 │
│                                                                        [default:                  │
│                                                                        no-abort-on-error]         │
│ --output                                     PATH                      Output directory where     │
│                                                                        results are saved.         │
│                                                                        [default: .]               │
│ --version                                                              Show version information.  │
│ --document-timeout                           FLOAT                     The timeout for processing │
│                                                                        each document, in seconds. │
│                                                                        [default: None]            │
│ --help                                                                 Show this message and      │
│                                                                        exit.                      │
╰───────────────────────────────────────────────────────────────────────────────────────────────────╯
  • Loading branch information
Abhishek Kumar authored and Abhishek Kumar committed Nov 22, 2024
1 parent 2c0c439 commit 0b82dbf
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 3 deletions.
8 changes: 8 additions & 0 deletions docling/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,13 @@ def convert(
help="Show version information.",
),
] = None,
document_timeout: Annotated[
Optional[float],
typer.Option(
...,
help="The timeout for processing each document, in seconds.",
),
] = None,
):
logging.basicConfig(level=logging.INFO)

Expand Down Expand Up @@ -238,6 +245,7 @@ def convert(
do_ocr=ocr,
ocr_options=ocr_options,
do_table_structure=True,
document_timeout=document_timeout,
)
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
pipeline_options.table_structure_options.mode = table_mode
Expand Down
1 change: 1 addition & 0 deletions docling/datamodel/pipeline_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,3 +82,4 @@ class PdfPipelineOptions(PipelineOptions):
generate_page_images: bool = False
generate_picture_images: bool = False
generate_table_images: bool = False
document_timeout: Optional[float] = None
15 changes: 12 additions & 3 deletions docling/pipeline/base_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
# conv_res.status = ConversionStatus.FAILURE
# return conv_res

total_elapsed_time = 0
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):

for i in range(0, conv_res.input.page_count):
Expand All @@ -136,7 +137,7 @@ def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
for page_batch in chunkify(
conv_res.pages, settings.perf.page_batch_size
):
start_pb_time = time.time()
start_batch_time = time.monotonic()

# 1. Initialise the page resources
init_pages = map(
Expand All @@ -149,8 +150,16 @@ def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
for p in pipeline_pages: # Must exhaust!
pass

end_pb_time = time.time() - start_pb_time
_log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
end_batch_time = time.monotonic()
total_elapsed_time += end_batch_time - start_batch_time
if self.pipeline_options.document_timeout is not None and total_elapsed_time > self.pipeline_options.document_timeout:
_log.warning(
f"Document processing time ({total_elapsed_time:.3f} seconds) exceeded the specified timeout of {self.pipeline_options.document_timeout:.3f} seconds"
)
conv_res.status = ConversionStatus.PARTIAL_SUCCESS
break

_log.debug(f"Finished converting page batch time={end_batch_time:.3f}")

except Exception as e:
conv_res.status = ConversionStatus.FAILURE
Expand Down

0 comments on commit 0b82dbf

Please sign in to comment.