Merge branch 'main' into nli/performance_main

Signed-off-by: Nikos Livathinos <[email protected]>
DS4SD · Dec 13, 2024 · 6209cf3 · 6209cf3
2 parents 30dbab5 + 365a1e7
commit 6209cf3
Show file tree

Hide file tree

Showing 17 changed files with 575 additions and 884 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,30 @@
+## [v2.11.0](https://github.com/DS4SD/docling/releases/tag/v2.11.0) - 2024-12-12
+
+### Feature
+
+* Add timeout limit to document parsing job. DS4SD#270 ([#552](https://github.com/DS4SD/docling/issues/552)) ([`3da166e`](https://github.com/DS4SD/docling/commit/3da166eafa3c119de961510341cb92397652c222))
+
+### Fix
+
+* Do not import python modules from deepsearch-glm ([#569](https://github.com/DS4SD/docling/issues/569)) ([`aee9c0b`](https://github.com/DS4SD/docling/commit/aee9c0b324a07190ad03ad3a6266e76c465d4cdf))
+* Handle no result from RapidOcr reader ([#558](https://github.com/DS4SD/docling/issues/558)) ([`f45499c`](https://github.com/DS4SD/docling/commit/f45499ce9349fe55538dfb36d74c395e9193d9b1))
+* Make enum serializable with human-readable value ([#555](https://github.com/DS4SD/docling/issues/555)) ([`a7df337`](https://github.com/DS4SD/docling/commit/a7df337654fa5fa7633af8740fb5e4cc4a06f250))
+
+### Documentation
+
+* Update chunking usage docs, minor reorg ([#550](https://github.com/DS4SD/docling/issues/550)) ([`d0c9e8e`](https://github.com/DS4SD/docling/commit/d0c9e8e508d7edef5e733be6cdea2cea0a9a0695))
+
+## [v2.10.0](https://github.com/DS4SD/docling/releases/tag/v2.10.0) - 2024-12-09
+
+### Feature
+
+* Docling-parse v2 as default PDF backend ([#549](https://github.com/DS4SD/docling/issues/549)) ([`aca57f0`](https://github.com/DS4SD/docling/commit/aca57f0527dddcc027dc1ee840e2e492ab997170))
+
+### Fix
+
+* Call into docling-core for legacy document transform ([#551](https://github.com/DS4SD/docling/issues/551)) ([`7972d47`](https://github.com/DS4SD/docling/commit/7972d47f88604f02d6a32527116c4d78eb1005e2))
+* Introduce Image format options in CLI. Silence the tqdm downloading messages. ([#544](https://github.com/DS4SD/docling/issues/544)) ([`78f61a8`](https://github.com/DS4SD/docling/commit/78f61a8522d3a19ecc1d605e8441fb543ca0fa96))
+
 ## [v2.9.0](https://github.com/DS4SD/docling/releases/tag/v2.9.0) - 2024-12-09
 
 ### Feature

diff --git a/docling/cli/main.py b/docling/cli/main.py
@@ -29,8 +29,10 @@
     AcceleratorDevice,
     AcceleratorOptions,
     EasyOcrOptions,
+    OcrEngine,
     OcrMacOptions,
     OcrOptions,
+    PdfBackend,
     PdfPipelineOptions,
     RapidOcrOptions,
     TableFormerMode,
@@ -70,22 +72,6 @@ def version_callback(value: bool):
         raise typer.Exit()
 
 
-# Define an enum for the backend options
-class PdfBackend(str, Enum):
-    PYPDFIUM2 = "pypdfium2"
-    DLPARSE_V1 = "dlparse_v1"
-    DLPARSE_V2 = "dlparse_v2"
-
-
-# Define an enum for the ocr engines
-class OcrEngine(str, Enum):
-    EASYOCR = "easyocr"
-    TESSERACT_CLI = "tesseract_cli"
-    TESSERACT = "tesseract"
-    OCRMAC = "ocrmac"
-    RAPIDOCR = "rapidocr"
-
-
 def export_documents(
     conv_results: Iterable[ConversionResult],
     output_dir: Path,
@@ -266,6 +252,13 @@ def convert(
             help="Show version information.",
         ),
     ] = None,
+    document_timeout: Annotated[
+        Optional[float],
+        typer.Option(
+            ...,
+            help="The timeout for processing each document, in seconds.",
+        ),
+    ] = None,
     num_threads: Annotated[int, typer.Option(..., help="Number of threads")] = 4,
     device: Annotated[
         AcceleratorDevice, typer.Option(..., help="Accelerator device")
@@ -355,6 +348,7 @@ def convert(
             do_ocr=ocr,
             ocr_options=ocr_options,
             do_table_structure=True,
+            document_timeout=document_timeout,
         )
         pipeline_options.table_structure_options.do_cell_matching = (
             True  # do_cell_matching

diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py
@@ -19,12 +19,12 @@
 
 
 class ConversionStatus(str, Enum):
-    PENDING = auto()
-    STARTED = auto()
-    FAILURE = auto()
-    SUCCESS = auto()
-    PARTIAL_SUCCESS = auto()
-    SKIPPED = auto()
+    PENDING = "pending"
+    STARTED = "started"
+    FAILURE = "failure"
+    SUCCESS = "success"
+    PARTIAL_SUCCESS = "partial_success"
+    SKIPPED = "skipped"
 
 
 class InputFormat(str, Enum):
@@ -89,15 +89,15 @@ class OutputFormat(str, Enum):
 
 
 class DocInputType(str, Enum):
-    PATH = auto()
-    STREAM = auto()
+    PATH = "path"
+    STREAM = "stream"
 
 
 class DoclingComponentType(str, Enum):
-    DOCUMENT_BACKEND = auto()
-    MODEL = auto()
-    DOC_ASSEMBLER = auto()
-    USER_INPUT = auto()
+    DOCUMENT_BACKEND = "document_backend"
+    MODEL = "model"
+    DOC_ASSEMBLER = "doc_assembler"
+    USER_INPUT = "user_input"
 
 
 class ErrorItem(BaseModel):

diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
@@ -33,6 +33,7 @@
 from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
 from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
 from docling_core.utils.file import resolve_source_to_stream
+from docling_core.utils.legacy import docling_document_to_legacy
 from pydantic import BaseModel
 from typing_extensions import deprecated
 
@@ -191,259 +192,7 @@ class ConversionResult(BaseModel):
     @property
     @deprecated("Use document instead.")
     def legacy_document(self):
-        reverse_label_mapping = {
-            DocItemLabel.CAPTION.value: "Caption",
-            DocItemLabel.FOOTNOTE.value: "Footnote",
-            DocItemLabel.FORMULA.value: "Formula",
-            DocItemLabel.LIST_ITEM.value: "List-item",
-            DocItemLabel.PAGE_FOOTER.value: "Page-footer",
-            DocItemLabel.PAGE_HEADER.value: "Page-header",
-            DocItemLabel.PICTURE.value: "Picture",  # low threshold adjust to capture chemical structures for examples.
-            DocItemLabel.SECTION_HEADER.value: "Section-header",
-            DocItemLabel.TABLE.value: "Table",
-            DocItemLabel.TEXT.value: "Text",
-            DocItemLabel.TITLE.value: "Title",
-            DocItemLabel.DOCUMENT_INDEX.value: "Document Index",
-            DocItemLabel.CODE.value: "Code",
-            DocItemLabel.CHECKBOX_SELECTED.value: "Checkbox-Selected",
-            DocItemLabel.CHECKBOX_UNSELECTED.value: "Checkbox-Unselected",
-            DocItemLabel.FORM.value: "Form",
-            DocItemLabel.KEY_VALUE_REGION.value: "Key-Value Region",
-            DocItemLabel.PARAGRAPH.value: "paragraph",
-        }
-
-        title = ""
-        desc = DsDocumentDescription(logs=[])
-
-        page_hashes = [
-            PageReference(
-                hash=create_hash(self.input.document_hash + ":" + str(p.page_no - 1)),
-                page=p.page_no,
-                model="default",
-            )
-            for p in self.document.pages.values()
-        ]
-
-        file_info = DsFileInfoObject(
-            filename=self.input.file.name,
-            document_hash=self.input.document_hash,
-            num_pages=self.input.page_count,
-            page_hashes=page_hashes,
-        )
-
-        main_text = []
-        tables = []
-        figures = []
-        equations = []
-        footnotes = []
-        page_headers = []
-        page_footers = []
-
-        embedded_captions = set()
-        for ix, (item, level) in enumerate(
-            self.document.iterate_items(self.document.body)
-        ):
-
-            if isinstance(item, (TableItem, PictureItem)) and len(item.captions) > 0:
-                caption = item.caption_text(self.document)
-                if caption:
-                    embedded_captions.add(caption)
-
-        for item, level in self.document.iterate_items():
-            if isinstance(item, DocItem):
-                item_type = item.label
-
-                if isinstance(item, (TextItem, ListItem, SectionHeaderItem)):
-
-                    if isinstance(item, ListItem) and item.marker:
-                        text = f"{item.marker} {item.text}"
-                    else:
-                        text = item.text
-
-                    # Can be empty.
-                    prov = [
-                        Prov(
-                            bbox=p.bbox.as_tuple(),
-                            page=p.page_no,
-                            span=[0, len(item.text)],
-                        )
-                        for p in item.prov
-                    ]
-                    main_text.append(
-                        BaseText(
-                            text=text,
-                            obj_type=layout_label_to_ds_type.get(item.label),
-                            name=reverse_label_mapping[item.label],
-                            prov=prov,
-                        )
-                    )
-
-                    # skip captions if they are embedded in the actual
-                    # floating object
-                    if item_type == DocItemLabel.CAPTION and text in embedded_captions:
-                        continue
-
-                elif isinstance(item, TableItem) and item.data:
-                    index = len(tables)
-                    ref_str = f"#/tables/{index}"
-                    main_text.append(
-                        Ref(
-                            name=reverse_label_mapping[item.label],
-                            obj_type=layout_label_to_ds_type.get(item.label),
-                            ref=ref_str,
-                        ),
-                    )
-
-                    # Initialise empty table data grid (only empty cells)
-                    table_data = [
-                        [
-                            TableCell(
-                                text="",
-                                # bbox=[0,0,0,0],
-                                spans=[[i, j]],
-                                obj_type="body",
-                            )
-                            for j in range(item.data.num_cols)
-                        ]
-                        for i in range(item.data.num_rows)
-                    ]
-
-                    # Overwrite cells in table data for which there is actual cell content.
-                    for cell in item.data.table_cells:
-                        for i in range(
-                            min(cell.start_row_offset_idx, item.data.num_rows),
-                            min(cell.end_row_offset_idx, item.data.num_rows),
-                        ):
-                            for j in range(
-                                min(cell.start_col_offset_idx, item.data.num_cols),
-                                min(cell.end_col_offset_idx, item.data.num_cols),
-                            ):
-                                celltype = "body"
-                                if cell.column_header:
-                                    celltype = "col_header"
-                                elif cell.row_header:
-                                    celltype = "row_header"
-                                elif cell.row_section:
-                                    celltype = "row_section"
-
-                                def make_spans(cell):
-                                    for rspan in range(
-                                        min(
-                                            cell.start_row_offset_idx,
-                                            item.data.num_rows,
-                                        ),
-                                        min(
-                                            cell.end_row_offset_idx, item.data.num_rows
-                                        ),
-                                    ):
-                                        for cspan in range(
-                                            min(
-                                                cell.start_col_offset_idx,
-                                                item.data.num_cols,
-                                            ),
-                                            min(
-                                                cell.end_col_offset_idx,
-                                                item.data.num_cols,
-                                            ),
-                                        ):
-                                            yield [rspan, cspan]
-
-                                spans = list(make_spans(cell))
-                                table_data[i][j] = GlmTableCell(
-                                    text=cell.text,
-                                    bbox=(
-                                        cell.bbox.as_tuple()
-                                        if cell.bbox is not None
-                                        else None
-                                    ),  # check if this is bottom-left
-                                    spans=spans,
-                                    obj_type=celltype,
-                                    col=j,
-                                    row=i,
-                                    row_header=cell.row_header,
-                                    row_section=cell.row_section,
-                                    col_header=cell.column_header,
-                                    row_span=[
-                                        cell.start_row_offset_idx,
-                                        cell.end_row_offset_idx,
-                                    ],
-                                    col_span=[
-                                        cell.start_col_offset_idx,
-                                        cell.end_col_offset_idx,
-                                    ],
-                                )
-
-                    # Compute the caption
-                    caption = item.caption_text(self.document)
-
-                    tables.append(
-                        DsSchemaTable(
-                            text=caption,
-                            num_cols=item.data.num_cols,
-                            num_rows=item.data.num_rows,
-                            obj_type=layout_label_to_ds_type.get(item.label),
-                            data=table_data,
-                            prov=[
-                                Prov(
-                                    bbox=p.bbox.as_tuple(),
-                                    page=p.page_no,
-                                    span=[0, 0],
-                                )
-                                for p in item.prov
-                            ],
-                        )
-                    )
-
-                elif isinstance(item, PictureItem):
-                    index = len(figures)
-                    ref_str = f"#/figures/{index}"
-                    main_text.append(
-                        Ref(
-                            name=reverse_label_mapping[item.label],
-                            obj_type=layout_label_to_ds_type.get(item.label),
-                            ref=ref_str,
-                        ),
-                    )
-
-                    # Compute the caption
-                    caption = item.caption_text(self.document)
-
-                    figures.append(
-                        Figure(
-                            prov=[
-                                Prov(
-                                    bbox=p.bbox.as_tuple(),
-                                    page=p.page_no,
-                                    span=[0, len(caption)],
-                                )
-                                for p in item.prov
-                            ],
-                            obj_type=layout_label_to_ds_type.get(item.label),
-                            text=caption,
-                            # data=[[]],
-                        )
-                    )
-
-        page_dimensions = [
-            PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
-            for p in self.document.pages.values()
-        ]
-
-        ds_doc = DsDocument(
-            name=title,
-            description=desc,
-            file_info=file_info,
-            main_text=main_text,
-            equations=equations,
-            footnotes=footnotes,
-            page_headers=page_headers,
-            page_footers=page_footers,
-            tables=tables,
-            figures=figures,
-            page_dimensions=page_dimensions,
-        )
-
-        return ds_doc
+        return docling_document_to_legacy(self.document)
 
 
 class _DummyBackend(AbstractDocumentBackend):