feat: Support tableformer model choice (#90)

* Support tableformer model choice
  Signed-off-by: Christoph Auer <[email protected]>
* Update datamodel structure
  Signed-off-by: Christoph Auer <[email protected]>
* Update docs
  Signed-off-by: Christoph Auer <[email protected]>
* Cleanup
  Signed-off-by: Christoph Auer <[email protected]>
* Add test unit for table options
  Signed-off-by: Christoph Auer <[email protected]>
* Ensure import backwards-compatibility for PipelineOptions
  Signed-off-by: Christoph Auer <[email protected]>
* Update README
  Signed-off-by: Christoph Auer <[email protected]>
* Adjust parameters on custom_convert
  Signed-off-by: Christoph Auer <[email protected]>
* Update Dockerfile
  Signed-off-by: Christoph Auer <[email protected]>
---------
Signed-off-by: Christoph Auer <[email protected]>
Signed-off-by: Christoph Auer <[email protected]>
DS4SD · Sep 26, 2024 · d6df76f · d6df76f
1 parent 39977b5
commit d6df76f
Show file tree

Hide file tree

Showing 16 changed files with 711 additions and 592 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -3,7 +3,7 @@ FROM python:3.11-slim-bookworm
 ENV GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=no"
 
 RUN apt-get update \
-    && apt-get install -y libgl1 libglib2.0-0 curl wget git \
+    && apt-get install -y libgl1 libglib2.0-0 curl wget git procps \
     && apt-get clean
 
 # This will install torch with *only* cpu support

diff --git a/README.md b/README.md
@@ -159,6 +159,8 @@ This can improve output quality if you find that multiple columns in extracted t
 
 
 ```python
+from docling.datamodel.pipeline_options import PipelineOptions
+
 pipeline_options = PipelineOptions(do_table_structure=True)
 pipeline_options.table_structure_options.do_cell_matching = False  # uses text cells predicted from table structure model
 
@@ -168,6 +170,20 @@ doc_converter = DocumentConverter(
 )
 ```
 
+Since docling 1.16.0, you can choose which TableFormer mode to use: `TableFormerMode.FAST` (default) or `TableFormerMode.ACCURATE` (slower, but gives better quality on difficult table structures).
+
+```python
+from docling.datamodel.pipeline_options import PipelineOptions, TableFormerMode
+
+pipeline_options = PipelineOptions(do_table_structure=True)
+pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE  # use more accurate TableFormer model
+
+doc_converter = DocumentConverter(
+    artifacts_path=artifacts_path,
+    pipeline_options=pipeline_options,
+)
+```
+
 ### Impose limits on the document size
 
 You can limit the file size and number of pages which should be allowed to process per document:

diff --git a/docling/cli/main.py b/docling/cli/main.py
@@ -12,8 +12,9 @@
 
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.datamodel.base_models import ConversionStatus, PipelineOptions
+from docling.datamodel.base_models import ConversionStatus
 from docling.datamodel.document import ConversionResult, DocumentConversionInput
+from docling.datamodel.pipeline_options import PipelineOptions
 from docling.document_converter import DocumentConverter
 
 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")

diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py
@@ -9,6 +9,10 @@
 from typing_extensions import Self
 
 from docling.backend.abstract_backend import PdfPageBackend
+from docling.datamodel.pipeline_options import (  # Must be imported here for backward compatibility.
+    PipelineOptions,
+    TableStructureOptions,
+)
 
 
 class ConversionStatus(str, Enum):
@@ -298,22 +302,6 @@ class DocumentStream(BaseModel):
     stream: BytesIO
 
 
-class TableStructureOptions(BaseModel):
-    do_cell_matching: bool = (
-        True
-        # True:  Matches predictions back to PDF cells. Can break table output if PDF cells
-        #        are merged across table columns.
-        # False: Let table structure model define the text cells, ignore PDF cells.
-    )
-
-
-class PipelineOptions(BaseModel):
-    do_table_structure: bool = True  # True: perform table structure extraction
-    do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
-
-    table_structure_options: TableStructureOptions = TableStructureOptions()
-
-
 class AssembleOptions(BaseModel):
     keep_page_images: Annotated[
         bool,

diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
@@ -4,13 +4,13 @@
 from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
 
 from docling_core.types import BaseCell, BaseText
-from docling_core.types import BoundingBox as DsBoundingBox
 from docling_core.types import Document as DsDocument
 from docling_core.types import DocumentDescription as DsDocumentDescription
 from docling_core.types import FileInfoObject as DsFileInfoObject
 from docling_core.types import PageDimensions, PageReference, Prov, Ref
 from docling_core.types import Table as DsSchemaTable
 from docling_core.types import TableCell
+from docling_core.types.doc.base import BoundingBox as DsBoundingBox
 from docling_core.types.doc.base import Figure
 from pydantic import BaseModel
 from typing_extensions import deprecated

diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
@@ -0,0 +1,25 @@
+from enum import Enum, auto
+
+from pydantic import BaseModel
+
+
+class TableFormerMode(str, Enum):
+    FAST = auto()
+    ACCURATE = auto()
+
+
+class TableStructureOptions(BaseModel):
+    do_cell_matching: bool = (
+        True
+        # True:  Matches predictions back to PDF cells. Can break table output if PDF cells
+        #        are merged across table columns.
+        # False: Let table structure model define the text cells, ignore PDF cells.
+    )
+    mode: TableFormerMode = TableFormerMode.FAST
+
+
+class PipelineOptions(BaseModel):
+    do_table_structure: bool = True  # True: perform table structure extraction
+    do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
+
+    table_structure_options: TableStructureOptions = TableStructureOptions()
diff --git a/docling/document_converter.py b/docling/document_converter.py
@@ -18,13 +18,13 @@
     DoclingComponentType,
     ErrorItem,
     Page,
-    PipelineOptions,
 )
 from docling.datamodel.document import (
     ConversionResult,
     DocumentConversionInput,
     InputDocument,
 )
+from docling.datamodel.pipeline_options import PipelineOptions
 from docling.datamodel.settings import settings
 from docling.models.ds_glm_model import GlmModel
 from docling.models.page_assemble_model import PageAssembleModel

diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py
@@ -1,4 +1,5 @@
 import copy
+from pathlib import Path
 from typing import Iterable, List
 
 import numpy
@@ -12,16 +13,22 @@
     TableElement,
     TableStructurePrediction,
 )
+from docling.datamodel.pipeline_options import TableFormerMode
 
 
 class TableStructureModel:
     def __init__(self, config):
         self.config = config
         self.do_cell_matching = config["do_cell_matching"]
+        self.mode = config["mode"]
 
         self.enabled = config["enabled"]
         if self.enabled:
-            artifacts_path = config["artifacts_path"]
+            artifacts_path: Path = config["artifacts_path"]
+
+            if self.mode == TableFormerMode.ACCURATE:
+                artifacts_path = artifacts_path / "fat"
+
             # Third Party
             import docling_ibm_models.tableformer.common as c
 

diff --git a/docling/pipeline/base_model_pipeline.py b/docling/pipeline/base_model_pipeline.py
@@ -1,7 +1,8 @@
 from pathlib import Path
 from typing import Callable, Iterable, List
 
-from docling.datamodel.base_models import Page, PipelineOptions
+from docling.datamodel.base_models import Page
+from docling.datamodel.pipeline_options import PipelineOptions
 
 
 class BaseModelPipeline:

diff --git a/docling/pipeline/standard_model_pipeline.py b/docling/pipeline/standard_model_pipeline.py
@@ -1,6 +1,6 @@
 from pathlib import Path
 
-from docling.datamodel.base_models import PipelineOptions
+from docling.datamodel.pipeline_options import PipelineOptions
 from docling.models.easyocr_model import EasyOcrModel
 from docling.models.layout_model import LayoutModel
 from docling.models.table_structure_model import TableStructureModel
@@ -32,6 +32,7 @@ def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
                     "artifacts_path": artifacts_path
                     / StandardModelPipeline._table_model_path,
                     "enabled": pipeline_options.do_table_structure,
+                    "mode": pipeline_options.table_structure_options.mode,
                     "do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
                 }
             ),

diff --git a/examples/batch_convert.py b/examples/batch_convert.py
@@ -4,7 +4,7 @@
 from pathlib import Path
 from typing import Iterable
 
-from docling.datamodel.base_models import ConversionStatus, PipelineOptions
+from docling.datamodel.base_models import ConversionStatus
 from docling.datamodel.document import ConversionResult, DocumentConversionInput
 from docling.document_converter import DocumentConverter
 

diff --git a/examples/custom_convert.py b/examples/custom_convert.py
@@ -82,7 +82,7 @@ def main():
     # PyPdfium with OCR
     # -----------------
     # pipeline_options = PipelineOptions()
-    # pipeline_options.do_ocr=False
+    # pipeline_options.do_ocr=True
     # pipeline_options.do_table_structure=True
     # pipeline_options.table_structure_options.do_cell_matching = True