Skip to content

Commit

Permalink
feat: Support tableformer model choice (#90)
Browse files Browse the repository at this point in the history
* Support tableformer model choice

Signed-off-by: Christoph Auer <[email protected]>

* Update datamodel structure

Signed-off-by: Christoph Auer <[email protected]>

* Update docs

Signed-off-by: Christoph Auer <[email protected]>

* Cleanup

Signed-off-by: Christoph Auer <[email protected]>

* Add test unit for table options

Signed-off-by: Christoph Auer <[email protected]>

* Ensure import backwards-compatibility for PipelineOptions

Signed-off-by: Christoph Auer <[email protected]>

* Update README

Signed-off-by: Christoph Auer <[email protected]>

* Adjust parameters on custom_convert

Signed-off-by: Christoph Auer <[email protected]>

* Update Dockerfile

Signed-off-by: Christoph Auer <[email protected]>

---------

Signed-off-by: Christoph Auer <[email protected]>
Signed-off-by: Christoph Auer <[email protected]>
  • Loading branch information
cau-git authored Sep 26, 2024
1 parent 39977b5 commit d6df76f
Show file tree
Hide file tree
Showing 16 changed files with 711 additions and 592 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ FROM python:3.11-slim-bookworm
ENV GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=no"

RUN apt-get update \
&& apt-get install -y libgl1 libglib2.0-0 curl wget git \
&& apt-get install -y libgl1 libglib2.0-0 curl wget git procps \
&& apt-get clean

# This will install torch with *only* cpu support
Expand Down
16 changes: 16 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,8 @@ This can improve output quality if you find that multiple columns in extracted t


```python
from docling.datamodel.pipeline_options import PipelineOptions

pipeline_options = PipelineOptions(do_table_structure=True)
pipeline_options.table_structure_options.do_cell_matching = False # uses text cells predicted from table structure model

Expand All @@ -168,6 +170,20 @@ doc_converter = DocumentConverter(
)
```

Since docling 1.16.0, you can choose which TableFormer mode to use: `TableFormerMode.FAST` (the default) or `TableFormerMode.ACCURATE` (slower, but gives better results on difficult table structures).

```python
from docling.datamodel.pipeline_options import PipelineOptions, TableFormerMode

pipeline_options = PipelineOptions(do_table_structure=True)
pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE # use more accurate TableFormer model

doc_converter = DocumentConverter(
artifacts_path=artifacts_path,
pipeline_options=pipeline_options,
)
```

### Impose limits on the document size

You can limit the file size and the number of pages that are allowed to be processed per document:
Expand Down
3 changes: 2 additions & 1 deletion docling/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.datamodel.pipeline_options import PipelineOptions
from docling.document_converter import DocumentConverter

warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
Expand Down
20 changes: 4 additions & 16 deletions docling/datamodel/base_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@
from typing_extensions import Self

from docling.backend.abstract_backend import PdfPageBackend
from docling.datamodel.pipeline_options import ( # Must be imported here for backward compatibility.
PipelineOptions,
TableStructureOptions,
)


class ConversionStatus(str, Enum):
Expand Down Expand Up @@ -298,22 +302,6 @@ class DocumentStream(BaseModel):
stream: BytesIO


class TableStructureOptions(BaseModel):
do_cell_matching: bool = (
True
# True: Matches predictions back to PDF cells. Can break table output if PDF cells
# are merged across table columns.
# False: Let table structure model define the text cells, ignore PDF cells.
)


class PipelineOptions(BaseModel):
do_table_structure: bool = True # True: perform table structure extraction
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text

table_structure_options: TableStructureOptions = TableStructureOptions()


class AssembleOptions(BaseModel):
keep_page_images: Annotated[
bool,
Expand Down
2 changes: 1 addition & 1 deletion docling/datamodel/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union

from docling_core.types import BaseCell, BaseText
from docling_core.types import BoundingBox as DsBoundingBox
from docling_core.types import Document as DsDocument
from docling_core.types import DocumentDescription as DsDocumentDescription
from docling_core.types import FileInfoObject as DsFileInfoObject
from docling_core.types import PageDimensions, PageReference, Prov, Ref
from docling_core.types import Table as DsSchemaTable
from docling_core.types import TableCell
from docling_core.types.doc.base import BoundingBox as DsBoundingBox
from docling_core.types.doc.base import Figure
from pydantic import BaseModel
from typing_extensions import deprecated
Expand Down
25 changes: 25 additions & 0 deletions docling/datamodel/pipeline_options.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from enum import Enum, auto

from pydantic import BaseModel


class TableFormerMode(str, Enum):
    """Selects which TableFormer model variant the table-structure stage uses.

    Members carry explicit, human-readable string values instead of
    ``auto()``. On a ``str``-mixin enum, ``auto()`` yields the opaque strings
    ``"1"``, ``"2"``, ... which depend on declaration order; those would leak
    into any serialized options and silently shift if a member were ever
    inserted or reordered.
    """

    # Default mode (see TableStructureOptions.mode).
    FAST = "fast"
    # Described in the README as better on difficult tables, but slower.
    ACCURATE = "accurate"


class TableStructureOptions(BaseModel):
    """Options for the table structure extraction stage."""

    # True:  match predictions back to PDF cells. Can break table output if
    #        PDF cells are merged across table columns.
    # False: let the table structure model define the text cells and ignore
    #        the PDF cells.
    do_cell_matching: bool = True

    # Which TableFormer mode to run; FAST unless the caller overrides it.
    mode: TableFormerMode = TableFormerMode.FAST


class PipelineOptions(BaseModel):
    """Switches and nested options for the conversion pipeline."""

    # True: perform table structure extraction
    do_table_structure: bool = True

    # True: perform OCR, replace programmatic PDF text
    do_ocr: bool = True

    # Nested settings for the table structure stage.
    table_structure_options: TableStructureOptions = TableStructureOptions()
2 changes: 1 addition & 1 deletion docling/document_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,13 @@
DoclingComponentType,
ErrorItem,
Page,
PipelineOptions,
)
from docling.datamodel.document import (
ConversionResult,
DocumentConversionInput,
InputDocument,
)
from docling.datamodel.pipeline_options import PipelineOptions
from docling.datamodel.settings import settings
from docling.models.ds_glm_model import GlmModel
from docling.models.page_assemble_model import PageAssembleModel
Expand Down
9 changes: 8 additions & 1 deletion docling/models/table_structure_model.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import copy
from pathlib import Path
from typing import Iterable, List

import numpy
Expand All @@ -12,16 +13,22 @@
TableElement,
TableStructurePrediction,
)
from docling.datamodel.pipeline_options import TableFormerMode


class TableStructureModel:
def __init__(self, config):
self.config = config
self.do_cell_matching = config["do_cell_matching"]
self.mode = config["mode"]

self.enabled = config["enabled"]
if self.enabled:
artifacts_path = config["artifacts_path"]
artifacts_path: Path = config["artifacts_path"]

if self.mode == TableFormerMode.ACCURATE:
artifacts_path = artifacts_path / "fat"

# Third Party
import docling_ibm_models.tableformer.common as c

Expand Down
3 changes: 2 additions & 1 deletion docling/pipeline/base_model_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from pathlib import Path
from typing import Callable, Iterable, List

from docling.datamodel.base_models import Page, PipelineOptions
from docling.datamodel.base_models import Page
from docling.datamodel.pipeline_options import PipelineOptions


class BaseModelPipeline:
Expand Down
3 changes: 2 additions & 1 deletion docling/pipeline/standard_model_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from pathlib import Path

from docling.datamodel.base_models import PipelineOptions
from docling.datamodel.pipeline_options import PipelineOptions
from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel
from docling.models.table_structure_model import TableStructureModel
Expand Down Expand Up @@ -32,6 +32,7 @@ def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
"artifacts_path": artifacts_path
/ StandardModelPipeline._table_model_path,
"enabled": pipeline_options.do_table_structure,
"mode": pipeline_options.table_structure_options.mode,
"do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
}
),
Expand Down
2 changes: 1 addition & 1 deletion examples/batch_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from pathlib import Path
from typing import Iterable

from docling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.document_converter import DocumentConverter

Expand Down
2 changes: 1 addition & 1 deletion examples/custom_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def main():
# PyPdfium with OCR
# -----------------
# pipeline_options = PipelineOptions()
# pipeline_options.do_ocr=False
# pipeline_options.do_ocr=True
# pipeline_options.do_table_structure=True
# pipeline_options.table_structure_options.do_cell_matching = True

Expand Down
Loading

0 comments on commit d6df76f

Please sign in to comment.