Skip to content

Commit

Permalink
Merge branch 'main' of github.com:DS4SD/docling-core into dev/add-to_…
Browse files Browse the repository at this point in the history
…indented_text
  • Loading branch information
cau-git committed Oct 22, 2024
2 parents 62106a7 + d09fe7e commit bfcd07f
Show file tree
Hide file tree
Showing 6 changed files with 1,881 additions and 237 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ poetry run pytest test

Docling Core contains 3 top-level data types:

- **DoclingDocument** for publications like books, articles, reports, or patents. When Docling converts an unstructured PDF document, the generated JSON follows this schema.
- **DoclingDocument** for publications like books, articles, reports, or patents. The JSON that can be exported using Docling follows this schema.
The DoclingDocument type also models the metadata that may be attached to the converted document.
Check [DoclingDocument](docs/DoclingDocument.json) for the full JSON schema.
- **Record** for structured database records, centered on an entity or _subject_ that is provided with a list of attributes.
Expand Down
5 changes: 4 additions & 1 deletion docling_core/transforms/chunker/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,7 @@
"""Define the chunker types."""

from docling_core.transforms.chunker.base import BaseChunk, BaseChunker, BaseMeta
from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
from docling_core.transforms.chunker.hierarchical_chunker import (
DocMeta,
HierarchicalChunker,
)
2 changes: 1 addition & 1 deletion docling_core/transforms/chunker/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@


class BaseMeta(BaseModel):
"""Metadata base class."""
"""Chunk metadata base class."""

excluded_embed: ClassVar[list[str]] = []
excluded_llm: ClassVar[list[str]] = []
Expand Down
64 changes: 58 additions & 6 deletions docling_core/transforms/chunker/hierarchical_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,19 @@
from __future__ import annotations

import logging
from typing import Any, ClassVar, Iterator, Optional
import re
from typing import Any, ClassVar, Final, Iterator, Literal, Optional

from pandas import DataFrame
from pydantic import Field
from pydantic import Field, StringConstraints, field_validator
from typing_extensions import Annotated

from docling_core.search.package import VERSION_PATTERN
from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
from docling_core.types.doc import DoclingDocument as DLDocument
from docling_core.types import DoclingDocument as DLDocument
from docling_core.types.doc.document import (
DocItem,
DocumentOrigin,
LevelNumber,
ListItem,
SectionHeaderItem,
Expand All @@ -25,16 +29,31 @@
)
from docling_core.types.doc.labels import DocItemLabel

_VERSION: Final = "1.0.0"

_KEY_SCHEMA_NAME = "schema_name"
_KEY_VERSION = "version"
_KEY_DOC_ITEMS = "doc_items"
_KEY_HEADINGS = "headings"
_KEY_CAPTIONS = "captions"
_KEY_ORIGIN = "origin"

_logger = logging.getLogger(__name__)


class DocMeta(BaseMeta):
"""Data model for Hierarchical Chunker metadata."""
"""Data model for Hierarchical Chunker chunk metadata."""

schema_name: Literal["docling_core.transforms.chunker.DocMeta"] = Field(
default="docling_core.transforms.chunker.DocMeta",
alias=_KEY_SCHEMA_NAME,
)
version: Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)] = (
Field(
default=_VERSION,
alias=_KEY_VERSION,
)
)
doc_items: list[DocItem] = Field(
alias=_KEY_DOC_ITEMS,
min_length=1,
Expand All @@ -49,9 +68,39 @@ class DocMeta(BaseMeta):
alias=_KEY_CAPTIONS,
min_length=1,
)
origin: Optional[DocumentOrigin] = Field(
default=None,
alias=_KEY_ORIGIN,
)

excluded_embed: ClassVar[list[str]] = [_KEY_DOC_ITEMS]
excluded_llm: ClassVar[list[str]] = [_KEY_DOC_ITEMS]
excluded_embed: ClassVar[list[str]] = [
_KEY_SCHEMA_NAME,
_KEY_VERSION,
_KEY_DOC_ITEMS,
_KEY_ORIGIN,
]
excluded_llm: ClassVar[list[str]] = [
_KEY_SCHEMA_NAME,
_KEY_VERSION,
_KEY_DOC_ITEMS,
_KEY_ORIGIN,
]

@field_validator(_KEY_VERSION)
@classmethod
def check_version_is_compatible(cls, v: str) -> str:
    """Check if this meta item version is compatible with current version.

    A version is accepted when it matches ``VERSION_PATTERN``, has the same
    major component as ``_VERSION``, and its minor component is not greater
    than the minor component of ``_VERSION``.

    Args:
        v: The version string found on the meta item being validated.

    Returns:
        The current schema version ``_VERSION`` (not the input ``v``).

    Raises:
        ValueError: If ``v`` does not match ``VERSION_PATTERN`` or is not
            compatible with ``_VERSION``.
    """
    current_match = re.match(VERSION_PATTERN, _VERSION)
    doc_match = re.match(VERSION_PATTERN, v)
    if (
        doc_match is None
        or current_match is None
        or doc_match["major"] != current_match["major"]
        # Compare minor components numerically: as strings the comparison is
        # lexicographic, so "10" > "9" would be False and "2" > "10" True.
        # NOTE(review): assumes the "minor" group of VERSION_PATTERN only
        # captures decimal digits -- confirm against its definition.
        or int(doc_match["minor"]) > int(current_match["minor"])
    ):
        raise ValueError(f"incompatible version {v} with schema version {_VERSION}")
    # The validator's return value is the current version, as in the original.
    return _VERSION


class DocChunk(BaseChunk):
Expand Down Expand Up @@ -129,6 +178,7 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
for k in sorted(heading_by_level)
]
or None,
origin=dl_doc.origin,
),
)
list_items = [] # reset
Expand Down Expand Up @@ -171,6 +221,7 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
headings=[heading_by_level[k] for k in sorted(heading_by_level)]
or None,
captions=captions,
origin=dl_doc.origin,
),
)
yield c
Expand All @@ -182,5 +233,6 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
doc_items=list_items,
headings=[heading_by_level[k] for k in sorted(heading_by_level)]
or None,
origin=dl_doc.origin,
),
)
Loading

0 comments on commit bfcd07f

Please sign in to comment.