Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: extend chunk meta with schema, version, origin #49

Merged
merged 1 commit into from
Oct 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion docling_core/transforms/chunker/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,7 @@
"""Define the chunker types."""

from docling_core.transforms.chunker.base import BaseChunk, BaseChunker, BaseMeta
from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
from docling_core.transforms.chunker.hierarchical_chunker import (
DocMeta,
HierarchicalChunker,
)
2 changes: 1 addition & 1 deletion docling_core/transforms/chunker/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@


class BaseMeta(BaseModel):
"""Metadata base class."""
"""Chunk metadata base class."""

excluded_embed: ClassVar[list[str]] = []
excluded_llm: ClassVar[list[str]] = []
Expand Down
64 changes: 58 additions & 6 deletions docling_core/transforms/chunker/hierarchical_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,19 @@
from __future__ import annotations

import logging
from typing import Any, ClassVar, Iterator, Optional
import re
from typing import Any, ClassVar, Final, Iterator, Literal, Optional

from pandas import DataFrame
from pydantic import Field
from pydantic import Field, StringConstraints, field_validator
from typing_extensions import Annotated

from docling_core.search.package import VERSION_PATTERN
from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
from docling_core.types.doc import DoclingDocument as DLDocument
from docling_core.types import DoclingDocument as DLDocument
from docling_core.types.doc.document import (
DocItem,
DocumentOrigin,
LevelNumber,
ListItem,
SectionHeaderItem,
Expand All @@ -25,16 +29,31 @@
)
from docling_core.types.doc.labels import DocItemLabel

_VERSION: Final = "1.0.0"

_KEY_SCHEMA_NAME = "schema_name"
_KEY_VERSION = "version"
_KEY_DOC_ITEMS = "doc_items"
_KEY_HEADINGS = "headings"
_KEY_CAPTIONS = "captions"
_KEY_ORIGIN = "origin"

_logger = logging.getLogger(__name__)


class DocMeta(BaseMeta):
"""Data model for Hierarchical Chunker metadata."""
"""Data model for Hierarchical Chunker chunk metadata."""

schema_name: Literal["docling_core.transforms.chunker.DocMeta"] = Field(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agree with setting a full path to mark the schema. We may want to do the same with the DoclingDocument schema name at https://github.com/DS4SD/docling-core/blob/main/docling_core/types/doc/document.py#L784

default="docling_core.transforms.chunker.DocMeta",
alias=_KEY_SCHEMA_NAME,
)
version: Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)] = (
Field(
default=_VERSION,
alias=_KEY_VERSION,
)
)
doc_items: list[DocItem] = Field(
alias=_KEY_DOC_ITEMS,
min_length=1,
Expand All @@ -49,9 +68,39 @@ class DocMeta(BaseMeta):
alias=_KEY_CAPTIONS,
min_length=1,
)
origin: Optional[DocumentOrigin] = Field(
default=None,
alias=_KEY_ORIGIN,
)

excluded_embed: ClassVar[list[str]] = [_KEY_DOC_ITEMS]
excluded_llm: ClassVar[list[str]] = [_KEY_DOC_ITEMS]
excluded_embed: ClassVar[list[str]] = [
_KEY_SCHEMA_NAME,
_KEY_VERSION,
_KEY_DOC_ITEMS,
_KEY_ORIGIN,
]
excluded_llm: ClassVar[list[str]] = [
_KEY_SCHEMA_NAME,
_KEY_VERSION,
_KEY_DOC_ITEMS,
_KEY_ORIGIN,
]

@field_validator(_KEY_VERSION)
@classmethod
def check_version_is_compatible(cls, v: str) -> str:
    """Check if this meta item version is compatible with current version.

    A version is accepted when it matches ``VERSION_PATTERN``, has the same
    major component as ``_VERSION``, and has a minor component that is not
    greater than the minor component of ``_VERSION``.

    Args:
        v: The version string to validate.

    Returns:
        The current schema version ``_VERSION`` (not the input ``v``), so an
        accepted value is always stored as the current version.

    Raises:
        ValueError: If ``v`` or ``_VERSION`` does not match
            ``VERSION_PATTERN``, if the major components differ, or if the
            minor component of ``v`` is greater than that of ``_VERSION``.
    """
    current_match = re.match(VERSION_PATTERN, _VERSION)
    doc_match = re.match(VERSION_PATTERN, v)
    if (
        doc_match is None
        or current_match is None
        or doc_match["major"] != current_match["major"]
        # Compare minors numerically: as strings, "10" > "9" is False, which
        # would wrongly accept a newer minor version once it reaches two digits.
        # NOTE(review): assumes the "minor" group captures decimal digits only
        # -- confirm against VERSION_PATTERN.
        or int(doc_match["minor"]) > int(current_match["minor"])
    ):
        raise ValueError(f"incompatible version {v} with schema version {_VERSION}")
    return _VERSION
Comment on lines +91 to +103
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point. We will just need to account for the fact that the validator must also take the target version as a parameter, since it differs between the two cases (see pydantic/pydantic#2938).



class DocChunk(BaseChunk):
Expand Down Expand Up @@ -129,6 +178,7 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
for k in sorted(heading_by_level)
]
or None,
origin=dl_doc.origin,
),
)
list_items = [] # reset
Expand Down Expand Up @@ -171,6 +221,7 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
headings=[heading_by_level[k] for k in sorted(heading_by_level)]
or None,
captions=captions,
origin=dl_doc.origin,
),
)
yield c
Expand All @@ -182,5 +233,6 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
doc_items=list_items,
headings=[heading_by_level[k] for k in sorted(heading_by_level)]
or None,
origin=dl_doc.origin,
),
)
Loading
Loading