-
Notifications
You must be signed in to change notification settings - Fork 28
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: extend chunk meta with schema, version, origin #49
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,15 +8,19 @@ | |
from __future__ import annotations | ||
|
||
import logging | ||
from typing import Any, ClassVar, Iterator, Optional | ||
import re | ||
from typing import Any, ClassVar, Final, Iterator, Literal, Optional | ||
|
||
from pandas import DataFrame | ||
from pydantic import Field | ||
from pydantic import Field, StringConstraints, field_validator | ||
from typing_extensions import Annotated | ||
|
||
from docling_core.search.package import VERSION_PATTERN | ||
from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta | ||
from docling_core.types.doc import DoclingDocument as DLDocument | ||
from docling_core.types import DoclingDocument as DLDocument | ||
from docling_core.types.doc.document import ( | ||
DocItem, | ||
DocumentOrigin, | ||
LevelNumber, | ||
ListItem, | ||
SectionHeaderItem, | ||
|
@@ -25,16 +29,31 @@ | |
) | ||
from docling_core.types.doc.labels import DocItemLabel | ||
|
||
_VERSION: Final = "1.0.0" | ||
|
||
_KEY_SCHEMA_NAME = "schema_name" | ||
_KEY_VERSION = "version" | ||
_KEY_DOC_ITEMS = "doc_items" | ||
_KEY_HEADINGS = "headings" | ||
_KEY_CAPTIONS = "captions" | ||
_KEY_ORIGIN = "origin" | ||
|
||
_logger = logging.getLogger(__name__) | ||
|
||
|
||
class DocMeta(BaseMeta): | ||
"""Data model for Hierarchical Chunker metadata.""" | ||
"""Data model for Hierarchical Chunker chunk metadata.""" | ||
|
||
schema_name: Literal["docling_core.transforms.chunker.DocMeta"] = Field( | ||
default="docling_core.transforms.chunker.DocMeta", | ||
alias=_KEY_SCHEMA_NAME, | ||
) | ||
version: Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)] = ( | ||
Field( | ||
default=_VERSION, | ||
alias=_KEY_VERSION, | ||
) | ||
) | ||
doc_items: list[DocItem] = Field( | ||
alias=_KEY_DOC_ITEMS, | ||
min_length=1, | ||
|
@@ -49,9 +68,39 @@ class DocMeta(BaseMeta): | |
alias=_KEY_CAPTIONS, | ||
min_length=1, | ||
) | ||
origin: Optional[DocumentOrigin] = Field( | ||
default=None, | ||
alias=_KEY_ORIGIN, | ||
) | ||
|
||
excluded_embed: ClassVar[list[str]] = [_KEY_DOC_ITEMS] | ||
excluded_llm: ClassVar[list[str]] = [_KEY_DOC_ITEMS] | ||
excluded_embed: ClassVar[list[str]] = [ | ||
_KEY_SCHEMA_NAME, | ||
_KEY_VERSION, | ||
_KEY_DOC_ITEMS, | ||
_KEY_ORIGIN, | ||
] | ||
excluded_llm: ClassVar[list[str]] = [ | ||
_KEY_SCHEMA_NAME, | ||
_KEY_VERSION, | ||
_KEY_DOC_ITEMS, | ||
_KEY_ORIGIN, | ||
] | ||
|
||
@field_validator(_KEY_VERSION) | ||
@classmethod | ||
def check_version_is_compatible(cls, v: str) -> str: | ||
"""Check if this meta item version is compatible with current version.""" | ||
current_match = re.match(VERSION_PATTERN, _VERSION) | ||
doc_match = re.match(VERSION_PATTERN, v) | ||
if ( | ||
doc_match is None | ||
or current_match is None | ||
or doc_match["major"] != current_match["major"] | ||
or doc_match["minor"] > current_match["minor"] | ||
): | ||
raise ValueError(f"incompatible version {v} with schema version {_VERSION}") | ||
else: | ||
return _VERSION | ||
Comment on lines
+91
to
+103
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we should reuse the validator in https://github.com/DS4SD/docling-core/blob/main/docling_core/types/doc/document.py#L1418 to avoid code duplication There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good point, will just need to account for the fact that a validator parameter for target version should be included too as it is different for the two cases (pydantic/pydantic#2938) |
||
|
||
|
||
class DocChunk(BaseChunk): | ||
|
@@ -129,6 +178,7 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]: | |
for k in sorted(heading_by_level) | ||
] | ||
or None, | ||
origin=dl_doc.origin, | ||
), | ||
) | ||
list_items = [] # reset | ||
|
@@ -171,6 +221,7 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]: | |
headings=[heading_by_level[k] for k in sorted(heading_by_level)] | ||
or None, | ||
captions=captions, | ||
origin=dl_doc.origin, | ||
), | ||
) | ||
yield c | ||
|
@@ -182,5 +233,6 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]: | |
doc_items=list_items, | ||
headings=[heading_by_level[k] for k in sorted(heading_by_level)] | ||
or None, | ||
origin=dl_doc.origin, | ||
), | ||
) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Agree with setting a full path to mark the schema. We may want to do the same with
DoclingDocument
schema name on https://github.com/DS4SD/docling-core/blob/main/docling_core/types/doc/document.py#L784