Skip to content

Commit

Permalink
first attempt at updating the schemas
Browse files (browse the repository at this point in the history)
Signed-off-by: Peter Staar <[email protected]>
  • Loading branch information
PeterStaar-IBM committed Sep 9, 2024
1 parent a4ec3b5 commit 9ee834b
Show file tree
Hide file tree
Showing 15 changed files with 61 additions and 56 deletions.
22 changes: 11 additions & 11 deletions docling_core/resources/schemas/generated/ccs_document_schema.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"title": "ExportedCCSDocument",
"title": "ExportedDocument",
"type": "object",
"properties": {
"name": {
Expand All @@ -16,10 +16,10 @@
"type": "string"
},
"description": {
"$ref": "#/definitions/CCSDocumentDescription"
"$ref": "#/definitions/DocumentDescription"
},
"file-info": {
"$ref": "#/definitions/CCSFileInfoObject"
"$ref": "#/definitions/FileInfoObject"
},
"main-text": {
"title": "Main-Text",
Expand Down Expand Up @@ -274,8 +274,8 @@
"date"
]
},
"CCSDocumentDescription": {
"title": "CCSDocumentDescription",
"DocumentDescription": {
"title": "DocumentDescription",
"type": "object",
"properties": {
"title": {
Expand Down Expand Up @@ -414,8 +414,8 @@
"logs"
]
},
"CCSFileInfoDescription": {
"title": "CCSFileInfoDescription",
"FileInfoDescription": {
"title": "FileInfoDescription",
"type": "object",
"properties": {
"author": {
Expand Down Expand Up @@ -462,8 +462,8 @@
"page"
]
},
"CCSFileInfoObject": {
"title": "CCSFileInfoObject",
"FileInfoObject": {
"title": "FileInfoObject",
"type": "object",
"properties": {
"filename": {
Expand All @@ -487,7 +487,7 @@
"type": "string"
},
"description": {
"$ref": "#/definitions/CCSFileInfoDescription"
"$ref": "#/definitions/FileInfoDescription"
},
"page-hashes": {
"title": "Page-Hashes",
Expand Down Expand Up @@ -1068,4 +1068,4 @@
}
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
"type": "string"
},
"description": {
"title": "CCSDocumentDescription",
"title": "DocumentDescription",
"type": "object",
"properties": {
"title": {
Expand Down Expand Up @@ -1126,4 +1126,4 @@
"file-info",
"main-text"
]
}
}
4 changes: 2 additions & 2 deletions docling_core/search/meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@


class S3Path(BaseModel, extra="forbid"):
"""The path details within a cloud object storage for CCS-parsed files."""
"""The path details within a cloud object storage for Documents."""

bucket: StrictStr
prefix: StrictStr
Expand All @@ -30,7 +30,7 @@ def __hash__(self):


class S3CcsData(BaseModel, extra="forbid"):
"""The access details to a cloud object storage for CCS-parsed files."""
"""The access details to a cloud object storage for Documents."""

endpoint: StrictStr
paths: UniqueList[S3Path] = Field(min_length=1)
Expand Down
6 changes: 3 additions & 3 deletions docling_core/types/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@
Ref,
)
from docling_core.types.doc.document import ( # noqa
CCSDocumentDescription as DocumentDescription,
DocumentDescription as DocumentDescription,
)
from docling_core.types.doc.document import CCSFileInfoObject as FileInfoObject # noqa
from docling_core.types.doc.document import ExportedCCSDocument as Document # noqa
from docling_core.types.doc.document import FileInfoObject as FileInfoObject # noqa
from docling_core.types.doc.document import LayoutDocument as Document # noqa
from docling_core.types.gen.generic import Generic # noqa
from docling_core.types.rec.record import Record # noqa
2 changes: 1 addition & 1 deletion docling_core/types/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ class Log(AliasModel, extra="forbid"):
json_schema_extra=es_field(type="keyword", ignore_above=8191),
)
agent: StrictStr = Field(
description="The Docling agent that performed the task, e.g., CCS or CXS.",
description="The Docling agent that performed the task.",
json_schema_extra=es_field(type="keyword", ignore_above=8191),
)
type_: StrictStr = Field(
Expand Down
2 changes: 1 addition & 1 deletion docling_core/types/doc/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# SPDX-License-Identifier: MIT
#

"""Define common models across CCS objects."""
"""Define common models across Documents."""
from typing import Annotated, Literal, Optional, Union

from pydantic import BaseModel, Field, PositiveInt, StrictStr
Expand Down
2 changes: 1 addition & 1 deletion docling_core/types/doc/doc_ann.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# SPDX-License-Identifier: MIT
#

"""Models for annotations and predictions in CCS."""
"""Models for annotations and predictions on Documents."""
from typing import Any

from pydantic import BaseModel
Expand Down
2 changes: 1 addition & 1 deletion docling_core/types/doc/doc_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# SPDX-License-Identifier: MIT
#

"""Models for CCS objects with OCR."""
"""Models for Document objects with OCR."""
from typing import Any, Dict, List, Literal

from pydantic import BaseModel, Field
Expand Down
2 changes: 1 addition & 1 deletion docling_core/types/doc/doc_raw.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# SPDX-License-Identifier: MIT
#

"""Models for CCS objects in raw format."""
"""Models for Document objects in raw format."""
from typing import Any, List, Optional

from pydantic import BaseModel, Field
Expand Down
13 changes: 9 additions & 4 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ class FileInfoDescription(BaseModel, extra="forbid"):
creation_date: Optional[str] = None # datetime


class FileInfoObject(FileInfoObject, extra="forbid"):
class LayoutFileInfoObject(FileInfoObject, extra="forbid"):
"""File info object."""

num_pages: Optional[int] = Field(default=None, alias="#-pages")
Expand Down Expand Up @@ -260,7 +260,7 @@ class SimpleDocument(
CollectionNameTypeT,
],
):
"""Minimal model for a document."""
"""Simple model for a document."""

name: StrictStr = Field(alias="_name")
obj_type: Optional[StrictStr] = Field("document", alias="type")
Expand All @@ -280,7 +280,7 @@ class SimpleDocument(


class LayoutDocument(
MinimalDocument,
SimpleDocument,
Generic[
DescriptionAdvancedT,
DescriptionAnalyticsT,
Expand Down Expand Up @@ -361,7 +361,12 @@ def export_to_markdown(
delim: str = "\n\n",
main_text_start: int = 0,
main_text_stop: Optional[int] = None,
main_text_labels: Optional[List[str]] = ["title", "subtitle-level-1", "paragraph", "caption"]
main_text_labels: list[str] = [
"title",
"subtitle-level-1",
"paragraph",
"caption",
],
) -> str:
r"""Serialize to Markdown.
Expand Down
4 changes: 2 additions & 2 deletions test/data/json_schemas/base_log.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
"x-es-type": "keyword"
},
"agent": {
"description": "The Docling agent that performed the task, e.g., CCS or CXS.",
"description": "The Docling agent that performed the task.",
"title": "Agent",
"type": "string",
"x-es-ignore_above": 8191,
Expand Down Expand Up @@ -58,4 +58,4 @@
],
"title": "Log",
"type": "object"
}
}
6 changes: 3 additions & 3 deletions test/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
Log,
StrictDateTime,
)
from docling_core.types.doc.document import CCSDocumentDescription
from docling_core.types.doc.document import DocumentDescription
from docling_core.types.rec.record import RecordDescription


Expand Down Expand Up @@ -215,11 +215,11 @@ def test_collection_document_info():
"alias": ["patent"],
},
}
CCSDocumentDescription(**desc_dict)
DocumentDescription(**desc_dict)

desc_dict["collection"]["type"] = "Record"
with pytest.raises(ValidationError, match="collection.type"):
CCSDocumentDescription(**desc_dict)
DocumentDescription(**desc_dict)


def test_collection_record_info():
Expand Down
38 changes: 19 additions & 19 deletions test/test_doc_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,23 +18,23 @@
LanguageT,
)
from docling_core.types.doc.document import (
CCSDocument,
CCSDocumentDescription,
DocumentDescription,
LayoutDocument,
Publication,
)


def test_ccs_document():
"""Validate data with CCSDocument schema."""
"""Validate data with Document schema."""
for filename in glob.glob("test/data/doc/doc-*.json"):
with open(filename) as file_obj:
file_json = file_obj.read()
try:
# do not pass strict=True, since date input values are not an instance of datetime.
CCSDocument.model_validate_json(file_json)
LayoutDocument.model_validate_json(file_json)
# try as well as dictionary
doc = json.loads(file_json)
CCSDocument.model_validate(doc)
LayoutDocument.model_validate(doc)
except ValidationError as e:
print(f"Validation error in file {filename}:\n{e.json()}")
raise
Expand All @@ -43,8 +43,8 @@ def test_ccs_document():
try:
with open("test/data/doc/error-1.json") as file_obj:
file_json = file_obj.read()
CCSDocument.model_validate_json(file_json)
assert False, f"Data in file {filename} should be invalid for CCSDocument model"
LayoutDocument.model_validate_json(file_json)
assert False, f"Data in file {filename} should be invalid for Document model"
except ValidationError as e:
for error in e.errors():
print(type(error))
Expand All @@ -58,15 +58,15 @@ def test_ccs_document():
open("test/data/doc/error-2.json") as file_obj,
):
file_json = file_obj.read()
CCSDocument.model_validate_json(file_json)
LayoutDocument.model_validate_json(file_json)

# check doc-error-3 is invalid for wrong types in citation_count and reference_count
with (
pytest.raises(ValidationError, match="count"),
open("test/data/doc/error-3.json") as file_obj,
):
file_json = file_obj.read()
CCSDocument.model_validate_json(file_json)
LayoutDocument.model_validate_json(file_json)


def test_publication_journal():
Expand All @@ -89,44 +89,44 @@ def test_description_advanced_t():
desc = json.load(file_obj)["description"]

# without advanced
CCSDocumentDescription.model_validate(desc)
DocumentDescription.model_validate(desc)

# any dictionary is valid, since it is not parametrized
CCSDocumentDescription(**desc, advanced={"serial": "CXS12345"})
CCSDocumentDescription(**desc, advanced={0: "CXS12345"})
DocumentDescription(**desc, advanced={"serial": "CXS12345"})
DocumentDescription(**desc, advanced={0: "CXS12345"})
with pytest.raises(
ValidationError, match="should be a valid dictionary or instance of BaseModel"
):
CCSDocumentDescription(**desc, advanced=False)
DocumentDescription(**desc, advanced=False)

class MyAdvanced(BaseModel):
serial: str
comment: Optional[str] = None

# with a model and bound specification
adv_inst = MyAdvanced(serial="CXS12345", comment="public document")
CCSDocumentDescription(**desc, advanced=adv_inst)
DocumentDescription(**desc, advanced=adv_inst)
with pytest.raises(ValidationError, match="Field required"):
CCSDocumentDescription(**desc, advanced=MyAdvanced(comment="public document"))
DocumentDescription(**desc, advanced=MyAdvanced(comment="public document"))

# with a model and generic type specification
advanced = MyAdvanced(serial="CXS12345", comment="public document")
CCSDocumentDescription[
DocumentDescription[
MyAdvanced,
DescriptionAnalyticsT,
IdentifierTypeT,
LanguageT,
CollectionNameTypeT,
](**desc)
CCSDocumentDescription[
DocumentDescription[
MyAdvanced,
DescriptionAnalyticsT,
IdentifierTypeT,
LanguageT,
CollectionNameTypeT,
](**desc, advanced=adv_inst)
with pytest.raises(ValidationError, match="Field required"):
CCSDocumentDescription[
DocumentDescription[
MyAdvanced,
DescriptionAnalyticsT,
IdentifierTypeT,
Expand All @@ -135,7 +135,7 @@ class MyAdvanced(BaseModel):
](**desc, advanced={})

# deriving a new type
MyDocument = CCSDocumentDescription[
MyDocument = DocumentDescription[
MyAdvanced,
DescriptionAnalyticsT,
IdentifierTypeT,
Expand Down
6 changes: 3 additions & 3 deletions test/test_doc_schema_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@

from pydantic import ValidationError

from docling_core.types.doc.document import CCSDocument
from docling_core.types.doc.document import LayoutDocument


def test_ccs_document_update():
"""Validate data with CCSDocument extract."""
"""Validate data with Document extract."""
filename = "test/data/doc/ext-1.json"
try:
with open(filename) as f:
Expand All @@ -21,7 +21,7 @@ def test_ccs_document_update():
if "$ref" in item:
assert False, f"$ref should not be in file {filename}"

doc = CCSDocument.model_validate(raw_doc)
doc = LayoutDocument.model_validate(raw_doc)

if doc.description.abstract:
assert False, f"Abstract should not be present"
Expand Down
Loading

0 comments on commit 9ee834b

Please sign in to comment.