Skip to content

Commit

Permalink
Updates for document construction API and format
Browse files Browse the repository at this point in the history
Signed-off-by: Christoph Auer <[email protected]>
  • Loading branch information
cau-git committed Sep 20, 2024
1 parent bdbd93e commit 7cd81c0
Show file tree
Hide file tree
Showing 3 changed files with 159 additions and 49 deletions.
138 changes: 122 additions & 16 deletions docling_core/types/experimental/document.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import hashlib
from typing import Any, Dict, List, Optional, Tuple, Union

from pydantic import AnyUrl, BaseModel, Field
from pydantic import AnyUrl, BaseModel, Field, computed_field, conint, ConfigDict

from docling_core.types.experimental.base import BoundingBox, Size

# Integer type constrained to the unsigned 64-bit range [0, 2**64 - 1].
Uint64 = conint(ge=0, le=(2**64 - 1))

# Placeholder model for the payload of a figure; no fields are defined yet.
class FigureData(BaseModel): # TBD
pass
Expand All @@ -12,10 +14,16 @@ class FigureData(BaseModel): # TBD
# Placeholder model for the payload of a table; no fields are defined yet.
class TableData(BaseModel): # TBD
pass

# File-level metadata of a document.
class FileInfo(BaseModel):
# Hash string of the source document; DoclingDocument's add_* methods
# use it as the prefix of every item's `dloc`.
document_hash: str

class RefItem(BaseModel):
cref: str = Field(alias="$ref")

model_config = ConfigDict(
populate_by_name=True,
)

def resolve(self, doc: "DoclingDocument"):
_, path, index_str = self.cref.split("/")
index = int(index_str)
Expand All @@ -35,34 +43,52 @@ class ProvenanceItem(BaseModel):
bbox: BoundingBox
charspan: Tuple[int, int]

# Base model for nodes of the document tree: an optional reference to the
# parent node plus a list of references to child nodes.
class NodeItem(BaseModel):
parent: Optional[RefItem] = None  # None when no parent reference is set
children: List[RefItem] = []  # references to children; empty by default

# A tree node that only carries a name. DocumentContent uses instances
# named "_root_" as the roots of its `furniture` and `body` trees.
class GroupItem(NodeItem):
name: str

class DocItem(BaseModel):
class DocItem(NodeItem):
dloc: str # format spec ({document_hash}{json-path})
hash: int
label: str
parent: Optional[RefItem]
children: List[RefItem]
prov: List[ProvenanceItem]
prov: List[ProvenanceItem] = []

@computed_field
@property
def hash(self) -> Uint64: # TODO align with hasher on deepsearch-glm
    # Derived 64-bit identifier of this item, computed from `dloc`.
    # (Kept as `#` comments on purpose: a docstring on a computed field
    # would be picked up as its schema description.)

    # An empty location string maps to 0 without hashing.
    if not self.dloc:
        return 0

    digest = hashlib.sha256(self.dloc.encode('utf-8')).digest()

    # Reading the digest as one big-endian integer and masking it to 64
    # bits keeps exactly its lowest 8 bytes, i.e. the last 8 digest bytes.
    low_64_bits = int.from_bytes(digest[-8:], 'big')
    return Uint64(low_64_bits)



# A document item that carries text, stored both as found and sanitized.
class TextItem(DocItem):
orig: str # untreated representation
text: str # sanitized representation

# A text item with a numeric level (created by DoclingDocument.add_heading).
class Section(TextItem):
level: conint(ge=1, le=100) = 1  # constrained to 1..100; defaults to 1

class FloatingItem(DocItem):
caption: Optional[Union[RefItem, TextItem]]
references: List[Union[RefItem, TextItem]]
footnotes: List[Union[RefItem, TextItem]]
data: Any
image: Optional[ImageRef]
caption: Optional[RefItem] = None
references: List[RefItem] = []
footnotes: List[RefItem] = []
image: Optional[ImageRef] = None


class FigureItem(DocItem):
class FigureItem(FloatingItem):
data: FigureData


class TableItem(DocItem):
class TableItem(FloatingItem):
data: TableData


Expand All @@ -73,8 +99,8 @@ class KeyValueItem(DocItem):


class DocumentContent(BaseModel):
furniture: List[RefItem] = []
body: List[RefItem] = []
furniture: GroupItem = GroupItem(name="_root_") # List[RefItem] = []
body: GroupItem = GroupItem(name="_root_") # List[RefItem] = []
texts: List[TextItem] = []
figures: List[FigureItem] = []
tables: List[TableItem] = []
Expand All @@ -89,5 +115,85 @@ class PageItem(DocumentContent):

class DoclingDocument(DocumentContent):
description: Any
file_info: Any
file_info: FileInfo
pages: Dict[int, PageItem] = {} # empty as default

#def add_furniture_group(self, name: str):
# group = GroupItem(name=name)
# self.furniture.children.append(group)
# return group

def add_group(self, name: str, parent: Optional[GroupItem] = None) -> GroupItem:
    """Create a new group and attach it below ``parent``.

    Args:
        name: Name given to the new ``GroupItem``.
        parent: Group to attach the new group to. When omitted, the
            document's ``body`` group is used.

    Returns:
        The newly created ``GroupItem``.
    """
    target = parent or self.body
    new_group = GroupItem(name=name)

    # NOTE(review): `children` is declared as List[RefItem] on NodeItem, yet
    # the GroupItem object itself (not a RefItem) is appended here, and the
    # new group's own `parent` is left unset -- confirm this is intended.
    target.children.append(new_group)
    return new_group

def add_paragraph(self, label: str, text: str, orig: Optional[str] = None, prov: Optional[ProvenanceItem] = None, parent: Optional[GroupItem] = None,
                  item_cls=TextItem):
    """Append a text item to ``self.texts`` and reference it from a group.

    Args:
        label: Label stored on the new item.
        text: Sanitized text of the item.
        orig: Untreated text; falls back to ``text`` when missing or empty.
        prov: Optional provenance entry appended to the item's ``prov`` list.
        parent: Group that receives the reference to the new item; the
            document's ``body`` group when omitted.
        item_cls: Class used to build the item. It is called with the
            keyword arguments ``label``, ``text``, ``orig`` and ``dloc``.

    Returns:
        The newly created item (an instance of ``item_cls``).
    """
    container = parent or self.body
    untreated = orig or text

    # The item's reference path is its position in `self.texts`; the
    # document hash is prefixed to form the item's `dloc`.
    ref_path = f"#/texts/{len(self.texts)}"
    new_item = item_cls(
        label=label,
        text=text,
        orig=untreated,
        dloc=f"{self.file_info.document_hash}{ref_path}",
    )
    if prov:
        new_item.prov.append(prov)

    self.texts.append(new_item)
    # NOTE(review): only the parent -> child reference is recorded here; the
    # new item's own `parent` field is not set -- confirm this is intended.
    container.children.append(RefItem(cref=ref_path))

    return new_item

def add_table(self, data: TableData, caption: Optional[RefItem] = None, prov: Optional[ProvenanceItem] = None, parent: Optional[GroupItem] = None):
    """Append a table item to ``self.tables`` and reference it from a group.

    Args:
        data: Table payload stored on the new item.
        caption: Optional reference assigned to the item's ``caption``.
        prov: Optional provenance entry appended to the item's ``prov`` list.
        parent: Group that receives the reference to the new table; the
            document's ``body`` group when omitted.

    Returns:
        The newly created ``TableItem``.
    """
    container = parent or self.body

    # The table's reference path is its position in `self.tables`; the
    # document hash is prefixed to form the item's `dloc`.
    ref_path = f"#/tables/{len(self.tables)}"
    new_table = TableItem(
        label="table",
        data=data,
        dloc=f"{self.file_info.document_hash}{ref_path}",
    )

    if prov:
        new_table.prov.append(prov)
    if caption:
        new_table.caption = caption

    self.tables.append(new_table)
    container.children.append(RefItem(cref=ref_path))

    return new_table


def add_figure(self, data: FigureData, caption: Optional[RefItem] = None, prov: Optional[ProvenanceItem] = None, parent: Optional[GroupItem] = None):
    """Append a figure item to ``self.figures`` and reference it from a group.

    Args:
        data: Figure payload stored on the new item.
        caption: Optional reference assigned to the item's ``caption``.
        prov: Optional provenance entry appended to the item's ``prov`` list.
        parent: Group that receives the reference to the new figure; the
            document's ``body`` group when omitted.

    Returns:
        The newly created ``FigureItem``.
    """
    container = parent or self.body

    # The figure's reference path is its position in `self.figures`; the
    # document hash is prefixed to form the item's `dloc`.
    ref_path = f"#/figures/{len(self.figures)}"
    new_figure = FigureItem(
        label="figure",
        data=data,
        dloc=f"{self.file_info.document_hash}{ref_path}",
    )

    if prov:
        new_figure.prov.append(prov)
    if caption:
        new_figure.caption = caption

    self.figures.append(new_figure)
    container.children.append(RefItem(cref=ref_path))

    return new_figure


def add_heading(self, label: str, text: str, orig: Optional[str] = None, level: conint(ge=1, le=100) = 1,
                prov: Optional[ProvenanceItem] = None, parent: Optional[GroupItem] = None):
    """Add a heading (a ``Section`` item) to the document.

    The item is created through ``add_paragraph`` with ``Section`` as the
    item class, so it is stored in ``self.texts`` and referenced from
    ``parent`` exactly like any other text item.

    Args:
        label: Label stored on the new item.
        text: Sanitized heading text.
        orig: Untreated heading text; ``add_paragraph`` falls back to
            ``text`` when it is missing or empty.
        level: Heading level; ``Section`` constrains it to 1..100.
        prov: Optional provenance entry for the item.
        parent: Group that receives the reference to the heading; the
            document's ``body`` group when omitted.

    Returns:
        The newly created ``Section`` item.

    Raises:
        pydantic.ValidationError: If ``level`` violates the constraint
            declared on ``Section.level``. The item is rejected while it is
            being constructed, before the document is modified.
    """
    from functools import partial

    # The annotation on `level` is not enforced at call time, and assigning
    # `item.level` after construction skips field validation (no
    # `validate_assignment` setting is visible on these models). Binding
    # the level into the constructor lets Section's own field constraint
    # check it when the item is built.
    section_cls = partial(Section, level=level)
    item: Section = self.add_paragraph(label, text, orig, prov, parent, item_cls=section_cls)
    return item

30 changes: 20 additions & 10 deletions test/data/newdoc/dummy_doc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,30 @@
description: { } # DescriptionType - TBD
file_info: # FileInfoType - TBD
document_hash: e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5

furniture: # Headers, footers, framing, navigation elements, all other non-body text
- $ref: "/texts/0"
name: "_root_"
parent: null
children:
- $ref: "/texts/0"

body: # Top-level elements in other arrays, by-reference only, must not have parent.
- $ref: "/texts/1"
- $ref: "/figure/0"
- $ref: "/texts/2"
- $ref: "/texts/3"
- $ref: "/tables/0"
name: "_root_"
parent: null
children:
- $ref: "/texts/1"
- $ref: "/figure/0"
- $ref: "/texts/2"
- $ref: "/tables/0"

texts: # All elements that have a text-string representation, with actual data
- orig: "arXiv:2206.01062v1 [cs.CV] 2 Jun 2022"
text: "arXiv:2206.01062v1 [cs.CV] 2 Jun 2022"
dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/0"
hash: 132103230
label: "page_header"
parent: null
parent:
$ref: "#/furniture"
children: []
prov:
- page_no: 1
Expand All @@ -34,7 +41,8 @@ texts: # All elements that have a text-string representation, with actual data
dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/1"
hash: 2349732 # uint64 hash of dloc
label: "title"
parent: null
parent:
$ref: "#/body"
children: [ ]
prov: # must exist, can be empty
- page_no: 1
Expand Down Expand Up @@ -83,7 +91,8 @@ tables: # All tables...
- dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/table/0"
hash: 98574
label: "table"
parent: null
parent:
$ref: "#/body"
children: [ ]
caption:
$ref: "/texts/3"
Expand Down Expand Up @@ -117,7 +126,8 @@ figures: # All figures...
- dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/figures/0"
hash: 7782482
label: "figure"
parent: null
parent:
$ref: "#/body"
caption:
$ref: "/texts/2"
references:
Expand Down
40 changes: 17 additions & 23 deletions test/test_docling_doc.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import yaml
import pytest
from docling_core.types import DoclingDocument, BoundingBox
from docling_core.types.experimental.document import ProvenanceItem
from docling_core.types.experimental.document import ProvenanceItem, FileInfo


def test_load_serialize_doc():
Expand All @@ -19,7 +19,7 @@ def test_load_serialize_doc():
text_item.prov[0].page_no

# Objects that are references need explicit resolution for now:
obj = doc.body[2].resolve(doc=doc) # Text item with parent
obj = doc.body.children[2].resolve(doc=doc) # Text item with parent
parent = obj.parent.resolve(doc=doc) # it is a figure

obj2 = parent.children[0].resolve(
Expand All @@ -38,26 +38,20 @@ def test_load_serialize_doc():
assert doc_reload is not doc # can't be identical

def test_construct_doc():
doc = DoclingDocument(description={}, file_info={})

# group, heading, paragraph, table, figure, title, list, provenance
doc.add_title()
doc.add_paragraph(text="Author 1\nAffiliation 1").add_provenance(ProvenanceItem(page_no=1, bbox=BoundingBox(t=12, l=5, r=230, b=40), charspan=(0,22)))
doc.add_paragraph(text="Author 2\nAffiliation 2")

chapter1 = doc.add_group(name="Introduction")
chapter1.add_heading(text="1. Introduction", level=2)
chapter1.add_paragraph(text="This paper introduces the biggest invention ever made. ...")
mylist = chapter1.add_group()
mylist.add_item(text="Cooks your favourite meal before you know you want it.")
mylist.add_item(text="Cleans up all your dishes.")
mylist.add_item(text="Drains your bank account without consent.")

# This code is purely imaginative. None of the APIs below exist yet.

doc = DoclingDocument(description={}, file_info=FileInfo(document_hash="xyz"))

sec = doc.add_section(text="1. Introduction")

list = sec.add_child(label="container")
list.add_child()
list.add_child()

# group, heading, paragraph, table, figure, title, list, provenance
doc.add_paragraph(label="text", text="Author 1\nAffiliation 1")
doc.add_paragraph(label="text", text="Author 2\nAffiliation 2")

chapter1 = doc.add_group(name="Introduction") # can be done if such information is present, or omitted.
doc.add_heading(parent=chapter1, label="section_header", text="1. Introduction", level=1)
doc.add_paragraph(parent=chapter1, label="text", text="This paper introduces the biggest invention ever made. ...")
mylist = doc.add_group(parent=chapter1, name="whateverlist")
doc.add_paragraph(parent=mylist, label="list_item", text="Cooks your favourite meal before you know you want it.")
doc.add_paragraph(parent=mylist, label="list_item", text="Cleans up all your dishes.")
doc.add_paragraph(parent=mylist, label="list_item", text="Drains your bank account without consent.")

print(doc.model_dump(mode="json", by_alias=True))

0 comments on commit 7cd81c0

Please sign in to comment.