Skip to content

Commit

Permalink
fix: Call into docling-core for legacy document transform (#551)
Browse files Browse the repository at this point in the history
Call into docling-core for legacy document transform

Signed-off-by: Christoph Auer <[email protected]>
  • Loading branch information
cau-git authored Dec 9, 2024
1 parent 78f61a8 commit 7972d47
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 258 deletions.
255 changes: 2 additions & 253 deletions docling/datamodel/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
from docling_core.utils.file import resolve_source_to_stream
from docling_core.utils.legacy import docling_document_to_legacy
from pydantic import BaseModel
from typing_extensions import deprecated

Expand Down Expand Up @@ -189,259 +190,7 @@ class ConversionResult(BaseModel):
@property
@deprecated("Use document instead.")
def legacy_document(self):
reverse_label_mapping = {
DocItemLabel.CAPTION.value: "Caption",
DocItemLabel.FOOTNOTE.value: "Footnote",
DocItemLabel.FORMULA.value: "Formula",
DocItemLabel.LIST_ITEM.value: "List-item",
DocItemLabel.PAGE_FOOTER.value: "Page-footer",
DocItemLabel.PAGE_HEADER.value: "Page-header",
DocItemLabel.PICTURE.value: "Picture", # low threshold adjust to capture chemical structures for examples.
DocItemLabel.SECTION_HEADER.value: "Section-header",
DocItemLabel.TABLE.value: "Table",
DocItemLabel.TEXT.value: "Text",
DocItemLabel.TITLE.value: "Title",
DocItemLabel.DOCUMENT_INDEX.value: "Document Index",
DocItemLabel.CODE.value: "Code",
DocItemLabel.CHECKBOX_SELECTED.value: "Checkbox-Selected",
DocItemLabel.CHECKBOX_UNSELECTED.value: "Checkbox-Unselected",
DocItemLabel.FORM.value: "Form",
DocItemLabel.KEY_VALUE_REGION.value: "Key-Value Region",
DocItemLabel.PARAGRAPH.value: "paragraph",
}

title = ""
desc = DsDocumentDescription(logs=[])

page_hashes = [
PageReference(
hash=create_hash(self.input.document_hash + ":" + str(p.page_no - 1)),
page=p.page_no,
model="default",
)
for p in self.document.pages.values()
]

file_info = DsFileInfoObject(
filename=self.input.file.name,
document_hash=self.input.document_hash,
num_pages=self.input.page_count,
page_hashes=page_hashes,
)

main_text = []
tables = []
figures = []
equations = []
footnotes = []
page_headers = []
page_footers = []

embedded_captions = set()
for ix, (item, level) in enumerate(
self.document.iterate_items(self.document.body)
):

if isinstance(item, (TableItem, PictureItem)) and len(item.captions) > 0:
caption = item.caption_text(self.document)
if caption:
embedded_captions.add(caption)

for item, level in self.document.iterate_items():
if isinstance(item, DocItem):
item_type = item.label

if isinstance(item, (TextItem, ListItem, SectionHeaderItem)):

if isinstance(item, ListItem) and item.marker:
text = f"{item.marker} {item.text}"
else:
text = item.text

# Can be empty.
prov = [
Prov(
bbox=p.bbox.as_tuple(),
page=p.page_no,
span=[0, len(item.text)],
)
for p in item.prov
]
main_text.append(
BaseText(
text=text,
obj_type=layout_label_to_ds_type.get(item.label),
name=reverse_label_mapping[item.label],
prov=prov,
)
)

# skip captions if they are embedded in the actual
# floating object
if item_type == DocItemLabel.CAPTION and text in embedded_captions:
continue

elif isinstance(item, TableItem) and item.data:
index = len(tables)
ref_str = f"#/tables/{index}"
main_text.append(
Ref(
name=reverse_label_mapping[item.label],
obj_type=layout_label_to_ds_type.get(item.label),
ref=ref_str,
),
)

# Initialise empty table data grid (only empty cells)
table_data = [
[
TableCell(
text="",
# bbox=[0,0,0,0],
spans=[[i, j]],
obj_type="body",
)
for j in range(item.data.num_cols)
]
for i in range(item.data.num_rows)
]

# Overwrite cells in table data for which there is actual cell content.
for cell in item.data.table_cells:
for i in range(
min(cell.start_row_offset_idx, item.data.num_rows),
min(cell.end_row_offset_idx, item.data.num_rows),
):
for j in range(
min(cell.start_col_offset_idx, item.data.num_cols),
min(cell.end_col_offset_idx, item.data.num_cols),
):
celltype = "body"
if cell.column_header:
celltype = "col_header"
elif cell.row_header:
celltype = "row_header"
elif cell.row_section:
celltype = "row_section"

def make_spans(cell):
for rspan in range(
min(
cell.start_row_offset_idx,
item.data.num_rows,
),
min(
cell.end_row_offset_idx, item.data.num_rows
),
):
for cspan in range(
min(
cell.start_col_offset_idx,
item.data.num_cols,
),
min(
cell.end_col_offset_idx,
item.data.num_cols,
),
):
yield [rspan, cspan]

spans = list(make_spans(cell))
table_data[i][j] = GlmTableCell(
text=cell.text,
bbox=(
cell.bbox.as_tuple()
if cell.bbox is not None
else None
), # check if this is bottom-left
spans=spans,
obj_type=celltype,
col=j,
row=i,
row_header=cell.row_header,
row_section=cell.row_section,
col_header=cell.column_header,
row_span=[
cell.start_row_offset_idx,
cell.end_row_offset_idx,
],
col_span=[
cell.start_col_offset_idx,
cell.end_col_offset_idx,
],
)

# Compute the caption
caption = item.caption_text(self.document)

tables.append(
DsSchemaTable(
text=caption,
num_cols=item.data.num_cols,
num_rows=item.data.num_rows,
obj_type=layout_label_to_ds_type.get(item.label),
data=table_data,
prov=[
Prov(
bbox=p.bbox.as_tuple(),
page=p.page_no,
span=[0, 0],
)
for p in item.prov
],
)
)

elif isinstance(item, PictureItem):
index = len(figures)
ref_str = f"#/figures/{index}"
main_text.append(
Ref(
name=reverse_label_mapping[item.label],
obj_type=layout_label_to_ds_type.get(item.label),
ref=ref_str,
),
)

# Compute the caption
caption = item.caption_text(self.document)

figures.append(
Figure(
prov=[
Prov(
bbox=p.bbox.as_tuple(),
page=p.page_no,
span=[0, len(caption)],
)
for p in item.prov
],
obj_type=layout_label_to_ds_type.get(item.label),
text=caption,
# data=[[]],
)
)

page_dimensions = [
PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
for p in self.document.pages.values()
]

ds_doc = DsDocument(
name=title,
description=desc,
file_info=file_info,
main_text=main_text,
equations=equations,
footnotes=footnotes,
page_headers=page_headers,
page_footers=page_footers,
tables=tables,
figures=figures,
page_dimensions=page_dimensions,
)

return ds_doc
return docling_document_to_legacy(self.document)


class _DummyBackend(AbstractDocumentBackend):
Expand Down
13 changes: 9 additions & 4 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ packages = [{include = "docling"}]
# actual dependencies:
######################
python = "^3.9"
docling-core = { version = "^2.8.0", extras = ["chunking"] }
docling-core = { version = "^2.9.0", extras = ["chunking"] }
pydantic = "^2.0.0"
docling-ibm-models = "^2.0.6"
deepsearch-glm = "^1.0.0"
Expand Down

0 comments on commit 7972d47

Please sign in to comment.