Skip to content

Commit

Permalink
feat: add table exports (#86)
Browse files Browse the repository at this point in the history
* feat: expose docling-core table exporters and add examples

Signed-off-by: Michele Dolfi <[email protected]>

* remove temp internal implementation of html export

Signed-off-by: Michele Dolfi <[email protected]>

* pin latest docling-core 1.4.0 with table exports

Signed-off-by: Michele Dolfi <[email protected]>

---------

Signed-off-by: Michele Dolfi <[email protected]>
  • Loading branch information
dolfim-ibm authored Sep 18, 2024
1 parent 442443a commit f19bd43
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 86 deletions.
63 changes: 1 addition & 62 deletions docling/utils/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,67 +9,6 @@
_log = logging.getLogger(__name__)


def _export_table_to_html(table: Table):
    """Render ``table`` as a single-line HTML ``<table>`` string.

    Every grid position ``(i, j)`` of ``table.data`` is visited. A cell is
    emitted only at the first row/column index listed in its ``spans``, so a
    merged cell appears exactly once, carrying ``rowspan``/``colspan``
    attributes when it covers more than one row/column.

    Cells labelled ``col_header`` or ``col_multi_header`` are written as
    ``<th>``; every other cell (row headers included) is written as ``<td>``.
    Cell text is stripped and HTML-escaped so that characters such as ``<``,
    ``>`` and ``&`` cannot break or inject markup.

    Args:
        table: object exposing ``num_rows``, ``num_cols`` and ``data``, where
            ``data[i][j]`` has ``text``, ``obj_type`` and ``spans``.

    Returns:
        The HTML markup, or an empty string when ``table.data`` is ``None``.
    """
    # TODO: this is flagged as internal, because we will move it
    # to the docling-core package.

    # Function-scope import keeps this helper self-contained.
    import html

    def _get_tablecell_span(cell: TableCell, ix):
        """Return ``(extent, first, last)`` of ``cell`` along axis ``ix``.

        ``ix`` is 0 for rows and 1 for columns. When the cell has no span
        information the result is ``(1, None, None)``.
        """
        # Each entry of ``cell.spans`` is indexed with ``ix``; presumably the
        # entries are (row, col) index pairs -- confirm against the data model.
        if cell.spans is None:
            span = set()
        else:
            span = {s[ix] for s in cell.spans}
        if not span:
            return 1, None, None
        return len(span), min(span), max(span)

    if table.data is None:
        return ""

    rows = []
    for i in range(table.num_rows):
        cells = []
        for j in range(table.num_cols):
            cell: TableCell = table.data[i][j]

            rowspan, rowstart, _ = _get_tablecell_span(cell, 0)
            colspan, colstart, _ = _get_tablecell_span(cell, 1)

            # A spanning cell is repeated at every grid position it covers;
            # emit it only at its top-left position.
            if rowstart is not None and rowstart != i:
                continue
            if colstart is not None and colstart != j:
                continue

            # Escape the text: it comes from the converted document and may
            # contain characters that are significant in HTML.
            content = html.escape(cell.text.strip())

            if cell.obj_type in ("col_header", "col_multi_header"):
                celltag = "th"
            else:
                celltag = "td"

            opening_tag = celltag
            if rowspan > 1:
                opening_tag += f' rowspan="{rowspan}"'
            if colspan > 1:
                opening_tag += f' colspan="{colspan}"'

            cells.append(f"<{opening_tag}>{content}</{celltag}>")
        rows.append("<tr>" + "".join(cells) + "</tr>")

    return "<table>" + "".join(rows) + "</table>"


def generate_multimodal_pages(
doc_result: ConversionResult,
) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:
Expand Down Expand Up @@ -129,7 +68,7 @@ def _process_page_segments(doc_items: list[Tuple[int, BaseCell]], page: Page):
}

if isinstance(item, Table):
table_html = _export_table_to_html(item)
table_html = item.export_to_html()
new_segment["data"].append(
{
"html_seq": table_html,
Expand Down
74 changes: 74 additions & 0 deletions examples/export_tables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import logging
import time
from pathlib import Path
from typing import Tuple

import pandas as pd

from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter

_log = logging.getLogger(__name__)


def main():
    """Convert the sample PDF and export each of its tables as CSV and HTML.

    For every successfully converted document, each table in
    ``conv_res.output.tables`` is printed to stdout as Markdown and written
    to ``./scratch`` as ``<doc>-table-<n>.csv`` and ``<doc>-table-<n>.html``
    (``n`` starts at 1).

    Raises:
        RuntimeError: if at least one input document failed to convert.
    """
    logging.basicConfig(level=logging.INFO)

    input_doc_paths = [
        Path("./tests/data/2206.01062.pdf"),
    ]
    output_dir = Path("./scratch")

    input_files = DocumentConversionInput.from_paths(input_doc_paths)

    doc_converter = DocumentConverter()

    start_time = time.time()

    conv_results = doc_converter.convert(input_files)

    failure_count = 0
    output_dir.mkdir(parents=True, exist_ok=True)
    for conv_res in conv_results:
        if conv_res.status != ConversionStatus.SUCCESS:
            _log.info("Document %s failed to convert.", conv_res.input.file)
            failure_count += 1
            continue

        doc_filename = conv_res.input.file.stem

        # Export tables
        for table_ix, table in enumerate(conv_res.output.tables):
            table_df: pd.DataFrame = table.export_to_dataframe()
            print(f"## Table {table_ix}")
            print(table_df.to_markdown())

            # Save the table as csv
            element_csv_filename = (
                output_dir / f"{doc_filename}-table-{table_ix+1}.csv"
            )
            _log.info("Saving CSV table to %s", element_csv_filename)
            table_df.to_csv(element_csv_filename)

            # Save the table as html. The encoding is explicit so the output
            # does not depend on the platform's locale encoding, which may be
            # unable to represent characters found in the table text.
            element_html_filename = (
                output_dir / f"{doc_filename}-table-{table_ix+1}.html"
            )
            _log.info("Saving HTML table to %s", element_html_filename)
            element_html_filename.write_text(
                table.export_to_html(), encoding="utf-8"
            )

    # Measured after the loop so the time spent consuming ``conv_results``
    # and exporting the tables is included.
    elapsed = time.time() - start_time

    _log.info("All documents were converted in %.2f seconds.", elapsed)

    if failure_count > 0:
        raise RuntimeError(
            f"The example failed converting {failure_count} on {len(input_doc_paths)}."
        )


if __name__ == "__main__":
    main()
46 changes: 23 additions & 23 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ packages = [{include = "docling"}]
[tool.poetry.dependencies]
python = "^3.10"
pydantic = "^2.0.0"
docling-core = "^1.3.0"
docling-core = "^1.4.0"
docling-ibm-models = "^1.2.0"
deepsearch-glm = "^0.21.1"
filetype = "^1.2.0"
Expand Down

0 comments on commit f19bd43

Please sign in to comment.