Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add table exports #86

Merged
merged 4 commits into from
Sep 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 1 addition & 62 deletions docling/utils/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,67 +9,6 @@
_log = logging.getLogger(__name__)


def _export_table_to_html(table: Table):

# TODO: this is flagged as internal, because we will move it
# to the docling-core package.

def _get_tablecell_span(cell: TableCell, ix):
if cell.spans is None:
span = set()
else:
span = set([s[ix] for s in cell.spans])
if len(span) == 0:
return 1, None, None
return len(span), min(span), max(span)

body = ""
nrows = table.num_rows
ncols = table.num_cols

if table.data is None:
return ""
for i in range(nrows):
body += "<tr>"
for j in range(ncols):
cell: TableCell = table.data[i][j]

rowspan, rowstart, rowend = _get_tablecell_span(cell, 0)
colspan, colstart, colend = _get_tablecell_span(cell, 1)

if rowstart is not None and rowstart != i:
continue
if colstart is not None and colstart != j:
continue

if rowstart is None:
rowstart = i
if colstart is None:
colstart = j

content = cell.text.strip()
label = cell.obj_type
label_class = "body"
celltag = "td"
if label in ["row_header", "row_multi_header", "row_title"]:
label_class = "header"
elif label in ["col_header", "col_multi_header"]:
label_class = "header"
celltag = "th"

opening_tag = f"{celltag}"
if rowspan > 1:
opening_tag += f' rowspan="{rowspan}"'
if colspan > 1:
opening_tag += f' colspan="{colspan}"'

body += f"<{opening_tag}>{content}</{celltag}>"
body += "</tr>"
body = f"<table>{body}</table>"

return body


def generate_multimodal_pages(
doc_result: ConversionResult,
) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:
Expand Down Expand Up @@ -129,7 +68,7 @@ def _process_page_segments(doc_items: list[Tuple[int, BaseCell]], page: Page):
}

if isinstance(item, Table):
table_html = _export_table_to_html(item)
table_html = item.export_to_html()
new_segment["data"].append(
{
"html_seq": table_html,
Expand Down
74 changes: 74 additions & 0 deletions examples/export_tables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import logging
import time
from pathlib import Path
from typing import Tuple

import pandas as pd

from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter

_log = logging.getLogger(__name__)


def main():
logging.basicConfig(level=logging.INFO)

input_doc_paths = [
Path("./tests/data/2206.01062.pdf"),
]
output_dir = Path("./scratch")

input_files = DocumentConversionInput.from_paths(input_doc_paths)

doc_converter = DocumentConverter()

start_time = time.time()

conv_results = doc_converter.convert(input_files)

success_count = 0
failure_count = 0
output_dir.mkdir(parents=True, exist_ok=True)
for conv_res in conv_results:
if conv_res.status != ConversionStatus.SUCCESS:
_log.info(f"Document {conv_res.input.file} failed to convert.")
failure_count += 1
continue

doc_filename = conv_res.input.file.stem

# Export tables
for table_ix, table in enumerate(conv_res.output.tables):
table_df: pd.DataFrame = table.export_to_dataframe()
print(f"## Table {table_ix}")
print(table_df.to_markdown())

# Save the table as csv
element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv"
_log.info(f"Saving CSV table to {element_csv_filename}")
table_df.to_csv(element_csv_filename)

# Save the table as html
element_html_filename = (
output_dir / f"{doc_filename}-table-{table_ix+1}.html"
)
_log.info(f"Saving HTML table to {element_html_filename}")
with element_html_filename.open("w") as fp:
fp.write(table.export_to_html())

success_count += 1

end_time = time.time() - start_time

_log.info(f"All documents were converted in {end_time:.2f} seconds.")

if failure_count > 0:
raise RuntimeError(
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
)


if __name__ == "__main__":
main()
46 changes: 23 additions & 23 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ packages = [{include = "docling"}]
[tool.poetry.dependencies]
python = "^3.10"
pydantic = "^2.0.0"
docling-core = "^1.3.0"
docling-core = "^1.4.0"
docling-ibm-models = "^1.2.0"
deepsearch-glm = "^0.21.1"
filetype = "^1.2.0"
Expand Down
Loading