DS4SD · dolfim-ibm · Sep 18, 2024 · Sep 17, 2024 · Sep 17, 2024 · Sep 18, 2024
diff --git a/docling/utils/export.py b/docling/utils/export.py
@@ -9,67 +9,6 @@
 _log = logging.getLogger(__name__)
 
 
-def _export_table_to_html(table: Table):
-
-    # TODO: this is flagged as internal, because we will move it
-    # to the docling-core package.
-
-    def _get_tablecell_span(cell: TableCell, ix):
-        if cell.spans is None:
-            span = set()
-        else:
-            span = set([s[ix] for s in cell.spans])
-        if len(span) == 0:
-            return 1, None, None
-        return len(span), min(span), max(span)
-
-    body = ""
-    nrows = table.num_rows
-    ncols = table.num_cols
-
-    if table.data is None:
-        return ""
-    for i in range(nrows):
-        body += "<tr>"
-        for j in range(ncols):
-            cell: TableCell = table.data[i][j]
-
-            rowspan, rowstart, rowend = _get_tablecell_span(cell, 0)
-            colspan, colstart, colend = _get_tablecell_span(cell, 1)
-
-            if rowstart is not None and rowstart != i:
-                continue
-            if colstart is not None and colstart != j:
-                continue
-
-            if rowstart is None:
-                rowstart = i
-            if colstart is None:
-                colstart = j
-
-            content = cell.text.strip()
-            label = cell.obj_type
-            label_class = "body"
-            celltag = "td"
-            if label in ["row_header", "row_multi_header", "row_title"]:
-                label_class = "header"
-            elif label in ["col_header", "col_multi_header"]:
-                label_class = "header"
-                celltag = "th"
-
-            opening_tag = f"{celltag}"
-            if rowspan > 1:
-                opening_tag += f' rowspan="{rowspan}"'
-            if colspan > 1:
-                opening_tag += f' colspan="{colspan}"'
-
-            body += f"<{opening_tag}>{content}</{celltag}>"
-        body += "</tr>"
-    body = f"<table>{body}</table>"
-
-    return body
-
-
 def generate_multimodal_pages(
     doc_result: ConversionResult,
 ) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:
@@ -129,7 +68,7 @@ def _process_page_segments(doc_items: list[Tuple[int, BaseCell]], page: Page):
             }
 
             if isinstance(item, Table):
-                table_html = _export_table_to_html(item)
+                table_html = item.export_to_html()
                 new_segment["data"].append(
                     {
                         "html_seq": table_html,

diff --git a/examples/export_tables.py b/examples/export_tables.py
@@ -0,0 +1,74 @@
+import logging
+import time
+from pathlib import Path
+from typing import Tuple
+
+import pandas as pd
+
+from docling.datamodel.base_models import ConversionStatus
+from docling.datamodel.document import DocumentConversionInput
+from docling.document_converter import DocumentConverter
+
+_log = logging.getLogger(__name__)
+
+
+def main():
+    """Print each table of the sample PDF as Markdown and save it as CSV and HTML.
+
+    Raises:
+        RuntimeError: If at least one input document failed to convert.
+    """
+    logging.basicConfig(level=logging.INFO)
+
+    input_doc_paths = [
+        Path("./tests/data/2206.01062.pdf"),
+    ]
+    output_dir = Path("./scratch")
+
+    input_files = DocumentConversionInput.from_paths(input_doc_paths)
+    doc_converter = DocumentConverter()
+
+    start_time = time.time()
+    conv_results = doc_converter.convert(input_files)
+
+    failure_count = 0
+    output_dir.mkdir(parents=True, exist_ok=True)
+    for conv_res in conv_results:
+        if conv_res.status != ConversionStatus.SUCCESS:
+            _log.info(f"Document {conv_res.input.file} failed to convert.")
+            failure_count += 1
+            continue
+
+        doc_filename = conv_res.input.file.stem
+
+        # Export tables
+        for table_ix, table in enumerate(conv_res.output.tables):
+            table_df: pd.DataFrame = table.export_to_dataframe()
+            print(f"## Table {table_ix}")
+            print(table_df.to_markdown())
+
+            # Save the table as csv
+            element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv"
+            _log.info(f"Saving CSV table to {element_csv_filename}")
+            table_df.to_csv(element_csv_filename)
+
+            # Save the table as html
+            element_html_filename = (
+                output_dir / f"{doc_filename}-table-{table_ix+1}.html"
+            )
+            _log.info(f"Saving HTML table to {element_html_filename}")
+            # Pin UTF-8: the locale default encoding may not cover the table text.
+            with element_html_filename.open("w", encoding="utf-8") as fp:
+                fp.write(table.export_to_html())
+
+    elapsed = time.time() - start_time
+    _log.info(f"All documents were converted in {elapsed:.2f} seconds.")
+
+    if failure_count > 0:
+        raise RuntimeError(
+            f"The example failed converting {failure_count} on {len(input_doc_paths)}."
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -23,7 +23,7 @@ packages = [{include = "docling"}]
 [tool.poetry.dependencies]
 python = "^3.10"
 pydantic = "^2.0.0"
-docling-core = "^1.3.0"
+docling-core = "^1.4.0"
 docling-ibm-models = "^1.2.0"
 deepsearch-glm = "^0.21.1"
 filetype = "^1.2.0"