diff --git a/docling/utils/export.py b/docling/utils/export.py
index 115f7646..e9e56930 100644
--- a/docling/utils/export.py
+++ b/docling/utils/export.py
@@ -9,67 +9,6 @@
_log = logging.getLogger(__name__)
-def _export_table_to_html(table: Table):
-
- # TODO: this is flagged as internal, because we will move it
- # to the docling-core package.
-
- def _get_tablecell_span(cell: TableCell, ix):
- if cell.spans is None:
- span = set()
- else:
- span = set([s[ix] for s in cell.spans])
- if len(span) == 0:
- return 1, None, None
- return len(span), min(span), max(span)
-
- body = ""
- nrows = table.num_rows
- ncols = table.num_cols
-
- if table.data is None:
- return ""
- for i in range(nrows):
- body += "<tr>"
- for j in range(ncols):
- cell: TableCell = table.data[i][j]
-
- rowspan, rowstart, rowend = _get_tablecell_span(cell, 0)
- colspan, colstart, colend = _get_tablecell_span(cell, 1)
-
- if rowstart is not None and rowstart != i:
- continue
- if colstart is not None and colstart != j:
- continue
-
- if rowstart is None:
- rowstart = i
- if colstart is None:
- colstart = j
-
- content = cell.text.strip()
- label = cell.obj_type
- label_class = "body"
- celltag = "td"
- if label in ["row_header", "row_multi_header", "row_title"]:
- label_class = "header"
- elif label in ["col_header", "col_multi_header"]:
- label_class = "header"
- celltag = "th"
-
- opening_tag = f"{celltag}"
- if rowspan > 1:
- opening_tag += f' rowspan="{rowspan}"'
- if colspan > 1:
- opening_tag += f' colspan="{colspan}"'
-
- body += f"<{opening_tag}>{content}</{celltag}>"
- body += "</tr>"
- body = f"<table>{body}</table>"
-
- return body
-
-
def generate_multimodal_pages(
doc_result: ConversionResult,
) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:
@@ -129,7 +68,7 @@ def _process_page_segments(doc_items: list[Tuple[int, BaseCell]], page: Page):
}
if isinstance(item, Table):
- table_html = _export_table_to_html(item)
+ table_html = item.export_to_html()
new_segment["data"].append(
{
"html_seq": table_html,
diff --git a/examples/export_tables.py b/examples/export_tables.py
new file mode 100644
index 00000000..a0c605c1
--- /dev/null
+++ b/examples/export_tables.py
@@ -0,0 +1,74 @@
+import logging
+import time
+from pathlib import Path
+from typing import Tuple
+
+import pandas as pd
+
+from docling.datamodel.base_models import ConversionStatus
+from docling.datamodel.document import DocumentConversionInput
+from docling.document_converter import DocumentConverter
+
+_log = logging.getLogger(__name__)
+
+
+def main():
+    """Convert the sample PDF and export each detected table to CSV and HTML.
+
+    Raises:
+        RuntimeError: if at least one input document failed to convert.
+    """
+    logging.basicConfig(level=logging.INFO)
+
+    input_doc_paths = [
+        Path("./tests/data/2206.01062.pdf"),
+    ]
+    output_dir = Path("./scratch")
+
+    input_files = DocumentConversionInput.from_paths(input_doc_paths)
+    doc_converter = DocumentConverter()
+
+    start_time = time.time()
+    conv_results = doc_converter.convert(input_files)
+
+    failure_count = 0
+    output_dir.mkdir(parents=True, exist_ok=True)
+    for conv_res in conv_results:
+        if conv_res.status != ConversionStatus.SUCCESS:
+            _log.info(f"Document {conv_res.input.file} failed to convert.")
+            failure_count += 1
+            continue
+
+        doc_filename = conv_res.input.file.stem
+
+        # Export tables
+        for table_ix, table in enumerate(conv_res.output.tables):
+            table_df: pd.DataFrame = table.export_to_dataframe()
+            print(f"## Table {table_ix}")
+            print(table_df.to_markdown())
+
+            # Save the table as csv
+            element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv"
+            _log.info(f"Saving CSV table to {element_csv_filename}")
+            table_df.to_csv(element_csv_filename)
+
+            # Save the table as html. The encoding is pinned to UTF-8 so that
+            # non-ASCII cell text does not depend on the platform default.
+            element_html_filename = (
+                output_dir / f"{doc_filename}-table-{table_ix+1}.html"
+            )
+            _log.info(f"Saving HTML table to {element_html_filename}")
+            with element_html_filename.open("w", encoding="utf-8") as fp:
+                fp.write(table.export_to_html())
+
+    elapsed = time.time() - start_time
+    _log.info(f"All documents were converted in {elapsed:.2f} seconds.")
+
+    if failure_count > 0:
+        raise RuntimeError(
+            f"The example failed converting {failure_count} on {len(input_doc_paths)}."
+        )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/poetry.lock b/poetry.lock
index b3092740..075a5220 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -957,13 +957,13 @@ files = [
[[package]]
name = "docling-core"
-version = "1.3.0"
+version = "1.4.0"
description = "A python library to define and validate data types in Docling."
optional = false
python-versions = "<4.0,>=3.9"
files = [
- {file = "docling_core-1.3.0-py3-none-any.whl", hash = "sha256:31779b9a5cce7e925d01d3b78fa8a835c531fa74646205ae2a8721f534eb8b27"},
- {file = "docling_core-1.3.0.tar.gz", hash = "sha256:beb55fb0018c912209bdf12958e4cf5a6c8bbe73fd097d03da25fc3979260fab"},
+ {file = "docling_core-1.4.0-py3-none-any.whl", hash = "sha256:11cd6228d5f321fd11427cf61f40148afd544170e82236228794300f14f8a15a"},
+ {file = "docling_core-1.4.0.tar.gz", hash = "sha256:6ea151974172a87a9bca0d63787dc16bdb4170ecb73f18e61e3c2e95eb3fe3d8"},
]
[package.dependencies]
@@ -1151,18 +1151,18 @@ tests = ["asttokens (>=2.1.0)", "coverage", "coverage-enable-subprocess", "ipyth
[[package]]
name = "filelock"
-version = "3.16.0"
+version = "3.16.1"
description = "A platform independent file lock."
optional = false
python-versions = ">=3.8"
files = [
- {file = "filelock-3.16.0-py3-none-any.whl", hash = "sha256:f6ed4c963184f4c84dd5557ce8fece759a3724b37b80c6c4f20a2f63a4dc6609"},
- {file = "filelock-3.16.0.tar.gz", hash = "sha256:81de9eb8453c769b63369f87f11131a7ab04e367f8d97ad39dc230daa07e3bec"},
+ {file = "filelock-3.16.1-py3-none-any.whl", hash = "sha256:2082e5703d51fbf98ea75855d9d5527e33d8ff23099bec374a134febee6946b0"},
+ {file = "filelock-3.16.1.tar.gz", hash = "sha256:c249fbfcd5db47e5e2d6d62198e565475ee65e4831e2561c8e313fa7eb961435"},
]
[package.extras]
-docs = ["furo (>=2024.8.6)", "sphinx (>=8.0.2)", "sphinx-autodoc-typehints (>=2.4)"]
-testing = ["covdefaults (>=2.3)", "coverage (>=7.6.1)", "diff-cover (>=9.1.1)", "pytest (>=8.3.2)", "pytest-asyncio (>=0.24)", "pytest-cov (>=5)", "pytest-mock (>=3.14)", "pytest-timeout (>=2.3.1)", "virtualenv (>=20.26.3)"]
+docs = ["furo (>=2024.8.6)", "sphinx (>=8.0.2)", "sphinx-autodoc-typehints (>=2.4.1)"]
+testing = ["covdefaults (>=2.3)", "coverage (>=7.6.1)", "diff-cover (>=9.2)", "pytest (>=8.3.3)", "pytest-asyncio (>=0.24)", "pytest-cov (>=5)", "pytest-mock (>=3.14)", "pytest-timeout (>=2.3.1)", "virtualenv (>=20.26.4)"]
typing = ["typing-extensions (>=4.12.2)"]
[[package]]
@@ -2383,17 +2383,17 @@ transformers = ">=4.39.0"
[[package]]
name = "langchain-milvus"
-version = "0.1.4"
+version = "0.1.5"
description = "An integration package connecting Milvus and LangChain"
optional = true
python-versions = "<4.0,>=3.8.1"
files = [
- {file = "langchain_milvus-0.1.4-py3-none-any.whl", hash = "sha256:f5c1f2d023c6853d1acc22dc8d0b61ca4d99015c1b095b0cf84ec84a9ba2936e"},
- {file = "langchain_milvus-0.1.4.tar.gz", hash = "sha256:1cd67f127d60c73ffb07cd789705766479137630d43f8ff547c69eee4775dae8"},
+ {file = "langchain_milvus-0.1.5-py3-none-any.whl", hash = "sha256:74aa487738afde4c3e1346433ef26f9556e599826161562b308d3357d86529fd"},
+ {file = "langchain_milvus-0.1.5.tar.gz", hash = "sha256:1cceab384783ba264055102e5831451482fd726a68feb64258f6dbbd8d702557"},
]
[package.dependencies]
-langchain-core = ">=0.2.20,<0.3.0"
+langchain-core = {version = ">=0.2.38,<0.4", markers = "python_version >= \"3.9\""}
pymilvus = ">=2.4.3,<3.0.0"
scipy = [
{version = ">=1.7,<2.0", markers = "python_version < \"3.12\""},
@@ -3950,13 +3950,13 @@ testing = ["pytest", "pytest-cov", "wheel"]
[[package]]
name = "platformdirs"
-version = "4.3.4"
+version = "4.3.6"
description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`."
optional = false
python-versions = ">=3.8"
files = [
- {file = "platformdirs-4.3.4-py3-none-any.whl", hash = "sha256:8b4ba85412f5065dae40aa19feaa02ac2be584c8b14abd70712b5cd11ad80034"},
- {file = "platformdirs-4.3.4.tar.gz", hash = "sha256:9e8a037c36fe1b1f1b5de4482e60464272cc8dca725e40b568bf2c285f7509cf"},
+ {file = "platformdirs-4.3.6-py3-none-any.whl", hash = "sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb"},
+ {file = "platformdirs-4.3.6.tar.gz", hash = "sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907"},
]
[package.extras]
@@ -4500,17 +4500,17 @@ wheel = "*"
[[package]]
name = "pyreadline3"
-version = "3.5.2"
+version = "3.5.3"
description = "A python implementation of GNU readline."
optional = false
python-versions = ">=3.8"
files = [
- {file = "pyreadline3-3.5.2-py3-none-any.whl", hash = "sha256:a87d56791e2965b2b187e2ea33dcf664600842c997c0623c95cf8ef07db83de9"},
- {file = "pyreadline3-3.5.2.tar.gz", hash = "sha256:ba82292e52c5a3bb256b291af0c40b457c1e8699cac9a873abbcaac8aef3a1bb"},
+ {file = "pyreadline3-3.5.3-py3-none-any.whl", hash = "sha256:ddede153a92e5aad9c1fe63d692efd6a3e478f686adcd4938a051ffb63ec4f52"},
+ {file = "pyreadline3-3.5.3.tar.gz", hash = "sha256:9234684ca75a00a702fda42b17cc26ca665bc9d7c2da06af450468253099ff61"},
]
[package.extras]
-dev = ["build", "flake8", "pytest", "twine"]
+dev = ["build", "flake8", "mypy", "pytest", "twine"]
[[package]]
name = "pytest"
@@ -6862,13 +6862,13 @@ zstd = ["zstandard (>=0.18.0)"]
[[package]]
name = "virtualenv"
-version = "20.26.4"
+version = "20.26.5"
description = "Virtual Python Environment builder"
optional = false
python-versions = ">=3.7"
files = [
- {file = "virtualenv-20.26.4-py3-none-any.whl", hash = "sha256:48f2695d9809277003f30776d155615ffc11328e6a0a8c1f0ec80188d7874a55"},
- {file = "virtualenv-20.26.4.tar.gz", hash = "sha256:c17f4e0f3e6036e9f26700446f85c76ab11df65ff6d8a9cbfad9f71aabfcf23c"},
+ {file = "virtualenv-20.26.5-py3-none-any.whl", hash = "sha256:4f3ac17b81fba3ce3bd6f4ead2749a72da5929c01774948e243db9ba41df4ff6"},
+ {file = "virtualenv-20.26.5.tar.gz", hash = "sha256:ce489cac131aa58f4b25e321d6d186171f78e6cb13fafbf32a840cee67733ff4"},
]
[package.dependencies]
@@ -7257,4 +7257,4 @@ examples = ["langchain-huggingface", "langchain-milvus", "langchain-text-splitte
[metadata]
lock-version = "2.0"
python-versions = "^3.10"
-content-hash = "ae5c784c10b8d5635bc8fd7490c89049a8b4f0247e2b8ddd7b0d65106c10dda5"
+content-hash = "7dc789b3c981898fdabec03f85ebb92273f2bb55b2bf1e18dad1d4c361c6b97b"
diff --git a/pyproject.toml b/pyproject.toml
index 1813d53c..c78d66f5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -23,7 +23,7 @@ packages = [{include = "docling"}]
[tool.poetry.dependencies]
python = "^3.10"
pydantic = "^2.0.0"
-docling-core = "^1.3.0"
+docling-core = "^1.4.0"
docling-ibm-models = "^1.2.0"
deepsearch-glm = "^0.21.1"
filetype = "^1.2.0"