From f19bd437984f77067d33d591e25c5d5c92d7e0a9 Mon Sep 17 00:00:00 2001 From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Date: Wed, 18 Sep 2024 08:44:13 +0200 Subject: [PATCH] feat: add table exports (#86) * feat: expose docling-core table exporters and add examples Signed-off-by: Michele Dolfi * remove temp internal implementation of html export Signed-off-by: Michele Dolfi * pin latest docling-core 1.4.0 with table exports Signed-off-by: Michele Dolfi --------- Signed-off-by: Michele Dolfi --- docling/utils/export.py | 63 +-------------------------------- examples/export_tables.py | 74 +++++++++++++++++++++++++++++++++++++++ poetry.lock | 46 ++++++++++++------------ pyproject.toml | 2 +- 4 files changed, 99 insertions(+), 86 deletions(-) create mode 100644 examples/export_tables.py diff --git a/docling/utils/export.py b/docling/utils/export.py index 115f7646..e9e56930 100644 --- a/docling/utils/export.py +++ b/docling/utils/export.py @@ -9,67 +9,6 @@ _log = logging.getLogger(__name__) -def _export_table_to_html(table: Table): - - # TODO: this is flagged as internal, because we will move it - # to the docling-core package. 
- - def _get_tablecell_span(cell: TableCell, ix): - if cell.spans is None: - span = set() - else: - span = set([s[ix] for s in cell.spans]) - if len(span) == 0: - return 1, None, None - return len(span), min(span), max(span) - - body = "" - nrows = table.num_rows - ncols = table.num_cols - - if table.data is None: - return "" - for i in range(nrows): - body += "<tr>" - for j in range(ncols): - cell: TableCell = table.data[i][j] - - rowspan, rowstart, rowend = _get_tablecell_span(cell, 0) - colspan, colstart, colend = _get_tablecell_span(cell, 1) - - if rowstart is not None and rowstart != i: - continue - if colstart is not None and colstart != j: - continue - - if rowstart is None: - rowstart = i - if colstart is None: - colstart = j - - content = cell.text.strip() - label = cell.obj_type - label_class = "body" - celltag = "td" - if label in ["row_header", "row_multi_header", "row_title"]: - label_class = "header" - elif label in ["col_header", "col_multi_header"]: - label_class = "header" - celltag = "th" - - opening_tag = f"{celltag}" - if rowspan > 1: - opening_tag += f' rowspan="{rowspan}"' - if colspan > 1: - opening_tag += f' colspan="{colspan}"' - - body += f"<{opening_tag}>{content}</{celltag}>" - body += "</tr>" - body = f"<table>{body}</table>"
- - return body - - def generate_multimodal_pages( doc_result: ConversionResult, ) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]: @@ -129,7 +68,7 @@ def _process_page_segments(doc_items: list[Tuple[int, BaseCell]], page: Page): } if isinstance(item, Table): - table_html = _export_table_to_html(item) + table_html = item.export_to_html() new_segment["data"].append( { "html_seq": table_html, diff --git a/examples/export_tables.py b/examples/export_tables.py new file mode 100644 index 00000000..a0c605c1 --- /dev/null +++ b/examples/export_tables.py @@ -0,0 +1,74 @@ +import logging +import time +from pathlib import Path +from typing import Tuple + +import pandas as pd + +from docling.datamodel.base_models import ConversionStatus +from docling.datamodel.document import DocumentConversionInput +from docling.document_converter import DocumentConverter + +_log = logging.getLogger(__name__) + + +def main(): + logging.basicConfig(level=logging.INFO) + + input_doc_paths = [ + Path("./tests/data/2206.01062.pdf"), + ] + output_dir = Path("./scratch") + + input_files = DocumentConversionInput.from_paths(input_doc_paths) + + doc_converter = DocumentConverter() + + start_time = time.time() + + conv_results = doc_converter.convert(input_files) + + success_count = 0 + failure_count = 0 + output_dir.mkdir(parents=True, exist_ok=True) + for conv_res in conv_results: + if conv_res.status != ConversionStatus.SUCCESS: + _log.info(f"Document {conv_res.input.file} failed to convert.") + failure_count += 1 + continue + + doc_filename = conv_res.input.file.stem + + # Export tables + for table_ix, table in enumerate(conv_res.output.tables): + table_df: pd.DataFrame = table.export_to_dataframe() + print(f"## Table {table_ix}") + print(table_df.to_markdown()) + + # Save the table as csv + element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv" + _log.info(f"Saving CSV table to {element_csv_filename}") + table_df.to_csv(element_csv_filename) 
+ + # Save the table as html + element_html_filename = ( + output_dir / f"{doc_filename}-table-{table_ix+1}.html" + ) + _log.info(f"Saving HTML table to {element_html_filename}") + with element_html_filename.open("w") as fp: + fp.write(table.export_to_html()) + + success_count += 1 + + end_time = time.time() - start_time + + _log.info(f"All documents were converted in {end_time:.2f} seconds.") + + if failure_count > 0: + raise RuntimeError( + f"The example failed converting {failure_count} on {len(input_doc_paths)}." + ) + + +if __name__ == "__main__": + main() diff --git a/poetry.lock b/poetry.lock index b3092740..075a5220 100644 --- a/poetry.lock +++ b/poetry.lock @@ -957,13 +957,13 @@ files = [ [[package]] name = "docling-core" -version = "1.3.0" +version = "1.4.0" description = "A python library to define and validate data types in Docling." optional = false python-versions = "<4.0,>=3.9" files = [ - {file = "docling_core-1.3.0-py3-none-any.whl", hash = "sha256:31779b9a5cce7e925d01d3b78fa8a835c531fa74646205ae2a8721f534eb8b27"}, - {file = "docling_core-1.3.0.tar.gz", hash = "sha256:beb55fb0018c912209bdf12958e4cf5a6c8bbe73fd097d03da25fc3979260fab"}, + {file = "docling_core-1.4.0-py3-none-any.whl", hash = "sha256:11cd6228d5f321fd11427cf61f40148afd544170e82236228794300f14f8a15a"}, + {file = "docling_core-1.4.0.tar.gz", hash = "sha256:6ea151974172a87a9bca0d63787dc16bdb4170ecb73f18e61e3c2e95eb3fe3d8"}, ] [package.dependencies] @@ -1151,18 +1151,18 @@ tests = ["asttokens (>=2.1.0)", "coverage", "coverage-enable-subprocess", "ipyth [[package]] name = "filelock" -version = "3.16.0" +version = "3.16.1" description = "A platform independent file lock." 
optional = false python-versions = ">=3.8" files = [ - {file = "filelock-3.16.0-py3-none-any.whl", hash = "sha256:f6ed4c963184f4c84dd5557ce8fece759a3724b37b80c6c4f20a2f63a4dc6609"}, - {file = "filelock-3.16.0.tar.gz", hash = "sha256:81de9eb8453c769b63369f87f11131a7ab04e367f8d97ad39dc230daa07e3bec"}, + {file = "filelock-3.16.1-py3-none-any.whl", hash = "sha256:2082e5703d51fbf98ea75855d9d5527e33d8ff23099bec374a134febee6946b0"}, + {file = "filelock-3.16.1.tar.gz", hash = "sha256:c249fbfcd5db47e5e2d6d62198e565475ee65e4831e2561c8e313fa7eb961435"}, ] [package.extras] -docs = ["furo (>=2024.8.6)", "sphinx (>=8.0.2)", "sphinx-autodoc-typehints (>=2.4)"] -testing = ["covdefaults (>=2.3)", "coverage (>=7.6.1)", "diff-cover (>=9.1.1)", "pytest (>=8.3.2)", "pytest-asyncio (>=0.24)", "pytest-cov (>=5)", "pytest-mock (>=3.14)", "pytest-timeout (>=2.3.1)", "virtualenv (>=20.26.3)"] +docs = ["furo (>=2024.8.6)", "sphinx (>=8.0.2)", "sphinx-autodoc-typehints (>=2.4.1)"] +testing = ["covdefaults (>=2.3)", "coverage (>=7.6.1)", "diff-cover (>=9.2)", "pytest (>=8.3.3)", "pytest-asyncio (>=0.24)", "pytest-cov (>=5)", "pytest-mock (>=3.14)", "pytest-timeout (>=2.3.1)", "virtualenv (>=20.26.4)"] typing = ["typing-extensions (>=4.12.2)"] [[package]] @@ -2383,17 +2383,17 @@ transformers = ">=4.39.0" [[package]] name = "langchain-milvus" -version = "0.1.4" +version = "0.1.5" description = "An integration package connecting Milvus and LangChain" optional = true python-versions = "<4.0,>=3.8.1" files = [ - {file = "langchain_milvus-0.1.4-py3-none-any.whl", hash = "sha256:f5c1f2d023c6853d1acc22dc8d0b61ca4d99015c1b095b0cf84ec84a9ba2936e"}, - {file = "langchain_milvus-0.1.4.tar.gz", hash = "sha256:1cd67f127d60c73ffb07cd789705766479137630d43f8ff547c69eee4775dae8"}, + {file = "langchain_milvus-0.1.5-py3-none-any.whl", hash = "sha256:74aa487738afde4c3e1346433ef26f9556e599826161562b308d3357d86529fd"}, + {file = "langchain_milvus-0.1.5.tar.gz", hash = 
"sha256:1cceab384783ba264055102e5831451482fd726a68feb64258f6dbbd8d702557"}, ] [package.dependencies] -langchain-core = ">=0.2.20,<0.3.0" +langchain-core = {version = ">=0.2.38,<0.4", markers = "python_version >= \"3.9\""} pymilvus = ">=2.4.3,<3.0.0" scipy = [ {version = ">=1.7,<2.0", markers = "python_version < \"3.12\""}, @@ -3950,13 +3950,13 @@ testing = ["pytest", "pytest-cov", "wheel"] [[package]] name = "platformdirs" -version = "4.3.4" +version = "4.3.6" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`." optional = false python-versions = ">=3.8" files = [ - {file = "platformdirs-4.3.4-py3-none-any.whl", hash = "sha256:8b4ba85412f5065dae40aa19feaa02ac2be584c8b14abd70712b5cd11ad80034"}, - {file = "platformdirs-4.3.4.tar.gz", hash = "sha256:9e8a037c36fe1b1f1b5de4482e60464272cc8dca725e40b568bf2c285f7509cf"}, + {file = "platformdirs-4.3.6-py3-none-any.whl", hash = "sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb"}, + {file = "platformdirs-4.3.6.tar.gz", hash = "sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907"}, ] [package.extras] @@ -4500,17 +4500,17 @@ wheel = "*" [[package]] name = "pyreadline3" -version = "3.5.2" +version = "3.5.3" description = "A python implementation of GNU readline." 
optional = false python-versions = ">=3.8" files = [ - {file = "pyreadline3-3.5.2-py3-none-any.whl", hash = "sha256:a87d56791e2965b2b187e2ea33dcf664600842c997c0623c95cf8ef07db83de9"}, - {file = "pyreadline3-3.5.2.tar.gz", hash = "sha256:ba82292e52c5a3bb256b291af0c40b457c1e8699cac9a873abbcaac8aef3a1bb"}, + {file = "pyreadline3-3.5.3-py3-none-any.whl", hash = "sha256:ddede153a92e5aad9c1fe63d692efd6a3e478f686adcd4938a051ffb63ec4f52"}, + {file = "pyreadline3-3.5.3.tar.gz", hash = "sha256:9234684ca75a00a702fda42b17cc26ca665bc9d7c2da06af450468253099ff61"}, ] [package.extras] -dev = ["build", "flake8", "pytest", "twine"] +dev = ["build", "flake8", "mypy", "pytest", "twine"] [[package]] name = "pytest" @@ -6862,13 +6862,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "virtualenv" -version = "20.26.4" +version = "20.26.5" description = "Virtual Python Environment builder" optional = false python-versions = ">=3.7" files = [ - {file = "virtualenv-20.26.4-py3-none-any.whl", hash = "sha256:48f2695d9809277003f30776d155615ffc11328e6a0a8c1f0ec80188d7874a55"}, - {file = "virtualenv-20.26.4.tar.gz", hash = "sha256:c17f4e0f3e6036e9f26700446f85c76ab11df65ff6d8a9cbfad9f71aabfcf23c"}, + {file = "virtualenv-20.26.5-py3-none-any.whl", hash = "sha256:4f3ac17b81fba3ce3bd6f4ead2749a72da5929c01774948e243db9ba41df4ff6"}, + {file = "virtualenv-20.26.5.tar.gz", hash = "sha256:ce489cac131aa58f4b25e321d6d186171f78e6cb13fafbf32a840cee67733ff4"}, ] [package.dependencies] @@ -7257,4 +7257,4 @@ examples = ["langchain-huggingface", "langchain-milvus", "langchain-text-splitte [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "ae5c784c10b8d5635bc8fd7490c89049a8b4f0247e2b8ddd7b0d65106c10dda5" +content-hash = "7dc789b3c981898fdabec03f85ebb92273f2bb55b2bf1e18dad1d4c361c6b97b" diff --git a/pyproject.toml b/pyproject.toml index 1813d53c..c78d66f5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,7 @@ packages = [{include = "docling"}] [tool.poetry.dependencies] 
python = "^3.10" pydantic = "^2.0.0" -docling-core = "^1.3.0" +docling-core = "^1.4.0" docling-ibm-models = "^1.2.0" deepsearch-glm = "^0.21.1" filetype = "^1.2.0"