feat!: add document Markdown export

Signed-off-by: Panos Vagenas <[email protected]>
DS4SD · Jul 17, 2024 · fc0d076 · fc0d076
1 parent 4e5dbdf
commit fc0d076
Show file tree

Hide file tree

Showing 3 changed files with 114 additions and 3 deletions.
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
@@ -6,7 +6,7 @@
 """Models for the Docling Document data type."""
 
 from datetime import datetime
-from typing import Generic, Optional, Union
+from typing import Generic, List, Optional, Union
 
 from pydantic import (
     AnyHttpUrl,
@@ -16,6 +16,7 @@
     StrictStr,
     model_validator,
 )
+from tabulate import tabulate
 
 from docling_core.search.mapping import es_field
 from docling_core.types.base import (
@@ -391,3 +392,98 @@ def from_dict(cls, data):
                     item["$ref"] = ref
 
         return data
+
+    def _resolve_ref(self, item: Ref) -> Optional[Table]:
+        """Return the table that `item` points to, or None if it cannot be resolved.
+
+        Only references whose `obj_type` is "table" are resolved. The table index
+        is read from the third "/"-separated part of `item.ref` (presumably refs
+        look like "#/tables/<index>" -- confirm against the producer). Malformed
+        or out-of-range refs yield None instead of raising.
+        """
+        if item.obj_type != "table" or not self.tables:
+            return None
+
+        parts = item.ref.split("/")
+        try:
+            index = int(parts[2])
+        except (IndexError, ValueError):
+            # too few ref parts, or the index part is not an integer
+            return None
+
+        # reject negative indices too: they would silently wrap around the list
+        if not 0 <= index < len(self.tables):
+            return None
+        return self.tables[index]
+
+    def export_to_markdown(
+        self,
+        sep: str = "\n\n",
+        start_incl: int = 0,
+        end_excl: Optional[int] = None,
+    ) -> str:
+        """Return a Markdown serialization of the document.
+
+        Args:
+            sep: String placed between consecutive Markdown blocks.
+            start_incl: Index of the first `main_text` item to export (inclusive).
+            end_excl: Index at which to stop (exclusive); None exports to the end.
+
+        Returns:
+            The Markdown text; an empty string if nothing could be exported.
+        """
+        has_title = False
+        prev_text = ""
+        md_texts: List[str] = []
+
+        if self.main_text is not None:
+            for orig_item in self.main_text[start_incl:end_excl]:
+                markdown_text = ""
+
+                # table refs are swapped for the table itself; other refs are skipped
+                item = (
+                    self._resolve_ref(orig_item)
+                    if isinstance(orig_item, Ref)
+                    else orig_item
+                )
+                if item is None:
+                    continue
+
+                item_type = item.obj_type
+                if isinstance(item, BaseText) and item_type in {
+                    "title",
+                    "subtitle-level-1",
+                    "paragraph",
+                    "caption",
+                }:
+                    text = item.text
+
+                    # ignore repeated text
+                    if prev_text == text:
+                        continue
+                    prev_text = text
+
+                    if item_type == "title" and not has_title:
+                        # only the first title becomes the top-level heading
+                        markdown_text = f"# {text}"
+                        has_title = True
+                    elif item_type in {"title", "subtitle-level-1"}:
+                        # later titles and all subtitles are second-level headings
+                        markdown_text = f"## {text}"
+                    else:
+                        # normal text
+                        markdown_text = text
+
+                elif isinstance(item, Table) and item.data:
+                    table = [[col.text for col in row] for row in item.data]
+
+                    # first row is used as the header; a table without any body
+                    # row (or with an empty header row) is not exported
+                    if len(table) > 1 and len(table[0]) > 0:
+                        try:
+                            md_table = tabulate(
+                                table[1:], headers=table[0], tablefmt="github"
+                            )
+                        except ValueError:
+                            # retry with number parsing off (presumed cause)
+                            md_table = tabulate(
+                                table[1:],
+                                headers=table[0],
+                                tablefmt="github",
+                                disable_numparse=True,
+                            )
+
+                        markdown_text = md_table
+
+                if markdown_text:
+                    md_texts.append(markdown_text)
+
+        return sep.join(md_texts)
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -53,6 +53,7 @@ jsonref = "^1.1.0"
 json-schema-for-humans = "^1.0.0"
 poetry = "^1.8.3"
 pyproject-toml = "^0.0.10"
+tabulate = "^0.9.0"
 
 [tool.poetry.group.dev.dependencies]
 black = "^24.4.2"
@@ -111,7 +112,7 @@ python_version = "3.9"
 plugins = ["pydantic.mypy"]
 
 [[tool.mypy.overrides]]
-module = ["jsondiff.*", "jsonref.*", "jsonschema.*", "json_schema_for_humans.*"]
+module = ["jsondiff.*", "jsonref.*", "jsonschema.*", "json_schema_for_humans.*", "tabulate.*"]
 ignore_missing_imports = true
 
 [tool.semantic_release]