Skip to content

Commit

Permalink
feat!: add document Markdown export
Browse files Browse the repository at this point in the history
Signed-off-by: Panos Vagenas <[email protected]>
  • Loading branch information
vagenas committed Jul 17, 2024
1 parent 4e5dbdf commit fc0d076
Show file tree
Hide file tree
Showing 3 changed files with 114 additions and 3 deletions.
98 changes: 97 additions & 1 deletion docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"""Models for the Docling Document data type."""

from datetime import datetime
from typing import Generic, Optional, Union
from typing import Generic, List, Optional, Union

from pydantic import (
AnyHttpUrl,
Expand All @@ -16,6 +16,7 @@
StrictStr,
model_validator,
)
from tabulate import tabulate

from docling_core.search.mapping import es_field
from docling_core.types.base import (
Expand Down Expand Up @@ -391,3 +392,98 @@ def from_dict(cls, data):
item["$ref"] = ref

return data

def _resolve_ref(self, item: Ref) -> Optional[Table]:
    """Resolve a table reference to the table it points at.

    Only references whose ``obj_type`` is ``"table"`` are resolved, and only
    when the document's ``tables`` collection is non-empty; any other
    reference yields ``None``.

    Args:
        item: The reference to resolve.

    Returns:
        The entry of ``self.tables`` addressed by the reference, or ``None``
        when the reference is not a table reference or there are no tables.
    """
    if item.obj_type != "table" or not self.tables:
        return None

    # NOTE: currently only table refs are resolved, and the table index is
    # assumed to be the third "/"-separated segment of ``item.ref``.
    table_index = int(item.ref.split("/")[2])
    return self.tables[table_index]

def export_to_markdown(
    self,
    sep: str = "\n\n",
    start_incl: int = 0,
    end_excl: Optional[int] = None,
) -> str:
    """Return a Markdown serialization of the document.

    Iterates over ``self.main_text[start_incl:end_excl]``, resolving
    references through ``self._resolve_ref`` (items that resolve to ``None``
    are skipped), and converts each supported item to a Markdown fragment:

    * the first ``"title"`` item becomes a level-1 heading (``# ...``);
    * later ``"title"`` items and ``"subtitle-level-1"`` items become
      level-2 headings (``## ...``);
    * ``"paragraph"`` and ``"caption"`` items are emitted verbatim;
    * tables with at least two rows are rendered as GitHub-style Markdown
      tables, using the first row as the header.

    A text item whose text equals that of the previously seen text item is
    dropped.

    Args:
        sep: String placed between consecutive Markdown fragments.
        start_incl: Index of the first ``main_text`` item to export
            (inclusive).
        end_excl: Index at which to stop (exclusive); ``None`` exports
            through the end of ``main_text``.

    Returns:
        The Markdown fragments joined by ``sep``; an empty string when
        ``main_text`` is ``None`` or nothing exportable was found.
    """
    heading_types = {"title", "subtitle-level-1"}
    text_types = heading_types | {"paragraph", "caption"}

    has_title = False
    prev_text = ""
    md_texts: List[str] = []

    items = [] if self.main_text is None else self.main_text[start_incl:end_excl]
    for orig_item in items:
        item = (
            self._resolve_ref(orig_item)
            if isinstance(orig_item, Ref)
            else orig_item
        )
        if item is None:
            continue

        markdown_text = ""
        item_type = item.obj_type

        if isinstance(item, BaseText) and item_type in text_types:
            text = item.text

            # ignore repeated text
            if text == prev_text:
                continue
            prev_text = text

            if item_type == "title" and not has_title:
                # first title match
                markdown_text = f"# {text}"
                has_title = True
            elif item_type in heading_types:
                # secondary titles (including any further "title" items)
                markdown_text = f"## {text}"
            else:
                # normal text
                markdown_text = text

        elif isinstance(item, Table) and item.data:
            table = [[col.text for col in row] for row in item.data]

            # A header row plus at least one body row is required; smaller
            # tables produce no output.
            if len(table) > 1 and len(table[0]) > 0:
                try:
                    markdown_text = tabulate(
                        table[1:], headers=table[0], tablefmt="github"
                    )
                except ValueError:
                    # Retry with tabulate's number parsing disabled so the
                    # cells are treated as plain strings.
                    markdown_text = tabulate(
                        table[1:],
                        headers=table[0],
                        tablefmt="github",
                        disable_numparse=True,
                    )

        if markdown_text:
            md_texts.append(markdown_text)

    return sep.join(md_texts)
16 changes: 15 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ jsonref = "^1.1.0"
json-schema-for-humans = "^1.0.0"
poetry = "^1.8.3"
pyproject-toml = "^0.0.10"
tabulate = "^0.9.0"

[tool.poetry.group.dev.dependencies]
black = "^24.4.2"
Expand Down Expand Up @@ -111,7 +112,7 @@ python_version = "3.9"
plugins = ["pydantic.mypy"]

[[tool.mypy.overrides]]
module = ["jsondiff.*", "jsonref.*", "jsonschema.*", "json_schema_for_humans.*"]
module = ["jsondiff.*", "jsonref.*", "jsonschema.*", "json_schema_for_humans.*", "tabulate.*"]
ignore_missing_imports = true

[tool.semantic_release]
Expand Down

0 comments on commit fc0d076

Please sign in to comment.