feat: refactoring doc-tokens in new file and add new export function to table

Signed-off-by: Peter Staar <[email protected]>
DS4SD · Sep 18, 2024 · 17f54bf · 17f54bf
1 parent 9b8e97b
commit 17f54bf
Show file tree

Hide file tree

Showing 4 changed files with 156 additions and 102 deletions.
diff --git a/docling_core/types/doc/base.py b/docling_core/types/doc/base.py
@@ -249,6 +249,31 @@ def export_to_html(self) -> str:
 
         return body
 
+    def export_to_document_tokens(
+        self, new_line: str = "\n", loc_str: str = ""
+    ) -> str:
+        """Export the table as a document-token string.
+
+        The output starts with the table opening token followed by
+        ``loc_str``, then an optional caption built from ``self.text``,
+        then one ``<row_i>...</row_i>`` group per row of ``self.data`` with
+        each cell wrapped in ``<col_j>...</col_j>``, and finally the table
+        closing token.
+
+        Args:
+            new_line: Separator appended after the caption, after every row
+                and after the closing table token.
+            loc_str: Pre-rendered string inserted verbatim right after the
+                opening table token.
+
+        Returns:
+            The token representation of the table.
+        """
+        parts = [f"{DocumentToken.BEG_TABLE.value}{loc_str}"]
+
+        if self.text is not None and len(self.text) > 0:
+            parts.append(
+                f"{DocumentToken.BEG_CAPTION.value}"
+                f"{self.text}{DocumentToken.END_CAPTION.value}{new_line}"
+            )
+
+        if self.data is not None and len(self.data) > 0:
+            for i, row in enumerate(self.data):
+                parts.append(f"<row_{i}>")
+                parts.extend(
+                    f"<col_{j}>{col.text}</col_{j}>"
+                    for j, col in enumerate(row)
+                )
+                parts.append(f"</row_{i}>{new_line}")
+
+        # Close the table: the end token is required here, emitting the
+        # begin token a second time would leave the table element unclosed.
+        parts.append(f"{DocumentToken.END_TABLE.value}{new_line}")
+
+        return "".join(parts)
+
 
 # FIXME: let's add some figure specific data-types later
 class Figure(BaseCell):

diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
@@ -9,6 +9,8 @@
 from enum import Enum
 from typing import Generic, Optional, Tuple, Union
 
+
+
 from pydantic import (
     AnyHttpUrl,
     BaseModel,
@@ -32,6 +34,9 @@
     LanguageT,
     Log,
 )
+
+from docling_core.types.doc.tokens import DocumentToken
+
 from docling_core.types.doc.base import (
     BaseCell,
     BaseText,
@@ -347,107 +352,6 @@ def from_dict(cls, data):
         return data
 
 
-class DocumentToken(Enum):
-    """Class to represent an LLM friendly representation of a Document."""
-
-    BEG_DOCUMENT = "<document>"
-    END_DOCUMENT = "</document>"
-
-    BEG_TITLE = "<title>"
-    END_TITLE = "</title>"
-
-    BEG_ABSTRACT = "<abstract>"
-    END_ABSTRACT = "</abstract>"
-
-    BEG_DOI = "<doi>"
-    END_DOI = "</doi>"
-    BEG_DATE = "<date>"
-    END_DATE = "</date>"
-
-    BEG_AUTHORS = "<authors>"
-    END_AUTHORS = "</authors>"
-    BEG_AUTHOR = "<author>"
-    END_AUTHOR = "</author>"
-
-    BEG_AFFILIATIONS = "<affiliations>"
-    END_AFFILIATIONS = "</affiliations>"
-    BEG_AFFILIATION = "<affiliation>"
-    END_AFFILIATION = "</affiliation>"
-
-    BEG_HEADER = "<section-header>"
-    END_HEADER = "</section-header>"
-    BEG_TEXT = "<text>"
-    END_TEXT = "</text>"
-    BEG_PARAGRAPH = "<paragraph>"
-    END_PARAGRAPH = "</paragraph>"
-    BEG_TABLE = "<table>"
-    END_TABLE = "</table>"
-    BEG_FIGURE = "<figure>"
-    END_FIGURE = "</figure>"
-    BEG_CAPTION = "<caption>"
-    END_CAPTION = "</caption>"
-    BEG_EQUATION = "<equation>"
-    END_EQUATION = "</equation>"
-    BEG_LIST = "<list>"
-    END_LIST = "</list>"
-    BEG_LISTITEM = "<list-item>"
-    END_LISTITEM = "</list-item>"
-
-    BEG_LOCATION = "<location>"
-    END_LOCATION = "</location>"
-    BEG_GROUP = "<group>"
-    END_GROUP = "</group>"
-
-    @classmethod
-    def get_special_tokens(
-        cls,
-        max_rows: int = 100,
-        max_cols: int = 100,
-        max_pages: int = 1000,
-        page_dimension: Tuple[int, int] = (100, 100),
-    ):
-        """Function to get all special document tokens."""
-        special_tokens = [token.value for token in cls]
-
-        # Adding dynamically generated row and col tokens
-        for i in range(0, max_rows + 1):
-            special_tokens += [f"<row_{i}>", f"</row_{i}>"]
-
-        for i in range(0, max_cols + 1):
-            special_tokens += [f"<col_{i}>", f"</col_{i}>"]
-
-        for i in range(6):
-            special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]
-
-        # Adding dynamically generated page-tokens
-        for i in range(0, max_pages + 1):
-            special_tokens.append(f"<page_{i}>")
-
-        # Adding dynamically generated location-tokens
-        for i in range(0, max(page_dimension[0] + 1, page_dimension[1] + 1)):
-            special_tokens.append(f"<loc_{i}>")
-
-        return special_tokens
-
-    @staticmethod
-    def get_page_token(page: int):
-        """Function to get page tokens."""
-        return f"<page_{page}>"
-
-    @staticmethod
-    def get_location_token(val: float, rnorm: int = 100):
-        """Function to get location tokens."""
-        val_ = round(rnorm * val)
-
-        if val_ < 0:
-            return "<loc_0>"
-
-        if val_ > rnorm:
-            return f"<loc_{rnorm}>"
-
-        return f"<loc_{val_}>"
-
-
 class ExportedCCSDocument(
     MinimalDocument,
     Generic[

diff --git a/docling_core/types/doc/tokens.py b/docling_core/types/doc/tokens.py
@@ -0,0 +1,126 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Tokens used in the docling document model."""
+
+from enum import Enum
+from typing import Tuple
+
+class DocumentToken(Enum):
+    """Class to represent an LLM friendly representation of a Document.
+
+    Each member holds the literal opening or closing tag of one document
+    element. Tags that carry an index (rows, columns, pages, locations) are
+    not enum members; they are produced by the helper methods below.
+    """
+
+    BEG_DOCUMENT = "<document>"
+    END_DOCUMENT = "</document>"
+
+    BEG_TITLE = "<title>"
+    END_TITLE = "</title>"
+
+    BEG_ABSTRACT = "<abstract>"
+    END_ABSTRACT = "</abstract>"
+
+    BEG_DOI = "<doi>"
+    END_DOI = "</doi>"
+    BEG_DATE = "<date>"
+    END_DATE = "</date>"
+
+    BEG_AUTHORS = "<authors>"
+    END_AUTHORS = "</authors>"
+    BEG_AUTHOR = "<author>"
+    END_AUTHOR = "</author>"
+
+    BEG_AFFILIATIONS = "<affiliations>"
+    END_AFFILIATIONS = "</affiliations>"
+    BEG_AFFILIATION = "<affiliation>"
+    END_AFFILIATION = "</affiliation>"
+
+    BEG_HEADER = "<section-header>"
+    END_HEADER = "</section-header>"
+    BEG_TEXT = "<text>"
+    END_TEXT = "</text>"
+    BEG_PARAGRAPH = "<paragraph>"
+    END_PARAGRAPH = "</paragraph>"
+    BEG_TABLE = "<table>"
+    END_TABLE = "</table>"
+    BEG_FIGURE = "<figure>"
+    END_FIGURE = "</figure>"
+    BEG_CAPTION = "<caption>"
+    END_CAPTION = "</caption>"
+    BEG_EQUATION = "<equation>"
+    END_EQUATION = "</equation>"
+    BEG_LIST = "<list>"
+    END_LIST = "</list>"
+    BEG_LISTITEM = "<list-item>"
+    END_LISTITEM = "</list-item>"
+
+    BEG_LOCATION = "<location>"
+    END_LOCATION = "</location>"
+    BEG_GROUP = "<group>"
+    END_GROUP = "</group>"
+
+    @classmethod
+    def get_special_tokens(
+        cls,
+        max_rows: int = 100,
+        max_cols: int = 100,
+        max_pages: int = 1000,
+        page_dimension: Tuple[int, int] = (100, 100),
+    ) -> list:
+        """Function to get all special document tokens.
+
+        Args:
+            max_rows: Highest row index (inclusive) to generate row tags for.
+            max_cols: Highest column index (inclusive) to generate column
+                tags for.
+            max_pages: Highest page index (inclusive) to generate page tags
+                for.
+            page_dimension: Pair of integers; location tags are generated
+                for every index from 0 up to and including the larger one.
+
+        Returns:
+            A list of strings: every enum value, followed by the generated
+            row, column, section-header, page and location tags.
+        """
+        special_tokens = [token.value for token in cls]
+
+        # Adding dynamically generated row and col tokens
+        for i in range(0, max_rows + 1):
+            special_tokens += [f"<row_{i}>", f"</row_{i}>"]
+
+        for i in range(0, max_cols + 1):
+            special_tokens += [f"<col_{i}>", f"</col_{i}>"]
+
+        # Section-header tags are generated for the six levels 0..5
+        for i in range(6):
+            special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]
+
+        # Adding dynamically generated page-tokens
+        for i in range(0, max_pages + 1):
+            special_tokens.append(f"<page_{i}>")
+
+        # Adding dynamically generated location-tokens
+        for i in range(0, max(page_dimension[0] + 1, page_dimension[1] + 1)):
+            special_tokens.append(f"<loc_{i}>")
+
+        return special_tokens
+
+    @staticmethod
+    def get_row_token(row: int, beg: bool = True) -> str:
+        """Function to get row tokens.
+
+        Args:
+            row: Index of the row.
+            beg: If true return the opening tag, otherwise the closing tag.
+
+        Returns:
+            ``<row_{row}>`` or ``</row_{row}>``.
+        """
+        return f"<row_{row}>" if beg else f"</row_{row}>"
+
+    @staticmethod
+    def get_col_token(col: int, beg: bool = True) -> str:
+        """Function to get column tokens.
+
+        Args:
+            col: Index of the column.
+            beg: If true return the opening tag, otherwise the closing tag.
+
+        Returns:
+            ``<col_{col}>`` or ``</col_{col}>``.
+        """
+        return f"<col_{col}>" if beg else f"</col_{col}>"
+
+    @staticmethod
+    def get_page_token(page: int) -> str:
+        """Function to get page tokens."""
+        return f"<page_{page}>"
+
+    @staticmethod
+    def get_location_token(val: float, rnorm: int = 100) -> str:
+        """Function to get location tokens.
+
+        Args:
+            val: Value that is scaled by ``rnorm`` and rounded to an index.
+            rnorm: Scaling factor and largest index that is returned.
+
+        Returns:
+            ``<loc_i>`` where ``i`` is ``round(rnorm * val)`` clamped to the
+            range ``0..rnorm``.
+        """
+        val_ = round(rnorm * val)
+
+        # Clamp out-of-range values to the first / last location token
+        if val_ < 0:
+            return "<loc_0>"
+
+        if val_ > rnorm:
+            return f"<loc_{rnorm}>"
+
+        return f"<loc_{val_}>"
+
+
diff --git a/docling_core/types/rec/statement.py b/docling_core/types/rec/statement.py
@@ -21,7 +21,6 @@
 from docling_core.types.rec.attribute import Attribute
 from docling_core.types.rec.subject import Subject
 
-
 class StatementToken(Enum):
     """Class to represent an LLM friendly representation of statements."""