Skip to content

Commit

Permalink
feat: refactoring doc-tokens in new file and add new export function …
Browse files Browse the repository at this point in the history
…to table

Signed-off-by: Peter Staar <[email protected]>
  • Loading branch information
PeterStaar-IBM committed Sep 18, 2024
1 parent 9b8e97b commit 17f54bf
Show file tree
Hide file tree
Showing 4 changed files with 156 additions and 102 deletions.
25 changes: 25 additions & 0 deletions docling_core/types/doc/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,31 @@ def export_to_html(self) -> str:

return body

def export_to_document_tokens(
    self, new_line: str = "\n", loc_str: str = ""
) -> str:
    """Serialize this table as a document-token string.

    The output starts with the table begin token followed by ``loc_str``.
    If ``self.text`` is non-empty it is emitted next, wrapped in caption
    tokens. Every row of ``self.data`` is then written as
    ``<row_i>...</row_i>`` containing one ``<col_j>text</col_j>`` entry per
    cell, and the output ends with the table end token.

    Args:
        new_line: Separator appended after the caption, after each row and
            after the closing table token.
        loc_str: Pre-rendered string inserted right after the opening
            table token.

    Returns:
        The token representation of the table.
    """
    # NOTE(review): relies on DocumentToken being in scope in this module;
    # confirm it is imported from docling_core.types.doc.tokens.
    parts = [f"{DocumentToken.BEG_TABLE.value}{loc_str}"]

    if self.text is not None and len(self.text) > 0:
        parts.append(
            f"{DocumentToken.BEG_CAPTION.value}"
            f"{self.text}{DocumentToken.END_CAPTION.value}{new_line}"
        )

    if self.data is not None and len(self.data) > 0:
        for i, row in enumerate(self.data):
            parts.append(f"<row_{i}>")
            parts.extend(
                f"<col_{j}>{col.text}</col_{j}>" for j, col in enumerate(row)
            )
            parts.append(f"</row_{i}>{new_line}")

    # Close the table with the END token; the previous code emitted the
    # BEG token a second time, leaving the table unterminated.
    parts.append(f"{DocumentToken.END_TABLE.value}{new_line}")

    return "".join(parts)


# FIXME: let's add some figure specific data-types later
class Figure(BaseCell):
Expand Down
106 changes: 5 additions & 101 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
from enum import Enum
from typing import Generic, Optional, Tuple, Union



from pydantic import (
AnyHttpUrl,
BaseModel,
Expand All @@ -32,6 +34,9 @@
LanguageT,
Log,
)

from docling_core.types.doc.tokens import DocumentToken

from docling_core.types.doc.base import (
BaseCell,
BaseText,
Expand Down Expand Up @@ -347,107 +352,6 @@ def from_dict(cls, data):
return data


class DocumentToken(Enum):
    """Markup tokens for an LLM friendly representation of a Document."""

    # Document envelope
    BEG_DOCUMENT = "<document>"
    END_DOCUMENT = "</document>"

    # Front matter
    BEG_TITLE = "<title>"
    END_TITLE = "</title>"

    BEG_ABSTRACT = "<abstract>"
    END_ABSTRACT = "</abstract>"

    BEG_DOI = "<doi>"
    END_DOI = "</doi>"
    BEG_DATE = "<date>"
    END_DATE = "</date>"

    BEG_AUTHORS = "<authors>"
    END_AUTHORS = "</authors>"
    BEG_AUTHOR = "<author>"
    END_AUTHOR = "</author>"

    BEG_AFFILIATIONS = "<affiliations>"
    END_AFFILIATIONS = "</affiliations>"
    BEG_AFFILIATION = "<affiliation>"
    END_AFFILIATION = "</affiliation>"

    # Body elements
    BEG_HEADER = "<section-header>"
    END_HEADER = "</section-header>"
    BEG_TEXT = "<text>"
    END_TEXT = "</text>"
    BEG_PARAGRAPH = "<paragraph>"
    END_PARAGRAPH = "</paragraph>"
    BEG_TABLE = "<table>"
    END_TABLE = "</table>"
    BEG_FIGURE = "<figure>"
    END_FIGURE = "</figure>"
    BEG_CAPTION = "<caption>"
    END_CAPTION = "</caption>"
    BEG_EQUATION = "<equation>"
    END_EQUATION = "</equation>"
    BEG_LIST = "<list>"
    END_LIST = "</list>"
    BEG_LISTITEM = "<list-item>"
    END_LISTITEM = "</list-item>"

    # Location and grouping
    BEG_LOCATION = "<location>"
    END_LOCATION = "</location>"
    BEG_GROUP = "<group>"
    END_GROUP = "</group>"

    @classmethod
    def get_special_tokens(
        cls,
        max_rows: int = 100,
        max_cols: int = 100,
        max_pages: int = 1000,
        page_dimension: Tuple[int, int] = (100, 100),
    ):
        """Return every special token as a flat list of strings.

        The list holds the enum values in declaration order, followed by
        the generated row, column, section-header, page and location
        tokens (all index ranges include their upper bound).
        """
        tokens = [member.value for member in cls]

        # Open/close pairs for table rows and columns.
        for row in range(max_rows + 1):
            tokens.extend((f"<row_{row}>", f"</row_{row}>"))

        for col in range(max_cols + 1):
            tokens.extend((f"<col_{col}>", f"</col_{col}>"))

        # Open/close pairs for the six section-header levels.
        for level in range(6):
            tokens.extend(
                (f"<section-header-{level}>", f"</section-header-{level}>")
            )

        # Page tokens have no closing counterpart.
        tokens.extend(f"<page_{page}>" for page in range(max_pages + 1))

        # One location token per coordinate up to the larger page dimension.
        n_locations = max(page_dimension[0] + 1, page_dimension[1] + 1)
        tokens.extend(f"<loc_{pos}>" for pos in range(n_locations))

        return tokens

    @staticmethod
    def get_page_token(page: int):
        """Return the token for page number ``page``."""
        return f"<page_{page}>"

    @staticmethod
    def get_location_token(val: float, rnorm: int = 100):
        """Return the location token for ``val`` scaled by ``rnorm``.

        ``rnorm * val`` is rounded; results below 0 map to ``<loc_0>`` and
        results above ``rnorm`` map to ``<loc_{rnorm}>``.
        """
        scaled = round(rnorm * val)

        if scaled < 0:
            clipped = 0
        elif scaled > rnorm:
            clipped = rnorm
        else:
            clipped = scaled

        return f"<loc_{clipped}>"


class ExportedCCSDocument(
MinimalDocument,
Generic[
Expand Down
126 changes: 126 additions & 0 deletions docling_core/types/doc/tokens.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
#
# Copyright IBM Corp. 2024 - 2024
# SPDX-License-Identifier: MIT
#

"""Tokens used in the docling document model."""

from enum import Enum
from typing import Tuple

class DocumentToken(Enum):
    """Class to represent an LLM friendly representation of a Document.

    Each member's value is the literal markup token. Indexed tokens (rows,
    columns, pages, locations) are produced by the helper methods below.
    """

    BEG_DOCUMENT = "<document>"
    END_DOCUMENT = "</document>"

    BEG_TITLE = "<title>"
    END_TITLE = "</title>"

    BEG_ABSTRACT = "<abstract>"
    END_ABSTRACT = "</abstract>"

    BEG_DOI = "<doi>"
    END_DOI = "</doi>"
    BEG_DATE = "<date>"
    END_DATE = "</date>"

    BEG_AUTHORS = "<authors>"
    END_AUTHORS = "</authors>"
    BEG_AUTHOR = "<author>"
    END_AUTHOR = "</author>"

    BEG_AFFILIATIONS = "<affiliations>"
    END_AFFILIATIONS = "</affiliations>"
    BEG_AFFILIATION = "<affiliation>"
    END_AFFILIATION = "</affiliation>"

    BEG_HEADER = "<section-header>"
    END_HEADER = "</section-header>"
    BEG_TEXT = "<text>"
    END_TEXT = "</text>"
    BEG_PARAGRAPH = "<paragraph>"
    END_PARAGRAPH = "</paragraph>"
    BEG_TABLE = "<table>"
    END_TABLE = "</table>"
    BEG_FIGURE = "<figure>"
    END_FIGURE = "</figure>"
    BEG_CAPTION = "<caption>"
    END_CAPTION = "</caption>"
    BEG_EQUATION = "<equation>"
    END_EQUATION = "</equation>"
    BEG_LIST = "<list>"
    END_LIST = "</list>"
    BEG_LISTITEM = "<list-item>"
    END_LISTITEM = "</list-item>"

    BEG_LOCATION = "<location>"
    END_LOCATION = "</location>"
    BEG_GROUP = "<group>"
    END_GROUP = "</group>"

    @classmethod
    def get_special_tokens(
        cls,
        max_rows: int = 100,
        max_cols: int = 100,
        max_pages: int = 1000,
        page_dimension: Tuple[int, int] = (100, 100),
    ):
        """Function to get all special document tokens.

        Args:
            max_rows: Highest row index to generate row tokens for
                (inclusive).
            max_cols: Highest column index to generate column tokens for
                (inclusive).
            max_pages: Highest page index to generate page tokens for
                (inclusive).
            page_dimension: Pair of sizes; location tokens are generated
                for ``0 .. max(page_dimension)`` inclusive.

        Returns:
            A list of strings: the enum values in declaration order,
            followed by the row, column, section-header, page and location
            tokens.
        """
        special_tokens = [token.value for token in cls]

        # Adding dynamically generated row and col tokens
        for i in range(0, max_rows + 1):
            special_tokens += [f"<row_{i}>", f"</row_{i}>"]

        for i in range(0, max_cols + 1):
            special_tokens += [f"<col_{i}>", f"</col_{i}>"]

        # Six section-header levels, each with an open/close pair
        for i in range(6):
            special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]

        # Adding dynamically generated page-tokens
        for i in range(0, max_pages + 1):
            special_tokens.append(f"<page_{i}>")

        # Adding dynamically generated location-tokens
        for i in range(0, max(page_dimension[0] + 1, page_dimension[1] + 1)):
            special_tokens.append(f"<loc_{i}>")

        return special_tokens

    @staticmethod
    def get_row_token(row: int, beg: bool = True) -> str:
        """Function to get row tokens.

        Args:
            row: Row index embedded in the token.
            beg: When true return the opening token, otherwise the closing
                one. Defaults to True, which matches the truthy default the
                previous signature had.
        """
        if beg:
            return f"<row_{row}>"
        return f"</row_{row}>"

    @staticmethod
    def get_col_token(col: int, beg: bool = True) -> str:
        """Function to get column tokens.

        Args:
            col: Column index embedded in the token.
            beg: When true return the opening token, otherwise the closing
                one. Defaults to True, which matches the truthy default the
                previous signature had.
        """
        if beg:
            return f"<col_{col}>"
        return f"</col_{col}>"

    @staticmethod
    def get_page_token(page: int) -> str:
        """Function to get page tokens."""
        return f"<page_{page}>"

    @staticmethod
    def get_location_token(val: float, rnorm: int = 100) -> str:
        """Function to get location tokens.

        ``rnorm * val`` is rounded to an integer; results below 0 yield
        ``<loc_0>`` and results above ``rnorm`` yield ``<loc_{rnorm}>``.
        """
        val_ = round(rnorm * val)

        if val_ < 0:
            return "<loc_0>"

        if val_ > rnorm:
            return f"<loc_{rnorm}>"

        return f"<loc_{val_}>"


1 change: 0 additions & 1 deletion docling_core/types/rec/statement.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
from docling_core.types.rec.attribute import Attribute
from docling_core.types.rec.subject import Subject


class StatementToken(Enum):
"""Class to represent an LLM friendly representation of statements."""

Expand Down

0 comments on commit 17f54bf

Please sign in to comment.