Skip to content

Commit

Permalink
fix: Missing text in docx (t tag) when embedded in a table (#528)
Browse files Browse the repository at this point in the history
Fix for missing text in docx (t tag) when embedded in a table

Signed-off-by: Maksym Lysak <[email protected]>
Co-authored-by: Maksym Lysak <[email protected]>
  • Loading branch information
maxmnemonic and Maksym Lysak authored Dec 6, 2024
1 parent c830b92 commit b730b2d
Showing 1 changed file with 20 additions and 13 deletions.
33 changes: 20 additions & 13 deletions docling/backend/msword_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,6 @@ def get_level(self) -> int:
def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
for element in body:
tag_name = etree.QName(element).localname

# Check for Inline Images (blip elements)
namespaces = {
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
Expand All @@ -153,6 +152,7 @@ def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
self.handle_pictures(element, docx_obj, drawing_blip, doc)
# Check for Text
elif tag_name in ["p"]:
# "tcPr", "sectPr"
self.handle_text_elements(element, docx_obj, doc)
else:
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
Expand Down Expand Up @@ -219,7 +219,6 @@ def handle_text_elements(self, element, docx_obj, doc):
if paragraph.text is None:
return
text = paragraph.text.strip()
# if len(text)==0 # keep empty paragraphs, they separate adjacent lists!

# Common styles for bullet and numbered lists.
# "List Bullet", "List Number", "List Paragraph"
Expand Down Expand Up @@ -291,19 +290,15 @@ def handle_text_elements(self, element, docx_obj, doc):
def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str):
level = self.get_level()
if isinstance(curr_level, int):

if curr_level > level:

# add invisible group
for i in range(level, curr_level):
self.parents[i] = doc.add_group(
parent=self.parents[i - 1],
label=GroupLabel.SECTION,
name=f"header-{i}",
)

elif curr_level < level:

# remove the tail
for key, val in self.parents.items():
if key >= curr_level:
Expand All @@ -314,7 +309,6 @@ def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str):
text=text,
level=curr_level,
)

else:
self.parents[self.level] = doc.add_heading(
parent=self.parents[self.level - 1],
Expand Down Expand Up @@ -346,7 +340,7 @@ def add_listitem(
label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
)

# TODO: Set marker and enumerated arguments if this is an enumeration element.
# Set marker and enumerated arguments if this is an enumeration element.
self.listIter += 1
if is_numbered:
enum_marker = str(self.listIter) + "."
Expand All @@ -365,8 +359,8 @@ def add_listitem(
self.level_at_new_list + self.prev_indent() + 1,
self.level_at_new_list + ilevel + 1,
):
# TODO: determine if this is an unordered list or an ordered list.
# Set GroupLabel.ORDERED_LIST when it fits.
# Determine if this is an unordered list or an ordered list.
# Set GroupLabel.ORDERED_LIST when it fits.
self.listIter = 0
if is_numbered:
self.parents[i] = doc.add_group(
Expand Down Expand Up @@ -467,6 +461,19 @@ def get_rowspan(cell):
row_span = get_rowspan(cell)
col_span = get_colspan(cell)

cell_text = cell.text
# In case cell doesn't return text via docx library:
if len(cell_text) == 0:
cell_xml = cell._element

texts = [""]
for elem in cell_xml.iter():
if elem.tag.endswith("t"): # <w:t> tags that contain text
if elem.text:
texts.append(elem.text)
# Join the collected text
cell_text = " ".join(texts).strip()

# Find the next available column in the grid
while table_grid[row_idx][col_idx] is not None:
col_idx += 1
Expand All @@ -477,15 +484,15 @@ def get_rowspan(cell):
table_grid[row_idx + i][col_idx + j] = ""

cell = TableCell(
text=cell.text,
text=cell_text,
row_span=row_span,
col_span=col_span,
start_row_offset_idx=row_idx,
end_row_offset_idx=row_idx + row_span,
start_col_offset_idx=col_idx,
end_col_offset_idx=col_idx + col_span,
col_header=False, # col_header,
row_header=False, # ((not col_header) and html_cell.name=='th')
col_header=False,
row_header=False,
)

data.table_cells.append(cell)
Expand Down

0 comments on commit b730b2d

Please sign in to comment.