Skip to content

Commit

Permalink
fix: Added handling of grouped elements in pptx backend (#307)
Browse files: browse the repository at this point in the history
* Added handling of grouped elements in pptx backend

Signed-off-by: Maksym Lysak <[email protected]>

* Updated log.warn to log.warning

Signed-off-by: Maksym Lysak <[email protected]>

---------

Signed-off-by: Maksym Lysak <[email protected]>
Co-authored-by: Maksym Lysak <[email protected]>
  • Loading branch information
maxmnemonic and Maksym Lysak authored Nov 11, 2024
1 parent 53bf2d1 commit 81c8243
Showing 1 changed file with 17 additions and 22 deletions.
39 changes: 17 additions & 22 deletions docling/backend/mspowerpoint_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,41 +358,36 @@ def walk_linear(self, pptx_obj, doc) -> DoclingDocument:

size = Size(width=slide_width, height=slide_height)
parent_page = doc.add_page(page_no=slide_ind + 1, size=size)
# parent_page = doc.add_page(page_no=slide_ind, size=size, hash=hash)

# Loop through each shape in the slide
for shape in slide.shapes:

def handle_shapes(shape, parent_slide, slide_ind, doc):
    """Dispatch a single slide shape to the matching backend handler.

    Group shapes are expanded first through ``handle_groups`` so that
    their members are processed as well; afterwards the table, picture
    and text handlers are applied to ``shape`` itself.

    This is a closure: ``self`` is captured from the enclosing
    ``walk_linear`` method rather than passed in.

    Args:
        shape: The shape to process.
        parent_slide: Forwarded unchanged to every handler.
        slide_ind: Forwarded unchanged to every handler.
        doc: Forwarded unchanged to every handler.
    """
    handle_groups(shape, parent_slide, slide_ind, doc)

    if shape.has_table:
        # Handle Tables
        self.handle_tables(shape, parent_slide, slide_ind, doc)

    if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
        # Handle Pictures
        self.handle_pictures(shape, parent_slide, slide_ind, doc)

    # If the shape doesn't carry any (non-blank) text, there is nothing
    # left to do for it. `return` is used instead of `continue` because
    # this code runs inside a function, not directly in the shape loop.
    text = getattr(shape, "text", None)
    if text is None or not text.strip():
        return
    if not shape.has_text_frame:
        _log.warning("Warning: shape has text but not text_frame")
        return

    # Handle other text elements, including lists (bullet lists, numbered lists)
    self.handle_text_elements(shape, parent_slide, slide_ind, doc)

def handle_groups(shape, parent_slide, slide_ind, doc):
    """Process every member of a group shape.

    Shapes whose ``shape_type`` is not ``MSO_SHAPE_TYPE.GROUP`` are left
    untouched. For a group, each contained shape is passed to
    ``handle_shapes`` together with the unchanged slide and document
    arguments.
    """
    if shape.shape_type != MSO_SHAPE_TYPE.GROUP:
        return
    for member in shape.shapes:
        handle_shapes(member, parent_slide, slide_ind, doc)

# figures...
# doc.add_figure(data=BaseFigureData(), parent=self.parents[self.level], caption=None)
# Loop through each shape in the slide
for shape in slide.shapes:
handle_shapes(shape, parent_slide, slide_ind, doc)

return doc

0 comments on commit 81c8243

Please sign in to comment.