DS4SD · maxmnemonic · Nov 26, 2024 · Nov 25, 2024 · Nov 25, 2024 · Nov 26, 2024
diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py
@@ -14,7 +14,8 @@
     TableData,
 )
 from lxml import etree
-from PIL import Image
+from lxml.etree import XPath
+from PIL import Image, UnidentifiedImageError
 
 from docling.backend.abstract_backend import DeclarativeDocumentBackend
 from docling.datamodel.base_models import InputFormat
@@ -132,8 +133,14 @@ def get_level(self) -> int:
     def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
         for element in body:
             tag_name = etree.QName(element).localname
+
             # Check for Inline Images (blip elements)
-            drawing_blip = element.xpath(".//a:blip")
+            namespaces = {
+                "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
+                "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
+            }
+            xpath_expr = XPath(".//a:blip", namespaces=namespaces)
+            drawing_blip = xpath_expr(element)
 
             # Check for Tables
             if element.tag.endswith("tbl"):
@@ -210,7 +217,6 @@ def handle_text_elements(self, element, docx_obj, doc):
         paragraph = docx.text.paragraph.Paragraph(element, docx_obj)
 
         if paragraph.text is None:
-            # _log.warn(f"paragraph has text==None")
             return
         text = paragraph.text.strip()
         # if len(text)==0 # keep empty paragraphs, they separate adjacent lists!
@@ -502,10 +508,17 @@ def get_docx_image(element, drawing_blip):
         image_data = get_docx_image(element, drawing_blip)
         image_bytes = BytesIO(image_data)
         # Open the BytesIO object with PIL to create an Image
-        pil_image = Image.open(image_bytes)
-        doc.add_picture(
-            parent=self.parents[self.level],
-            image=ImageRef.from_pil(image=pil_image, dpi=72),
-            caption=None,
-        )
+        try:
+            pil_image = Image.open(image_bytes)
+            doc.add_picture(
+                parent=self.parents[self.level],
+                image=ImageRef.from_pil(image=pil_image, dpi=72),
+                caption=None,
+            )
+        except (UnidentifiedImageError, OSError) as e:
+            _log.warning("Warning: image cannot be loaded by Pillow")
+            doc.add_picture(
+                parent=self.parents[self.level],
+                caption=None,
+            )
         return
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -48,6 +48,7 @@ beautifulsoup4 = "^4.12.3"
 pandas = "^2.1.4"
 marko = "^2.1.2"
 openpyxl = "^3.1.5"
+lxml = ">=4.0.0,<6.0.0"
 ocrmac = { version = "^1.0.0", markers = "sys_platform == 'darwin'", optional = true }
 
 [tool.poetry.group.dev.dependencies]

diff --git a/tests/data/docx/test_emf_docx.docx b/tests/data/docx/test_emf_docx.docx
diff --git a/tests/data/groundtruth/docling_v2/tablecell.docx.itxt b/tests/data/groundtruth/docling_v2/tablecell.docx.itxt
@@ -0,0 +1,10 @@
+item-0 at level 0: unspecified: group _root_
+  item-1 at level 1: list: group list
+    item-2 at level 2: list_item: Hello world1
+    item-3 at level 2: list_item: Hello2
+  item-4 at level 1: paragraph: 
+  item-5 at level 1: paragraph: Some text before
+  item-6 at level 1: table with [3x3]
+  item-7 at level 1: paragraph: 
+  item-8 at level 1: paragraph: 
+  item-9 at level 1: paragraph: Some text after