
Commit

Merge branch 'release_v3' into nli/performance
cau-git committed Dec 9, 2024
2 parents 46ae215 + 9e99e24 commit bb1774d
Showing 32 changed files with 906 additions and 161 deletions.
2 changes: 1 addition & 1 deletion .github/mergify.yml
@@ -6,7 +6,7 @@ merge_protections:
success_conditions:
- "title ~=
^(fix|feat|docs|style|refactor|perf|test|build|ci|chore|revert)(?:\\(.+\
\\))?:"
\\))?(!)?:"
- name: Require two reviewer for test updates
description: When test data is updated, we require two reviewers
if:
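For context, the only change to the Mergify rule is the optional `(!)?` group, which lets PR titles carry the conventional-commit breaking-change marker. A quick sketch of what the relaxed pattern accepts (Python `re`, unescaped from the YAML string above; the sample titles are made up):

```python
import re

# Unescaped form of the updated Mergify title check: the new optional "(!)?"
# group admits breaking-change titles such as "feat!: ..." or "feat(core)!: ...".
TITLE_PATTERN = r"^(fix|feat|docs|style|refactor|perf|test|build|ci|chore|revert)(?:\(.+\))?(!)?:"

for title in ["feat: expose hybrid chunker", "feat(core)!: drop legacy backend", "chore: bump version"]:
    print(f"{title!r:40} -> {bool(re.match(TITLE_PATTERN, title))}")
```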
20 changes: 20 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,23 @@
## [v2.9.0](https://github.com/DS4SD/docling/releases/tag/v2.9.0) - 2024-12-09

### Feature

* Expose new hybrid chunker, update docs ([#384](https://github.com/DS4SD/docling/issues/384)) ([`c8ecdd9`](https://github.com/DS4SD/docling/commit/c8ecdd987e80227db3850ea729ecb36d2b609040))
* **MS Word backend:** Make detection of headers and other styles localization agnostic ([#534](https://github.com/DS4SD/docling/issues/534)) ([`3e073df`](https://github.com/DS4SD/docling/commit/3e073dfbebbc65f995d4df946c1650699a26782c))

### Fix

* Correcting DefaultText ID for MS Word backend ([#537](https://github.com/DS4SD/docling/issues/537)) ([`eb7ffcd`](https://github.com/DS4SD/docling/commit/eb7ffcdd1cda1caa8ec8ba2fc313ff1e7d9acd4f))
* Add `py.typed` marker file ([#531](https://github.com/DS4SD/docling/issues/531)) ([`9102fe1`](https://github.com/DS4SD/docling/commit/9102fe1adcd43432e5fb3f35af704b7442c5d633))
* Enable HTML export in CLI and add options for image mode ([#513](https://github.com/DS4SD/docling/issues/513)) ([`0d11e30`](https://github.com/DS4SD/docling/commit/0d11e30dd813020c0189de849cd7b2e285d08694))
* Missing text in docx (t tag) when embedded in a table ([#528](https://github.com/DS4SD/docling/issues/528)) ([`b730b2d`](https://github.com/DS4SD/docling/commit/b730b2d7a04a8773a00ed88889d28b0c476ba052))
* Restore pydantic version pin after fixes ([#512](https://github.com/DS4SD/docling/issues/512)) ([`c830b92`](https://github.com/DS4SD/docling/commit/c830b92b2e043ea63d216f65b3f9d88d2a8c33f7))
* Folder input in cli ([#511](https://github.com/DS4SD/docling/issues/511)) ([`8ada0bc`](https://github.com/DS4SD/docling/commit/8ada0bccc744df94f755adf71cf8b163e6304375))

### Documentation

* Document new integrations ([#532](https://github.com/DS4SD/docling/issues/532)) ([`e780333`](https://github.com/DS4SD/docling/commit/e7803334409a343a59c536c529a03d6f5cdbfe15))

## [v2.8.3](https://github.com/DS4SD/docling/releases/tag/v2.8.3) - 2024-12-03

### Fix
4 changes: 2 additions & 2 deletions README.md
@@ -4,7 +4,7 @@
</a>
</p>

# 🦆 Docling
# Docling

<p align="center">
<a href="https://trendshift.io/repositories/12132" target="_blank"><img src="https://trendshift.io/api/badge/repositories/12132" alt="DS4SD%2Fdocling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
@@ -26,7 +26,7 @@ Docling parses documents and exports them to the desired format with ease and sp

## Features

* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to Markdown and JSON
* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
* 📑 Advanced PDF document understanding including page layout, reading order & table structures
* 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
* 🤖 Easy integration with 🦙 LlamaIndex & 🦜🔗 LangChain for powerful RAG / QA applications
10 changes: 6 additions & 4 deletions docling/backend/docling_parse_v2_backend.py
@@ -210,12 +210,14 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
self.parser = pdf_parser_v2("fatal")

success = False
if isinstance(path_or_stream, BytesIO):
if isinstance(self.path_or_stream, BytesIO):
success = self.parser.load_document_from_bytesio(
self.document_hash, path_or_stream
self.document_hash, self.path_or_stream
)
elif isinstance(self.path_or_stream, Path):
success = self.parser.load_document(
self.document_hash, str(self.path_or_stream)
)
elif isinstance(path_or_stream, Path):
success = self.parser.load_document(self.document_hash, str(path_or_stream))

if not success:
raise RuntimeError(
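Because the removed and added lines are interleaved above, here is the net effect of this fix as a standalone sketch (a hypothetical helper for illustration only; `load_document` and `load_document_from_bytesio` are the parser calls shown in the hunk):

```python
from io import BytesIO
from pathlib import Path
from typing import Union


def load_into_parser(parser, document_hash: str, path_or_stream: Union[BytesIO, Path]) -> bool:
    # Sketch of the corrected branch: dispatch on the stored path_or_stream value,
    # loading either from an in-memory BytesIO stream or from a filesystem path.
    if isinstance(path_or_stream, BytesIO):
        return parser.load_document_from_bytesio(document_hash, path_or_stream)
    elif isinstance(path_or_stream, Path):
        return parser.load_document(document_hash, str(path_or_stream))
    return False
```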
70 changes: 43 additions & 27 deletions docling/backend/msword_backend.py
@@ -1,4 +1,5 @@
import logging
import re
from io import BytesIO
from pathlib import Path
from typing import Set, Union
@@ -133,7 +134,6 @@ def get_level(self) -> int:
def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
for element in body:
tag_name = etree.QName(element).localname

# Check for Inline Images (blip elements)
namespaces = {
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
@@ -153,6 +153,7 @@ def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
self.handle_pictures(element, docx_obj, drawing_blip, doc)
# Check for Text
elif tag_name in ["p"]:
# "tcPr", "sectPr"
self.handle_text_elements(element, docx_obj, doc)
else:
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
@@ -166,6 +167,14 @@ def str_to_int(self, s, default=0):
except ValueError:
return default

def split_text_and_number(self, input_string):
match = re.match(r"(\D+)(\d+)$|^(\d+)(\D+)", input_string)
if match:
parts = list(filter(None, match.groups()))
return parts
else:
return [input_string]

def get_numId_and_ilvl(self, paragraph):
# Access the XML element of the paragraph
numPr = paragraph._element.find(
@@ -188,7 +197,7 @@ def get_numId_and_ilvl(self, paragraph):
def get_label_and_level(self, paragraph):
if paragraph.style is None:
return "Normal", None
label = paragraph.style.name
label = paragraph.style.style_id
if label is None:
return "Normal", None
if ":" in label:
@@ -197,7 +206,7 @@
if len(parts) == 2:
return parts[0], int(parts[1])

parts = label.split(" ")
parts = self.split_text_and_number(label)

if "Heading" in label and len(parts) == 2:
parts.sort()
@@ -219,14 +228,13 @@ def handle_text_elements(self, element, docx_obj, doc):
if paragraph.text is None:
return
text = paragraph.text.strip()
# if len(text)==0 # keep empty paragraphs, they seperate adjacent lists!

# Common styles for bullet and numbered lists.
# "List Bullet", "List Number", "List Paragraph"
# Identify wether list is a numbered list or not
# is_numbered = "List Bullet" not in paragraph.style.name
is_numbered = False
p_style_name, p_level = self.get_label_and_level(paragraph)
p_style_id, p_level = self.get_label_and_level(paragraph)
numid, ilevel = self.get_numId_and_ilvl(paragraph)

if numid == 0:
@@ -238,38 +246,38 @@ def handle_text_elements(self, element, docx_obj, doc):
element,
docx_obj,
doc,
p_style_name,
p_style_id,
p_level,
numid,
ilevel,
text,
is_numbered,
)
self.update_history(p_style_name, p_level, numid, ilevel)
self.update_history(p_style_id, p_level, numid, ilevel)
return
elif numid is None and self.prev_numid() is not None: # Close list
for key, val in self.parents.items():
if key >= self.level_at_new_list:
self.parents[key] = None
self.level = self.level_at_new_list - 1
self.level_at_new_list = None
if p_style_name in ["Title"]:
if p_style_id in ["Title"]:
for key, val in self.parents.items():
self.parents[key] = None
self.parents[0] = doc.add_text(
parent=None, label=DocItemLabel.TITLE, text=text
)
elif "Heading" in p_style_name:
self.add_header(element, docx_obj, doc, p_style_name, p_level, text)
elif "Heading" in p_style_id:
self.add_header(element, docx_obj, doc, p_style_id, p_level, text)

elif p_style_name in [
elif p_style_id in [
"Paragraph",
"Normal",
"Subtitle",
"Author",
"Default Text",
"List Paragraph",
"List Bullet",
"DefaultText",
"ListParagraph",
"ListBullet",
"Quote",
]:
level = self.get_level()
@@ -285,25 +293,21 @@ def handle_text_elements(self, element, docx_obj, doc):
label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
)

self.update_history(p_style_name, p_level, numid, ilevel)
self.update_history(p_style_id, p_level, numid, ilevel)
return

def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str):
level = self.get_level()
if isinstance(curr_level, int):

if curr_level > level:

# add invisible group
for i in range(level, curr_level):
self.parents[i] = doc.add_group(
parent=self.parents[i - 1],
label=GroupLabel.SECTION,
name=f"header-{i}",
)

elif curr_level < level:

# remove the tail
for key, val in self.parents.items():
if key >= curr_level:
@@ -314,7 +318,6 @@ def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str):
text=text,
level=curr_level,
)

else:
self.parents[self.level] = doc.add_heading(
parent=self.parents[self.level - 1],
@@ -328,7 +331,7 @@ def add_listitem(
element,
docx_obj,
doc,
p_style_name,
p_style_id,
p_level,
numid,
ilevel,
@@ -346,7 +349,7 @@
label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
)

# TODO: Set marker and enumerated arguments if this is an enumeration element.
# Set marker and enumerated arguments if this is an enumeration element.
self.listIter += 1
if is_numbered:
enum_marker = str(self.listIter) + "."
@@ -365,8 +368,8 @@
self.level_at_new_list + self.prev_indent() + 1,
self.level_at_new_list + ilevel + 1,
):
# TODO: determine if this is an unordered list or an ordered list.
# Set GroupLabel.ORDERED_LIST when it fits.
# Determine if this is an unordered list or an ordered list.
# Set GroupLabel.ORDERED_LIST when it fits.
self.listIter = 0
if is_numbered:
self.parents[i] = doc.add_group(
@@ -467,6 +470,19 @@ def get_rowspan(cell):
row_span = get_rowspan(cell)
col_span = get_colspan(cell)

cell_text = cell.text
# In case cell doesn't return text via docx library:
if len(cell_text) == 0:
cell_xml = cell._element

texts = [""]
for elem in cell_xml.iter():
if elem.tag.endswith("t"): # <w:t> tags that contain text
if elem.text:
texts.append(elem.text)
# Join the collected text
cell_text = " ".join(texts).strip()

# Find the next available column in the grid
while table_grid[row_idx][col_idx] is not None:
col_idx += 1
@@ -477,15 +493,15 @@ def get_rowspan(cell):
table_grid[row_idx + i][col_idx + j] = ""

cell = TableCell(
text=cell.text,
text=cell_text,
row_span=row_span,
col_span=col_span,
start_row_offset_idx=row_idx,
end_row_offset_idx=row_idx + row_span,
start_col_offset_idx=col_idx,
end_col_offset_idx=col_idx + col_span,
col_header=False, # col_header,
row_header=False, # ((not col_header) and html_cell.name=='th')
col_header=False,
row_header=False,
)

data.table_cells.append(cell)
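Two of the MS Word backend changes above are easier to read in isolation. First, the new `split_text_and_number` helper replaces `label.split(" ")`, so that style IDs such as `Heading1`, which contain no spaces, still yield a heading level. A standalone sketch mirroring the added method:

```python
import re


def split_text_and_number(input_string: str) -> list:
    # Mirrors the helper added above: trailing or leading digits are split off,
    # otherwise the string is returned whole.
    match = re.match(r"(\D+)(\d+)$|^(\d+)(\D+)", input_string)
    if match:
        return list(filter(None, match.groups()))
    return [input_string]


print(split_text_and_number("Heading1"))       # ['Heading', '1']
print(split_text_and_number("2Heading"))       # ['2', 'Heading']
print(split_text_and_number("ListParagraph"))  # ['ListParagraph']
```

Second, the table-cell fallback: when python-docx returns an empty `cell.text`, the cell's raw XML is walked and the text of its `<w:t>` runs is collected. A rough standalone equivalent of the added logic (illustrative only; `cell._element` is the underlying XML node, as used in the hunk):

```python
def extract_cell_text(cell) -> str:
    # Fallback mirroring the hunk above: if python-docx yields no text,
    # collect the text of every <w:t> element under the cell's XML node.
    cell_text = cell.text
    if len(cell_text) == 0:
        texts = []
        for elem in cell._element.iter():
            if elem.tag.endswith("t") and elem.text:  # <w:t> tags that contain text
                texts.append(elem.text)
        cell_text = " ".join(texts).strip()
    return cell_text
```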
12 changes: 12 additions & 0 deletions docling/chunking/__init__.py
@@ -0,0 +1,12 @@
#
# Copyright IBM Corp. 2024 - 2024
# SPDX-License-Identifier: MIT
#

from docling_core.transforms.chunker.base import BaseChunk, BaseChunker, BaseMeta
from docling_core.transforms.chunker.hierarchical_chunker import (
DocChunk,
DocMeta,
HierarchicalChunker,
)
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
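The new `docling.chunking` module is what the v2.9.0 "Expose new hybrid chunker" entry refers to. A minimal usage sketch (illustrative only: the input file name is made up, and chunker options beyond the defaults depend on the docling-core API):

```python
from docling.chunking import HybridChunker
from docling.document_converter import DocumentConverter

# Convert a document, then split it with the newly re-exported HybridChunker.
result = DocumentConverter().convert("example_report.pdf")  # hypothetical input file
chunker = HybridChunker()
for chunk in chunker.chunk(dl_doc=result.document):
    print(chunk.text[:80])
```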
