From 84403798466858abbb530be70cb4063c2a23bc9a Mon Sep 17 00:00:00 2001
From: "Jorj X. McKie" <jorj.x.mckie@outlook.de>
Date: Fri, 8 Mar 2024 16:38:40 -0400
Subject: [PATCH] Several fixes for table module

- Add new method for outputting the table as a markdown string.

- Address errors in computing the table header object:
We now allow None as the cell value, because this will be resolved where needed (e.g. in the pandas DataFrame).

We previously tried to enforce rect-like tuples in all header cell bboxes, but this fails for tables with all-None columns.
This fix allows None header cells and uses an empty string as the corresponding column name.

We now correctly include start / stop points of lines in the bbox of the clustered graphic.
We previously joined the line's rectangle - which had no effect because this is always empty.
---
 docs/page.rst        |  29 ++++++------
 src/table.py         | 103 ++++++++++++++++++++++++-------------------
 tests/test_tables.py |  22 ++++++---
 3 files changed, 89 insertions(+), 65 deletions(-)

diff --git a/docs/page.rst b/docs/page.rst
index c8c3e805f..08888c4e9 100644
--- a/docs/page.rst
+++ b/docs/page.rst
@@ -419,27 +419,28 @@ In a nutshell, this is what you can do with PyMuPDF:
 
       :returns: a `TableFinder` object that has the following significant attributes:
 
-         * **cells:** a list of **all bboxes** on the page, that have been identified as table cells (across all tables). Each cell is a tuple `(x0, y0, x1, y1)` of coordinates or `None`.
-         * **tables:** a list of `Table` objects. This is `[]` if the page has no tables. Single tables can be found as items of this list. But the `TableFinder` object itself is also a sequence of its tables. This means that if `tabs` is a `TableFinder` object, then table "n" is delivered by `tabs.tables[n]` as well as by the shorter `tabs[n]`.
+         * `cells`: a list of **all bboxes** on the page, that have been identified as table cells (across all tables). Each cell is a :data:`rect_like` tuple `(x0, y0, x1, y1)` of coordinates or `None`.
+         * `tables`: a list of `Table` objects. This is `[]` if the page has no tables. Single tables can be found as items of this list. But the `TableFinder` object itself is also a sequence of its tables. This means that if `tabs` is a `TableFinder` object, then table "n" is delivered by `tabs.tables[n]` as well as by the shorter `tabs[n]`.
 
 
          * The `Table` object has the following attributes:
 
-           * **bbox:** the bounding box of the table as a tuple `(x0, y0, x1, y1)`.
-           * **cells:** bounding boxes of the table's cells (list of tuples). A cell may also be `None`.
-           * **extract():** this method returns the text content of each table cell as a list of list of strings.
-           * **to_pandas():** this method returns the table as a `pandas <https://pypi.org/project/pandas/>`_ `DataFrame <https://pandas.pydata.org/docs/reference/frame.html>`_.
-           * **header:** a `TableHeader` object containing header information of the table.
-           * **col_count:** an integer containing the number of table columns.
-           * **row_count:** an integer containing the number of table rows. 
-           * **rows:** a list of `TableRow` objects containing two attributes: *bbox* is the boundary box of the row, and *cells* is a list of table cells contained in this row.
+           * `bbox`: the bounding box of the table as a tuple `(x0, y0, x1, y1)`.
+           * `cells`: bounding boxes of the table's cells (list of tuples). A cell may also be `None`.
+           * `extract()`: this method returns the text content of each table cell as a list of list of strings.
+           * `to_markdown()`: this method returns the table as a **string in markdown format** (compatible with GitHub). Supporting viewers can render the string as a table. This output is optimized for **small token** sizes, which is especially beneficial for LLM/RAG feeds. Pandas DataFrames (see method `to_pandas()` below) offer an equivalent markdown table output which, however, is more readable for the human eye.
+           * `to_pandas()`: this method returns the table as a `pandas <https://pypi.org/project/pandas/>`_ `DataFrame <https://pandas.pydata.org/docs/reference/frame.html>`_. DataFrames are very versatile objects allowing a plethora of table manipulation methods and outputs to almost 20 well-known formats, among them Excel files, CSV, JSON, markdown-formatted tables and more. `DataFrame.to_markdown()` generates a GitHub-compatible markdown format optimized for human readability. This method, however, requires the package `tabulate <https://pypi.org/project/tabulate/>`_ to be installed in addition to pandas itself.
+           * ``header``: a `TableHeader` object containing header information of the table.
+           * `col_count`: an integer containing the number of table columns.
+           * `row_count`: an integer containing the number of table rows. 
+           * `rows`: a list of `TableRow` objects containing two attributes, ``bbox`` is the boundary box of the row, and `cells` is a list of table cells contained in this row.
 
          * The `TableHeader` object has the following attributes:
 
-           * **bbox:** the bounding box of the header.
-           * **cells:** a list of bounding boxes containing the name of the respective column.
-           * **names:** a list of strings containing the text of each of the cell bboxes. They represent the column names -- which can be used when exporting the table to pandas DataFrames or CSV, etc.
-           * **external:** a bool indicating whether the header bbox is outside the table body (`True`) or not. Table headers are never identified by the `TableFinder` logic. Therefore, if *external* is true, then the header cells are not part of any cell identified by `TableFinder`. If `external == False`, then the first table row is the header.
+           * ``bbox``: the bounding box of the header.
+           * `cells`: a list of bounding boxes containing the name of the respective column.
+           * `names`: a list of strings containing the text of each of the cell bboxes. They represent the column names -- which are used when exporting the table to pandas DataFrames, markdown, etc.
+           * `external`: a bool indicating whether the header bbox is outside the table body (`True`) or not. Table headers are never identified by the `TableFinder` logic. Therefore, if `external` is true, then the header cells are not part of any cell identified by `TableFinder`. If `external == False`, then the first table row is the header.
 
          Please have a look at these `Jupyter notebooks <https://github.com/pymupdf/PyMuPDF-Utilities/tree/master/table-analysis>`_, which cover standard situations like multiple tables on one page or joining table fragments across multiple pages.
 
diff --git a/src/table.py b/src/table.py
index d95aee933..aa0bdae7c 100644
--- a/src/table.py
+++ b/src/table.py
@@ -1365,6 +1365,46 @@ def char_in_bbox(char, bbox) -> bool:
 
         return table_arr
 
+    def to_markdown(self, clean=True):
+        """Return the table content as a string in GitHub-markdown format.
+
+        If clean is true, "<", ">" and "-" in the text become HTML entities."""
+        output = "|"
+
+        # build the header line; the "---" separator line is added after it
+        for i, name in enumerate(self.header.names):
+            if name is None or name == "":  # missing name: use "Col<n>" (1-based)
+                name = f"Col{i+1}"
+            name = name.replace("\n", " ")  # replace line breaks by spaces
+            if clean:  # escape sensitive characters as HTML entities
+                name = (
+                    name.replace("<", "&lt;").replace(">", "&gt;").replace("-", "&#45;")
+                )
+            output += name + "|"
+
+        output += "\n"
+        output += "|" + "|".join("---" for i in range(self.col_count)) + "|\n"
+
+        # an internal header is row 0 of the extract: do not repeat it as detail
+        j = 0 if self.header.external else 1
+
+        # one markdown line per detail row
+        for row in self.extract()[j:]:
+            line = "|"
+            for i, cell in enumerate(row):
+                # a None cell is output as an empty string
+                cell = "" if cell is None else cell.replace("\n", " ")
+                if clean:  # same escaping as for the header names
+                    cell = (
+                        cell.replace("<", "&lt;")
+                        .replace(">", "&gt;")
+                        .replace("-", "&#45;")
+                    )
+                line += cell + "|"
+            line += "\n"
+            output += line
+        return output + "\n"
+
     def to_pandas(self, **kwargs):
         """Return a pandas DataFrame version of the table."""
         try:
@@ -1443,43 +1483,6 @@ def top_row_is_bold(bbox):
                             return True
             return False
 
-        def recover_top_row_cells(table):
-            """Recreates top row cells if 'None' columns are present.
-
-            We need all column x-coordinates even when the top table row
-            contains None cells.
-            """
-            bbox = Rect(table.rows[0].bbox)  # top row bbox
-            tbbox = Rect(table.bbox)  # table bbox
-            y0, y1 = bbox.y0, bbox.y1  # top row upper / lower coordinates
-
-            # make sure row0 bbox has the full table width
-            bbox.x0 = tbbox.x0
-            bbox.x1 = tbbox.x1
-
-            l_r = set()  # (x0, x1) pairs for all table cells
-            for cell in table.cells:
-                if cell is None:  # skip non-existing cells
-                    continue
-                cellbb = Rect(cell)
-
-                # only accept cells wider than a character
-                if 10 < cellbb.width < tbbox.width:
-                    l_r.add((cell[0], cell[2]))
-
-            # sort (x0, x1) pairs by x0-values
-            l_r = sorted(list(l_r), key=lambda c: c[0])
-            if not l_r:
-                return [], (0, 0, 0, 0)
-
-            # recovered row 0 cells
-            cells = [(l_r[0][0], y0, l_r[0][1], y1)]
-
-            for x0, x1 in l_r[1:]:
-                if x0 >= cells[-1][2]:
-                    cells.append((x0, y0, x1, y1))
-            return cells, bbox
-
         try:
             row = self.rows[0]
             cells = row.cells
@@ -1487,9 +1490,6 @@ def recover_top_row_cells(table):
         except IndexError:  # this table has no rows
             return None
 
-        if None in cells:  # if row 0 has empty cells, repair it
-            cells, bbox = recover_top_row_cells(self)
-
         # return this if we determine that the top row is the header
         header_top_row = TableHeader(bbox, cells, self.extract()[0], False)
 
@@ -1501,7 +1501,9 @@ def recover_top_row_cells(table):
         if len(cells) < 2:
             return header_top_row
 
-        col_x = [c[2] for c in cells[:-1]]  # column (x) coordinates
+        col_x = [
+            c[2] if c is not None else None for c in cells[:-1]
+        ]  # column (x) coordinates
 
         # Special check: is top row bold?
         # If first line above table is not bold, but top-left table cell is bold,
@@ -1600,6 +1602,7 @@ def recover_top_row_cells(table):
             intersecting = [
                 (x, r)
                 for x in col_x
+                if x is not None
                 for r in word_rects
                 if r[1] == top and r[0] < x and r[2] > x
             ]
@@ -1613,15 +1616,22 @@ def recover_top_row_cells(table):
 
         hdr_bbox = +clip  # compute the header cells
         hdr_bbox.y0 = select[-1]  # hdr_bbox top is smallest top coord of words
-        hdr_cells = [(c[0], hdr_bbox.y0, c[2], hdr_bbox.y1) for c in cells]
+        hdr_cells = [
+            (c[0], hdr_bbox.y0, c[2], hdr_bbox.y1) if c is not None else None
+            for c in cells
+        ]
 
         # adjust left/right of header bbox
-        hdr_bbox.x0 = hdr_cells[0][0]
-        hdr_bbox.x1 = hdr_cells[-1][2]
+        hdr_bbox.x0 = self.bbox[0]
+        hdr_bbox.x1 = self.bbox[2]
 
         # column names: no line breaks, no excess spaces
         hdr_names = [
-            page.get_textbox(c).replace("\n", " ").replace("  ", " ").strip()
+            (
+                page.get_textbox(c).replace("\n", " ").replace("  ", " ").strip()
+                if c is not None
+                else ""
+            )
             for c in hdr_cells
         ]
         return TableHeader(tuple(hdr_bbox), hdr_cells, hdr_names, True)
@@ -2012,7 +2022,8 @@ def clean_graphics():
                 repeat = False  # set to true again if some other rect touches
                 for i in range(len(prects) - 1, 0, -1):  # run backwards
                     if are_neighbors(prect0, prects[i]):  # close enough to rect 0?
-                        prect0 |= prects[i]  # extend rect 0
+                        prect0 |= prects[i].tl  # extend rect 0
+                        prect0 |= prects[i].br  # extend rect 0
                         del prects[i]  # delete this rect
                         repeat = True  # keep checking the rest
 
diff --git a/tests/test_tables.py b/tests/test_tables.py
index e4b684a2a..a785042ba 100644
--- a/tests/test_tables.py
+++ b/tests/test_tables.py
@@ -15,11 +15,23 @@ def test_table1():
     doc = fitz.open(filename)
     page = doc[0]
     tabs = page.find_tables()
-    cells = [tabs[0].cells, tabs[1].cells]
-    extracts = [tabs[0].extract(), tabs[1].extract()]
-    new_data = {"cells": cells, "extracts": extracts}
-    old_data = pickle.load(pickle_in)
-    assert old_data == new_data
+    cells = tabs[0].cells + tabs[1].cells  # all table cell tuples on page
+    extracts = [tabs[0].extract(), tabs[1].extract()]  # all table cell content
+    old_data = pickle.load(pickle_in)  # previously saved data
+
+    # Compare cell contents
+    assert old_data["extracts"] == extracts  # same cell contents
+
+    # Compare cell coordinates.
+    # Cell rectangles may get somewhat larger due to more cautious border
+    # computations, but any differences must be small.
+    old_cells = old_data["cells"][0] + old_data["cells"][1]
+    assert len(cells) == len(old_cells)
+    for i in range(len(cells)):
+        c1 = fitz.Rect(cells[i])  # new cell coordinates
+        c0 = fitz.Rect(old_cells[i])  # old cell coordinates
+        assert c0 in c1  # always: old contained in new
+        assert abs(c1 - c0) < 0.2  # difference must be small
 
 
 def test_table2():