From 29e64eb281a720ea1dca57ae68bdf958755ecc34 Mon Sep 17 00:00:00 2001
From: Pawel Kmiecik <pawel.kmiecik@deepsense.ai>
Date: Fri, 14 Jun 2024 11:03:27 +0200
Subject: [PATCH] feat: table evaluations for fixed html table generation
 (#3196)

Update the evaluation script to handle the corrected HTML syntax for
tables.
See https://github.com/Unstructured-IO/unstructured-inference/pull/355
for details.

This change:
- modifies how HTML tables are transformed into the evaluation's
internal `cells` format
- fixes the indexing of the output (internal format cells) when HTML
cells use spans
---
 CHANGELOG.md                                  |   3 +-
 .../metrics/test_table_structure.py           | 116 +++++-
 .../metrics/test_text_extraction.py           | 336 +++++++++++++++++-
 unstructured/__version__.py                   |   2 +-
 .../metrics/table/table_extraction.py         |  91 +++--
 5 files changed, 487 insertions(+), 61 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7a688d1bd4..da5e783c16 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.14.6-dev5
+## 0.14.6-dev6
 
 ### Enhancements
 
@@ -13,6 +13,7 @@
 * **table metric bug fix** get_element_level_alignment()now will find all the matched indices in predicted table data instead of only returning the first match in the case of multiple matches for the same gt string.
 * **fsspec connector path/permissions bug** V2 fsspec connectors were failing when defined relative filepaths had leading slash. This strips that slash to guarantee the relative path never has it.
 * **Dropbox connector internal file path bugs** Dropbox source connector currently raises exceptions when indexing files due to two issues: a path formatting idiosyncrasy of the Dropbox library and a divergence in the definition of the Dropbox libraries fs.info method, expecting a 'url' parameter rather than 'path'.
+* **update table metric evaluation to handle corrected HTML syntax for tables** This change is connected to the [unstructured-inference update](https://github.com/Unstructured-IO/unstructured-inference/pull/355). It fixes the transformation of HTML tables to the Deckerd and internal cells formats.
 
 ## 0.14.5
 
diff --git a/test_unstructured/metrics/test_table_structure.py b/test_unstructured/metrics/test_table_structure.py
index 332a64b326..def97b3792 100644
--- a/test_unstructured/metrics/test_table_structure.py
+++ b/test_unstructured/metrics/test_table_structure.py
@@ -33,7 +33,7 @@ def test_table_eval_processor_simple():
         {
             "type": "Table",
             "metadata": {
-                "text_as_html": """<table><thead><th>r1c1</th><th>r1c2</th></thead>
+                "text_as_html": """<table><thead><tr><th>r1c1</th><th>r1c2</th></tr></thead>
                     <tbody><tr><td>r2c1</td><td>r2c2</td></tr></tbody></table>"""
             },
         }
@@ -201,14 +201,62 @@ def test_table_eval_processor_when_wrong_source_type():
 @pytest.mark.parametrize(
     "text_as_html",
     [
-        """<table><thead><th>r1c1</th><th>r1c2</th></thead>
-            <tbody><tr><td>r2c1</td><td>r2c2</td></tr><tr><td>r3c1</td>
-            <td>r3c2</td></tr></tbody></table>""",
-        """<table><tr><th>r1c1</th><th>r1c2</th></tr>
-            <tbody><tr><td>r2c1</td><td>r2c2</td></tr><tr><td>r3c1</td>
-            <td>r3c2</td></tr></tbody></table>""",
-        """<table><tr><td>r1c1</td><td>r1c2</td></tr><tr><td>r2c1</td>
-            <td>r2c2</td></tr><tr><td>r3c1</td><td>r3c2</td></tr></tbody></table>""",
+        """
+<table>
+    <thead>
+        <tr>
+            <th>r1c1</th>
+            <th>r1c2</th>
+        </tr>
+    </thead>
+    <tbody>
+        <tr>
+            <td>r2c1</td>
+            <td>r2c2</td>
+        </tr>
+        <tr>
+            <td>r3c1</td>
+            <td>r3c2</td>
+        </tr>
+    </tbody>
+</table>
+""",
+        """
+<table>
+    <tr>
+        <th>r1c1</th>
+        <th>r1c2</th>
+    </tr>
+    <tbody>
+        <tr>
+            <td>r2c1</td>
+            <td>r2c2</td>
+        </tr>
+        <tr>
+            <td>r3c1</td>
+            <td>r3c2</td>
+        </tr>
+    </tbody>
+</table>
+""",
+        """
+<table>
+    </tbody>
+        <tr>
+            <td>r1c1</td>
+            <td>r1c2</td>
+        </tr>
+        <tr>
+            <td>r2c1</td>
+            <td>r2c2</td>
+        </tr>
+        <tr>
+            <td>r3c1</td>
+            <td>r3c2</td>
+        </tr>
+    </tbody>
+</table>
+""",
     ],
 )
 def test_table_eval_processor_various_table_html_structures(text_as_html):
@@ -285,8 +333,21 @@ def test_table_eval_processor_non_str_values_in_table():
         {
             "type": "Table",
             "metadata": {
-                "text_as_html": """<table><thead><th>11</th><th>12</th></thead>
-                    <tbody><tr><td>21</td><td>22</td></tr></tbody></table>"""
+                "text_as_html": """
+<table>
+    <thead>
+        <tr>
+            <th>11</th>
+            <th>12</th>
+        </tr>
+    </thead>
+    <tbody>
+        <tr>
+            <td>21</td>
+            <td>22</td>
+        </tr>
+    </tbody>
+</table>"""
             },
         }
     ]
@@ -341,19 +402,38 @@ def test_table_eval_processor_non_str_values_in_table():
     assert result.element_col_level_content_acc == 1.0
 
 
-@pytest.mark.xfail(
-    reason="This is expected to fail as table eval metrics does not cover merged cells"
-)
 def test_table_eval_processor_merged_cells():
     prediction = [
         {
             "type": "Table",
             "metadata": {
                 "text_as_html": """
-                <table><thead><th rowspan="2">r1c1</th><th>r1c2</th><th colspan="2">r1c3</th></tr>
-                <tr><th>r2c2</th><th>r2c3</th><th>r2c4</th><</thead>
-                <tbody><tr><td>r3c1</td><td>r3c2</td><td colspan="2" rowspan="2">r3c3</td></tr>
-                <tr><td>r4c1</td><td>r4c2</td></tr></tbody></table>"""
+<table>
+    <thead>
+        <tr>
+            <th rowspan="2">r1c1</th>
+            <th>r1c2</th>
+            <th colspan="2">r1c3</th>
+        </tr>
+        <tr>
+            <th>r2c2</th>
+            <th>r2c3</th>
+            <th>r2c4</th>
+        </tr>
+    </thead>
+    <tbody>
+        <tr>
+            <td>r3c1</td>
+            <td>r3c2</td>
+            <td colspan="2" rowspan="2">r3c3</td>
+        </tr>
+        <tr>
+            <td>r4c1</td>
+            <td>r4c2</td>
+        </tr>
+    </tbody>
+</table>
+"""
             },
         }
     ]
diff --git a/test_unstructured/metrics/test_text_extraction.py b/test_unstructured/metrics/test_text_extraction.py
index 6e3d58babb..a15c2a5e8b 100644
--- a/test_unstructured/metrics/test_text_extraction.py
+++ b/test_unstructured/metrics/test_text_extraction.py
@@ -159,7 +159,7 @@ def test_calculate_edit_distance_with_filename(filename, expected_score, expecte
         ),
         (
             """Sometimes sentences have a dash - like this one!
-                A hyphen connects 2 words with no gap: easy-peasy.""",
+                    A hyphen connects 2 words with no gap: easy-peasy.""",
             {
                 "sometimes": 1,
                 "sentences": 1,
@@ -222,24 +222,334 @@ def test_calculate_percent_missing_text(output_text, source_text, expected_perce
     )
 
 
-def test_cells_extraction_from_prediction_when_simple_example():
-    example_element = {
-        "type": "Table",
-        "metadata": {
-            "text_as_html": "<table><thead><th>Month A.</th></thead><tr><td>22</td></tr></table>",
-            "table_as_cells": [
+@pytest.mark.parametrize(
+    ("table_as_cells", "expected_extraction"),
+    [
+        pytest.param(
+            [
                 {"x": 0, "y": 0, "w": 1, "h": 1, "content": "Month A."},
                 {"x": 0, "y": 1, "w": 1, "h": 1, "content": "22"},
             ],
-        },
+            [
+                {"row_index": 0, "col_index": 0, "content": "Month A."},
+                {"row_index": 1, "col_index": 0, "content": "22"},
+            ],
+            id="Simple table, 1 head cell, 1 body cell, no spans",
+        ),
+        pytest.param(
+            [
+                {"x": 0, "y": 0, "w": 1, "h": 1, "content": "Month A."},
+                {"x": 1, "y": 0, "w": 1, "h": 1, "content": "Month B."},
+                {"x": 2, "y": 0, "w": 1, "h": 1, "content": "Month C."},
+                {"x": 0, "y": 1, "w": 1, "h": 1, "content": "11"},
+                {"x": 1, "y": 1, "w": 1, "h": 1, "content": "12"},
+                {"x": 2, "y": 1, "w": 1, "h": 1, "content": "13"},
+                {"x": 0, "y": 2, "w": 1, "h": 1, "content": "21"},
+                {"x": 1, "y": 2, "w": 1, "h": 1, "content": "22"},
+                {"x": 2, "y": 2, "w": 1, "h": 1, "content": "23"},
+            ],
+            [
+                {"row_index": 0, "col_index": 0, "content": "Month A."},
+                {"row_index": 0, "col_index": 1, "content": "Month B."},
+                {"row_index": 0, "col_index": 2, "content": "Month C."},
+                {"row_index": 1, "col_index": 0, "content": "11"},
+                {"row_index": 1, "col_index": 1, "content": "12"},
+                {"row_index": 1, "col_index": 2, "content": "13"},
+                {"row_index": 2, "col_index": 0, "content": "21"},
+                {"row_index": 2, "col_index": 1, "content": "22"},
+                {"row_index": 2, "col_index": 2, "content": "23"},
+            ],
+            id="Simple table, 3 head cell, 5 body cell, no spans",
+        ),
+        # +----------+---------------------+----------+
+        # |          |       h1col23       |  h1col4  |
+        # | h12col1  |----------+----------+----------|
+        # |          |  h2col2  |       h2col34       |
+        # |----------|----------+----------+----------+
+        # |  r3col1  |  r3col2  |                     |
+        # |----------+----------|      r34col34       |
+        # |       r4col12       |                     |
+        # +----------+----------+----------+----------+
+        pytest.param(
+            [
+                {
+                    "y": 0,
+                    "x": 0,
+                    "w": 2,
+                    "h": 1,
+                    "content": "h12col1",
+                },
+                {
+                    "y": 0,
+                    "x": 1,
+                    "w": 1,
+                    "h": 2,
+                    "content": "h1col23",
+                },
+                {
+                    "y": 0,
+                    "x": 3,
+                    "w": 1,
+                    "h": 1,
+                    "content": "h1col4",
+                },
+                {
+                    "y": 1,
+                    "x": 1,
+                    "w": 1,
+                    "h": 1,
+                    "content": "h2col2",
+                },
+                {
+                    "y": 1,
+                    "x": 2,
+                    "w": 1,
+                    "h": 2,
+                    "content": "h2col34",
+                },
+                {
+                    "y": 2,
+                    "x": 0,
+                    "w": 1,
+                    "h": 1,
+                    "content": "r3col1",
+                },
+                {
+                    "y": 2,
+                    "x": 1,
+                    "w": 1,
+                    "h": 1,
+                    "content": "r3col2",
+                },
+                {
+                    "y": 2,
+                    "x": 2,
+                    "w": 2,
+                    "h": 2,
+                    "content": "r34col34",
+                },
+                {
+                    "y": 3,
+                    "x": 0,
+                    "w": 1,
+                    "h": 2,
+                    "content": "r4col12",
+                },
+            ],
+            [
+                {
+                    "row_index": 0,
+                    "col_index": 0,
+                    "content": "h12col1",
+                },
+                {
+                    "row_index": 0,
+                    "col_index": 1,
+                    "content": "h1col23",
+                },
+                {
+                    "row_index": 0,
+                    "col_index": 3,
+                    "content": "h1col4",
+                },
+                {
+                    "row_index": 1,
+                    "col_index": 1,
+                    "content": "h2col2",
+                },
+                {
+                    "row_index": 1,
+                    "col_index": 2,
+                    "content": "h2col34",
+                },
+                {
+                    "row_index": 2,
+                    "col_index": 0,
+                    "content": "r3col1",
+                },
+                {
+                    "row_index": 2,
+                    "col_index": 1,
+                    "content": "r3col2",
+                },
+                {
+                    "row_index": 2,
+                    "col_index": 2,
+                    "content": "r34col34",
+                },
+                {
+                    "row_index": 3,
+                    "col_index": 0,
+                    "content": "r4col12",
+                },
+            ],
+            id="various spans, with 2 row header",
+        ),
+    ],
+)
+def test_cells_table_extraction_from_prediction(table_as_cells, expected_extraction):
+    example_element = {
+        "type": "Table",
+        "metadata": {"table_as_cells": table_as_cells},
     }
-    expected_extraction = [
-        {"row_index": 0, "col_index": 0, "content": "Month A."},
-        {"row_index": 1, "col_index": 0, "content": "22"},
-    ]
+    assert extract_cells_from_table_as_cells(example_element) == expected_extraction
+
 
+@pytest.mark.parametrize(
+    ("text_as_html", "expected_extraction"),
+    [
+        pytest.param(
+            """
+<table>
+    <thead>
+        <tr>
+            <th>Month A.</th>
+        </tr>
+    </thead>
+    <tbody>
+        <tr>
+            <td>22</td>
+        </tr>
+    </tbody>
+</table>"
+            """,
+            [
+                {"row_index": 0, "col_index": 0, "content": "Month A."},
+                {"row_index": 1, "col_index": 0, "content": "22"},
+            ],
+            id="Simple table, 1 head cell, 1 body cell, no spans",
+        ),
+        pytest.param(
+            """
+<table>
+    <thead>
+        <tr>
+            <th>Month A.</th>
+            <th>Month B.</th>
+            <th>Month C.</th>
+        </tr>
+    </thead>
+    <tbody>
+        <tr>
+            <td>11</td>
+            <td>12</td>
+            <td>13</td>
+        </tr>
+        <tr>
+            <td>21</td>
+            <td>22</td>
+            <td>23</td>
+        </tr>
+    </tbody>
+</table>"
+""",
+            [
+                {"row_index": 0, "col_index": 0, "content": "Month A."},
+                {"row_index": 0, "col_index": 1, "content": "Month B."},
+                {"row_index": 0, "col_index": 2, "content": "Month C."},
+                {"row_index": 1, "col_index": 0, "content": "11"},
+                {"row_index": 1, "col_index": 1, "content": "12"},
+                {"row_index": 1, "col_index": 2, "content": "13"},
+                {"row_index": 2, "col_index": 0, "content": "21"},
+                {"row_index": 2, "col_index": 1, "content": "22"},
+                {"row_index": 2, "col_index": 2, "content": "23"},
+            ],
+            id="Simple table, 3 head cell, 5 body cell, no spans",
+        ),
+        # +----------+---------------------+----------+
+        # |          |       h1col23       |  h1col4  |
+        # | h12col1  |----------+----------+----------|
+        # |          |  h2col2  |       h2col34       |
+        # |----------|----------+----------+----------+
+        # |  r3col1  |  r3col2  |                     |
+        # |----------+----------|      r34col34       |
+        # |       r4col12       |                     |
+        # +----------+----------+----------+----------+
+        pytest.param(
+            """
+<table>
+    <thead>
+        <tr>
+            <th rowspan="2">h12col1</th>
+            <th colspan="2">h1col23</th>
+            <th>h1col4</th>
+        </tr>
+        <tr>
+            <th>h2col2</th>
+            <th colspan="2">h2col34</th>
+        </tr>
+    </thead>
+    <tbody>
+        <tr>
+            <td>r3col1</td>
+            <td>r3col2</td>
+            <td colspan="2" rowspan="2">r34col34</td>
+        </tr>
+        <tr>
+            <td colspan="2">r4col12</td>
+        </tr>
+    </tbody>
+</table>
+""",
+            [
+                {
+                    "row_index": 0,
+                    "col_index": 0,
+                    "content": "h12col1",
+                },
+                {
+                    "row_index": 0,
+                    "col_index": 1,
+                    "content": "h1col23",
+                },
+                {
+                    "row_index": 0,
+                    "col_index": 3,
+                    "content": "h1col4",
+                },
+                {
+                    "row_index": 1,
+                    "col_index": 1,
+                    "content": "h2col2",
+                },
+                {
+                    "row_index": 1,
+                    "col_index": 2,
+                    "content": "h2col34",
+                },
+                {
+                    "row_index": 2,
+                    "col_index": 0,
+                    "content": "r3col1",
+                },
+                {
+                    "row_index": 2,
+                    "col_index": 1,
+                    "content": "r3col2",
+                },
+                {
+                    "row_index": 2,
+                    "col_index": 2,
+                    "content": "r34col34",
+                },
+                {
+                    "row_index": 3,
+                    "col_index": 0,
+                    "content": "r4col12",
+                },
+            ],
+            id="various spans, with 2 row header",
+        ),
+    ],
+)
+def test_html_table_extraction_from_prediction(text_as_html, expected_extraction):
+    example_element = {
+        "type": "Table",
+        "metadata": {
+            "text_as_html": text_as_html,
+        },
+    }
     assert extract_cells_from_text_as_html(example_element) == expected_extraction
-    assert extract_cells_from_table_as_cells(example_element) == expected_extraction
 
 
 def test_cells_extraction_from_prediction_when_missing_prediction():
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index f34d8e8254..45f2dd0f27 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.14.6-dev5"  # pragma: no cover
+__version__ = "0.14.6-dev6"  # pragma: no cover
diff --git a/unstructured/metrics/table/table_extraction.py b/unstructured/metrics/table/table_extraction.py
index 0421c72910..94d77ad89e 100644
--- a/unstructured/metrics/table/table_extraction.py
+++ b/unstructured/metrics/table/table_extraction.py
@@ -11,8 +11,34 @@
 }
 
 
-def _convert_table_from_html(content: str) -> List[Dict[str, Any]]:
-    """Convert html format to table structure.
+def _move_cells_for_spanned_cells(cells: List[Dict[str, Any]]):
+    """Move cells to the right if spanned cells have an influence on the rendering.
+
+    Args:
+        cells: List of cells in the table in Deckerd format.
+
+    Returns:
+        List of cells in the table in Deckerd format with cells moved to the right if spanned.
+    """
+    sorted_cells = sorted(cells, key=lambda x: (x["y"], x["x"]))
+    cells_occupied_by_spanned = set()
+    for cell in sorted_cells:
+        if cell["w"] > 1 or cell["h"] > 1:
+            for i in range(cell["y"], cell["y"] + cell["h"]):
+                for j in range(cell["x"], cell["x"] + cell["w"]):
+                    if (i, j) != (cell["y"], cell["x"]):
+                        cells_occupied_by_spanned.add((i, j))
+        while (cell["y"], cell["x"]) in cells_occupied_by_spanned:
+            cell_y, cell_x = cell["y"], cell["x"]
+            cells_to_the_right = [c for c in sorted_cells if c["y"] == cell_y and c["x"] >= cell_x]
+            for cell_to_move in cells_to_the_right:
+                cell_to_move["x"] += 1
+            cells_occupied_by_spanned.remove((cell_y, cell_x))
+    return sorted_cells
+
+
+def _html_table_to_deckerd(content: str) -> List[Dict[str, Any]]:
+    """Convert html format to Deckerd table structure.
 
     Args:
         content: The html content with a table to extract.
@@ -20,33 +46,38 @@ def _convert_table_from_html(content: str) -> List[Dict[str, Any]]:
     Returns:
         A list of dictionaries where each dictionary represents a cell in the table.
     """
+
     soup = BeautifulSoup(content, "html.parser")
     table = soup.find("table")
-    rows = table.findAll(["tr", "thead"])
+    rows = table.findAll(["tr"])
     table_data = []
 
     for i, row in enumerate(rows):
-        headers = row.findAll("th")
-        data_row = row.findAll("td")
-
-        if headers:
-            for j, header in enumerate(headers):
-                cell = {
-                    "row_index": i,
-                    "col_index": j,
-                    "content": header.text,
-                }
-                table_data.append(cell)
-
-        if data_row:
-            for k, data in enumerate(data_row):
-                cell = {
-                    "row_index": i,
-                    "col_index": k,
-                    "content": data.text,
-                }
-                table_data.append(cell)
-    return table_data
+        cells = row.findAll(["th", "td"])
+        for j, cell_data in enumerate(cells):
+            cell = {
+                "y": i,
+                "x": j,
+                "w": int(cell_data.attrs.get("colspan", 1)),
+                "h": int(cell_data.attrs.get("rowspan", 1)),
+                "content": cell_data.text,
+            }
+            table_data.append(cell)
+    return _move_cells_for_spanned_cells(table_data)
+
+
+def _convert_table_from_html(content: str) -> List[Dict[str, Any]]:
+    """Convert html format to table structure. As a middle step it converts
+    html to the Deckerd format as it's more convenient to work with.
+
+    Args:
+        content: The html content with a table to extract.
+
+    Returns:
+        A list of dictionaries where each dictionary represents a cell in the table.
+    """
+    deckerd_cells = _html_table_to_deckerd(content)
+    return _convert_table_from_deckerd(deckerd_cells)
 
 
 def _convert_table_from_deckerd(content: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
@@ -151,11 +182,15 @@ def extract_cells_from_text_as_html(element: Dict[str, Any]) -> List[Dict[str, A
             "metadata": {
                 "text_as_html": "<table>
                                     <thead>
-                                        <th>Month A.</th>
+                                        <tr>
+                                            <th>Month A.</th>
+                                        </tr>
                                     </thead>
-                                    <tr>
-                                        <td>22</td><
-                                    /tr>
+                                    <tbody>
+                                        <tr>
+                                            <td>22</td>
+                                        </tr>
+                                    </tbody>
                                 </table>"
             }
         }