feat: table evaluations for fixed html table generation (#3196)
Updates the evaluation script to handle the corrected HTML syntax for tables.
See Unstructured-IO/unstructured-inference#355 for details.

This change:
- modifies the transformation of HTML tables into the evaluation's internal `cells` format
- fixes the indexing of the output (internal-format cells) when HTML cells use `rowspan`/`colspan` spans (see the sketch after this list)
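
For orientation, a span-aware conversion works roughly as sketched below: walk every `<tr>`, and for each `<td>`/`<th>` assign explicit row and column indices while reserving the grid slots that spans occupy. This is a minimal illustrative sketch, assuming BeautifulSoup and hypothetical names (`html_table_to_cells`, the dict keys); it is not the evaluation code itself.

```python
# Minimal sketch of a span-aware HTML-table-to-cells conversion.
# Assumptions: BeautifulSoup is available; the helper name and dict keys are
# illustrative, not the project's actual internal API.
from bs4 import BeautifulSoup


def html_table_to_cells(html: str) -> list[dict]:
    """Flatten an HTML table into {"row", "col", "content"} cells, honoring spans."""
    soup = BeautifulSoup(html, "html.parser")
    occupied: set[tuple[int, int]] = set()  # grid slots already claimed by a span
    cells = []
    for row_idx, tr in enumerate(soup.find_all("tr")):
        col_idx = 0
        for cell in tr.find_all(["td", "th"]):
            # Skip columns that an earlier rowspan/colspan already covers.
            while (row_idx, col_idx) in occupied:
                col_idx += 1
            rowspan = int(cell.get("rowspan", 1))
            colspan = int(cell.get("colspan", 1))
            # Reserve every grid slot this cell covers so later rows index correctly.
            for r in range(row_idx, row_idx + rowspan):
                for c in range(col_idx, col_idx + colspan):
                    occupied.add((r, c))
            cells.append({"row": row_idx, "col": col_idx, "content": cell.get_text(strip=True)})
            col_idx += colspan
    return cells
```

For example, given `<tr><th rowspan="2">a</th><th>b</th></tr><tr><th>c</th></tr>`, cell `c` lands at column 1 rather than column 0 because the spanning header still occupies slot (1, 0); getting that bookkeeping right is what the second bullet above refers to.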
pawel-kmiecik authored Jun 14, 2024
1 parent dadc9c6 commit 29e64eb
Showing 5 changed files with 487 additions and 61 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -1,4 +1,4 @@
## 0.14.6-dev5
## 0.14.6-dev6

### Enhancements

@@ -13,6 +13,7 @@
* **table metric bug fix** `get_element_level_alignment()` now finds all matched indices in the predicted table data instead of returning only the first match when the same ground-truth string matches multiple times.
* **fsspec connector path/permissions bug** V2 fsspec connectors were failing when defined relative filepaths had a leading slash. This change strips that slash so the relative path never has one.
* **Dropbox connector internal file path bugs** The Dropbox source connector currently raises exceptions when indexing files due to two issues: a path-formatting idiosyncrasy of the Dropbox library and a divergence in the Dropbox library's fs.info method, which expects a 'url' parameter rather than 'path'.
* **update table metric evaluation to handle corrected HTML syntax for tables** This change is connected to the update in the [unstructured-inference change](https://github.com/Unstructured-IO/unstructured-inference/pull/355) and fixes transforming HTML tables into the `deckerd` and internal `cells` formats.

## 0.14.5

116 changes: 98 additions & 18 deletions test_unstructured/metrics/test_table_structure.py
@@ -33,7 +33,7 @@ def test_table_eval_processor_simple():
{
"type": "Table",
"metadata": {
"text_as_html": """<table><thead><th>r1c1</th><th>r1c2</th></thead>
"text_as_html": """<table><thead><tr><th>r1c1</th><th>r1c2</th></tr></thead>
<tbody><tr><td>r2c1</td><td>r2c2</td></tr></tbody></table>"""
},
}
@@ -201,14 +201,62 @@ def test_table_eval_processor_when_wrong_source_type():
@pytest.mark.parametrize(
"text_as_html",
[
"""<table><thead><th>r1c1</th><th>r1c2</th></thead>
<tbody><tr><td>r2c1</td><td>r2c2</td></tr><tr><td>r3c1</td>
<td>r3c2</td></tr></tbody></table>""",
"""<table><tr><th>r1c1</th><th>r1c2</th></tr>
<tbody><tr><td>r2c1</td><td>r2c2</td></tr><tr><td>r3c1</td>
<td>r3c2</td></tr></tbody></table>""",
"""<table><tr><td>r1c1</td><td>r1c2</td></tr><tr><td>r2c1</td>
<td>r2c2</td></tr><tr><td>r3c1</td><td>r3c2</td></tr></tbody></table>""",
"""
<table>
<thead>
<tr>
<th>r1c1</th>
<th>r1c2</th>
</tr>
</thead>
<tbody>
<tr>
<td>r2c1</td>
<td>r2c2</td>
</tr>
<tr>
<td>r3c1</td>
<td>r3c2</td>
</tr>
</tbody>
</table>
""",
"""
<table>
<tr>
<th>r1c1</th>
<th>r1c2</th>
</tr>
<tbody>
<tr>
<td>r2c1</td>
<td>r2c2</td>
</tr>
<tr>
<td>r3c1</td>
<td>r3c2</td>
</tr>
</tbody>
</table>
""",
"""
<table>
</tbody>
<tr>
<td>r1c1</td>
<td>r1c2</td>
</tr>
<tr>
<td>r2c1</td>
<td>r2c2</td>
</tr>
<tr>
<td>r3c1</td>
<td>r3c2</td>
</tr>
</tbody>
</table>
""",
],
)
def test_table_eval_processor_various_table_html_structures(text_as_html):
@@ -285,8 +333,21 @@ def test_table_eval_processor_non_str_values_in_table():
{
"type": "Table",
"metadata": {
"text_as_html": """<table><thead><th>11</th><th>12</th></thead>
<tbody><tr><td>21</td><td>22</td></tr></tbody></table>"""
"text_as_html": """
<table>
<thead>
<tr>
<th>11</th>
<th>12</th>
</tr>
</thead>
<tbody>
<tr>
<td>21</td>
<td>22</td>
</tr>
</tbody>
</table>"""
},
}
]
@@ -341,19 +402,38 @@ def test_table_eval_processor_non_str_values_in_table():
assert result.element_col_level_content_acc == 1.0


@pytest.mark.xfail(
reason="This is expected to fail as table eval metrics does not cover merged cells"
)
def test_table_eval_processor_merged_cells():
prediction = [
{
"type": "Table",
"metadata": {
"text_as_html": """
<table><thead><th rowspan="2">r1c1</th><th>r1c2</th><th colspan="2">r1c3</th></tr>
<tr><th>r2c2</th><th>r2c3</th><th>r2c4</th><</thead>
<tbody><tr><td>r3c1</td><td>r3c2</td><td colspan="2" rowspan="2">r3c3</td></tr>
<tr><td>r4c1</td><td>r4c2</td></tr></tbody></table>"""
<table>
<thead>
<tr>
<th rowspan="2">r1c1</th>
<th>r1c2</th>
<th colspan="2">r1c3</th>
</tr>
<tr>
<th>r2c2</th>
<th>r2c3</th>
<th>r2c4</th>
</tr>
</thead>
<tbody>
<tr>
<td>r3c1</td>
<td>r3c2</td>
<td colspan="2" rowspan="2">r3c3</td>
</tr>
<tr>
<td>r4c1</td>
<td>r4c2</td>
</tr>
</tbody>
</table>
"""
},
}
]