feat: table evaluations for fixed html table generation (#3196)
Updates the evaluation script to handle the corrected HTML syntax for tables.
See Unstructured-IO/unstructured-inference#355 for details.

This change:
- modifies the transformation of HTML tables into the evaluation's internal `cells` format
- fixes the indexing of the output (internal-format cells) when HTML cells use `rowspan`/`colspan` spans (see the sketch after this list)
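
For orientation, a span-aware conversion works roughly as sketched below: walk every `<tr>`, and for each `<td>`/`<th>` assign explicit row and column indices while reserving the grid slots that spans occupy. This is a minimal illustrative sketch, assuming BeautifulSoup and hypothetical names (`html_table_to_cells`, the dict keys); it is not the evaluation code itself.

```python
# Minimal sketch of a span-aware HTML-table-to-cells conversion.
# Assumptions: BeautifulSoup is available; the helper name and dict keys are
# illustrative, not the project's actual internal API.
from bs4 import BeautifulSoup


def html_table_to_cells(html: str) -> list[dict]:
    """Flatten an HTML table into {"row", "col", "content"} cells, honoring spans."""
    soup = BeautifulSoup(html, "html.parser")
    occupied: set[tuple[int, int]] = set()  # grid slots already claimed by a span
    cells = []
    for row_idx, tr in enumerate(soup.find_all("tr")):
        col_idx = 0
        for cell in tr.find_all(["td", "th"]):
            # Skip columns that an earlier rowspan/colspan already covers.
            while (row_idx, col_idx) in occupied:
                col_idx += 1
            rowspan = int(cell.get("rowspan", 1))
            colspan = int(cell.get("colspan", 1))
            # Reserve every grid slot this cell covers so later rows index correctly.
            for r in range(row_idx, row_idx + rowspan):
                for c in range(col_idx, col_idx + colspan):
                    occupied.add((r, c))
            cells.append({"row": row_idx, "col": col_idx, "content": cell.get_text(strip=True)})
            col_idx += colspan
    return cells
```

For example, given `<tr><th rowspan="2">a</th><th>b</th></tr><tr><th>c</th></tr>`, cell `c` lands at column 1 rather than column 0 because the spanning header still occupies slot (1, 0); getting that bookkeeping right is what the second bullet above refers to.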
pawel-kmiecik authored Jun 14, 2024
1 parent dadc9c6 commit 29e64eb
Showing 5 changed files with 487 additions and 61 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -1,4 +1,4 @@
## 0.14.6-dev5
## 0.14.6-dev6

### Enhancements

@@ -13,6 +13,7 @@
* **table metric bug fix** `get_element_level_alignment()` now finds all matched indices in the predicted table data instead of returning only the first match when the same ground-truth string matches multiple times.
* **fsspec connector path/permissions bug** V2 fsspec connectors were failing when defined relative filepaths had a leading slash. This change strips that slash so the relative path never has one.
* **Dropbox connector internal file path bugs** The Dropbox source connector currently raises exceptions when indexing files due to two issues: a path-formatting idiosyncrasy of the Dropbox library and a divergence in the Dropbox library's fs.info method, which expects a 'url' parameter rather than 'path'.
* **update table metric evaluation to handle corrected HTML syntax for tables** This change is connected to the update in the [unstructured-inference change](https://github.com/Unstructured-IO/unstructured-inference/pull/355) and fixes transforming HTML tables into the `deckerd` and internal `cells` formats.

## 0.14.5

116 changes: 98 additions & 18 deletions test_unstructured/metrics/test_table_structure.py
@@ -33,7 +33,7 @@ def test_table_eval_processor_simple():
{
"type": "Table",
"metadata": {
"text_as_html": """<table><thead><th>r1c1</th><th>r1c2</th></thead>
"text_as_html": """<table><thead><tr><th>r1c1</th><th>r1c2</th></tr></thead>
<tbody><tr><td>r2c1</td><td>r2c2</td></tr></tbody></table>"""
},
}
@@ -201,14 +201,62 @@ def test_table_eval_processor_when_wrong_source_type():
@pytest.mark.parametrize(
"text_as_html",
[
"""<table><thead><th>r1c1</th><th>r1c2</th></thead>
<tbody><tr><td>r2c1</td><td>r2c2</td></tr><tr><td>r3c1</td>
<td>r3c2</td></tr></tbody></table>""",
"""<table><tr><th>r1c1</th><th>r1c2</th></tr>
<tbody><tr><td>r2c1</td><td>r2c2</td></tr><tr><td>r3c1</td>
<td>r3c2</td></tr></tbody></table>""",
"""<table><tr><td>r1c1</td><td>r1c2</td></tr><tr><td>r2c1</td>
<td>r2c2</td></tr><tr><td>r3c1</td><td>r3c2</td></tr></tbody></table>""",
"""
<table>
<thead>
<tr>
<th>r1c1</th>
<th>r1c2</th>
</tr>
</thead>
<tbody>
<tr>
<td>r2c1</td>
<td>r2c2</td>
</tr>
<tr>
<td>r3c1</td>
<td>r3c2</td>
</tr>
</tbody>
</table>
""",
"""
<table>
<tr>
<th>r1c1</th>
<th>r1c2</th>
</tr>
<tbody>
<tr>
<td>r2c1</td>
<td>r2c2</td>
</tr>
<tr>
<td>r3c1</td>
<td>r3c2</td>
</tr>
</tbody>
</table>
""",
"""
<table>
</tbody>
<tr>
<td>r1c1</td>
<td>r1c2</td>
</tr>
<tr>
<td>r2c1</td>
<td>r2c2</td>
</tr>
<tr>
<td>r3c1</td>
<td>r3c2</td>
</tr>
</tbody>
</table>
""",
],
)
def test_table_eval_processor_various_table_html_structures(text_as_html):
@@ -285,8 +333,21 @@ def test_table_eval_processor_non_str_values_in_table():
{
"type": "Table",
"metadata": {
"text_as_html": """<table><thead><th>11</th><th>12</th></thead>
<tbody><tr><td>21</td><td>22</td></tr></tbody></table>"""
"text_as_html": """
<table>
<thead>
<tr>
<th>11</th>
<th>12</th>
</tr>
</thead>
<tbody>
<tr>
<td>21</td>
<td>22</td>
</tr>
</tbody>
</table>"""
},
}
]
@@ -341,19 +402,38 @@ def test_table_eval_processor_non_str_values_in_table():
assert result.element_col_level_content_acc == 1.0


@pytest.mark.xfail(
reason="This is expected to fail as table eval metrics does not cover merged cells"
)
def test_table_eval_processor_merged_cells():
prediction = [
{
"type": "Table",
"metadata": {
"text_as_html": """
<table><thead><th rowspan="2">r1c1</th><th>r1c2</th><th colspan="2">r1c3</th></tr>
<tr><th>r2c2</th><th>r2c3</th><th>r2c4</th><</thead>
<tbody><tr><td>r3c1</td><td>r3c2</td><td colspan="2" rowspan="2">r3c3</td></tr>
<tr><td>r4c1</td><td>r4c2</td></tr></tbody></table>"""
<table>
<thead>
<tr>
<th rowspan="2">r1c1</th>
<th>r1c2</th>
<th colspan="2">r1c3</th>
</tr>
<tr>
<th>r2c2</th>
<th>r2c3</th>
<th>r2c4</th>
</tr>
</thead>
<tbody>
<tr>
<td>r3c1</td>
<td>r3c2</td>
<td colspan="2" rowspan="2">r3c3</td>
</tr>
<tr>
<td>r4c1</td>
<td>r4c2</td>
</tr>
</tbody>
</table>
"""
},
}
]