From 29e64eb281a720ea1dca57ae68bdf958755ecc34 Mon Sep 17 00:00:00 2001 From: Pawel Kmiecik Date: Fri, 14 Jun 2024 11:03:27 +0200 Subject: [PATCH] feat: table evaluations for fixed html table generation (#3196) Update to the evaluation script to handle correct HTML syntax for tables. See https://github.com/Unstructured-IO/unstructured-inference/pull/355 for details. This change: - modifies transforming HTML tables to evaluation internal `cells` format - fixes the indexing of the output (internal format cells) when HTML cells use spans --- CHANGELOG.md | 3 +- .../metrics/test_table_structure.py | 116 +++++- .../metrics/test_text_extraction.py | 336 +++++++++++++++++- unstructured/__version__.py | 2 +- .../metrics/table/table_extraction.py | 91 +++-- 5 files changed, 487 insertions(+), 61 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7a688d1bd4..da5e783c16 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.14.6-dev5 +## 0.14.6-dev6 ### Enhancements @@ -13,6 +13,7 @@ * **table metric bug fix** get_element_level_alignment()now will find all the matched indices in predicted table data instead of only returning the first match in the case of multiple matches for the same gt string. * **fsspec connector path/permissions bug** V2 fsspec connectors were failing when defined relative filepaths had leading slash. This strips that slash to guarantee the relative path never has it. * **Dropbox connector internal file path bugs** Dropbox source connector currently raises exceptions when indexing files due to two issues: a path formatting idiosyncrasy of the Dropbox library and a divergence in the definition of the Dropbox libraries fs.info method, expecting a 'url' parameter rather than 'path'. 
+* **update table metric evaluation to handle corrected HTML syntax for tables** This change is connected to the [unstructured-inference update](https://github.com/Unstructured-IO/unstructured-inference/pull/355) - it fixes the transformation of HTML tables into the Deckerd and internal cells formats. ## 0.14.5 diff --git a/test_unstructured/metrics/test_table_structure.py b/test_unstructured/metrics/test_table_structure.py index 332a64b326..def97b3792 100644 --- a/test_unstructured/metrics/test_table_structure.py +++ b/test_unstructured/metrics/test_table_structure.py @@ -33,7 +33,7 @@ def test_table_eval_processor_simple(): { "type": "Table", "metadata": { - "text_as_html": """ + "text_as_html": """
r1c1r1c2
r1c1r1c2
r2c1r2c2
""" }, } @@ -201,14 +201,62 @@ def test_table_eval_processor_when_wrong_source_type(): @pytest.mark.parametrize( "text_as_html", [ - """ - -
r1c1r1c2
r2c1r2c2
r3c1r3c2
""", - """ - -
r1c1r1c2
r2c1r2c2
r3c1r3c2
""", - """ -
r1c1r1c2
r2c1r2c2
r3c1r3c2
""", + """ + + + + + + + + + + + + + + + + + +
r1c1r1c2
r2c1r2c2
r3c1r3c2
+""", + """ + + + + + + + + + + + + + + + +
r1c1r1c2
r2c1r2c2
r3c1r3c2
+""", + """ + + + + + + + + + + + + + + + +
r1c1r1c2
r2c1r2c2
r3c1r3c2
+""", ], ) def test_table_eval_processor_various_table_html_structures(text_as_html): @@ -285,8 +333,21 @@ def test_table_eval_processor_non_str_values_in_table(): { "type": "Table", "metadata": { - "text_as_html": """ -
1112
2122
""" + "text_as_html": """ + + + + + + + + + + + + + +
1112
2122
""" }, } ] @@ -341,19 +402,38 @@ def test_table_eval_processor_non_str_values_in_table(): assert result.element_col_level_content_acc == 1.0 -@pytest.mark.xfail( - reason="This is expected to fail as table eval metrics does not cover merged cells" -) def test_table_eval_processor_merged_cells(): prediction = [ { "type": "Table", "metadata": { "text_as_html": """ - - < - -
r1c1r1c2r1c3
r2c2r2c3r2c4
r3c1r3c2r3c3
r4c1r4c2
""" + + + + + + + + + + + + + + + + + + + + + + + + +
r1c1r1c2r1c3
r2c2r2c3r2c4
r3c1r3c2r3c3
r4c1r4c2
+""" }, } ] diff --git a/test_unstructured/metrics/test_text_extraction.py b/test_unstructured/metrics/test_text_extraction.py index 6e3d58babb..a15c2a5e8b 100644 --- a/test_unstructured/metrics/test_text_extraction.py +++ b/test_unstructured/metrics/test_text_extraction.py @@ -159,7 +159,7 @@ def test_calculate_edit_distance_with_filename(filename, expected_score, expecte ), ( """Sometimes sentences have a dash - like this one! - A hyphen connects 2 words with no gap: easy-peasy.""", + A hyphen connects 2 words with no gap: easy-peasy.""", { "sometimes": 1, "sentences": 1, @@ -222,24 +222,334 @@ def test_calculate_percent_missing_text(output_text, source_text, expected_perce ) -def test_cells_extraction_from_prediction_when_simple_example(): - example_element = { - "type": "Table", - "metadata": { - "text_as_html": "
Month A.
22
", - "table_as_cells": [ +@pytest.mark.parametrize( + ("table_as_cells", "expected_extraction"), + [ + pytest.param( + [ {"x": 0, "y": 0, "w": 1, "h": 1, "content": "Month A."}, {"x": 0, "y": 1, "w": 1, "h": 1, "content": "22"}, ], - }, + [ + {"row_index": 0, "col_index": 0, "content": "Month A."}, + {"row_index": 1, "col_index": 0, "content": "22"}, + ], + id="Simple table, 1 head cell, 1 body cell, no spans", + ), + pytest.param( + [ + {"x": 0, "y": 0, "w": 1, "h": 1, "content": "Month A."}, + {"x": 1, "y": 0, "w": 1, "h": 1, "content": "Month B."}, + {"x": 2, "y": 0, "w": 1, "h": 1, "content": "Month C."}, + {"x": 0, "y": 1, "w": 1, "h": 1, "content": "11"}, + {"x": 1, "y": 1, "w": 1, "h": 1, "content": "12"}, + {"x": 2, "y": 1, "w": 1, "h": 1, "content": "13"}, + {"x": 0, "y": 2, "w": 1, "h": 1, "content": "21"}, + {"x": 1, "y": 2, "w": 1, "h": 1, "content": "22"}, + {"x": 2, "y": 2, "w": 1, "h": 1, "content": "23"}, + ], + [ + {"row_index": 0, "col_index": 0, "content": "Month A."}, + {"row_index": 0, "col_index": 1, "content": "Month B."}, + {"row_index": 0, "col_index": 2, "content": "Month C."}, + {"row_index": 1, "col_index": 0, "content": "11"}, + {"row_index": 1, "col_index": 1, "content": "12"}, + {"row_index": 1, "col_index": 2, "content": "13"}, + {"row_index": 2, "col_index": 0, "content": "21"}, + {"row_index": 2, "col_index": 1, "content": "22"}, + {"row_index": 2, "col_index": 2, "content": "23"}, + ], + id="Simple table, 3 head cell, 5 body cell, no spans", + ), + # +----------+---------------------+----------+ + # | | h1col23 | h1col4 | + # | h12col1 |----------+----------+----------| + # | | h2col2 | h2col34 | + # |----------|----------+----------+----------+ + # | r3col1 | r3col2 | | + # |----------+----------| r34col34 | + # | r4col12 | | + # +----------+----------+----------+----------+ + pytest.param( + [ + { + "y": 0, + "x": 0, + "w": 2, + "h": 1, + "content": "h12col1", + }, + { + "y": 0, + "x": 1, + "w": 1, + "h": 2, + "content": 
"h1col23", + }, + { + "y": 0, + "x": 3, + "w": 1, + "h": 1, + "content": "h1col4", + }, + { + "y": 1, + "x": 1, + "w": 1, + "h": 1, + "content": "h2col2", + }, + { + "y": 1, + "x": 2, + "w": 1, + "h": 2, + "content": "h2col34", + }, + { + "y": 2, + "x": 0, + "w": 1, + "h": 1, + "content": "r3col1", + }, + { + "y": 2, + "x": 1, + "w": 1, + "h": 1, + "content": "r3col2", + }, + { + "y": 2, + "x": 2, + "w": 2, + "h": 2, + "content": "r34col34", + }, + { + "y": 3, + "x": 0, + "w": 1, + "h": 2, + "content": "r4col12", + }, + ], + [ + { + "row_index": 0, + "col_index": 0, + "content": "h12col1", + }, + { + "row_index": 0, + "col_index": 1, + "content": "h1col23", + }, + { + "row_index": 0, + "col_index": 3, + "content": "h1col4", + }, + { + "row_index": 1, + "col_index": 1, + "content": "h2col2", + }, + { + "row_index": 1, + "col_index": 2, + "content": "h2col34", + }, + { + "row_index": 2, + "col_index": 0, + "content": "r3col1", + }, + { + "row_index": 2, + "col_index": 1, + "content": "r3col2", + }, + { + "row_index": 2, + "col_index": 2, + "content": "r34col34", + }, + { + "row_index": 3, + "col_index": 0, + "content": "r4col12", + }, + ], + id="various spans, with 2 row header", + ), + ], +) +def test_cells_table_extraction_from_prediction(table_as_cells, expected_extraction): + example_element = { + "type": "Table", + "metadata": {"table_as_cells": table_as_cells}, } - expected_extraction = [ - {"row_index": 0, "col_index": 0, "content": "Month A."}, - {"row_index": 1, "col_index": 0, "content": "22"}, - ] + assert extract_cells_from_table_as_cells(example_element) == expected_extraction + +@pytest.mark.parametrize( + ("text_as_html", "expected_extraction"), + [ + pytest.param( + """ + + + + + + + + + + + +
Month A.
22
" + """, + [ + {"row_index": 0, "col_index": 0, "content": "Month A."}, + {"row_index": 1, "col_index": 0, "content": "22"}, + ], + id="Simple table, 1 head cell, 1 body cell, no spans", + ), + pytest.param( + """ + + + + + + + + + + + + + + + + + + + + +
Month A.Month B.Month C.
111213
212223
" +""", + [ + {"row_index": 0, "col_index": 0, "content": "Month A."}, + {"row_index": 0, "col_index": 1, "content": "Month B."}, + {"row_index": 0, "col_index": 2, "content": "Month C."}, + {"row_index": 1, "col_index": 0, "content": "11"}, + {"row_index": 1, "col_index": 1, "content": "12"}, + {"row_index": 1, "col_index": 2, "content": "13"}, + {"row_index": 2, "col_index": 0, "content": "21"}, + {"row_index": 2, "col_index": 1, "content": "22"}, + {"row_index": 2, "col_index": 2, "content": "23"}, + ], + id="Simple table, 3 head cell, 5 body cell, no spans", + ), + # +----------+---------------------+----------+ + # | | h1col23 | h1col4 | + # | h12col1 |----------+----------+----------| + # | | h2col2 | h2col34 | + # |----------|----------+----------+----------+ + # | r3col1 | r3col2 | | + # |----------+----------| r34col34 | + # | r4col12 | | + # +----------+----------+----------+----------+ + pytest.param( + """ + + + + + + + + + + + + + + + + + + + + + + +
h12col1h1col23h1col4
h2col2h2col34
r3col1r3col2r34col34
r4col12
+""", + [ + { + "row_index": 0, + "col_index": 0, + "content": "h12col1", + }, + { + "row_index": 0, + "col_index": 1, + "content": "h1col23", + }, + { + "row_index": 0, + "col_index": 3, + "content": "h1col4", + }, + { + "row_index": 1, + "col_index": 1, + "content": "h2col2", + }, + { + "row_index": 1, + "col_index": 2, + "content": "h2col34", + }, + { + "row_index": 2, + "col_index": 0, + "content": "r3col1", + }, + { + "row_index": 2, + "col_index": 1, + "content": "r3col2", + }, + { + "row_index": 2, + "col_index": 2, + "content": "r34col34", + }, + { + "row_index": 3, + "col_index": 0, + "content": "r4col12", + }, + ], + id="various spans, with 2 row header", + ), + ], +) +def test_html_table_extraction_from_prediction(text_as_html, expected_extraction): + example_element = { + "type": "Table", + "metadata": { + "text_as_html": text_as_html, + }, + } assert extract_cells_from_text_as_html(example_element) == expected_extraction - assert extract_cells_from_table_as_cells(example_element) == expected_extraction def test_cells_extraction_from_prediction_when_missing_prediction(): diff --git a/unstructured/__version__.py b/unstructured/__version__.py index f34d8e8254..45f2dd0f27 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.14.6-dev5" # pragma: no cover +__version__ = "0.14.6-dev6" # pragma: no cover diff --git a/unstructured/metrics/table/table_extraction.py b/unstructured/metrics/table/table_extraction.py index 0421c72910..94d77ad89e 100644 --- a/unstructured/metrics/table/table_extraction.py +++ b/unstructured/metrics/table/table_extraction.py @@ -11,8 +11,34 @@ } -def _convert_table_from_html(content: str) -> List[Dict[str, Any]]: - """Convert html format to table structure. +def _move_cells_for_spanned_cells(cells: List[Dict[str, Any]]): + """Move cells to the right if spanned cells have an influence on the rendering. + + Args: + cells: List of cells in the table in Deckerd format. 
+ + Returns: + List of cells in the table in Deckerd format with cells moved to the right if spanned. + """ + sorted_cells = sorted(cells, key=lambda x: (x["y"], x["x"])) + cells_occupied_by_spanned = set() + for cell in sorted_cells: + if cell["w"] > 1 or cell["h"] > 1: + for i in range(cell["y"], cell["y"] + cell["h"]): + for j in range(cell["x"], cell["x"] + cell["w"]): + if (i, j) != (cell["y"], cell["x"]): + cells_occupied_by_spanned.add((i, j)) + while (cell["y"], cell["x"]) in cells_occupied_by_spanned: + cell_y, cell_x = cell["y"], cell["x"] + cells_to_the_right = [c for c in sorted_cells if c["y"] == cell_y and c["x"] >= cell_x] + for cell_to_move in cells_to_the_right: + cell_to_move["x"] += 1 + cells_occupied_by_spanned.remove((cell_y, cell_x)) + return sorted_cells + + +def _html_table_to_deckerd(content: str) -> List[Dict[str, Any]]: + """Convert html format to Deckerd table structure. Args: content: The html content with a table to extract. @@ -20,33 +46,38 @@ def _convert_table_from_html(content: str) -> List[Dict[str, Any]]: Returns: A list of dictionaries where each dictionary represents a cell in the table. 
""" + soup = BeautifulSoup(content, "html.parser") table = soup.find("table") - rows = table.findAll(["tr", "thead"]) + rows = table.findAll(["tr"]) table_data = [] for i, row in enumerate(rows): - headers = row.findAll("th") - data_row = row.findAll("td") - - if headers: - for j, header in enumerate(headers): - cell = { - "row_index": i, - "col_index": j, - "content": header.text, - } - table_data.append(cell) - - if data_row: - for k, data in enumerate(data_row): - cell = { - "row_index": i, - "col_index": k, - "content": data.text, - } - table_data.append(cell) - return table_data + cells = row.findAll(["th", "td"]) + for j, cell_data in enumerate(cells): + cell = { + "y": i, + "x": j, + "w": int(cell_data.attrs.get("colspan", 1)), + "h": int(cell_data.attrs.get("rowspan", 1)), + "content": cell_data.text, + } + table_data.append(cell) + return _move_cells_for_spanned_cells(table_data) + + +def _convert_table_from_html(content: str) -> List[Dict[str, Any]]: + """Convert html format to table structure. As a middle step it converts + html to the Deckerd format as it's more convenient to work with. + + Args: + content: The html content with a table to extract. + + Returns: + A list of dictionaries where each dictionary represents a cell in the table. + """ + deckerd_cells = _html_table_to_deckerd(content) + return _convert_table_from_deckerd(deckerd_cells) def _convert_table_from_deckerd(content: List[Dict[str, Any]]) -> List[Dict[str, Any]]: @@ -151,11 +182,15 @@ def extract_cells_from_text_as_html(element: Dict[str, Any]) -> List[Dict[str, A "metadata": { "text_as_html": " - + + + - - < - /tr> + + + < + +
Month A.
Month A.
22
22
" } }