Skip to content

Commit

Permalink
Fix cells with duplicated indexes (#341)
Browse files Browse the repository at this point in the history
The post-processing in table-transformer can return multiple cells with
the same indexes. This happens when a single cell is covered by more than
50% by two different spanning cells: the cell is then assigned as a
subcell to both spanning cells instead of only to the one with the
highest probability.

Example:
Simple 2x2 table
```
# +-----------+----------+
# |    one    |   two    |
# |-----------+----------|
# |    three  |   four   |
# +-----------+----------+
```
Suppose there is one spanning cell over cells 'one' and 'three' (a column
spanning cell) and another over cells 'one' and 'two' (a row spanning cell).
In this case cell 'one' will be assigned to both spanning cells. Reproduction:

```python3
# Reproduction: feed structure_to_cells a 2x2 table whose two spanning cells
# both cover the cell holding 'one', then print the predicted cells.
from pprint import pprint

from unstructured_inference.models.tables import structure_to_cells

# Table structure with two rows, two columns and two overlapping spanning cells.
table_structure = {
    "rows": [
        {"bbox": [0, 0, 10, 20]},
        {"bbox": [10, 0, 20, 20]},
    ],
    "columns": [
        {"bbox": [0, 0, 20, 10]},
        {"bbox": [0, 10, 20, 20]},
    ],
    "spanning cells": [
        # Same bbox as the first column (contains tokens 'one' and 'three'); higher score.
        {"bbox": [0, 0, 20, 10], "score": 0.9, "projected row header": False},
        # Same bbox as the first row (contains tokens 'one' and 'two'); lower score.
        {"bbox": [0, 0, 10, 20], "score": 0.8, "projected row header": False},
    ],
}
# One token per table cell; each token bbox is one quadrant of the 20x20 table.
tokens = [
    {"text": "one", "bbox": [0, 0, 10, 10], "span_num": 1, "line_num": 1, "block_num": 1},
    {"text": "two", "bbox": [0, 10, 10, 20], "span_num": 1, "line_num": 1, "block_num": 1},
    {"text": "three", "bbox": [10, 0, 20, 10], "span_num": 1, "line_num": 1, "block_num": 1},
    {"text": "four", "bbox": [10, 10, 20, 20], "span_num": 1, "line_num": 1, "block_num": 1},
]

# Only the first element of the returned pair (the predicted cells) is inspected.
predicted_cells, _ = structure_to_cells(table_structure, tokens=tokens)
pprint(predicted_cells)
```

This yields:
```
[
....
 {'cell text': 'one three',
  'column_nums': [0],
  'row_nums': [0, 1]
},
 {'cell text': 'two',
  'column_nums': [0, 1],
  'row_nums': [0]
}]
```
You can see coordinates (0,0) are included in both spanning cells. 

This PR fixes this by assigning each cell only to the most probable spanning cell.
  • Loading branch information
plutasnyy authored May 1, 2024
1 parent 9c3f644 commit a381155
Show file tree
Hide file tree
Showing 4 changed files with 110 additions and 3 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 0.7.30

* fix: table transformer doesn't return multiple cells with same coordinates

## 0.7.29

* fix: table transformer predictions are now removed if confidence is below threshold
Expand Down
100 changes: 99 additions & 1 deletion test_unstructured_inference/models/test_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@
from transformers.models.table_transformer.modeling_table_transformer import (
TableTransformerDecoder,
)
from copy import deepcopy

import unstructured_inference.models.table_postprocess as postprocess
from unstructured_inference.models import tables
from unstructured_inference.models.tables import apply_thresholds_on_objects
from unstructured_inference.models.tables import apply_thresholds_on_objects, structure_to_cells

skip_outside_ci = os.getenv("CI", "").lower() in {"", "false", "f", "0"}

Expand Down Expand Up @@ -1285,3 +1286,100 @@ def test_padded_results_has_right_dimensions(table_transformer, example_image):

def test_compute_confidence_score_zero_division_error_handling():
assert tables.compute_confidence_score([]) == 0


@pytest.mark.parametrize(
    "column_span_score, row_span_score, expected_text_to_indexes",
    [
        (
            0.9,
            0.8,
            {
                "one three": {"row_nums": [0, 1], "column_nums": [0]},
                "two": {"row_nums": [0], "column_nums": [1]},
                "four": {"row_nums": [1], "column_nums": [1]},
            },
        ),
        (
            0.8,
            0.9,
            {
                "one two": {"row_nums": [0], "column_nums": [0, 1]},
                "three": {"row_nums": [1], "column_nums": [0]},
                "four": {"row_nums": [1], "column_nums": [1]},
            },
        ),
    ],
)
def test_subcells_filtering_when_overlapping_spanning_cells(
    column_span_score, row_span_score, expected_text_to_indexes
):
    """Check which spanning cell claims a cell that two spanning cells overlap.

    Layout of the 2x2 table under test::

        +-----------+----------+
        |    one    |   two    |
        |-----------+----------|
        |   three   |   four   |
        +-----------+----------+

    One spanning cell shares its bbox with the first column and the other
    shares its bbox with the first row, so both contain the "one" token.
    The expected mappings have "one" merged into whichever spanning cell has
    the higher score, and the result must not depend on the order in which
    the spanning cells are listed.
    """
    common_token_fields = {"span_num": 1, "line_num": 1, "block_num": 1}
    text_and_bbox = (
        ("one", [0, 0, 10, 10]),
        ("two", [0, 10, 10, 20]),
        ("three", [10, 0, 20, 10]),
        ("four", [10, 10, 20, 20]),
    )
    tokens = [
        {"text": text, "bbox": bbox, **common_token_fields} for text, bbox in text_and_bbox
    ]

    bbox_and_score = (
        ([0, 0, 20, 10], column_span_score),
        ([0, 0, 10, 20], row_span_score),
    )
    table_structure = {
        "rows": [
            {"bbox": [0, 0, 10, 20]},
            {"bbox": [10, 0, 20, 20]},
        ],
        "columns": [
            {"bbox": [0, 0, 20, 10]},
            {"bbox": [0, 10, 20, 20]},
        ],
        "spanning cells": [
            {"bbox": bbox, "score": score, "projected row header": False}
            for bbox, score in bbox_and_score
        ],
    }

    # structure_to_cells edits the structure it is given, so keep a pristine
    # copy for the second run below.
    reordered_structure = deepcopy(table_structure)

    predicted_cells, _ = structure_to_cells(table_structure, tokens=tokens)
    predicted_text_to_indexes = {}
    for cell in predicted_cells:
        predicted_text_to_indexes[cell["cell text"]] = {
            "row_nums": cell["row_nums"],
            "column_nums": cell["column_nums"],
        }
    assert predicted_text_to_indexes == expected_text_to_indexes

    # Flip the order of the two spanning cells: the higher-scored one must
    # still win, so the prediction has to be unchanged.
    reordered_structure["spanning cells"].reverse()

    predicted_cells_after_reorder, _ = structure_to_cells(reordered_structure, tokens=tokens)
    assert predicted_cells_after_reorder == predicted_cells
2 changes: 1 addition & 1 deletion unstructured_inference/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.7.29" # pragma: no cover
__version__ = "0.7.30" # pragma: no cover
7 changes: 6 additions & 1 deletion unstructured_inference/models/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -484,6 +484,8 @@ def structure_to_cells(table_structure, tokens):
columns = table_structure["columns"]
rows = table_structure["rows"]
spanning_cells = table_structure["spanning cells"]
spanning_cells = sorted(spanning_cells, reverse=True, key=lambda cell: cell["score"])

cells = []
subcells = []
# Identify complete cells and subcells
Expand All @@ -507,6 +509,7 @@ def structure_to_cells(table_structure, tokens):
spanning_cell_rect.intersect(cell_rect).get_area() / cell_rect.get_area()
) > inference_config.TABLE_IOB_THRESHOLD:
cell["subcell"] = True
cell["is_merged"] = False
break

if cell["subcell"]:
Expand All @@ -528,7 +531,7 @@ def structure_to_cells(table_structure, tokens):
subcell_rect_area = subcell_rect.get_area()
if (
subcell_rect.intersect(spanning_cell_rect).get_area() / subcell_rect_area
) > inference_config.TABLE_IOB_THRESHOLD:
) > inference_config.TABLE_IOB_THRESHOLD and subcell["is_merged"] is False:
if cell_rect is None:
cell_rect = Rect(list(subcell["bbox"]))
else:
Expand All @@ -539,6 +542,8 @@ def structure_to_cells(table_structure, tokens):
# as header cells for a spanning cell to be classified as a header cell;
# otherwise, this could lead to a non-rectangular header region
header = header and "column header" in subcell and subcell["column header"]
subcell["is_merged"] = True

if len(cell_rows) > 0 and len(cell_columns) > 0:
cell = {
"bbox": cell_rect.get_bbox(),
Expand Down

0 comments on commit a381155

Please sign in to comment.