diff --git a/CHANGELOG.md b/CHANGELOG.md
index 416ef54472..0602cbc04a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,11 +9,13 @@
### Fixes
+* **Turn off XML resolve entities** Sets `resolve_entities=False` for XML parsing with `lxml`
+ to avoid text being dynamically injected into the XML document.
* **Add backward compatibility for the deprecated pdf_infer_table_structure parameter**.
* **Add the missing `form_extraction_skip_tables` argument to the `partition_pdf_or_image` call**.
-* **Turn off XML resolve entities** Sets `resolve_entities=False` for XML parsing with `lxml`
to avoid text being dynamically injected into the XML document.
* **Chromadb change from Add to Upsert using element_id to make idempotent**
+* **Disable `table_as_cells` output by default** to reduce overhead in partition; now `table_as_cells` is only produced when the environment variable `EXTRACT_TABLE_AS_CELLS` is `true`
* **Reduce excessive logging** Change per page ocr info level logging into detail level trace logging
* **Replace try block in `document_to_element_list` for handling HTMLDocument** Use `getattr(element, "type", "")` to get the `type` attribute of an element when it exists. This is more explicit way to handle the special case for HTML documents and prevents other types of attribute error from being silenced by the try block
diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.json
index 78e2ebd6e6..f00183ac2b 100644
--- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.json
+++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.json
@@ -49,176 +49,6 @@
"text": "Dataset | Base Model\" Large Model | Notes PubLayNet [38] P/M M Layouts of modern scientific documents PRImA [3) M - Layouts of scanned modern magazines and scientific reports Newspaper [17] P - Layouts of scanned US newspapers from the 20th century \u2018TableBank (18) P P Table region on modern scientific and business document HJDataset (31) | F/M - Layouts of history Japanese documents",
"metadata": {
"text_as_html": "
Dataset | | Base Model!| | Large Model | | Notes | PubLayNet [33] | P/M | M | Layouts of modern scientific documents |
PRImA [3] | M | | Layouts of scanned modern magazines and scientific reports |
Newspaper [17] | P | | Layouts of scanned US newspapers from the 20th century |
TableBank [18] | P | | Table region on modern scientific and business document |
HIDataset [31] | P/M | | Layouts of history Japanese documents |
",
- "table_as_cells": [
- {
- "x": 0,
- "y": 0,
- "w": 1,
- "h": 1,
- "content": "Dataset"
- },
- {
- "x": 0,
- "y": 1,
- "w": 1,
- "h": 1,
- "content": "PubLayNet [33]"
- },
- {
- "x": 0,
- "y": 2,
- "w": 1,
- "h": 1,
- "content": "PRImA [3]"
- },
- {
- "x": 0,
- "y": 3,
- "w": 1,
- "h": 1,
- "content": "Newspaper [17]"
- },
- {
- "x": 0,
- "y": 4,
- "w": 1,
- "h": 1,
- "content": "TableBank [18]"
- },
- {
- "x": 0,
- "y": 5,
- "w": 1,
- "h": 1,
- "content": "HIDataset [31]"
- },
- {
- "x": 1,
- "y": 0,
- "w": 1,
- "h": 1,
- "content": "| Base Model!|"
- },
- {
- "x": 1,
- "y": 1,
- "w": 1,
- "h": 1,
- "content": "P/M"
- },
- {
- "x": 1,
- "y": 2,
- "w": 1,
- "h": 1,
- "content": "M"
- },
- {
- "x": 1,
- "y": 3,
- "w": 1,
- "h": 1,
- "content": "P"
- },
- {
- "x": 1,
- "y": 4,
- "w": 1,
- "h": 1,
- "content": "P"
- },
- {
- "x": 1,
- "y": 5,
- "w": 1,
- "h": 1,
- "content": "P/M"
- },
- {
- "x": 2,
- "y": 0,
- "w": 1,
- "h": 1,
- "content": "Large Model"
- },
- {
- "x": 2,
- "y": 1,
- "w": 1,
- "h": 1,
- "content": "M"
- },
- {
- "x": 2,
- "y": 2,
- "w": 1,
- "h": 1,
- "content": ""
- },
- {
- "x": 2,
- "y": 3,
- "w": 1,
- "h": 1,
- "content": ""
- },
- {
- "x": 2,
- "y": 4,
- "w": 1,
- "h": 1,
- "content": ""
- },
- {
- "x": 2,
- "y": 5,
- "w": 1,
- "h": 1,
- "content": ""
- },
- {
- "x": 3,
- "y": 0,
- "w": 1,
- "h": 1,
- "content": "| Notes"
- },
- {
- "x": 3,
- "y": 1,
- "w": 1,
- "h": 1,
- "content": "Layouts of modern scientific documents"
- },
- {
- "x": 3,
- "y": 2,
- "w": 1,
- "h": 1,
- "content": "Layouts of scanned modern magazines and scientific reports"
- },
- {
- "x": 3,
- "y": 3,
- "w": 1,
- "h": 1,
- "content": "Layouts of scanned US newspapers from the 20th century"
- },
- {
- "x": 3,
- "y": 4,
- "w": 1,
- "h": 1,
- "content": "Table region on modern scientific and business document"
- },
- {
- "x": 3,
- "y": 5,
- "w": 1,
- "h": 1,
- "content": "Layouts of history Japanese documents"
- }
- ],
"filetype": "image/jpeg",
"languages": [
"eng"
diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.json
index 5ff6322270..03a9917a81 100644
--- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.json
+++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.json
@@ -841,134 +841,6 @@
"text": "Dataset Base Model1 Large Model Notes PubLayNet [38] PRImA [3] Newspaper [17] TableBank [18] HJDataset [31] F / M M F F F / M M - - F - Layouts of modern scienti\ufb01c documents Layouts of scanned modern magazines and scienti\ufb01c reports Layouts of scanned US newspapers from the 20th century Table region on modern scienti\ufb01c and business document Layouts of history Japanese documents",
"metadata": {
"text_as_html": "Dataset | | Base Model'| | | Notes | PubLayNet B8]| | F/M | Layouts of modern scientific documents |
PRImA | M | Layouts of scanned modern magazines and scientific report |
Newspaper | F | Layouts of scanned US newspapers from the 20th century |
TableBank | F | Table region on modern scientific and business document |
HJDataset | F/M | Layouts of history Japanese documents |
",
- "table_as_cells": [
- {
- "x": 0,
- "y": 0,
- "w": 1,
- "h": 1,
- "content": "Dataset"
- },
- {
- "x": 0,
- "y": 1,
- "w": 1,
- "h": 1,
- "content": "PubLayNet B8]|"
- },
- {
- "x": 0,
- "y": 2,
- "w": 1,
- "h": 1,
- "content": "PRImA"
- },
- {
- "x": 0,
- "y": 3,
- "w": 1,
- "h": 1,
- "content": "Newspaper"
- },
- {
- "x": 0,
- "y": 4,
- "w": 1,
- "h": 1,
- "content": "TableBank"
- },
- {
- "x": 0,
- "y": 5,
- "w": 1,
- "h": 1,
- "content": "HJDataset"
- },
- {
- "x": 1,
- "y": 0,
- "w": 1,
- "h": 1,
- "content": "| Base Model'|"
- },
- {
- "x": 1,
- "y": 1,
- "w": 1,
- "h": 1,
- "content": "F/M"
- },
- {
- "x": 1,
- "y": 2,
- "w": 1,
- "h": 1,
- "content": "M"
- },
- {
- "x": 1,
- "y": 3,
- "w": 1,
- "h": 1,
- "content": "F"
- },
- {
- "x": 1,
- "y": 4,
- "w": 1,
- "h": 1,
- "content": "F"
- },
- {
- "x": 1,
- "y": 5,
- "w": 1,
- "h": 1,
- "content": "F/M"
- },
- {
- "x": 2,
- "y": 0,
- "w": 1,
- "h": 1,
- "content": "| Notes"
- },
- {
- "x": 2,
- "y": 1,
- "w": 1,
- "h": 1,
- "content": "Layouts of modern scientific documents"
- },
- {
- "x": 2,
- "y": 2,
- "w": 1,
- "h": 1,
- "content": "Layouts of scanned modern magazines and scientific report"
- },
- {
- "x": 2,
- "y": 3,
- "w": 1,
- "h": 1,
- "content": "Layouts of scanned US newspapers from the 20th century"
- },
- {
- "x": 2,
- "y": 4,
- "w": 1,
- "h": 1,
- "content": "Table region on modern scientific and business document"
- },
- {
- "x": 2,
- "y": 5,
- "w": 1,
- "h": 1,
- "content": "Layouts of history Japanese documents"
- }
- ],
"filetype": "application/pdf",
"languages": [
"eng"
@@ -1520,260 +1392,6 @@
"text": "Operation Name Description block.pad(top, bottom, right, left) Enlarge the current block according to the input block.scale(fx, fy) Scale the current block given the ratio in x and y direction block.shift(dx, dy) Move the current block with the shift distances in x and y direction block1.is in(block2) Whether block1 is inside of block2 block1.intersect(block2) Return the intersection region of block1 and block2. Coordinate type to be determined based on the inputs. block1.union(block2) Return the union region of block1 and block2. Coordinate type to be determined based on the inputs. block1.relative to(block2) Convert the absolute coordinates of block1 to relative coordinates to block2 block1.condition on(block2) Calculate the absolute coordinates of block1 given the canvas block2\u2019s absolute coordinates block.crop image(image) Obtain the image segments in the block region",
"metadata": {
"text_as_html": "block.pad(top, bottom, | right, | left) | Enlarge the current block according to the input | block.scale(fx, fy) | | | Scale the current block given the ratio in x and y direction |
block.shift(dx, dy) | | | Move the current block with the shift distances in x and y direction |
block1.is_in(block2) | | | Whether block] is inside of block2 |
block1. intersect (block2) | | | Return the intersection region of blockl and block2. Coordinate type to be determined based on the inputs |
block1.union(block2) | | | Return the union region of blockl and block2. Coordinate type to be determined based on the inputs |
block1.relative_to(block2) | | | Convert the absolute coordinates of block to relative coordinates to block2 |
block1.condition_on(block2) | | | Calculate the absolute coordinates of blockl given the canvas block2\u2019s absolute coordinates |
block. crop_image (image) | | | Obtain the image segments in the block region |
",
- "table_as_cells": [
- {
- "x": 0,
- "y": 0,
- "w": 1,
- "h": 1,
- "content": "block.pad(top, bottom,"
- },
- {
- "x": 0,
- "y": 1,
- "w": 1,
- "h": 1,
- "content": "block.scale(fx, fy)"
- },
- {
- "x": 0,
- "y": 2,
- "w": 1,
- "h": 1,
- "content": "block.shift(dx, dy)"
- },
- {
- "x": 0,
- "y": 3,
- "w": 1,
- "h": 1,
- "content": "block1.is_in(block2)"
- },
- {
- "x": 0,
- "y": 4,
- "w": 1,
- "h": 1,
- "content": "block1. intersect (block2)"
- },
- {
- "x": 0,
- "y": 5,
- "w": 1,
- "h": 1,
- "content": "block1.union(block2)"
- },
- {
- "x": 0,
- "y": 6,
- "w": 1,
- "h": 1,
- "content": "block1.relative_to(block2)"
- },
- {
- "x": 0,
- "y": 7,
- "w": 1,
- "h": 1,
- "content": "block1.condition_on(block2)"
- },
- {
- "x": 0,
- "y": 8,
- "w": 1,
- "h": 1,
- "content": "block. crop_image (image)"
- },
- {
- "x": 1,
- "y": 0,
- "w": 1,
- "h": 1,
- "content": "right,"
- },
- {
- "x": 1,
- "y": 1,
- "w": 1,
- "h": 1,
- "content": ""
- },
- {
- "x": 1,
- "y": 2,
- "w": 1,
- "h": 1,
- "content": ""
- },
- {
- "x": 1,
- "y": 3,
- "w": 1,
- "h": 1,
- "content": ""
- },
- {
- "x": 1,
- "y": 4,
- "w": 1,
- "h": 1,
- "content": ""
- },
- {
- "x": 1,
- "y": 5,
- "w": 1,
- "h": 1,
- "content": ""
- },
- {
- "x": 1,
- "y": 6,
- "w": 1,
- "h": 1,
- "content": ""
- },
- {
- "x": 1,
- "y": 7,
- "w": 1,
- "h": 1,
- "content": ""
- },
- {
- "x": 1,
- "y": 8,
- "w": 1,
- "h": 1,
- "content": ""
- },
- {
- "x": 2,
- "y": 0,
- "w": 1,
- "h": 1,
- "content": "left)"
- },
- {
- "x": 2,
- "y": 1,
- "w": 1,
- "h": 1,
- "content": ""
- },
- {
- "x": 2,
- "y": 2,
- "w": 1,
- "h": 1,
- "content": ""
- },
- {
- "x": 2,
- "y": 3,
- "w": 1,
- "h": 1,
- "content": ""
- },
- {
- "x": 2,
- "y": 4,
- "w": 1,
- "h": 1,
- "content": ""
- },
- {
- "x": 2,
- "y": 5,
- "w": 1,
- "h": 1,
- "content": ""
- },
- {
- "x": 2,
- "y": 6,
- "w": 1,
- "h": 1,
- "content": ""
- },
- {
- "x": 2,
- "y": 7,
- "w": 1,
- "h": 1,
- "content": ""
- },
- {
- "x": 2,
- "y": 8,
- "w": 1,
- "h": 1,
- "content": ""
- },
- {
- "x": 3,
- "y": 0,
- "w": 1,
- "h": 1,
- "content": "Enlarge the current block according to the input"
- },
- {
- "x": 3,
- "y": 1,
- "w": 1,
- "h": 1,
- "content": "Scale the current block given the ratio in x and y direction"
- },
- {
- "x": 3,
- "y": 2,
- "w": 1,
- "h": 1,
- "content": "Move the current block with the shift distances in x and y direction"
- },
- {
- "x": 3,
- "y": 3,
- "w": 1,
- "h": 1,
- "content": "Whether block] is inside of block2"
- },
- {
- "x": 3,
- "y": 4,
- "w": 1,
- "h": 1,
- "content": "Return the intersection region of blockl and block2. Coordinate type to be determined based on the inputs"
- },
- {
- "x": 3,
- "y": 5,
- "w": 1,
- "h": 1,
- "content": "Return the union region of blockl and block2. Coordinate type to be determined based on the inputs"
- },
- {
- "x": 3,
- "y": 6,
- "w": 1,
- "h": 1,
- "content": "Convert the absolute coordinates of block to relative coordinates to block2"
- },
- {
- "x": 3,
- "y": 7,
- "w": 1,
- "h": 1,
- "content": "Calculate the absolute coordinates of blockl given the canvas block2\u2019s absolute coordinates"
- },
- {
- "x": 3,
- "y": 8,
- "w": 1,
- "h": 1,
- "content": "Obtain the image segments in the block region"
- }
- ],
"filetype": "application/pdf",
"languages": [
"eng"
diff --git a/unstructured/partition/pdf_image/ocr.py b/unstructured/partition/pdf_image/ocr.py
index e4017fc4dc..ed8740bba5 100644
--- a/unstructured/partition/pdf_image/ocr.py
+++ b/unstructured/partition/pdf_image/ocr.py
@@ -253,7 +253,7 @@ def supplement_element_with_table_extraction(
"""Supplement the existing layout with table extraction. Any Table elements
that are extracted will have a metadata fields "text_as_html" where
the table's text content is rendered into a html string and "table_as_cells"
- with the raw table cells output from table agent
+ with the raw table cells output from table agent if env_config.EXTRACT_TABLE_AS_CELLS is True
"""
from unstructured_inference.models.tables import cells_to_html
@@ -279,13 +279,15 @@ def supplement_element_with_table_extraction(
tatr_cells = tables_agent.predict(
cropped_image, ocr_tokens=table_tokens, result_format="cells"
)
- text_as_html = cells_to_html(tatr_cells)
- simple_table_cells = [
- SimpleTableCell.from_table_transformer_cell(cell).to_dict() for cell in tatr_cells
- ]
+ text_as_html = cells_to_html(tatr_cells)
element.text_as_html = text_as_html
- element.table_as_cells = simple_table_cells
+
+ if env_config.EXTRACT_TABLE_AS_CELLS:
+ simple_table_cells = [
+ SimpleTableCell.from_table_transformer_cell(cell).to_dict() for cell in tatr_cells
+ ]
+ element.table_as_cells = simple_table_cells
return elements
diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py
index f2b75e8026..270f94bfc1 100644
--- a/unstructured/partition/utils/config.py
+++ b/unstructured/partition/utils/config.py
@@ -116,6 +116,11 @@ def EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD(self) -> int:
"""
return self._get_int("EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD", 0)
+ @property
+ def EXTRACT_TABLE_AS_CELLS(self) -> bool:
+ """adds `table_as_cells` to a Table element's metadata when it is True"""
+ return self._get_bool("EXTRACT_TABLE_AS_CELLS", False)
+
@property
def OCR_LAYOUT_SUBREGION_THRESHOLD(self) -> float:
"""threshold to determine if an OCR region is a sub-region of a given block