diff --git a/CHANGELOG.md b/CHANGELOG.md index af54982fad..2abf7d8179 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.14.3-dev3 +## 0.14.3-dev4 ### Enhancements @@ -11,7 +11,8 @@ * **Turn off XML resolve entities** Sets `resolve_entities=False` for XML parsing with `lxml` to avoid text being dynamically injected into the XML document. -* Add the missing `form_extraction_skip_tables` argument to the `partition_pdf_or_image` call. +* **Add backward compatibility for the deprecated `pdf_infer_table_structure` parameter**. +* **Add the missing `form_extraction_skip_tables` argument to the `partition_pdf_or_image` call**. * **Chromadb change from Add to Upsert using element_id to make idempotent** * **Diable `table_as_cells` output by default** to reduce overhead in partition; now `table_as_cells` is only produced when the env `EXTACT_TABLE_AS_CELLS` is `true` diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index d7b837e547..afcbd14f7e 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -350,7 +350,7 @@ def test_auto_partition_pdf_uses_table_extraction(): "unstructured.partition.pdf_image.ocr.process_file_with_ocr", ) as mock_process_file_with_model: partition(filename, pdf_infer_table_structure=True, strategy=PartitionStrategy.HI_RES) - assert mock_process_file_with_model.call_args[1]["infer_table_structure"] is False + assert mock_process_file_with_model.call_args[1]["infer_table_structure"] def test_auto_partition_pdf_with_fast_strategy(monkeypatch): diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.json index e7dc78f839..5ff6322270 100644 --- 
a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.json @@ -840,6 +840,135 @@ "element_id": "2a62c55be8401908c18140e858ec3345", "text": "Dataset Base Model1 Large Model Notes PubLayNet [38] PRImA [3] Newspaper [17] TableBank [18] HJDataset [31] F / M M F F F / M M - - F - Layouts of modern scienti\ufb01c documents Layouts of scanned modern magazines and scienti\ufb01c reports Layouts of scanned US newspapers from the 20th century Table region on modern scienti\ufb01c and business document Layouts of history Japanese documents", "metadata": { + "text_as_html": "
Dataset| Base Model'|| Notes
PubLayNet B8]|F/MLayouts of modern scientific documents
PRImAMLayouts of scanned modern magazines and scientific report
NewspaperFLayouts of scanned US newspapers from the 20th century
TableBankFTable region on modern scientific and business document
HJDatasetF/MLayouts of history Japanese documents
", + "table_as_cells": [ + { + "x": 0, + "y": 0, + "w": 1, + "h": 1, + "content": "Dataset" + }, + { + "x": 0, + "y": 1, + "w": 1, + "h": 1, + "content": "PubLayNet B8]|" + }, + { + "x": 0, + "y": 2, + "w": 1, + "h": 1, + "content": "PRImA" + }, + { + "x": 0, + "y": 3, + "w": 1, + "h": 1, + "content": "Newspaper" + }, + { + "x": 0, + "y": 4, + "w": 1, + "h": 1, + "content": "TableBank" + }, + { + "x": 0, + "y": 5, + "w": 1, + "h": 1, + "content": "HJDataset" + }, + { + "x": 1, + "y": 0, + "w": 1, + "h": 1, + "content": "| Base Model'|" + }, + { + "x": 1, + "y": 1, + "w": 1, + "h": 1, + "content": "F/M" + }, + { + "x": 1, + "y": 2, + "w": 1, + "h": 1, + "content": "M" + }, + { + "x": 1, + "y": 3, + "w": 1, + "h": 1, + "content": "F" + }, + { + "x": 1, + "y": 4, + "w": 1, + "h": 1, + "content": "F" + }, + { + "x": 1, + "y": 5, + "w": 1, + "h": 1, + "content": "F/M" + }, + { + "x": 2, + "y": 0, + "w": 1, + "h": 1, + "content": "| Notes" + }, + { + "x": 2, + "y": 1, + "w": 1, + "h": 1, + "content": "Layouts of modern scientific documents" + }, + { + "x": 2, + "y": 2, + "w": 1, + "h": 1, + "content": "Layouts of scanned modern magazines and scientific report" + }, + { + "x": 2, + "y": 3, + "w": 1, + "h": 1, + "content": "Layouts of scanned US newspapers from the 20th century" + }, + { + "x": 2, + "y": 4, + "w": 1, + "h": 1, + "content": "Table region on modern scientific and business document" + }, + { + "x": 2, + "y": 5, + "w": 1, + "h": 1, + "content": "Layouts of history Japanese documents" + } + ], "filetype": "application/pdf", "languages": [ "eng" @@ -1390,6 +1519,261 @@ "element_id": "64bc79d1132a89c71837f420d6e4e2dc", "text": "Operation Name Description block.pad(top, bottom, right, left) Enlarge the current block according to the input block.scale(fx, fy) Scale the current block given the ratio in x and y direction block.shift(dx, dy) Move the current block with the shift distances in x and y direction block1.is in(block2) Whether block1 is inside of block2 
block1.intersect(block2) Return the intersection region of block1 and block2. Coordinate type to be determined based on the inputs. block1.union(block2) Return the union region of block1 and block2. Coordinate type to be determined based on the inputs. block1.relative to(block2) Convert the absolute coordinates of block1 to relative coordinates to block2 block1.condition on(block2) Calculate the absolute coordinates of block1 given the canvas block2\u2019s absolute coordinates block.crop image(image) Obtain the image segments in the block region", "metadata": { + "text_as_html": "
block.pad(top, bottom,right,left)Enlarge the current block according to the input
block.scale(fx, fy)Scale the current block given the ratio in x and y direction
block.shift(dx, dy)Move the current block with the shift distances in x and y direction
block1.is_in(block2)Whether block] is inside of block2
block1. intersect (block2)Return the intersection region of blockl and block2. Coordinate type to be determined based on the inputs
block1.union(block2)Return the union region of blockl and block2. Coordinate type to be determined based on the inputs
block1.relative_to(block2)Convert the absolute coordinates of block to relative coordinates to block2
block1.condition_on(block2)Calculate the absolute coordinates of blockl given the canvas block2\u2019s absolute coordinates
block. crop_image (image)Obtain the image segments in the block region
", + "table_as_cells": [ + { + "x": 0, + "y": 0, + "w": 1, + "h": 1, + "content": "block.pad(top, bottom," + }, + { + "x": 0, + "y": 1, + "w": 1, + "h": 1, + "content": "block.scale(fx, fy)" + }, + { + "x": 0, + "y": 2, + "w": 1, + "h": 1, + "content": "block.shift(dx, dy)" + }, + { + "x": 0, + "y": 3, + "w": 1, + "h": 1, + "content": "block1.is_in(block2)" + }, + { + "x": 0, + "y": 4, + "w": 1, + "h": 1, + "content": "block1. intersect (block2)" + }, + { + "x": 0, + "y": 5, + "w": 1, + "h": 1, + "content": "block1.union(block2)" + }, + { + "x": 0, + "y": 6, + "w": 1, + "h": 1, + "content": "block1.relative_to(block2)" + }, + { + "x": 0, + "y": 7, + "w": 1, + "h": 1, + "content": "block1.condition_on(block2)" + }, + { + "x": 0, + "y": 8, + "w": 1, + "h": 1, + "content": "block. crop_image (image)" + }, + { + "x": 1, + "y": 0, + "w": 1, + "h": 1, + "content": "right," + }, + { + "x": 1, + "y": 1, + "w": 1, + "h": 1, + "content": "" + }, + { + "x": 1, + "y": 2, + "w": 1, + "h": 1, + "content": "" + }, + { + "x": 1, + "y": 3, + "w": 1, + "h": 1, + "content": "" + }, + { + "x": 1, + "y": 4, + "w": 1, + "h": 1, + "content": "" + }, + { + "x": 1, + "y": 5, + "w": 1, + "h": 1, + "content": "" + }, + { + "x": 1, + "y": 6, + "w": 1, + "h": 1, + "content": "" + }, + { + "x": 1, + "y": 7, + "w": 1, + "h": 1, + "content": "" + }, + { + "x": 1, + "y": 8, + "w": 1, + "h": 1, + "content": "" + }, + { + "x": 2, + "y": 0, + "w": 1, + "h": 1, + "content": "left)" + }, + { + "x": 2, + "y": 1, + "w": 1, + "h": 1, + "content": "" + }, + { + "x": 2, + "y": 2, + "w": 1, + "h": 1, + "content": "" + }, + { + "x": 2, + "y": 3, + "w": 1, + "h": 1, + "content": "" + }, + { + "x": 2, + "y": 4, + "w": 1, + "h": 1, + "content": "" + }, + { + "x": 2, + "y": 5, + "w": 1, + "h": 1, + "content": "" + }, + { + "x": 2, + "y": 6, + "w": 1, + "h": 1, + "content": "" + }, + { + "x": 2, + "y": 7, + "w": 1, + "h": 1, + "content": "" + }, + { + "x": 2, + "y": 8, + "w": 1, + "h": 1, + "content": "" + }, + { 
+ "x": 3, + "y": 0, + "w": 1, + "h": 1, + "content": "Enlarge the current block according to the input" + }, + { + "x": 3, + "y": 1, + "w": 1, + "h": 1, + "content": "Scale the current block given the ratio in x and y direction" + }, + { + "x": 3, + "y": 2, + "w": 1, + "h": 1, + "content": "Move the current block with the shift distances in x and y direction" + }, + { + "x": 3, + "y": 3, + "w": 1, + "h": 1, + "content": "Whether block] is inside of block2" + }, + { + "x": 3, + "y": 4, + "w": 1, + "h": 1, + "content": "Return the intersection region of blockl and block2. Coordinate type to be determined based on the inputs" + }, + { + "x": 3, + "y": 5, + "w": 1, + "h": 1, + "content": "Return the union region of blockl and block2. Coordinate type to be determined based on the inputs" + }, + { + "x": 3, + "y": 6, + "w": 1, + "h": 1, + "content": "Convert the absolute coordinates of block to relative coordinates to block2" + }, + { + "x": 3, + "y": 7, + "w": 1, + "h": 1, + "content": "Calculate the absolute coordinates of blockl given the canvas block2\u2019s absolute coordinates" + }, + { + "x": 3, + "y": 8, + "w": 1, + "h": 1, + "content": "Obtain the image segments in the block region" + } + ], "filetype": "application/pdf", "languages": [ "eng" diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 3915e31193..a1c7ac250b 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.14.3-dev3" # pragma: no cover +__version__ = "0.14.3-dev4" # pragma: no cover diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py index 71b888b0f3..544ef345e2 100644 --- a/unstructured/partition/auto.py +++ b/unstructured/partition/auto.py @@ -589,6 +589,6 @@ def decide_table_extraction( # completely and rely exclusively on `skip_infer_table_types` for all file types. # Until then for pdf files we first check pdf_infer_table_structure and then update # based on skip_infer_tables. 
- return pdf_infer_table_structure and doc_type not in skip_infer_table_types + return pdf_infer_table_structure or doc_type not in skip_infer_table_types return doc_type not in skip_infer_table_types