From 1dede5029dd51a9c27d6668479cba1611141461c Mon Sep 17 00:00:00 2001 From: Christine Straub Date: Mon, 3 Jun 2024 11:49:38 -0700 Subject: [PATCH] fix: parsing pdf error - new_cells as str has no "copy" (#3130) Closes #3119. When a table is not recognized, `tatr_cells` is an empty string rather than a collection of cells; in that case `text_as_html` is now set to an empty string instead of passing the value to `cells_to_html`. ### Testing Parsing the provided PDF should be successful. [testing_brochure_2.pdf](https://github.com/user-attachments/files/15518094/testing_brochure_2.pdf) ``` filename = "testing_brochure_2.pdf" with open(filename, "rb") as pdf_content: elements = partition_pdf( file=pdf_content, infer_table_structure=True, extract_image_block_types=["Image", "Table"], chunking_strategy="by_title", max_characters=1000, new_after_n_chars=3000, combine_text_under_n_chars=1000, ) print("\n\n".join([str(el) for el in elements])) ``` --- CHANGELOG.md | 3 ++- unstructured/__version__.py | 2 +- unstructured/partition/pdf_image/ocr.py | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 51f9cbf5b9..3337a69957 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.14.4-dev6 +## 0.14.4 ### Enhancements @@ -12,6 +12,7 @@ ### Fixes +* **Address the issue of unrecognized tables in `UnstructuredTableTransformerModel`** When a table is not recognized, the `element.metadata.text_as_html` attribute is set to an empty string. * **Remove root handlers in ingest logger**. Removes root handlers in ingest loggers to ensure secrets aren't accidentally exposed in Colab notebooks. * **Fix V2 S3 Destination Connector authentication** Fixes bugs with S3 Destination Connector where the connection config was neither registered nor properly deserialized. * **Clarified dependence on particular version of `python-docx`** Pinned `python-docx` version to ensure a particular method `unstructured` uses is included. 
diff --git a/unstructured/__version__.py b/unstructured/__version__.py index c14a58bf38..9398730982 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.14.4-dev6" # pragma: no cover +__version__ = "0.14.4" # pragma: no cover diff --git a/unstructured/partition/pdf_image/ocr.py b/unstructured/partition/pdf_image/ocr.py index ed8740bba5..39ca6f995e 100644 --- a/unstructured/partition/pdf_image/ocr.py +++ b/unstructured/partition/pdf_image/ocr.py @@ -280,7 +280,8 @@ def supplement_element_with_table_extraction( cropped_image, ocr_tokens=table_tokens, result_format="cells" ) - text_as_html = cells_to_html(tatr_cells) + # NOTE(christine): `tatr_cells == ""` means that the table was not recognized + text_as_html = "" if tatr_cells == "" else cells_to_html(tatr_cells) element.text_as_html = text_as_html if env_config.EXTRACT_TABLE_AS_CELLS: