From 1dede5029dd51a9c27d6668479cba1611141461c Mon Sep 17 00:00:00 2001
From: Christine Straub <christinemstraub@gmail.com>
Date: Mon, 3 Jun 2024 11:49:38 -0700
Subject: [PATCH] fix: parsing pdf error - new_cells as str has no "copy"
 (#3130)

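Closes #3119.

When the table transformer model does not recognize a table, `tatr_cells` is
an empty string instead of table cells, and passing it to `cells_to_html`
fails with the error in the subject line. In that case `text_as_html` is now
set to an empty string and `cells_to_html` is not called.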

### Testing
Parsing the provided PDF should be successful.


[testing_brochure_2.pdf](https://github.com/user-attachments/files/15518094/testing_brochure_2.pdf)
```python
from unstructured.partition.pdf import partition_pdf

filename = "testing_brochure_2.pdf"
with open(filename, "rb") as pdf_content:
    elements = partition_pdf(
        file=pdf_content,
        infer_table_structure=True,
        extract_image_block_types=["Image", "Table"],
        chunking_strategy="by_title",
        max_characters=1000,
        new_after_n_chars=3000,
        combine_text_under_n_chars=1000,
    )
print("\n\n".join([str(el) for el in elements]))
```
---
 CHANGELOG.md                            | 3 ++-
 unstructured/__version__.py             | 2 +-
 unstructured/partition/pdf_image/ocr.py | 3 ++-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 51f9cbf5b9..3337a69957 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.14.4-dev6
+## 0.14.4
 
 ### Enhancements
 
@@ -12,6 +12,7 @@
 
 ### Fixes
 
+* **Address the issue of unrecognized tables in `UnstructuredTableTransformerModel`**. When a table is not recognized, the `element.metadata.text_as_html` attribute is now set to an empty string instead of PDF parsing failing with an error.
 * **Remove root handlers in ingest logger**. Removes root handlers in ingest loggers to ensure secrets aren't accidentally exposed in Colab notebooks.
 * **Fix V2 S3 Destination Connector authentication** Fixes bugs with S3 Destination Connector where the connection config was neither registered nor properly deserialized.
 * **Clarified dependence on particular version of `python-docx`** Pinned `python-docx` version to ensure a particular method `unstructured` uses is included.
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index c14a58bf38..9398730982 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.14.4-dev6"  # pragma: no cover
+__version__ = "0.14.4"  # pragma: no cover
diff --git a/unstructured/partition/pdf_image/ocr.py b/unstructured/partition/pdf_image/ocr.py
index ed8740bba5..39ca6f995e 100644
--- a/unstructured/partition/pdf_image/ocr.py
+++ b/unstructured/partition/pdf_image/ocr.py
@@ -280,7 +280,8 @@ def supplement_element_with_table_extraction(
             cropped_image, ocr_tokens=table_tokens, result_format="cells"
         )
 
-        text_as_html = cells_to_html(tatr_cells)
+        # NOTE(christine): `tatr_cells == ""` means that the table was not recognized
+        text_as_html = "" if tatr_cells == "" else cells_to_html(tatr_cells)
         element.text_as_html = text_as_html
 
         if env_config.EXTRACT_TABLE_AS_CELLS: