diff --git a/CHANGELOG.md b/CHANGELOG.md index 9256b7e3bd..4be12648cf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.15.14-dev4 +## 0.15.14-dev5 ### Enhancements @@ -11,6 +11,7 @@ * **Update Python SDK usage in `partition_via_api`.** Make a minor syntax change to ensure forward compatibility with the upcoming 0.26.0 Python SDK. * **Remove "unused" `date_from_file_object` parameter.** As part of simplifying partitioning parameter set, remove `date_from_file_object` parameter. A file object does not have a last-modified date attribute so can never give a useful value. When a file-object is used as the document source (such as in Unstructured API) the last-modified date must come from the `metadata_last_modified` argument. * **Fix occasional `KeyError` when mapping parent ids to hash ids.** Occasionally the input elements into `assign_and_map_hash_ids` can contain duplicated element instances, which lead to error when mapping parent id. +* **Allow empty text files.** Fixes an issue where text files with only whitespace would fail to be partitioned. 
## 0.15.13 diff --git a/example-docs/fake-text-all-whitespace.txt b/example-docs/fake-text-all-whitespace.txt new file mode 100644 index 0000000000..b28b04f643 --- /dev/null +++ b/example-docs/fake-text-all-whitespace.txt @@ -0,0 +1,3 @@ + + + diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index 3f28177467..c43bd52406 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -749,22 +749,30 @@ def test_auto_partition_tsv_from_filename(): # ================================================================================================ # TXT # ================================================================================================ - - -def test_auto_partition_text_from_filename(): - file_path = example_doc_path("fake-text.txt") +@pytest.mark.parametrize( + ("filename", "expected_elements"), + [ + ( + "fake-text.txt", + [ + NarrativeText(text="This is a test document to use for unit tests."), + Address(text="Doylestown, PA 18901"), + Title(text="Important points:"), + ListItem(text="Hamburgers are delicious"), + ListItem(text="Dogs are the best"), + ListItem(text="I love fuzzy blankets"), + ], + ), + ("fake-text-all-whitespace.txt", []), + ], +) +def test_auto_partition_text_from_filename(filename: str, expected_elements: list[Element]): + file_path = example_doc_path(filename) elements = partition(filename=file_path, strategy=PartitionStrategy.HI_RES) - assert elements == [ - NarrativeText(text="This is a test document to use for unit tests."), - Address(text="Doylestown, PA 18901"), - Title(text="Important points:"), - ListItem(text="Hamburgers are delicious"), - ListItem(text="Dogs are the best"), - ListItem(text="I love fuzzy blankets"), - ] - assert all(e.metadata.filename == "fake-text.txt" for e in elements) + assert elements == expected_elements + assert all(e.metadata.filename == filename for e in elements) assert all(e.metadata.file_directory == 
example_doc_path("") for e in elements) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 18a138fccd..5809122b14 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.14-dev4" # pragma: no cover +__version__ = "0.15.14-dev5" # pragma: no cover diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index 56d58ef507..d109cd7384 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -601,7 +601,7 @@ def _is_json(self) -> bool: text_head = self._ctx.text_head # -- an empty file is not JSON -- - if not text_head: + if not text_head.lstrip(): return False # -- has to be a list or object, no string, number, or bool --