From b517efa92c73d50ae2c979e10df46c939487730d Mon Sep 17 00:00:00 2001 From: Matt Robinson <mrobinson@unstructuredai.io> Date: Thu, 30 May 2024 13:19:55 -0400 Subject: [PATCH 1/5] enhancement: add tar filters for py3.12 --- .../ingest/utils/test_compression.py | 15 +++++++++++++++ unstructured/ingest/utils/compression.py | 6 ++++++ 2 files changed, 21 insertions(+) create mode 100644 test_unstructured/ingest/utils/test_compression.py diff --git a/test_unstructured/ingest/utils/test_compression.py b/test_unstructured/ingest/utils/test_compression.py new file mode 100644 index 0000000000..7699a385ee --- /dev/null +++ b/test_unstructured/ingest/utils/test_compression.py @@ -0,0 +1,15 @@ +import os +import tarfile + +from unstructured.ingest.utils.compression import uncompress_tar_file + + +def test_uncompress_tar_file(tmpdir): + tar_filename = os.path.join(tmpdir, "test.tar") + filename = "example-docs/fake-text.txt" + + with tarfile.open(tar_filename, "w:gz") as tar: + tar.add(filename, arcname=os.path.basename(filename)) + + path = uncompress_tar_file(tar_filename, path=tmpdir.dirname) + assert path == tmpdir.dirname diff --git a/unstructured/ingest/utils/compression.py b/unstructured/ingest/utils/compression.py index a1d0bfada4..627f5d861d 100644 --- a/unstructured/ingest/utils/compression.py +++ b/unstructured/ingest/utils/compression.py @@ -1,5 +1,6 @@ import copy import os +import sys import tarfile import zipfile from dataclasses import dataclass @@ -63,6 +64,11 @@ def uncompress_tar_file(tar_filename: str, path: Optional[str] = None) -> str: path = path if path else os.path.join(head, f"{tail}-tar-uncompressed") logger.info(f"extracting tar {tar_filename} -> {path}") with tarfile.open(tar_filename, "r:gz") as tfile: + # NOTE(robinson): Mitigate against malicious content being extracted from the tar file. 
+ # This was added in Python 3.12 + # Ref: https://docs.python.org/3/library/tarfile.html#extraction-filters + if sys.version_info >= (3, 12): + tfile.extraction_filter = tar_file.tar_filter tfile.extractall(path=path) return path From 754eb36f6b0e89519e0d5b13a3a1eaee8af9ba83 Mon Sep 17 00:00:00 2001 From: Matt Robinson <mrobinson@unstructuredai.io> Date: Thu, 30 May 2024 13:21:45 -0400 Subject: [PATCH 2/5] changelog and version --- CHANGELOG.md | 4 +++- unstructured/__version__.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 73725edd1f..7a2f223fe3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,9 @@ -## 0.14.4-dev2 +## 0.14.4-dev3 ### Enhancements +* **Filtering for tar extraction** Adds tar filtering to the compression module for connectors to avoid decompression malicious content in `.tar.gz` files. This was added to the Python standard library in Python 3.12. The change only applies when using Python 3.12. + ### Features ### Fixes diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 469f3223a0..2846481ea6 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.14.4-dev2" # pragma: no cover +__version__ = "0.14.4-dev3" # pragma: no cover From 81d5c423e956ac31165bc231be860796948257b7 Mon Sep 17 00:00:00 2001 From: Matt Robinson <mrobinson@unstructuredai.io> Date: Thu, 30 May 2024 13:24:55 -0400 Subject: [PATCH 3/5] fix typo --- unstructured/ingest/utils/compression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured/ingest/utils/compression.py b/unstructured/ingest/utils/compression.py index 627f5d861d..68d53c4a51 100644 --- a/unstructured/ingest/utils/compression.py +++ b/unstructured/ingest/utils/compression.py @@ -68,7 +68,7 @@ def uncompress_tar_file(tar_filename: str, path: Optional[str] = None) -> str: # This was added in Python 3.12 # Ref: 
https://docs.python.org/3/library/tarfile.html#extraction-filters if sys.version_info >= (3, 12): - tfile.extraction_filter = tar_file.tar_filter + tfile.extraction_filter = tarfile.tar_filter tfile.extractall(path=path) return path From b5fed0682f69cef5d1a835a7a39a390a814957bc Mon Sep 17 00:00:00 2001 From: Matt Robinson <mrobinson@unstructuredai.io> Date: Thu, 30 May 2024 13:25:17 -0400 Subject: [PATCH 4/5] changelog tweak --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7a2f223fe3..8028a67bd3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,7 @@ ### Enhancements -* **Filtering for tar extraction** Adds tar filtering to the compression module for connectors to avoid decompression malicious content in `.tar.gz` files. This was added to the Python standard library in Python 3.12. The change only applies when using Python 3.12. +* **Filtering for tar extraction** Adds tar filtering to the compression module for connectors to avoid decompressing malicious content in `.tar.gz` files. This was added to the Python `tarfile` lib in Python 3.12. The change only applies when using Python 3.12 and above. 
### Features From 3cce997274d319e89fa0fae881c6c9ef124bb176 Mon Sep 17 00:00:00 2001 From: Matt Robinson <mrobinson@unstructuredai.io> Date: Thu, 30 May 2024 13:31:20 -0400 Subject: [PATCH 5/5] add warning for earlier python versions --- unstructured/ingest/utils/compression.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/unstructured/ingest/utils/compression.py b/unstructured/ingest/utils/compression.py index 68d53c4a51..41f4b32406 100644 --- a/unstructured/ingest/utils/compression.py +++ b/unstructured/ingest/utils/compression.py @@ -69,6 +69,12 @@ def uncompress_tar_file(tar_filename: str, path: Optional[str] = None) -> str: # Ref: https://docs.python.org/3/library/tarfile.html#extraction-filters if sys.version_info >= (3, 12): tfile.extraction_filter = tarfile.tar_filter + else: + logger.warning( + "Extraction filtering for tar files is available for Python 3.12 and above. " + "Consider upgrading your Python version to improve security. " + "See https://docs.python.org/3/library/tarfile.html#extraction-filters" + ) tfile.extractall(path=path) return path