diff --git a/CHANGELOG.md b/CHANGELOG.md index 2e77a4db5c..6b03c08f8b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,8 @@ ### Enhancements +* **Filtering for tar extraction** Adds tar filtering to the compression module for connectors to avoid decompressing malicious content in `.tar.gz` files. This was added to the Python `tarfile` lib in Python 3.12. The change only applies when using Python 3.12 and above. + ### Features ### Fixes diff --git a/test_unstructured/ingest/utils/test_compression.py b/test_unstructured/ingest/utils/test_compression.py new file mode 100644 index 0000000000..7699a385ee --- /dev/null +++ b/test_unstructured/ingest/utils/test_compression.py @@ -0,0 +1,15 @@ +import os +import tarfile + +from unstructured.ingest.utils.compression import uncompress_tar_file + + +def test_uncompress_tar_file(tmpdir): + tar_filename = os.path.join(tmpdir, "test.tar") + filename = "example-docs/fake-text.txt" + + with tarfile.open(tar_filename, "w:gz") as tar: + tar.add(filename, arcname=os.path.basename(filename)) + + path = uncompress_tar_file(tar_filename, path=tmpdir.dirname) + assert path == tmpdir.dirname diff --git a/unstructured/ingest/utils/compression.py b/unstructured/ingest/utils/compression.py index a1d0bfada4..41f4b32406 100644 --- a/unstructured/ingest/utils/compression.py +++ b/unstructured/ingest/utils/compression.py @@ -1,5 +1,6 @@ import copy import os +import sys import tarfile import zipfile from dataclasses import dataclass @@ -63,6 +64,17 @@ def uncompress_tar_file(tar_filename: str, path: Optional[str] = None) -> str: path = path if path else os.path.join(head, f"{tail}-tar-uncompressed") logger.info(f"extracting tar {tar_filename} -> {path}") with tarfile.open(tar_filename, "r:gz") as tfile: + # NOTE(robinson): Mitigate against malicious content being extracted from the tar file. 
+ # This was added in Python 3.12 + # Ref: https://docs.python.org/3/library/tarfile.html#extraction-filters + if sys.version_info >= (3, 12): + tfile.extraction_filter = tarfile.tar_filter + else: + logger.warning( + "Extraction filtering for tar files is available for Python 3.12 and above. " + "Consider upgrading your Python version to improve security. " + "See https://docs.python.org/3/library/tarfile.html#extraction-filters" + ) tfile.extractall(path=path) return path