From b517efa92c73d50ae2c979e10df46c939487730d Mon Sep 17 00:00:00 2001
From: Matt Robinson <mrobinson@unstructuredai.io>
Date: Thu, 30 May 2024 13:19:55 -0400
Subject: [PATCH 1/5] enhancement: add tar filters for py3.12

---
 .../ingest/utils/test_compression.py              | 15 +++++++++++++++
 unstructured/ingest/utils/compression.py          |  6 ++++++
 2 files changed, 21 insertions(+)
 create mode 100644 test_unstructured/ingest/utils/test_compression.py

diff --git a/test_unstructured/ingest/utils/test_compression.py b/test_unstructured/ingest/utils/test_compression.py
new file mode 100644
index 0000000000..7699a385ee
--- /dev/null
+++ b/test_unstructured/ingest/utils/test_compression.py
@@ -0,0 +1,15 @@
+import os
+import tarfile
+
+from unstructured.ingest.utils.compression import uncompress_tar_file
+
+
+def test_uncompress_tar_file(tmpdir):
+    """Extract a gzipped tar into tmpdir and verify the archived file is present."""
+    tar_filename = os.path.join(tmpdir, "test.tar.gz")
+    filename = "example-docs/fake-text.txt"
+    with tarfile.open(tar_filename, "w:gz") as tar:
+        tar.add(filename, arcname=os.path.basename(filename))
+
+    path = uncompress_tar_file(tar_filename, path=str(tmpdir))
+    assert os.path.isfile(os.path.join(path, os.path.basename(filename)))
diff --git a/unstructured/ingest/utils/compression.py b/unstructured/ingest/utils/compression.py
index a1d0bfada4..627f5d861d 100644
--- a/unstructured/ingest/utils/compression.py
+++ b/unstructured/ingest/utils/compression.py
@@ -1,5 +1,6 @@
 import copy
 import os
+import sys
 import tarfile
 import zipfile
 from dataclasses import dataclass
@@ -63,6 +64,11 @@ def uncompress_tar_file(tar_filename: str, path: Optional[str] = None) -> str:
     path = path if path else os.path.join(head, f"{tail}-tar-uncompressed")
     logger.info(f"extracting tar {tar_filename} -> {path}")
     with tarfile.open(tar_filename, "r:gz") as tfile:
+        # NOTE(robinson): Mitigate against malicious content being extracted from the tar file.
+        # This was added in Python 3.12
+        # Ref: https://docs.python.org/3/library/tarfile.html#extraction-filters
+        if sys.version_info >= (3, 12):
+            tfile.extraction_filter = tar_file.tar_filter
         tfile.extractall(path=path)
     return path
 

From 754eb36f6b0e89519e0d5b13a3a1eaee8af9ba83 Mon Sep 17 00:00:00 2001
From: Matt Robinson <mrobinson@unstructuredai.io>
Date: Thu, 30 May 2024 13:21:45 -0400
Subject: [PATCH 2/5] changelog and version

---
 CHANGELOG.md                | 4 +++-
 unstructured/__version__.py | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 73725edd1f..7a2f223fe3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,7 +1,9 @@
-## 0.14.4-dev2
+## 0.14.4-dev3
 
 ### Enhancements
 
+* **Filtering for tar extraction** Adds tar filtering to the compression module for connectors to avoid decompression malicious content in `.tar.gz` files. This was added to the Python standard library in Python 3.12. The change only applies when using Python 3.12.
+
 ### Features
 
 ### Fixes
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 469f3223a0..2846481ea6 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.14.4-dev2"  # pragma: no cover
+__version__ = "0.14.4-dev3"  # pragma: no cover

From 81d5c423e956ac31165bc231be860796948257b7 Mon Sep 17 00:00:00 2001
From: Matt Robinson <mrobinson@unstructuredai.io>
Date: Thu, 30 May 2024 13:24:55 -0400
Subject: [PATCH 3/5] fix typo

---
 unstructured/ingest/utils/compression.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unstructured/ingest/utils/compression.py b/unstructured/ingest/utils/compression.py
index 627f5d861d..68d53c4a51 100644
--- a/unstructured/ingest/utils/compression.py
+++ b/unstructured/ingest/utils/compression.py
@@ -68,7 +68,7 @@ def uncompress_tar_file(tar_filename: str, path: Optional[str] = None) -> str:
         # This was added in Python 3.12
         # Ref: https://docs.python.org/3/library/tarfile.html#extraction-filters
         if sys.version_info >= (3, 12):
-            tfile.extraction_filter = tar_file.tar_filter
+            tfile.extraction_filter = tarfile.tar_filter
         tfile.extractall(path=path)
     return path
 

From b5fed0682f69cef5d1a835a7a39a390a814957bc Mon Sep 17 00:00:00 2001
From: Matt Robinson <mrobinson@unstructuredai.io>
Date: Thu, 30 May 2024 13:25:17 -0400
Subject: [PATCH 4/5] changelog tweak

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7a2f223fe3..8028a67bd3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,7 +2,7 @@
 
 ### Enhancements
 
-* **Filtering for tar extraction** Adds tar filtering to the compression module for connectors to avoid decompression malicious content in `.tar.gz` files. This was added to the Python standard library in Python 3.12. The change only applies when using Python 3.12.
+* **Filtering for tar extraction** Adds tar filtering to the compression module for connectors to avoid decompressing malicious content in `.tar.gz` files. This was added to the Python `tarfile` lib in Python 3.12. The change only applies when using Python 3.12 and above.
 
 ### Features
 

From 3cce997274d319e89fa0fae881c6c9ef124bb176 Mon Sep 17 00:00:00 2001
From: Matt Robinson <mrobinson@unstructuredai.io>
Date: Thu, 30 May 2024 13:31:20 -0400
Subject: [PATCH 5/5] add warning for earlier python versions

---
 unstructured/ingest/utils/compression.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/unstructured/ingest/utils/compression.py b/unstructured/ingest/utils/compression.py
index 68d53c4a51..41f4b32406 100644
--- a/unstructured/ingest/utils/compression.py
+++ b/unstructured/ingest/utils/compression.py
@@ -69,6 +69,12 @@ def uncompress_tar_file(tar_filename: str, path: Optional[str] = None) -> str:
         # Ref: https://docs.python.org/3/library/tarfile.html#extraction-filters
         if sys.version_info >= (3, 12):
             tfile.extraction_filter = tarfile.tar_filter
+        else:
+            logger.warning(
+                "Extraction filtering for tar files is available for Python 3.12 and above. "
+                "Consider upgrading your Python version to improve security. "
+                "See https://docs.python.org/3/library/tarfile.html#extraction-filters"
+            )
         tfile.extractall(path=path)
     return path