From 1c8b2b23ebee03bbf2cb26dae0f0ae4a00e52c3b Mon Sep 17 00:00:00 2001 From: amadeusz-ds <165173689+amadeusz-ds@users.noreply.github.com> Date: Fri, 17 May 2024 21:16:10 +0200 Subject: [PATCH] feat: add GLOBAL_WORKING_DIR and GLOBAL_WORKING_PROCESS_DIR config parameters (#3014) This PR introduces GLOBAL_WORKING_DIR and GLOBAL_WORKING_PROCESS_DIR controlling where temporary files are stored during partition flow, via tempfile.tempdir. #### Edit: Renamed prefixes from STORAGE_ to UNSTRUCTURED_CACHE_ #### Edit 2: Renamed prefixes from UNSTRUCTURED_CACHE to GLOBAL_WORKING_DIR_ --- CHANGELOG.md | 3 +- .../pdf_image/test_pdf_image_utils.py | 11 ++++- .../partition/utils/test_config.py | 47 +++++++++++++++++++ unstructured/__init__.py | 4 ++ unstructured/__version__.py | 2 +- unstructured/metrics/evaluate.py | 1 - unstructured/partition/pdf.py | 9 ++++ .../partition/pdf_image/pdf_image_utils.py | 7 ++- unstructured/partition/utils/config.py | 45 ++++++++++++++++++ 9 files changed, 122 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 33b39cc44d..c82d914043 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.14.0-dev14 +## 0.14.0-dev15 ### BREAKING CHANGES @@ -9,6 +9,7 @@ * **Skip unnecessary element sorting in `partition_pdf()`**. Skip element sorting when determining whether embedded text can be extracted. * **Faster evaluation** Support for concurrent processing of documents during evaluation * **Add strategy parameter to `partition_docx()`.** Behavior of future enhancements may be sensitive the partitioning strategy. Add this parameter so `partition_docx()` is aware of the requested strategy. +* **Add GLOBAL_WORKING_DIR and GLOBAL_WORKING_PROCESS_DIR** configuration parameters to control temporary storage. ### Features * **Add form extraction basics (document elements and placeholder code in partition)**. This is to lay the ground work for the future. Form extraction models are not currently available in the library. 
An attempt to use this functionality will end in a `NotImplementedError`. diff --git a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py index 4496e74257..09038873ab 100644 --- a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py +++ b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py @@ -143,7 +143,9 @@ def test_save_elements( assert not el.metadata.image_mime_type -def test_save_elements_with_output_dir_path_none(): +@pytest.mark.parametrize("storage_enabled", [False, True]) +def test_save_elements_with_output_dir_path_none(monkeypatch, storage_enabled): + monkeypatch.setenv("GLOBAL_WORKING_DIR_ENABLED", storage_enabled) with ( patch("PIL.Image.open"), patch("unstructured.partition.pdf_image.pdf_image_utils.write_image"), @@ -161,7 +163,12 @@ def test_save_elements_with_output_dir_path_none(): ) # Verify that the images are saved in the expected directory - expected_output_dir = os.path.join(tmpdir, "figures") + if storage_enabled: + from unstructured.partition.utils.config import env_config + + expected_output_dir = os.path.join(env_config.GLOBAL_WORKING_PROCESS_DIR, "figures") + else: + expected_output_dir = os.path.join(tmpdir, "figures") assert os.path.exists(expected_output_dir) assert os.path.isdir(expected_output_dir) os.chdir(original_cwd) diff --git a/test_unstructured/partition/utils/test_config.py b/test_unstructured/partition/utils/test_config.py index 76962f634e..d9a4bf5f9d 100644 --- a/test_unstructured/partition/utils/test_config.py +++ b/test_unstructured/partition/utils/test_config.py @@ -1,3 +1,10 @@ +import shutil +import tempfile +from pathlib import Path + +import pytest + + def test_default_config(): from unstructured.partition.utils.config import env_config @@ -9,3 +16,43 @@ def test_env_override(monkeypatch): from unstructured.partition.utils.config import env_config assert env_config.IMAGE_CROP_PAD == 1 + + +@pytest.fixture() +def 
_setup_tmpdir(): + from unstructured.partition.utils.config import env_config + + _tmpdir = tempfile.tempdir + _storage_tmpdir = env_config.GLOBAL_WORKING_PROCESS_DIR + _storage_tmpdir_bak = f"{env_config.GLOBAL_WORKING_PROCESS_DIR}_bak" + if Path(_storage_tmpdir).is_dir(): + shutil.move(_storage_tmpdir, _storage_tmpdir_bak) + tempfile.tempdir = None + yield + if Path(_storage_tmpdir_bak).is_dir(): + if Path(_storage_tmpdir).is_dir(): + shutil.rmtree(_storage_tmpdir) + shutil.move(_storage_tmpdir_bak, _storage_tmpdir) + tempfile.tempdir = _tmpdir + + +@pytest.mark.usefixtures("_setup_tmpdir") +def test_env_storage_disabled(monkeypatch): + monkeypatch.setenv("GLOBAL_WORKING_DIR_ENABLED", "false") + from unstructured.partition.utils.config import env_config + + assert not env_config.GLOBAL_WORKING_DIR_ENABLED + assert str(Path.home() / ".cache/unstructured") == env_config.GLOBAL_WORKING_DIR + assert not Path(env_config.GLOBAL_WORKING_PROCESS_DIR).is_dir() + assert tempfile.gettempdir() != env_config.GLOBAL_WORKING_PROCESS_DIR + + +@pytest.mark.usefixtures("_setup_tmpdir") +def test_env_storage_enabled(monkeypatch): + monkeypatch.setenv("GLOBAL_WORKING_DIR_ENABLED", "true") + from unstructured.partition.utils.config import env_config + + assert env_config.GLOBAL_WORKING_DIR_ENABLED + assert str(Path.home() / ".cache/unstructured") == env_config.GLOBAL_WORKING_DIR + assert Path(env_config.GLOBAL_WORKING_PROCESS_DIR).is_dir() + assert tempfile.gettempdir() == env_config.GLOBAL_WORKING_PROCESS_DIR diff --git a/unstructured/__init__.py b/unstructured/__init__.py index e69de29bb2..b8f3f32f1d 100644 --- a/unstructured/__init__.py +++ b/unstructured/__init__.py @@ -0,0 +1,4 @@ +from .partition.utils.config import env_config + +# init env_config +env_config diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 2c8d1cb2fc..1a54bfd152 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.14.0-dev14" # 
pragma: no cover +__version__ = "0.14.0-dev15" # pragma: no cover diff --git a/unstructured/metrics/evaluate.py b/unstructured/metrics/evaluate.py index a6d97962cf..4ea44a237a 100755 --- a/unstructured/metrics/evaluate.py +++ b/unstructured/metrics/evaluate.py @@ -160,7 +160,6 @@ def _try_process_document(self, doc: Path) -> Optional[list]: @abstractmethod def _process_document(self, doc: Path) -> list: """Should return all metadata and metrics for a single document.""" - pass @dataclass diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 1463fe87aa..78001f9dd4 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -6,6 +6,7 @@ import os import re import warnings +from pathlib import Path from typing import IO, TYPE_CHECKING, Any, Iterator, Optional, cast import numpy as np @@ -438,6 +439,14 @@ def _partition_pdf_or_image_local( ) if analysis: + if not analyzed_image_output_dir_path: + if env_config.GLOBAL_WORKING_DIR_ENABLED: + analyzed_image_output_dir_path = str( + Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated" + ) + else: + analyzed_image_output_dir_path = str(Path.cwd() / "annotated") + os.makedirs(analyzed_image_output_dir_path, exist_ok=True) annotate_layout_elements( inferred_document_layout=inferred_document_layout, extracted_layout=extracted_layout, diff --git a/unstructured/partition/pdf_image/pdf_image_utils.py b/unstructured/partition/pdf_image/pdf_image_utils.py index 26432c3b87..e57a97e6ea 100644 --- a/unstructured/partition/pdf_image/pdf_image_utils.py +++ b/unstructured/partition/pdf_image/pdf_image_utils.py @@ -4,7 +4,7 @@ import tempfile from copy import deepcopy from io import BytesIO -from pathlib import PurePath +from pathlib import Path, PurePath from typing import TYPE_CHECKING, BinaryIO, List, Optional, Tuple, Union, cast import cv2 @@ -131,7 +131,10 @@ def save_elements( """ if not output_dir_path: - output_dir_path = os.path.join(os.getcwd(), "figures") + if 
env_config.GLOBAL_WORKING_DIR_ENABLED: + output_dir_path = str(Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "figures") + else: + output_dir_path = str(Path.cwd() / "figures") os.makedirs(output_dir_path, exist_ok=True) with tempfile.TemporaryDirectory() as temp_dir: diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py index 28eade91d9..f6952df406 100644 --- a/unstructured/partition/utils/config.py +++ b/unstructured/partition/utils/config.py @@ -7,15 +7,28 @@ """ import os +import tempfile from dataclasses import dataclass +from functools import lru_cache +from pathlib import Path from unstructured.partition.utils.constants import OCR_AGENT_TESSERACT +@lru_cache(maxsize=1) +def get_tempdir(dir: str) -> str: + tempdir = Path(dir) / f"tmp/{os.getpgid(0)}" + return str(tempdir) + + @dataclass class ENVConfig: """class for configuring enviorment parameters""" + def __post_init__(self): + if self.GLOBAL_WORKING_DIR_ENABLED: + self._setup_tmpdir(self.GLOBAL_WORKING_PROCESS_DIR) + def _get_string(self, var: str, default_value: str = "") -> str: """attempt to get the value of var from the os environment; if not present return the default_value""" @@ -31,6 +44,15 @@ def _get_float(self, var: str, default_value: float) -> float: return float(value) return default_value + def _get_bool(self, var: str, default_value: bool) -> bool: + if value := self._get_string(var): + return value.lower() in ("true", "1", "t") + return default_value + + def _setup_tmpdir(self, tmpdir: str) -> None: + Path(tmpdir).mkdir(parents=True, exist_ok=True) + tempfile.tempdir = tmpdir + @property def IMAGE_CROP_PAD(self) -> int: """extra image content to add around an identified element region; measured in pixels""" @@ -117,5 +139,28 @@ def PDF_ANNOTATION_THRESHOLD(self) -> float: return self._get_float("PDF_ANNOTATION_THRESHOLD", 0.9) + @property + def GLOBAL_WORKING_DIR_ENABLED(self) -> bool: + """Enable usage of GLOBAL_WORKING_DIR and 
GLOBAL_WORKING_PROCESS_DIR.""" + return self._get_bool("GLOBAL_WORKING_DIR_ENABLED", False) + + @property + def GLOBAL_WORKING_DIR(self) -> str: + """Path to Unstructured cache directory.""" + return self._get_string("GLOBAL_WORKING_DIR", str(Path.home() / ".cache/unstructured")) + + @property + def GLOBAL_WORKING_PROCESS_DIR(self) -> str: + """Path to Unstructured cache tempdir. Overrides TMPDIR, TEMP and TMP. + Defaults to '{GLOBAL_WORKING_DIR}/tmp/{os.getpgid(0)}'. + """ + default_tmpdir = get_tempdir(dir=self.GLOBAL_WORKING_DIR) + tmpdir = self._get_string("GLOBAL_WORKING_PROCESS_DIR", default_tmpdir) + if tmpdir == "": + tmpdir = default_tmpdir + if self.GLOBAL_WORKING_DIR_ENABLED: + self._setup_tmpdir(tmpdir) + return tmpdir + env_config = ENVConfig()