From 1b43102762d17924cdb01f14c808b122e98c5aba Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Fri, 31 May 2024 18:07:38 -0400 Subject: [PATCH] fix: remove root handlers when they exist (#3128) ### Summary In some environments, such as Google Colab, loggers have a root handler that did not mask sensitive values. As a result, secrets such as API keys appeared in the logs. This PR removes root handlers when they exist to ensure sensitive values are handled properly. ### Testing Run the following in a Colab notebook. You should see two log outputs, one with the API key masked and one with it exposed. ``` !pip install unstructured ``` ```python import logging import json from unstructured.ingest.interfaces import ( ChunkingConfig, EmbeddingConfig, PartitionConfig, ProcessorConfig, ReadConfig, ) partition_config = PartitionConfig( partition_by_api=True, api_key="super secret", ) from unstructured.ingest.logger import ingest_log_streaming_init ingest_log_streaming_init(logging.INFO) logger = logging.getLogger("unstructured.ingest") logger.setLevel(logging.INFO) logger.info( f"Running partition node to extract content from json files. " f"Config: {partition_config.to_json()}, " ) ``` Now replace the first cell with the following and rerun the Python code. Only the masked logging output should remain. ``` !git clone https://github.com/Unstructured-IO/unstructured.git && cd unstructured && git checkout fix/rm-log-dupes && pip install -e . ``` --- CHANGELOG.md | 3 ++- unstructured/__version__.py | 2 +- unstructured/ingest/logger.py | 11 +++++++++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7982a7165f..51f9cbf5b9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.14.4-dev5 +## 0.14.4-dev6 ### Enhancements @@ -12,6 +12,7 @@ ### Fixes +* **Remove root handlers in ingest logger**. Removes root handlers in ingest loggers to ensure secrets aren't accidentally exposed in Colab notebooks. 
* **Fix V2 S3 Destination Connector authentication** Fixes bugs with S3 Destination Connector where the connection config was neither registered nor properly deserialized. * **Clarified dependence on particular version of `python-docx`** Pinned `python-docx` version to ensure a particular method `unstructured` uses is included. * **Ingest preserves original file extension** Ingest V2 introduced a change that dropped the original extension for upgraded connectors. This reverts that change. diff --git a/unstructured/__version__.py b/unstructured/__version__.py index ff19fe3450..c14a58bf38 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.14.4-dev5" # pragma: no cover +__version__ = "0.14.4-dev6" # pragma: no cover diff --git a/unstructured/ingest/logger.py b/unstructured/ingest/logger.py index 6970c0ef03..ed4e7180e5 100644 --- a/unstructured/ingest/logger.py +++ b/unstructured/ingest/logger.py @@ -94,6 +94,15 @@ def format(self, record): return redact_jsons(s) +def remove_root_handlers(logger: logging.Logger) -> None: + # NOTE(robinson) - in some environments such as Google Colab, there is a root handler + # that does not mask secrets, meaning sensitive info such as api keys appear in logs. + # Removing them prevents this; iterate over a copy since removeHandler mutates the list. + if logger.root.hasHandlers(): + for handler in list(logger.root.handlers): + logger.root.removeHandler(handler) + + def ingest_log_streaming_init(level: int) -> None: handler = logging.StreamHandler() handler.name = "ingest_log_handler" @@ -104,6 +113,7 @@ def ingest_log_streaming_init(level: int) -> None: if "ingest_log_handler" not in [h.name for h in logger.handlers]: logger.addHandler(handler) + remove_root_handlers(logger) logger.setLevel(level) @@ -116,4 +126,5 @@ def make_default_logger(level: int) -> logging.Logger: handler.setFormatter(formatter) logger.addHandler(handler) logger.setLevel(level) + remove_root_handlers(logger) return logger