From 79da2ee14ab3511fd6850c186543c10e4ecaeee4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 11 Jun 2024 22:21:57 +0000 Subject: [PATCH] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/guardrails/pii_detection/README.md | 7 ++- comps/guardrails/pii_detection/config.py | 3 +- comps/guardrails/pii_detection/data_utils.py | 15 ++--- .../guardrails/pii_detection/pii/__init__.py | 5 +- .../pii_detection/pii/detect/__init__.py | 2 + .../pii/detect/emails_detection.py | 16 +++-- .../pii_detection/pii/detect/ip_detection.py | 29 +++++----- .../pii/detect/keys_detection.py | 20 +++---- .../pii/detect/name_password_detection.py | 8 ++- .../pii/detect/phones_detection.py | 8 ++- .../pii_detection/pii/detect/utils.py | 3 + .../guardrails/pii_detection/pii/pii_utils.py | 43 ++++++++------ .../guardrails/pii_detection/pii_detection.py | 39 +++++++++---- comps/guardrails/pii_detection/ray_utils.py | 22 ++++--- .../guardrails/pii_detection/requirements.txt | 6 +- comps/guardrails/pii_detection/test.py | 58 ++++++++++--------- comps/guardrails/pii_detection/utils.py | 15 ++++- 17 files changed, 183 insertions(+), 116 deletions(-) diff --git a/comps/guardrails/pii_detection/README.md b/comps/guardrails/pii_detection/README.md index db95577519..99eaec4226 100644 --- a/comps/guardrails/pii_detection/README.md +++ b/comps/guardrails/pii_detection/README.md @@ -40,7 +40,7 @@ TBD ## 2.1.2 use NER model (default mode) -``` bash +```bash mkdir -p pii/bigcode apt install git-lfs cd pii/bigcode; git clone https://{hf_username}:{hf_token}@huggingface.co/bigcode/starpii/; cd ../.. @@ -64,13 +64,14 @@ docker run -d --rm --runtime=runc --name="guardrails-pii-detection-endpoint" -p ``` > debug mode + ```bash docker run --rm --runtime=runc --name="guardrails-pii-detection-endpoint" -p 6357:6357 -v ./comps/guardrails/pii_detection/:/home/user/comps/guardrails/pii_detection/ --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/guardrails-pii-detection:latest ``` # 🚀3. Status Microservice -``` bash +```bash docker container logs -f guardrails-pii-detection-endpoint ``` @@ -78,7 +79,7 @@ docker container logs -f guardrails-pii-detection-endpoint Once microservice starts, user can use below script to invoke the microservice for pii detection. -``` python +```python import requests import json diff --git a/comps/guardrails/pii_detection/config.py b/comps/guardrails/pii_detection/config.py index 69a0836c5e..430532a150 100644 --- a/comps/guardrails/pii_detection/config.py +++ b/comps/guardrails/pii_detection/config.py @@ -39,8 +39,7 @@ def get_boolean_env_var(var_name, default_value=False): return default_value - LLM_URL = os.getenv("LLM_ENDPOINT_URL", None) current_file_path = pathlib.Path(__file__).parent.resolve() -comps_path = os.path.join(current_file_path, "../../../") \ No newline at end of file +comps_path = os.path.join(current_file_path, "../../../") diff --git a/comps/guardrails/pii_detection/data_utils.py b/comps/guardrails/pii_detection/data_utils.py index b6c09a2902..dfafbc6707 100644 --- a/comps/guardrails/pii_detection/data_utils.py +++ b/comps/guardrails/pii_detection/data_utils.py @@ -1,7 +1,14 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +import io +import json +import multiprocessing import os +import re +import unicodedata +from urllib.parse import urlparse, urlunparse + import easyocr import fitz import numpy as np @@ -9,13 +16,6 @@ import requests import yaml from bs4 import BeautifulSoup - -import io -import json -import multiprocessing -import re -import unicodedata -from urllib.parse import urlparse, urlunparse from docx import Document as DDocument from langchain_community.document_loaders import ( UnstructuredImageLoader, @@ -26,6 +26,7 @@ from PIL import Image from utils import timeout + def load_pdf(pdf_path): """Load the pdf file.""" doc = fitz.open(pdf_path) diff --git a/comps/guardrails/pii_detection/pii/__init__.py b/comps/guardrails/pii_detection/pii/__init__.py index df198ca44e..4505e776ec 100644 --- a/comps/guardrails/pii_detection/pii/__init__.py +++ b/comps/guardrails/pii_detection/pii/__init__.py @@ -1 +1,4 @@ -__all__ = ['pii_detection', 'pii_redaction'] +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +__all__ = ["pii_detection", "pii_redaction"] diff --git a/comps/guardrails/pii_detection/pii/detect/__init__.py b/comps/guardrails/pii_detection/pii/detect/__init__.py index e69de29bb2..916f3a44b2 100644 --- a/comps/guardrails/pii_detection/pii/detect/__init__.py +++ b/comps/guardrails/pii_detection/pii/detect/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/guardrails/pii_detection/pii/detect/emails_detection.py b/comps/guardrails/pii_detection/pii/detect/emails_detection.py index 02c40ea3e0..4c5704fbd4 100644 --- a/comps/guardrails/pii_detection/pii/detect/emails_detection.py +++ b/comps/guardrails/pii_detection/pii/detect/emails_detection.py @@ -1,4 +1,6 @@ -""" This code is adapted from BigScience PII detection +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +""" This code is adapted from BigScience PII detection https://github.com/bigscience-workshop/data-preparation/blob/main/preprocessing/training/02_pii/bigscience_pii_detect_redact.py MST BigScience PII Code @@ -18,11 +20,13 @@ """ import sys + import regex # Note: to reduce false positives, a number of technically-valid-but-rarely-used # email address patterns (e.g. with parenthesis or slashes) will not match -email_pattern = regex.compile(r''' +email_pattern = regex.compile( + r""" (?<= ^ | [[({<\b\s@,?!;'"\p{Han}¿¡:.] | \\['"] ) # left delimiter ( (?: # local part @@ -44,7 +48,9 @@ (?: [\p{L}\p{M}]{2,63} | xn-- \w+ ) # TLD, including IDN ) (?= $ | [])}>\b\s@,?!;'"\p{Han}] | \\['"] | : (?! \d) | \. (?! \S)) # right delim -''', flags=regex.MULTILINE | regex.VERBOSE) +""", + flags=regex.MULTILINE | regex.VERBOSE, +) def detect_email(content): @@ -63,9 +69,7 @@ def detect_email(content): for match in matches_tmp: if match.groups(): if len(match.groups()) > 1 and match.groups()[1]: - sys.stderr.write( - "Warning: Found substring matches in the main match." - ) + sys.stderr.write("Warning: Found substring matches in the main match.") # setup outputs value = match.group(1) start, end = match.span(1) diff --git a/comps/guardrails/pii_detection/pii/detect/ip_detection.py b/comps/guardrails/pii_detection/pii/detect/ip_detection.py index 24bf33b423..3616f1903f 100644 --- a/comps/guardrails/pii_detection/pii/detect/ip_detection.py +++ b/comps/guardrails/pii_detection/pii/detect/ip_detection.py @@ -1,4 +1,6 @@ -""" This code is adapted from BigScience PII detection +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +""" This code is adapted from BigScience PII detection https://github.com/bigscience-workshop/data-preparation/blob/main/preprocessing/training/02_pii/bigscience_pii_detect_redact.py MST BigScience PII Code @@ -17,8 +19,9 @@ limitations under the License. """ -import sys import ipaddress +import sys + import regex year_patterns = [ @@ -43,10 +46,8 @@ ipv4_pattern = r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?:\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}" ipv6_pattern = r"(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])" ip_pattern = regex.compile( - r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])(" - + r"|".join([ipv4_pattern, ipv6_pattern]) - + ")(?:$|[\s@,?!;:'\"(.\p{Han}])", - flags=regex.MULTILINE + r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])(" + r"|".join([ipv4_pattern, ipv6_pattern]) + ")(?:$|[\s@,?!;:'\"(.\p{Han}])", + flags=regex.MULTILINE, ) @@ -66,10 +67,10 @@ def ip_has_digit(matched_str): def filter_versions(matched_str, context): """Filter addresses in this format x.x.x.x and the words dns/server - don't appear in the neighboring context, usually they are just versions""" - # count occurrence of dots - dot_count = matched_str.count('.') - exclude = (dot_count == 3 and len(matched_str) == 7) + don't appear in the neighboring context, usually they are just versions.""" + # count occurrence of dots + dot_count = matched_str.count(".") + exclude = dot_count == 3 and len(matched_str) == 7 if exclude: if "dns" in context.lower() or "server" in context.lower(): return False @@ -77,7 +78,7 @@ def filter_versions(matched_str, context): def not_ip_address(matched_str): - """ make sure the string has a valid IP address format + """make sure the string has a valid IP address format e.g: 33.01.33.33 is not a valid IP address because of the 0 in front of 1 TODO: fix this directly in the regex""" try: @@ -103,9 +104,7 @@ def detect_ip(content): for match in matches_tmp: if match.groups(): if len(match.groups()) > 1 and match.groups()[1]: - sys.stderr.write( - "Warning: Found substring matches in the main match." - ) + sys.stderr.write("Warning: Found substring matches in the main match.") # setup outputs value = match.group(1) start, end = match.span(1) @@ -115,7 +114,7 @@ def detect_ip(content): continue if matches_date_pattern(value): continue - if filter_versions(value, content[start - 100:end + 100]) or not_ip_address(value): + if filter_versions(value, content[start - 100 : end + 100]) or not_ip_address(value): continue # combine if conditions in one diff --git a/comps/guardrails/pii_detection/pii/detect/keys_detection.py b/comps/guardrails/pii_detection/pii/detect/keys_detection.py index ada10f71d2..97fe1e8a2a 100755 --- a/comps/guardrails/pii_detection/pii/detect/keys_detection.py +++ b/comps/guardrails/pii_detection/pii/detect/keys_detection.py @@ -1,9 +1,10 @@ -""" This code is adapted from BigCode PII +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +""" This code is adapted from BigCode PII https://github.com/bigcode-project/bigcode-dataset/blob/main/pii/utils/keys_detection.py """ import os - # Secrets detection with detect-secrets tool @@ -44,7 +45,7 @@ def get_detector_model(): def is_gibberish(matched_str): - """Checks to make sure the PII span is gibberish and not word like""" + """Checks to make sure the PII span is gibberish and not word like.""" # pip install gibberish-detector # download the training corpora from https://raw.githubusercontent.com/domanchi/gibberish-detector/master/examples/big.txt # run gibberish-detector train big.txt > big.model to generate the model (it takes 3 seconds) @@ -62,7 +63,7 @@ def is_hash(content, value): # TODO: fix this issue happened one for JS in the stack-smol, file did contain value print("Value not found in content, why this happened?") return False - lines = content[:content.index(value)].splitlines() + lines = content[: content.index(value)].splitlines() target_line = lines[-1] if len(value) in [32, 40, 64]: # if "sha" or "md5" are in content: @@ -73,7 +74,7 @@ def is_hash(content, value): def file_has_hashes(content, coeff=0.02): - """Checks if the file contains literals 'hash' or 'sha' for more than 2% nb_of_lines""" + """Checks if the file contains literals 'hash' or 'sha' for more than 2% nb_of_lines.""" lines = content.splitlines() count_sha = 0 count_hash = 0 @@ -108,13 +109,12 @@ def scan_secrets(line: str): lines = line.splitlines(keepends=True) for secret in _process_line_based_plugins( - lines=list(enumerate(lines, start=1)), - filename="Adhoc String", + lines=list(enumerate(lines, start=1)), + filename="Adhoc String", ): yield secret - def detect_keys(content): """Detect secret keys in content using detect-secrets tool Args: @@ -135,9 +135,7 @@ def detect_keys(content): from detect_secrets.settings import transient_settings - with transient_settings( - {"plugins_used": plugins, "filters_used": filters} - ) as settings: + with transient_settings({"plugins_used": plugins, "filters_used": filters}) as settings: matches = [] for secret in scan_secrets(content): if is_hash(content, secret.secret_value) or file_has_hashes(content): diff --git a/comps/guardrails/pii_detection/pii/detect/name_password_detection.py b/comps/guardrails/pii_detection/pii/detect/name_password_detection.py index 8a42ec054c..c02c6c80b9 100644 --- a/comps/guardrails/pii_detection/pii/detect/name_password_detection.py +++ b/comps/guardrails/pii_detection/pii/detect/name_password_detection.py @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + from .utils import PIIEntityType @@ -17,8 +20,9 @@ def detect_name_password(content, pipeline, entity_types=None): try: for entity in pipeline(content): entity_group = entity["entity_group"] - if ("NAME" == entity_group and PIIEntityType.NAME in entity_types) or \ - ("PASSWORD" == entity_group and PIIEntityType.PASSWORD in entity_types): + if ("NAME" == entity_group and PIIEntityType.NAME in entity_types) or ( + "PASSWORD" == entity_group and PIIEntityType.PASSWORD in entity_types + ): matches.append( { "tag": entity_group, diff --git a/comps/guardrails/pii_detection/pii/detect/phones_detection.py b/comps/guardrails/pii_detection/pii/detect/phones_detection.py index cf130be36c..0f6f770d3c 100644 --- a/comps/guardrails/pii_detection/pii/detect/phones_detection.py +++ b/comps/guardrails/pii_detection/pii/detect/phones_detection.py @@ -1,13 +1,17 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import os + def detect_phones(text): - """Detects phone in a string using phonenumbers libray only detection the international phone number""" + """Detects phone in a string using phonenumbers library only detection the international phone number.""" try: import phonenumbers except ImportError: os.system("pip install phonenumbers") import phonenumbers - + matches = [] for match in phonenumbers.PhoneNumberMatcher(text, "IN"): diff --git a/comps/guardrails/pii_detection/pii/detect/utils.py b/comps/guardrails/pii_detection/pii/detect/utils.py index da5d6d65a5..60b8414fe0 100644 --- a/comps/guardrails/pii_detection/pii/detect/utils.py +++ b/comps/guardrails/pii_detection/pii/detect/utils.py @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + from enum import Enum, auto diff --git a/comps/guardrails/pii_detection/pii/pii_utils.py b/comps/guardrails/pii_detection/pii/pii_utils.py index c92fc06a06..6dae7d002b 100644 --- a/comps/guardrails/pii_detection/pii/pii_utils.py +++ b/comps/guardrails/pii_detection/pii/pii_utils.py @@ -1,39 +1,47 @@ -from .detect.ip_detection import detect_ip +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +from typing import List + from .detect.emails_detection import detect_email -from .detect.phones_detection import detect_phones -from .detect.name_password_detection import detect_name_password +from .detect.ip_detection import detect_ip from .detect.keys_detection import detect_keys +from .detect.name_password_detection import detect_name_password +from .detect.phones_detection import detect_phones from .detect.utils import PIIEntityType -from typing import List -import os class PIIDetector: def __init__(strategy: str): pass - + def detect_pii(self, data): import random + return random.choice([True, False]) - + + class PIIDetectorWithLLM(PIIDetector): def __init__(self): super().__init__() - + def detect_pii(self, text): return True - + + class PIIDetectorWithNER(PIIDetector): def __init__(self, model_path=None): super().__init__() - from transformers import pipeline - from transformers import AutoTokenizer + from transformers import AutoTokenizer, pipeline _model_key = "bigcode/starpii" _model_key = _model_key if model_path is None else os.path.join(model_path, _model_key) tokenizer = AutoTokenizer.from_pretrained(_model_key, model_max_length=512) - self.pipeline = pipeline(model=_model_key, task='token-classification', tokenizer=tokenizer, grouped_entities=True) - + self.pipeline = pipeline( + model=_model_key, task="token-classification", tokenizer=tokenizer, grouped_entities=True + ) + def detect_pii(self, text): result = [] # use a regex to detect ip addresses @@ -53,14 +61,15 @@ def detect_pii(self, text): if PIIEntityType.NAME in entity_types or PIIEntityType.PASSWORD in entity_types: result = result + detect_name_password(text, self.pipeline, entity_types) - + print(result) - + return True if len(result) > 0 else False # Dummy function, replace with actual logic - + + class PIIDetectorWithML(PIIDetector): def __init__(self): super().__init__() - + def detect_pii(self, text): return True diff --git a/comps/guardrails/pii_detection/pii_detection.py b/comps/guardrails/pii_detection/pii_detection.py index 499ff834ac..8f73a4e4ec 100644 --- a/comps/guardrails/pii_detection/pii_detection.py +++ b/comps/guardrails/pii_detection/pii_detection.py @@ -14,14 +14,27 @@ comps_path = os.path.join(cur_path, "../../../") sys.path.append(comps_path) -from tqdm import tqdm from typing import List +from tqdm import tqdm + from comps import DocPath, opea_microservices, register_microservice -from comps.guardrails.pii_detection.utils import Timer, generate_log_name, prepare_env, save_file_to_local_disk, get_max_cpus from comps.guardrails.pii_detection.data_utils import document_loader, parse_html -from comps.guardrails.pii_detection.ray_utils import rayds_initialization, ray_runner_initialization, ray_execute -from comps.guardrails.pii_detection.pii.pii_utils import PIIDetector, PIIDetectorWithLLM, PIIDetectorWithNER, PIIDetectorWithML +from comps.guardrails.pii_detection.pii.pii_utils import ( + PIIDetector, + PIIDetectorWithLLM, + PIIDetectorWithML, + PIIDetectorWithNER, +) +from comps.guardrails.pii_detection.ray_utils import ray_execute, ray_runner_initialization, rayds_initialization +from comps.guardrails.pii_detection.utils import ( + Timer, + generate_log_name, + get_max_cpus, + prepare_env, + save_file_to_local_disk, +) + def get_pii_detection_inst(strategy="dummy", settings=None): if strategy == "ner": @@ -34,6 +47,7 @@ def get_pii_detection_inst(strategy="dummy", settings=None): # Default strategy - dummy return PIIDetector() + def file_based_pii_detect(file_list: List[DocPath], strategy, enable_ray=False, debug=False): """Ingest document to Redis.""" file_list = [f.path for f in file_list] @@ -57,6 +71,7 @@ def file_based_pii_detect(file_list: List[DocPath], strategy, enable_ray=False, ret.append(pii_detector.detect_pii(data)) return ret + def link_based_pii_detect(link_list: List[str], strategy, enable_ray=False, debug=False): link_list = [str(f) for f in link_list] pii_detector = get_pii_detection_inst(strategy=strategy) @@ -64,11 +79,11 @@ def link_based_pii_detect(link_list: List[str], strategy, enable_ray=False, debu def _parse_html(link): data = parse_html([link]) return data[0][0] - + if enable_ray: num_cpus = get_max_cpus(len(link_list)) print(f"per task num_cpus: {num_cpus}") - + log_name = generate_log_name(link_list) ds = rayds_initialization(link_list, _parse_html, lazy_mode=True, num_cpus=num_cpus) ds = ds.map(ray_runner_initialization(pii_detector.detect_pii, debug=debug), num_cpus=num_cpus) @@ -84,6 +99,7 @@ def _parse_html(link): ret.append(pii_detector.detect_pii(data)) return ret + def text_based_pii_detect(text_list: List[str], strategy, enable_ray=False, debug=False): text_list = [str(f) for f in text_list] pii_detector = get_pii_detection_inst(strategy=strategy) @@ -105,11 +121,14 @@ def text_based_pii_detect(text_list: List[str], strategy, enable_ray=False, debu ret.append(pii_detector.detect_pii(data)) return ret -@register_microservice(name="opea_service@guardrails-pii-detection", endpoint="/v1/piidetect", host="0.0.0.0", port=6357) + +@register_microservice( + name="opea_service@guardrails-pii-detection", endpoint="/v1/piidetect", host="0.0.0.0", port=6357 +) async def pii_detection(files: List[UploadFile] = File(None), link_list: str = Form(None), text_list: str = Form(None)): if not files and not link_list and not text_list: raise HTTPException(status_code=400, detail="Either files, link_list, or text_list must be provided.") - + strategy = "ner" # Default strategy pip_requirement = ["detect-secrets", "phonenumbers", "gibberish-detector"] @@ -135,7 +154,7 @@ async def pii_detection(files: List[UploadFile] = File(None), link_list: str = F return {"status": 200, "message": json.dumps(ret)} except Exception as e: raise HTTPException(status_code=400, detail=f"An error occurred: {e}") - + if text_list: try: text_list = json.loads(text_list) # Parse JSON string to list @@ -168,4 +187,4 @@ async def pii_detection(files: List[UploadFile] = File(None), link_list: str = F if __name__ == "__main__": - opea_microservices["opea_service@guardrails-pii-detection"].start() \ No newline at end of file + opea_microservices["opea_service@guardrails-pii-detection"].start() diff --git a/comps/guardrails/pii_detection/ray_utils.py b/comps/guardrails/pii_detection/ray_utils.py index 103bbc5268..c4fca4c932 100644 --- a/comps/guardrails/pii_detection/ray_utils.py +++ b/comps/guardrails/pii_detection/ray_utils.py @@ -1,15 +1,15 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +from typing import TYPE_CHECKING, Any, Callable, Dict, Iterator, List, Optional, Union + import pyarrow import ray from ray.data.block import Block from ray.data.datasource import FileBasedDatasource from tqdm import tqdm -from utils import Timer, get_failable_with_time, timeout, save_logs +from utils import Timer, get_failable_with_time, save_logs, timeout -from typing import TYPE_CHECKING, Any, Dict, Iterator -from typing import Callable, List, Optional, Union class RayDataLoader(FileBasedDatasource): def __init__( @@ -32,12 +32,15 @@ def _read_stream(self, f: "pyarrow.NativeFile", path: str) -> Iterator[Block]: item = {"data": data, "filename": path, "error": error, "read_time": f"{read_time} secs"} builder.add(item) yield builder.build() - + + def rayds_initialization(file_paths, dataloader_callable, lazy_mode=True, num_cpus=20): if dataloader_callable is None: - text_list = [{"data": data, "filename": data[:50], "error": None, "read_time": f"0 secs"} for data in file_paths] + text_list = [ + {"data": data, "filename": data[:50], "error": None, "read_time": "0 secs"} for data in file_paths + ] return ray.data.from_items(text_list) - + decorated_dataloader_callable = get_failable_with_time(dataloader_callable) if lazy_mode: if num_cpus is None: @@ -53,7 +56,8 @@ def rayds_initialization(file_paths, dataloader_callable, lazy_mode=True, num_cp item = {"data": content, "filename": file, "error": error, "read_time": f"{elapse_time} secs"} data.append(item) return ray.data.from_items(data) - + + def ray_runner_initialization(func, debug=False): @timeout(600) def ray_runner(data): @@ -83,11 +87,13 @@ def ray_runner(data): "read_time": data["read_time"], "elaspe_time": f"{elapse_time} secs", } + return ray_runner + def ray_execute(ds, log_name): with Timer(f"execute with Ray, status log: {log_name}"): ret_with_status = ds.take_all() df = save_logs(log_name, ret_with_status) ret = df["ret"].to_list() - return ret \ No newline at end of file + return ret diff --git a/comps/guardrails/pii_detection/requirements.txt b/comps/guardrails/pii_detection/requirements.txt index ef2d97c9a6..d942bf347a 100644 --- a/comps/guardrails/pii_detection/requirements.txt +++ b/comps/guardrails/pii_detection/requirements.txt @@ -1,7 +1,9 @@ beautifulsoup4 +detect_secrets docarray[full] easyocr fastapi +gibberish-detector huggingface_hub langchain langchain-community @@ -11,6 +13,7 @@ opentelemetry-api opentelemetry-exporter-otlp opentelemetry-sdk pandas +phonenumbers Pillow pyarrow pymupdf @@ -20,6 +23,3 @@ redis sentence_transformers shortuuid virtualenv -phonenumbers -detect_secrets -gibberish-detector diff --git a/comps/guardrails/pii_detection/test.py b/comps/guardrails/pii_detection/test.py index db40b7d1f3..acfc40843f 100644 --- a/comps/guardrails/pii_detection/test.py +++ b/comps/guardrails/pii_detection/test.py @@ -1,69 +1,75 @@ -import requests +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import json import os import timeit + import pandas as pd -import json +import requests from utils import Timer -import argparse + def test_html(ip_addr="localhost", batch_size=20): - proxies = {'http':""} - url = f'http://{ip_addr}:6357/v1/piidetect' - urls = pd.read_csv("data/ai_rss.csv")['Permalink'] + proxies = {"http": ""} + url = f"http://{ip_addr}:6357/v1/piidetect" + urls = pd.read_csv("data/ai_rss.csv")["Permalink"] urls = urls[:batch_size].to_list() payload = {"link_list": json.dumps(urls)} with Timer(f"send {len(urls)} link to pii detection endpoint"): try: - resp = requests.post(url=url, data=payload, proxies=proxies) + resp = requests.post(url=url, data=payload, proxies=proxies) print(resp.text) resp.raise_for_status() # Raise an exception for unsuccessful HTTP status codes print("Request successful!") except requests.exceptions.RequestException as e: print("An error occurred:", e) - + def test_text(ip_addr="localhost", batch_size=20): - proxies = {'http':""} - url = f'http://{ip_addr}:6357/v1/piidetect' - content = pd.read_csv("data/ai_rss.csv")['Description'] + proxies = {"http": ""} + url = f"http://{ip_addr}:6357/v1/piidetect" + content = pd.read_csv("data/ai_rss.csv")["Description"] content = content[:batch_size].to_list() payload = {"text_list": json.dumps(content)} with Timer(f"send {len(content)} text to pii detection endpoint"): try: - resp = requests.post(url=url, data=payload, proxies=proxies) + resp = requests.post(url=url, data=payload, proxies=proxies) print(resp.text) resp.raise_for_status() # Raise an exception for unsuccessful HTTP status codes print("Request successful!") except requests.exceptions.RequestException as e: - print("An error occurred:", e) + print("An error occurred:", e) + - def test_pdf(ip_addr="localhost", batch_size=20): - proxies = {'http':""} - url = f'http://{ip_addr}:6357/v1/piidetect' + proxies = {"http": ""} + url = f"http://{ip_addr}:6357/v1/piidetect" dir_path = "data/pdf" file_list = os.listdir(dir_path) file_list = file_list[:batch_size] - files = [('files', (f, open(os.path.join(dir_path, f), 'rb'), 'application/pdf')) for f in file_list] + files = [("files", (f, open(os.path.join(dir_path, f), "rb"), "application/pdf")) for f in file_list] with Timer(f"send {len(files)} documents to pii detection endpoint"): try: - resp = requests.request('POST', url=url, headers={}, files=files, proxies=proxies) + resp = requests.request("POST", url=url, headers={}, files=files, proxies=proxies) print(resp.text) resp.raise_for_status() # Raise an exception for unsuccessful HTTP status codes print("Request successful!") except requests.exceptions.RequestException as e: print("An error occurred:", e) - + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--test_html', action='store_true', help='Test HTML pii detection') - parser.add_argument('--test_pdf', action='store_true', help='Test PDF pii detection') - parser.add_argument('--test_text', action='store_true', help='Test Text pii detection') - parser.add_argument('--batch_size', type=int, default=20, help='Batch size for testing') - parser.add_argument('--ip_addr', type=str, default="localhost", help='IP address of the server') - + parser.add_argument("--test_html", action="store_true", help="Test HTML pii detection") + parser.add_argument("--test_pdf", action="store_true", help="Test PDF pii detection") + parser.add_argument("--test_text", action="store_true", help="Test Text pii detection") + parser.add_argument("--batch_size", type=int, default=20, help="Batch size for testing") + parser.add_argument("--ip_addr", type=str, default="localhost", help="IP address of the server") + args = parser.parse_args() args.ip_addr = "100.83.111.250" if args.test_html: @@ -73,4 +79,4 @@ def test_pdf(ip_addr="localhost", batch_size=20): elif args.test_text: test_text(ip_addr=args.ip_addr, batch_size=args.batch_size) else: - print("Please specify the test type") \ No newline at end of file + print("Please specify the test type") diff --git a/comps/guardrails/pii_detection/utils.py b/comps/guardrails/pii_detection/utils.py index 1e557f51ea..c231455534 100644 --- a/comps/guardrails/pii_detection/utils.py +++ b/comps/guardrails/pii_detection/utils.py @@ -3,14 +3,16 @@ import errno import functools +import hashlib import os import signal import timeit -import hashlib from pathlib import Path + import pandas as pd from fastapi import HTTPException + class Timer: level = 0 viewer = None @@ -33,9 +35,11 @@ def __exit__(self, *a, **kw): else: print(f'{" " * Timer.level}{self.name} took {timeit.default_timer() - self.start} sec') + class TimeoutError(Exception): pass + def save_logs(log_name, data): df = pd.DataFrame.from_records(data) try: @@ -47,6 +51,7 @@ def save_logs(log_name, data): pass return df + def timeout(seconds=10, error_message=os.strerror(errno.ETIME)): def decorator(func): def _handle_timeout(signum, frame): @@ -63,14 +68,17 @@ def wrapper(*args, **kwargs): return result return wrapper + return decorator + def generate_log_name(file_list): file_set = f"{sorted(file_list)}" # print(f"file_set: {file_set}") md5_str = hashlib.md5(file_set.encode()).hexdigest() return f"status/status_{md5_str}.log" + def get_failable_with_time(callable): def failable_callable(*args, **kwargs): start_time = timeit.default_timer() @@ -85,6 +93,7 @@ def failable_callable(*args, **kwargs): return failable_callable + def prepare_env(enable_ray=False, pip_requirements=None, comps_path=None): if enable_ray: import ray @@ -95,8 +104,8 @@ def prepare_env(enable_ray=False, pip_requirements=None, comps_path=None): ray.init(runtime_env={"pip": pip_requirements, "env_vars": {"PYTHONPATH": comps_path}}) else: ray.init(runtime_env={"env_vars": {"PYTHONPATH": comps_path}}) - - + + def get_max_cpus(total_num_tasks): num_cpus_available = os.cpu_count() num_cpus_per_task = num_cpus_available // total_num_tasks