From a6b2efe8706bc6cb6cf7be9f91c8187b6146527a Mon Sep 17 00:00:00 2001 From: Chendi Xue Date: Tue, 11 Jun 2024 19:54:37 +0000 Subject: [PATCH 01/18] add initial framework for pii detection Signed-off-by: Chendi Xue --- comps/guardrails/pii_detection/.gitignore | 5 + comps/guardrails/pii_detection/README.md | 99 +++++ comps/guardrails/pii_detection/__init__.py | 2 + comps/guardrails/pii_detection/config.py | 46 ++ comps/guardrails/pii_detection/data_utils.py | 392 ++++++++++++++++++ .../pii_detection/docker/Dockerfile | 38 ++ .../guardrails/pii_detection/pii/__init__.py | 1 + .../pii_detection/pii/detect/__init__.py | 0 .../pii/detect/emails_detection.py | 83 ++++ .../pii_detection/pii/detect/ip_detection.py | 132 ++++++ .../pii/detect/keys_detection.py | 155 +++++++ .../pii/detect/name_password_detection.py | 33 ++ .../pii/detect/phones_detection.py | 22 + .../pii_detection/pii/detect/utils.py | 31 ++ .../guardrails/pii_detection/pii/pii_utils.py | 66 +++ .../guardrails/pii_detection/pii_detection.py | 171 ++++++++ comps/guardrails/pii_detection/ray_utils.py | 93 +++++ .../guardrails/pii_detection/requirements.txt | 25 ++ comps/guardrails/pii_detection/schema.yml | 14 + comps/guardrails/pii_detection/test.py | 76 ++++ comps/guardrails/pii_detection/utils.py | 116 ++++++ 21 files changed, 1600 insertions(+) create mode 100644 comps/guardrails/pii_detection/.gitignore create mode 100644 comps/guardrails/pii_detection/README.md create mode 100644 comps/guardrails/pii_detection/__init__.py create mode 100644 comps/guardrails/pii_detection/config.py create mode 100644 comps/guardrails/pii_detection/data_utils.py create mode 100644 comps/guardrails/pii_detection/docker/Dockerfile create mode 100644 comps/guardrails/pii_detection/pii/__init__.py create mode 100644 comps/guardrails/pii_detection/pii/detect/__init__.py create mode 100644 comps/guardrails/pii_detection/pii/detect/emails_detection.py create mode 100644 comps/guardrails/pii_detection/pii/detect/ip_detection.py 
create mode 100755 comps/guardrails/pii_detection/pii/detect/keys_detection.py create mode 100644 comps/guardrails/pii_detection/pii/detect/name_password_detection.py create mode 100644 comps/guardrails/pii_detection/pii/detect/phones_detection.py create mode 100644 comps/guardrails/pii_detection/pii/detect/utils.py create mode 100644 comps/guardrails/pii_detection/pii/pii_utils.py create mode 100644 comps/guardrails/pii_detection/pii_detection.py create mode 100644 comps/guardrails/pii_detection/ray_utils.py create mode 100644 comps/guardrails/pii_detection/requirements.txt create mode 100644 comps/guardrails/pii_detection/schema.yml create mode 100644 comps/guardrails/pii_detection/test.py create mode 100644 comps/guardrails/pii_detection/utils.py diff --git a/comps/guardrails/pii_detection/.gitignore b/comps/guardrails/pii_detection/.gitignore new file mode 100644 index 000000000..0407124a4 --- /dev/null +++ b/comps/guardrails/pii_detection/.gitignore @@ -0,0 +1,5 @@ +**/*pdf +**/*csv +**/*log +**/*pyc +**/*model diff --git a/comps/guardrails/pii_detection/README.md b/comps/guardrails/pii_detection/README.md new file mode 100644 index 000000000..db9557751 --- /dev/null +++ b/comps/guardrails/pii_detection/README.md @@ -0,0 +1,99 @@ +# PII Detection Microservice + +# 🚀1. Start Microservice with Python(Option 1) + +## 1.1 Install Requirements + +```bash +pip install -r requirements.txt +``` + +## 1.2 Start LLM endpoint + +TBD: Please refer to this [readme](../../../vectorstores/langchain/redis/README.md). + +## 1.3 Setup Environment Variables + + + +## 1.4 Start PII Detection Microservice with Python Script + +Start pii detection microservice with below command. + +```bash +python pii_detection.py +``` + +# 🚀2. 
Start Microservice with Docker (Option 2) + +## 2.1 Prepare PII detection model + +## 2.1.1 Use LLM endpoint + +TBD + +## 2.1.2 Use NER model (default mode) + +``` bash +mkdir -p pii/bigcode +apt install git-lfs +cd pii/bigcode; git clone https://{hf_username}:{hf_token}@huggingface.co/bigcode/starpii/; cd ../.. +``` + +## 2.2 Setup Environment Variables + +TBD + +## 2.3 Build Docker Image + +```bash +cd ../../../ # back to GenAIComps/ folder +docker build -t opea/guardrails-pii-detection:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/guardrails/pii_detection/docker/Dockerfile . +``` + +## 2.4 Run Docker with CLI + +```bash +docker run -d --rm --runtime=runc --name="guardrails-pii-detection-endpoint" -p 6357:6357 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/guardrails-pii-detection:latest +``` + +> Debug mode +```bash +docker run --rm --runtime=runc --name="guardrails-pii-detection-endpoint" -p 6357:6357 -v ./comps/guardrails/pii_detection/:/home/user/comps/guardrails/pii_detection/ --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/guardrails-pii-detection:latest +``` + +# 🚀3. Check Microservice Status + +``` bash +docker container logs -f guardrails-pii-detection-endpoint +``` + +# 🚀4. Consume Microservice + +Once the microservice has started, use the script below to invoke it for PII detection. 
def get_boolean_env_var(var_name, default_value=False):
    """Retrieve the boolean value of an environment variable.

    The raw value is compared case-insensitively, ignoring surrounding
    whitespace, against a fixed set of truthy and falsy spellings.

    Args:
        var_name (str): The name of the environment variable to retrieve.
        default_value (bool): The value to return if the variable is unset
            or holds a string that is neither truthy nor falsy.

    Returns:
        bool: The value of the environment variable, interpreted as a boolean.
    """
    true_values = {"true", "1", "t", "y", "yes"}
    false_values = {"false", "0", "f", "n", "no"}

    # Strip before comparing: values coming from .env files or shell exports
    # frequently carry a trailing newline or stray spaces, which would
    # otherwise silently fall through to the default.
    value = os.getenv(var_name, "").strip().lower()

    if value in true_values:
        return True
    if value in false_values:
        return False
    return default_value
def load_doc(doc_path):
    """Extract the text of a legacy Word ``.doc`` file with the ``antiword`` CLI.

    Args:
        doc_path (str): Path of the ``.doc`` file to convert.

    Returns:
        str: The text ``antiword`` writes to standard output.

    Raises:
        AssertionError: If ``antiword`` is not installed or exits with a
            non-zero status.
    """
    # Function-scope import keeps the block self-contained.
    import subprocess

    try:
        # An argument list (no shell) means quotes or shell metacharacters in
        # the file name cannot be interpreted as commands. Capturing stdout
        # removes the need for a temporary .txt file next to the document.
        # check=True makes a failing antiword raise, which os.system never did.
        completed = subprocess.run(
            ["antiword", doc_path],
            capture_output=True,
            text=True,
            check=True,
        )
    except (OSError, subprocess.CalledProcessError) as err:
        raise AssertionError(
            "antiword failed or not installed, if not installed,"
            + 'use "apt-get update && apt-get install -y antiword" to install it.'
        ) from err
    return completed.stdout
class Crawler:
    """Small same-domain web crawler built on ``requests`` and BeautifulSoup.

    Attributes set by ``__init__``:
        pool: the seed URL(s) passed at construction time (stored as given).
        headers: default browser-like request headers used by ``fetch`` and
            ``download``.
        fetched_pool: set of URLs that were fetched or scheduled for fetching.
    """

    def __init__(self, pool=None):
        if pool:
            assert isinstance(pool, (str, list, tuple)), "url pool should be str, list or tuple"
        self.pool = pool
        # Long values use implicit string concatenation. A backslash
        # continuation inside the literal would embed the source indentation
        # in the header value that is sent.
        self.headers = {
            "Accept": (
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,"
                "*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"
            ),
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, "
                "like Gecko) Chrome/113.0.0.0 Safari/537.36"
            ),
        }
        self.fetched_pool = set()

    @staticmethod
    def _ensure_scheme(url):
        """Return *url* unchanged if it has an http(s) scheme, else prefix ``http://``."""
        if url.startswith(("http://", "https://")):
            return url
        return "http://" + url

    def get_sublinks(self, soup):
        """Return the ``href`` of every ``<a>`` tag in *soup*, stringified."""
        return [str(anchor.get("href")) for anchor in soup.find_all("a")]

    def get_hyperlink(self, soup, base_url):
        """Collect crawlable links from *soup* that stay on *base_url*'s host.

        In-page anchors, links without a path, links to another host, and
        links whose last path segment has an extension other than
        ``html``/``htmld`` are skipped. Host-less links are rebuilt on the
        scheme and host of *base_url*.

        Returns:
            list[str]: the accepted links, in document order.
        """
        base = urlparse(base_url)  # loop-invariant, parse once
        sublinks = []
        for anchor in soup.find_all("a"):
            href = anchor.get("href")
            if href is None:
                continue
            link = str(href)
            if link.startswith("#") or link == "None":
                continue
            suffix = link.split("/")[-1]
            if "." in suffix and suffix.split(".")[-1] not in ["html", "htmld"]:
                continue
            parsed = urlparse(link)
            if parsed.path == "":
                continue
            if parsed.netloc != "":
                # keep the crawler inside the same domain
                if parsed.netloc != base.netloc:
                    continue
                sublinks.append(link)
            else:
                sublinks.append(
                    urlunparse(
                        (
                            base.scheme,
                            base.netloc,
                            parsed.path,
                            parsed.params,
                            parsed.query,
                            parsed.fragment,
                        )
                    )
                )
        return sublinks

    def fetch(self, url, headers=None, max_times=5):
        """GET *url*, retrying up to *max_times* times on non-200 responses.

        Args:
            url (str): target URL; ``http://`` is prepended when no http(s)
                scheme is present.
            headers (dict | None): request headers; defaults to ``self.headers``.
            max_times (int): maximum number of attempts.

        Returns:
            The ``requests`` response on HTTP 200, otherwise ``None`` once the
            attempts are used up.

        Raises:
            Exception: if ``requests.get`` itself raises (no retry in that case).
        """
        if not headers:
            headers = self.headers
        # Normalise once, before the retry loop, so an existing scheme is
        # never prefixed again.
        url = self._ensure_scheme(url)
        for _ in range(max_times):
            print(f"start fetch {url}...")
            try:
                response = requests.get(url, headers=headers, verify=True)
            except Exception as e:
                print(f"fail to fetch {url}, caused by {e}")
                raise Exception(e) from e
            if response.status_code == 200:
                return response
            print(f"fail to fetch {url}, response status code: {response.status_code}")
        return None

    def process_work(self, sub_url, work):
        """Fetch *sub_url*, run the optional *work* callback, return its links.

        Returns:
            list[str]: links found on the page, or ``[]`` if the fetch failed.
        """
        response = self.fetch(sub_url)
        if response is None:
            return []
        self.fetched_pool.add(sub_url)
        soup = self.parse(response.text)
        base_url = self.get_base_url(sub_url)
        sublinks = self.get_hyperlink(soup, base_url)
        if work:
            work(sub_url, soup)
        return sublinks

    def crawl(self, pool, work=None, max_depth=10, workers=10):
        """Breadth-first crawl starting from the URL(s) in *pool*.

        Args:
            pool: a seed URL or an iterable of seed URLs.
            work: optional callable ``work(url, soup)`` run for every fetched
                sub-page (it is executed in worker processes).
            max_depth (int): maximum number of link levels to follow.
            workers (int): number of worker processes per level.
        """
        if isinstance(pool, str):
            pool = [pool]
        url_pool = set()
        for url in pool:
            response = self.fetch(url)
            if response is None:
                # unreachable seed: skip it instead of failing on response.text
                continue
            base_url = self.get_base_url(self._ensure_scheme(url))
            soup = self.parse(response.text)
            url_pool.update(self.get_hyperlink(soup, base_url))
            self.fetched_pool.add(url)
        depth = 0
        while url_pool and depth < max_depth:
            print(f"current depth {depth}...")
            pending = [sub_url for sub_url in url_pool if sub_url not in self.fetched_pool]
            if not pending:
                break
            # process_work runs in child processes, so its updates to
            # fetched_pool never reach this process; record the URLs here to
            # avoid scheduling them again on deeper levels.
            self.fetched_pool.update(pending)
            with multiprocessing.Pool(processes=workers) as mp:
                results = [mp.apply_async(self.process_work, (sub_url, work)) for sub_url in pending]
                mp.close()
                mp.join()
                url_pool = set()
                for result in results:
                    url_pool.update(result.get())
            depth += 1

    def parse(self, html_doc):
        """Parse *html_doc* with BeautifulSoup's ``lxml`` parser."""
        return BeautifulSoup(html_doc, "lxml")

    def download(self, url, file_name):
        """Stream *url* into *file_name*; failures are printed, not raised."""
        print(f"download {url} into {file_name}...")
        try:
            # context managers close both the HTTP response and the file
            with requests.get(url, stream=True, headers=self.headers, verify=True) as r, open(
                file_name, "wb"
            ) as f:
                for chunk in r.iter_content(chunk_size=512):
                    if chunk:
                        f.write(chunk)
        except Exception as e:
            print(f"fail to download {url}, caused by {e}")

    def get_base_url(self, url):
        """Return ``scheme://netloc`` of *url*."""
        result = urlparse(url)
        return urlunparse((result.scheme, result.netloc, "", "", "", ""))

    def clean_text(self, text):
        """Collapse runs of spaces and newlines and drop blank lines."""
        text = text.strip().replace("\r", "\n")
        text = re.sub(" +", " ", text)
        text = re.sub("\n+", "\n", text)
        return "\n".join(line for line in text.split("\n") if line and line != " ")
def parse_html(input):
    """Download every http(s) link in *input* and pair its text with the link.

    Entries that do not look like an ``http://`` or ``https://`` URL are
    reported on stdout and skipped, as are links for which
    ``load_html_data`` returns ``None``.

    Args:
        input: iterable of link strings.

    Returns:
        list: one ``[stripped_text, link]`` pair per successfully loaded link.
    """
    url_shape = re.compile(r"^https?:/{2}\w.+$")
    pairs = []
    for candidate in input:
        if url_shape.match(candidate) is None:
            print("The given link/str {} cannot be parsed.".format(candidate))
            continue
        page_text = load_html_data(candidate)
        if page_text is not None:
            pairs.append([page_text.strip(), candidate])
    return pairs
100644 index 000000000..df198ca44 --- /dev/null +++ b/comps/guardrails/pii_detection/pii/__init__.py @@ -0,0 +1 @@ +__all__ = ['pii_detection', 'pii_redaction'] diff --git a/comps/guardrails/pii_detection/pii/detect/__init__.py b/comps/guardrails/pii_detection/pii/detect/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/comps/guardrails/pii_detection/pii/detect/emails_detection.py b/comps/guardrails/pii_detection/pii/detect/emails_detection.py new file mode 100644 index 000000000..02c40ea3e --- /dev/null +++ b/comps/guardrails/pii_detection/pii/detect/emails_detection.py @@ -0,0 +1,83 @@ +""" This code is adapted from BigScience PII detection +https://github.com/bigscience-workshop/data-preparation/blob/main/preprocessing/training/02_pii/bigscience_pii_detect_redact.py + +MST BigScience PII Code +Original colab that is a source of this file is located at + https://colab.research.google.com/drive/1086H3-LGMz3gX0pGy9ECgr8KflosSKso +# License +Copyright 2022 Authors of this Notebook +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import sys +import regex + +# Note: to reduce false positives, a number of technically-valid-but-rarely-used +# email address patterns (e.g. with parenthesis or slashes) will not match +email_pattern = regex.compile(r''' + (?<= ^ | [[({<\b\s@,?!;'"\p{Han}¿¡:.] | \\['"] ) # left delimiter + ( + (?: # local part + [^][(){}<>\b\s@,?!;'":#/\\=.\-] # arbitrary character + | + (?: [=.\-] (?! 
def detect_email(content):
    """Find e-mail addresses in *content* using the module-level ``email_pattern``.

    Args:
        content (str): A string containing the text to be analyzed.

    Returns:
        A list of dicts, one per match, holding the tag type (``"EMAIL"``),
        the matched string, and the start and end indices of the match.

    Raises:
        ValueError: if a match is reported but its first group is empty.
    """
    found = []
    for hit in email_pattern.finditer(content):
        groups = hit.groups()
        if not groups:
            continue
        if len(groups) > 1 and groups[1]:
            sys.stderr.write(
                "Warning: Found substring matches in the main match."
            )
        # the address itself is capture group 1; the delimiters around it are
        # look-arounds and not part of the span
        address = hit.group(1)
        begin, stop = hit.span(1)
        if not address:
            raise ValueError("No match found inside groups")
        found.append(
            {
                "tag": "EMAIL",
                "value": address,
                "start": begin,
                "end": stop,
            }
        )
    return found
compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import sys +import ipaddress +import regex + +year_patterns = [ + regex.compile( + r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([1-2][0-9]{3}[\p{Pd}/][1-2][0-9]{3})(?:$|[\s@,?!;:\'\"(.\p{Han}])" + ), # yyyy-yyyy or yyyy/yyyy + regex.compile( + r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([1-2][0-9]{3}[\p{Pd}/.][0-3][0-9][\p{Pd}/.][0-3][0-9])(?:$|[\s@,?!;:\'\"(.\p{Han}])" + ), # yyyy-mm-dd or yyyy-dd-mm or yyyy/mm/dd or yyyy/dd/mm or yyyy.mm.dd or yyyy.dd.mm + regex.compile( + r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([0-3][0-9][\p{Pd}/.][0-3][0-9][\p{Pd}/.](?:[0-9]{2}|[1-2][0-9]{3}))(?:$|[\s@,?!;:\'\"(.\p{Han}])" + ), + # mm-dd-yyyy or dd-mm-yyyy or mm/dd/yyyy or dd/mm/yyyy or mm.dd.yyyy or dd.mm.yyyy or the same but with yy instead of yyyy + regex.compile( + r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([0-3][0-9][\p{Pd}/](?:[0-9]{2}|[1-2][0-9]{3}))(?:$|[\s@,?!;:\'\"(.\p{Han}])" + ), # mm-yyyy or mm/yyyy or the same but with yy + regex.compile( + r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([1-2][0-9]{3}-[0-3][0-9])(?:$|[\s@,?!;:\'\"(.\p{Han}])" + ), # yyyy-mm or yyyy/mm +] + +ipv4_pattern = r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?:\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}" +ipv6_pattern = 
r"(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])" +ip_pattern = regex.compile( + r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])(" + + r"|".join([ipv4_pattern, ipv6_pattern]) + + ")(?:$|[\s@,?!;:'\"(.\p{Han}])", + flags=regex.MULTILINE +) + + +def matches_date_pattern(matched_str): + # Screen out date false positives + for year_regex in year_patterns: + if year_regex.match(matched_str): + return True + return False + + +def ip_has_digit(matched_str): + """Checks to make sure the PII span is not just :: or whatever that may + accidentally be picked up by making sure there are digits.""" + return any(map(str.isdigit, matched_str)) + + +def filter_versions(matched_str, context): + """Filter addresses in this format x.x.x.x and the words dns/server + don't appear in the neighboring context, usually they are just versions""" + # count occurrence of dots + dot_count = matched_str.count('.') + exclude = (dot_count == 3 and len(matched_str) == 7) + if exclude: + if "dns" in context.lower() or "server" in context.lower(): + return False + return exclude + + +def not_ip_address(matched_str): + """ make sure the string has a valid IP address format + e.g: 33.01.33.33 is not a valid IP address because of the 0 in front of 1 + TODO: fix this directly in the regex""" + try: + ipaddress.ip_address(matched_str) + return False + except 
def detect_ip(content):
    """Detect IP addresses in *content* using the module-level ``ip_pattern``.

    A candidate is discarded when it contains no digit, looks like a date,
    looks like an ``x.x.x.x`` version string (see ``filter_versions``), or is
    rejected by ``not_ip_address``.

    Args:
        content (str): A string containing the text to be analyzed.

    Returns:
        A list of dicts, one per accepted match, holding the tag type
        (``"IP_ADDRESS"``), the matched string, and the start and end indices
        of the match.

    Raises:
        ValueError: if a match is reported but its first group is empty.
    """
    matches = []
    for match in ip_pattern.finditer(content):
        groups = match.groups()
        if not groups:
            continue
        if len(groups) > 1 and groups[1]:
            sys.stderr.write(
                "Warning: Found substring matches in the main match."
            )
        value = match.group(1)
        start, end = match.span(1)
        if not value:
            raise ValueError("No match found inside groups")

        # Clamp the left edge of the context window. For a match within the
        # first 100 characters, ``start - 100`` is negative and the slice
        # would count from the END of the string, giving an empty or
        # unrelated context and hiding nearby "dns"/"server" hints.
        context = content[max(0, start - 100):end + 100]

        # Filter out false positive IPs (cheapest checks first).
        if (
            not ip_has_digit(value)
            or matches_date_pattern(value)
            or filter_versions(value, context)
            or not_ip_address(value)
        ):
            continue

        matches.append(
            {
                "tag": "IP_ADDRESS",
                "value": value,
                "start": start,
                "end": end,
            }
        )
    return matches
def is_hash(content, value):
    """Second check if the value is a hash (after gibberish detector).

    A value counts as a hash when its length is 32, 40 or 64 characters and
    the text preceding it on the same line mentions one of the keywords
    ``sha``, ``md5``, ``hash`` or ``byte`` (case-insensitive).

    Args:
        content (str): the full text in which *value* was detected.
        value (str): the detected secret candidate.

    Returns:
        bool: True if the value looks like a hash, False otherwise (also when
        *value* does not occur in *content*).
    """
    position = content.find(value)
    if position == -1:
        # TODO: fix this issue happened one for JS in the stack-smol, file did contain value
        print("Value not found in content, why this happened?")
        return False
    if len(value) not in (32, 40, 64):
        return False

    # Text on the value's own line, up to the value. When the value sits at
    # the very start of the content or of a line there is no such text.
    # Indexing the last element of ``splitlines()`` there would either raise
    # IndexError or pick up the previous line.
    target_line = ""
    pieces = content[:position].splitlines(keepends=True)
    if pieces:
        last = pieces[-1]
        if last.splitlines()[0] == last:  # no trailing line break
            target_line = last

    lowered = target_line.lower()
    keywords = ["sha", "md5", "hash", "byte"]
    return any(keyword in lowered for keyword in keywords)


def file_has_hashes(content, coeff=0.02):
    """Checks if the file contains literals 'hash' or 'sha' for more than 2% nb_of_lines.

    The threshold is ``int(coeff * number_of_newlines)``, so for short texts
    it is 0 and a single occurrence is enough.

    Args:
        content (str): the text to inspect.
        coeff (float): fraction of the newline count used as threshold.

    Returns:
        bool: True if either literal occurs more often than the threshold.
    """
    threshold = int(coeff * content.count("\n"))
    lowered = content.lower()
    return lowered.count("sha") > threshold or lowered.count("hash") > threshold


def get_indexes(text, value):
    """Return ``(start, end)`` spans of every non-overlapping *value* in *text*.

    Args:
        text (str): the text to search.
        value (str): the substring to locate.

    Returns:
        list[tuple[int, int]]: spans in ascending order; empty when *value*
        is empty or does not occur.
    """
    if not value:
        # an empty needle matches at every offset and would never advance
        return []
    width = len(value)
    spans = []
    position = text.find(value)
    while position != -1:
        spans.append((position, position + width))
        position = text.find(value, position + width)
    return spans
+ try: + from detect_secrets.core import scan + except ImportError: + os.system("pip install detect-secrets") + os.system("pip install gibberish-detector") + + from detect_secrets.settings import transient_settings + + with transient_settings( + {"plugins_used": plugins, "filters_used": filters} + ) as settings: + matches = [] + for secret in scan_secrets(content): + if is_hash(content, secret.secret_value) or file_has_hashes(content): + continue + indexes = get_indexes(content, secret.secret_value) + for start, end in indexes: + matches.append( + { + "tag": "KEY", + "value": secret.secret_value, + "start": start, + "end": end, + } + ) + return matches diff --git a/comps/guardrails/pii_detection/pii/detect/name_password_detection.py b/comps/guardrails/pii_detection/pii/detect/name_password_detection.py new file mode 100644 index 000000000..8a42ec054 --- /dev/null +++ b/comps/guardrails/pii_detection/pii/detect/name_password_detection.py @@ -0,0 +1,33 @@ +from .utils import PIIEntityType + + +def detect_name_password(content, pipeline, entity_types=None): + """Detects name and password in a string using bigcode/starpii model + Args: + entity_types: detection types + pipeline: a transformer model + content (str): A string containing the text to be analyzed. + Returns: + A list of dicts containing the tag type, the matched string, and the start and + end indices of the match. 
class PIIEntityType(Enum):
    """Kinds of personally identifiable information the detectors can report."""

    IP_ADDRESS = auto()
    NAME = auto()
    EMAIL = auto()
    PHONE_NUMBER = auto()
    PASSWORD = auto()
    KEY = auto()

    @classmethod
    def default(cls):
        """Return the entity types used when the caller does not pick any.

        Returns:
            list: IP_ADDRESS, EMAIL, PHONE_NUMBER and KEY, in that order.
        """
        return [cls.IP_ADDRESS, cls.EMAIL, cls.PHONE_NUMBER, cls.KEY]

    @classmethod
    def parse(cls, entity):
        """Translate a textual entity name into its enum member.

        Args:
            entity: one of "name", "password", "email", "phone_number", "ip"
                or "key".

        Returns:
            PIIEntityType: the matching member.

        Raises:
            NotImplementedError: if ``entity`` is not one of the names above.
        """
        # Built inside the method: a dict assigned in the Enum body would be
        # turned into an enum member.
        names = {
            "name": cls.NAME,
            "password": cls.PASSWORD,
            "email": cls.EMAIL,
            "phone_number": cls.PHONE_NUMBER,
            # "ip" used to map to PHONE_NUMBER by mistake.
            "ip": cls.IP_ADDRESS,
            "key": cls.KEY,
        }
        try:
            return names[entity]
        except (KeyError, TypeError):
            # TypeError covers unhashable input, which the former comparison
            # chain also reported as unsupported.
            raise NotImplementedError(f" entity type {entity} is not supported!") from None
def get_pii_detection_inst(strategy="dummy", settings=None):
    """Create the PII detector that corresponds to ``strategy``.

    Args:
        strategy: "ner", "ml" or "llm"; any other value (including the default
            "dummy") selects the base ``PIIDetector``.
        settings: accepted for interface compatibility; not used here.

    Returns:
        A newly constructed detector instance.
    """
    known_strategies = (
        ("ner", lambda: PIIDetectorWithNER(model_path="pii")),
        ("ml", lambda: PIIDetectorWithML()),
        ("llm", lambda: PIIDetectorWithLLM()),
    )
    for label, build in known_strategies:
        if strategy == label:
            return build()
    # Default strategy - dummy
    return PIIDetector()
@register_microservice(
    name="opea_service@guardrails-pii-detection", endpoint="/v1/piidetect", host="0.0.0.0", port=6357
)
async def pii_detection(files: List[UploadFile] = File(None), link_list: str = Form(None), text_list: str = Form(None)):
    """Run PII detection on uploaded files, raw texts or links.

    Exactly one kind of input is processed per request, checked in this order:
    ``files``, then ``text_list``, then ``link_list``. Ray is enabled only when
    more than 10 items are submitted.

    Args:
        files: uploaded documents; they are stored under ``./uploaded_files/``
            before being scanned.
        link_list: JSON-encoded list of links (a single JSON value is wrapped
            into a list).
        text_list: JSON-encoded list of texts (a single JSON value is wrapped
            into a list).

    Returns:
        dict: ``{"status": 200, "message": <JSON-encoded detection results>}``.

    Raises:
        HTTPException: status 400 when no input is given, when a JSON field
            cannot be decoded, or when processing fails.
    """
    if not files and not link_list and not text_list:
        raise HTTPException(status_code=400, detail="Either files, link_list, or text_list must be provided.")

    strategy = "ner"  # Default strategy
    pip_requirement = ["detect-secrets", "phonenumbers", "gibberish-detector"]

    if files:
        saved_path_list = []
        try:
            if not isinstance(files, list):
                files = [files]
            upload_folder = "./uploaded_files/"
            Path(upload_folder).mkdir(parents=True, exist_ok=True)

            # TODO: use ray to parallelize the file saving
            for file in files:
                # The file name is supplied by the client: keep only its last
                # path component so a name such as "../../x" cannot be written
                # outside the upload folder.
                safe_name = os.path.basename((file.filename or "").replace("\\", "/"))
                if safe_name in ("", ".", ".."):
                    raise ValueError(f"invalid upload file name: {file.filename!r}")
                save_path = upload_folder + safe_name
                await save_file_to_local_disk(save_path, file)
                saved_path_list.append(DocPath(path=save_path))

            enable_ray = len(saved_path_list) > 10
            if enable_ray:
                prepare_env(enable_ray=enable_ray, pip_requirements=pip_requirement, comps_path=comps_path)
            ret = file_based_pii_detect(saved_path_list, strategy, enable_ray=enable_ray)
            return {"status": 200, "message": json.dumps(ret)}
        except Exception as e:
            raise HTTPException(status_code=400, detail=f"An error occurred: {e}")

    if text_list:
        try:
            text_list = json.loads(text_list)  # Parse JSON string to list
            if not isinstance(text_list, list):
                text_list = [text_list]
            enable_ray = len(text_list) > 10
            if enable_ray:
                prepare_env(enable_ray=enable_ray, pip_requirements=pip_requirement, comps_path=comps_path)
            ret = text_based_pii_detect(text_list, strategy, enable_ray=enable_ray)
            return {"status": 200, "message": json.dumps(ret)}
        except json.JSONDecodeError:
            # This message used to name link_list, pointing callers at the wrong field.
            raise HTTPException(status_code=400, detail="Invalid JSON format for text_list.")
        except Exception as e:
            raise HTTPException(status_code=400, detail=f"An error occurred: {e}")

    if link_list:
        try:
            link_list = json.loads(link_list)  # Parse JSON string to list
            if not isinstance(link_list, list):
                link_list = [link_list]
            enable_ray = len(link_list) > 10
            if enable_ray:
                prepare_env(enable_ray=enable_ray, pip_requirements=pip_requirement, comps_path=comps_path)
            ret = link_based_pii_detect(link_list, strategy, enable_ray=enable_ray)
            return {"status": 200, "message": json.dumps(ret)}
        except json.JSONDecodeError:
            raise HTTPException(status_code=400, detail="Invalid JSON format for link_list.")
        except Exception as e:
            raise HTTPException(status_code=400, detail=f"An error occurred: {e}")
tqdm import tqdm +from utils import Timer, get_failable_with_time, timeout, save_logs + +from typing import TYPE_CHECKING, Any, Dict, Iterator +from typing import Callable, List, Optional, Union + +class RayDataLoader(FileBasedDatasource): + def __init__( + self, + paths: Union[str, List[str]], + dataloader_callable: Optional[Callable], + document_ld_args: Optional[Dict[str, Any]] = None, + **file_based_datasource_kwargs, + ): + super().__init__(paths, **file_based_datasource_kwargs) + self.dataloader_callable = dataloader_callable + self.args = document_ld_args or {} + + def _read_stream(self, f: "pyarrow.NativeFile", path: str) -> Iterator[Block]: + from ray.data._internal.arrow_block import ArrowBlockBuilder + + builder = ArrowBlockBuilder() + path = f"{path}" + data, error, read_time = self.dataloader_callable(path) + item = {"data": data, "filename": path, "error": error, "read_time": f"{read_time} secs"} + builder.add(item) + yield builder.build() + +def rayds_initialization(file_paths, dataloader_callable, lazy_mode=True, num_cpus=20): + if dataloader_callable is None: + text_list = [{"data": data, "filename": data[:50], "error": None, "read_time": f"0 secs"} for data in file_paths] + return ray.data.from_items(text_list) + + decorated_dataloader_callable = get_failable_with_time(dataloader_callable) + if lazy_mode: + if num_cpus is None: + return ray.data.read_datasource(RayDataLoader(file_paths, decorated_dataloader_callable)) + else: + return ray.data.read_datasource( + RayDataLoader(file_paths, decorated_dataloader_callable), ray_remote_args={"num_cpus": num_cpus} + ) + else: + data = [] + for file in tqdm(file_paths, total=len(file_paths)): + content, error, elapse_time = decorated_dataloader_callable(file) + item = {"data": content, "filename": file, "error": error, "read_time": f"{elapse_time} secs"} + data.append(item) + return ray.data.from_items(data) + +def ray_runner_initialization(func, debug=False): + @timeout(600) + def ray_runner(data): + 
content = data["data"] + if content is None: + return { + "filename": data["filename"], + "content": content, + "status": "failed", + "ret": -1, + "error": data["error"], + "read_time": data["read_time"], + "elaspe_time": "0.0 secs", + } + + decorated_callable = get_failable_with_time(func) + ret, error, elapse_time = decorated_callable(content) + status = "success" if not error else "failed" + if not debug: + content = None + return { + "filename": data["filename"], + "content": content, + "status": status, + "ret": ret, + "error": error, + "read_time": data["read_time"], + "elaspe_time": f"{elapse_time} secs", + } + return ray_runner + +def ray_execute(ds, log_name): + with Timer(f"execute with Ray, status log: {log_name}"): + ret_with_status = ds.take_all() + df = save_logs(log_name, ret_with_status) + ret = df["ret"].to_list() + return ret \ No newline at end of file diff --git a/comps/guardrails/pii_detection/requirements.txt b/comps/guardrails/pii_detection/requirements.txt new file mode 100644 index 000000000..ef2d97c9a --- /dev/null +++ b/comps/guardrails/pii_detection/requirements.txt @@ -0,0 +1,25 @@ +beautifulsoup4 +docarray[full] +easyocr +fastapi +huggingface_hub +langchain +langchain-community +langsmith +numpy +opentelemetry-api +opentelemetry-exporter-otlp +opentelemetry-sdk +pandas +Pillow +pyarrow +pymupdf +python-docx +ray +redis +sentence_transformers +shortuuid +virtualenv +phonenumbers +detect_secrets +gibberish-detector diff --git a/comps/guardrails/pii_detection/schema.yml b/comps/guardrails/pii_detection/schema.yml new file mode 100644 index 000000000..0c0ca9711 --- /dev/null +++ b/comps/guardrails/pii_detection/schema.yml @@ -0,0 +1,14 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +text: + - name: content + - name: source +numeric: + - name: start_index +vector: + - name: content_vector + algorithm: HNSW + datatype: FLOAT32 + dims: 384 + distance_metric: COSINE diff --git 
a/comps/guardrails/pii_detection/test.py b/comps/guardrails/pii_detection/test.py new file mode 100644 index 000000000..db40b7d1f --- /dev/null +++ b/comps/guardrails/pii_detection/test.py @@ -0,0 +1,76 @@ +import requests +import os +import timeit +import pandas as pd +import json +from utils import Timer +import argparse + +def test_html(ip_addr="localhost", batch_size=20): + proxies = {'http':""} + url = f'http://{ip_addr}:6357/v1/piidetect' + urls = pd.read_csv("data/ai_rss.csv")['Permalink'] + urls = urls[:batch_size].to_list() + payload = {"link_list": json.dumps(urls)} + + with Timer(f"send {len(urls)} link to pii detection endpoint"): + try: + resp = requests.post(url=url, data=payload, proxies=proxies) + print(resp.text) + resp.raise_for_status() # Raise an exception for unsuccessful HTTP status codes + print("Request successful!") + except requests.exceptions.RequestException as e: + print("An error occurred:", e) + + +def test_text(ip_addr="localhost", batch_size=20): + proxies = {'http':""} + url = f'http://{ip_addr}:6357/v1/piidetect' + content = pd.read_csv("data/ai_rss.csv")['Description'] + content = content[:batch_size].to_list() + payload = {"text_list": json.dumps(content)} + + with Timer(f"send {len(content)} text to pii detection endpoint"): + try: + resp = requests.post(url=url, data=payload, proxies=proxies) + print(resp.text) + resp.raise_for_status() # Raise an exception for unsuccessful HTTP status codes + print("Request successful!") + except requests.exceptions.RequestException as e: + print("An error occurred:", e) + + +def test_pdf(ip_addr="localhost", batch_size=20): + proxies = {'http':""} + url = f'http://{ip_addr}:6357/v1/piidetect' + dir_path = "data/pdf" + file_list = os.listdir(dir_path) + file_list = file_list[:batch_size] + files = [('files', (f, open(os.path.join(dir_path, f), 'rb'), 'application/pdf')) for f in file_list] + with Timer(f"send {len(files)} documents to pii detection endpoint"): + try: + resp = 
if __name__ == "__main__":
    # Command-line entry point: pick exactly one test type and send a batch of
    # requests to the PII detection endpoint.
    parser = argparse.ArgumentParser()
    parser.add_argument('--test_html', action='store_true', help='Test HTML pii detection')
    parser.add_argument('--test_pdf', action='store_true', help='Test PDF pii detection')
    parser.add_argument('--test_text', action='store_true', help='Test Text pii detection')
    parser.add_argument('--batch_size', type=int, default=20, help='Batch size for testing')
    parser.add_argument('--ip_addr', type=str, default="localhost", help='IP address of the server')

    # The parsed --ip_addr value is used as given; a hard-coded address used to
    # overwrite it here, which made the option and its default ineffective.
    args = parser.parse_args()
    if args.test_html:
        test_html(ip_addr=args.ip_addr, batch_size=args.batch_size)
    elif args.test_pdf:
        test_pdf(ip_addr=args.ip_addr, batch_size=args.batch_size)
    elif args.test_text:
        test_text(ip_addr=args.ip_addr, batch_size=args.batch_size)
    else:
        print("Please specify the test type")
def get_max_cpus(total_num_tasks):
    """Return how many CPUs each task should request.

    The CPUs reported by the host are divided evenly between the tasks. When
    there are more tasks than CPUs the share would be zero, so a fixed
    fallback of 8 is used instead, capped at the number of CPUs the host has.

    Args:
        total_num_tasks (int): number of tasks to run; values below 1 are
            treated as a single task.

    Returns:
        int: CPUs per task, always at least 1.
    """
    # os.cpu_count() returns None when the count cannot be determined.
    num_cpus_available = os.cpu_count() or 1
    # Guard the division: zero tasks used to raise ZeroDivisionError.
    num_tasks = total_num_tasks if total_num_tasks > 0 else 1
    num_cpus_per_task = num_cpus_available // num_tasks
    if num_cpus_per_task == 0:
        # NOTE(review): callers forward this value to Ray as the per-task
        # num_cpus; a request above the host's CPU count presumably can never
        # be scheduled, hence the cap. Confirm against the Ray docs.
        return min(8, num_cpus_available)
    return num_cpus_per_task
bash +```bash mkdir -p pii/bigcode apt install git-lfs cd pii/bigcode; git clone https://{hf_username}:{hf_token}@huggingface.co/bigcode/starpii/; cd ../.. @@ -64,13 +64,14 @@ docker run -d --rm --runtime=runc --name="guardrails-pii-detection-endpoint" -p ``` > debug mode + ```bash docker run --rm --runtime=runc --name="guardrails-pii-detection-endpoint" -p 6357:6357 -v ./comps/guardrails/pii_detection/:/home/user/comps/guardrails/pii_detection/ --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/guardrails-pii-detection:latest ``` # 🚀3. Status Microservice -``` bash +```bash docker container logs -f guardrails-pii-detection-endpoint ``` @@ -78,7 +79,7 @@ docker container logs -f guardrails-pii-detection-endpoint Once microservice starts, user can use below script to invoke the microservice for pii detection. -``` python +```python import requests import json diff --git a/comps/guardrails/pii_detection/config.py b/comps/guardrails/pii_detection/config.py index 69a0836c5..430532a15 100644 --- a/comps/guardrails/pii_detection/config.py +++ b/comps/guardrails/pii_detection/config.py @@ -39,8 +39,7 @@ def get_boolean_env_var(var_name, default_value=False): return default_value - LLM_URL = os.getenv("LLM_ENDPOINT_URL", None) current_file_path = pathlib.Path(__file__).parent.resolve() -comps_path = os.path.join(current_file_path, "../../../") \ No newline at end of file +comps_path = os.path.join(current_file_path, "../../../") diff --git a/comps/guardrails/pii_detection/data_utils.py b/comps/guardrails/pii_detection/data_utils.py index b6c09a290..dfafbc670 100644 --- a/comps/guardrails/pii_detection/data_utils.py +++ b/comps/guardrails/pii_detection/data_utils.py @@ -1,7 +1,14 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +import io +import json +import multiprocessing import os +import re +import unicodedata +from urllib.parse import urlparse, urlunparse + import easyocr import fitz import numpy as np @@ -9,13 +16,6 
@@ import requests import yaml from bs4 import BeautifulSoup - -import io -import json -import multiprocessing -import re -import unicodedata -from urllib.parse import urlparse, urlunparse from docx import Document as DDocument from langchain_community.document_loaders import ( UnstructuredImageLoader, @@ -26,6 +26,7 @@ from PIL import Image from utils import timeout + def load_pdf(pdf_path): """Load the pdf file.""" doc = fitz.open(pdf_path) diff --git a/comps/guardrails/pii_detection/pii/__init__.py b/comps/guardrails/pii_detection/pii/__init__.py index df198ca44..4505e776e 100644 --- a/comps/guardrails/pii_detection/pii/__init__.py +++ b/comps/guardrails/pii_detection/pii/__init__.py @@ -1 +1,4 @@ -__all__ = ['pii_detection', 'pii_redaction'] +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +__all__ = ["pii_detection", "pii_redaction"] diff --git a/comps/guardrails/pii_detection/pii/detect/__init__.py b/comps/guardrails/pii_detection/pii/detect/__init__.py index e69de29bb..916f3a44b 100644 --- a/comps/guardrails/pii_detection/pii/detect/__init__.py +++ b/comps/guardrails/pii_detection/pii/detect/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/guardrails/pii_detection/pii/detect/emails_detection.py b/comps/guardrails/pii_detection/pii/detect/emails_detection.py index 02c40ea3e..4c5704fbd 100644 --- a/comps/guardrails/pii_detection/pii/detect/emails_detection.py +++ b/comps/guardrails/pii_detection/pii/detect/emails_detection.py @@ -1,4 +1,6 @@ -""" This code is adapted from BigScience PII detection +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +""" This code is adapted from BigScience PII detection https://github.com/bigscience-workshop/data-preparation/blob/main/preprocessing/training/02_pii/bigscience_pii_detect_redact.py MST BigScience PII Code @@ -18,11 +20,13 @@ """ import sys + import regex # Note: to reduce false 
positives, a number of technically-valid-but-rarely-used # email address patterns (e.g. with parenthesis or slashes) will not match -email_pattern = regex.compile(r''' +email_pattern = regex.compile( + r""" (?<= ^ | [[({<\b\s@,?!;'"\p{Han}¿¡:.] | \\['"] ) # left delimiter ( (?: # local part @@ -44,7 +48,9 @@ (?: [\p{L}\p{M}]{2,63} | xn-- \w+ ) # TLD, including IDN ) (?= $ | [])}>\b\s@,?!;'"\p{Han}] | \\['"] | : (?! \d) | \. (?! \S)) # right delim -''', flags=regex.MULTILINE | regex.VERBOSE) +""", + flags=regex.MULTILINE | regex.VERBOSE, +) def detect_email(content): @@ -63,9 +69,7 @@ def detect_email(content): for match in matches_tmp: if match.groups(): if len(match.groups()) > 1 and match.groups()[1]: - sys.stderr.write( - "Warning: Found substring matches in the main match." - ) + sys.stderr.write("Warning: Found substring matches in the main match.") # setup outputs value = match.group(1) start, end = match.span(1) diff --git a/comps/guardrails/pii_detection/pii/detect/ip_detection.py b/comps/guardrails/pii_detection/pii/detect/ip_detection.py index 24bf33b42..3616f1903 100644 --- a/comps/guardrails/pii_detection/pii/detect/ip_detection.py +++ b/comps/guardrails/pii_detection/pii/detect/ip_detection.py @@ -1,4 +1,6 @@ -""" This code is adapted from BigScience PII detection +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +""" This code is adapted from BigScience PII detection https://github.com/bigscience-workshop/data-preparation/blob/main/preprocessing/training/02_pii/bigscience_pii_detect_redact.py MST BigScience PII Code @@ -17,8 +19,9 @@ limitations under the License. 
""" -import sys import ipaddress +import sys + import regex year_patterns = [ @@ -43,10 +46,8 @@ ipv4_pattern = r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?:\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}" ipv6_pattern = r"(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])" ip_pattern = regex.compile( - r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])(" - + r"|".join([ipv4_pattern, ipv6_pattern]) - + ")(?:$|[\s@,?!;:'\"(.\p{Han}])", - flags=regex.MULTILINE + r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])(" + r"|".join([ipv4_pattern, ipv6_pattern]) + ")(?:$|[\s@,?!;:'\"(.\p{Han}])", + flags=regex.MULTILINE, ) @@ -66,10 +67,10 @@ def ip_has_digit(matched_str): def filter_versions(matched_str, context): """Filter addresses in this format x.x.x.x and the words dns/server - don't appear in the neighboring context, usually they are just versions""" - # count occurrence of dots - dot_count = matched_str.count('.') - exclude = (dot_count == 3 and len(matched_str) == 7) + don't appear in the neighboring context, usually they are just versions.""" + # count occurrence of dots + dot_count = matched_str.count(".") + exclude = dot_count == 3 and len(matched_str) == 7 if exclude: if "dns" in context.lower() or "server" in context.lower(): return False @@ -77,7 +78,7 @@ def filter_versions(matched_str, context): def not_ip_address(matched_str): - """ make 
sure the string has a valid IP address format + """make sure the string has a valid IP address format e.g: 33.01.33.33 is not a valid IP address because of the 0 in front of 1 TODO: fix this directly in the regex""" try: @@ -103,9 +104,7 @@ def detect_ip(content): for match in matches_tmp: if match.groups(): if len(match.groups()) > 1 and match.groups()[1]: - sys.stderr.write( - "Warning: Found substring matches in the main match." - ) + sys.stderr.write("Warning: Found substring matches in the main match.") # setup outputs value = match.group(1) start, end = match.span(1) @@ -115,7 +114,7 @@ def detect_ip(content): continue if matches_date_pattern(value): continue - if filter_versions(value, content[start - 100:end + 100]) or not_ip_address(value): + if filter_versions(value, content[start - 100 : end + 100]) or not_ip_address(value): continue # combine if conditions in one diff --git a/comps/guardrails/pii_detection/pii/detect/keys_detection.py b/comps/guardrails/pii_detection/pii/detect/keys_detection.py index ada10f71d..97fe1e8a2 100755 --- a/comps/guardrails/pii_detection/pii/detect/keys_detection.py +++ b/comps/guardrails/pii_detection/pii/detect/keys_detection.py @@ -1,9 +1,10 @@ -""" This code is adapted from BigCode PII +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +""" This code is adapted from BigCode PII https://github.com/bigcode-project/bigcode-dataset/blob/main/pii/utils/keys_detection.py """ import os - # Secrets detection with detect-secrets tool @@ -44,7 +45,7 @@ def get_detector_model(): def is_gibberish(matched_str): - """Checks to make sure the PII span is gibberish and not word like""" + """Checks to make sure the PII span is gibberish and not word like.""" # pip install gibberish-detector # download the training corpora from https://raw.githubusercontent.com/domanchi/gibberish-detector/master/examples/big.txt # run gibberish-detector train big.txt > big.model to generate the model (it takes 3 seconds) @@ -62,7 
+63,7 @@ def is_hash(content, value): # TODO: fix this issue happened one for JS in the stack-smol, file did contain value print("Value not found in content, why this happened?") return False - lines = content[:content.index(value)].splitlines() + lines = content[: content.index(value)].splitlines() target_line = lines[-1] if len(value) in [32, 40, 64]: # if "sha" or "md5" are in content: @@ -73,7 +74,7 @@ def is_hash(content, value): def file_has_hashes(content, coeff=0.02): - """Checks if the file contains literals 'hash' or 'sha' for more than 2% nb_of_lines""" + """Checks if the file contains literals 'hash' or 'sha' for more than 2% nb_of_lines.""" lines = content.splitlines() count_sha = 0 count_hash = 0 @@ -108,13 +109,12 @@ def scan_secrets(line: str): lines = line.splitlines(keepends=True) for secret in _process_line_based_plugins( - lines=list(enumerate(lines, start=1)), - filename="Adhoc String", + lines=list(enumerate(lines, start=1)), + filename="Adhoc String", ): yield secret - def detect_keys(content): """Detect secret keys in content using detect-secrets tool Args: @@ -135,9 +135,7 @@ def detect_keys(content): from detect_secrets.settings import transient_settings - with transient_settings( - {"plugins_used": plugins, "filters_used": filters} - ) as settings: + with transient_settings({"plugins_used": plugins, "filters_used": filters}) as settings: matches = [] for secret in scan_secrets(content): if is_hash(content, secret.secret_value) or file_has_hashes(content): diff --git a/comps/guardrails/pii_detection/pii/detect/name_password_detection.py b/comps/guardrails/pii_detection/pii/detect/name_password_detection.py index 8a42ec054..c02c6c80b 100644 --- a/comps/guardrails/pii_detection/pii/detect/name_password_detection.py +++ b/comps/guardrails/pii_detection/pii/detect/name_password_detection.py @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + from .utils import PIIEntityType @@ -17,8 +20,9 @@ def 
detect_name_password(content, pipeline, entity_types=None): try: for entity in pipeline(content): entity_group = entity["entity_group"] - if ("NAME" == entity_group and PIIEntityType.NAME in entity_types) or \ - ("PASSWORD" == entity_group and PIIEntityType.PASSWORD in entity_types): + if ("NAME" == entity_group and PIIEntityType.NAME in entity_types) or ( + "PASSWORD" == entity_group and PIIEntityType.PASSWORD in entity_types + ): matches.append( { "tag": entity_group, diff --git a/comps/guardrails/pii_detection/pii/detect/phones_detection.py b/comps/guardrails/pii_detection/pii/detect/phones_detection.py index cf130be36..0f6f770d3 100644 --- a/comps/guardrails/pii_detection/pii/detect/phones_detection.py +++ b/comps/guardrails/pii_detection/pii/detect/phones_detection.py @@ -1,13 +1,17 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import os + def detect_phones(text): - """Detects phone in a string using phonenumbers libray only detection the international phone number""" + """Detects phone in a string using phonenumbers library only detection the international phone number.""" try: import phonenumbers except ImportError: os.system("pip install phonenumbers") import phonenumbers - + matches = [] for match in phonenumbers.PhoneNumberMatcher(text, "IN"): diff --git a/comps/guardrails/pii_detection/pii/detect/utils.py b/comps/guardrails/pii_detection/pii/detect/utils.py index da5d6d65a..60b8414fe 100644 --- a/comps/guardrails/pii_detection/pii/detect/utils.py +++ b/comps/guardrails/pii_detection/pii/detect/utils.py @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + from enum import Enum, auto diff --git a/comps/guardrails/pii_detection/pii/pii_utils.py b/comps/guardrails/pii_detection/pii/pii_utils.py index c92fc06a0..6dae7d002 100644 --- a/comps/guardrails/pii_detection/pii/pii_utils.py +++ b/comps/guardrails/pii_detection/pii/pii_utils.py @@ -1,39 +1,47 @@ -from .detect.ip_detection 
import detect_ip +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +from typing import List + from .detect.emails_detection import detect_email -from .detect.phones_detection import detect_phones -from .detect.name_password_detection import detect_name_password +from .detect.ip_detection import detect_ip from .detect.keys_detection import detect_keys +from .detect.name_password_detection import detect_name_password +from .detect.phones_detection import detect_phones from .detect.utils import PIIEntityType -from typing import List -import os class PIIDetector: def __init__(strategy: str): pass - + def detect_pii(self, data): import random + return random.choice([True, False]) - + + class PIIDetectorWithLLM(PIIDetector): def __init__(self): super().__init__() - + def detect_pii(self, text): return True - + + class PIIDetectorWithNER(PIIDetector): def __init__(self, model_path=None): super().__init__() - from transformers import pipeline - from transformers import AutoTokenizer + from transformers import AutoTokenizer, pipeline _model_key = "bigcode/starpii" _model_key = _model_key if model_path is None else os.path.join(model_path, _model_key) tokenizer = AutoTokenizer.from_pretrained(_model_key, model_max_length=512) - self.pipeline = pipeline(model=_model_key, task='token-classification', tokenizer=tokenizer, grouped_entities=True) - + self.pipeline = pipeline( + model=_model_key, task="token-classification", tokenizer=tokenizer, grouped_entities=True + ) + def detect_pii(self, text): result = [] # use a regex to detect ip addresses @@ -53,14 +61,15 @@ def detect_pii(self, text): if PIIEntityType.NAME in entity_types or PIIEntityType.PASSWORD in entity_types: result = result + detect_name_password(text, self.pipeline, entity_types) - + print(result) - + return True if len(result) > 0 else False # Dummy function, replace with actual logic - + + class PIIDetectorWithML(PIIDetector): def __init__(self): super().__init__() - + 
def detect_pii(self, text): return True diff --git a/comps/guardrails/pii_detection/pii_detection.py b/comps/guardrails/pii_detection/pii_detection.py index 499ff834a..8f73a4e4e 100644 --- a/comps/guardrails/pii_detection/pii_detection.py +++ b/comps/guardrails/pii_detection/pii_detection.py @@ -14,14 +14,27 @@ comps_path = os.path.join(cur_path, "../../../") sys.path.append(comps_path) -from tqdm import tqdm from typing import List +from tqdm import tqdm + from comps import DocPath, opea_microservices, register_microservice -from comps.guardrails.pii_detection.utils import Timer, generate_log_name, prepare_env, save_file_to_local_disk, get_max_cpus from comps.guardrails.pii_detection.data_utils import document_loader, parse_html -from comps.guardrails.pii_detection.ray_utils import rayds_initialization, ray_runner_initialization, ray_execute -from comps.guardrails.pii_detection.pii.pii_utils import PIIDetector, PIIDetectorWithLLM, PIIDetectorWithNER, PIIDetectorWithML +from comps.guardrails.pii_detection.pii.pii_utils import ( + PIIDetector, + PIIDetectorWithLLM, + PIIDetectorWithML, + PIIDetectorWithNER, +) +from comps.guardrails.pii_detection.ray_utils import ray_execute, ray_runner_initialization, rayds_initialization +from comps.guardrails.pii_detection.utils import ( + Timer, + generate_log_name, + get_max_cpus, + prepare_env, + save_file_to_local_disk, +) + def get_pii_detection_inst(strategy="dummy", settings=None): if strategy == "ner": @@ -34,6 +47,7 @@ def get_pii_detection_inst(strategy="dummy", settings=None): # Default strategy - dummy return PIIDetector() + def file_based_pii_detect(file_list: List[DocPath], strategy, enable_ray=False, debug=False): """Ingest document to Redis.""" file_list = [f.path for f in file_list] @@ -57,6 +71,7 @@ def file_based_pii_detect(file_list: List[DocPath], strategy, enable_ray=False, ret.append(pii_detector.detect_pii(data)) return ret + def link_based_pii_detect(link_list: List[str], strategy, enable_ray=False, 
debug=False): link_list = [str(f) for f in link_list] pii_detector = get_pii_detection_inst(strategy=strategy) @@ -64,11 +79,11 @@ def link_based_pii_detect(link_list: List[str], strategy, enable_ray=False, debu def _parse_html(link): data = parse_html([link]) return data[0][0] - + if enable_ray: num_cpus = get_max_cpus(len(link_list)) print(f"per task num_cpus: {num_cpus}") - + log_name = generate_log_name(link_list) ds = rayds_initialization(link_list, _parse_html, lazy_mode=True, num_cpus=num_cpus) ds = ds.map(ray_runner_initialization(pii_detector.detect_pii, debug=debug), num_cpus=num_cpus) @@ -84,6 +99,7 @@ def _parse_html(link): ret.append(pii_detector.detect_pii(data)) return ret + def text_based_pii_detect(text_list: List[str], strategy, enable_ray=False, debug=False): text_list = [str(f) for f in text_list] pii_detector = get_pii_detection_inst(strategy=strategy) @@ -105,11 +121,14 @@ def text_based_pii_detect(text_list: List[str], strategy, enable_ray=False, debu ret.append(pii_detector.detect_pii(data)) return ret -@register_microservice(name="opea_service@guardrails-pii-detection", endpoint="/v1/piidetect", host="0.0.0.0", port=6357) + +@register_microservice( + name="opea_service@guardrails-pii-detection", endpoint="/v1/piidetect", host="0.0.0.0", port=6357 +) async def pii_detection(files: List[UploadFile] = File(None), link_list: str = Form(None), text_list: str = Form(None)): if not files and not link_list and not text_list: raise HTTPException(status_code=400, detail="Either files, link_list, or text_list must be provided.") - + strategy = "ner" # Default strategy pip_requirement = ["detect-secrets", "phonenumbers", "gibberish-detector"] @@ -135,7 +154,7 @@ async def pii_detection(files: List[UploadFile] = File(None), link_list: str = F return {"status": 200, "message": json.dumps(ret)} except Exception as e: raise HTTPException(status_code=400, detail=f"An error occurred: {e}") - + if text_list: try: text_list = json.loads(text_list) # Parse JSON 
string to list @@ -168,4 +187,4 @@ async def pii_detection(files: List[UploadFile] = File(None), link_list: str = F if __name__ == "__main__": - opea_microservices["opea_service@guardrails-pii-detection"].start() \ No newline at end of file + opea_microservices["opea_service@guardrails-pii-detection"].start() diff --git a/comps/guardrails/pii_detection/ray_utils.py b/comps/guardrails/pii_detection/ray_utils.py index 103bbc526..c4fca4c93 100644 --- a/comps/guardrails/pii_detection/ray_utils.py +++ b/comps/guardrails/pii_detection/ray_utils.py @@ -1,15 +1,15 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +from typing import TYPE_CHECKING, Any, Callable, Dict, Iterator, List, Optional, Union + import pyarrow import ray from ray.data.block import Block from ray.data.datasource import FileBasedDatasource from tqdm import tqdm -from utils import Timer, get_failable_with_time, timeout, save_logs +from utils import Timer, get_failable_with_time, save_logs, timeout -from typing import TYPE_CHECKING, Any, Dict, Iterator -from typing import Callable, List, Optional, Union class RayDataLoader(FileBasedDatasource): def __init__( @@ -32,12 +32,15 @@ def _read_stream(self, f: "pyarrow.NativeFile", path: str) -> Iterator[Block]: item = {"data": data, "filename": path, "error": error, "read_time": f"{read_time} secs"} builder.add(item) yield builder.build() - + + def rayds_initialization(file_paths, dataloader_callable, lazy_mode=True, num_cpus=20): if dataloader_callable is None: - text_list = [{"data": data, "filename": data[:50], "error": None, "read_time": f"0 secs"} for data in file_paths] + text_list = [ + {"data": data, "filename": data[:50], "error": None, "read_time": "0 secs"} for data in file_paths + ] return ray.data.from_items(text_list) - + decorated_dataloader_callable = get_failable_with_time(dataloader_callable) if lazy_mode: if num_cpus is None: @@ -53,7 +56,8 @@ def rayds_initialization(file_paths, dataloader_callable, 
lazy_mode=True, num_cp item = {"data": content, "filename": file, "error": error, "read_time": f"{elapse_time} secs"} data.append(item) return ray.data.from_items(data) - + + def ray_runner_initialization(func, debug=False): @timeout(600) def ray_runner(data): @@ -83,11 +87,13 @@ def ray_runner(data): "read_time": data["read_time"], "elaspe_time": f"{elapse_time} secs", } + return ray_runner + def ray_execute(ds, log_name): with Timer(f"execute with Ray, status log: {log_name}"): ret_with_status = ds.take_all() df = save_logs(log_name, ret_with_status) ret = df["ret"].to_list() - return ret \ No newline at end of file + return ret diff --git a/comps/guardrails/pii_detection/requirements.txt b/comps/guardrails/pii_detection/requirements.txt index ef2d97c9a..d942bf347 100644 --- a/comps/guardrails/pii_detection/requirements.txt +++ b/comps/guardrails/pii_detection/requirements.txt @@ -1,7 +1,9 @@ beautifulsoup4 +detect_secrets docarray[full] easyocr fastapi +gibberish-detector huggingface_hub langchain langchain-community @@ -11,6 +13,7 @@ opentelemetry-api opentelemetry-exporter-otlp opentelemetry-sdk pandas +phonenumbers Pillow pyarrow pymupdf @@ -20,6 +23,3 @@ redis sentence_transformers shortuuid virtualenv -phonenumbers -detect_secrets -gibberish-detector diff --git a/comps/guardrails/pii_detection/test.py b/comps/guardrails/pii_detection/test.py index db40b7d1f..acfc40843 100644 --- a/comps/guardrails/pii_detection/test.py +++ b/comps/guardrails/pii_detection/test.py @@ -1,69 +1,75 @@ -import requests +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import json import os import timeit + import pandas as pd -import json +import requests from utils import Timer -import argparse + def test_html(ip_addr="localhost", batch_size=20): - proxies = {'http':""} - url = f'http://{ip_addr}:6357/v1/piidetect' - urls = pd.read_csv("data/ai_rss.csv")['Permalink'] + proxies = {"http": ""} + url = 
f"http://{ip_addr}:6357/v1/piidetect" + urls = pd.read_csv("data/ai_rss.csv")["Permalink"] urls = urls[:batch_size].to_list() payload = {"link_list": json.dumps(urls)} with Timer(f"send {len(urls)} link to pii detection endpoint"): try: - resp = requests.post(url=url, data=payload, proxies=proxies) + resp = requests.post(url=url, data=payload, proxies=proxies) print(resp.text) resp.raise_for_status() # Raise an exception for unsuccessful HTTP status codes print("Request successful!") except requests.exceptions.RequestException as e: print("An error occurred:", e) - + def test_text(ip_addr="localhost", batch_size=20): - proxies = {'http':""} - url = f'http://{ip_addr}:6357/v1/piidetect' - content = pd.read_csv("data/ai_rss.csv")['Description'] + proxies = {"http": ""} + url = f"http://{ip_addr}:6357/v1/piidetect" + content = pd.read_csv("data/ai_rss.csv")["Description"] content = content[:batch_size].to_list() payload = {"text_list": json.dumps(content)} with Timer(f"send {len(content)} text to pii detection endpoint"): try: - resp = requests.post(url=url, data=payload, proxies=proxies) + resp = requests.post(url=url, data=payload, proxies=proxies) print(resp.text) resp.raise_for_status() # Raise an exception for unsuccessful HTTP status codes print("Request successful!") except requests.exceptions.RequestException as e: - print("An error occurred:", e) + print("An error occurred:", e) + - def test_pdf(ip_addr="localhost", batch_size=20): - proxies = {'http':""} - url = f'http://{ip_addr}:6357/v1/piidetect' + proxies = {"http": ""} + url = f"http://{ip_addr}:6357/v1/piidetect" dir_path = "data/pdf" file_list = os.listdir(dir_path) file_list = file_list[:batch_size] - files = [('files', (f, open(os.path.join(dir_path, f), 'rb'), 'application/pdf')) for f in file_list] + files = [("files", (f, open(os.path.join(dir_path, f), "rb"), "application/pdf")) for f in file_list] with Timer(f"send {len(files)} documents to pii detection endpoint"): try: - resp = 
requests.request('POST', url=url, headers={}, files=files, proxies=proxies) + resp = requests.request("POST", url=url, headers={}, files=files, proxies=proxies) print(resp.text) resp.raise_for_status() # Raise an exception for unsuccessful HTTP status codes print("Request successful!") except requests.exceptions.RequestException as e: print("An error occurred:", e) - + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--test_html', action='store_true', help='Test HTML pii detection') - parser.add_argument('--test_pdf', action='store_true', help='Test PDF pii detection') - parser.add_argument('--test_text', action='store_true', help='Test Text pii detection') - parser.add_argument('--batch_size', type=int, default=20, help='Batch size for testing') - parser.add_argument('--ip_addr', type=str, default="localhost", help='IP address of the server') - + parser.add_argument("--test_html", action="store_true", help="Test HTML pii detection") + parser.add_argument("--test_pdf", action="store_true", help="Test PDF pii detection") + parser.add_argument("--test_text", action="store_true", help="Test Text pii detection") + parser.add_argument("--batch_size", type=int, default=20, help="Batch size for testing") + parser.add_argument("--ip_addr", type=str, default="localhost", help="IP address of the server") + args = parser.parse_args() args.ip_addr = "100.83.111.250" if args.test_html: @@ -73,4 +79,4 @@ def test_pdf(ip_addr="localhost", batch_size=20): elif args.test_text: test_text(ip_addr=args.ip_addr, batch_size=args.batch_size) else: - print("Please specify the test type") \ No newline at end of file + print("Please specify the test type") diff --git a/comps/guardrails/pii_detection/utils.py b/comps/guardrails/pii_detection/utils.py index 1e557f51e..c23145553 100644 --- a/comps/guardrails/pii_detection/utils.py +++ b/comps/guardrails/pii_detection/utils.py @@ -3,14 +3,16 @@ import errno import functools +import hashlib import os import 
signal import timeit -import hashlib from pathlib import Path + import pandas as pd from fastapi import HTTPException + class Timer: level = 0 viewer = None @@ -33,9 +35,11 @@ def __exit__(self, *a, **kw): else: print(f'{" " * Timer.level}{self.name} took {timeit.default_timer() - self.start} sec') + class TimeoutError(Exception): pass + def save_logs(log_name, data): df = pd.DataFrame.from_records(data) try: @@ -47,6 +51,7 @@ def save_logs(log_name, data): pass return df + def timeout(seconds=10, error_message=os.strerror(errno.ETIME)): def decorator(func): def _handle_timeout(signum, frame): @@ -63,14 +68,17 @@ def wrapper(*args, **kwargs): return result return wrapper + return decorator + def generate_log_name(file_list): file_set = f"{sorted(file_list)}" # print(f"file_set: {file_set}") md5_str = hashlib.md5(file_set.encode()).hexdigest() return f"status/status_{md5_str}.log" + def get_failable_with_time(callable): def failable_callable(*args, **kwargs): start_time = timeit.default_timer() @@ -85,6 +93,7 @@ def failable_callable(*args, **kwargs): return failable_callable + def prepare_env(enable_ray=False, pip_requirements=None, comps_path=None): if enable_ray: import ray @@ -95,8 +104,8 @@ def prepare_env(enable_ray=False, pip_requirements=None, comps_path=None): ray.init(runtime_env={"pip": pip_requirements, "env_vars": {"PYTHONPATH": comps_path}}) else: ray.init(runtime_env={"env_vars": {"PYTHONPATH": comps_path}}) - - + + def get_max_cpus(total_num_tasks): num_cpus_available = os.cpu_count() num_cpus_per_task = num_cpus_available // total_num_tasks From 0f5b920ce5320f024ee277ff82501df061059092 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 11 Jun 2024 22:24:50 +0000 Subject: [PATCH 03/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/guardrails/pii_detection/ray_utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 
deletions(-) diff --git a/comps/guardrails/pii_detection/ray_utils.py b/comps/guardrails/pii_detection/ray_utils.py index c4fca4c93..c4018d4c9 100644 --- a/comps/guardrails/pii_detection/ray_utils.py +++ b/comps/guardrails/pii_detection/ray_utils.py @@ -36,9 +36,7 @@ def _read_stream(self, f: "pyarrow.NativeFile", path: str) -> Iterator[Block]: def rayds_initialization(file_paths, dataloader_callable, lazy_mode=True, num_cpus=20): if dataloader_callable is None: - text_list = [ - {"data": data, "filename": data[:50], "error": None, "read_time": "0 secs"} for data in file_paths - ] + text_list = [{"data": data, "filename": data[:50], "error": None, "read_time": "0 secs"} for data in file_paths] return ray.data.from_items(text_list) decorated_dataloader_callable = get_failable_with_time(dataloader_callable) From e5c6c1ca3a6d34b6c1c5752828da590fd1b1c5c7 Mon Sep 17 00:00:00 2001 From: Chendi Xue Date: Wed, 12 Jun 2024 20:45:00 +0000 Subject: [PATCH 04/18] add e2e test to tests Signed-off-by: Chendi Xue --- comps/guardrails/pii_detection/.gitignore | 1 - .../pii/detect/gibberish_data/big.model | 1 + comps/guardrails/pii_detection/test.py | 20 ++++++- comps/guardrails/pii_detection/utils.py | 3 +- tests/test_pii_detection.sh | 56 +++++++++++++++++++ 5 files changed, 75 insertions(+), 6 deletions(-) create mode 100644 comps/guardrails/pii_detection/pii/detect/gibberish_data/big.model create mode 100644 tests/test_pii_detection.sh diff --git a/comps/guardrails/pii_detection/.gitignore b/comps/guardrails/pii_detection/.gitignore index 0407124a4..017a8659d 100644 --- a/comps/guardrails/pii_detection/.gitignore +++ b/comps/guardrails/pii_detection/.gitignore @@ -2,4 +2,3 @@ **/*csv **/*log **/*pyc -**/*model diff --git a/comps/guardrails/pii_detection/pii/detect/gibberish_data/big.model b/comps/guardrails/pii_detection/pii/detect/gibberish_data/big.model new file mode 100644 index 000000000..be28fda01 --- /dev/null +++ 
b/comps/guardrails/pii_detection/pii/detect/gibberish_data/big.model @@ -0,0 +1 @@ +{"charset": "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ", "ngram_size": 2, "counts": {"a": {"a": 1162, "b": 8940, "c": 17954, "d": 20101, "e": 1016, "f": 4796, "g": 8259, "h": 1263, "i": 16189, "j": 463, "k": 4547, "l": 33583, "m": 10869, "n": 80994, "o": 655, "p": 9287, "q": 188, "r": 40916, "s": 40996, "t": 58739, "u": 4054, "v": 8927, "w": 5205, "x": 560, "y": 10706, "z": 534, "A": 145, "B": 84, "C": 125, "D": 166, "E": 33, "F": 142, "G": 77, "H": 84, "I": 213, "J": 21, "K": 49, "L": 43, "M": 195, "N": 92, "O": 40, "P": 237, "Q": 10, "R": 133, "S": 166, "T": 234, "U": 23, "V": 32, "W": 92, "X": 11, "Y": 33, "Z": 13}, "b": {"a": 5069, "b": 459, "c": 182, "d": 151, "e": 21707, "f": 25, "g": 14, "h": 40, "i": 2511, "j": 493, "k": 11, "l": 8545, "m": 189, "n": 86, "o": 7189, "p": 27, "q": 11, "r": 4184, "s": 1683, "t": 729, "u": 7093, "v": 138, "w": 60, "x": 10, "y": 6685, "z": 10, "A": 19, "B": 11, "C": 15, "D": 16, "E": 15, "F": 11, "G": 11, "H": 14, "I": 21, "J": 10, "K": 10, "L": 11, "M": 14, "N": 11, "O": 11, "P": 15, "Q": 10, "R": 13, "S": 12, "T": 25, "U": 11, "V": 10, "W": 15, "X": 10, "Y": 10, "Z": 10}, "c": {"a": 17333, "b": 121, "c": 3239, "d": 186, "e": 25355, "f": 138, "g": 60, "h": 20424, "i": 7672, "j": 21, "k": 5059, "l": 5203, "m": 94, "n": 69, "o": 26665, "p": 161, "q": 252, "r": 4874, "s": 673, "t": 13274, "u": 5715, "v": 45, "w": 159, "x": 10, "y": 1365, "z": 23, "A": 42, "B": 24, "C": 87, "D": 30, "E": 14, "F": 28, "G": 20, "H": 36, "I": 30, "J": 12, "K": 49, "L": 20, "M": 24, "N": 27, "O": 28, "P": 30, "Q": 11, "R": 21, "S": 21, "T": 45, "U": 16, "V": 10, "W": 18, "X": 10, "Y": 12, "Z": 11}, "d": {"a": 17910, "b": 8127, "c": 3135, "d": 4524, "e": 29234, "f": 4490, "g": 2437, "h": 8426, "i": 24346, "j": 663, "k": 412, "l": 4307, "m": 3688, "n": 4691, "o": 14478, "p": 2799, "q": 230, "r": 7394, "s": 11784, "t": 21680, "u": 4985, "v": 1332, "w": 6241, 
"x": 11, "y": 2841, "z": 24, "A": 720, "B": 524, "C": 328, "D": 407, "E": 145, "F": 243, "G": 189, "H": 672, "I": 1439, "J": 97, "K": 175, "L": 168, "M": 502, "N": 735, "O": 187, "P": 1154, "Q": 11, "R": 331, "S": 1087, "T": 946, "U": 100, "V": 129, "W": 449, "X": 15, "Y": 130, "Z": 30}, "e": {"a": 45146, "b": 8292, "c": 25648, "d": 61505, "e": 19282, "f": 14724, "g": 7750, "h": 10910, "i": 16307, "j": 757, "k": 1505, "l": 25061, "m": 21108, "n": 59327, "o": 17248, "p": 15426, "q": 1639, "r": 90222, "s": 64863, "t": 33539, "u": 3339, "v": 11339, "w": 17755, "x": 6947, "y": 8098, "z": 282, "A": 2274, "B": 1003, "C": 1053, "D": 542, "E": 1015, "F": 1177, "G": 489, "H": 788, "I": 1721, "J": 137, "K": 305, "L": 305, "M": 658, "N": 695, "O": 374, "P": 1086, "Q": 31, "R": 1025, "S": 1355, "T": 1650, "U": 516, "V": 386, "W": 757, "X": 64, "Y": 180, "Z": 22}, "f": {"a": 11168, "b": 969, "c": 1394, "d": 702, "e": 10314, "f": 7152, "g": 581, "h": 2603, "i": 11018, "j": 137, "k": 70, "l": 3443, "m": 1172, "n": 540, "o": 17857, "p": 1019, "q": 29, "r": 9769, "s": 2042, "t": 19878, "u": 3750, "v": 269, "w": 1356, "x": 10, "y": 765, "z": 18, "A": 385, "B": 287, "C": 315, "D": 99, "E": 176, "F": 163, "G": 138, "H": 168, "I": 305, "J": 134, "K": 115, "L": 152, "M": 315, "N": 290, "O": 102, "P": 339, "Q": 13, "R": 194, "S": 359, "T": 216, "U": 52, "V": 64, "W": 146, "X": 13, "Y": 25, "Z": 13}, "g": {"a": 10565, "b": 1241, "c": 837, "d": 729, "e": 12802, "f": 1455, "g": 1233, "h": 12495, "i": 7265, "j": 101, "k": 59, "l": 3736, "m": 1042, "n": 2633, "o": 7546, "p": 837, "q": 61, "r": 7571, "s": 3187, "t": 7692, "u": 3131, "v": 155, "w": 1615, "x": 11, "y": 430, "z": 20, "A": 164, "B": 129, "C": 91, "D": 57, "E": 47, "F": 72, "G": 58, "H": 148, "I": 282, "J": 33, "K": 28, "L": 78, "M": 123, "N": 136, "O": 61, "P": 213, "Q": 14, "R": 63, "S": 165, "T": 289, "U": 28, "V": 21, "W": 108, "X": 11, "Y": 44, "Z": 11}, "h": {"a": 47131, "b": 741, "c": 957, "d": 642, "e": 138176, "f": 774, 
"g": 464, "h": 2548, "i": 40469, "j": 50, "k": 146, "l": 803, "m": 1434, "n": 666, "o": 22759, "p": 687, "q": 48, "r": 3474, "s": 2109, "t": 11467, "u": 2847, "v": 181, "w": 1644, "x": 10, "y": 2246, "z": 13, "A": 199, "B": 120, "C": 205, "D": 93, "E": 97, "F": 62, "G": 93, "H": 122, "I": 375, "J": 34, "K": 42, "L": 39, "M": 121, "N": 130, "O": 55, "P": 207, "Q": 14, "R": 97, "S": 143, "T": 255, "U": 22, "V": 35, "W": 108, "X": 10, "Y": 29, "Z": 15}, "i": {"a": 9009, "b": 3263, "c": 22405, "d": 14515, "e": 15299, "f": 6646, "g": 9004, "h": 85, "i": 97, "j": 12, "k": 1938, "l": 16083, "m": 15056, "n": 95350, "o": 25233, "p": 2653, "q": 143, "r": 11957, "s": 46613, "t": 42319, "u": 544, "v": 7775, "w": 147, "x": 708, "y": 13, "z": 1468, "A": 20, "B": 20, "C": 28, "D": 27, "E": 14, "F": 30, "G": 11, "H": 22, "I": 27, "J": 10, "K": 13, "L": 12, "M": 19, "N": 17, "O": 12, "P": 15, "Q": 11, "R": 21, "S": 25, "T": 33, "U": 10, "V": 29, "W": 21, "X": 10, "Y": 13, "Z": 12}, "j": {"a": 305, "b": 10, "c": 11, "d": 10, "e": 1355, "f": 10, "g": 10, "h": 10, "i": 25, "j": 10, "k": 10, "l": 10, "m": 10, "n": 10, "o": 1582, "p": 10, "q": 10, "r": 11, "s": 10, "t": 10, "u": 1891, "v": 10, "w": 11, "x": 10, "y": 10, "z": 10, "A": 10, "B": 10, "C": 10, "D": 10, "E": 10, "F": 10, "G": 10, "H": 10, "I": 10, "J": 10, "K": 10, "L": 10, "M": 10, "N": 11, "O": 10, "P": 11, "Q": 10, "R": 10, "S": 10, "T": 11, "U": 10, "V": 10, "W": 10, "X": 10, "Y": 10, "Z": 10}, "k": {"a": 1835, "b": 271, "c": 217, "d": 131, "e": 9574, "f": 315, "g": 75, "h": 1305, "i": 5623, "j": 29, "k": 28, "l": 723, "m": 238, "n": 3083, "o": 1550, "p": 175, "q": 23, "r": 180, "s": 1942, "t": 1073, "u": 244, "v": 45, "w": 553, "x": 10, "y": 438, "z": 10, "A": 55, "B": 39, "C": 39, "D": 25, "E": 15, "F": 31, "G": 45, "H": 168, "I": 140, "J": 13, "K": 14, "L": 20, "M": 34, "N": 36, "O": 29, "P": 54, "Q": 10, "R": 31, "S": 48, "T": 127, "U": 12, "V": 19, "W": 52, "X": 10, "Y": 26, "Z": 11}, "l": {"a": 22409, "b": 1474, 
"c": 1843, "d": 11939, "e": 32940, "f": 4037, "g": 746, "h": 1160, "i": 22946, "j": 111, "k": 1494, "l": 25367, "m": 2129, "n": 791, "o": 17950, "p": 1660, "q": 86, "r": 1381, "s": 5632, "t": 7478, "u": 4449, "v": 1426, "w": 2047, "x": 10, "y": 19227, "z": 39, "A": 195, "B": 95, "C": 137, "D": 70, "E": 72, "F": 104, "G": 60, "H": 157, "I": 426, "J": 39, "K": 27, "L": 63, "M": 118, "N": 77, "O": 48, "P": 178, "Q": 13, "R": 87, "S": 175, "T": 250, "U": 22, "V": 55, "W": 187, "X": 10, "Y": 27, "Z": 13}, "m": {"a": 22099, "b": 3726, "c": 392, "d": 290, "e": 31697, "f": 706, "g": 193, "h": 1069, "i": 11029, "j": 36, "k": 42, "l": 476, "m": 3553, "n": 666, "o": 13546, "p": 8637, "q": 29, "r": 301, "s": 4505, "t": 3947, "u": 3890, "v": 108, "w": 1220, "x": 10, "y": 3884, "z": 11, "A": 207, "B": 141, "C": 69, "D": 69, "E": 47, "F": 60, "G": 42, "H": 249, "I": 354, "J": 30, "K": 49, "L": 74, "M": 149, "N": 123, "O": 77, "P": 211, "Q": 11, "R": 76, "S": 136, "T": 372, "U": 27, "V": 44, "W": 123, "X": 10, "Y": 42, "Z": 11}, "n": {"a": 20659, "b": 2867, "c": 18809, "d": 64573, "e": 30356, "f": 5293, "g": 44288, "h": 5175, "i": 17452, "j": 792, "k": 2822, "l": 4632, "m": 2734, "n": 4025, "o": 26556, "p": 1921, "q": 477, "r": 1656, "s": 20866, "t": 55105, "u": 3460, "v": 2012, "w": 4858, "x": 212, "y": 4302, "z": 60, "A": 728, "B": 317, "C": 306, "D": 225, "E": 273, "F": 479, "G": 166, "H": 417, "I": 1052, "J": 152, "K": 94, "L": 143, "M": 529, "N": 335, "O": 207, "P": 579, "Q": 23, "R": 273, "S": 442, "T": 962, "U": 51, "V": 105, "W": 335, "X": 14, "Y": 84, "Z": 16}, "o": {"a": 5545, "b": 4746, "c": 6767, "d": 7970, "e": 2158, "f": 44409, "g": 2851, "h": 4165, "i": 5590, "j": 457, "k": 4838, "l": 16155, "m": 24542, "n": 64681, "o": 12767, "p": 8518, "q": 104, "r": 44780, "s": 16057, "t": 24766, "u": 42249, "v": 11214, "w": 18356, "x": 479, "y": 2271, "z": 168, "A": 193, "B": 164, "C": 131, "D": 104, "E": 76, "F": 123, "G": 93, "H": 96, "I": 320, "J": 31, "K": 93, "L": 63, "M": 
227, "N": 189, "O": 48, "P": 413, "Q": 11, "R": 156, "S": 162, "T": 168, "U": 17, "V": 64, "W": 81, "X": 11, "Y": 45, "Z": 20}, "p": {"a": 11647, "b": 239, "c": 246, "d": 74, "e": 16431, "f": 260, "g": 61, "h": 2655, "i": 5213, "j": 33, "k": 47, "l": 8956, "m": 277, "n": 91, "o": 11880, "p": 6089, "q": 19, "r": 13223, "s": 2287, "t": 4181, "u": 3898, "v": 23, "w": 372, "x": 10, "y": 697, "z": 10, "A": 79, "B": 29, "C": 31, "D": 19, "E": 39, "F": 36, "G": 17, "H": 44, "I": 74, "J": 17, "K": 13, "L": 16, "M": 28, "N": 26, "O": 36, "P": 28, "Q": 10, "R": 23, "S": 34, "T": 84, "U": 11, "V": 12, "W": 35, "X": 12, "Y": 16, "Z": 10}, "q": {"a": 10, "b": 10, "c": 10, "d": 10, "e": 10, "f": 10, "g": 10, "h": 10, "i": 10, "j": 10, "k": 10, "l": 10, "m": 10, "n": 10, "o": 11, "p": 11, "q": 10, "r": 12, "s": 11, "t": 10, "u": 4426, "v": 10, "w": 10, "x": 10, "y": 10, "z": 10, "A": 10, "B": 10, "C": 10, "D": 10, "E": 10, "F": 10, "G": 10, "H": 10, "I": 10, "J": 10, "K": 10, "L": 10, "M": 10, "N": 10, "O": 10, "P": 10, "Q": 10, "R": 10, "S": 10, "T": 11, "U": 10, "V": 10, "W": 10, "X": 10, "Y": 10, "Z": 10}, "r": {"a": 29197, "b": 2704, "c": 5953, "d": 8506, "e": 73484, "f": 3667, "g": 4541, "h": 3944, "i": 30893, "j": 146, "k": 2513, "l": 4051, "m": 8900, "n": 6781, "o": 32255, "p": 2947, "q": 114, "r": 8982, "s": 19875, "t": 19965, "u": 5144, "v": 2592, "w": 3543, "x": 13, "y": 12015, "z": 105, "A": 490, "B": 247, "C": 178, "D": 125, "E": 122, "F": 145, "G": 136, "H": 443, "I": 646, "J": 83, "K": 94, "L": 103, "M": 314, "N": 231, "O": 121, "P": 348, "Q": 13, "R": 162, "S": 365, "T": 624, "U": 38, "V": 54, "W": 302, "X": 24, "Y": 64, "Z": 16}, "s": {"a": 30379, "b": 4706, "c": 9020, "d": 3048, "e": 38911, "f": 5532, "g": 1586, "h": 21428, "i": 27643, "j": 315, "k": 3714, "l": 5263, "m": 6509, "n": 3654, "o": 27677, "p": 9609, "q": 581, "r": 2650, "s": 26502, "t": 49807, "u": 13227, "v": 923, "w": 9204, "x": 13, "y": 2197, "z": 23, "A": 756, "B": 412, "C": 298, "D": 267, "E": 
171, "F": 251, "G": 138, "H": 562, "I": 1593, "J": 105, "K": 77, "L": 145, "M": 901, "N": 293, "O": 248, "P": 430, "Q": 19, "R": 215, "S": 566, "T": 1794, "U": 59, "V": 100, "W": 466, "X": 38, "Y": 110, "Z": 16}, "t": {"a": 28134, "b": 3582, "c": 3747, "d": 2250, "e": 44165, "f": 3218, "g": 968, "h": 143750, "i": 49233, "j": 196, "k": 507, "l": 7141, "m": 3857, "n": 1721, "o": 51180, "p": 1899, "q": 144, "r": 15312, "s": 16128, "t": 22458, "u": 10151, "v": 371, "w": 9265, "x": 18, "y": 7794, "z": 162, "A": 603, "B": 505, "C": 282, "D": 189, "E": 110, "F": 156, "G": 341, "H": 461, "I": 1662, "J": 82, "K": 139, "L": 146, "M": 366, "N": 387, "O": 173, "P": 688, "Q": 12, "R": 279, "S": 432, "T": 911, "U": 41, "V": 76, "W": 399, "X": 10, "Y": 108, "Z": 27}, "u": {"a": 4098, "b": 3395, "c": 5664, "d": 2716, "e": 5223, "f": 1022, "g": 5801, "h": 496, "i": 3406, "j": 19, "k": 539, "l": 14496, "m": 4877, "n": 16310, "o": 411, "p": 6303, "q": 22, "r": 20624, "s": 19715, "t": 19727, "u": 64, "v": 163, "w": 562, "x": 92, "y": 119, "z": 714, "A": 41, "B": 18, "C": 27, "D": 21, "E": 10, "F": 14, "G": 20, "H": 33, "I": 95, "J": 16, "K": 20, "L": 20, "M": 39, "N": 20, "O": 17, "P": 31, "Q": 10, "R": 14, "S": 22, "T": 51, "U": 10, "V": 13, "W": 47, "X": 10, "Y": 38, "Z": 10}, "v": {"a": 4275, "b": 79, "c": 78, "d": 65, "e": 30740, "f": 78, "g": 63, "h": 217, "i": 8610, "j": 16, "k": 33, "l": 275, "m": 43, "n": 580, "o": 3071, "p": 47, "q": 13, "r": 202, "s": 811, "t": 193, "u": 105, "v": 12, "w": 389, "x": 10, "y": 253, "z": 10, "A": 40, "B": 19, "C": 19, "D": 29, "E": 12, "F": 11, "G": 15, "H": 25, "I": 40, "J": 11, "K": 17, "L": 14, "M": 16, "N": 20, "O": 15, "P": 25, "Q": 10, "R": 17, "S": 15, "T": 29, "U": 10, "V": 14, "W": 23, "X": 10, "Y": 16, "Z": 11}, "w": {"a": 20657, "b": 432, "c": 438, "d": 837, "e": 13748, "f": 449, "g": 182, "h": 17734, "i": 17019, "j": 36, "k": 116, "l": 646, "m": 413, "n": 4196, "o": 8341, "p": 202, "q": 43, "r": 1185, "s": 2111, "t": 1867, "u": 168, 
"v": 75, "w": 971, "x": 11, "y": 270, "z": 10, "A": 92, "B": 66, "C": 41, "D": 38, "E": 109, "F": 44, "G": 24, "H": 92, "I": 278, "J": 68, "K": 22, "L": 24, "M": 69, "N": 68, "O": 58, "P": 84, "Q": 10, "R": 36, "S": 61, "T": 126, "U": 14, "V": 13, "W": 84, "X": 11, "Y": 194, "Z": 15}, "x": {"a": 1109, "b": 27, "c": 1315, "d": 35, "e": 853, "f": 44, "g": 14, "h": 169, "i": 1119, "j": 12, "k": 10, "l": 30, "m": 35, "n": 19, "o": 220, "p": 2234, "q": 17, "r": 26, "s": 57, "t": 1606, "u": 133, "v": 18, "w": 71, "x": 10, "y": 70, "z": 10, "A": 15, "B": 11, "C": 15, "D": 11, "E": 11, "F": 13, "G": 11, "H": 12, "I": 24, "J": 13, "K": 15, "L": 11, "M": 15, "N": 16, "O": 11, "P": 17, "Q": 10, "R": 14, "S": 13, "T": 24, "U": 10, "V": 11, "W": 12, "X": 10, "Y": 11, "Z": 10}, "y": {"a": 9218, "b": 3429, "c": 2581, "d": 2432, "e": 5148, "f": 2506, "g": 976, "h": 2733, "i": 4729, "j": 159, "k": 275, "l": 1956, "m": 2611, "n": 1146, "o": 12036, "p": 2407, "q": 111, "r": 1794, "s": 7375, "t": 8667, "u": 612, "v": 295, "w": 3769, "x": 37, "y": 386, "z": 40, "A": 418, "B": 237, "C": 185, "D": 126, "E": 82, "F": 142, "G": 135, "H": 317, "I": 699, "J": 66, "K": 58, "L": 95, "M": 159, "N": 196, "O": 111, "P": 243, "Q": 12, "R": 116, "S": 316, "T": 625, "U": 53, "V": 33, "W": 240, "X": 11, "Y": 94, "Z": 11}, "z": {"a": 320, "b": 16, "c": 14, "d": 33, "e": 1590, "f": 15, "g": 11, "h": 125, "i": 390, "j": 10, "k": 14, "l": 87, "m": 75, "n": 12, "o": 715, "p": 12, "q": 10, "r": 13, "s": 20, "t": 24, "u": 170, "v": 15, "w": 22, "x": 10, "y": 42, "z": 93, "A": 10, "B": 12, "C": 12, "D": 11, "E": 10, "F": 12, "G": 11, "H": 11, "I": 13, "J": 10, "K": 10, "L": 10, "M": 11, "N": 13, "O": 13, "P": 11, "Q": 10, "R": 10, "S": 10, "T": 14, "U": 10, "V": 10, "W": 21, "X": 10, "Y": 10, "Z": 10}, "A": {"a": 18, "b": 192, "c": 329, "d": 246, "e": 11, "f": 478, "g": 146, "h": 246, "i": 28, "j": 18, "k": 21, "l": 1119, "m": 1334, "n": 3653, "o": 15, "p": 175, "q": 13, "r": 523, "s": 825, "t": 633, "u": 
355, "v": 37, "w": 56, "x": 22, "y": 19, "z": 13, "A": 21, "B": 63, "C": 100, "D": 66, "E": 19, "F": 41, "G": 56, "H": 25, "I": 48, "J": 11, "K": 14, "L": 168, "M": 77, "N": 329, "O": 14, "P": 460, "Q": 11, "R": 215, "S": 105, "T": 182, "U": 17, "V": 25, "W": 15, "X": 13, "Y": 21, "Z": 12}, "B": {"a": 716, "b": 10, "c": 11, "d": 11, "e": 922, "f": 10, "g": 10, "h": 10, "i": 167, "j": 10, "k": 10, "l": 77, "m": 10, "n": 11, "o": 1371, "p": 10, "q": 10, "r": 648, "s": 13, "t": 10, "u": 1544, "v": 10, "w": 14, "x": 10, "y": 231, "z": 10, "A": 40, "B": 16, "C": 20, "D": 11, "E": 81, "F": 16, "G": 13, "H": 18, "I": 35, "J": 10, "K": 10, "L": 44, "M": 19, "N": 12, "O": 74, "P": 13, "Q": 10, "R": 28, "S": 16, "T": 12, "U": 43, "V": 10, "W": 11, "X": 10, "Y": 24, "Z": 10}, "C": {"a": 841, "b": 15, "c": 10, "d": 10, "e": 96, "f": 14, "g": 10, "h": 602, "i": 207, "j": 10, "k": 10, "l": 318, "m": 10, "n": 10, "o": 2666, "p": 10, "q": 10, "r": 127, "s": 10, "t": 11, "u": 104, "v": 10, "w": 11, "x": 10, "y": 48, "z": 24, "A": 108, "B": 15, "C": 24, "D": 23, "E": 85, "F": 13, "G": 11, "H": 488, "I": 64, "J": 12, "K": 30, "L": 67, "M": 15, "N": 10, "O": 124, "P": 13, "Q": 12, "R": 29, "S": 29, "T": 142, "U": 33, "V": 10, "W": 14, "X": 10, "Y": 34, "Z": 11}, "D": {"a": 263, "b": 10, "c": 10, "d": 10, "e": 1189, "f": 10, "g": 10, "h": 11, "i": 375, "j": 10, "k": 10, "l": 10, "m": 141, "n": 17, "o": 953, "p": 10, "q": 10, "r": 232, "s": 11, "t": 10, "u": 285, "v": 11, "w": 17, "x": 10, "y": 13, "z": 10, "A": 46, "B": 16, "C": 24, "D": 25, "E": 130, "F": 17, "G": 16, "H": 16, "I": 135, "J": 13, "K": 10, "L": 23, "M": 23, "N": 18, "O": 49, "P": 24, "Q": 10, "R": 29, "S": 55, "T": 66, "U": 37, "V": 31, "W": 31, "X": 10, "Y": 10, "Z": 10}, "E": {"a": 221, "b": 14, "c": 62, "d": 72, "e": 11, "f": 30, "g": 24, "h": 72, "i": 29, "j": 10, "k": 13, "l": 157, "m": 795, "n": 648, "o": 16, "p": 50, "q": 15, "r": 93, "s": 40, "t": 25, "u": 267, "v": 383, "w": 14, "x": 199, "y": 30, "z": 12, "A": 
146, "B": 67, "C": 156, "D": 122, "E": 57, "F": 58, "G": 33, "H": 19, "I": 69, "J": 13, "K": 11, "L": 75, "M": 90, "N": 248, "O": 75, "P": 80, "Q": 14, "R": 732, "S": 314, "T": 76, "U": 40, "V": 64, "W": 62, "X": 41, "Y": 25, "Z": 11}, "F": {"a": 267, "b": 10, "c": 10, "d": 10, "e": 381, "f": 11, "g": 10, "h": 13, "i": 487, "j": 10, "k": 10, "l": 70, "m": 10, "n": 10, "o": 636, "p": 10, "q": 10, "r": 2015, "s": 10, "t": 19, "u": 58, "v": 10, "w": 12, "x": 10, "y": 10, "z": 10, "A": 39, "B": 20, "C": 23, "D": 16, "E": 47, "F": 43, "G": 18, "H": 14, "I": 215, "J": 16, "K": 10, "L": 32, "M": 21, "N": 17, "O": 68, "P": 24, "Q": 10, "R": 36, "S": 24, "T": 94, "U": 19, "V": 11, "W": 21, "X": 10, "Y": 20, "Z": 10}, "G": {"a": 165, "b": 10, "c": 10, "d": 11, "e": 871, "f": 10, "g": 10, "h": 12, "i": 88, "j": 10, "k": 10, "l": 67, "m": 10, "n": 10, "o": 737, "p": 10, "q": 10, "r": 421, "s": 12, "t": 26, "u": 373, "v": 10, "w": 15, "x": 10, "y": 10, "z": 11, "A": 34, "B": 29, "C": 43, "D": 17, "E": 89, "F": 17, "G": 19, "H": 34, "I": 38, "J": 10, "K": 10, "L": 43, "M": 28, "N": 21, "O": 34, "P": 24, "Q": 10, "R": 76, "S": 36, "T": 41, "U": 48, "V": 12, "W": 11, "X": 10, "Y": 13, "Z": 12}, "H": {"a": 879, "b": 10, "c": 10, "d": 10, "e": 3027, "f": 13, "g": 10, "h": 11, "i": 858, "j": 10, "k": 10, "l": 10, "m": 20, "n": 10, "o": 1287, "p": 10, "q": 10, "r": 10, "s": 10, "t": 10, "u": 218, "v": 10, "w": 10, "x": 10, "y": 51, "z": 10, "A": 492, "B": 15, "C": 14, "D": 15, "E": 341, "F": 14, "G": 14, "H": 18, "I": 81, "J": 13, "K": 11, "L": 18, "M": 18, "N": 19, "O": 51, "P": 16, "Q": 10, "R": 27, "S": 25, "T": 27, "U": 20, "V": 17, "W": 16, "X": 10, "Y": 15, "Z": 10}, "I": {"a": 847, "b": 168, "c": 561, "d": 571, "e": 93, "f": 909, "g": 128, "h": 1063, "i": 48, "j": 29, "k": 269, "l": 845, "m": 508, "n": 2778, "o": 160, "p": 139, "q": 22, "r": 263, "s": 1099, "t": 2743, "u": 77, "v": 161, "w": 865, "x": 10, "y": 20, "z": 10, "A": 100, "B": 36, "C": 212, "D": 47, "E": 95, "F": 37, 
"G": 206, "H": 15, "I": 592, "J": 12, "K": 10, "L": 84, "M": 42, "N": 270, "O": 220, "P": 31, "Q": 10, "R": 57, "S": 181, "T": 201, "U": 14, "V": 138, "W": 22, "X": 69, "Y": 13, "Z": 17}, "J": {"a": 339, "b": 10, "c": 10, "d": 11, "e": 249, "f": 11, "g": 10, "h": 10, "i": 12, "j": 10, "k": 10, "l": 10, "m": 10, "n": 10, "o": 283, "p": 10, "q": 10, "r": 11, "s": 10, "t": 10, "u": 329, "v": 10, "w": 10, "x": 10, "y": 10, "z": 10, "A": 21, "B": 20, "C": 16, "D": 11, "E": 44, "F": 16, "G": 14, "H": 14, "I": 10, "J": 10, "K": 14, "L": 11, "M": 13, "N": 10, "O": 23, "P": 11, "Q": 12, "R": 14, "S": 17, "T": 19, "U": 31, "V": 10, "W": 22, "X": 10, "Y": 10, "Z": 10}, "K": {"a": 250, "b": 12, "c": 11, "d": 10, "e": 107, "f": 11, "g": 10, "h": 20, "i": 250, "j": 10, "k": 10, "l": 50, "m": 10, "n": 76, "o": 114, "p": 10, "q": 10, "r": 83, "s": 11, "t": 11, "u": 670, "v": 10, "w": 12, "x": 10, "y": 11, "z": 10, "A": 13, "B": 10, "C": 17, "D": 10, "E": 16, "F": 14, "G": 10, "H": 20, "I": 21, "J": 10, "K": 22, "L": 12, "M": 12, "N": 13, "O": 16, "P": 13, "Q": 10, "R": 10, "S": 25, "T": 17, "U": 10, "V": 10, "W": 12, "X": 10, "Y": 10, "Z": 10}, "L": {"a": 427, "b": 10, "c": 10, "d": 10, "e": 445, "f": 10, "g": 10, "h": 11, "i": 442, "j": 10, "k": 10, "l": 19, "m": 10, "n": 10, "o": 607, "p": 10, "q": 10, "r": 14, "s": 10, "t": 10, "u": 47, "v": 14, "w": 10, "x": 10, "y": 71, "z": 10, "A": 105, "B": 24, "C": 23, "D": 45, "E": 113, "F": 13, "G": 18, "H": 14, "I": 172, "J": 12, "K": 11, "L": 62, "M": 20, "N": 17, "O": 83, "P": 47, "Q": 10, "R": 18, "S": 44, "T": 27, "U": 43, "V": 15, "W": 20, "X": 10, "Y": 25, "Z": 10}, "M": {"a": 2286, "b": 10, "c": 114, "d": 16, "e": 389, "f": 10, "g": 10, "h": 10, "i": 762, "j": 10, "k": 10, "l": 11, "m": 11, "n": 10, "o": 1351, "p": 10, "q": 10, "r": 423, "s": 10, "t": 11, "u": 205, "v": 10, "w": 10, "x": 10, "y": 287, "z": 10, "A": 106, "B": 38, "C": 16, "D": 12, "E": 116, "F": 15, "G": 10, "H": 13, "I": 69, "J": 12, "K": 12, "L": 13, "M": 39, 
"N": 15, "O": 57, "P": 41, "Q": 10, "R": 16, "S": 26, "T": 13, "U": 17, "V": 11, "W": 11, "X": 10, "Y": 17, "Z": 10}, "N": {"a": 2095, "b": 12, "c": 10, "d": 10, "e": 935, "f": 10, "g": 10, "h": 10, "i": 733, "j": 10, "k": 10, "l": 12, "m": 10, "n": 11, "o": 1482, "p": 10, "q": 10, "r": 11, "s": 11, "t": 10, "u": 38, "v": 10, "w": 10, "x": 10, "y": 10, "z": 10, "A": 120, "B": 55, "C": 70, "D": 266, "E": 120, "F": 36, "G": 75, "H": 18, "I": 104, "J": 30, "K": 12, "L": 13, "M": 22, "N": 32, "O": 83, "P": 26, "Q": 10, "R": 27, "S": 111, "T": 202, "U": 22, "V": 14, "W": 16, "X": 10, "Y": 57, "Z": 10}, "O": {"a": 17, "b": 31, "c": 87, "d": 23, "e": 15, "f": 148, "g": 34, "h": 498, "i": 17, "j": 11, "k": 26, "l": 123, "m": 13, "n": 1078, "o": 22, "p": 96, "q": 10, "r": 261, "s": 126, "t": 123, "u": 141, "v": 27, "w": 33, "x": 20, "y": 11, "z": 14, "A": 20, "B": 18, "C": 44, "D": 34, "E": 12, "F": 218, "G": 42, "H": 18, "I": 23, "J": 39, "K": 46, "L": 131, "M": 65, "N": 354, "O": 73, "P": 42, "Q": 10, "R": 174, "S": 45, "T": 57, "U": 107, "V": 30, "W": 38, "X": 11, "Y": 23, "Z": 10}, "P": {"a": 806, "b": 11, "c": 10, "d": 10, "e": 986, "f": 58, "g": 10, "h": 172, "i": 2054, "j": 10, "k": 10, "l": 185, "m": 10, "n": 19, "o": 435, "p": 10, "q": 10, "r": 3285, "s": 18, "t": 12, "u": 156, "v": 10, "w": 13, "x": 10, "y": 29, "z": 10, "A": 75, "B": 11, "C": 10, "D": 13, "E": 47, "F": 13, "G": 17, "H": 43, "I": 29, "J": 11, "K": 10, "L": 32, "M": 18, "N": 10, "O": 72, "P": 30, "Q": 10, "R": 94, "S": 18, "T": 445, "U": 49, "V": 10, "W": 11, "X": 10, "Y": 15, "Z": 10}, "Q": {"a": 10, "b": 10, "c": 10, "d": 10, "e": 10, "f": 10, "g": 10, "h": 10, "i": 10, "j": 10, "k": 10, "l": 10, "m": 10, "n": 10, "o": 10, "p": 10, "q": 10, "r": 10, "s": 10, "t": 10, "u": 148, "v": 10, "w": 10, "x": 10, "y": 10, "z": 10, "A": 12, "B": 10, "C": 10, "D": 10, "E": 10, "F": 10, "G": 10, "H": 10, "I": 10, "J": 10, "K": 10, "L": 10, "M": 10, "N": 10, "O": 10, "P": 10, "Q": 10, "R": 10, "S": 10, "T": 
10, "U": 18, "V": 10, "W": 10, "X": 10, "Y": 10, "Z": 10}, "R": {"a": 204, "b": 10, "c": 10, "d": 10, "e": 1095, "f": 10, "g": 10, "h": 106, "i": 180, "j": 10, "k": 10, "l": 10, "m": 10, "n": 10, "o": 1434, "p": 10, "q": 10, "r": 10, "s": 10, "t": 10, "u": 925, "v": 10, "w": 12, "x": 10, "y": 47, "z": 10, "A": 149, "B": 22, "C": 45, "D": 23, "E": 219, "F": 21, "G": 73, "H": 19, "I": 281, "J": 12, "K": 22, "L": 36, "M": 40, "N": 40, "O": 112, "P": 26, "Q": 10, "R": 49, "S": 72, "T": 104, "U": 26, "V": 99, "W": 20, "X": 262, "Y": 69, "Z": 10}, "S": {"a": 304, "b": 12, "c": 268, "d": 10, "e": 941, "f": 12, "g": 11, "h": 1239, "i": 278, "j": 10, "k": 63, "l": 77, "m": 160, "n": 16, "o": 1683, "p": 368, "q": 43, "r": 10, "s": 11, "t": 1421, "u": 523, "v": 15, "w": 62, "x": 10, "y": 132, "z": 10, "A": 76, "B": 30, "C": 47, "D": 17, "E": 212, "F": 21, "G": 14, "H": 55, "I": 98, "J": 12, "K": 12, "L": 21, "M": 31, "N": 16, "O": 137, "P": 34, "Q": 10, "R": 16, "S": 88, "T": 190, "U": 67, "V": 15, "W": 21, "X": 11, "Y": 31, "Z": 10}, "T": {"a": 225, "b": 10, "c": 10, "d": 10, "e": 392, "f": 10, "g": 10, "h": 11816, "i": 259, "j": 10, "k": 10, "l": 10, "m": 10, "n": 10, "o": 774, "p": 13, "q": 10, "r": 379, "s": 90, "t": 10, "u": 308, "v": 20, "w": 138, "x": 10, "y": 32, "z": 10, "A": 96, "B": 19, "C": 21, "D": 15, "E": 620, "F": 17, "G": 40, "H": 397, "I": 350, "J": 12, "K": 10, "L": 25, "M": 30, "N": 28, "O": 103, "P": 17, "Q": 10, "R": 85, "S": 48, "T": 42, "U": 67, "V": 16, "W": 30, "X": 10, "Y": 51, "Z": 10}, "U": {"a": 10, "b": 10, "c": 10, "d": 10, "e": 10, "f": 11, "g": 13, "h": 42, "i": 10, "j": 10, "k": 16, "l": 78, "m": 10, "n": 873, "o": 10, "p": 36, "q": 10, "r": 22, "s": 27, "t": 41, "u": 10, "v": 22, "w": 10, "x": 10, "y": 10, "z": 10, "A": 32, "B": 40, "C": 23, "D": 20, "E": 35, "F": 14, "G": 19, "H": 14, "I": 16, "J": 10, "K": 10, "L": 45, "M": 45, "N": 70, "O": 10, "P": 17, "Q": 10, "R": 123, "S": 80, "T": 109, "U": 12, "V": 10, "W": 12, "X": 10, "Y": 10, 
"Z": 11}, "V": {"a": 458, "b": 10, "c": 10, "d": 10, "e": 288, "f": 10, "g": 10, "h": 10, "i": 405, "j": 10, "k": 10, "l": 26, "m": 10, "n": 10, "o": 173, "p": 34, "q": 10, "r": 13, "s": 12, "t": 11, "u": 11, "v": 10, "w": 14, "x": 10, "y": 36, "z": 10, "A": 23, "B": 10, "C": 10, "D": 14, "E": 125, "F": 10, "G": 10, "H": 10, "I": 227, "J": 11, "K": 10, "L": 10, "M": 11, "N": 10, "O": 28, "P": 11, "Q": 10, "R": 14, "S": 13, "T": 24, "U": 10, "V": 10, "W": 10, "X": 12, "Y": 10, "Z": 10}, "W": {"a": 695, "b": 10, "c": 10, "d": 10, "e": 1447, "f": 10, "g": 10, "h": 2860, "i": 716, "j": 10, "k": 10, "l": 11, "m": 12, "n": 10, "o": 287, "p": 10, "q": 10, "r": 34, "s": 10, "t": 10, "u": 20, "v": 10, "w": 10, "x": 10, "y": 26, "z": 10, "A": 66, "B": 17, "C": 16, "D": 17, "E": 54, "F": 15, "G": 16, "H": 27, "I": 36, "J": 13, "K": 10, "L": 11, "M": 14, "N": 15, "O": 39, "P": 12, "Q": 10, "R": 13, "S": 17, "T": 18, "U": 10, "V": 10, "W": 12, "X": 10, "Y": 15, "Z": 12}, "X": {"a": 13, "b": 10, "c": 10, "d": 10, "e": 10, "f": 10, "g": 11, "h": 10, "i": 10, "j": 10, "k": 10, "l": 10, "m": 10, "n": 10, "o": 10, "p": 13, "q": 10, "r": 93, "s": 12, "t": 11, "u": 10, "v": 10, "w": 12, "x": 10, "y": 10, "z": 10, "A": 18, "B": 10, "C": 13, "D": 11, "E": 11, "F": 11, "G": 10, "H": 10, "I": 186, "J": 10, "K": 10, "L": 10, "M": 11, "N": 10, "O": 10, "P": 18, "Q": 10, "R": 11, "S": 12, "T": 19, "U": 10, "V": 128, "W": 10, "X": 113, "Y": 11, "Z": 10}, "Y": {"a": 73, "b": 10, "c": 10, "d": 10, "e": 605, "f": 10, "g": 10, "h": 10, "i": 13, "j": 10, "k": 10, "l": 10, "m": 10, "n": 10, "o": 1331, "p": 10, "q": 10, "r": 10, "s": 10, "t": 10, "u": 18, "v": 10, "w": 10, "x": 10, "y": 10, "z": 10, "A": 29, "B": 15, "C": 13, "D": 26, "E": 20, "F": 13, "G": 11, "H": 11, "I": 17, "J": 10, "K": 14, "L": 15, "M": 23, "N": 13, "O": 76, "P": 22, "Q": 11, "R": 15, "S": 37, "T": 16, "U": 11, "V": 10, "W": 12, "X": 13, "Y": 13, "Z": 11}, "Z": {"a": 43, "b": 10, "c": 10, "d": 17, "e": 20, "f": 10, "g": 10, 
"h": 59, "i": 12, "j": 10, "k": 10, "l": 10, "m": 10, "n": 27, "o": 11, "p": 10, "q": 10, "r": 10, "s": 10, "t": 10, "u": 19, "v": 10, "w": 11, "x": 10, "y": 10, "z": 10, "A": 13, "B": 10, "C": 10, "D": 10, "E": 14, "F": 10, "G": 10, "H": 10, "I": 11, "J": 10, "K": 10, "L": 10, "M": 10, "N": 11, "O": 12, "P": 10, "Q": 10, "R": 12, "S": 10, "T": 10, "U": 10, "V": 10, "W": 11, "X": 10, "Y": 10, "Z": 10}}} diff --git a/comps/guardrails/pii_detection/test.py b/comps/guardrails/pii_detection/test.py index acfc40843..3fa1db1b8 100644 --- a/comps/guardrails/pii_detection/test.py +++ b/comps/guardrails/pii_detection/test.py @@ -29,10 +29,24 @@ def test_html(ip_addr="localhost", batch_size=20): def test_text(ip_addr="localhost", batch_size=20): - proxies = {"http": ""} + proxies = {"http":""} url = f"http://{ip_addr}:6357/v1/piidetect" - content = pd.read_csv("data/ai_rss.csv")["Description"] - content = content[:batch_size].to_list() + if os.path.exists("data/ai_rss.csv"): + content = pd.read_csv("data/ai_rss.csv")["Description"] + content = content[:batch_size].to_list() + else: + content = ["""With new architectures, there comes a bit of a dilemma. After having spent billions of dollars training models with older architectures, companies rightfully wonder if it is worth spending billions more on a newer architecture that may itself be outmoded soon. +One possible solution to this dilemma is transfer learning. The idea here is to put noise into the trained model and then use the output given to then backpropagate on the new model. The idea here is that you don’t need to worry about generating huge amounts of novel data and potentially the number of epochs you have to train for is also significantly reduced. This idea has not been perfected yet, so it remains to be seen the role it will play in the future. +Nevertheless, as businesses become more invested in these architectures the potential for newer architectures that improve cost will only increase. 
Time will tell how quickly the industry moves to adopt them. +For those who are building apps that allow for a seamless transition between models, you can look at the major strives made in throughput and latency by YOCO and have hope that the major bottlenecks your app is having may soon be resolved. +It’s an exciting time to be building. +With special thanks to Christopher Taylor for his feedback on this blog post. +[1] Sun, Y., et al. “You Only Cache Once: Decoder-Decoder Architectures for Language Models” (2024), arXiv +[2] Sun, Y., et al. “Retentive Network: A Successor to Transformer for Large Language Models” (2023), arXiv +[3] Wikimedia Foundation, et al. “Hadamard product (matrices)” (2024), Wikipedia +[4] Sanderson, G. et al., “Attention in transformers, visually explained | Chapter 6, Deep Learning” (2024), YouTube +[5] A. Vaswani, et al., “Attention Is All You Need” (2017), arXiv +Understanding You Only Cache Once was originally published in Towards Data Science on Medium, where people are continuing the conversation by highlighting and responding to this story."""] * batch_size payload = {"text_list": json.dumps(content)} with Timer(f"send {len(content)} text to pii detection endpoint"): diff --git a/comps/guardrails/pii_detection/utils.py b/comps/guardrails/pii_detection/utils.py index c23145553..7ce0b58ab 100644 --- a/comps/guardrails/pii_detection/utils.py +++ b/comps/guardrails/pii_detection/utils.py @@ -10,7 +10,6 @@ from pathlib import Path import pandas as pd -from fastapi import HTTPException class Timer: @@ -122,4 +121,4 @@ async def save_file_to_local_disk(save_path: str, file): fout.write(content) except Exception as e: print(f"Write file failed. Exception: {e}") - raise HTTPException(status_code=500, detail=f"Write file {save_path} failed. Exception: {e}") + raise SystemError(f"Write file {save_path} failed. 
Exception: {e}") diff --git a/tests/test_pii_detection.sh b/tests/test_pii_detection.sh new file mode 100644 index 000000000..a5f5fe4ad --- /dev/null +++ b/tests/test_pii_detection.sh @@ -0,0 +1,56 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +#set -xe + +WORKPATH=$(dirname "$PWD") +ip_address=$(hostname -I | awk '{print $1}') + +function build_docker_images() { + echo "Start building docker images for microservice" + cd $WORKPATH + docker build -t opea/guardrails-pii-detection:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/guardrails/pii_detection/docker/Dockerfile . + echo "Docker images built" +} + +function start_service() { + echo "Starting microservice" + docker run -d --runtime=runc --name="test-guardrails-pii-detection-endpoint" -p 6357:6357 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/guardrails-pii-detection:latest + sleep 5 + echo "Microservice started" +} + +function validate_microservice() { + echo "Validate microservice started" + export PATH="${HOME}/miniforge3/bin:$PATH" + source activate + echo "test 1 - single task" + python comps/guardrails/pii_detection/test.py --test_text --batch_size 1 --ip_addr $ip_address + echo "test 2 - 20 tasks in parallel" + python comps/guardrails/pii_detection/test.py --test_text --batch_size 20 --ip_addr $ip_address + echo "Validate microservice completed" +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=test-guardrails-pii-detection-endpoint") + echo "Shutdown legacy containers "$cid + if [[ ! 
-z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo "cleanup container images and volumes" + echo y | docker system prune 2>&1 > /dev/null + +} + +main \ No newline at end of file From 07aa1d6ece196f91d1d07f1deba7a2d847d1a025 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 12 Jun 2024 20:52:01 +0000 Subject: [PATCH 05/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/guardrails/pii_detection/test.py | 11 ++++++++--- tests/test_pii_detection.sh | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/comps/guardrails/pii_detection/test.py b/comps/guardrails/pii_detection/test.py index 3fa1db1b8..4149ed42e 100644 --- a/comps/guardrails/pii_detection/test.py +++ b/comps/guardrails/pii_detection/test.py @@ -29,13 +29,15 @@ def test_html(ip_addr="localhost", batch_size=20): def test_text(ip_addr="localhost", batch_size=20): - proxies = {"http":""} + proxies = {"http": ""} url = f"http://{ip_addr}:6357/v1/piidetect" if os.path.exists("data/ai_rss.csv"): content = pd.read_csv("data/ai_rss.csv")["Description"] content = content[:batch_size].to_list() else: - content = ["""With new architectures, there comes a bit of a dilemma. After having spent billions of dollars training models with older architectures, companies rightfully wonder if it is worth spending billions more on a newer architecture that may itself be outmoded soon. + content = ( + [ + """With new architectures, there comes a bit of a dilemma. After having spent billions of dollars training models with older architectures, companies rightfully wonder if it is worth spending billions more on a newer architecture that may itself be outmoded soon. One possible solution to this dilemma is transfer learning. 
The idea here is to put noise into the trained model and then use the output given to then backpropagate on the new model. The idea here is that you don’t need to worry about generating huge amounts of novel data and potentially the number of epochs you have to train for is also significantly reduced. This idea has not been perfected yet, so it remains to be seen the role it will play in the future. Nevertheless, as businesses become more invested in these architectures the potential for newer architectures that improve cost will only increase. Time will tell how quickly the industry moves to adopt them. For those who are building apps that allow for a seamless transition between models, you can look at the major strives made in throughput and latency by YOCO and have hope that the major bottlenecks your app is having may soon be resolved. @@ -46,7 +48,10 @@ def test_text(ip_addr="localhost", batch_size=20): [3] Wikimedia Foundation, et al. “Hadamard product (matrices)” (2024), Wikipedia [4] Sanderson, G. et al., “Attention in transformers, visually explained | Chapter 6, Deep Learning” (2024), YouTube [5] A. 
Vaswani, et al., “Attention Is All You Need” (2017), arXiv -Understanding You Only Cache Once was originally published in Towards Data Science on Medium, where people are continuing the conversation by highlighting and responding to this story."""] * batch_size +Understanding You Only Cache Once was originally published in Towards Data Science on Medium, where people are continuing the conversation by highlighting and responding to this story.""" + ] + * batch_size + ) payload = {"text_list": json.dumps(content)} with Timer(f"send {len(content)} text to pii detection endpoint"): diff --git a/tests/test_pii_detection.sh b/tests/test_pii_detection.sh index a5f5fe4ad..4510ca3a4 100644 --- a/tests/test_pii_detection.sh +++ b/tests/test_pii_detection.sh @@ -53,4 +53,4 @@ function main() { } -main \ No newline at end of file +main From 03ea7f09196a6c7fda5580d6f26f4e148dc1ae69 Mon Sep 17 00:00:00 2001 From: Chendi Xue Date: Fri, 14 Jun 2024 23:28:02 +0000 Subject: [PATCH 06/18] update README per comments Signed-off-by: Chendi Xue --- comps/guardrails/pii_detection/README.md | 32 ++++++------------------ 1 file changed, 8 insertions(+), 24 deletions(-) diff --git a/comps/guardrails/pii_detection/README.md b/comps/guardrails/pii_detection/README.md index 99eaec422..a6a9d5ae0 100644 --- a/comps/guardrails/pii_detection/README.md +++ b/comps/guardrails/pii_detection/README.md @@ -1,5 +1,7 @@ # PII Detection Microservice +PII Detection a method to detect Personal Identifiable Information in text. This microservice provides users a unified API to either uploade your files or send a list of text, and return with a list following original sequence of labels marking if it contains PII or not. + # 🚀1. Start Microservice with Python(Option 1) ## 1.1 Install Requirements @@ -8,21 +10,7 @@ pip install -r requirements.txt ``` -## 1.2 Start LLM endpoint - -TBD: Please refer to this [readme](../../../vectorstores/langchain/redis/README.md). 
- -## 1.3 Setup Environment Variables - - - -## 1.4 Start PII Detection Microservice with Python Script +## 1.2 Start PII Detection Microservice with Python Script Start pii detection microservice with below command. @@ -34,9 +22,9 @@ python pii_detection.py ## 2.1 Prepare PII detection model -## 2.1.1 use LLM endpoint +## 2.1.1 use LLM endpoint (will add later) -TBD +intro placeholder ## 2.1.2 use NER model (default mode) @@ -46,18 +34,14 @@ apt install git-lfs cd pii/bigcode; git clone https://{hf_username}:{hf_token}@huggingface.co/bigcode/starpii/; cd ../.. ``` -## 2.2 Setup Environment Variables - -TBD - -## 2.3 Build Docker Image +## 2.2 Build Docker Image ```bash cd ../../../ # back to GenAIComps/ folder docker build -t opea/guardrails-pii-detection:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/guardrails/pii_detection/docker/Dockerfile . ``` -## 2.4 Run Docker with CLI +## 2.3 Run Docker with CLI ```bash docker run -d --rm --runtime=runc --name="guardrails-pii-detection-endpoint" -p 6357:6357 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/guardrails-pii-detection:latest @@ -69,7 +53,7 @@ docker run -d --rm --runtime=runc --name="guardrails-pii-detection-endpoint" -p docker run --rm --runtime=runc --name="guardrails-pii-detection-endpoint" -p 6357:6357 -v ./comps/guardrails/pii_detection/:/home/user/comps/guardrails/pii_detection/ --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/guardrails-pii-detection:latest ``` -# 🚀3. Status Microservice +# 🚀3. 
Get Status of Microservice ```bash docker container logs -f guardrails-pii-detection-endpoint From b9e50301f6da394c0cbcac0678ab0d507da5f81c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 14 Jun 2024 23:28:59 +0000 Subject: [PATCH 07/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/guardrails/pii_detection/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/guardrails/pii_detection/README.md b/comps/guardrails/pii_detection/README.md index a6a9d5ae0..202198dc7 100644 --- a/comps/guardrails/pii_detection/README.md +++ b/comps/guardrails/pii_detection/README.md @@ -1,6 +1,6 @@ # PII Detection Microservice -PII Detection a method to detect Personal Identifiable Information in text. This microservice provides users a unified API to either uploade your files or send a list of text, and return with a list following original sequence of labels marking if it contains PII or not. +PII Detection a method to detect Personal Identifiable Information in text. This microservice provides users a unified API to either upload your files or send a list of text, and return with a list following original sequence of labels marking if it contains PII or not. # 🚀1. 
Start Microservice with Python(Option 1) From a2b63f0bf42aecef90727746e94027e48fde471f Mon Sep 17 00:00:00 2001 From: Chendi Xue Date: Thu, 20 Jun 2024 22:44:12 +0000 Subject: [PATCH 08/18] remove model specification in README Signed-off-by: Chendi Xue --- comps/guardrails/pii_detection/README.md | 14 ++++---------- comps/guardrails/pii_detection/data_utils.py | 1 - comps/guardrails/pii_detection/pii_detection.py | 2 +- 3 files changed, 5 insertions(+), 12 deletions(-) diff --git a/comps/guardrails/pii_detection/README.md b/comps/guardrails/pii_detection/README.md index 202198dc7..f98cea444 100644 --- a/comps/guardrails/pii_detection/README.md +++ b/comps/guardrails/pii_detection/README.md @@ -22,18 +22,12 @@ python pii_detection.py ## 2.1 Prepare PII detection model +export HUGGINGFACEHUB_API_TOKEN=${HP_TOKEN} + ## 2.1.1 use LLM endpoint (will add later) intro placeholder -## 2.1.2 use NER model (default mode) - -```bash -mkdir -p pii/bigcode -apt install git-lfs -cd pii/bigcode; git clone https://{hf_username}:{hf_token}@huggingface.co/bigcode/starpii/; cd ../.. 
-``` - ## 2.2 Build Docker Image ```bash @@ -44,13 +38,13 @@ docker build -t opea/guardrails-pii-detection:latest --build-arg https_proxy=$ht ## 2.3 Run Docker with CLI ```bash -docker run -d --rm --runtime=runc --name="guardrails-pii-detection-endpoint" -p 6357:6357 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/guardrails-pii-detection:latest +docker run -d --rm --runtime=runc --name="guardrails-pii-detection-endpoint" -p 6357:6357 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} opea/guardrails-pii-detection:latest ``` > debug mode ```bash -docker run --rm --runtime=runc --name="guardrails-pii-detection-endpoint" -p 6357:6357 -v ./comps/guardrails/pii_detection/:/home/user/comps/guardrails/pii_detection/ --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/guardrails-pii-detection:latest +docker run --rm --runtime=runc --name="guardrails-pii-detection-endpoint" -p 6357:6357 -v ./comps/guardrails/pii_detection/:/home/user/comps/guardrails/pii_detection/ --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} opea/guardrails-pii-detection:latest ``` # 🚀3. 
Get Status of Microservice diff --git a/comps/guardrails/pii_detection/data_utils.py b/comps/guardrails/pii_detection/data_utils.py index dfafbc670..29e9c4196 100644 --- a/comps/guardrails/pii_detection/data_utils.py +++ b/comps/guardrails/pii_detection/data_utils.py @@ -164,7 +164,6 @@ def load_svg(svg_path): return text -@timeout(600) def document_loader(doc_path): if doc_path.endswith(".pdf"): return load_pdf(doc_path) diff --git a/comps/guardrails/pii_detection/pii_detection.py b/comps/guardrails/pii_detection/pii_detection.py index 8f73a4e4e..b49ac7065 100644 --- a/comps/guardrails/pii_detection/pii_detection.py +++ b/comps/guardrails/pii_detection/pii_detection.py @@ -38,7 +38,7 @@ def get_pii_detection_inst(strategy="dummy", settings=None): if strategy == "ner": - return PIIDetectorWithNER(model_path="pii") + return PIIDetectorWithNER() elif strategy == "ml": return PIIDetectorWithML() elif strategy == "llm": From 7a44a97d08209601ea1aac08b652b2645e3ecfbd Mon Sep 17 00:00:00 2001 From: Chendi Xue Date: Fri, 21 Jun 2024 15:27:57 +0000 Subject: [PATCH 09/18] Remove big_model and update README Signed-off-by: Chendi Xue --- comps/guardrails/pii_detection/README.md | 4 ++-- .../pii_detection/pii/detect/gibberish_data/big.model | 1 - comps/guardrails/pii_detection/pii/detect/keys_detection.py | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) delete mode 100644 comps/guardrails/pii_detection/pii/detect/gibberish_data/big.model diff --git a/comps/guardrails/pii_detection/README.md b/comps/guardrails/pii_detection/README.md index f98cea444..dba386e38 100644 --- a/comps/guardrails/pii_detection/README.md +++ b/comps/guardrails/pii_detection/README.md @@ -38,13 +38,13 @@ docker build -t opea/guardrails-pii-detection:latest --build-arg https_proxy=$ht ## 2.3 Run Docker with CLI ```bash -docker run -d --rm --runtime=runc --name="guardrails-pii-detection-endpoint" -p 6357:6357 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e 
HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} opea/guardrails-pii-detection:latest +docker run -d --rm --runtime=runc --name="guardrails-pii-detection-endpoint" -p 6357:6357 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} opea/guardrails-pii-detection:latest ``` > debug mode ```bash -docker run --rm --runtime=runc --name="guardrails-pii-detection-endpoint" -p 6357:6357 -v ./comps/guardrails/pii_detection/:/home/user/comps/guardrails/pii_detection/ --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} opea/guardrails-pii-detection:latest +docker run --rm --runtime=runc --name="guardrails-pii-detection-endpoint" -p 6357:6357 -v ./comps/guardrails/pii_detection/:/home/user/comps/guardrails/pii_detection/ --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} opea/guardrails-pii-detection:latest ``` # 🚀3. 
Get Status of Microservice diff --git a/comps/guardrails/pii_detection/pii/detect/gibberish_data/big.model b/comps/guardrails/pii_detection/pii/detect/gibberish_data/big.model deleted file mode 100644 index be28fda01..000000000 --- a/comps/guardrails/pii_detection/pii/detect/gibberish_data/big.model +++ /dev/null @@ -1 +0,0 @@ -{"charset": "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ", "ngram_size": 2, "counts": {"a": {"a": 1162, "b": 8940, "c": 17954, "d": 20101, "e": 1016, "f": 4796, "g": 8259, "h": 1263, "i": 16189, "j": 463, "k": 4547, "l": 33583, "m": 10869, "n": 80994, "o": 655, "p": 9287, "q": 188, "r": 40916, "s": 40996, "t": 58739, "u": 4054, "v": 8927, "w": 5205, "x": 560, "y": 10706, "z": 534, "A": 145, "B": 84, "C": 125, "D": 166, "E": 33, "F": 142, "G": 77, "H": 84, "I": 213, "J": 21, "K": 49, "L": 43, "M": 195, "N": 92, "O": 40, "P": 237, "Q": 10, "R": 133, "S": 166, "T": 234, "U": 23, "V": 32, "W": 92, "X": 11, "Y": 33, "Z": 13}, "b": {"a": 5069, "b": 459, "c": 182, "d": 151, "e": 21707, "f": 25, "g": 14, "h": 40, "i": 2511, "j": 493, "k": 11, "l": 8545, "m": 189, "n": 86, "o": 7189, "p": 27, "q": 11, "r": 4184, "s": 1683, "t": 729, "u": 7093, "v": 138, "w": 60, "x": 10, "y": 6685, "z": 10, "A": 19, "B": 11, "C": 15, "D": 16, "E": 15, "F": 11, "G": 11, "H": 14, "I": 21, "J": 10, "K": 10, "L": 11, "M": 14, "N": 11, "O": 11, "P": 15, "Q": 10, "R": 13, "S": 12, "T": 25, "U": 11, "V": 10, "W": 15, "X": 10, "Y": 10, "Z": 10}, "c": {"a": 17333, "b": 121, "c": 3239, "d": 186, "e": 25355, "f": 138, "g": 60, "h": 20424, "i": 7672, "j": 21, "k": 5059, "l": 5203, "m": 94, "n": 69, "o": 26665, "p": 161, "q": 252, "r": 4874, "s": 673, "t": 13274, "u": 5715, "v": 45, "w": 159, "x": 10, "y": 1365, "z": 23, "A": 42, "B": 24, "C": 87, "D": 30, "E": 14, "F": 28, "G": 20, "H": 36, "I": 30, "J": 12, "K": 49, "L": 20, "M": 24, "N": 27, "O": 28, "P": 30, "Q": 11, "R": 21, "S": 21, "T": 45, "U": 16, "V": 10, "W": 18, "X": 10, "Y": 12, "Z": 11}, "d": {"a": 17910, 
"b": 8127, "c": 3135, "d": 4524, "e": 29234, "f": 4490, "g": 2437, "h": 8426, "i": 24346, "j": 663, "k": 412, "l": 4307, "m": 3688, "n": 4691, "o": 14478, "p": 2799, "q": 230, "r": 7394, "s": 11784, "t": 21680, "u": 4985, "v": 1332, "w": 6241, "x": 11, "y": 2841, "z": 24, "A": 720, "B": 524, "C": 328, "D": 407, "E": 145, "F": 243, "G": 189, "H": 672, "I": 1439, "J": 97, "K": 175, "L": 168, "M": 502, "N": 735, "O": 187, "P": 1154, "Q": 11, "R": 331, "S": 1087, "T": 946, "U": 100, "V": 129, "W": 449, "X": 15, "Y": 130, "Z": 30}, "e": {"a": 45146, "b": 8292, "c": 25648, "d": 61505, "e": 19282, "f": 14724, "g": 7750, "h": 10910, "i": 16307, "j": 757, "k": 1505, "l": 25061, "m": 21108, "n": 59327, "o": 17248, "p": 15426, "q": 1639, "r": 90222, "s": 64863, "t": 33539, "u": 3339, "v": 11339, "w": 17755, "x": 6947, "y": 8098, "z": 282, "A": 2274, "B": 1003, "C": 1053, "D": 542, "E": 1015, "F": 1177, "G": 489, "H": 788, "I": 1721, "J": 137, "K": 305, "L": 305, "M": 658, "N": 695, "O": 374, "P": 1086, "Q": 31, "R": 1025, "S": 1355, "T": 1650, "U": 516, "V": 386, "W": 757, "X": 64, "Y": 180, "Z": 22}, "f": {"a": 11168, "b": 969, "c": 1394, "d": 702, "e": 10314, "f": 7152, "g": 581, "h": 2603, "i": 11018, "j": 137, "k": 70, "l": 3443, "m": 1172, "n": 540, "o": 17857, "p": 1019, "q": 29, "r": 9769, "s": 2042, "t": 19878, "u": 3750, "v": 269, "w": 1356, "x": 10, "y": 765, "z": 18, "A": 385, "B": 287, "C": 315, "D": 99, "E": 176, "F": 163, "G": 138, "H": 168, "I": 305, "J": 134, "K": 115, "L": 152, "M": 315, "N": 290, "O": 102, "P": 339, "Q": 13, "R": 194, "S": 359, "T": 216, "U": 52, "V": 64, "W": 146, "X": 13, "Y": 25, "Z": 13}, "g": {"a": 10565, "b": 1241, "c": 837, "d": 729, "e": 12802, "f": 1455, "g": 1233, "h": 12495, "i": 7265, "j": 101, "k": 59, "l": 3736, "m": 1042, "n": 2633, "o": 7546, "p": 837, "q": 61, "r": 7571, "s": 3187, "t": 7692, "u": 3131, "v": 155, "w": 1615, "x": 11, "y": 430, "z": 20, "A": 164, "B": 129, "C": 91, "D": 57, "E": 47, "F": 72, "G": 58, "H": 148, 
"I": 282, "J": 33, "K": 28, "L": 78, "M": 123, "N": 136, "O": 61, "P": 213, "Q": 14, "R": 63, "S": 165, "T": 289, "U": 28, "V": 21, "W": 108, "X": 11, "Y": 44, "Z": 11}, "h": {"a": 47131, "b": 741, "c": 957, "d": 642, "e": 138176, "f": 774, "g": 464, "h": 2548, "i": 40469, "j": 50, "k": 146, "l": 803, "m": 1434, "n": 666, "o": 22759, "p": 687, "q": 48, "r": 3474, "s": 2109, "t": 11467, "u": 2847, "v": 181, "w": 1644, "x": 10, "y": 2246, "z": 13, "A": 199, "B": 120, "C": 205, "D": 93, "E": 97, "F": 62, "G": 93, "H": 122, "I": 375, "J": 34, "K": 42, "L": 39, "M": 121, "N": 130, "O": 55, "P": 207, "Q": 14, "R": 97, "S": 143, "T": 255, "U": 22, "V": 35, "W": 108, "X": 10, "Y": 29, "Z": 15}, "i": {"a": 9009, "b": 3263, "c": 22405, "d": 14515, "e": 15299, "f": 6646, "g": 9004, "h": 85, "i": 97, "j": 12, "k": 1938, "l": 16083, "m": 15056, "n": 95350, "o": 25233, "p": 2653, "q": 143, "r": 11957, "s": 46613, "t": 42319, "u": 544, "v": 7775, "w": 147, "x": 708, "y": 13, "z": 1468, "A": 20, "B": 20, "C": 28, "D": 27, "E": 14, "F": 30, "G": 11, "H": 22, "I": 27, "J": 10, "K": 13, "L": 12, "M": 19, "N": 17, "O": 12, "P": 15, "Q": 11, "R": 21, "S": 25, "T": 33, "U": 10, "V": 29, "W": 21, "X": 10, "Y": 13, "Z": 12}, "j": {"a": 305, "b": 10, "c": 11, "d": 10, "e": 1355, "f": 10, "g": 10, "h": 10, "i": 25, "j": 10, "k": 10, "l": 10, "m": 10, "n": 10, "o": 1582, "p": 10, "q": 10, "r": 11, "s": 10, "t": 10, "u": 1891, "v": 10, "w": 11, "x": 10, "y": 10, "z": 10, "A": 10, "B": 10, "C": 10, "D": 10, "E": 10, "F": 10, "G": 10, "H": 10, "I": 10, "J": 10, "K": 10, "L": 10, "M": 10, "N": 11, "O": 10, "P": 11, "Q": 10, "R": 10, "S": 10, "T": 11, "U": 10, "V": 10, "W": 10, "X": 10, "Y": 10, "Z": 10}, "k": {"a": 1835, "b": 271, "c": 217, "d": 131, "e": 9574, "f": 315, "g": 75, "h": 1305, "i": 5623, "j": 29, "k": 28, "l": 723, "m": 238, "n": 3083, "o": 1550, "p": 175, "q": 23, "r": 180, "s": 1942, "t": 1073, "u": 244, "v": 45, "w": 553, "x": 10, "y": 438, "z": 10, "A": 55, "B": 39, "C": 39, 
"D": 25, "E": 15, "F": 31, "G": 45, "H": 168, "I": 140, "J": 13, "K": 14, "L": 20, "M": 34, "N": 36, "O": 29, "P": 54, "Q": 10, "R": 31, "S": 48, "T": 127, "U": 12, "V": 19, "W": 52, "X": 10, "Y": 26, "Z": 11}, "l": {"a": 22409, "b": 1474, "c": 1843, "d": 11939, "e": 32940, "f": 4037, "g": 746, "h": 1160, "i": 22946, "j": 111, "k": 1494, "l": 25367, "m": 2129, "n": 791, "o": 17950, "p": 1660, "q": 86, "r": 1381, "s": 5632, "t": 7478, "u": 4449, "v": 1426, "w": 2047, "x": 10, "y": 19227, "z": 39, "A": 195, "B": 95, "C": 137, "D": 70, "E": 72, "F": 104, "G": 60, "H": 157, "I": 426, "J": 39, "K": 27, "L": 63, "M": 118, "N": 77, "O": 48, "P": 178, "Q": 13, "R": 87, "S": 175, "T": 250, "U": 22, "V": 55, "W": 187, "X": 10, "Y": 27, "Z": 13}, "m": {"a": 22099, "b": 3726, "c": 392, "d": 290, "e": 31697, "f": 706, "g": 193, "h": 1069, "i": 11029, "j": 36, "k": 42, "l": 476, "m": 3553, "n": 666, "o": 13546, "p": 8637, "q": 29, "r": 301, "s": 4505, "t": 3947, "u": 3890, "v": 108, "w": 1220, "x": 10, "y": 3884, "z": 11, "A": 207, "B": 141, "C": 69, "D": 69, "E": 47, "F": 60, "G": 42, "H": 249, "I": 354, "J": 30, "K": 49, "L": 74, "M": 149, "N": 123, "O": 77, "P": 211, "Q": 11, "R": 76, "S": 136, "T": 372, "U": 27, "V": 44, "W": 123, "X": 10, "Y": 42, "Z": 11}, "n": {"a": 20659, "b": 2867, "c": 18809, "d": 64573, "e": 30356, "f": 5293, "g": 44288, "h": 5175, "i": 17452, "j": 792, "k": 2822, "l": 4632, "m": 2734, "n": 4025, "o": 26556, "p": 1921, "q": 477, "r": 1656, "s": 20866, "t": 55105, "u": 3460, "v": 2012, "w": 4858, "x": 212, "y": 4302, "z": 60, "A": 728, "B": 317, "C": 306, "D": 225, "E": 273, "F": 479, "G": 166, "H": 417, "I": 1052, "J": 152, "K": 94, "L": 143, "M": 529, "N": 335, "O": 207, "P": 579, "Q": 23, "R": 273, "S": 442, "T": 962, "U": 51, "V": 105, "W": 335, "X": 14, "Y": 84, "Z": 16}, "o": {"a": 5545, "b": 4746, "c": 6767, "d": 7970, "e": 2158, "f": 44409, "g": 2851, "h": 4165, "i": 5590, "j": 457, "k": 4838, "l": 16155, "m": 24542, "n": 64681, "o": 12767, 
"p": 8518, "q": 104, "r": 44780, "s": 16057, "t": 24766, "u": 42249, "v": 11214, "w": 18356, "x": 479, "y": 2271, "z": 168, "A": 193, "B": 164, "C": 131, "D": 104, "E": 76, "F": 123, "G": 93, "H": 96, "I": 320, "J": 31, "K": 93, "L": 63, "M": 227, "N": 189, "O": 48, "P": 413, "Q": 11, "R": 156, "S": 162, "T": 168, "U": 17, "V": 64, "W": 81, "X": 11, "Y": 45, "Z": 20}, "p": {"a": 11647, "b": 239, "c": 246, "d": 74, "e": 16431, "f": 260, "g": 61, "h": 2655, "i": 5213, "j": 33, "k": 47, "l": 8956, "m": 277, "n": 91, "o": 11880, "p": 6089, "q": 19, "r": 13223, "s": 2287, "t": 4181, "u": 3898, "v": 23, "w": 372, "x": 10, "y": 697, "z": 10, "A": 79, "B": 29, "C": 31, "D": 19, "E": 39, "F": 36, "G": 17, "H": 44, "I": 74, "J": 17, "K": 13, "L": 16, "M": 28, "N": 26, "O": 36, "P": 28, "Q": 10, "R": 23, "S": 34, "T": 84, "U": 11, "V": 12, "W": 35, "X": 12, "Y": 16, "Z": 10}, "q": {"a": 10, "b": 10, "c": 10, "d": 10, "e": 10, "f": 10, "g": 10, "h": 10, "i": 10, "j": 10, "k": 10, "l": 10, "m": 10, "n": 10, "o": 11, "p": 11, "q": 10, "r": 12, "s": 11, "t": 10, "u": 4426, "v": 10, "w": 10, "x": 10, "y": 10, "z": 10, "A": 10, "B": 10, "C": 10, "D": 10, "E": 10, "F": 10, "G": 10, "H": 10, "I": 10, "J": 10, "K": 10, "L": 10, "M": 10, "N": 10, "O": 10, "P": 10, "Q": 10, "R": 10, "S": 10, "T": 11, "U": 10, "V": 10, "W": 10, "X": 10, "Y": 10, "Z": 10}, "r": {"a": 29197, "b": 2704, "c": 5953, "d": 8506, "e": 73484, "f": 3667, "g": 4541, "h": 3944, "i": 30893, "j": 146, "k": 2513, "l": 4051, "m": 8900, "n": 6781, "o": 32255, "p": 2947, "q": 114, "r": 8982, "s": 19875, "t": 19965, "u": 5144, "v": 2592, "w": 3543, "x": 13, "y": 12015, "z": 105, "A": 490, "B": 247, "C": 178, "D": 125, "E": 122, "F": 145, "G": 136, "H": 443, "I": 646, "J": 83, "K": 94, "L": 103, "M": 314, "N": 231, "O": 121, "P": 348, "Q": 13, "R": 162, "S": 365, "T": 624, "U": 38, "V": 54, "W": 302, "X": 24, "Y": 64, "Z": 16}, "s": {"a": 30379, "b": 4706, "c": 9020, "d": 3048, "e": 38911, "f": 5532, "g": 1586, "h": 21428, 
"i": 27643, "j": 315, "k": 3714, "l": 5263, "m": 6509, "n": 3654, "o": 27677, "p": 9609, "q": 581, "r": 2650, "s": 26502, "t": 49807, "u": 13227, "v": 923, "w": 9204, "x": 13, "y": 2197, "z": 23, "A": 756, "B": 412, "C": 298, "D": 267, "E": 171, "F": 251, "G": 138, "H": 562, "I": 1593, "J": 105, "K": 77, "L": 145, "M": 901, "N": 293, "O": 248, "P": 430, "Q": 19, "R": 215, "S": 566, "T": 1794, "U": 59, "V": 100, "W": 466, "X": 38, "Y": 110, "Z": 16}, "t": {"a": 28134, "b": 3582, "c": 3747, "d": 2250, "e": 44165, "f": 3218, "g": 968, "h": 143750, "i": 49233, "j": 196, "k": 507, "l": 7141, "m": 3857, "n": 1721, "o": 51180, "p": 1899, "q": 144, "r": 15312, "s": 16128, "t": 22458, "u": 10151, "v": 371, "w": 9265, "x": 18, "y": 7794, "z": 162, "A": 603, "B": 505, "C": 282, "D": 189, "E": 110, "F": 156, "G": 341, "H": 461, "I": 1662, "J": 82, "K": 139, "L": 146, "M": 366, "N": 387, "O": 173, "P": 688, "Q": 12, "R": 279, "S": 432, "T": 911, "U": 41, "V": 76, "W": 399, "X": 10, "Y": 108, "Z": 27}, "u": {"a": 4098, "b": 3395, "c": 5664, "d": 2716, "e": 5223, "f": 1022, "g": 5801, "h": 496, "i": 3406, "j": 19, "k": 539, "l": 14496, "m": 4877, "n": 16310, "o": 411, "p": 6303, "q": 22, "r": 20624, "s": 19715, "t": 19727, "u": 64, "v": 163, "w": 562, "x": 92, "y": 119, "z": 714, "A": 41, "B": 18, "C": 27, "D": 21, "E": 10, "F": 14, "G": 20, "H": 33, "I": 95, "J": 16, "K": 20, "L": 20, "M": 39, "N": 20, "O": 17, "P": 31, "Q": 10, "R": 14, "S": 22, "T": 51, "U": 10, "V": 13, "W": 47, "X": 10, "Y": 38, "Z": 10}, "v": {"a": 4275, "b": 79, "c": 78, "d": 65, "e": 30740, "f": 78, "g": 63, "h": 217, "i": 8610, "j": 16, "k": 33, "l": 275, "m": 43, "n": 580, "o": 3071, "p": 47, "q": 13, "r": 202, "s": 811, "t": 193, "u": 105, "v": 12, "w": 389, "x": 10, "y": 253, "z": 10, "A": 40, "B": 19, "C": 19, "D": 29, "E": 12, "F": 11, "G": 15, "H": 25, "I": 40, "J": 11, "K": 17, "L": 14, "M": 16, "N": 20, "O": 15, "P": 25, "Q": 10, "R": 17, "S": 15, "T": 29, "U": 10, "V": 14, "W": 23, "X": 10, "Y": 
16, "Z": 11}, "w": {"a": 20657, "b": 432, "c": 438, "d": 837, "e": 13748, "f": 449, "g": 182, "h": 17734, "i": 17019, "j": 36, "k": 116, "l": 646, "m": 413, "n": 4196, "o": 8341, "p": 202, "q": 43, "r": 1185, "s": 2111, "t": 1867, "u": 168, "v": 75, "w": 971, "x": 11, "y": 270, "z": 10, "A": 92, "B": 66, "C": 41, "D": 38, "E": 109, "F": 44, "G": 24, "H": 92, "I": 278, "J": 68, "K": 22, "L": 24, "M": 69, "N": 68, "O": 58, "P": 84, "Q": 10, "R": 36, "S": 61, "T": 126, "U": 14, "V": 13, "W": 84, "X": 11, "Y": 194, "Z": 15}, "x": {"a": 1109, "b": 27, "c": 1315, "d": 35, "e": 853, "f": 44, "g": 14, "h": 169, "i": 1119, "j": 12, "k": 10, "l": 30, "m": 35, "n": 19, "o": 220, "p": 2234, "q": 17, "r": 26, "s": 57, "t": 1606, "u": 133, "v": 18, "w": 71, "x": 10, "y": 70, "z": 10, "A": 15, "B": 11, "C": 15, "D": 11, "E": 11, "F": 13, "G": 11, "H": 12, "I": 24, "J": 13, "K": 15, "L": 11, "M": 15, "N": 16, "O": 11, "P": 17, "Q": 10, "R": 14, "S": 13, "T": 24, "U": 10, "V": 11, "W": 12, "X": 10, "Y": 11, "Z": 10}, "y": {"a": 9218, "b": 3429, "c": 2581, "d": 2432, "e": 5148, "f": 2506, "g": 976, "h": 2733, "i": 4729, "j": 159, "k": 275, "l": 1956, "m": 2611, "n": 1146, "o": 12036, "p": 2407, "q": 111, "r": 1794, "s": 7375, "t": 8667, "u": 612, "v": 295, "w": 3769, "x": 37, "y": 386, "z": 40, "A": 418, "B": 237, "C": 185, "D": 126, "E": 82, "F": 142, "G": 135, "H": 317, "I": 699, "J": 66, "K": 58, "L": 95, "M": 159, "N": 196, "O": 111, "P": 243, "Q": 12, "R": 116, "S": 316, "T": 625, "U": 53, "V": 33, "W": 240, "X": 11, "Y": 94, "Z": 11}, "z": {"a": 320, "b": 16, "c": 14, "d": 33, "e": 1590, "f": 15, "g": 11, "h": 125, "i": 390, "j": 10, "k": 14, "l": 87, "m": 75, "n": 12, "o": 715, "p": 12, "q": 10, "r": 13, "s": 20, "t": 24, "u": 170, "v": 15, "w": 22, "x": 10, "y": 42, "z": 93, "A": 10, "B": 12, "C": 12, "D": 11, "E": 10, "F": 12, "G": 11, "H": 11, "I": 13, "J": 10, "K": 10, "L": 10, "M": 11, "N": 13, "O": 13, "P": 11, "Q": 10, "R": 10, "S": 10, "T": 14, "U": 10, "V": 10, "W": 
21, "X": 10, "Y": 10, "Z": 10}, "A": {"a": 18, "b": 192, "c": 329, "d": 246, "e": 11, "f": 478, "g": 146, "h": 246, "i": 28, "j": 18, "k": 21, "l": 1119, "m": 1334, "n": 3653, "o": 15, "p": 175, "q": 13, "r": 523, "s": 825, "t": 633, "u": 355, "v": 37, "w": 56, "x": 22, "y": 19, "z": 13, "A": 21, "B": 63, "C": 100, "D": 66, "E": 19, "F": 41, "G": 56, "H": 25, "I": 48, "J": 11, "K": 14, "L": 168, "M": 77, "N": 329, "O": 14, "P": 460, "Q": 11, "R": 215, "S": 105, "T": 182, "U": 17, "V": 25, "W": 15, "X": 13, "Y": 21, "Z": 12}, "B": {"a": 716, "b": 10, "c": 11, "d": 11, "e": 922, "f": 10, "g": 10, "h": 10, "i": 167, "j": 10, "k": 10, "l": 77, "m": 10, "n": 11, "o": 1371, "p": 10, "q": 10, "r": 648, "s": 13, "t": 10, "u": 1544, "v": 10, "w": 14, "x": 10, "y": 231, "z": 10, "A": 40, "B": 16, "C": 20, "D": 11, "E": 81, "F": 16, "G": 13, "H": 18, "I": 35, "J": 10, "K": 10, "L": 44, "M": 19, "N": 12, "O": 74, "P": 13, "Q": 10, "R": 28, "S": 16, "T": 12, "U": 43, "V": 10, "W": 11, "X": 10, "Y": 24, "Z": 10}, "C": {"a": 841, "b": 15, "c": 10, "d": 10, "e": 96, "f": 14, "g": 10, "h": 602, "i": 207, "j": 10, "k": 10, "l": 318, "m": 10, "n": 10, "o": 2666, "p": 10, "q": 10, "r": 127, "s": 10, "t": 11, "u": 104, "v": 10, "w": 11, "x": 10, "y": 48, "z": 24, "A": 108, "B": 15, "C": 24, "D": 23, "E": 85, "F": 13, "G": 11, "H": 488, "I": 64, "J": 12, "K": 30, "L": 67, "M": 15, "N": 10, "O": 124, "P": 13, "Q": 12, "R": 29, "S": 29, "T": 142, "U": 33, "V": 10, "W": 14, "X": 10, "Y": 34, "Z": 11}, "D": {"a": 263, "b": 10, "c": 10, "d": 10, "e": 1189, "f": 10, "g": 10, "h": 11, "i": 375, "j": 10, "k": 10, "l": 10, "m": 141, "n": 17, "o": 953, "p": 10, "q": 10, "r": 232, "s": 11, "t": 10, "u": 285, "v": 11, "w": 17, "x": 10, "y": 13, "z": 10, "A": 46, "B": 16, "C": 24, "D": 25, "E": 130, "F": 17, "G": 16, "H": 16, "I": 135, "J": 13, "K": 10, "L": 23, "M": 23, "N": 18, "O": 49, "P": 24, "Q": 10, "R": 29, "S": 55, "T": 66, "U": 37, "V": 31, "W": 31, "X": 10, "Y": 10, "Z": 10}, "E": {"a": 
221, "b": 14, "c": 62, "d": 72, "e": 11, "f": 30, "g": 24, "h": 72, "i": 29, "j": 10, "k": 13, "l": 157, "m": 795, "n": 648, "o": 16, "p": 50, "q": 15, "r": 93, "s": 40, "t": 25, "u": 267, "v": 383, "w": 14, "x": 199, "y": 30, "z": 12, "A": 146, "B": 67, "C": 156, "D": 122, "E": 57, "F": 58, "G": 33, "H": 19, "I": 69, "J": 13, "K": 11, "L": 75, "M": 90, "N": 248, "O": 75, "P": 80, "Q": 14, "R": 732, "S": 314, "T": 76, "U": 40, "V": 64, "W": 62, "X": 41, "Y": 25, "Z": 11}, "F": {"a": 267, "b": 10, "c": 10, "d": 10, "e": 381, "f": 11, "g": 10, "h": 13, "i": 487, "j": 10, "k": 10, "l": 70, "m": 10, "n": 10, "o": 636, "p": 10, "q": 10, "r": 2015, "s": 10, "t": 19, "u": 58, "v": 10, "w": 12, "x": 10, "y": 10, "z": 10, "A": 39, "B": 20, "C": 23, "D": 16, "E": 47, "F": 43, "G": 18, "H": 14, "I": 215, "J": 16, "K": 10, "L": 32, "M": 21, "N": 17, "O": 68, "P": 24, "Q": 10, "R": 36, "S": 24, "T": 94, "U": 19, "V": 11, "W": 21, "X": 10, "Y": 20, "Z": 10}, "G": {"a": 165, "b": 10, "c": 10, "d": 11, "e": 871, "f": 10, "g": 10, "h": 12, "i": 88, "j": 10, "k": 10, "l": 67, "m": 10, "n": 10, "o": 737, "p": 10, "q": 10, "r": 421, "s": 12, "t": 26, "u": 373, "v": 10, "w": 15, "x": 10, "y": 10, "z": 11, "A": 34, "B": 29, "C": 43, "D": 17, "E": 89, "F": 17, "G": 19, "H": 34, "I": 38, "J": 10, "K": 10, "L": 43, "M": 28, "N": 21, "O": 34, "P": 24, "Q": 10, "R": 76, "S": 36, "T": 41, "U": 48, "V": 12, "W": 11, "X": 10, "Y": 13, "Z": 12}, "H": {"a": 879, "b": 10, "c": 10, "d": 10, "e": 3027, "f": 13, "g": 10, "h": 11, "i": 858, "j": 10, "k": 10, "l": 10, "m": 20, "n": 10, "o": 1287, "p": 10, "q": 10, "r": 10, "s": 10, "t": 10, "u": 218, "v": 10, "w": 10, "x": 10, "y": 51, "z": 10, "A": 492, "B": 15, "C": 14, "D": 15, "E": 341, "F": 14, "G": 14, "H": 18, "I": 81, "J": 13, "K": 11, "L": 18, "M": 18, "N": 19, "O": 51, "P": 16, "Q": 10, "R": 27, "S": 25, "T": 27, "U": 20, "V": 17, "W": 16, "X": 10, "Y": 15, "Z": 10}, "I": {"a": 847, "b": 168, "c": 561, "d": 571, "e": 93, "f": 909, "g": 128, 
"h": 1063, "i": 48, "j": 29, "k": 269, "l": 845, "m": 508, "n": 2778, "o": 160, "p": 139, "q": 22, "r": 263, "s": 1099, "t": 2743, "u": 77, "v": 161, "w": 865, "x": 10, "y": 20, "z": 10, "A": 100, "B": 36, "C": 212, "D": 47, "E": 95, "F": 37, "G": 206, "H": 15, "I": 592, "J": 12, "K": 10, "L": 84, "M": 42, "N": 270, "O": 220, "P": 31, "Q": 10, "R": 57, "S": 181, "T": 201, "U": 14, "V": 138, "W": 22, "X": 69, "Y": 13, "Z": 17}, "J": {"a": 339, "b": 10, "c": 10, "d": 11, "e": 249, "f": 11, "g": 10, "h": 10, "i": 12, "j": 10, "k": 10, "l": 10, "m": 10, "n": 10, "o": 283, "p": 10, "q": 10, "r": 11, "s": 10, "t": 10, "u": 329, "v": 10, "w": 10, "x": 10, "y": 10, "z": 10, "A": 21, "B": 20, "C": 16, "D": 11, "E": 44, "F": 16, "G": 14, "H": 14, "I": 10, "J": 10, "K": 14, "L": 11, "M": 13, "N": 10, "O": 23, "P": 11, "Q": 12, "R": 14, "S": 17, "T": 19, "U": 31, "V": 10, "W": 22, "X": 10, "Y": 10, "Z": 10}, "K": {"a": 250, "b": 12, "c": 11, "d": 10, "e": 107, "f": 11, "g": 10, "h": 20, "i": 250, "j": 10, "k": 10, "l": 50, "m": 10, "n": 76, "o": 114, "p": 10, "q": 10, "r": 83, "s": 11, "t": 11, "u": 670, "v": 10, "w": 12, "x": 10, "y": 11, "z": 10, "A": 13, "B": 10, "C": 17, "D": 10, "E": 16, "F": 14, "G": 10, "H": 20, "I": 21, "J": 10, "K": 22, "L": 12, "M": 12, "N": 13, "O": 16, "P": 13, "Q": 10, "R": 10, "S": 25, "T": 17, "U": 10, "V": 10, "W": 12, "X": 10, "Y": 10, "Z": 10}, "L": {"a": 427, "b": 10, "c": 10, "d": 10, "e": 445, "f": 10, "g": 10, "h": 11, "i": 442, "j": 10, "k": 10, "l": 19, "m": 10, "n": 10, "o": 607, "p": 10, "q": 10, "r": 14, "s": 10, "t": 10, "u": 47, "v": 14, "w": 10, "x": 10, "y": 71, "z": 10, "A": 105, "B": 24, "C": 23, "D": 45, "E": 113, "F": 13, "G": 18, "H": 14, "I": 172, "J": 12, "K": 11, "L": 62, "M": 20, "N": 17, "O": 83, "P": 47, "Q": 10, "R": 18, "S": 44, "T": 27, "U": 43, "V": 15, "W": 20, "X": 10, "Y": 25, "Z": 10}, "M": {"a": 2286, "b": 10, "c": 114, "d": 16, "e": 389, "f": 10, "g": 10, "h": 10, "i": 762, "j": 10, "k": 10, "l": 11, "m": 11, 
"n": 10, "o": 1351, "p": 10, "q": 10, "r": 423, "s": 10, "t": 11, "u": 205, "v": 10, "w": 10, "x": 10, "y": 287, "z": 10, "A": 106, "B": 38, "C": 16, "D": 12, "E": 116, "F": 15, "G": 10, "H": 13, "I": 69, "J": 12, "K": 12, "L": 13, "M": 39, "N": 15, "O": 57, "P": 41, "Q": 10, "R": 16, "S": 26, "T": 13, "U": 17, "V": 11, "W": 11, "X": 10, "Y": 17, "Z": 10}, "N": {"a": 2095, "b": 12, "c": 10, "d": 10, "e": 935, "f": 10, "g": 10, "h": 10, "i": 733, "j": 10, "k": 10, "l": 12, "m": 10, "n": 11, "o": 1482, "p": 10, "q": 10, "r": 11, "s": 11, "t": 10, "u": 38, "v": 10, "w": 10, "x": 10, "y": 10, "z": 10, "A": 120, "B": 55, "C": 70, "D": 266, "E": 120, "F": 36, "G": 75, "H": 18, "I": 104, "J": 30, "K": 12, "L": 13, "M": 22, "N": 32, "O": 83, "P": 26, "Q": 10, "R": 27, "S": 111, "T": 202, "U": 22, "V": 14, "W": 16, "X": 10, "Y": 57, "Z": 10}, "O": {"a": 17, "b": 31, "c": 87, "d": 23, "e": 15, "f": 148, "g": 34, "h": 498, "i": 17, "j": 11, "k": 26, "l": 123, "m": 13, "n": 1078, "o": 22, "p": 96, "q": 10, "r": 261, "s": 126, "t": 123, "u": 141, "v": 27, "w": 33, "x": 20, "y": 11, "z": 14, "A": 20, "B": 18, "C": 44, "D": 34, "E": 12, "F": 218, "G": 42, "H": 18, "I": 23, "J": 39, "K": 46, "L": 131, "M": 65, "N": 354, "O": 73, "P": 42, "Q": 10, "R": 174, "S": 45, "T": 57, "U": 107, "V": 30, "W": 38, "X": 11, "Y": 23, "Z": 10}, "P": {"a": 806, "b": 11, "c": 10, "d": 10, "e": 986, "f": 58, "g": 10, "h": 172, "i": 2054, "j": 10, "k": 10, "l": 185, "m": 10, "n": 19, "o": 435, "p": 10, "q": 10, "r": 3285, "s": 18, "t": 12, "u": 156, "v": 10, "w": 13, "x": 10, "y": 29, "z": 10, "A": 75, "B": 11, "C": 10, "D": 13, "E": 47, "F": 13, "G": 17, "H": 43, "I": 29, "J": 11, "K": 10, "L": 32, "M": 18, "N": 10, "O": 72, "P": 30, "Q": 10, "R": 94, "S": 18, "T": 445, "U": 49, "V": 10, "W": 11, "X": 10, "Y": 15, "Z": 10}, "Q": {"a": 10, "b": 10, "c": 10, "d": 10, "e": 10, "f": 10, "g": 10, "h": 10, "i": 10, "j": 10, "k": 10, "l": 10, "m": 10, "n": 10, "o": 10, "p": 10, "q": 10, "r": 10, "s": 10, 
"t": 10, "u": 148, "v": 10, "w": 10, "x": 10, "y": 10, "z": 10, "A": 12, "B": 10, "C": 10, "D": 10, "E": 10, "F": 10, "G": 10, "H": 10, "I": 10, "J": 10, "K": 10, "L": 10, "M": 10, "N": 10, "O": 10, "P": 10, "Q": 10, "R": 10, "S": 10, "T": 10, "U": 18, "V": 10, "W": 10, "X": 10, "Y": 10, "Z": 10}, "R": {"a": 204, "b": 10, "c": 10, "d": 10, "e": 1095, "f": 10, "g": 10, "h": 106, "i": 180, "j": 10, "k": 10, "l": 10, "m": 10, "n": 10, "o": 1434, "p": 10, "q": 10, "r": 10, "s": 10, "t": 10, "u": 925, "v": 10, "w": 12, "x": 10, "y": 47, "z": 10, "A": 149, "B": 22, "C": 45, "D": 23, "E": 219, "F": 21, "G": 73, "H": 19, "I": 281, "J": 12, "K": 22, "L": 36, "M": 40, "N": 40, "O": 112, "P": 26, "Q": 10, "R": 49, "S": 72, "T": 104, "U": 26, "V": 99, "W": 20, "X": 262, "Y": 69, "Z": 10}, "S": {"a": 304, "b": 12, "c": 268, "d": 10, "e": 941, "f": 12, "g": 11, "h": 1239, "i": 278, "j": 10, "k": 63, "l": 77, "m": 160, "n": 16, "o": 1683, "p": 368, "q": 43, "r": 10, "s": 11, "t": 1421, "u": 523, "v": 15, "w": 62, "x": 10, "y": 132, "z": 10, "A": 76, "B": 30, "C": 47, "D": 17, "E": 212, "F": 21, "G": 14, "H": 55, "I": 98, "J": 12, "K": 12, "L": 21, "M": 31, "N": 16, "O": 137, "P": 34, "Q": 10, "R": 16, "S": 88, "T": 190, "U": 67, "V": 15, "W": 21, "X": 11, "Y": 31, "Z": 10}, "T": {"a": 225, "b": 10, "c": 10, "d": 10, "e": 392, "f": 10, "g": 10, "h": 11816, "i": 259, "j": 10, "k": 10, "l": 10, "m": 10, "n": 10, "o": 774, "p": 13, "q": 10, "r": 379, "s": 90, "t": 10, "u": 308, "v": 20, "w": 138, "x": 10, "y": 32, "z": 10, "A": 96, "B": 19, "C": 21, "D": 15, "E": 620, "F": 17, "G": 40, "H": 397, "I": 350, "J": 12, "K": 10, "L": 25, "M": 30, "N": 28, "O": 103, "P": 17, "Q": 10, "R": 85, "S": 48, "T": 42, "U": 67, "V": 16, "W": 30, "X": 10, "Y": 51, "Z": 10}, "U": {"a": 10, "b": 10, "c": 10, "d": 10, "e": 10, "f": 11, "g": 13, "h": 42, "i": 10, "j": 10, "k": 16, "l": 78, "m": 10, "n": 873, "o": 10, "p": 36, "q": 10, "r": 22, "s": 27, "t": 41, "u": 10, "v": 22, "w": 10, "x": 10, "y": 
10, "z": 10, "A": 32, "B": 40, "C": 23, "D": 20, "E": 35, "F": 14, "G": 19, "H": 14, "I": 16, "J": 10, "K": 10, "L": 45, "M": 45, "N": 70, "O": 10, "P": 17, "Q": 10, "R": 123, "S": 80, "T": 109, "U": 12, "V": 10, "W": 12, "X": 10, "Y": 10, "Z": 11}, "V": {"a": 458, "b": 10, "c": 10, "d": 10, "e": 288, "f": 10, "g": 10, "h": 10, "i": 405, "j": 10, "k": 10, "l": 26, "m": 10, "n": 10, "o": 173, "p": 34, "q": 10, "r": 13, "s": 12, "t": 11, "u": 11, "v": 10, "w": 14, "x": 10, "y": 36, "z": 10, "A": 23, "B": 10, "C": 10, "D": 14, "E": 125, "F": 10, "G": 10, "H": 10, "I": 227, "J": 11, "K": 10, "L": 10, "M": 11, "N": 10, "O": 28, "P": 11, "Q": 10, "R": 14, "S": 13, "T": 24, "U": 10, "V": 10, "W": 10, "X": 12, "Y": 10, "Z": 10}, "W": {"a": 695, "b": 10, "c": 10, "d": 10, "e": 1447, "f": 10, "g": 10, "h": 2860, "i": 716, "j": 10, "k": 10, "l": 11, "m": 12, "n": 10, "o": 287, "p": 10, "q": 10, "r": 34, "s": 10, "t": 10, "u": 20, "v": 10, "w": 10, "x": 10, "y": 26, "z": 10, "A": 66, "B": 17, "C": 16, "D": 17, "E": 54, "F": 15, "G": 16, "H": 27, "I": 36, "J": 13, "K": 10, "L": 11, "M": 14, "N": 15, "O": 39, "P": 12, "Q": 10, "R": 13, "S": 17, "T": 18, "U": 10, "V": 10, "W": 12, "X": 10, "Y": 15, "Z": 12}, "X": {"a": 13, "b": 10, "c": 10, "d": 10, "e": 10, "f": 10, "g": 11, "h": 10, "i": 10, "j": 10, "k": 10, "l": 10, "m": 10, "n": 10, "o": 10, "p": 13, "q": 10, "r": 93, "s": 12, "t": 11, "u": 10, "v": 10, "w": 12, "x": 10, "y": 10, "z": 10, "A": 18, "B": 10, "C": 13, "D": 11, "E": 11, "F": 11, "G": 10, "H": 10, "I": 186, "J": 10, "K": 10, "L": 10, "M": 11, "N": 10, "O": 10, "P": 18, "Q": 10, "R": 11, "S": 12, "T": 19, "U": 10, "V": 128, "W": 10, "X": 113, "Y": 11, "Z": 10}, "Y": {"a": 73, "b": 10, "c": 10, "d": 10, "e": 605, "f": 10, "g": 10, "h": 10, "i": 13, "j": 10, "k": 10, "l": 10, "m": 10, "n": 10, "o": 1331, "p": 10, "q": 10, "r": 10, "s": 10, "t": 10, "u": 18, "v": 10, "w": 10, "x": 10, "y": 10, "z": 10, "A": 29, "B": 15, "C": 13, "D": 26, "E": 20, "F": 13, "G": 11, 
"H": 11, "I": 17, "J": 10, "K": 14, "L": 15, "M": 23, "N": 13, "O": 76, "P": 22, "Q": 11, "R": 15, "S": 37, "T": 16, "U": 11, "V": 10, "W": 12, "X": 13, "Y": 13, "Z": 11}, "Z": {"a": 43, "b": 10, "c": 10, "d": 17, "e": 20, "f": 10, "g": 10, "h": 59, "i": 12, "j": 10, "k": 10, "l": 10, "m": 10, "n": 27, "o": 11, "p": 10, "q": 10, "r": 10, "s": 10, "t": 10, "u": 19, "v": 10, "w": 11, "x": 10, "y": 10, "z": 10, "A": 13, "B": 10, "C": 10, "D": 10, "E": 14, "F": 10, "G": 10, "H": 10, "I": 11, "J": 10, "K": 10, "L": 10, "M": 10, "N": 11, "O": 12, "P": 10, "Q": 10, "R": 12, "S": 10, "T": 10, "U": 10, "V": 10, "W": 11, "X": 10, "Y": 10, "Z": 10}}} diff --git a/comps/guardrails/pii_detection/pii/detect/keys_detection.py b/comps/guardrails/pii_detection/pii/detect/keys_detection.py index 97fe1e8a2..53a720137 100755 --- a/comps/guardrails/pii_detection/pii/detect/keys_detection.py +++ b/comps/guardrails/pii_detection/pii/detect/keys_detection.py @@ -19,7 +19,7 @@ def get_detector_model(): {"path": "detect_secrets.filters.heuristic.is_likely_id_string"}, {"path": "detect_secrets.filters.heuristic.is_templated_secret"}, {"path": "detect_secrets.filters.heuristic.is_sequential_string"}, - {"path": "detect_secrets.filters.gibberish.should_exclude_secret", "model": get_detector_model(), "limit": 4.0}, + #{"path": "detect_secrets.filters.gibberish.should_exclude_secret", "model": get_detector_model(), "limit": 4.0}, ] plugins = [ {"name": "ArtifactoryDetector"}, From 1d7c2f86306dd954d42b5cd85db0a6fe486cf2f4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 21 Jun 2024 15:29:01 +0000 Subject: [PATCH 10/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/guardrails/pii_detection/pii/detect/keys_detection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/guardrails/pii_detection/pii/detect/keys_detection.py 
b/comps/guardrails/pii_detection/pii/detect/keys_detection.py index 53a720137..796430a64 100755 --- a/comps/guardrails/pii_detection/pii/detect/keys_detection.py +++ b/comps/guardrails/pii_detection/pii/detect/keys_detection.py @@ -19,7 +19,7 @@ def get_detector_model(): {"path": "detect_secrets.filters.heuristic.is_likely_id_string"}, {"path": "detect_secrets.filters.heuristic.is_templated_secret"}, {"path": "detect_secrets.filters.heuristic.is_sequential_string"}, - #{"path": "detect_secrets.filters.gibberish.should_exclude_secret", "model": get_detector_model(), "limit": 4.0}, + # {"path": "detect_secrets.filters.gibberish.should_exclude_secret", "model": get_detector_model(), "limit": 4.0}, ] plugins = [ {"name": "ArtifactoryDetector"}, From 9f14ab0fe9c082206d0ff21798b3b5e19097d079 Mon Sep 17 00:00:00 2001 From: Chendi Xue Date: Mon, 24 Jun 2024 14:14:03 +0000 Subject: [PATCH 11/18] enable debug mode in test bash Signed-off-by: Chendi Xue --- tests/test_pii_detection.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_pii_detection.sh b/tests/test_pii_detection.sh index 4510ca3a4..4466992b5 100644 --- a/tests/test_pii_detection.sh +++ b/tests/test_pii_detection.sh @@ -2,7 +2,7 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -#set -xe +set -xe WORKPATH=$(dirname "$PWD") ip_address=$(hostname -I | awk '{print $1}') From 5235e39a87516f02619e034a4e3961bd07fedbc2 Mon Sep 17 00:00:00 2001 From: Chendi Xue Date: Mon, 24 Jun 2024 22:58:02 +0000 Subject: [PATCH 12/18] rename test file Signed-off-by: Chendi Xue --- tests/{test_pii_detection.sh => test_guardrails_pii_detection.sh} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/{test_pii_detection.sh => test_guardrails_pii_detection.sh} (100%) diff --git a/tests/test_pii_detection.sh b/tests/test_guardrails_pii_detection.sh similarity index 100% rename from tests/test_pii_detection.sh rename to tests/test_guardrails_pii_detection.sh From 
ffa09f91b3c10c5b7108179a504272a4add3d92c Mon Sep 17 00:00:00 2001 From: Chendi Xue Date: Tue, 25 Jun 2024 00:18:02 +0000 Subject: [PATCH 13/18] mv pandas import into test Signed-off-by: Chendi Xue --- comps/guardrails/pii_detection/test.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/comps/guardrails/pii_detection/test.py b/comps/guardrails/pii_detection/test.py index 4149ed42e..4b510d1c9 100644 --- a/comps/guardrails/pii_detection/test.py +++ b/comps/guardrails/pii_detection/test.py @@ -4,14 +4,12 @@ import argparse import json import os -import timeit -import pandas as pd import requests from utils import Timer - def test_html(ip_addr="localhost", batch_size=20): + import pandas as pd proxies = {"http": ""} url = f"http://{ip_addr}:6357/v1/piidetect" urls = pd.read_csv("data/ai_rss.csv")["Permalink"] @@ -32,6 +30,7 @@ def test_text(ip_addr="localhost", batch_size=20): proxies = {"http": ""} url = f"http://{ip_addr}:6357/v1/piidetect" if os.path.exists("data/ai_rss.csv"): + import pandas as pd content = pd.read_csv("data/ai_rss.csv")["Description"] content = content[:batch_size].to_list() else: From d37195daf4966456e9f1bd3d7ecc70d6179bcd4e Mon Sep 17 00:00:00 2001 From: Chendi Xue Date: Tue, 25 Jun 2024 00:52:39 +0000 Subject: [PATCH 14/18] add new requirement for prometheus and except for user didn't provide hg_token Signed-off-by: Chendi Xue --- .../pii/detect/name_password_detection.py | 2 ++ comps/guardrails/pii_detection/pii/pii_utils.py | 12 ++++++++---- comps/guardrails/pii_detection/requirements.txt | 1 + 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/comps/guardrails/pii_detection/pii/detect/name_password_detection.py b/comps/guardrails/pii_detection/pii/detect/name_password_detection.py index c02c6c80b..91f51e9ec 100644 --- a/comps/guardrails/pii_detection/pii/detect/name_password_detection.py +++ b/comps/guardrails/pii_detection/pii/detect/name_password_detection.py @@ -17,6 +17,8 @@ def 
detect_name_password(content, pipeline, entity_types=None): if entity_types is None: entity_types = [PIIEntityType.NAME, PIIEntityType.PASSWORD] matches = [] + if pipeline is None: + return matches try: for entity in pipeline(content): entity_group = entity["entity_group"] diff --git a/comps/guardrails/pii_detection/pii/pii_utils.py b/comps/guardrails/pii_detection/pii/pii_utils.py index 6dae7d002..900cc8fbe 100644 --- a/comps/guardrails/pii_detection/pii/pii_utils.py +++ b/comps/guardrails/pii_detection/pii/pii_utils.py @@ -37,10 +37,14 @@ def __init__(self, model_path=None): _model_key = "bigcode/starpii" _model_key = _model_key if model_path is None else os.path.join(model_path, _model_key) - tokenizer = AutoTokenizer.from_pretrained(_model_key, model_max_length=512) - self.pipeline = pipeline( - model=_model_key, task="token-classification", tokenizer=tokenizer, grouped_entities=True - ) + try: + tokenizer = AutoTokenizer.from_pretrained(_model_key, model_max_length=512) + self.pipeline = pipeline( + model=_model_key, task="token-classification", tokenizer=tokenizer, grouped_entities=True + ) + except Exception as e: + print("Failed to load model, skip NER classification", e) + self.pipeline = None def detect_pii(self, text): result = [] diff --git a/comps/guardrails/pii_detection/requirements.txt b/comps/guardrails/pii_detection/requirements.txt index d942bf347..ce8dcddcc 100644 --- a/comps/guardrails/pii_detection/requirements.txt +++ b/comps/guardrails/pii_detection/requirements.txt @@ -1,3 +1,4 @@ +prometheus-fastapi-instrumentator beautifulsoup4 detect_secrets docarray[full] From c52b67dc142829282f584d96a3a40ff44e2c820a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 25 Jun 2024 00:55:54 +0000 Subject: [PATCH 15/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/guardrails/pii_detection/requirements.txt | 2 +- 
comps/guardrails/pii_detection/test.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/comps/guardrails/pii_detection/requirements.txt b/comps/guardrails/pii_detection/requirements.txt index ce8dcddcc..88690093f 100644 --- a/comps/guardrails/pii_detection/requirements.txt +++ b/comps/guardrails/pii_detection/requirements.txt @@ -1,4 +1,3 @@ -prometheus-fastapi-instrumentator beautifulsoup4 detect_secrets docarray[full] @@ -16,6 +15,7 @@ opentelemetry-sdk pandas phonenumbers Pillow +prometheus-fastapi-instrumentator pyarrow pymupdf python-docx diff --git a/comps/guardrails/pii_detection/test.py b/comps/guardrails/pii_detection/test.py index 4b510d1c9..3754e8114 100644 --- a/comps/guardrails/pii_detection/test.py +++ b/comps/guardrails/pii_detection/test.py @@ -8,8 +8,10 @@ import requests from utils import Timer + def test_html(ip_addr="localhost", batch_size=20): import pandas as pd + proxies = {"http": ""} url = f"http://{ip_addr}:6357/v1/piidetect" urls = pd.read_csv("data/ai_rss.csv")["Permalink"] @@ -31,6 +33,7 @@ def test_text(ip_addr="localhost", batch_size=20): url = f"http://{ip_addr}:6357/v1/piidetect" if os.path.exists("data/ai_rss.csv"): import pandas as pd + content = pd.read_csv("data/ai_rss.csv")["Description"] content = content[:batch_size].to_list() else: From cf034c117a4f6ca59625338a0a69ab6de1513146 Mon Sep 17 00:00:00 2001 From: Chendi Xue Date: Tue, 25 Jun 2024 01:03:08 +0000 Subject: [PATCH 16/18] mv pandas import to function Signed-off-by: Chendi Xue --- comps/guardrails/pii_detection/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/guardrails/pii_detection/utils.py b/comps/guardrails/pii_detection/utils.py index 7ce0b58ab..ac6aa5d6c 100644 --- a/comps/guardrails/pii_detection/utils.py +++ b/comps/guardrails/pii_detection/utils.py @@ -9,7 +9,6 @@ import timeit from pathlib import Path -import pandas as pd class Timer: @@ -40,6 +39,7 @@ class TimeoutError(Exception): def save_logs(log_name, 
data): + import pandas as pd df = pd.DataFrame.from_records(data) try: dir_path = os.path.dirname(log_name) From 42a80e0ecd538a2ccc173f34977ee0ca65ca0ab3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 25 Jun 2024 01:03:53 +0000 Subject: [PATCH 17/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/guardrails/pii_detection/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/guardrails/pii_detection/utils.py b/comps/guardrails/pii_detection/utils.py index ac6aa5d6c..0766bec70 100644 --- a/comps/guardrails/pii_detection/utils.py +++ b/comps/guardrails/pii_detection/utils.py @@ -10,7 +10,6 @@ from pathlib import Path - class Timer: level = 0 viewer = None @@ -40,6 +39,7 @@ class TimeoutError(Exception): def save_logs(log_name, data): import pandas as pd + df = pd.DataFrame.from_records(data) try: dir_path = os.path.dirname(log_name) From 57939ec314d2257107f4b0e59eb337e12abea318 Mon Sep 17 00:00:00 2001 From: Chendi Xue Date: Tue, 25 Jun 2024 01:08:08 +0000 Subject: [PATCH 18/18] remove ip_addr hardcode Signed-off-by: Chendi Xue --- comps/guardrails/pii_detection/test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/comps/guardrails/pii_detection/test.py b/comps/guardrails/pii_detection/test.py index 3754e8114..214c0d0b9 100644 --- a/comps/guardrails/pii_detection/test.py +++ b/comps/guardrails/pii_detection/test.py @@ -92,7 +92,6 @@ def test_pdf(ip_addr="localhost", batch_size=20): parser.add_argument("--ip_addr", type=str, default="localhost", help="IP address of the server") args = parser.parse_args() - args.ip_addr = "100.83.111.250" if args.test_html: test_html(ip_addr=args.ip_addr, batch_size=args.batch_size) elif args.test_pdf: