Skip to content

Commit

Permalink
[pre-commit.ci] auto fixes from pre-commit.com hooks
Browse files Browse the repository at this point in the history
for more information, see https://pre-commit.ci
  • Loading branch information
pre-commit-ci[bot] committed Jun 11, 2024
1 parent 7861b9e commit 79da2ee
Show file tree
Hide file tree
Showing 17 changed files with 183 additions and 116 deletions.
7 changes: 4 additions & 3 deletions comps/guardrails/pii_detection/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ TBD

## 2.1.2 use NER model (default mode)

``` bash
```bash
mkdir -p pii/bigcode
apt install git-lfs
cd pii/bigcode; git clone https://{hf_username}:{hf_token}@huggingface.co/bigcode/starpii/; cd ../..
Expand All @@ -64,21 +64,22 @@ docker run -d --rm --runtime=runc --name="guardrails-pii-detection-endpoint" -p
```

> debug mode
```bash
docker run --rm --runtime=runc --name="guardrails-pii-detection-endpoint" -p 6357:6357 -v ./comps/guardrails/pii_detection/:/home/user/comps/guardrails/pii_detection/ --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/guardrails-pii-detection:latest
```

# 🚀3. Status Microservice

``` bash
```bash
docker container logs -f guardrails-pii-detection-endpoint
```

# 🚀4. Consume Microservice

Once the microservice starts, users can use the script below to invoke it for PII detection.

``` python
```python
import requests
import json

Expand Down
3 changes: 1 addition & 2 deletions comps/guardrails/pii_detection/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,7 @@ def get_boolean_env_var(var_name, default_value=False):
return default_value



LLM_URL = os.getenv("LLM_ENDPOINT_URL", None)

current_file_path = pathlib.Path(__file__).parent.resolve()
comps_path = os.path.join(current_file_path, "../../../")
comps_path = os.path.join(current_file_path, "../../../")
15 changes: 8 additions & 7 deletions comps/guardrails/pii_detection/data_utils.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import io
import json
import multiprocessing
import os
import re
import unicodedata
from urllib.parse import urlparse, urlunparse

import easyocr
import fitz
import numpy as np
import pandas as pd
import requests
import yaml
from bs4 import BeautifulSoup

import io
import json
import multiprocessing
import re
import unicodedata
from urllib.parse import urlparse, urlunparse
from docx import Document as DDocument
from langchain_community.document_loaders import (
UnstructuredImageLoader,
Expand All @@ -26,6 +26,7 @@
from PIL import Image
from utils import timeout


def load_pdf(pdf_path):
"""Load the pdf file."""
doc = fitz.open(pdf_path)
Expand Down
5 changes: 4 additions & 1 deletion comps/guardrails/pii_detection/pii/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
__all__ = ['pii_detection', 'pii_redaction']
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

__all__ = ["pii_detection", "pii_redaction"]
2 changes: 2 additions & 0 deletions comps/guardrails/pii_detection/pii/detect/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
16 changes: 10 additions & 6 deletions comps/guardrails/pii_detection/pii/detect/emails_detection.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
""" This code is adapted from BigScience PII detection
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
""" This code is adapted from BigScience PII detection
https://github.com/bigscience-workshop/data-preparation/blob/main/preprocessing/training/02_pii/bigscience_pii_detect_redact.py
MST BigScience PII Code
Expand All @@ -18,11 +20,13 @@
"""

import sys

import regex

# Note: to reduce false positives, a number of technically-valid-but-rarely-used
# email address patterns (e.g. with parenthesis or slashes) will not match
email_pattern = regex.compile(r'''
email_pattern = regex.compile(
r"""
(?<= ^ | [[({<\b\s@,?!;'"\p{Han}¿¡:.] | \\['"] ) # left delimiter
(
(?: # local part
Expand All @@ -44,7 +48,9 @@
(?: [\p{L}\p{M}]{2,63} | xn-- \w+ ) # TLD, including IDN
)
(?= $ | [])}>\b\s@,?!;'"\p{Han}] | \\['"] | : (?! \d) | \. (?! \S)) # right delim
''', flags=regex.MULTILINE | regex.VERBOSE)
""",
flags=regex.MULTILINE | regex.VERBOSE,
)


def detect_email(content):
Expand All @@ -63,9 +69,7 @@ def detect_email(content):
for match in matches_tmp:
if match.groups():
if len(match.groups()) > 1 and match.groups()[1]:
sys.stderr.write(
"Warning: Found substring matches in the main match."
)
sys.stderr.write("Warning: Found substring matches in the main match.")
# setup outputs
value = match.group(1)
start, end = match.span(1)
Expand Down
29 changes: 14 additions & 15 deletions comps/guardrails/pii_detection/pii/detect/ip_detection.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
""" This code is adapted from BigScience PII detection
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
""" This code is adapted from BigScience PII detection
https://github.com/bigscience-workshop/data-preparation/blob/main/preprocessing/training/02_pii/bigscience_pii_detect_redact.py
MST BigScience PII Code
Expand All @@ -17,8 +19,9 @@
limitations under the License.
"""

import sys
import ipaddress
import sys

import regex

year_patterns = [
Expand All @@ -43,10 +46,8 @@
ipv4_pattern = r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?:\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}"
ipv6_pattern = r"(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])"
ip_pattern = regex.compile(
r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])("
+ r"|".join([ipv4_pattern, ipv6_pattern])
+ ")(?:$|[\s@,?!;:'\"(.\p{Han}])",
flags=regex.MULTILINE
r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])(" + r"|".join([ipv4_pattern, ipv6_pattern]) + ")(?:$|[\s@,?!;:'\"(.\p{Han}])",
flags=regex.MULTILINE,
)


Expand All @@ -66,18 +67,18 @@ def ip_has_digit(matched_str):

def filter_versions(matched_str, context):
"""Filter addresses in this format x.x.x.x and the words dns/server
don't appear in the neighboring context, usually they are just versions"""
# count occurrence of dots
dot_count = matched_str.count('.')
exclude = (dot_count == 3 and len(matched_str) == 7)
don't appear in the neighboring context, usually they are just versions."""
# count occurrence of dots
dot_count = matched_str.count(".")
exclude = dot_count == 3 and len(matched_str) == 7
if exclude:
if "dns" in context.lower() or "server" in context.lower():
return False
return exclude


def not_ip_address(matched_str):
""" make sure the string has a valid IP address format
"""make sure the string has a valid IP address format
e.g: 33.01.33.33 is not a valid IP address because of the 0 in front of 1
TODO: fix this directly in the regex"""
try:
Expand All @@ -103,9 +104,7 @@ def detect_ip(content):
for match in matches_tmp:
if match.groups():
if len(match.groups()) > 1 and match.groups()[1]:
sys.stderr.write(
"Warning: Found substring matches in the main match."
)
sys.stderr.write("Warning: Found substring matches in the main match.")
# setup outputs
value = match.group(1)
start, end = match.span(1)
Expand All @@ -115,7 +114,7 @@ def detect_ip(content):
continue
if matches_date_pattern(value):
continue
if filter_versions(value, content[start - 100:end + 100]) or not_ip_address(value):
if filter_versions(value, content[start - 100 : end + 100]) or not_ip_address(value):
continue
# combine if conditions in one

Expand Down
20 changes: 9 additions & 11 deletions comps/guardrails/pii_detection/pii/detect/keys_detection.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
""" This code is adapted from BigCode PII
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
""" This code is adapted from BigCode PII
https://github.com/bigcode-project/bigcode-dataset/blob/main/pii/utils/keys_detection.py
"""
import os


# Secrets detection with detect-secrets tool


Expand Down Expand Up @@ -44,7 +45,7 @@ def get_detector_model():


def is_gibberish(matched_str):
"""Checks to make sure the PII span is gibberish and not word like"""
"""Checks to make sure the PII span is gibberish and not word like."""
# pip install gibberish-detector
# download the training corpora from https://raw.githubusercontent.com/domanchi/gibberish-detector/master/examples/big.txt
# run gibberish-detector train big.txt > big.model to generate the model (it takes 3 seconds)
Expand All @@ -62,7 +63,7 @@ def is_hash(content, value):
# TODO: fix this issue happened one for JS in the stack-smol, file did contain value
print("Value not found in content, why this happened?")
return False
lines = content[:content.index(value)].splitlines()
lines = content[: content.index(value)].splitlines()
target_line = lines[-1]
if len(value) in [32, 40, 64]:
# if "sha" or "md5" are in content:
Expand All @@ -73,7 +74,7 @@ def is_hash(content, value):


def file_has_hashes(content, coeff=0.02):
"""Checks if the file contains literals 'hash' or 'sha' for more than 2% nb_of_lines"""
"""Checks if the file contains literals 'hash' or 'sha' for more than 2% nb_of_lines."""
lines = content.splitlines()
count_sha = 0
count_hash = 0
Expand Down Expand Up @@ -108,13 +109,12 @@ def scan_secrets(line: str):

lines = line.splitlines(keepends=True)
for secret in _process_line_based_plugins(
lines=list(enumerate(lines, start=1)),
filename="Adhoc String",
lines=list(enumerate(lines, start=1)),
filename="Adhoc String",
):
yield secret



def detect_keys(content):
"""Detect secret keys in content using detect-secrets tool
Args:
Expand All @@ -135,9 +135,7 @@ def detect_keys(content):

from detect_secrets.settings import transient_settings

with transient_settings(
{"plugins_used": plugins, "filters_used": filters}
) as settings:
with transient_settings({"plugins_used": plugins, "filters_used": filters}) as settings:
matches = []
for secret in scan_secrets(content):
if is_hash(content, secret.secret_value) or file_has_hashes(content):
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

from .utils import PIIEntityType


Expand All @@ -17,8 +20,9 @@ def detect_name_password(content, pipeline, entity_types=None):
try:
for entity in pipeline(content):
entity_group = entity["entity_group"]
if ("NAME" == entity_group and PIIEntityType.NAME in entity_types) or \
("PASSWORD" == entity_group and PIIEntityType.PASSWORD in entity_types):
if ("NAME" == entity_group and PIIEntityType.NAME in entity_types) or (
"PASSWORD" == entity_group and PIIEntityType.PASSWORD in entity_types
):
matches.append(
{
"tag": entity_group,
Expand Down
8 changes: 6 additions & 2 deletions comps/guardrails/pii_detection/pii/detect/phones_detection.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import os


def detect_phones(text):
"""Detects phone in a string using phonenumbers libray only detection the international phone number"""
"""Detects phone in a string using phonenumbers library only detection the international phone number."""
try:
import phonenumbers
except ImportError:
os.system("pip install phonenumbers")
import phonenumbers

matches = []

for match in phonenumbers.PhoneNumberMatcher(text, "IN"):
Expand Down
3 changes: 3 additions & 0 deletions comps/guardrails/pii_detection/pii/detect/utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

from enum import Enum, auto


Expand Down
Loading

0 comments on commit 79da2ee

Please sign in to comment.