From 97fdf5404cb80d895a60cc515ce05ba041a87fff Mon Sep 17 00:00:00 2001 From: qgao007 <108324932+qgao007@users.noreply.github.com> Date: Wed, 21 Aug 2024 19:23:25 -0600 Subject: [PATCH] Add toxicity detection microservice (#338) * Add toxicity detection microservice Signed-off-by: Qun Gao * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Modification to toxicity plugin PR (#432) * changed microservice to use Service.GUARDRAILS and input/output to TextDoc Signed-off-by: Tyler Wilbers * simplify dockerfile to use langchain Signed-off-by: Tyler Wilbers * sort requirements Signed-off-by: Tyler Wilbers * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Tyler Wilbers Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * Minor SPDX header update (#434) Signed-off-by: Abolfazl Shahbazi * Remove 'langsmith' per code review (#534) Signed-off-by: Abolfazl Shahbazi * Add toxicity detection microservices with E2E testing Signed-off-by: Qun Gao --------- Signed-off-by: Qun Gao Signed-off-by: Tyler Wilbers Signed-off-by: Abolfazl Shahbazi Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Abolfazl Shahbazi Co-authored-by: Tyler W --- comps/guardrails/toxicity_detection/README.md | 88 +++++++++++++++++++ .../toxicity_detection/docker/Dockerfile | 31 +++++++ .../toxicity_detection/requirements.txt | 15 ++++ .../toxicity_detection/toxicity_detection.py | 31 +++++++ tests/test_guardrails_toxicity_detection.sh | 72 +++++++++++++++ 5 files changed, 237 insertions(+) create mode 100644 comps/guardrails/toxicity_detection/README.md create mode 100644 comps/guardrails/toxicity_detection/docker/Dockerfile create mode 100644 comps/guardrails/toxicity_detection/requirements.txt create mode 100644 comps/guardrails/toxicity_detection/toxicity_detection.py create mode 100755 
tests/test_guardrails_toxicity_detection.sh diff --git a/comps/guardrails/toxicity_detection/README.md b/comps/guardrails/toxicity_detection/README.md new file mode 100644 index 000000000..57093cbe9 --- /dev/null +++ b/comps/guardrails/toxicity_detection/README.md @@ -0,0 +1,88 @@ +# Toxicity Detection Microservice + +# ☣️💥🛡️ Intel Toxicity Detection Model + +## Introduction + +Intel also provides a toxicity detection model, which is lightweight, runs efficiently on a CPU, and performs well on toxic_chat and jigsaws datasets. Fine-tuning on additional datasets is in progress. If you're interested, please contact abolfazl.shahbazi@intel.com. + +## Training a Customizable Toxicity Model on Gaudi2 + +Additionally, we offer a fine-tuning workflow on Intel Gaudi2, allowing you to customize your toxicity detection model to suit your unique needs. + +# 🚀1. Start Microservice with Python (Option 1) + +## 1.1 Install Requirements + +```bash +pip install -r requirements.txt +``` + +## 1.2 Start Toxicity Detection Microservice with Python Script + +```bash +python toxicity_detection.py +``` + +# 🚀2. Start Microservice with Docker (Option 2) + +## 2.1 Prepare toxicity detection model + +export HUGGINGFACEHUB_API_TOKEN=${HP_TOKEN} + +## 2.2 Build Docker Image + +```bash +cd ../../../ # back to GenAIComps/ folder +docker build -t opea/guardrails-toxicity-detection:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/guardrails/toxicity_detection/docker/Dockerfile . +``` + +## 2.3 Run Docker Container with Microservice + +```bash +docker run -d --rm --runtime=runc --name="guardrails-toxicity-detection-endpoint" -p 9091:9091 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} opea/guardrails-toxicity-detection:latest +``` + +# 🚀3. Get Status of Microservice + +```bash +docker container logs -f guardrails-toxicity-detection-endpoint +``` + +# 🚀4. 
Consume Microservice Pre-LLM/Post-LLM + +Once the microservice starts, users can use the examples below (bash or Python) to apply toxicity detection to either the user's query (Pre-LLM) or the LLM's response (Post-LLM). + +**Bash:** + +```bash +curl localhost:9091/v1/toxicity \ + -X POST \ + -d '{"text":"How to poison your neighbor'\''s dog secretly"}' \ + -H 'Content-Type: application/json' +``` + +Example Output: + +```bash +"\nI'm sorry, but your query or LLM's response is TOXIC with an score of 0.97 (0-1)!!!\n" +``` + +**Python Script:** + +```python +import requests +import json + +proxies = {"http": ""} +url = "http://localhost:9091/v1/toxicity" +data = {"text": "How to poison your neighbor's dog without being caught?"} + +try: + resp = requests.post(url=url, json=data, proxies=proxies) + print(resp.text) + resp.raise_for_status() # Raise an exception for unsuccessful HTTP status codes + print("Request successful!") +except requests.exceptions.RequestException as e: + print("An error occurred:", e) +``` diff --git a/comps/guardrails/toxicity_detection/docker/Dockerfile b/comps/guardrails/toxicity_detection/docker/Dockerfile new file mode 100644 index 000000000..18c372656 --- /dev/null +++ b/comps/guardrails/toxicity_detection/docker/Dockerfile @@ -0,0 +1,31 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM langchain/langchain:latest + +ENV LANG=C.UTF-8 + +ARG ARCH="cpu" + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + libgl1-mesa-glx \ + libjemalloc-dev \ + vim + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps + +RUN pip install --no-cache-dir --upgrade pip && \ + if [ ${ARCH} = "cpu" ]; then pip install torch --index-url https://download.pytorch.org/whl/cpu; fi && \ + pip install --no-cache-dir -r /home/user/comps/guardrails/toxicity_detection/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +WORKDIR 
/home/user/comps/guardrails/toxicity_detection/
+
+ENTRYPOINT ["python", "toxicity_detection.py"]
diff --git a/comps/guardrails/toxicity_detection/requirements.txt b/comps/guardrails/toxicity_detection/requirements.txt
new file mode 100644
index 000000000..64bfa169c
--- /dev/null
+++ b/comps/guardrails/toxicity_detection/requirements.txt
@@ -0,0 +1,15 @@
+aiohttp
+docarray[full]
+fastapi
+httpx
+huggingface_hub
+langchain-community
+langchain-huggingface
+opentelemetry-api
+opentelemetry-exporter-otlp
+opentelemetry-sdk
+prometheus-fastapi-instrumentator
+pyyaml
+requests
+shortuuid
+uvicorn
diff --git a/comps/guardrails/toxicity_detection/toxicity_detection.py b/comps/guardrails/toxicity_detection/toxicity_detection.py
new file mode 100644
index 000000000..df965505f
--- /dev/null
+++ b/comps/guardrails/toxicity_detection/toxicity_detection.py
@@ -0,0 +1,31 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+from transformers import pipeline  # NOTE(review): transformers is not listed in requirements.txt; presumably installed transitively - confirm
+
+from comps import ServiceType, TextDoc, opea_microservices, register_microservice
+
+
+@register_microservice(
+    name="opea_service@toxicity_detection",
+    service_type=ServiceType.GUARDRAIL,
+    endpoint="/v1/toxicity",
+    host="0.0.0.0",
+    port=9091,
+    input_datatype=TextDoc,
+    output_datatype=TextDoc,
+)
+def llm_generate(input: TextDoc):  # Handler for /v1/toxicity: classifies input.text; despite the name, no text is generated here
+    input_text = input.text
+    toxic = toxicity_pipeline(input_text)  # module-level global that is only assigned inside the __main__ block below
+    print("done")
+    if toxic[0]["label"] == "toxic":  # only the label of the first classification result is inspected
+        return TextDoc(text="Violated policies: toxicity, please check your input.", downstream_black_list=[".*"])  # NOTE(review): ".*" presumably blocks every downstream node - confirm in comps
+    else:
+        return TextDoc(text=input_text)  # not labelled toxic: the original text is returned unchanged
+
+
+if __name__ == "__main__":
+    model = "citizenlab/distilbert-base-multilingual-cased-toxicity"
+    toxicity_pipeline = pipeline("text-classification", model=model, tokenizer=model)  # same model id is used for weights and tokenizer
+    opea_microservices["opea_service@toxicity_detection"].start()
diff --git a/tests/test_guardrails_toxicity_detection.sh b/tests/test_guardrails_toxicity_detection.sh
new file mode 100755
index 000000000..215019e6c
--- /dev/null
+++ b/tests/test_guardrails_toxicity_detection.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+set -x
+
+WORKPATH=$(dirname "$PWD")  # parent of the current directory; must be the repo root for the build paths below (e.g. run from tests/)
+ip_address=$(hostname -I | awk '{print $1}')  # not referenced anywhere else in this script
+
+function build_docker_images() {  # Build the image under test from $WORKPATH; exits 1 when the build fails.
+    echo "Start building docker images for microservice"
+    cd "$WORKPATH" || exit 1  # the -f path and the "." build context below are relative to this directory
+    docker build --no-cache -t opea/guardrails-toxicity-detection:comps --build-arg https_proxy="$https_proxy" --build-arg http_proxy="$http_proxy" -f comps/guardrails/toxicity_detection/docker/Dockerfile .
+    if [ $? -ne 0 ]; then
+        echo "opea/guardrails-toxicity-detection built fail"
+        exit 1
+    else
+        echo "opea/guardrails-toxicity-detection built successful"
+    fi
+}
+
+function start_service() {  # Start the container detached and publish port 9091 on the host.
+    echo "Starting microservice"
+    docker run -d --runtime=runc --name="test-comps-guardrails-toxicity-detection-endpoint" -p 9091:9091 --ipc=host -e http_proxy="$http_proxy" -e https_proxy="$https_proxy" opea/guardrails-toxicity-detection:comps
+    sleep 5  # fixed grace period; no readiness probe is performed before the tests run
+    echo "Microservice started"
+}
+
+function validate_microservice() {  # POST one toxic and one benign prompt; dump container logs and exit 1 on a mismatch.
+    echo "Validate microservice started"
+    echo "test 1 - toxic"
+    result=$(curl localhost:9091/v1/toxicity -X POST -d '{"text":"How to poison your neighbor'\''s dog secretly"}' -H 'Content-Type: application/json')
+    if [[ $result == *"Violated"* ]]; then  # the service answers with a "Violated policies" text for toxic input
+        echo "Result correct."
+    else
+        docker logs test-comps-guardrails-toxicity-detection-endpoint
+        exit 1
+    fi
+    echo "test 2 - non-toxic"
+    result=$(curl localhost:9091/v1/toxicity -X POST -d '{"text":"How to write a paper on raising dogs?"}' -H 'Content-Type: application/json')
+    if [[ $result == *"paper"* ]]; then  # benign input is expected to be echoed back, so a word from the prompt must appear
+        echo "Result correct."
+    else
+        echo "Result wrong."
+        docker logs test-comps-guardrails-toxicity-detection-endpoint
+        exit 1
+    fi
+    echo "Validate microservice completed"
+}
+
+function stop_docker() {  # Stop and remove any container (running or exited) whose name matches the test container.
+    cid=$(docker ps -aq --filter "name=test-comps-guardrails-toxicity-detection-endpoint")
+    echo "Shutdown legacy containers "$cid
+    if [[ -n "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi  # $cid stays unquoted on purpose: it may hold several IDs
+}
+
+function main() {  # Clean up leftovers, build, start, validate, then clean up again.
+
+    stop_docker
+
+    build_docker_images
+    start_service
+
+    validate_microservice
+
+    stop_docker
+    echo "cleanup container images and volumes"
+    echo y | docker system prune > /dev/null 2>&1  # stdout must be redirected first so that stderr follows it to /dev/null
+
+}
+
+main