Modification to toxicity plugin PR (#432)

* changed microservice to use Service.GUARDRAILS and input/output to TextDoc

Signed-off-by: Tyler Wilbers <[email protected]>

* simplify dockerfile to use langchain

Signed-off-by: Tyler Wilbers <[email protected]>

* sort requirements

Signed-off-by: Tyler Wilbers <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: Tyler Wilbers <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
opea-project · Aug 8, 2024 · 63650d0 · 63650d0
1 parent fe2fe47
commit 63650d0
Show file tree

Hide file tree

Showing 5 changed files with 43 additions and 86 deletions.
diff --git a/comps/guardrails/toxicity_detection/README.md b/comps/guardrails/toxicity_detection/README.md
@@ -40,7 +40,7 @@ docker build -t opea/guardrails-toxicity-detection:latest --build-arg https_prox
 ## 2.3 Run Docker Container with Microservice
 
 ```bash
-docker run -d --rm --runtime=runc --name="guardrails-pii-detection-endpoint" -p 9091:9091 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} opea/guardrails-toxicity-detection:latest
+docker run -d --rm --runtime=runc --name="guardrails-toxicity-detection-endpoint" -p 9091:9091 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} opea/guardrails-toxicity-detection:latest
 ```
 
 # 🚀3. Get Status of Microservice
@@ -58,7 +58,7 @@ Once microservice starts, users can use examples (bash or python) below to apply
 ```bash
 curl localhost:9091/v1/toxicity
     -X POST
-    -d '{"query":"How to poison your neighbor'\''s dog secretly"}'
+    -d '{"text":"How to poison your neighbor'\''s dog secretly"}'
     -H 'Content-Type: application/json'
 ```
 
@@ -76,7 +76,7 @@ import json
 
 proxies = {"http": ""}
 url = "http://localhost:9091/v1/toxicity"
-data = {"query": "How to poison your neighbor'''s dog without being caught?"}
+data = {"text": "How to poison your neighbor's dog without being caught?"}
 
 try:
     resp = requests.post(url=url, data=data, proxies=proxies)

diff --git a/comps/guardrails/toxicity_detection/docker/Dockerfile b/comps/guardrails/toxicity_detection/docker/Dockerfile
@@ -1,60 +1,31 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-FROM ubuntu:22.04
-
-ARG TAG=main
-
-RUN apt-get update \ 
-    && apt-get upgrade -y \ 
-    && apt-get install -y --no-install-recommends \
-    gcc-12 \
-    g++-12 \
-    make \
-    wget \
-    libnuma-dev \
-    numactl \
-    git \
-    pkg-config \
-    software-properties-common \
-    zlib1g-dev \
-    libssl-dev \
-    libffi-dev \
-    libbz2-dev \
-    libsqlite3-dev \
-    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 60 \
-    && update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-12 60 \
-    && apt-get autoremove -y \
-    && rm -rf /var/lib/apt/lists/*
-
-# Install python
-WORKDIR /tmp
-RUN wget -q https://www.python.org/ftp/python/3.8.10/Python-3.8.10.tgz \
-    && tar -xzvf Python-3.8.10.tgz
-WORKDIR /tmp/Python-3.8.10
-RUN ./configure --prefix=/usr/bin/python3.8 --enable-optimizations \
-    && make -j \
-    && make install \
-    && update-alternatives --install /usr/bin/python python /usr/bin/python3.8/bin/python3.8 60 \
-    && update-alternatives --install /usr/bin/pip pip /usr/bin/python3.8/bin/pip3 60 \
-    && python -m pip install --no-cache-dir --upgrade pip setuptools \
-    && pip install --no-cache-dir wheel \
-    && rm -rf /tmp/* \
-    && echo "export PATH=/usr/bin/python3.8:\$PATH" >> ~/.bashrc
-
-RUN pip install --no-cache-dir torch==2.3.0+cpu --index-url https://download.pytorch.org/whl/cpu
-RUN pip install --no-cache-dir cmake==3.26.1 transformers==4.41.2 sentencepiece==0.1.99 accelerate==0.23.0 protobuf tiktoken transformers-stream-generator einops \
-    && ln -s /usr/bin/python3.8/lib/python3.8/site-packages/cmake/data/bin/cmake /usr/bin/cmake
-
-COPY comps /root/comps
+FROM langchain/langchain:latest
 
-RUN pip install --no-cache-dir --upgrade pip && \
-    pip install --no-cache-dir -r /root/comps/guardrails/toxicity_detection/requirements.txt
+ENV LANG=C.UTF-8
+
+ARG ARCH="cpu"
+
+RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
+    libgl1-mesa-glx \
+    libjemalloc-dev \
+    vim && rm -rf /var/lib/apt/lists/*
+
+RUN useradd -m -s /bin/bash user && \
+    mkdir -p /home/user && \
+    chown -R user /home/user/
 
-ENV PYTHONPATH=$PYTHONPATH:/root
+USER user
+
+COPY comps /home/user/comps
+
+RUN pip install --no-cache-dir --upgrade pip && \
+    if [ "${ARCH}" = "cpu" ]; then pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu; fi && \
+    pip install --no-cache-dir -r /home/user/comps/guardrails/toxicity_detection/requirements.txt
 
-RUN chmod +x /root/comps/guardrails/toxicity_detection/run.sh
+ENV PYTHONPATH=$PYTHONPATH:/home/user
 
-WORKDIR /root/comps/guardrails/toxicity_detection/
+WORKDIR /home/user/comps/guardrails/toxicity_detection/
 
-ENTRYPOINT ["/root/comps/guardrails/toxicity_detection/run.sh"]
+ENTRYPOINT ["python", "toxicity_detection.py"]
diff --git a/comps/guardrails/toxicity_detection/requirements.txt b/comps/guardrails/toxicity_detection/requirements.txt
@@ -1,5 +1,7 @@
+aiohttp
 docarray[full]
 fastapi
+httpx
 huggingface_hub
 langchain-community
 langchain-huggingface
@@ -8,4 +10,7 @@ opentelemetry-api
 opentelemetry-exporter-otlp
 opentelemetry-sdk
 prometheus-fastapi-instrumentator
+pyyaml
+requests
 shortuuid
+uvicorn
diff --git a/comps/guardrails/toxicity_detection/run.sh b/comps/guardrails/toxicity_detection/run.sh
diff --git a/comps/guardrails/toxicity_detection/toxicity_detection.py b/comps/guardrails/toxicity_detection/toxicity_detection.py
@@ -12,43 +12,35 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import os
-import pathlib
-import sys
-from datetime import datetime
-
-cur_path = pathlib.Path(__file__).parent.resolve()
-comps_path = os.path.join(cur_path, "../../../")
-sys.path.append(comps_path)
-
-import torch
-from fastapi.responses import StreamingResponse
 from langsmith import traceable
 
 # from utils import initialize_model
 from transformers import pipeline
 
-from comps import GeneratedDoc, LLMParamsDoc, ServiceType, opea_microservices, register_microservice
+from comps import ServiceType, TextDoc, opea_microservices, register_microservice
 
 
 @register_microservice(
     name="opea_service@toxicity_detection",
-    service_type=ServiceType.LLM,
+    service_type=ServiceType.GUARDRAIL,
     endpoint="/v1/toxicity",
     host="0.0.0.0",
     port=9091,
+    input_datatype=TextDoc,
+    output_datatype=TextDoc,
 )
 @traceable(run_type="llm")
-async def llm_generate(input: LLMParamsDoc):
-    input_query = input.query
-    model_name_or_path = "citizenlab/distilbert-base-multilingual-cased-toxicity"
-    toxicity_classifier = pipeline("text-classification", model=model_name_or_path, tokenizer=model_name_or_path)
-    toxic = toxicity_classifier(input_query)
+def llm_generate(input: TextDoc):
+    """Wrap input.text in a TextDoc, or return a policy-violation TextDoc when the top label is `toxic`."""
+    input_text = input.text
+    toxic = toxicity_pipeline(input_text)
     if toxic[0]["label"] == "toxic":
-        return f"\nI'm sorry, but your query or LLM's response is TOXIC with an score of {toxic[0]['score']:.2f} (0-1)!!!\n"
+        return TextDoc(text="Violated policies: toxicity, please check your input.", downstream_black_list=[".*"])
     else:
-        return input_query
+        return TextDoc(text=input_text)
 
 
 if __name__ == "__main__":
+    model = "citizenlab/distilbert-base-multilingual-cased-toxicity"
+    toxicity_pipeline = pipeline("text-classification", model=model, tokenizer=model)
     opea_microservices["opea_service@toxicity_detection"].start()