From f6f620a2db8d75001747e541b3a27cf2c8f06c29 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20De=20Le=C3=B3n?= <111013930+daniel-de-leon-user293@users.noreply.github.com>
Date: Fri, 27 Sep 2024 11:37:45 -0700
Subject: [PATCH] Add Intel/toxic-prompt-roberta to toxicity detection
 microservice (#749)

* add toxic roberta model

Signed-off-by: Daniel Deleon

* update README

Signed-off-by: Daniel Deleon

---------

Signed-off-by: Daniel Deleon
---
 comps/guardrails/toxicity_detection/README.md             | 8 +++-----
 comps/guardrails/toxicity_detection/toxicity_detection.py | 4 ++--
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/comps/guardrails/toxicity_detection/README.md b/comps/guardrails/toxicity_detection/README.md
index 1024969bf..b13d68c5c 100644
--- a/comps/guardrails/toxicity_detection/README.md
+++ b/comps/guardrails/toxicity_detection/README.md
@@ -4,11 +4,9 @@
 
 Toxicity Detection Microservice allows AI Application developers to safeguard user input and LLM output from harmful language in a RAG environment. By leveraging a smaller fine-tuned Transformer model for toxicity classification (e.g. DistilledBERT, RoBERTa, etc.), we maintain a lightweight guardrails microservice without significantly sacrificing performance making it readily deployable on both Intel Gaudi and Xeon.
 
-Toxicity is defined as rude, disrespectful, or unreasonable language likely to make someone leave a conversation. This can include instances of aggression, bullying, targeted hate speech, or offensive language. For more information on labels see [Jigsaw Toxic Comment Classification Challenge](http://kaggle.com/c/jigsaw-toxic-comment-classification-challenge).
-
-## Future Development
+This microservice uses [`Intel/toxic-prompt-roberta`](https://huggingface.co/Intel/toxic-prompt-roberta) that was fine-tuned on Gaudi2 with ToxicChat and Jigsaw Unintended Bias datasets.
 
-- Add a RoBERTa (125M params) toxicity model fine-tuned on Gaudi2 with ToxicChat and Jigsaw dataset in an optimized serving framework.
+Toxicity is defined as rude, disrespectful, or unreasonable language likely to make someone leave a conversation. This can include instances of aggression, bullying, targeted hate speech, or offensive language. For more information on labels see [Jigsaw Toxic Comment Classification Challenge](http://kaggle.com/c/jigsaw-toxic-comment-classification-challenge).
 
 ## 🚀1. Start Microservice with Python(Option 1)
 
@@ -65,7 +63,7 @@ curl localhost:9091/v1/toxicity
 Example Output:
 
 ```bash
-"\nI'm sorry, but your query or LLM's response is TOXIC with an score of 0.97 (0-1)!!!\n"
+"Violated policies: toxicity, please check your input."
 ```
 
 **Python Script:**
diff --git a/comps/guardrails/toxicity_detection/toxicity_detection.py b/comps/guardrails/toxicity_detection/toxicity_detection.py
index df965505f..711945ab6 100644
--- a/comps/guardrails/toxicity_detection/toxicity_detection.py
+++ b/comps/guardrails/toxicity_detection/toxicity_detection.py
@@ -19,13 +19,13 @@ def llm_generate(input: TextDoc):
     input_text = input.text
     toxic = toxicity_pipeline(input_text)
     print("done")
-    if toxic[0]["label"] == "toxic":
+    if toxic[0]["label"].lower() == "toxic":
         return TextDoc(text="Violated policies: toxicity, please check your input.", downstream_black_list=[".*"])
     else:
         return TextDoc(text=input_text)
 
 
 if __name__ == "__main__":
-    model = "citizenlab/distilbert-base-multilingual-cased-toxicity"
+    model = "Intel/toxic-prompt-roberta"
     toxicity_pipeline = pipeline("text-classification", model=model, tokenizer=model)
     opea_microservices["opea_service@toxicity_detection"].start()
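As a quick way to exercise the new model outside the microservice wrapper, here is a minimal sketch of the classification logic the patched `toxicity_detection.py` relies on. It assumes the `transformers` package is installed and that `Intel/toxic-prompt-roberta` can be downloaded from the Hugging Face Hub; the `is_toxic` helper and the sample input are illustrative and do not appear in the patch.

```python
from transformers import pipeline

MODEL_ID = "Intel/toxic-prompt-roberta"

# Same text-classification pipeline construction as the patched __main__ block.
toxicity_pipeline = pipeline("text-classification", model=MODEL_ID, tokenizer=MODEL_ID)


def is_toxic(text: str) -> bool:
    """Return True when the classifier labels the text as toxic.

    Mirrors the case-insensitive label check introduced by this patch.
    """
    result = toxicity_pipeline(text)[0]  # e.g. {"label": "toxic", "score": 0.97}
    return result["label"].lower() == "toxic"


if __name__ == "__main__":
    # Illustrative input; any string can be passed to the classifier.
    print(is_toxic("Have a wonderful day!"))
```

The added `.lower()` comparison appears intended to guard against differences in label casing between the new model and the previous `citizenlab` model, so the guardrail still triggers however the label is capitalized.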