From f6f620a2db8d75001747e541b3a27cf2c8f06c29 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20De=20Le=C3=B3n?= <111013930+daniel-de-leon-user293@users.noreply.github.com>
Date: Fri, 27 Sep 2024 11:37:45 -0700
Subject: [PATCH] Add Intel/toxic-prompt-roberta to toxicity detection
 microservice (#749)

* add toxic roberta model

Signed-off-by: Daniel Deleon

* update README

Signed-off-by: Daniel Deleon

---------

Signed-off-by: Daniel Deleon
---
 comps/guardrails/toxicity_detection/README.md             | 8 +++-----
 comps/guardrails/toxicity_detection/toxicity_detection.py | 4 ++--
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/comps/guardrails/toxicity_detection/README.md b/comps/guardrails/toxicity_detection/README.md
index 1024969bf..b13d68c5c 100644
--- a/comps/guardrails/toxicity_detection/README.md
+++ b/comps/guardrails/toxicity_detection/README.md
@@ -4,11 +4,9 @@
 
 Toxicity Detection Microservice allows AI Application developers to safeguard user input and LLM output from harmful language in a RAG environment. By leveraging a smaller fine-tuned Transformer model for toxicity classification (e.g. DistilledBERT, RoBERTa, etc.), we maintain a lightweight guardrails microservice without significantly sacrificing performance making it readily deployable on both Intel Gaudi and Xeon.
 
-Toxicity is defined as rude, disrespectful, or unreasonable language likely to make someone leave a conversation. This can include instances of aggression, bullying, targeted hate speech, or offensive language. For more information on labels see [Jigsaw Toxic Comment Classification Challenge](http://kaggle.com/c/jigsaw-toxic-comment-classification-challenge).
-
-## Future Development
+This microservice uses [`Intel/toxic-prompt-roberta`](https://huggingface.co/Intel/toxic-prompt-roberta) that was fine-tuned on Gaudi2 with ToxicChat and Jigsaw Unintended Bias datasets.
 
-- Add a RoBERTa (125M params) toxicity model fine-tuned on Gaudi2 with ToxicChat and Jigsaw dataset in an optimized serving framework.
+Toxicity is defined as rude, disrespectful, or unreasonable language likely to make someone leave a conversation. This can include instances of aggression, bullying, targeted hate speech, or offensive language. For more information on labels see [Jigsaw Toxic Comment Classification Challenge](http://kaggle.com/c/jigsaw-toxic-comment-classification-challenge).
 
 ## 🚀1. Start Microservice with Python(Option 1)
 
@@ -65,7 +63,7 @@ curl localhost:9091/v1/toxicity
 Example Output:
 
 ```bash
-"\nI'm sorry, but your query or LLM's response is TOXIC with an score of 0.97 (0-1)!!!\n"
+"Violated policies: toxicity, please check your input."
 ```
 
 **Python Script:**
diff --git a/comps/guardrails/toxicity_detection/toxicity_detection.py b/comps/guardrails/toxicity_detection/toxicity_detection.py
index df965505f..711945ab6 100644
--- a/comps/guardrails/toxicity_detection/toxicity_detection.py
+++ b/comps/guardrails/toxicity_detection/toxicity_detection.py
@@ -19,13 +19,13 @@ def llm_generate(input: TextDoc):
     input_text = input.text
     toxic = toxicity_pipeline(input_text)
     print("done")
-    if toxic[0]["label"] == "toxic":
+    if toxic[0]["label"].lower() == "toxic":
         return TextDoc(text="Violated policies: toxicity, please check your input.", downstream_black_list=[".*"])
     else:
         return TextDoc(text=input_text)
 
 
 if __name__ == "__main__":
-    model = "citizenlab/distilbert-base-multilingual-cased-toxicity"
+    model = "Intel/toxic-prompt-roberta"
     toxicity_pipeline = pipeline("text-classification", model=model, tokenizer=model)
     opea_microservices["opea_service@toxicity_detection"].start()
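As a quick way to exercise the new model outside the microservice wrapper, here is a minimal sketch of the classification logic the patched `toxicity_detection.py` relies on. It assumes the `transformers` package is installed and that `Intel/toxic-prompt-roberta` can be downloaded from the Hugging Face Hub; the `is_toxic` helper and the sample input are illustrative and do not appear in the patch.

```python
from transformers import pipeline

MODEL_ID = "Intel/toxic-prompt-roberta"

# Same text-classification pipeline construction as the patched __main__ block.
toxicity_pipeline = pipeline("text-classification", model=MODEL_ID, tokenizer=MODEL_ID)


def is_toxic(text: str) -> bool:
    """Return True when the classifier labels the text as toxic.

    Mirrors the case-insensitive label check introduced by this patch.
    """
    result = toxicity_pipeline(text)[0]  # e.g. {"label": "toxic", "score": 0.97}
    return result["label"].lower() == "toxic"


if __name__ == "__main__":
    # Illustrative input; any string can be passed to the classifier.
    print(is_toxic("Have a wonderful day!"))
```

The added `.lower()` comparison appears intended to guard against differences in label casing between the new model and the previous `citizenlab` model, so the guardrail still triggers however the label is capitalized.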