Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

tutorial: toxicity classifier #708

Merged
merged 20 commits into from
May 9, 2023
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions .github/PULL_REQUEST_TEMPLATE.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
<!--
👋 Thanks for submitting a Pull Request to EVA DB!

🙌 We want to make contributing to EVA DB as easy and transparent as possible. Here are a few tips to get you started:
Expand All @@ -9,7 +8,3 @@

👉 Please see our ✅ [Contributing Guide](https://evadb.readthedocs.io/en/stable/source/contribute/index.html) for more details.

🚨 Note that Copilot will summarize this PR below, do not modify the 'copilot:all' line.
-->

copilot:all
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ tutorials/bddtest.zip
tutorials/license.zip
license/
bddtest/
tutorials/*.jpg

# benchmark
.benchmarks
Expand Down
Binary file added data/detoxify/meme3.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
6 changes: 5 additions & 1 deletion eva/models/server/response.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from enum import Enum
from typing import Optional

from eva.executor.executor_utils import ExecutorError
from eva.models.storage.batch import Batch
from eva.utils.generic_utils import PickleSerializer

Expand Down Expand Up @@ -45,7 +46,10 @@ def deserialize(cls, data):
return obj

def as_df(self):
    """Return the response payload as a pandas DataFrame.

    Raises:
        ExecutorError: if the server reported an error for this response,
            or if the response carries no batch to convert.
    """
    # Surface a server-side error first; a response with an error may
    # legitimately have no batch. (The old ``assert self.batch is not None``
    # is dropped: it pre-empted the error check and vanishes under ``-O``.)
    if self.error is not None:
        raise ExecutorError(self.error)
    if self.batch is None:
        raise ExecutorError("Empty batch")
    return self.batch.frames

def __str__(self):
Expand Down
99 changes: 99 additions & 0 deletions eva/udfs/toxicity_classifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# coding=utf-8
# Copyright 2018-2022 EVA
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List

import pandas as pd
from transformers import (
AutoModelForSequenceClassification,
AutoTokenizer,
TextClassificationPipeline,
)

from eva.udfs.abstract.abstract_udf import AbstractClassifierUDF


class ToxicityClassifier(AbstractClassifierUDF):
    """UDF that labels input text as ``"toxic"`` or ``"not toxic"``.

    Combines two HuggingFace classifiers: an English RoBERTa toxicity
    model and a multilingual XLM-RoBERTa model. A row is flagged toxic
    when the English model predicts ``toxic``, or when the multilingual
    model predicts its toxic class (``LABEL_1``) with a confidence score
    above ``threshold``.

    Arguments:
        threshold (float): Threshold for classifier confidence score
    """

    @property
    def name(self) -> str:
        return "ToxicityClassifier"

    def setup(self, threshold=0.3):
        self.threshold = threshold
        model_path = "s-nlp/roberta_toxicity_classifier"
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
        mult_model_path = (
            "EIStakovskii/xlm_roberta_base_multilingual_toxicity_classifier_plus"
        )
        self.mult_tokenizer = AutoTokenizer.from_pretrained(mult_model_path)
        self.mult_model = AutoModelForSequenceClassification.from_pretrained(
            mult_model_path
        )
        # Build the inference pipelines once here; constructing them per
        # row inside forward() is needlessly expensive.
        self.pipeline = TextClassificationPipeline(
            model=self.model, tokenizer=self.tokenizer
        )
        self.mult_pipeline = TextClassificationPipeline(
            model=self.mult_model, tokenizer=self.mult_tokenizer
        )

    @property
    def labels(self) -> List[str]:
        return ["toxic", "not toxic"]

    def forward(self, text_dataframe: pd.DataFrame) -> pd.DataFrame:
        """
        Performs predictions on input text.

        Arguments:
            text_dataframe (pd.DataFrame): Dataframe whose first column
                holds, per row, a list of text fragments, e.g.::

                    ['example text 1', 'example text 2']
                    ['example text 3']

        Returns:
            pd.DataFrame: single ``labels`` column with one of
            ``"toxic"`` / ``"not toxic"`` per input row.
        """
        labels = []
        # len(df) counts rows; df.size counts all cells and would
        # over-run iat[] on a multi-column input.
        for row in range(len(text_dataframe)):
            # Each cell is a list of strings; flatten to one string.
            text = " ".join(text_dataframe.iat[row, 0])

            out = self.pipeline(text)
            multi_out = self.mult_pipeline(text)

            is_toxic = out[0]["label"] == "toxic" or (
                multi_out[0]["label"] == "LABEL_1"
                and multi_out[0]["score"] > self.threshold
            )
            labels.append("toxic" if is_toxic else "not toxic")

        # Build the result once instead of pd.concat per row (quadratic);
        # also guarantees the "labels" column exists for empty input.
        return pd.DataFrame({"labels": labels})
14 changes: 7 additions & 7 deletions script/formatting/pre-push.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@ if [ $exit_status -ne 0 ]; then
exit 1
fi

if ! git diff-index --quiet HEAD --; then
echo "Code was reformatted or you have unstaged changes."
echo "Please verify and stage the changes."
echo "List of files updated."
git --no-pager diff --name-only
exit 1
fi
# if ! git diff-index --quiet HEAD --; then
jarulraj marked this conversation as resolved.
Show resolved Hide resolved
# echo "Code was reformatted or you have unstaged changes."
# echo "Please verify and stage the changes."
# echo "List of files updated."
# git --no-pager diff --name-only
# exit 1
# fi

4 changes: 2 additions & 2 deletions script/test/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ if [[ "$OSTYPE" != "msys" ]];
then
if [[ "$MODE" = "TEST" || "$MODE" = "ALL" ]];
then
PYTHONPATH=./ pytest --durations=20 --cov-report term-missing:skip-covered --cov-config=.coveragerc --cov-context=test --cov=eva/ -s -v --log-level=WARNING -m "not benchmark"
PYTHONPATH=./ pytest --durations=20 --cov-report term-missing:skip-covered --cov-config=.coveragerc --cov-context=test --cov=eva/ --capture=sys --tb=short -v --log-level=WARNING -m "not benchmark"
elif [[ "$MODE" = "RAY" ]];
then
PYTHONPATH=./ pytest -s -v -p no:cov test/ -m "not benchmark"
Expand Down Expand Up @@ -109,7 +109,7 @@ fi

if [[ ( "$OSTYPE" != "msys" ) && ( "$MODE" = "NOTEBOOK" || "$MODE" = "ALL" ) ]];
then
PYTHONPATH=./ python -m pytest --durations=5 --nbmake --overwrite "./tutorials" -s -v --log-level=WARNING --nbmake-timeout=3000
PYTHONPATH=./ python -m pytest --durations=5 --nbmake --overwrite "./tutorials" --capture=sys --tb=short -v --log-level=WARNING --nbmake-timeout=3000
notebook_test_code=$?
if [ "$notebook_test_code" != "0" ];
then
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def read(path, encoding="utf-8"):
"ipython<8.13.0", # NOTEBOOKS
"thefuzz", # FUZZY STRING MATCHING
"ultralytics", # OBJECT DETECTION
"transformers==4.27.4", # HUGGINGFACE
"transformers>=4.27.4", # HUGGINGFACE
"openai>=0.27.4", # CHATGPT
]

Expand Down
96 changes: 96 additions & 0 deletions test/integration_tests/test_huggingface_udfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ def test_image_classification(self):

select_query = f"SELECT {udf_name}(data) FROM DETRAC WHERE id < 3;"
output = execute_query_fetch_all(select_query)
print("output: ", output)

# Test that output has 2 columns
self.assertEqual(len(output.frames.columns), 2)
Expand Down Expand Up @@ -254,3 +255,98 @@ def test_summarization_from_video(self):
execute_query_fetch_all(drop_udf_query)
drop_udf_query = f"DROP UDF {summary_udf};"
execute_query_fetch_all(drop_udf_query)

def test_toxicity_classification(self):
    """End-to-end check of a HuggingFace toxicity classifier UDF over CSV text."""
    udf_name = "HFToxicityClassifier"
    create_udf_query = f"""CREATE UDF {udf_name}
        TYPE HuggingFace
        'task' 'text-classification'
        'model' 'martin-ha/toxic-comment-model'
    """
    execute_query_fetch_all(create_udf_query)

    drop_table_query = """DROP TABLE IF EXISTS MyCSV;"""
    execute_query_fetch_all(drop_table_query)

    create_table_query = """CREATE TABLE MyCSV (
        id INTEGER UNIQUE,
        comment TEXT(30)
    );"""
    execute_query_fetch_all(create_table_query)

    load_table_query = f"""LOAD CSV '{self.csv_file_path}' INTO MyCSV;"""
    execute_query_fetch_all(load_table_query)

    select_query = f"SELECT {udf_name}(comment) FROM MyCSV;"
    output = execute_query_fetch_all(select_query)

    # Test that output has 2 columns.
    self.assertEqual(len(output.frames.columns), 2)

    # Hoist the column names; the UDF output columns are lower-cased.
    label_column = udf_name.lower() + ".label"
    score_column = udf_name.lower() + ".score"

    # Test that the label column exists and every entry is one of the
    # model's labels, "toxic" or "non-toxic".
    self.assertTrue(label_column in output.frames.columns)
    self.assertTrue(
        all(x in ["non-toxic", "toxic"] for x in output.frames[label_column])
    )

    # Test that the score column exists and each entry is a float.
    self.assertTrue(score_column in output.frames.columns)
    self.assertTrue(all(isinstance(x, float) for x in output.frames[score_column]))

    drop_udf_query = f"DROP UDF {udf_name};"
    execute_query_fetch_all(drop_udf_query)

def test_multilingual_toxicity_classification(self):
    """End-to-end check of a multilingual toxicity classifier UDF over CSV text."""
    udf_name = "HFMultToxicityClassifier"
    create_udf_query = f"""CREATE UDF {udf_name}
        TYPE HuggingFace
        'task' 'text-classification'
        'model' 'EIStakovskii/xlm_roberta_base_multilingual_toxicity_classifier_plus'
    """
    execute_query_fetch_all(create_udf_query)

    drop_table_query = """DROP TABLE IF EXISTS MyCSV;"""
    execute_query_fetch_all(drop_table_query)

    create_table_query = """CREATE TABLE MyCSV (
        id INTEGER UNIQUE,
        comment TEXT(30)
    );"""
    execute_query_fetch_all(create_table_query)

    load_table_query = f"""LOAD CSV '{self.csv_file_path}' INTO MyCSV;"""
    execute_query_fetch_all(load_table_query)

    select_query = f"SELECT {udf_name}(comment) FROM MyCSV;"
    output = execute_query_fetch_all(select_query)

    # Test that output has 2 columns.
    self.assertEqual(len(output.frames.columns), 2)

    # Hoist the column names; the UDF output columns are lower-cased.
    label_column = udf_name.lower() + ".label"
    score_column = udf_name.lower() + ".score"

    # Test that the label column exists and every entry is one of this
    # model's raw labels, "LABEL_0" (non-toxic) or "LABEL_1" (toxic).
    self.assertTrue(label_column in output.frames.columns)
    self.assertTrue(
        all(x in ["LABEL_1", "LABEL_0"] for x in output.frames[label_column])
    )

    # Test that the score column exists and each entry is a float.
    self.assertTrue(score_column in output.frames.columns)
    self.assertTrue(all(isinstance(x, float) for x in output.frames[score_column]))

    drop_udf_query = f"DROP UDF {udf_name};"
    execute_query_fetch_all(drop_udf_query)
2 changes: 2 additions & 0 deletions test/integration_tests/test_pytorch.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,15 @@ def setUpClass(cls):
asl_actions = f"{EVA_ROOT_DIR}/data/actions/computer_asl.mp4"
meme1 = f"{EVA_ROOT_DIR}/data/detoxify/meme1.jpg"
meme2 = f"{EVA_ROOT_DIR}/data/detoxify/meme2.jpg"
meme3 = f"{EVA_ROOT_DIR}/data/detoxify/meme3.jpg"

execute_query_fetch_all(f"LOAD VIDEO '{ua_detrac}' INTO MyVideo;")
execute_query_fetch_all(f"LOAD VIDEO '{mnist}' INTO MNIST;")
execute_query_fetch_all(f"LOAD VIDEO '{actions}' INTO Actions;")
execute_query_fetch_all(f"LOAD VIDEO '{asl_actions}' INTO Asl_actions;")
execute_query_fetch_all(f"LOAD IMAGE '{meme1}' INTO MemeImages;")
execute_query_fetch_all(f"LOAD IMAGE '{meme2}' INTO MemeImages;")
execute_query_fetch_all(f"LOAD IMAGE '{meme3}' INTO MemeImages;")
load_udfs_for_testing()

@classmethod
Expand Down
19 changes: 14 additions & 5 deletions tutorials/00-start-eva-server.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,22 @@
"execution_count": 1,
"metadata": {
"execution": {
"iopub.execute_input": "2023-04-30T15:16:35.543397Z",
"iopub.status.busy": "2023-04-30T15:16:35.542857Z",
"iopub.status.idle": "2023-04-30T15:16:49.079780Z",
"shell.execute_reply": "2023-04-30T15:16:49.077653Z"
"iopub.execute_input": "2023-05-08T04:56:35.979870Z",
"iopub.status.busy": "2023-05-08T04:56:35.979421Z",
"iopub.status.idle": "2023-05-08T04:56:50.439757Z",
"shell.execute_reply": "2023-05-08T04:56:50.438079Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.2\u001b[0m\r\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\r\n"
]
},
{
"name": "stdout",
"output_type": "stream",
Expand Down Expand Up @@ -133,7 +142,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.10.8"
},
"vscode": {
"interpreter": {
Expand Down
Loading