Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

tutorial: toxicity classifier #708

Merged
merged 20 commits into from
May 9, 2023
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions .github/PULL_REQUEST_TEMPLATE.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
<!--
👋 Thanks for submitting a Pull Request to EVA DB!

🙌 We want to make contributing to EVA DB as easy and transparent as possible. Here are a few tips to get you started:
Expand All @@ -9,7 +8,3 @@

👉 Please see our ✅ [Contributing Guide](https://evadb.readthedocs.io/en/stable/source/contribute/index.html) for more details.

🚨 Note that Copilot will summarize this PR below, do not modify the 'copilot:all' line.
-->

copilot:all
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ tutorials/bddtest.zip
tutorials/license.zip
license/
bddtest/
tutorials/*.jpg

# benchmark
.benchmarks
Expand Down
Binary file added data/detoxify/meme3.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
6 changes: 5 additions & 1 deletion eva/models/server/response.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from enum import Enum
from typing import Optional

from eva.executor.executor_utils import ExecutorError
from eva.models.storage.batch import Batch
from eva.utils.generic_utils import PickleSerializer

Expand Down Expand Up @@ -45,7 +46,10 @@ def deserialize(cls, data):
return obj

def as_df(self):
    """Return the response payload as a pandas DataFrame.

    Raises:
        ExecutorError: if the server reported an error for this response,
            or if the response carries no batch to convert.
    """
    # Surface a server-side error first; a response with an error may
    # legitimately have no batch. (The old ``assert self.batch is not None``
    # is dropped: it pre-empted the error check and vanishes under ``-O``.)
    if self.error is not None:
        raise ExecutorError(self.error)
    if self.batch is None:
        raise ExecutorError("Empty batch")
    return self.batch.frames

def __str__(self):
Expand Down
99 changes: 99 additions & 0 deletions eva/udfs/toxicity_classifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# coding=utf-8
# Copyright 2018-2022 EVA
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List

import pandas as pd
from transformers import (
AutoModelForSequenceClassification,
AutoTokenizer,
TextClassificationPipeline,
)

from eva.udfs.abstract.abstract_udf import AbstractClassifierUDF


class ToxicityClassifier(AbstractClassifierUDF):
    """UDF that labels input text as ``"toxic"`` or ``"not toxic"``.

    Combines two HuggingFace classifiers: an English RoBERTa toxicity
    model and a multilingual XLM-RoBERTa model. A row is flagged toxic
    when the English model predicts ``toxic``, or when the multilingual
    model predicts its toxic class (``LABEL_1``) with a confidence score
    above ``threshold``.

    Arguments:
        threshold (float): Threshold for classifier confidence score
    """

    @property
    def name(self) -> str:
        return "ToxicityClassifier"

    def setup(self, threshold=0.3):
        self.threshold = threshold
        model_path = "s-nlp/roberta_toxicity_classifier"
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
        mult_model_path = (
            "EIStakovskii/xlm_roberta_base_multilingual_toxicity_classifier_plus"
        )
        self.mult_tokenizer = AutoTokenizer.from_pretrained(mult_model_path)
        self.mult_model = AutoModelForSequenceClassification.from_pretrained(
            mult_model_path
        )
        # Build the inference pipelines once here; constructing them per
        # row inside forward() is needlessly expensive.
        self.pipeline = TextClassificationPipeline(
            model=self.model, tokenizer=self.tokenizer
        )
        self.mult_pipeline = TextClassificationPipeline(
            model=self.mult_model, tokenizer=self.mult_tokenizer
        )

    @property
    def labels(self) -> List[str]:
        return ["toxic", "not toxic"]

    def forward(self, text_dataframe: pd.DataFrame) -> pd.DataFrame:
        """
        Performs predictions on input text.

        Arguments:
            text_dataframe (pd.DataFrame): Dataframe whose first column
                holds, per row, a list of text fragments, e.g.::

                    ['example text 1', 'example text 2']
                    ['example text 3']

        Returns:
            pd.DataFrame: single ``labels`` column with one of
            ``"toxic"`` / ``"not toxic"`` per input row.
        """
        labels = []
        # len(df) counts rows; df.size counts all cells and would
        # over-run iat[] on a multi-column input.
        for row in range(len(text_dataframe)):
            # Each cell is a list of strings; flatten to one string.
            text = " ".join(text_dataframe.iat[row, 0])

            out = self.pipeline(text)
            multi_out = self.mult_pipeline(text)

            is_toxic = out[0]["label"] == "toxic" or (
                multi_out[0]["label"] == "LABEL_1"
                and multi_out[0]["score"] > self.threshold
            )
            labels.append("toxic" if is_toxic else "not toxic")

        # Build the result once instead of pd.concat per row (quadratic);
        # also guarantees the "labels" column exists for empty input.
        return pd.DataFrame({"labels": labels})
14 changes: 7 additions & 7 deletions script/formatting/pre-push.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@ if [ $exit_status -ne 0 ]; then
exit 1
fi

if ! git diff-index --quiet HEAD --; then
echo "Code was reformatted or you have unstaged changes."
echo "Please verify and stage the changes."
echo "List of files updated."
git --no-pager diff --name-only
exit 1
fi
# if ! git diff-index --quiet HEAD --; then
jarulraj marked this conversation as resolved.
Show resolved Hide resolved
# echo "Code was reformatted or you have unstaged changes."
# echo "Please verify and stage the changes."
# echo "List of files updated."
# git --no-pager diff --name-only
# exit 1
# fi

4 changes: 2 additions & 2 deletions script/test/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ if [[ "$OSTYPE" != "msys" ]];
then
if [[ "$MODE" = "TEST" || "$MODE" = "ALL" ]];
then
PYTHONPATH=./ pytest --durations=20 --cov-report term-missing:skip-covered --cov-config=.coveragerc --cov-context=test --cov=eva/ -s -v --log-level=WARNING -m "not benchmark"
PYTHONPATH=./ pytest --durations=20 --cov-report term-missing:skip-covered --cov-config=.coveragerc --cov-context=test --cov=eva/ --capture=sys --tb=short -v --log-level=WARNING -m "not benchmark"
elif [[ "$MODE" = "RAY" ]];
then
PYTHONPATH=./ pytest -s -v -p no:cov test/ -m "not benchmark"
Expand Down Expand Up @@ -109,7 +109,7 @@ fi

if [[ ( "$OSTYPE" != "msys" ) && ( "$MODE" = "NOTEBOOK" || "$MODE" = "ALL" ) ]];
then
PYTHONPATH=./ python -m pytest --durations=5 --nbmake --overwrite "./tutorials" -s -v --log-level=WARNING --nbmake-timeout=3000
PYTHONPATH=./ python -m pytest --durations=5 --nbmake --overwrite "./tutorials" --capture=sys --tb=short -v --log-level=WARNING --nbmake-timeout=3000
notebook_test_code=$?
if [ "$notebook_test_code" != "0" ];
then
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def read(path, encoding="utf-8"):
"ipython<8.13.0", # NOTEBOOKS
"thefuzz", # FUZZY STRING MATCHING
"ultralytics", # OBJECT DETECTION
"transformers==4.27.4", # HUGGINGFACE
"transformers>=4.27.4", # HUGGINGFACE
"openai>=0.27.4", # CHATGPT
]

Expand Down
96 changes: 96 additions & 0 deletions test/integration_tests/test_huggingface_udfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ def test_image_classification(self):

select_query = f"SELECT {udf_name}(data) FROM DETRAC WHERE id < 3;"
output = execute_query_fetch_all(select_query)
print("output: ", output)

# Test that output has 2 columns
self.assertEqual(len(output.frames.columns), 2)
Expand Down Expand Up @@ -254,3 +255,98 @@ def test_summarization_from_video(self):
execute_query_fetch_all(drop_udf_query)
drop_udf_query = f"DROP UDF {summary_udf};"
execute_query_fetch_all(drop_udf_query)

def test_toxicity_classification(self):
    """End-to-end check of a HuggingFace toxicity classifier UDF over CSV text."""
    udf_name = "HFToxicityClassifier"
    create_udf_query = f"""CREATE UDF {udf_name}
        TYPE HuggingFace
        'task' 'text-classification'
        'model' 'martin-ha/toxic-comment-model'
    """
    execute_query_fetch_all(create_udf_query)

    drop_table_query = """DROP TABLE IF EXISTS MyCSV;"""
    execute_query_fetch_all(drop_table_query)

    create_table_query = """CREATE TABLE MyCSV (
        id INTEGER UNIQUE,
        comment TEXT(30)
    );"""
    execute_query_fetch_all(create_table_query)

    load_table_query = f"""LOAD CSV '{self.csv_file_path}' INTO MyCSV;"""
    execute_query_fetch_all(load_table_query)

    select_query = f"SELECT {udf_name}(comment) FROM MyCSV;"
    output = execute_query_fetch_all(select_query)

    # Test that output has 2 columns.
    self.assertEqual(len(output.frames.columns), 2)

    # Hoist the column names; the UDF output columns are lower-cased.
    label_column = udf_name.lower() + ".label"
    score_column = udf_name.lower() + ".score"

    # Test that the label column exists and every entry is one of the
    # model's labels, "toxic" or "non-toxic".
    self.assertTrue(label_column in output.frames.columns)
    self.assertTrue(
        all(x in ["non-toxic", "toxic"] for x in output.frames[label_column])
    )

    # Test that the score column exists and each entry is a float.
    self.assertTrue(score_column in output.frames.columns)
    self.assertTrue(all(isinstance(x, float) for x in output.frames[score_column]))

    drop_udf_query = f"DROP UDF {udf_name};"
    execute_query_fetch_all(drop_udf_query)

def test_multilingual_toxicity_classification(self):
    """End-to-end check of a multilingual toxicity classifier UDF over CSV text."""
    udf_name = "HFMultToxicityClassifier"
    create_udf_query = f"""CREATE UDF {udf_name}
        TYPE HuggingFace
        'task' 'text-classification'
        'model' 'EIStakovskii/xlm_roberta_base_multilingual_toxicity_classifier_plus'
    """
    execute_query_fetch_all(create_udf_query)

    drop_table_query = """DROP TABLE IF EXISTS MyCSV;"""
    execute_query_fetch_all(drop_table_query)

    create_table_query = """CREATE TABLE MyCSV (
        id INTEGER UNIQUE,
        comment TEXT(30)
    );"""
    execute_query_fetch_all(create_table_query)

    load_table_query = f"""LOAD CSV '{self.csv_file_path}' INTO MyCSV;"""
    execute_query_fetch_all(load_table_query)

    select_query = f"SELECT {udf_name}(comment) FROM MyCSV;"
    output = execute_query_fetch_all(select_query)

    # Test that output has 2 columns.
    self.assertEqual(len(output.frames.columns), 2)

    # Hoist the column names; the UDF output columns are lower-cased.
    label_column = udf_name.lower() + ".label"
    score_column = udf_name.lower() + ".score"

    # Test that the label column exists and every entry is one of this
    # model's raw labels, "LABEL_0" (non-toxic) or "LABEL_1" (toxic).
    self.assertTrue(label_column in output.frames.columns)
    self.assertTrue(
        all(x in ["LABEL_1", "LABEL_0"] for x in output.frames[label_column])
    )

    # Test that the score column exists and each entry is a float.
    self.assertTrue(score_column in output.frames.columns)
    self.assertTrue(all(isinstance(x, float) for x in output.frames[score_column]))

    drop_udf_query = f"DROP UDF {udf_name};"
    execute_query_fetch_all(drop_udf_query)
2 changes: 2 additions & 0 deletions test/integration_tests/test_pytorch.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,15 @@ def setUpClass(cls):
asl_actions = f"{EVA_ROOT_DIR}/data/actions/computer_asl.mp4"
meme1 = f"{EVA_ROOT_DIR}/data/detoxify/meme1.jpg"
meme2 = f"{EVA_ROOT_DIR}/data/detoxify/meme2.jpg"
meme3 = f"{EVA_ROOT_DIR}/data/detoxify/meme3.jpg"

execute_query_fetch_all(f"LOAD VIDEO '{ua_detrac}' INTO MyVideo;")
execute_query_fetch_all(f"LOAD VIDEO '{mnist}' INTO MNIST;")
execute_query_fetch_all(f"LOAD VIDEO '{actions}' INTO Actions;")
execute_query_fetch_all(f"LOAD VIDEO '{asl_actions}' INTO Asl_actions;")
execute_query_fetch_all(f"LOAD IMAGE '{meme1}' INTO MemeImages;")
execute_query_fetch_all(f"LOAD IMAGE '{meme2}' INTO MemeImages;")
execute_query_fetch_all(f"LOAD IMAGE '{meme3}' INTO MemeImages;")
load_udfs_for_testing()

@classmethod
Expand Down
19 changes: 14 additions & 5 deletions tutorials/00-start-eva-server.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,22 @@
"execution_count": 1,
"metadata": {
"execution": {
"iopub.execute_input": "2023-04-30T15:16:35.543397Z",
"iopub.status.busy": "2023-04-30T15:16:35.542857Z",
"iopub.status.idle": "2023-04-30T15:16:49.079780Z",
"shell.execute_reply": "2023-04-30T15:16:49.077653Z"
"iopub.execute_input": "2023-05-08T04:56:35.979870Z",
"iopub.status.busy": "2023-05-08T04:56:35.979421Z",
"iopub.status.idle": "2023-05-08T04:56:50.439757Z",
"shell.execute_reply": "2023-05-08T04:56:50.438079Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.2\u001b[0m\r\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\r\n"
]
},
{
"name": "stdout",
"output_type": "stream",
Expand Down Expand Up @@ -133,7 +142,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.10.8"
},
"vscode": {
"interpreter": {
Expand Down
Loading