tutorial: toxicity classifier (#708)

- [x] Added a formatting function check (in formatter.py) for notebooks (checks Colab links, empty cells, etc.) --------- Co-authored-by: hhh21u <[email protected]>
georgia-tech-db · May 9, 2023 · de4628a · de4628a
1 parent bfdc756
commit de4628a
Show file tree

Hide file tree

Showing 20 changed files with 3,026 additions and 766 deletions.
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
@@ -1,4 +1,3 @@
-<!--
 👋 Thanks for submitting a Pull Request to EVA DB!
 
 🙌 We want to make contributing to EVA DB as easy and transparent as possible. Here are a few tips to get you started:
@@ -9,7 +8,3 @@
 
 👉 Please see our ✅ [Contributing Guide](https://evadb.readthedocs.io/en/stable/source/contribute/index.html) for more details.
 
-🚨 Note that Copilot will summarize this PR below, do not modify the 'copilot:all' line.
--->
-
-copilot:all
diff --git a/.gitignore b/.gitignore
@@ -183,6 +183,7 @@ tutorials/bddtest.zip
 tutorials/license.zip
 license/
 bddtest/
+tutorials/*.jpg
 
 # benchmark
 .benchmarks

diff --git a/eva/executor/create_executor.py b/eva/executor/create_executor.py
@@ -17,6 +17,7 @@
 from eva.executor.executor_utils import handle_if_not_exists
 from eva.plan_nodes.create_plan import CreatePlan
 from eva.storage.storage_engine import StorageEngine
+from eva.utils.logging_manager import logger
 
 
 class CreateExecutor(AbstractExecutor):
@@ -26,6 +27,7 @@ def __init__(self, node: CreatePlan):
 
     def exec(self, *args, **kwargs):
         if not handle_if_not_exists(self.node.table_info, self.node.if_not_exists):
+            logger.debug(f"Creating table {self.node.table_info}")
             catalog_entry = self.catalog.create_and_insert_table_catalog_entry(
                 self.node.table_info, self.node.column_list
             )

diff --git a/eva/executor/drop_executor.py b/eva/executor/drop_executor.py
@@ -54,6 +54,7 @@ def exec(self, *args, **kwargs):
         )
         storage_engine = StorageEngine.factory(table_obj)
 
+        logger.debug(f"Dropping table {table_info}")
         storage_engine.drop(table=table_obj)
 
         for col_obj in table_obj.columns:

diff --git a/eva/executor/executor_utils.py b/eva/executor/executor_utils.py
@@ -46,6 +46,7 @@ def apply_predicate(batch: Batch, predicate: AbstractExpression) -> Batch:
 
 
 def handle_if_not_exists(table_info: TableInfo, if_not_exist=False):
+    # Table exists
     if CatalogManager().check_table_exists(
         table_info.table_name,
         table_info.database_name,
@@ -57,6 +58,7 @@ def handle_if_not_exists(table_info: TableInfo, if_not_exist=False):
         else:
             logger.error(err_msg)
             raise ExecutorError(err_msg)
+    # Table does not exist
     else:
         return False
 

diff --git a/eva/models/server/response.py b/eva/models/server/response.py
@@ -16,6 +16,7 @@
 from enum import Enum
 from typing import Optional
 
+from eva.executor.executor_utils import ExecutorError
 from eva.models.storage.batch import Batch
 from eva.utils.generic_utils import PickleSerializer
 
@@ -45,7 +46,10 @@ def deserialize(cls, data):
         return obj
 
     def as_df(self):
-        assert self.batch is not None, "Response is empty"
+        if self.error is not None:
+            raise ExecutorError(self.error)
+        if self.batch is None:
+            raise ExecutorError("Empty batch")
         return self.batch.frames
 
     def __str__(self):

diff --git a/script/formatting/formatter.py b/script/formatting/formatter.py
@@ -21,6 +21,8 @@
 import sys
 from pathlib import Path
 import asyncio
+import nbformat
+from nbformat.v4 import new_notebook, new_markdown_cell, new_code_cell
 
 import pkg_resources
 
@@ -48,6 +50,7 @@ def wrapped(*args, **kwargs):
 EVA_SRC_DIR = os.path.join(EVA_DIR, "eva")
 EVA_TEST_DIR = os.path.join(EVA_DIR, "test")
 EVA_SCRIPT_DIR = os.path.join(EVA_DIR, "script")
+EVA_NOTEBOOKS_DIR = os.path.join(EVA_DIR, "tutorials")
 
 FORMATTING_DIR = os.path.join(EVA_SCRIPT_DIR, "formatting")
 PYLINTRC = os.path.join(FORMATTING_DIR, "pylintrc")
@@ -206,6 +209,54 @@ def format_file(file_path, add_header, strip_header, format_code):
 
 # END FORMAT__FILE(FILE_NAME)
 
+# check the notebooks
+def check_notebook_format(notebook_file):
+    """Validate the structure of a tutorial notebook.
+
+    The checks run in this order, and the first failure aborts the process:
+      1. the notebook has at least one cell;
+      2. every cell type is one of code, markdown, or raw;
+      3. no code cell is empty (whitespace only counts as empty);
+      4. no code cell contains ``print(response)``;
+      5. at least one markdown cell mentions "colab" and also contains the
+         notebook's own file name.
+
+    Args:
+        notebook_file: path to the ``.ipynb`` file to check.
+
+    Returns:
+        True when the notebook passes every check, or when it is the
+        explicitly skipped ``ignore_tag.ipynb``.
+
+    Raises:
+        SystemExit: with exit code 1 as soon as any check fails.
+    """
+    notebook_file_name = os.path.basename(notebook_file)
+
+    # Ignore this notebook
+    if notebook_file_name == "ignore_tag.ipynb":
+        return True
+
+    # Notebook files are UTF-8 JSON; do not depend on the locale's default encoding.
+    with open(notebook_file, encoding="utf-8") as f:
+        nb = nbformat.read(f, as_version=4)
+
+    # Check that the notebook contains at least one cell
+    if not nb.cells:
+        LOG.error(f"ERROR: Notebook {notebook_file} has no cells")
+        sys.exit(1)
+
+    # Check that all cells have a valid cell type (code, markdown, or raw)
+    for cell in nb.cells:
+        if cell.cell_type not in ("code", "markdown", "raw"):
+            LOG.error(
+                f"ERROR: Notebook {notebook_file} contains an invalid cell type: "
+                f"{cell.cell_type}"
+            )
+            sys.exit(1)
+
+    code_cells = [cell for cell in nb.cells if cell.cell_type == "code"]
+
+    # Check that all code cells have a non-empty source code
+    for cell in code_cells:
+        if not cell.source.strip():
+            LOG.error(f"ERROR: Notebook {notebook_file} contains an empty code cell")
+            sys.exit(1)
+
+    # Check for "print(response)"
+    for cell in code_cells:
+        if "print(response)" in cell.source:
+            LOG.error(
+                f"ERROR: Notebook {notebook_file} contains a cell with this content: "
+                f"{cell.source}"
+            )
+            sys.exit(1)
+
+    # Check for "Colab link": the link only counts when the markdown cell that
+    # mentions colab also names this notebook file.
+    contains_colab_link = any(
+        cell.cell_type == "markdown"
+        and "colab" in cell.source
+        and notebook_file_name in cell.source
+        for cell in nb.cells
+    )
+
+    if not contains_colab_link:
+        # Report the reason before exiting so the failure is not silent.
+        LOG.error(
+            f"ERROR: Notebook {notebook_file} does not contain a Colab link "
+            f"that references {notebook_file_name}"
+        )
+        sys.exit(1)
+
+    return True
 
 # format all the files in the dir passed as argument
 def format_dir(dir_path, add_header, strip_header, format_code):
@@ -325,4 +376,13 @@ def check_file(file):
         )
 
         for file in files:
-            check_file(file)
+            check_file(file)
+
+    # CHECK ALL THE NOTEBOOKS
+
+    # Iterate over all files in the directory 
+    # and check if they are Jupyter notebooks
+    for file in os.listdir(EVA_NOTEBOOKS_DIR):
+        if file.endswith(".ipynb"):
+            notebook_file = os.path.join(EVA_NOTEBOOKS_DIR, file)
+            check_notebook_format(notebook_file)
diff --git a/script/test/test.sh b/script/test/test.sh
@@ -76,7 +76,7 @@ if [[ "$OSTYPE" != "msys" ]];
 then
     if [[ "$MODE" = "TEST" || "$MODE" = "ALL" ]];
     then
-        PYTHONPATH=./ pytest --durations=20 --cov-report term-missing:skip-covered  --cov-config=.coveragerc --cov-context=test --cov=eva/ -s -v --log-level=WARNING -m "not benchmark"
+        PYTHONPATH=./ pytest --durations=20 --cov-report term-missing:skip-covered  --cov-config=.coveragerc --cov-context=test --cov=eva/ --capture=sys --tb=short -v --log-level=WARNING -m "not benchmark"
     elif [[ "$MODE" = "RAY" ]];
     then
         PYTHONPATH=./ pytest -s -v -p no:cov test/ -m "not benchmark"
@@ -109,7 +109,7 @@ fi
 
 if [[ ( "$OSTYPE" != "msys" ) && ( "$MODE" = "NOTEBOOK" || "$MODE" = "ALL" ) ]];
 then 
-    PYTHONPATH=./ python -m pytest --durations=5 --nbmake --overwrite "./tutorials" -s -v --log-level=WARNING --nbmake-timeout=3000
+    PYTHONPATH=./ python -m pytest --durations=5 --nbmake --overwrite "./tutorials" --capture=sys --tb=short -v --log-level=WARNING --nbmake-timeout=3000 --ignore="tutorials/08-chatgpt.ipynb" 
     notebook_test_code=$?
     if [ "$notebook_test_code" != "0" ];
     then

diff --git a/setup.py b/setup.py
@@ -98,7 +98,7 @@ def read(path, encoding="utf-8"):
     "ipython<8.13.0",  # NOTEBOOKS
     "thefuzz",  # FUZZY STRING MATCHING
     "ultralytics",  # OBJECT DETECTION
-    "transformers==4.27.4",  # HUGGINGFACE
+    "transformers>=4.27.4",  # HUGGINGFACE
     "openai>=0.27.4",  # CHATGPT
 ]
 

diff --git a/test/integration_tests/test_huggingface_udfs.py b/test/integration_tests/test_huggingface_udfs.py
@@ -15,6 +15,8 @@
 import unittest
 from test.util import create_text_csv, file_remove
 
+import pytest
+
 from eva.catalog.catalog_manager import CatalogManager
 from eva.executor.executor_utils import ExecutorError
 from eva.server.command_handler import execute_query_fetch_all
@@ -44,6 +46,7 @@ def setUp(self) -> None:
     def tearDown(self) -> None:
         execute_query_fetch_all("DROP TABLE IF EXISTS DETRAC;")
         execute_query_fetch_all("DROP TABLE IF EXISTS VIDEOS;")
+        execute_query_fetch_all("DROP TABLE IF EXISTS MyCSV;")
         file_remove(self.csv_file_path)
 
     def test_io_catalog_entries_populated(self):
@@ -140,6 +143,7 @@ def test_image_classification(self):
 
         select_query = f"SELECT {udf_name}(data) FROM DETRAC WHERE id < 3;"
         output = execute_query_fetch_all(select_query)
+        print("output: ", output)
 
         # Test that output has 2 columns
         self.assertEqual(len(output.frames.columns), 2)
@@ -159,6 +163,7 @@ def test_image_classification(self):
         drop_udf_query = f"DROP UDF {udf_name};"
         execute_query_fetch_all(drop_udf_query)
 
+    @pytest.mark.benchmark
     def test_text_classification(self):
         create_table_query = """CREATE TABLE IF NOT EXISTS MyCSV (
                 id INTEGER UNIQUE,
@@ -203,6 +208,7 @@ def test_text_classification(self):
         execute_query_fetch_all(drop_udf_query)
         execute_query_fetch_all("DROP TABLE MyCSV;")
 
+    @pytest.mark.benchmark
     def test_automatic_speech_recognition(self):
         udf_name = "SpeechRecognizer"
         create_udf = (
@@ -223,6 +229,7 @@ def test_automatic_speech_recognition(self):
         drop_udf_query = f"DROP UDF {udf_name};"
         execute_query_fetch_all(drop_udf_query)
 
+    @pytest.mark.benchmark
     def test_summarization_from_video(self):
         asr_udf = "SpeechRecognizer"
         create_udf = (
@@ -254,3 +261,100 @@ def test_summarization_from_video(self):
         execute_query_fetch_all(drop_udf_query)
         drop_udf_query = f"DROP UDF {summary_udf};"
         execute_query_fetch_all(drop_udf_query)
+
+    @pytest.mark.benchmark
+    def test_toxicity_classification(self):
+        """Check the output shape of a HuggingFace toxicity classifier UDF.
+
+        Creates the UDF, rebuilds the MyCSV table from the CSV at
+        ``self.csv_file_path``, applies the UDF to the ``comment`` column and
+        verifies the label and score columns of the result. The UDF is
+        dropped at the end of the test.
+        """
+        udf_name = "HFToxicityClassifier"
+        label_column = udf_name.lower() + ".label"
+        score_column = udf_name.lower() + ".score"
+
+        # Register the UDF
+        execute_query_fetch_all(
+            f"""CREATE UDF {udf_name}
+            TYPE HuggingFace
+            'task' 'text-classification'
+            'model' 'martin-ha/toxic-comment-model'
+        """
+        )
+
+        # Start from a fresh MyCSV table
+        execute_query_fetch_all("""DROP TABLE IF EXISTS MyCSV;""")
+        execute_query_fetch_all(
+            """CREATE TABLE IF NOT EXISTS MyCSV (
+                id INTEGER UNIQUE,
+                comment TEXT(30)
+            );"""
+        )
+        execute_query_fetch_all(f"""LOAD CSV '{self.csv_file_path}' INTO MyCSV;""")
+
+        # Apply the UDF to every comment
+        output = execute_query_fetch_all(f"SELECT {udf_name}(comment) FROM MyCSV;")
+
+        # The result must have exactly 2 columns
+        self.assertEqual(len(output.frames.columns), 2)
+
+        # The label column must exist and hold only "non-toxic" or "toxic"
+        self.assertTrue(label_column in output.frames.columns)
+        labels = output.frames[label_column]
+        self.assertTrue(all(x in ("non-toxic", "toxic") for x in labels))
+
+        # The score column must exist and hold only floats
+        self.assertTrue(score_column in output.frames.columns)
+        scores = output.frames[score_column]
+        self.assertTrue(all(isinstance(x, float) for x in scores))
+
+        # Remove the UDF
+        execute_query_fetch_all(f"DROP UDF {udf_name};")
+
+    @pytest.mark.benchmark
+    def test_multilingual_toxicity_classification(self):
+        """Check the output shape of a multilingual HuggingFace toxicity UDF.
+
+        Creates the UDF, rebuilds the MyCSV table from the CSV at
+        ``self.csv_file_path``, applies the UDF to the ``comment`` column and
+        verifies the label and score columns of the result. The UDF is
+        dropped at the end of the test.
+        """
+        udf_name = "HFMultToxicityClassifier"
+        label_column = udf_name.lower() + ".label"
+        score_column = udf_name.lower() + ".score"
+
+        # Register the UDF
+        execute_query_fetch_all(
+            f"""CREATE UDF {udf_name}
+            TYPE HuggingFace
+            'task' 'text-classification'
+            'model' 'EIStakovskii/xlm_roberta_base_multilingual_toxicity_classifier_plus'
+        """
+        )
+
+        # Start from a fresh MyCSV table
+        execute_query_fetch_all("""DROP TABLE IF EXISTS MyCSV;""")
+        execute_query_fetch_all(
+            """CREATE TABLE MyCSV (
+                id INTEGER UNIQUE,
+                comment TEXT(30)
+            );"""
+        )
+        execute_query_fetch_all(f"""LOAD CSV '{self.csv_file_path}' INTO MyCSV;""")
+
+        # Apply the UDF to every comment
+        output = execute_query_fetch_all(f"SELECT {udf_name}(comment) FROM MyCSV;")
+
+        # The result must have exactly 2 columns
+        self.assertEqual(len(output.frames.columns), 2)
+
+        # The label column must exist and hold only "LABEL_1" or "LABEL_0"
+        self.assertTrue(label_column in output.frames.columns)
+        labels = output.frames[label_column]
+        self.assertTrue(all(x in ("LABEL_1", "LABEL_0") for x in labels))
+
+        # The score column must exist and hold only floats
+        self.assertTrue(score_column in output.frames.columns)
+        scores = output.frames[score_column]
+        self.assertTrue(all(isinstance(x, float) for x in scores))
+
+        # Remove the UDF
+        execute_query_fetch_all(f"DROP UDF {udf_name};")
diff --git a/tutorials/00-start-eva-server.ipynb b/tutorials/00-start-eva-server.ipynb
@@ -38,13 +38,22 @@
    "execution_count": 1,
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2023-04-30T15:16:35.543397Z",
-     "iopub.status.busy": "2023-04-30T15:16:35.542857Z",
-     "iopub.status.idle": "2023-04-30T15:16:49.079780Z",
-     "shell.execute_reply": "2023-04-30T15:16:49.077653Z"
+     "iopub.execute_input": "2023-05-09T03:37:54.104875Z",
+     "iopub.status.busy": "2023-05-09T03:37:54.104289Z",
+     "iopub.status.idle": "2023-05-09T03:38:08.286784Z",
+     "shell.execute_reply": "2023-05-09T03:38:08.284731Z"
     }
    },
    "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\r\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.2\u001b[0m\r\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\r\n"
+     ]
+    },
     {
      "name": "stdout",
      "output_type": "stream",
@@ -133,7 +142,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.10.8"
   },
   "vscode": {
    "interpreter": {