Skip to content

Commit

Permalink
tutorial: toxicity classifier (#708)
Browse files Browse the repository at this point in the history
- [x] Added a formatting function check (in formatter.py) for notebooks
(checks Colab links, empty cells, etc.)

---------

Co-authored-by: hhh21u <[email protected]>
  • Loading branch information
jarulraj and hhh21u authored May 9, 2023
1 parent bfdc756 commit de4628a
Show file tree
Hide file tree
Showing 20 changed files with 3,026 additions and 766 deletions.
5 changes: 0 additions & 5 deletions .github/PULL_REQUEST_TEMPLATE.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
<!--
👋 Thanks for submitting a Pull Request to EVA DB!

🙌 We want to make contributing to EVA DB as easy and transparent as possible. Here are a few tips to get you started:
Expand All @@ -9,7 +8,3 @@

👉 Please see our ✅ [Contributing Guide](https://evadb.readthedocs.io/en/stable/source/contribute/index.html) for more details.

🚨 Note that Copilot will summarize this PR below. Do not modify the 'copilot:all' line.
-->

copilot:all
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ tutorials/bddtest.zip
tutorials/license.zip
license/
bddtest/
tutorials/*.jpg

# benchmark
.benchmarks
Expand Down
2 changes: 2 additions & 0 deletions eva/executor/create_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from eva.executor.executor_utils import handle_if_not_exists
from eva.plan_nodes.create_plan import CreatePlan
from eva.storage.storage_engine import StorageEngine
from eva.utils.logging_manager import logger


class CreateExecutor(AbstractExecutor):
Expand All @@ -26,6 +27,7 @@ def __init__(self, node: CreatePlan):

def exec(self, *args, **kwargs):
if not handle_if_not_exists(self.node.table_info, self.node.if_not_exists):
logger.debug(f"Creating table {self.node.table_info}")
catalog_entry = self.catalog.create_and_insert_table_catalog_entry(
self.node.table_info, self.node.column_list
)
Expand Down
1 change: 1 addition & 0 deletions eva/executor/drop_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ def exec(self, *args, **kwargs):
)
storage_engine = StorageEngine.factory(table_obj)

logger.debug(f"Dropping table {table_info}")
storage_engine.drop(table=table_obj)

for col_obj in table_obj.columns:
Expand Down
2 changes: 2 additions & 0 deletions eva/executor/executor_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ def apply_predicate(batch: Batch, predicate: AbstractExpression) -> Batch:


def handle_if_not_exists(table_info: TableInfo, if_not_exist=False):
# Table exists
if CatalogManager().check_table_exists(
table_info.table_name,
table_info.database_name,
Expand All @@ -57,6 +58,7 @@ def handle_if_not_exists(table_info: TableInfo, if_not_exist=False):
else:
logger.error(err_msg)
raise ExecutorError(err_msg)
# Table does not exist
else:
return False

Expand Down
6 changes: 5 additions & 1 deletion eva/models/server/response.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from enum import Enum
from typing import Optional

from eva.executor.executor_utils import ExecutorError
from eva.models.storage.batch import Batch
from eva.utils.generic_utils import PickleSerializer

Expand Down Expand Up @@ -45,7 +46,10 @@ def deserialize(cls, data):
return obj

def as_df(self):
    """Return the frames of the batch carried by this response.

    Returns:
        The ``frames`` attribute of ``self.batch``.

    Raises:
        ExecutorError: If the response carries an error, or if it has no
            batch.
    """
    # Check the stored error first and raise explicitly instead of using
    # an assert: an assert on the batch would fire before the error is
    # reported, and asserts are stripped when Python runs with -O.
    if self.error is not None:
        raise ExecutorError(self.error)
    if self.batch is None:
        raise ExecutorError("Empty batch")
    return self.batch.frames

def __str__(self):
Expand Down
62 changes: 61 additions & 1 deletion script/formatting/formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
import sys
from pathlib import Path
import asyncio
import nbformat
from nbformat.v4 import new_notebook, new_markdown_cell, new_code_cell

import pkg_resources

Expand Down Expand Up @@ -48,6 +50,7 @@ def wrapped(*args, **kwargs):
EVA_SRC_DIR = os.path.join(EVA_DIR, "eva")
EVA_TEST_DIR = os.path.join(EVA_DIR, "test")
EVA_SCRIPT_DIR = os.path.join(EVA_DIR, "script")
EVA_NOTEBOOKS_DIR = os.path.join(EVA_DIR, "tutorials")

FORMATTING_DIR = os.path.join(EVA_SCRIPT_DIR, "formatting")
PYLINTRC = os.path.join(FORMATTING_DIR, "pylintrc")
Expand Down Expand Up @@ -206,6 +209,54 @@ def format_file(file_path, add_header, strip_header, format_code):

# END FORMAT__FILE(FILE_NAME)

# check the notebooks
def check_notebook_format(notebook_file):
    """Check the structure of a Jupyter notebook and exit on any violation.

    The notebook named ``ignore_tag.ipynb`` is skipped. Every other
    notebook is read with nbformat (as version 4) and must satisfy, in
    this order:

    * it has at least one cell;
    * every cell type is ``code``, ``markdown`` or ``raw``;
    * no code cell has empty (whitespace-only) source;
    * no code cell contains ``print(response)``;
    * some markdown cell mentions ``colab`` and the notebook's file name.

    Args:
        notebook_file: Path to the notebook file.

    Returns:
        True when the notebook is skipped or passes every check.

    Raises:
        SystemExit: With status 1 on the first failed check, after the
            reason has been logged.
    """
    notebook_file_name = os.path.basename(notebook_file)

    # Ignore this notebook
    if notebook_file_name == "ignore_tag.ipynb":
        return True

    # Read with an explicit encoding so the result does not depend on the
    # platform's default locale encoding.
    with open(notebook_file, encoding="utf-8") as f:
        nb = nbformat.read(f, as_version=4)

    # Check that the notebook contains at least one cell
    if not nb.cells:
        LOG.error(f"ERROR: Notebook {notebook_file} has no cells")
        sys.exit(1)

    # Check that all cells have a valid cell type (code, markdown, or raw)
    for cell in nb.cells:
        if cell.cell_type not in ("code", "markdown", "raw"):
            LOG.error(
                f"ERROR: Notebook {notebook_file} contains an invalid cell type: {cell.cell_type}"
            )
            sys.exit(1)

    code_cells = [cell for cell in nb.cells if cell.cell_type == "code"]

    # Check that all code cells have a non-empty source code
    for cell in code_cells:
        if not cell.source.strip():
            LOG.error(f"ERROR: Notebook {notebook_file} contains an empty code cell")
            sys.exit(1)

    # Check for "print(response)"
    for cell in code_cells:
        if "print(response)" in cell.source:
            LOG.error(
                f"ERROR: Notebook {notebook_file} contains a cell with this content: {cell.source}"
            )
            sys.exit(1)

    # Check for "Colab link": a markdown cell must mention colab and the
    # notebook's own file name, so the link points at this notebook.
    contains_colab_link = any(
        cell.cell_type == "markdown"
        and "colab" in cell.source
        and notebook_file_name in cell.source
        for cell in nb.cells
    )

    if not contains_colab_link:
        # Log the reason before exiting; a bare exit gives no hint about
        # which notebook failed or why.
        LOG.error(
            f"ERROR: Notebook {notebook_file} does not contain a Colab link "
            f"that references {notebook_file_name}"
        )
        sys.exit(1)

    return True

# format all the files in the dir passed as argument
def format_dir(dir_path, add_header, strip_header, format_code):
Expand Down Expand Up @@ -325,4 +376,13 @@ def check_file(file):
)

for file in files:
check_file(file)
check_file(file)

# CHECK ALL THE NOTEBOOKS

# Iterate over all files in the directory
# and check if they are Jupyter notebooks
for file in os.listdir(EVA_NOTEBOOKS_DIR):
if file.endswith(".ipynb"):
notebook_file = os.path.join(EVA_NOTEBOOKS_DIR, file)
check_notebook_format(notebook_file)
4 changes: 2 additions & 2 deletions script/test/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ if [[ "$OSTYPE" != "msys" ]];
then
if [[ "$MODE" = "TEST" || "$MODE" = "ALL" ]];
then
PYTHONPATH=./ pytest --durations=20 --cov-report term-missing:skip-covered --cov-config=.coveragerc --cov-context=test --cov=eva/ -s -v --log-level=WARNING -m "not benchmark"
PYTHONPATH=./ pytest --durations=20 --cov-report term-missing:skip-covered --cov-config=.coveragerc --cov-context=test --cov=eva/ --capture=sys --tb=short -v --log-level=WARNING -m "not benchmark"
elif [[ "$MODE" = "RAY" ]];
then
PYTHONPATH=./ pytest -s -v -p no:cov test/ -m "not benchmark"
Expand Down Expand Up @@ -109,7 +109,7 @@ fi

if [[ ( "$OSTYPE" != "msys" ) && ( "$MODE" = "NOTEBOOK" || "$MODE" = "ALL" ) ]];
then
PYTHONPATH=./ python -m pytest --durations=5 --nbmake --overwrite "./tutorials" -s -v --log-level=WARNING --nbmake-timeout=3000
PYTHONPATH=./ python -m pytest --durations=5 --nbmake --overwrite "./tutorials" --capture=sys --tb=short -v --log-level=WARNING --nbmake-timeout=3000 --ignore="tutorials/08-chatgpt.ipynb"
notebook_test_code=$?
if [ "$notebook_test_code" != "0" ];
then
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def read(path, encoding="utf-8"):
"ipython<8.13.0", # NOTEBOOKS
"thefuzz", # FUZZY STRING MATCHING
"ultralytics", # OBJECT DETECTION
"transformers==4.27.4", # HUGGINGFACE
"transformers>=4.27.4", # HUGGINGFACE
"openai>=0.27.4", # CHATGPT
]

Expand Down
104 changes: 104 additions & 0 deletions test/integration_tests/test_huggingface_udfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
import unittest
from test.util import create_text_csv, file_remove

import pytest

from eva.catalog.catalog_manager import CatalogManager
from eva.executor.executor_utils import ExecutorError
from eva.server.command_handler import execute_query_fetch_all
Expand Down Expand Up @@ -44,6 +46,7 @@ def setUp(self) -> None:
def tearDown(self) -> None:
execute_query_fetch_all("DROP TABLE IF EXISTS DETRAC;")
execute_query_fetch_all("DROP TABLE IF EXISTS VIDEOS;")
execute_query_fetch_all("DROP TABLE IF EXISTS MyCSV;")
file_remove(self.csv_file_path)

def test_io_catalog_entries_populated(self):
Expand Down Expand Up @@ -140,6 +143,7 @@ def test_image_classification(self):

select_query = f"SELECT {udf_name}(data) FROM DETRAC WHERE id < 3;"
output = execute_query_fetch_all(select_query)
print("output: ", output)

# Test that output has 2 columns
self.assertEqual(len(output.frames.columns), 2)
Expand All @@ -159,6 +163,7 @@ def test_image_classification(self):
drop_udf_query = f"DROP UDF {udf_name};"
execute_query_fetch_all(drop_udf_query)

@pytest.mark.benchmark
def test_text_classification(self):
create_table_query = """CREATE TABLE IF NOT EXISTS MyCSV (
id INTEGER UNIQUE,
Expand Down Expand Up @@ -203,6 +208,7 @@ def test_text_classification(self):
execute_query_fetch_all(drop_udf_query)
execute_query_fetch_all("DROP TABLE MyCSV;")

@pytest.mark.benchmark
def test_automatic_speech_recognition(self):
udf_name = "SpeechRecognizer"
create_udf = (
Expand All @@ -223,6 +229,7 @@ def test_automatic_speech_recognition(self):
drop_udf_query = f"DROP UDF {udf_name};"
execute_query_fetch_all(drop_udf_query)

@pytest.mark.benchmark
def test_summarization_from_video(self):
asr_udf = "SpeechRecognizer"
create_udf = (
Expand Down Expand Up @@ -254,3 +261,100 @@ def test_summarization_from_video(self):
execute_query_fetch_all(drop_udf_query)
drop_udf_query = f"DROP UDF {summary_udf};"
execute_query_fetch_all(drop_udf_query)

@pytest.mark.benchmark
def test_toxicity_classification(self):
    """Run a HuggingFace toxicity classifier UDF over the MyCSV comments.

    Creates the UDF from the 'martin-ha/toxic-comment-model' model, loads
    the CSV file into MyCSV, and checks that the query output has a label
    column with values "non-toxic" or "toxic" and a score column of floats.
    """
    udf_name = "HFToxicityClassifier"
    create_udf_query = f"""CREATE UDF {udf_name}
        TYPE HuggingFace
        'task' 'text-classification'
        'model' 'martin-ha/toxic-comment-model'
    """
    execute_query_fetch_all(create_udf_query)

    # Drop the UDF even if a later step fails, so a failed run does not
    # leave it defined.
    try:
        drop_table_query = """DROP TABLE IF EXISTS MyCSV;"""
        execute_query_fetch_all(drop_table_query)

        create_table_query = """CREATE TABLE IF NOT EXISTS MyCSV (
            id INTEGER UNIQUE,
            comment TEXT(30)
        );"""
        execute_query_fetch_all(create_table_query)

        load_table_query = f"""LOAD CSV '{self.csv_file_path}' INTO MyCSV;"""
        execute_query_fetch_all(load_table_query)

        select_query = f"SELECT {udf_name}(comment) FROM MyCSV;"
        output = execute_query_fetch_all(select_query)

        # Test that output has 2 columns
        self.assertEqual(len(output.frames.columns), 2)

        # Test that there exists a column with udf_name.label and each
        # entry is either "non-toxic" or "toxic"
        label_column = udf_name.lower() + ".label"
        self.assertIn(label_column, output.frames.columns)
        self.assertTrue(
            all(x in ["non-toxic", "toxic"] for x in output.frames[label_column])
        )

        # Test that there exists a column with udf_name.score
        # and each entry is a float
        score_column = udf_name.lower() + ".score"
        self.assertIn(score_column, output.frames.columns)
        self.assertTrue(
            all(isinstance(x, float) for x in output.frames[score_column])
        )
    finally:
        drop_udf_query = f"DROP UDF {udf_name};"
        execute_query_fetch_all(drop_udf_query)

@pytest.mark.benchmark
def test_multilingual_toxicity_classification(self):
    """Run a multilingual HuggingFace toxicity UDF over the MyCSV comments.

    Creates the UDF from the
    'EIStakovskii/xlm_roberta_base_multilingual_toxicity_classifier_plus'
    model, loads the CSV file into MyCSV, and checks that the query output
    has a label column with values "LABEL_1" or "LABEL_0" and a score
    column of floats.
    """
    udf_name = "HFMultToxicityClassifier"
    create_udf_query = f"""CREATE UDF {udf_name}
        TYPE HuggingFace
        'task' 'text-classification'
        'model' 'EIStakovskii/xlm_roberta_base_multilingual_toxicity_classifier_plus'
    """
    execute_query_fetch_all(create_udf_query)

    # Drop the UDF even if a later step fails, so a failed run does not
    # leave it defined.
    try:
        drop_table_query = """DROP TABLE IF EXISTS MyCSV;"""
        execute_query_fetch_all(drop_table_query)

        create_table_query = """CREATE TABLE MyCSV (
            id INTEGER UNIQUE,
            comment TEXT(30)
        );"""
        execute_query_fetch_all(create_table_query)

        load_table_query = f"""LOAD CSV '{self.csv_file_path}' INTO MyCSV;"""
        execute_query_fetch_all(load_table_query)

        select_query = f"SELECT {udf_name}(comment) FROM MyCSV;"
        output = execute_query_fetch_all(select_query)

        # Test that output has 2 columns
        self.assertEqual(len(output.frames.columns), 2)

        # Test that there exists a column with udf_name.label and each
        # entry is either "LABEL_1" or "LABEL_0"
        label_column = udf_name.lower() + ".label"
        self.assertIn(label_column, output.frames.columns)
        self.assertTrue(
            all(x in ["LABEL_1", "LABEL_0"] for x in output.frames[label_column])
        )

        # Test that there exists a column with udf_name.score and each
        # entry is a float
        score_column = udf_name.lower() + ".score"
        self.assertIn(score_column, output.frames.columns)
        self.assertTrue(
            all(isinstance(x, float) for x in output.frames[score_column])
        )
    finally:
        drop_udf_query = f"DROP UDF {udf_name};"
        execute_query_fetch_all(drop_udf_query)
19 changes: 14 additions & 5 deletions tutorials/00-start-eva-server.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,22 @@
"execution_count": 1,
"metadata": {
"execution": {
"iopub.execute_input": "2023-04-30T15:16:35.543397Z",
"iopub.status.busy": "2023-04-30T15:16:35.542857Z",
"iopub.status.idle": "2023-04-30T15:16:49.079780Z",
"shell.execute_reply": "2023-04-30T15:16:49.077653Z"
"iopub.execute_input": "2023-05-09T03:37:54.104875Z",
"iopub.status.busy": "2023-05-09T03:37:54.104289Z",
"iopub.status.idle": "2023-05-09T03:38:08.286784Z",
"shell.execute_reply": "2023-05-09T03:38:08.284731Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.2\u001b[0m\r\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\r\n"
]
},
{
"name": "stdout",
"output_type": "stream",
Expand Down Expand Up @@ -133,7 +142,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.10.8"
},
"vscode": {
"interpreter": {
Expand Down
Loading

0 comments on commit de4628a

Please sign in to comment.