Hello, Benchmark (#19)

gretelai · Oct 6, 2022 · 1bc76e9 · 1bc76e9
1 parent 5299d6e
commit 1bc76e9
Show file tree

Hide file tree

Showing 20 changed files with 1,976 additions and 2 deletions.
diff --git a/Makefile b/Makefile
@@ -0,0 +1,7 @@
+# Run the test suite with pytest (invoked as a module of the active interpreter).
+.PHONY: test
+test:
+	python -m pytest
+
+# Type-check with pyright; only the benchmark package and its test module are covered.
+.PHONY: type
+type:
+	python -m pyright src/gretel_trainer/benchmark tests/test_benchmark.py
diff --git a/README.md b/README.md
@@ -15,7 +15,7 @@ This module is designed to provide a simple interface to help users successfully
     * `Gretel-CTGAN` model type supports tabular and conditional data generation.
     * `Gretel-GPT` natural language synthesis based on an open-source implementation of GPT-3 (coming soon).
     * `Gretel-DGAN` multi-variate time series based on DoppelGANger (coming soon).
-    
+
 ## Try it out now!
 
 If you want to quickly get started synthesizing data with **Gretel.ai**, simply click the button below and follow the examples. See additional Python3 and Jupyter Notebook examples in the `./notebooks` folder.
@@ -57,11 +57,16 @@ model = trainer.Trainer()
 model.train(dataset)
 ```
 
-### 3. Generate synthetic data! 
+### 3. Generate synthetic data!
 ```python3
 df = model.generate()
 ```
 
+## Development
+
+- Run tests via `make test`
+- Run type-checking (limited coverage) via `make type`
+
 ## TODOs / Roadmap
 
 - [ ] Enable conditional generation via SDK interface (supported in Notebooks currently).
diff --git a/notebooks/benchmark.ipynb b/notebooks/benchmark.ipynb
@@ -0,0 +1,222 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import gretel_trainer.benchmark as b"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Datasets"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### From your own data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "my_demo_data = b.make_dataset([\"~/Downloads/demo.csv\"], datatype=\"tabular_mixed\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### From Gretel"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "datasets = []\n",
+    "# datasets = b.list_gretel_datasets()\n",
+    "# datasets = b.list_gretel_datasets(datatype=\"time_series\")\n",
+    "# datasets = b.list_gretel_datasets(datatype=\"tabular_mixed\", tags=[\"small\", \"marketing\"])\n",
+    "\n",
+    "[dataset.name for dataset in datasets]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "b.list_gretel_dataset_tags()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Select a specific dataset by name\n",
+    "iris = b.get_gretel_dataset(\"iris\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Models"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Gretel defaults\n",
+    "\n",
+    "Preconfigured based on [public blueprints](https://github.com/gretelai/gretel-blueprints/tree/main/config_templates/gretel/synthetics)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from gretel_trainer.benchmark import (\n",
+    "    GretelAmplify,\n",
+    "    GretelAuto,\n",
+    "    GretelCTGAN,\n",
+    "    GretelGPTX,\n",
+    "    GretelLSTM,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Customized Gretel models"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from gretel_trainer.benchmark import GretelModel\n",
+    "\n",
+    "\n",
+    "class TunedLSTM(GretelModel):\n",
+    "    config = \"/path/to/my_config.yml\"\n",
+    "\n",
+    "\n",
+    "class TweakedCtgan(GretelModel):\n",
+    "    config = {...}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Completely custom, non-Gretel models"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import time\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "\n",
+    "class MyCustomModel:\n",
+    "    def train(self, source: str, **kwargs) -> None:\n",
+    "        self.source_df = pd.read_csv(source)\n",
+    "        time.sleep(8)\n",
+    "        return None\n",
+    "\n",
+    "    def generate(self, **kwargs) -> pd.DataFrame:\n",
+    "        time.sleep(3)\n",
+    "        return self.source_df.sample(frac=0.6)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Launch a Benchmark Comparison!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "comparison = b.compare(datasets=[my_demo_data, iris], models=[GretelLSTM, GretelAmplify])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "comparison.results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "comparison.wait()\n",
+    "comparison.export_results(\"./results.csv\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.9.10 64-bit ('3.9.10')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.10"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "c8726cf33f00e2373738d19e8a73b26d03723d6c732c72211354be2991192c77"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -0,0 +1,2 @@
+pyright
+pytest
diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,6 @@
+boto3
 gretel-client
 gretel-synthetics[utils]
+pandas
 pydantic==1.9.0
+typing-extensions
diff --git a/src/gretel_trainer/benchmark/__init__.py b/src/gretel_trainer/benchmark/__init__.py
@@ -0,0 +1,83 @@
+from concurrent.futures import ThreadPoolExecutor
+from datetime import datetime
+from typing import List, Optional, Type, Union
+
+import gretel_trainer.benchmark.compare as c
+import gretel_trainer.benchmark.custom.datasets
+import pandas as pd
+
+from gretel_trainer.benchmark.core import Dataset, Datatype, ModelFactory
+from gretel_trainer.benchmark.gretel.datasets import GretelDataset, GretelPublicDatasetRepo
+from gretel_trainer.benchmark.gretel.models import (
+    GretelAmplify,
+    GretelAuto,
+    GretelCTGAN,
+    GretelGPTX,
+    GretelLSTM,
+    GretelModel,
+)
+from gretel_trainer.benchmark.gretel.sdk import ActualGretelSDK
+from gretel_trainer import trainer
+
+BENCHMARK_DIR = "./.benchmark"
+
+repo = GretelPublicDatasetRepo(
+    bucket="gretel-datasets",
+    region="us-west-2",
+    load_dir=f"{BENCHMARK_DIR}/gretel_datasets",
+)
+
+
+def get_gretel_dataset(name: str) -> GretelDataset:
+    return repo.get_dataset(name)
+
+
+def list_gretel_datasets(
+    datatype: Optional[Union[Datatype, str]] = None, tags: Optional[List[str]] = None
+) -> List[GretelDataset]:
+    return repo.list_datasets(datatype, tags)
+
+
+def list_gretel_dataset_tags() -> List[str]:
+    return repo.list_tags()
+
+
+def make_dataset(
+    sources: Union[List[str], List[pd.DataFrame]],
+    *,
+    datatype: Union[Datatype, str],
+    namespace: Optional[str] = None,
+    delimiter: str = ",",
+) -> Dataset:
+    return gretel_trainer.benchmark.custom.datasets.make_dataset(
+        sources,
+        datatype=datatype,
+        namespace=namespace,
+        delimiter=delimiter,
+        local_dir=BENCHMARK_DIR,
+    )
+
+
+def compare(
+    *,
+    datasets: List[Dataset],
+    models: List[Union[ModelFactory, Type[GretelModel]]],
+    auto_clean: bool = True,
+) -> c.Comparison:
+    return c.compare(
+        datasets=datasets,
+        models=models,
+        runtime_config=c.RuntimeConfig(
+            local_dir=BENCHMARK_DIR,
+            project_prefix=f"benchmark-{_timestamp()}",
+            thread_pool=ThreadPoolExecutor(5),
+            wait_secs=10,
+            auto_clean=auto_clean,
+        ),
+        gretel_sdk=ActualGretelSDK,
+        gretel_trainer_factory=trainer.Trainer,
+    )
+
+
+def _timestamp() -> str:
+    return datetime.now().strftime("%Y%m%d%H%M%S")