Skip to content

Commit

Permalink
Hello, Benchmark (#19)
Browse files Browse the repository at this point in the history
  • Loading branch information
mikeknep authored Oct 6, 2022
1 parent 5299d6e commit 1bc76e9
Show file tree
Hide file tree
Showing 20 changed files with 1,976 additions and 2 deletions.
7 changes: 7 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Developer convenience targets. Neither target produces a file of the same
# name, so both are declared phony.
.PHONY: test type

# Run the test suite with pytest.
test:
	python -m pytest

# Type-check the benchmark package and its test module with pyright.
type:
	python -m pyright src/gretel_trainer/benchmark tests/test_benchmark.py
9 changes: 7 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ This module is designed to provide a simple interface to help users successfully
* `Gretel-CTGAN` model type supports tabular and conditional data generation.
* `Gretel-GPT` natural language synthesis based on an open-source implementation of GPT-3 (coming soon).
* `Gretel-DGAN` multi-variate time series based on DoppelGANger (coming soon).

## Try it out now!

If you want to quickly get started synthesizing data with **Gretel.ai**, simply click the button below and follow the examples. See additional Python3 and Jupyter Notebook examples in the `./notebooks` folder.
Expand Down Expand Up @@ -57,11 +57,16 @@ model = trainer.Trainer()
model.train(dataset)
```

### 3. Generate synthetic data!
### 3. Generate synthetic data!
```python3
df = model.generate()
```

## Development

- Run tests via `make test`
- Run type checking (currently limited to the benchmark module and its tests) via `make type`

## TODOs / Roadmap

- [ ] Enable conditional generation via SDK interface (supported in Notebooks currently).
222 changes: 222 additions & 0 deletions notebooks/benchmark.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import gretel_trainer.benchmark as b"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Datasets"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### From your own data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"my_demo_data = b.make_dataset([\"~/Downloads/demo.csv\"], datatype=\"tabular_mixed\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### From Gretel"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"datasets = []\n",
"# datasets = b.list_gretel_datasets()\n",
"# datasets = b.list_gretel_datasets(datatype=\"time_series\")\n",
"# datasets = b.list_gretel_datasets(datatype=\"tabular_mixed\", tags=[\"small\", \"marketing\"])\n",
"\n",
"[dataset.name for dataset in datasets]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"b.list_gretel_dataset_tags()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Select a specific dataset by name\n",
"iris = b.get_gretel_dataset(\"iris\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Models"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Gretel defaults\n",
"\n",
"Preconfigured based on [public blueprints](https://github.com/gretelai/gretel-blueprints/tree/main/config_templates/gretel/synthetics)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from gretel_trainer.benchmark import (\n",
" GretelAmplify,\n",
" GretelAuto,\n",
" GretelCTGAN,\n",
" GretelGPTX,\n",
" GretelLSTM,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Customized Gretel models"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from gretel_trainer.benchmark import GretelModel\n",
"\n",
"\n",
"class TunedLSTM(GretelModel):\n",
" config = \"/path/to/my_config.yml\"\n",
"\n",
"\n",
"class TweakedCtgan(GretelModel):\n",
" config = {...}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Completely custom, non-Gretel models"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"\n",
"import pandas as pd\n",
"\n",
"\n",
"class MyCustomModel:\n",
" def train(self, source: str, **kwargs) -> None:\n",
" self.source_df = pd.read_csv(source)\n",
" time.sleep(8)\n",
" return None\n",
"\n",
" def generate(self, **kwargs) -> pd.DataFrame:\n",
" time.sleep(3)\n",
" return self.source_df.sample(frac=0.6)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Launch a Benchmark Comparison!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"comparison = b.compare(datasets=[my_demo_data, iris], models=[GretelLSTM, GretelAmplify])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"comparison.results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"comparison.wait()\n",
"comparison.export_results(\"./results.csv\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.10 64-bit ('3.9.10')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.10"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "c8726cf33f00e2373738d19e8a73b26d03723d6c732c72211354be2991192c77"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
2 changes: 2 additions & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
pyright
pytest
3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
boto3
gretel-client
gretel-synthetics[utils]
pandas
pydantic==1.9.0
typing-extensions
83 changes: 83 additions & 0 deletions src/gretel_trainer/benchmark/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from typing import List, Optional, Type, Union

import gretel_trainer.benchmark.compare as c
import gretel_trainer.benchmark.custom.datasets
import pandas as pd

from gretel_trainer.benchmark.core import Dataset, Datatype, ModelFactory
from gretel_trainer.benchmark.gretel.datasets import GretelDataset, GretelPublicDatasetRepo
from gretel_trainer.benchmark.gretel.models import (
GretelAmplify,
GretelAuto,
GretelCTGAN,
GretelGPTX,
GretelLSTM,
GretelModel,
)
from gretel_trainer.benchmark.gretel.sdk import ActualGretelSDK
from gretel_trainer import trainer

# Working directory (relative to the current working directory) shared by the
# helpers in this module.
BENCHMARK_DIR = "./.benchmark"

# Module-level handle on the Gretel public dataset repository, built once at
# import time and reused by the dataset helpers below.
repo = GretelPublicDatasetRepo(
    load_dir=f"{BENCHMARK_DIR}/gretel_datasets",
    bucket="gretel-datasets",
    region="us-west-2",
)


def get_gretel_dataset(name: str) -> GretelDataset:
    """Return the Gretel dataset registered under ``name``.

    The lookup is delegated to the module-level ``repo``.
    """
    dataset = repo.get_dataset(name)
    return dataset


def list_gretel_datasets(
    datatype: Optional[Union[Datatype, str]] = None,
    tags: Optional[List[str]] = None,
) -> List[GretelDataset]:
    """List Gretel datasets known to the module-level ``repo``.

    Args:
        datatype: Forwarded unchanged to ``repo.list_datasets``; presumably
            restricts the listing to one datatype (confirm in the repo class).
        tags: Forwarded unchanged to ``repo.list_datasets``; presumably
            restricts the listing by tag (confirm in the repo class).

    Returns:
        Whatever ``repo.list_datasets`` returns for the given arguments.
    """
    matches = repo.list_datasets(datatype, tags)
    return matches


def list_gretel_dataset_tags() -> List[str]:
    """Return the tags reported by the module-level ``repo``."""
    tags = repo.list_tags()
    return tags


def make_dataset(
    sources: Union[List[str], List[pd.DataFrame]],
    *,
    datatype: Union[Datatype, str],
    namespace: Optional[str] = None,
    delimiter: str = ",",
) -> Dataset:
    """Build a benchmark ``Dataset`` from user-supplied sources.

    Thin wrapper around ``gretel_trainer.benchmark.custom.datasets.make_dataset``
    that forwards every argument unchanged and pins ``local_dir`` to
    ``BENCHMARK_DIR``.

    Args:
        sources: File paths or DataFrames (per the annotation) to wrap.
        datatype: Datatype of the sources, as a ``Datatype`` or a string.
        namespace: Optional namespace, forwarded as-is.
        delimiter: Delimiter forwarded as-is; defaults to a comma.

    Returns:
        The ``Dataset`` produced by the underlying factory.
    """
    build = gretel_trainer.benchmark.custom.datasets.make_dataset
    forwarded = dict(
        datatype=datatype,
        namespace=namespace,
        delimiter=delimiter,
        local_dir=BENCHMARK_DIR,
    )
    return build(sources, **forwarded)


def compare(
    *,
    datasets: List[Dataset],
    models: List[Union[ModelFactory, Type[GretelModel]]],
    auto_clean: bool = True,
) -> c.Comparison:
    """Start a benchmark comparison of ``models`` across ``datasets``.

    Assembles the default runtime configuration and hands everything to
    ``c.compare`` together with the real Gretel SDK and ``trainer.Trainer``.

    Args:
        datasets: Datasets to include in the comparison.
        models: Model factories or ``GretelModel`` subclasses to include.
        auto_clean: Forwarded to ``RuntimeConfig``; defaults to ``True``.

    Returns:
        The ``Comparison`` returned by ``c.compare``.
    """
    # Each call gets its own timestamped project prefix and its own pool.
    prefix = f"benchmark-{_timestamp()}"
    # NOTE(review): the executor is never shut down in this function;
    # confirm that the Comparison owns its lifecycle.
    pool = ThreadPoolExecutor(5)
    runtime_config = c.RuntimeConfig(
        local_dir=BENCHMARK_DIR,
        project_prefix=prefix,
        thread_pool=pool,
        wait_secs=10,
        auto_clean=auto_clean,
    )
    return c.compare(
        datasets=datasets,
        models=models,
        runtime_config=runtime_config,
        gretel_sdk=ActualGretelSDK,
        gretel_trainer_factory=trainer.Trainer,
    )


def _timestamp() -> str:
return datetime.now().strftime("%Y%m%d%H%M%S")
Loading

0 comments on commit 1bc76e9

Please sign in to comment.