From f57cd5c2db6d42665198a578cbe95f9f65eb41ba Mon Sep 17 00:00:00 2001 From: Alexander Watson Date: Wed, 14 Apr 2021 11:02:34 -0700 Subject: [PATCH 1/2] update smart seed notebook --- .../blueprint.ipynb | 453 +++++++++--------- .../manifest.json | 4 +- 2 files changed, 237 insertions(+), 220 deletions(-) diff --git a/gretel/create_synthetic_data_smart_seeding/blueprint.ipynb b/gretel/create_synthetic_data_smart_seeding/blueprint.ipynb index 4f3ec86d..c9c38c90 100644 --- a/gretel/create_synthetic_data_smart_seeding/blueprint.ipynb +++ b/gretel/create_synthetic_data_smart_seeding/blueprint.ipynb @@ -1,220 +1,237 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "UTRxpSlaczHY" - }, - "source": [ - "# Create synthetic data from partial rows\n", - "\n", - "This blueprint utilizes Gretel's premium SDKs to create a synthetic version of your own data. This blueprint uses\n", - "a helper model known as a `SeriesModel`. Gretel uses a feature known as \"smart seeding\" that will generate rows based on partial values from your training data. This is useful when you want to utilize unique column values as input to the model and let Gretel synthesize the rest of the row for you.\n", - "\n", - "Use Cases for Series Data Synthesis:\n", - "\n", - "- Create synthetic data that has the same number of rows as the training data\n", - "- You want to preserve some of the original row data (primary keys, dates, important categorical data).\n", - "\n", - "Essentially this model will let you just take partial rows from the training data, and synthesize the rest of\n", - "the rows for you.\n", - "\n", - "In the example below, we'll use a combination of a primary key and a couple of categorical fields as seed input." - ] + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "accelerator": "GPU", + "colab": { + "name": "smart-seed-blueprint", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9" + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "VEM6kjRsczHd" - }, - "outputs": [], - "source": [ - "%%capture\n", - "\n", - "!pip install -U gretel-client gretel-synthetics pandas" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZQ-TmAdwczHd" - }, - "outputs": [], - "source": [ - "# Load your Gretel API key. You can acquire this from the Gretel Console \n", - "# @ https://console.gretel.cloud\n", - "\n", - "import pandas as pd\n", - "from gretel_client import get_cloud_client\n", - "\n", - "pd.set_option('max_colwidth', None)\n", - "\n", - "client = get_cloud_client(prefix=\"api\", api_key=\"prompt\")\n", - "client.install_packages()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "YMg9nX6SczHe" - }, - "outputs": [], - "source": [ - "# Load and preview dataset\n", - "\n", - "import pandas as pd\n", - "\n", - "dataset_path = \"https://gretel-public-website.s3-us-west-2.amazonaws.com/datasets/customer_finance_data.csv\"\n", - "training_df = pd.read_csv(dataset_path)\n", - "training_df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pathlib import Path\n", - "\n", - "checkpoint_dir = str(Path.cwd() / \"checkpoints-series\")\n", - "\n", - "config_template = {\n", - " \"checkpoint_dir\": checkpoint_dir,\n", - " \"vocab_size\": 20000,\n", - " \"overwrite\": True\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Rw77l2Vg8nWl" - }, - "outputs": [], - "source": [ - "# Capture transient import errors in Google Colab\n", - "\n", - "try:\n", - " from gretel_helpers.series_models import SeriesModel\n", - "except FileNotFoundError:\n", - " from gretel_helpers.series_models import SeriesModel" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# When synthesizing series data we will use partial rows from the training data. Before creating the model, we can analyze\n", - "# the desired partial rows for any suggestions on how to pre-process the training data.\n", - "#\n", - "# In this example we want to use the table's primary key, gender, and age as partial rows, and let Gretel synthesize\n", - "# the rest of the data.\n", - "\n", - "seed_columns = [\"client_id\", \"age\", \"gender\"]\n", - "\n", - "print(\n", - " SeriesModel.get_suggestions(training_df=training_df, seed_columns=seed_columns)\n", - ")\n", - "\n", - "# If we look at this output, we'll see that \"disp_id\" is also fully unique across the table. If this column still\n", - "# needs to be unique when synthesized, we recommend adding it to the seed column list. Otherwise, if it's not really\n", - "# important for your downstream use cases, let's say ML modeling, then we can always remove it." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "CCW-JaiNczHf" - }, - "outputs": [], - "source": [ - "# Create a model, train, and generate a new DataFrame\n", - "\n", - "try:\n", - " training_df.drop(\"disp_id\", axis=\"columns\", inplace=True)\n", - "except KeyError:\n", - " pass # incase we already dropped it\n", - "\n", - "# Params for SeriesModel:\n", - "# - training_df: Your training DataFrame\n", - "# - seed_columns: A list of columns for which you want to use the original dataset values for\n", - "# - synthetic_config: The usual synthetic data configuration\n", - "# - auto_seed_corr: If enabled, automatically update the seed columns with other correlated fields. This will\n", - "# potentially add new columns to the seed list.\n", - "#\n", - "\n", - "# Create a model, train, and generate a new DataFrame\n", - "\n", - "model = SeriesModel(\n", - " training_df=training_df,\n", - " seed_columns=seed_columns,\n", - " synthetic_config=config_template,\n", - " auto_seed_corr=True\n", - ").train().generate()\n", - "\n", - "synthetic_df = model.df\n", - "model.generate_report()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Take a peak at the synthetic data\n", - "\n", - "synthetic_df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zX8qsizqczHg" - }, - "outputs": [], - "source": [ - "# Generate report that shows the statistical performance between the training and synthetic data\n", - "import IPython\n", - "\n", - "report_path = './gretel_report.html'\n", - "IPython.display.HTML(filename=report_path)" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "name": "blueprint.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "UTRxpSlaczHY" + }, + "source": [ + "# Create synthetic data from partial rows\n", + "\n", + "This blueprint utilizes Gretel's premium SDKs to create a synthetic version of your own data. This blueprint uses\n", + "a helper model known as a `SeriesModel`. Gretel uses a feature known as \"smart seeding\" that will generate rows based on partial values from your training data. This is useful when you want to utilize unique column values as input to the model and let Gretel synthesize the rest of the row for you.\n", + "\n", + "Use Cases for Series Data Synthesis:\n", + "\n", + "- Create synthetic data that has the same number of rows as the training data\n", + "- You want to preserve some of the original row data (primary keys, dates, important categorical data).\n", + "\n", + "Essentially this model will let you just take partial rows from the training data, and synthesize the rest of\n", + "the rows for you.\n", + "\n", + "In the example below, we'll use a combination of a primary key and a couple of categorical fields as seed input." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "VEM6kjRsczHd" + }, + "source": [ + "%%capture\n", + "\n", + "!pip install -U gretel-client gretel-synthetics pandas" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "ZQ-TmAdwczHd" + }, + "source": [ + "# Load your Gretel API key. You can acquire this from the Gretel Console \n", + "# @ https://console.gretel.cloud\n", + "\n", + "import pandas as pd\n", + "from gretel_client import get_cloud_client\n", + "\n", + "pd.set_option('max_colwidth', None)\n", + "\n", + "client = get_cloud_client(prefix=\"api\", api_key=\"prompt\")\n", + "client.install_packages(version=\"dev\")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "YMg9nX6SczHe" + }, + "source": [ + "# Load and preview dataset\n", + "\n", + "import pandas as pd\n", + "\n", + "dataset_path = \"https://gretel-public-website.s3-us-west-2.amazonaws.com/datasets/customer_finance_data.csv\"\n", + "training_df = pd.read_csv(dataset_path)\n", + "training_df" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "h9vNTKPxVAln" + }, + "source": [ + "from pathlib import Path\n", + "\n", + "checkpoint_dir = str(Path.cwd() / \"checkpoints-series\")\n", + "\n", + "config_template = {\n", + " \"checkpoint_dir\": checkpoint_dir,\n", + " \"vocab_size\": 20000,\n", + " \"overwrite\": True,\n", + " \"rnn_units\": 1024,\n", + " \"embedding_dim\": 256,\n", + " \"batch_size\": 64,\n", + " \"learning_rate\": 0.001\n", + "}" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "Rw77l2Vg8nWl" + }, + "source": [ + "# Capture transient import errors in Google Colab\n", + "\n", + "try:\n", + " from gretel_helpers.series_models import SeriesModel\n", + "except FileNotFoundError:\n", + " from gretel_helpers.series_models import SeriesModel" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "n7-mhKWaVAlo" + }, + "source": [ + "# When synthesizing series data we will use partial rows from the training data. Before creating the model, we can analyze\n", + "# the desired partial rows for any suggestions on how to pre-process the training data.\n", + "#\n", + "# In this example we want to use the table's primary key, gender, and age as partial rows, and let Gretel synthesize\n", + "# the rest of the data.\n", + "\n", + "seed_columns = [\"client_id\", \"age\", \"gender\"]\n", + "\n", + "print(\n", + " SeriesModel.get_suggestions(training_df=training_df, seed_columns=seed_columns)\n", + ")\n", + "\n", + "# If we look at this output, we'll see that \"disp_id\" is also fully unique across the table. If this column still\n", + "# needs to be unique when synthesized, we recommend adding it to the seed column list. Otherwise, if it's not really\n", + "# important for your downstream use cases, let's say ML modeling, then we can always remove it." + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "CCW-JaiNczHf" + }, + "source": [ + "# Create a model, train, and generate a new DataFrame\n", + "\n", + "try:\n", + " training_df.drop(\"disp_id\", axis=\"columns\", inplace=True)\n", + "except KeyError:\n", + " pass # incase we already dropped it\n", + "\n", + "# Params for SeriesModel:\n", + "# - training_df: Your training DataFrame\n", + "# - seed_columns: A list of columns for which you want to use the original dataset values for\n", + "# - synthetic_config: The usual synthetic data configuration\n", + "# - auto_seed_corr: If enabled, automatically update the seed columns with other correlated fields. This will\n", + "# potentially add new columns to the seed list.\n", + "# - column_cluster_size: The count of columns to cluster per training batch.\n", + "# Use defaults for more complex data. Higher values may give\n", + "# stronger correlations across the dataset, at the cost\n", + "# of model accuracy. \n", + "\n", + "# Create a model, train, and generate a new DataFrame\n", + "\n", + "model = SeriesModel(\n", + " training_df=training_df,\n", + " seed_columns=seed_columns,\n", + " synthetic_config=config_template,\n", + " auto_seed_corr=True\n", + ")\n", + "\n", + "model.train()\n", + "model.generate(max_invalid=1e5)\n", + "\n", + "synthetic_df = model.df\n", + "model.generate_report()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "MU-6ZsvKVAlp" + }, + "source": [ + "# Take a peek at the synthetic data\n", + "\n", + "synthetic_df" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "zX8qsizqczHg" + }, + "source": [ + "# Generate report that shows the statistical performance between the training and synthetic data\n", + "import IPython\n", + "\n", + "report_path = './gretel_report.html'\n", + "IPython.display.HTML(filename=report_path)" + ], + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/gretel/create_synthetic_data_smart_seeding/manifest.json b/gretel/create_synthetic_data_smart_seeding/manifest.json index 168732be..7e2dfd15 100644 --- a/gretel/create_synthetic_data_smart_seeding/manifest.json +++ b/gretel/create_synthetic_data_smart_seeding/manifest.json @@ -1,6 +1,6 @@ { - "name": "Seed a synthetic model", - "description": "Seed a model with data you wish to retain.", + "name": "Smart-seed a synthetic model", + "description": "Seed a model with columns you wish to retain.", "tags": ["synthetic-data", "ai"], "sample_data_key": null, "language": "python", From e6c35da4c18569892aca2b75aa297e0b82219a3e Mon Sep 17 00:00:00 2001 From: Alexander Watson Date: Wed, 14 Apr 2021 14:10:42 -0700 Subject: [PATCH 2/2] remove dev and extra docs --- gretel/create_synthetic_data_smart_seeding/blueprint.ipynb | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/gretel/create_synthetic_data_smart_seeding/blueprint.ipynb b/gretel/create_synthetic_data_smart_seeding/blueprint.ipynb index c9c38c90..acf558e2 100644 --- a/gretel/create_synthetic_data_smart_seeding/blueprint.ipynb +++ b/gretel/create_synthetic_data_smart_seeding/blueprint.ipynb @@ -77,7 +77,7 @@ "pd.set_option('max_colwidth', None)\n", "\n", "client = get_cloud_client(prefix=\"api\", api_key=\"prompt\")\n", - "client.install_packages(version=\"dev\")" + "client.install_packages()" ], "execution_count": null, "outputs": [] @@ -182,10 +182,6 @@ "# - synthetic_config: The usual synthetic data configuration\n", "# - auto_seed_corr: If enabled, automatically update the seed columns with other correlated fields. This will\n", "# potentially add new columns to the seed list.\n", - "# - column_cluster_size: The count of columns to cluster per training batch.\n", - "# Use defaults for more complex data. Higher values may give\n", - "# stronger correlations across the dataset, at the cost\n", - "# of model accuracy. \n", "\n", "# Create a model, train, and generate a new DataFrame\n", "\n",