From 38d64150c956daae08a33ce40aa023d881d3f644 Mon Sep 17 00:00:00 2001 From: araitats <34896074+araitats@users.noreply.github.com> Date: Fri, 23 Apr 2021 14:33:42 -0700 Subject: [PATCH] update to autogluon 0.1.0 #3 (#2132) * autogluon-tabular autogluon==0.1.0 * Delete bank-additional dir Delete bank-additional dir * Delete X_test.csv Delete X_test.csv * Delete test.csv Delete test.csv * Delete train.csv Delete train.csv * Delete tmp dir Delete tmp dir * updated to 0.1.0 updated to 0.1.0 * update to 0.1.0 update to 0.1.0 * update pip update pip * update pip update pip * Update ipynb Update ipynb * Update train.py Update train.py * Delete train.py Delete train.py * Rename to train.py Rename to train.py * Update train.py Extract column info : target. * Retry Retry : ipynb is updated. * train.py train.py is updated. * train.py updated init_args * ipynb updated init_args * Update ipynb Update ipynb * ipynb updated model_uri = estimator.model_data * Deleted ipynb Deleted ipynb * Updated ipynb model_uri = estimator.model_data * train.py updated train.py updated * ipynb updated ipynb updated * train.py updated train.py updated * Add files via upload * Add files via upload * Add files via upload * Add files via upload * Update Dockerfile.inference * Update Dockerfile.training * Add files via upload * Add files via upload Add SageMaker Clarify. Add various ways of building containers. * Add files via upload Add shap - SageMaker Clarify. 
Add sm-docker tutorial * Add files via upload add ipywidgets Co-authored-by: EC2 Default User --- AutoGluon_Tabular_SageMaker.ipynb | 930 ++++++++++++++++++ .../AutoGluon_Tabular_SageMaker.ipynb | 311 +++++- .../container-inference/Dockerfile.inference | 3 +- .../container-training/Dockerfile.training | 3 +- .../container-training/inference.py | 28 +- .../container-training/train.py | 28 +- 6 files changed, 1272 insertions(+), 31 deletions(-) create mode 100644 AutoGluon_Tabular_SageMaker.ipynb diff --git a/AutoGluon_Tabular_SageMaker.ipynb b/AutoGluon_Tabular_SageMaker.ipynb new file mode 100644 index 0000000000..b671ba5085 --- /dev/null +++ b/AutoGluon_Tabular_SageMaker.ipynb @@ -0,0 +1,930 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "# AutoGluon Tabular with SageMaker\n", + "\n", + "[AutoGluon](https://github.com/awslabs/autogluon) automates machine learning tasks enabling you to easily achieve strong predictive performance in your applications. With just a few lines of code, you can train and deploy high-accuracy deep learning models on tabular, image, and text data.\n", + "This notebook shows how to use AutoGluon-Tabular with Amazon SageMaker by creating custom containers." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "## Prerequisites\n", + "\n", + "If using a SageMaker hosted notebook, select kernel `conda_mxnet_p36`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "import subprocess\n", + "# Make sure docker compose is set up properly for local mode\n", + "subprocess.run(\"./setup.sh\", shell=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# For Studio\n", + "subprocess.run(\"apt-get update -y\", shell=True)\n", + "subprocess.run(\"apt install unzip\", shell=True)\n", + "subprocess.run(\"pip install ipywidgets\", shell=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import boto3\n", + "import sagemaker\n", + "from time import sleep\n", + "from collections import Counter\n", + "import numpy as np\n", + "import pandas as pd\n", + "from sagemaker import get_execution_role, local, Model, utils, s3\n", + "from sagemaker.estimator import Estimator\n", + "from sagemaker.predictor import Predictor\n", + "from sagemaker.serializers import CSVSerializer\n", + "from sagemaker.deserializers import StringDeserializer\n", + "from sklearn.metrics import accuracy_score, classification_report\n", + "from IPython.core.display import display, HTML\n", + "from IPython.core.interactiveshell import InteractiveShell\n", + "\n", + "# Print settings\n", + "InteractiveShell.ast_node_interactivity = \"all\"\n", + "pd.set_option('display.max_columns', 500)\n", + "pd.set_option('display.max_rows', 10)\n", + "\n", + "# Account/s3 setup\n", + "session = sagemaker.Session()\n", + "local_session = local.LocalSession()\n", + "bucket = session.default_bucket()\n", + "prefix = 'sagemaker/autogluon-tabular'\n", + "region = session.boto_region_name\n", + "role = get_execution_role()\n", + "client = session.boto_session.client(\n", + " \"sts\", region_name=region, endpoint_url=utils.sts_regional_endpoint(region)\n", + " 
)\n", + "account = client.get_caller_identity()['Account']\n", + "\n", + "registry_uri_training = sagemaker.image_uris.retrieve('mxnet', region, version= '1.7.0', py_version='py3', instance_type='ml.m5.2xlarge', image_scope='training')\n", + "registry_uri_inference = sagemaker.image_uris.retrieve('mxnet', region, version= '1.7.0', py_version='py3', instance_type='ml.m5.2xlarge', image_scope='inference')\n", + "ecr_uri_prefix = account +'.'+'.'.join(registry_uri_training.split('/')[0].split('.')[1:])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "### Build docker images" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "Build the training/inference image and push to ECR" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "training_algorithm_name = 'autogluon-sagemaker-training'\n", + "inference_algorithm_name = 'autogluon-sagemaker-inference'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, you may want to remove existing docker images to make room to build the AutoGluon containers."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "subprocess.run(\"docker system prune -af\", shell=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false", + "scrolled": true + }, + "outputs": [], + "source": [ + "subprocess.run(f\"/bin/bash ./container-training/build_push_training.sh {account} {region} {training_algorithm_name} {ecr_uri_prefix} {registry_uri_training.split('/')[0].split('.')[0]} {registry_uri_training}\", shell=True)\n", + "subprocess.run(\"docker system prune -af\", shell=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "subprocess.run(f\"/bin/bash ./container-inference/build_push_inference.sh {account} {region} {inference_algorithm_name} {ecr_uri_prefix} {registry_uri_training.split('/')[0].split('.')[0]} {registry_uri_inference}\", shell=True)\n", + "subprocess.run(\"docker system prune -af\", shell=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Alternative way of building docker images using sm-docker" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The new Amazon SageMaker Studio Image Build convenience package allows data scientists and developers to easily build custom container images from your Studio notebooks via a new CLI. \n", + "Newly built Docker images are tagged and pushed to Amazon ECR. \n", + "\n", + "To use the CLI, you need to ensure the Amazon SageMaker execution role used by your Studio notebook environment (or another AWS Identity and Access Management (IAM) role, if you prefer) has the required permissions to interact with the resources used by the CLI, including access to CodeBuild and Amazon ECR. Your role should have a trust policy with CodeBuild. 
\n", + "\n", + "You also need to make sure the appropriate permissions are included in your role to run the build in CodeBuild, create a repository in Amazon ECR, and push images to that repository. \n", + "\n", + "See also: https://aws.amazon.com/blogs/machine-learning/using-the-amazon-sagemaker-studio-image-build-cli-to-build-container-images-from-your-studio-notebooks/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "###subprocess.run(\"pip install sagemaker-studio-image-build\", shell=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "'''\n", + "training_repo_name = training_algorithm_name + ':latest'\n", + "training_repo_name \n", + "\n", + "!sm-docker build . --repository {training_repo_name} \\\n", + "--file ./container-training/Dockerfile.training --build-arg REGISTRY_URI={registry_uri_training}\n", + "\n", + "inference_repo_name = inference_algorithm_name + ':latest'\n", + "inference_repo_name \n", + "\n", + "!sm-docker build . --repository {inference_repo_name} \\\n", + "--file ./container-inference/Dockerfile.inference --build-arg REGISTRY_URI={registry_uri_inference}\n", + "\n", + "'''\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "### Get the data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "In this example we'll use the direct-marketing dataset to build a binary classification model that predicts whether customers will accept or decline a marketing offer. \n", + "First we'll download the data and split it into train and test sets. AutoGluon does not require a separate validation set (it uses bagged k-fold cross-validation)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "# Download and unzip the data\n", + "subprocess.run(f\"aws s3 cp --region {region} s3://sagemaker-sample-data-{region}/autopilot/direct_marketing/bank-additional.zip .\", shell=True)\n", + "subprocess.run(\"unzip -qq -o bank-additional.zip\", shell=True)\n", + "subprocess.run(\"rm bank-additional.zip\", shell=True)\n", + "\n", + "local_data_path = './bank-additional/bank-additional-full.csv'\n", + "data = pd.read_csv(local_data_path)\n", + "\n", + "# Split train/test data\n", + "train = data.sample(frac=0.7, random_state=42)\n", + "test = data.drop(train.index)\n", + "\n", + "# Split test X/y\n", + "label = 'y'\n", + "y_test = test[label]\n", + "X_test = test.drop(columns=[label])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "##### Check the data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "train.head(3)\n", + "train.shape\n", + "\n", + "test.head(3)\n", + "test.shape\n", + "\n", + "X_test.head(3)\n", + "X_test.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "Upload the data to s3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "train_file = 'train.csv'\n", + "train.to_csv(train_file,index=False)\n", + "train_s3_path = session.upload_data(train_file, key_prefix='{}/data'.format(prefix))\n", + "\n", + "test_file = 'test.csv'\n", + "test.to_csv(test_file,index=False)\n", + "test_s3_path = session.upload_data(test_file, key_prefix='{}/data'.format(prefix))\n", + "\n", + "X_test_file = 'X_test.csv'\n", + "X_test.to_csv(X_test_file,index=False)\n", + "X_test_s3_path = session.upload_data(X_test_file, 
key_prefix='{}/data'.format(prefix))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "## Hyperparameter Selection\n", + "\n", + "The minimum required setting for training is just a target label, `init_args['label']`.\n", + "\n", + "Additional optional hyperparameters can be passed to the `autogluon.tabular.TabularPredictor.fit` function via `fit_args`.\n", + "\n", + "Below is a more in-depth example of AutoGluon-Tabular hyperparameters from the example [Predicting Columns in a Table - In Depth](https://auto.gluon.ai/stable/tutorials/tabular_prediction/tabular-indepth.html). Please see [fit parameters](https://auto.gluon.ai/stable/_modules/autogluon/tabular/predictor/predictor.html#TabularPredictor) for further information. Note that in order for hyperparameter ranges to work in SageMaker, values passed to the `fit_args['hyperparameters']` must be represented as strings.\n", + "\n", + "```python\n", + "nn_options = {\n", + " 'num_epochs': \"10\",\n", + " 'learning_rate': \"ag.space.Real(1e-4, 1e-2, default=5e-4, log=True)\",\n", + " 'activation': \"ag.space.Categorical('relu', 'softrelu', 'tanh')\",\n", + " 'layers': \"ag.space.Categorical([100],[1000],[200,100],[300,200,100])\",\n", + " 'dropout_prob': \"ag.space.Real(0.0, 0.5, default=0.1)\"\n", + "}\n", + "\n", + "gbm_options = {\n", + " 'num_boost_round': \"100\",\n", + " 'num_leaves': \"ag.space.Int(lower=26, upper=66, default=36)\"\n", + "}\n", + "\n", + "model_hps = {'NN': nn_options, 'GBM': gbm_options} \n", + "\n", + "init_args = {\n", + " 'eval_metric' : 'roc_auc',\n", + " 'label': 'y'\n", + "}\n", + "\n", + "fit_args = {\n", + " 'presets': ['best_quality', 'optimize_for_deployment'],\n", + " 'time_limits': 60*10,\n", + " 'hyperparameters': model_hps,\n", + " 'hyperparameter_tune': True,\n", + " 'search_strategy': 'skopt'\n", + "}\n", + "\n", + "\n", + "hyperparameters = {\n", + " 'init_args': init_args, 'fit_args': fit_args,\n", + " 'feature_importance': True\n", + "}\n",
+ "```\n", + "**Note:** Your hyperparameter choices may affect the size of the model package, which could result in additional time taken to upload your model and complete training. Including `'optimize_for_deployment'` in the list of `fit_args['presets']` is recommended to greatly reduce upload times.\n", + "\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "# Define required label and optional additional parameters\n", + "init_args = {\n", + " 'label': 'y'\n", + "}\n", + "\n", + "# Define additional parameters\n", + "fit_args = {\n", + " # Adding 'best_quality' to presets list will result in better performance (but longer runtime)\n", + " 'presets': ['optimize_for_deployment'],\n", + "}\n", + "\n", + "# Pass fit_args to SageMaker estimator hyperparameters\n", + "hyperparameters = {\n", + " 'init_args': init_args, \n", + " 'fit_args': fit_args,\n", + " 'feature_importance': True\n", + "}\n", + "\n", + "tags = [{\n", + " 'Key' : 'AlgorithmName',\n", + " 'Value' : 'AutoGluon-Tabular'\n", + "}]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "## Train\n", + "\n", + "For local training set `train_instance_type` to `local` . \n", + "For non-local training the recommended instance type is `ml.m5.2xlarge`. \n", + "\n", + "**Note:** Depending on how many underlying models are trained, `train_volume_size` may need to be increased so that they all fit on disk." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "instance_type = 'ml.m5.2xlarge'\n", + "#instance_type = 'local'\n", + "\n", + "ecr_image = f'{ecr_uri_prefix}/{training_algorithm_name}:latest'\n", + "\n", + "estimator = Estimator(image_uri=ecr_image,\n", + " role=role,\n", + " instance_count=1,\n", + " instance_type=instance_type,\n", + " hyperparameters=hyperparameters,\n", + " volume_size=100,\n", + " tags=tags)\n", + "\n", + "# Set inputs. 
Test data is optional, but requires a label column.\n", + "inputs = {'training': train_s3_path, 'testing': test_s3_path}\n", + "\n", + "estimator.fit(inputs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Review the performance of the trained model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from utils.ag_utils import launch_viewer\n", + "\n", + "launch_viewer(is_debug=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "### Create Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "# Create predictor object\n", + "class AutoGluonTabularPredictor(Predictor):\n", + " def __init__(self, *args, **kwargs):\n", + " super().__init__(*args, \n", + " serializer=CSVSerializer(), \n", + " deserializer=StringDeserializer(), **kwargs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "ecr_image = f'{ecr_uri_prefix}/{inference_algorithm_name}:latest'\n", + "\n", + "if instance_type == 'local':\n", + " model = estimator.create_model(image_uri=ecr_image, role=role)\n", + "else:\n", + " #model_uri = os.path.join(estimator.output_path, estimator._current_job_name, \"output\", \"model.tar.gz\")\n", + " model_uri = estimator.model_data\n", + " model = Model(ecr_image, model_data=model_uri, role=role, sagemaker_session=session, predictor_cls=AutoGluonTabularPredictor)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "### Batch Transform" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "For local mode, either `s3:////output/` or `file:///` can be used as outputs.\n", + "\n", + "By including the label column in the test data, you can also 
evaluate prediction performance (In this case, passing `test_s3_path` instead of `X_test_s3_path`)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false", + "scrolled": true + }, + "outputs": [], + "source": [ + "output_path = f's3://{bucket}/{prefix}/output/'\n", + "# output_path = f'file://{os.getcwd()}'\n", + "\n", + "transformer = model.transformer(instance_count=1, \n", + " instance_type=instance_type,\n", + " strategy='MultiRecord',\n", + " max_payload=6,\n", + " max_concurrent_transforms=1, \n", + " output_path=output_path)\n", + "\n", + "transformer.transform(test_s3_path, content_type='text/csv', split_type='Line')\n", + "transformer.wait()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "### Endpoint" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "##### Deploy remote or local endpoint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "instance_type = 'ml.m5.2xlarge'\n", + "#instance_type = 'local'\n", + "\n", + "predictor = model.deploy(initial_instance_count=1, \n", + " instance_type=instance_type)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "##### Attach to endpoint (or reattach if kernel was restarted)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "# Select standard or local session based on instance_type\n", + "if instance_type == 'local': \n", + " sess = local_session\n", + "else: \n", + " sess = session\n", + "\n", + "# Attach to endpoint\n", + "predictor = AutoGluonTabularPredictor(predictor.endpoint_name, sagemaker_session=sess)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "##### Predict on unlabeled 
test data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "results = predictor.predict(X_test.to_csv(index=False)).splitlines()\n", + "\n", + "# Check output\n", + "threshold = 0.5\n", + "y_results = np.array(['yes' if float(i.split(\",\")[1]) > threshold else 'no' for i in results])\n", + "\n", + "print(Counter(y_results))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "##### Predict on data that includes label column \n", + "Prediction performance metrics will be printed to endpoint logs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "results = predictor.predict(test.to_csv(index=False)).splitlines()\n", + "\n", + "# Check output\n", + "threshold = 0.5\n", + "y_results = np.array(['yes' if float(i.split(\",\")[1]) > threshold else 'no' for i in results])\n", + "\n", + "print(Counter(y_results))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "##### Check that classification performance metrics match evaluation printed to endpoint logs as expected" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "threshold = 0.5\n", + "y_results = np.array(['yes' if float(i.split(\",\")[1]) > threshold else 'no' for i in results])\n", + "\n", + "print(\"accuracy: {}\".format(accuracy_score(y_true=y_test, y_pred=y_results)))\n", + "print(classification_report(y_true=y_test, y_pred=y_results, digits=6))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "##### Clean up endpoint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "predictor.delete_endpoint()" + ] + }, 
+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explainability with Amazon SageMaker Clarify\n", + "\n", + "There are growing business needs and legislative regulations that require explanations of why a model made a certain decision. SHAP (SHapley Additive exPlanations) is an approach to explain the output of machine learning models. SHAP values represent a feature's contribution to a change in the model output. SageMaker Clarify uses SHAP to explain the contribution that each input feature makes to the final decision." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Set parameters for SHAP calculation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "seed = 0\n", + "num_rows = 500\n", + "\n", + "#Write a csv file used by SageMaker Clarify\n", + "test_explainavility_file = 'test_explainavility.csv'\n", + "train.head(num_rows).to_csv(test_explainavility_file, index=False, header=False)\n", + "test_explainavility_s3_path = session.upload_data(test_explainavility_file, key_prefix='{}/data'.format(prefix))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Specify computing resources" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker import clarify\n", + "\n", + "model_name = estimator.latest_training_job.job_name\n", + "container_def = model.prepare_container_def()\n", + "session.create_model(model_name,\n", + " role,\n", + " container_def)\n", + "\n", + "clarify_processor = clarify.SageMakerClarifyProcessor(role=role,\n", + " instance_count=1,\n", + " instance_type='ml.c4.xlarge',\n", + " sagemaker_session=session)\n", + "model_config = clarify.ModelConfig(model_name=model_name,\n", + " instance_type='ml.c5.xlarge',\n", + " instance_count=1,\n", + " accept_type='text/csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, +
"source": [ + "##### Run a SageMaker Clarify job" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "shap_config = clarify.SHAPConfig(baseline=X_test.sample(15, random_state=seed).values.tolist(),\n", + " num_samples=100,\n", + " agg_method='mean_abs')\n", + "\n", + "explainability_output_path = 's3://{}/{}/{}/clarify-explainability'.format(bucket, prefix, model_name)\n", + "explainability_data_config = clarify.DataConfig(s3_data_input_path=test_explainavility_s3_path,\n", + " s3_output_path=explainability_output_path,\n", + " label='y',\n", + " headers=train.columns.to_list(),\n", + " dataset_type='text/csv')\n", + "\n", + "predictions_config = clarify.ModelPredictedLabelConfig(probability_threshold=0.5)\n", + "\n", + "clarify_processor.run_explainability(data_config=explainability_data_config,\n", + " model_config=model_config,\n", + " explainability_config=shap_config)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### View the Explainability Report" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can view the explainability report in Studio under the experiments tab. If you're not a Studio user yet, as with the Bias Report, you can access this report at the following S3 bucket." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "subprocess.run(f\"aws s3 cp {explainability_output_path} . --recursive\", shell=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Global explanatory methods allow understanding the model and its feature contributions in aggregate over multiple datapoints. Here we show an aggregate bar plot that plots the mean absolute SHAP value for each feature." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "subprocess.run(f\"{sys.executable} -m pip install shap\", shell=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Compute global SHAP values from out.csv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "shap_values_ = pd.read_csv('explanations_shap/out.csv')\n", + "shap_values_.abs().mean().to_dict()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "num_features = len(train.head(num_rows).drop(['y'], axis = 1).columns)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import shap\n", + "shap_values = [shap_values_.to_numpy()[:,:num_features], shap_values_.to_numpy()[:,num_features:]]\n", + "shap.summary_plot(shap_values, \n", + " plot_type='bar', \n", + " feature_names=train.head(num_rows).drop(['y'], axis = 1).columns.tolist())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The detailed summary plot below can provide more context over the above bar chart. It tells which features are most important and, in addition, their range of effects over the dataset. The color allows us to match how changes in the value of a feature affect the change in prediction. The 'red' indicates higher value of the feature and 'blue' indicates lower (normalized over the features)."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "shap.summary_plot(shap_values_[shap_values_.columns[20:]].to_numpy(), \n", + " train.head(num_rows).drop(['y'], axis = 1))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "instance_type": "ml.t3.medium", + "kernelspec": { + "display_name": "Python 3 (MXNet 1.6 Python 3.6 CPU Optimized)", + "language": "python", + "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-2:429704687514:image/mxnet-1.6-cpu-py36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/advanced_functionality/autogluon-tabular/AutoGluon_Tabular_SageMaker.ipynb b/advanced_functionality/autogluon-tabular/AutoGluon_Tabular_SageMaker.ipynb index 18cb7d2577..10a5abf70c 100644 --- a/advanced_functionality/autogluon-tabular/AutoGluon_Tabular_SageMaker.ipynb +++ b/advanced_functionality/autogluon-tabular/AutoGluon_Tabular_SageMaker.ipynb @@ -31,8 +31,20 @@ }, "outputs": [], "source": [ + "import subprocess\n", "# Make sure docker compose is set up properly for local mode\n", - "!./setup.sh" + "subprocess.run(\"./setup.sh\", shell=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# For Studio\n", + "subprocess.run(\"apt-get update -y\", shell=True)\n", + "subprocess.run(\"apt install unzip\", shell=True)" ] }, { @@ -44,6 +56,7 @@ "outputs": [], "source": [ "import os\n", + "import sys\n", "import boto3\n", "import sagemaker\n", "from time import sleep\n", @@ -111,16 +124,94 @@ "inference_algorithm_name = 'autogluon-sagemaker-inference'" ] }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "First, you may want to remove existing docker images to make a room to build autogluon containers." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "subprocess.run(\"docker system prune -af\", shell=True)" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": { - "Collapsed": "false" + "Collapsed": "false", + "scrolled": true }, "outputs": [], "source": [ - "!/bin/bash ./container-training/build_push_training.sh {account} {region} {training_algorithm_name} {ecr_uri_prefix} {registry_uri_training.split('/')[0].split('.')[0]} {registry_uri_training}\n", - "!/bin/bash ./container-inference/build_push_inference.sh {account} {region} {inference_algorithm_name} {ecr_uri_prefix} {registry_uri_training.split('/')[0].split('.')[0]} {registry_uri_inference}" + "subprocess.run(f\"/bin/bash ./container-training/build_push_training.sh {account} {region} {training_algorithm_name} {ecr_uri_prefix} {registry_uri_training.split('/')[0].split('.')[0]} {registry_uri_training}\", shell=True)\n", + "subprocess.run(\"docker system prune -af\", shell=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "subprocess.run(f\"/bin/bash ./container-inference/build_push_inference.sh {account} {region} {inference_algorithm_name} {ecr_uri_prefix} {registry_uri_training.split('/')[0].split('.')[0]} {registry_uri_inference}\", shell=True)\n", + "subprocess.run(\"docker system prune -af\", shell=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Alternative way of building docker images using sm-docker" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The new Amazon SageMaker Studio Image Build convenience package allows data scientists and developers to easily build custom container images from your Studio notebooks via a new CLI. 
\n", + "Newly built Docker images are tagged and pushed to Amazon ECR. \n", + "\n", + "To use the CLI, you need to ensure the Amazon SageMaker execution role used by your Studio notebook environment (or another AWS Identity and Access Management (IAM) role, if you prefer) has the required permissions to interact with the resources used by the CLI, including access to CodeBuild and Amazon ECR. Your role should have a trust policy with CodeBuild. \n", + "\n", + "You also need to make sure the appropriate permissions are included in your role to run the build in CodeBuild, create a repository in Amazon ECR, and push images to that repository. \n", + "\n", + "See also: https://aws.amazon.com/blogs/machine-learning/using-the-amazon-sagemaker-studio-image-build-cli-to-build-container-images-from-your-studio-notebooks/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#subprocess.run(\"pip install sagemaker-studio-image-build\", shell=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "'''\n", + "training_repo_name = training_algorithm_name + ':latest'\n", + "training_repo_name \n", + "\n", + "!sm-docker build . --repository {training_repo_name} \\\n", + "--file ./container-training/Dockerfile.training --build-arg REGISTRY_URI={registry_uri_training}\n", + "\n", + "inference_repo_name = inference_algorithm_name + ':latest'\n", + "inference_repo_name \n", + "\n", + "!sm-docker build . 
--repository {inference_repo_name} \\\n", + "--file ./container-inference/Dockerfile.inference --build-arg REGISTRY_URI={registry_uri_inference}\n", + "'''\n" ] }, { @@ -151,9 +242,9 @@ "outputs": [], "source": [ "# Download and unzip the data\n", - "!aws s3 cp --region {region} s3://sagemaker-sample-data-{region}/autopilot/direct_marketing/bank-additional.zip .\n", - "!unzip -qq -o bank-additional.zip\n", - "!rm bank-additional.zip\n", + "subprocess.run(f\"aws s3 cp --region {region} s3://sagemaker-sample-data-{region}/autopilot/direct_marketing/bank-additional.zip .\", shell=True)\n", + "subprocess.run(\"unzip -qq -o bank-additional.zip\", shell=True)\n", + "subprocess.run(\"rm bank-additional.zip\", shell=True)\n", "\n", "local_data_path = './bank-additional/bank-additional-full.csv'\n", "data = pd.read_csv(local_data_path)\n", @@ -294,14 +385,13 @@ "\n", "# Define additional parameters\n", "fit_args = {\n", - " 'label': 'y',\n", " # Adding 'best_quality' to presets list will result in better performance (but longer runtime)\n", " 'presets': ['optimize_for_deployment'],\n", "}\n", "\n", "# Pass fit_args to SageMaker estimator hyperparameters\n", "hyperparameters = {\n", - "# 'init_args': init_args, \n", + " 'init_args': init_args, \n", " 'fit_args': fit_args,\n", " 'feature_importance': True\n", "}\n", @@ -517,7 +607,7 @@ " sess = session\n", "\n", "# Attach to endpoint\n", - "predictor = AutoGluonTabularPredictor(predictor.endpoint, sagemaker_session=sess)" + "predictor = AutoGluonTabularPredictor(predictor.endpoint_name, sagemaker_session=sess)" ] }, { @@ -540,7 +630,10 @@ "results = predictor.predict(X_test.to_csv(index=False)).splitlines()\n", "\n", "# Check output\n", - "print(Counter(results))" + "threshold = 0.5\n", + "y_results = np.array(['yes' if float(i.split(\",\")[1]) > threshold else 'no' for i in results])\n", + "\n", + "print(Counter(y_results))" ] }, { @@ -564,7 +657,10 @@ "results = predictor.predict(test.to_csv(index=False)).splitlines()\n", 
"\n", "# Check output\n", - "print(Counter(results))" + "threshold = 0.5\n", + "y_results = np.array(['yes' if float(i.split(\",\")[1]) > threshold else 'no' for i in results])\n", + "\n", + "print(Counter(y_results))" ] }, { @@ -584,7 +680,8 @@ }, "outputs": [], "source": [ - "y_results = np.array(results)\n", + "threshold = 0.5\n", + "y_results = np.array(['yes' if float(i.split(\",\")[1]) > threshold else 'no' for i in results])\n", "\n", "print(\"accuracy: {}\".format(accuracy_score(y_true=y_test, y_pred=y_results)))\n", "print(classification_report(y_true=y_test, y_pred=y_results, digits=6))" @@ -609,6 +706,194 @@ "source": [ "predictor.delete_endpoint()" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explainability with Amazon SageMaker Clarify\n", + "\n", + "There are growing business needs and legislative regulations that require explanations of why a model made a certain decision. SHAP (SHapley Additive exPlanations) is an approach to explain the output of machine learning models. SHAP values represent a feature's contribution to a change in the model output. SageMaker Clarify uses SHAP to explain the contribution that each input feature makes to the final decision." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Set parameters for SHAP calculation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "seed = 0\n", + "num_rows = 500\n", + "\n", + "#Write a csv file used by SageMaker Clarify\n", + "test_explainavility_file = 'test_explainavility.csv'\n", + "train.head(num_rows).to_csv(test_explainavility_file, index=False, header=False)\n", + "test_explainavility_s3_path = session.upload_data(test_explainavility_file, key_prefix='{}/data'.format(prefix))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Specify computing resources" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker import clarify\n", + "\n", + "model_name = estimator.latest_training_job.job_name\n", + "container_def = model.prepare_container_def()\n", + "session.create_model(model_name,\n", + " role,\n", + " container_def)\n", + "\n", + "clarify_processor = clarify.SageMakerClarifyProcessor(role=role,\n", + " instance_count=1,\n", + " instance_type='ml.c4.xlarge',\n", + " sagemaker_session=session)\n", + "model_config = clarify.ModelConfig(model_name=model_name,\n", + " instance_type='ml.c5.xlarge',\n", + " instance_count=1,\n", + " accept_type='text/csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Run a SageMaker Clarify job" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "shap_config = clarify.SHAPConfig(baseline=X_test.sample(15, random_state=seed).values.tolist(),\n", + " num_samples=100,\n", + " agg_method='mean_abs')\n", + "\n", + "explainability_output_path = 's3://{}/{}/{}/clarify-explainability'.format(bucket, prefix, model_name)\n", + "explainability_data_config = clarify.DataConfig(s3_data_input_path=test_explainavility_s3_path,\n", + " 
s3_output_path=explainability_output_path,\n", + " label='y',\n", + " headers=train.columns.to_list(),\n", + " dataset_type='text/csv')\n", + "\n", + "predictions_config = clarify.ModelPredictedLabelConfig(probability_threshold=0.5)\n", + "\n", + "clarify_processor.run_explainability(data_config=explainability_data_config,\n", + " model_config=model_config,\n", + " explainability_config=shap_config)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### View the Explainability Report" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can view the explainability report in Studio under the experiments tab. If you're not a Studio user yet, as with the Bias Report, you can access this report at the following S3 bucket." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "subprocess.run(f\"aws s3 cp {explainability_output_path} . --recursive\", shell=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Global explanatory methods allow understanding the model and its feature contributions in aggregate over multiple datapoints. Here we show an aggregate bar plot that plots the mean absolute SHAP value for each feature." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "subprocess.run(f\"{sys.executable} -m pip install shap\", shell=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Compute global shap values out of out.csv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "shap_values_ = pd.read_csv('explanations_shap/out.csv')\n", + "shap_values_.abs().mean().to_dict()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "num_features = len(train.head(num_rows).drop(['y'], axis = 1).columns)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import shap\n", + "shap_values = [shap_values_.to_numpy()[:,:num_features], shap_values_.to_numpy()[:,num_features:]]\n", + "shap.summary_plot(shap_values, \n", + " plot_type='bar', \n", + " feature_names=train.head(num_rows).drop(['y'], axis = 1).columns.tolist())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The detailed summary plot below can provide more context over the above bar chart. It tells which features are most important and, in addition, their range of effects over the dataset. The color allows us to match how changes in the value of a feature affect the change in prediction. The 'red' indicates higher value of the feature and 'blue' indicates lower (normalized over the features)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "shap.summary_plot(shap_values_[shap_values_.columns[20:]].to_numpy(), \n", + " train.head(num_rows).drop(['y'], axis = 1))" + ] } ], "metadata": { diff --git a/advanced_functionality/autogluon-tabular/container-inference/Dockerfile.inference b/advanced_functionality/autogluon-tabular/container-inference/Dockerfile.inference index d1f234ab5b..0838f9b03f 100644 --- a/advanced_functionality/autogluon-tabular/container-inference/Dockerfile.inference +++ b/advanced_functionality/autogluon-tabular/container-inference/Dockerfile.inference @@ -3,7 +3,8 @@ FROM ${REGISTRY_URI} RUN pip install -U pip RUN pip install -U wheel setuptools -RUN pip install autogluon==0.1.0 +RUN pip install --no-cache-dir autogluon==0.1.0 + RUN pip install PrettyTable # Defines inference.py as script entrypoint diff --git a/advanced_functionality/autogluon-tabular/container-training/Dockerfile.training b/advanced_functionality/autogluon-tabular/container-training/Dockerfile.training index 3c8a6a2834..e2c9273b52 100644 --- a/advanced_functionality/autogluon-tabular/container-training/Dockerfile.training +++ b/advanced_functionality/autogluon-tabular/container-training/Dockerfile.training @@ -4,7 +4,8 @@ FROM ${REGISTRY_URI} RUN pip install -U pip RUN pip install -U wheel setuptools -RUN pip install autogluon==0.1.0 +RUN pip install --no-cache-dir autogluon==0.1.0 +RUN pip install shap RUN pip install PrettyTable RUN pip install bokeh diff --git a/advanced_functionality/autogluon-tabular/container-training/inference.py b/advanced_functionality/autogluon-tabular/container-training/inference.py index 3a6fa274a5..c4340758f8 100644 --- a/advanced_functionality/autogluon-tabular/container-training/inference.py +++ b/advanced_functionality/autogluon-tabular/container-training/inference.py @@ -77,30 +77,42 @@ def transform_fn(models, data, input_content_type, output_content_type): start = 
timer() net = models[0] column_dict = models[1] + label_map = net.class_labels_internal_map ### # text/csv - if input_content_type == 'text/csv': - + if 'text/csv' in input_content_type: # Load dataset columns = column_dict['columns'] - df = pd.read_csv(StringIO(data), header=None) + + if type(data) == str: + # Load dataset + df = pd.read_csv(StringIO(data), header=None) + else: + df = pd.read_csv(StringIO(data.decode()), header=None) df_preprosessed = preprocess(df, columns, net.label) ds = TabularDataset(data=df_preprosessed) try: - predictions = net.predict(ds) + predictions = net.predict_proba(ds) + predictions_ = net.predict(ds) except: try: - predictions = net.predict(ds.fillna(0.0)) + predictions = net.predict_proba(ds.fillna(0.0)) + predictions_ = net.predict(ds.fillna(0.0)) warnings.warn('Filled NaN\'s with 0.0 in order to predict.') except Exception as e: response_body = e return response_body, output_content_type - + + #threshold = 0.5 + #predictions_label = [[k for k, v in label_map.items() if v == 1][0] if i > threshold else [k for k, v in label_map.items() if v == 0][0] for i in predictions] + predictions_label = predictions_.tolist() + + # Print prediction counts, limit in case of regression problem - pred_counts = Counter(predictions.tolist()) + pred_counts = Counter(predictions_label) n_display_items = 30 if len(pred_counts) > n_display_items: print(f'Top {n_display_items} prediction counts: ' @@ -120,7 +132,7 @@ def transform_fn(models, data, input_content_type, output_content_type): 'Therefore, evaluating prediction performance...') try: performance = net.evaluate_predictions(y_true=ds[target], - y_pred=predictions, + y_pred=np.array(predictions_label), auxiliary_metrics=True) print(json.dumps(performance, indent=4, default=pd.DataFrame.to_json)) time.sleep(0.1) diff --git a/advanced_functionality/autogluon-tabular/container-training/train.py b/advanced_functionality/autogluon-tabular/container-training/train.py index d515547bff..086e800c2e 
100644 --- a/advanced_functionality/autogluon-tabular/container-training/train.py +++ b/advanced_functionality/autogluon-tabular/container-training/train.py @@ -22,6 +22,11 @@ logging.basicConfig(level=logging.DEBUG) logging.info(subprocess.call('ls -lR /opt/ml/input'.split())) + +import shap +import smdebug.mxnet as smd +from smdebug.core.writer import FileWriter + with warnings.catch_warnings(): warnings.filterwarnings('ignore', category=DeprecationWarning) from prettytable import PrettyTable @@ -129,7 +134,7 @@ def train(args): train_data = __load_input_data(args.train) # Extract column info - target = args.fit_args['label'] + target = args.init_args['label'] columns = train_data.columns.tolist() column_dict = {"columns":columns} with open('columns.pkl', 'wb') as f: @@ -137,10 +142,12 @@ def train(args): # Train models - args.fit_args.pop('label', None) + args.init_args['path'] = args.model_dir + #args.fit_args.pop('label', None) predictor = TabularPredictor( - label=target, - path=args.model_dir).fit(train_data, **args.fit_args) + **args.init_args + ).fit(train_data, **args.fit_args) + # Results summary predictor.fit_summary(verbosity=3) @@ -238,10 +245,14 @@ def parse_args(): parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR']) parser.add_argument('--output-dir', type=str, default=os.environ['SM_OUTPUT_DIR']) parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAINING']) + # Arguments to be passed to TabularPredictor() + parser.add_argument('--init_args', type=lambda s: ast.literal_eval(s), + default="{'label': 'y'}", + help='https://auto.gluon.ai/stable/_modules/autogluon/tabular/predictor/predictor.html#TabularPredictor') # Arguments to be passed to task.fit() parser.add_argument('--fit_args', type=lambda s: ast.literal_eval(s), default="{'presets': ['optimize_for_deployment']}", - help='https://autogluon.mxnet.io/api/autogluon.task.html#tabularprediction') + 
help='https://auto.gluon.ai/stable/_modules/autogluon/tabular/predictor/predictor.html#TabularPredictor') # Additional options parser.add_argument('--feature_importance', type='bool', default=True) @@ -251,10 +262,11 @@ def parse_args(): if __name__ == "__main__": start = timer() args = parse_args() - + # Verify label is included - if 'label' not in args.fit_args: - raise ValueError('"label" is a required parameter of "fit_args"!') + if 'label' not in args.init_args: + raise ValueError('"label" is a required parameter of "init_args"!') + # Convert optional fit call hyperparameters from strings if 'hyperparameters' in args.fit_args: