From 5c5f804918f108cbde3e7f60ca50f84c6a852bd1 Mon Sep 17 00:00:00 2001 From: hxgy610 Date: Wed, 18 Jan 2023 10:48:24 -0500 Subject: [PATCH 1/2] distributed training of sagemaker lightgbm using dask --- .../README.md | 1 + ...r-lightgbm-distributed-training-dask.ipynb | 959 ++++++++++++++++++ 2 files changed, 960 insertions(+) create mode 100644 introduction_to_applying_machine_learning/sagemaker_lightgbm_distributed_training_dask/sagemaker-lightgbm-distributed-training-dask.ipynb diff --git a/introduction_to_applying_machine_learning/README.md b/introduction_to_applying_machine_learning/README.md index c32312fa94..6d23dda79e 100644 --- a/introduction_to_applying_machine_learning/README.md +++ b/introduction_to_applying_machine_learning/README.md @@ -4,6 +4,7 @@ These examples provide a gentle introduction to machine learning concepts as they are applied in practical use cases across a variety of sectors. +- [LightGBM_Distributed_Training_Dask](sagemaker_lightgbm_distributed_training_dask) demonstrates the distributed training of Amazon SageMaker's implementation of [LightGBM](https://lightgbm.readthedocs.io/en/latest/) using [Dask](https://www.dask.org/). - [Predicting Customer Churn](xgboost_customer_churn) uses customer interaction and service usage data to find those most likely to churn, and then walks through the cost/benefit trade-offs of providing retention incentives. This uses Amazon SageMaker's implementation of [XGBoost](https://github.com/dmlc/xgboost) to create a highly predictive model. 
- [Predicting Customer Churn](lightgbm_catboost_tabtransformer_autogluon_churn) uses Amazon SageMaker's implementation of [LightGBM](https://lightgbm.readthedocs.io/en/latest/), [CatBoost](https://catboost.ai/), [TabTransformer](https://arxiv.org/abs/2012.06678), and [AutoGluon-Tabular](https://auto.gluon.ai/stable/index.html) with [SageMaker Automatic Model Tuning](https://docs.aws.amazon.com/sagemaker/latest/dg/automatic-model-tuning.html) to create four predictive models on customer churn dataset, and evaluate their performance on the same test data. - [Cancer Prediction](breast_cancer_prediction) predicts Breast Cancer based on features derived from images, using SageMaker's Linear Learner. diff --git a/introduction_to_applying_machine_learning/sagemaker_lightgbm_distributed_training_dask/sagemaker-lightgbm-distributed-training-dask.ipynb b/introduction_to_applying_machine_learning/sagemaker_lightgbm_distributed_training_dask/sagemaker-lightgbm-distributed-training-dask.ipynb new file mode 100644 index 0000000000..9bb1d418ba --- /dev/null +++ b/introduction_to_applying_machine_learning/sagemaker_lightgbm_distributed_training_dask/sagemaker-lightgbm-distributed-training-dask.ipynb @@ -0,0 +1,959 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "270633d3", + "metadata": {}, + "source": [ + "# Amazon SageMaker LightGBM Distributed training using Dask" + ] + }, + { + "cell_type": "markdown", + "id": "bb41dbfb", + "metadata": {}, + "source": [ + "---\n", + "Losing customers is costly for any business. Identifying unhappy customers early on gives you a chance to offer them incentives to stay. This notebook describes using machine learning (ML) for the automated identification of unhappy customers, also known as customer churn prediction. 
ML models rarely give perfect predictions though, so this notebook is also about how to incorporate the relative costs of prediction mistakes when determining the financial outcome of using ML.\n", + "\n", + "This notebook demonstrates the use of distributed training for Amazon SageMaker’s implementation of the [LightGBM](https://lightgbm.readthedocs.io/en/latest/) with Dask.\n", + "\n", + "In this notebook, we demonstrate two use cases:\n", + "\n", + "* How to distributedly train a tabular model using Dask on the customer churn dataset.\n", + "* How to use the trained tabular model to perform inference, i.e., classifying new samples.\n", + "\n", + "\n", + "Note: This notebook was tested in Amazon SageMaker Studio on ml.t3.medium instance with Python 3 (Data Science) kernel.\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "id": "12c6b9ec", + "metadata": {}, + "source": [ + "## 1. Set Up\n", + "\n", + "---\n", + "Before executing the notebook, there are some initial steps required for setup. This notebook requires latest version of sagemaker and ipywidgets.\n", + "\n", + "---" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "da04d4ef", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install sagemaker ipywidgets --upgrade" + ] + }, + { + "cell_type": "markdown", + "id": "cb779656", + "metadata": {}, + "source": [ + "\n", + "---\n", + "To train and host on Amazon SageMaker, we need to setup and authenticate the use of AWS services. Here, we use the execution role associated with the current notebook instance as the AWS account role with SageMaker access. 
It has necessary permissions, including access to your data in S3.\n", + "\n", + "---" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56d15a71", + "metadata": {}, + "outputs": [], + "source": [ + "import sagemaker, boto3, json\n", + "from sagemaker import get_execution_role\n", + "\n", + "aws_role = get_execution_role()\n", + "aws_region = boto3.Session().region_name\n", + "sess = sagemaker.Session()\n", + "\n", + "bucket = sess.default_bucket()\n", + "prefix = \"sagemaker/DEMO-churn-dt\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "adf9b9ed", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import io\n", + "import os\n", + "import sys\n", + "import time\n", + "import json\n", + "from IPython.display import display\n", + "from time import strftime, gmtime\n", + "from sagemaker.inputs import TrainingInput\n", + "from sagemaker.serializers import CSVSerializer\n", + "from sklearn import preprocessing" + ] + }, + { + "cell_type": "markdown", + "id": "40198d95", + "metadata": {}, + "source": [ + "## 2. Data Preparation and Visualization\n", + "\n", + "Mobile operators have historical records on which customers ultimately ended up churning and which continued using the service. We can use this historical information to construct an ML model of one mobile operator’s churn using a process called training. After training the model, we can pass the profile information of an arbitrary customer (the same profile information that we used to train the model) to the model, and have the model predict whether this customer is going to churn. Of course, we expect the model to make mistakes. After all, predicting the future is tricky business! 
But we’ll learn how to deal with prediction errors.\n", + "\n", + "The dataset we use is publicly available and was mentioned in the book [Discovering Knowledge in Data](https://www.amazon.com/dp/0470908742/) by Daniel T. Larose. It is attributed by the author to the University of California Irvine Repository of Machine Learning Datasets. Let’s download and read that dataset in now:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "687a827d", + "metadata": {}, + "outputs": [], + "source": [ + "s3 = boto3.client(\"s3\")\n", + "s3.download_file(f\"sagemaker-sample-files\", \"datasets/tabular/synthetic/churn.txt\", \"churn.txt\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7df8cfb8", + "metadata": {}, + "outputs": [], + "source": [ + "churn = pd.read_csv(\"./churn.txt\")\n", + "pd.set_option(\"display.max_columns\", 500)\n", + "churn.head(5)" + ] + }, + { + "cell_type": "markdown", + "id": "fe5f8234", + "metadata": {}, + "source": [ + "By modern standards, it’s a relatively small dataset, with only 5,000 records, where each record uses 21 attributes to describe the profile of a customer of an unknown US mobile operator. 
The attributes are:\n", + "\n", + "`State`: the US state in which the customer resides, indicated by a two-letter abbreviation; for example, OH or NJ\n", + "\n", + "`Account Length`: the number of days that this account has been active\n", + "\n", + "`Area Code`: the three-digit area code of the corresponding customer’s phone number\n", + "\n", + "`Phone`: the remaining seven-digit phone number\n", + "\n", + "`Int'l Plan`: whether the customer has an international calling plan: yes/no\n", + "\n", + "`VMail Plan`: whether the customer has a voice mail feature: yes/no\n", + "\n", + "`VMail Message`: the average number of voice mail messages per month\n", + "\n", + "`Day Mins`: the total number of calling minutes used during the day\n", + "\n", + "`Day Calls`: the total number of calls placed during the day\n", + "\n", + "`Day Charge`: the billed cost of daytime calls\n", + "\n", + "`Eve Mins`, `Eve Calls`, `Eve Charge`: the minutes used, the number of calls placed, and the billed cost for calls placed during the evening\n", + "\n", + "`Night Mins`, `Night Calls`, `Night Charge`: the minutes used, the number of calls placed, and the billed cost for calls placed during nighttime\n", + "\n", + "`Intl Mins`, `Intl Calls`, `Intl Charge`: the minutes used, the number of calls placed, and the billed cost for international calls\n", + "\n", + "`CustServ Calls`: the number of calls placed to Customer Service\n", + "\n", + "`Churn?`: whether the customer left the service: true/false\n", + "\n", + "The last attribute, `Churn?`, is known as the target attribute: the attribute that we want the ML model to predict. 
Because the target attribute is binary, our model will be performing binary prediction, also known as binary classification.\n", + "\n", + "Let’s begin exploring the data:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a27af5ed", + "metadata": {}, + "outputs": [], + "source": [ + "# Histograms for each numeric features\n", + "display(churn.describe())\n", + "%matplotlib inline\n", + "hist = churn.hist(bins=30, sharey=True, figsize=(10, 10))" + ] + }, + { + "cell_type": "markdown", + "id": "c987d494", + "metadata": {}, + "source": [ + "We can see immediately that: - `State` appears to be quite evenly distributed. - `Phone` takes on too many unique values to be of any practical use. It’s possible that parsing out the prefix could have some value, but without more context on how these are allocated, we should avoid using it. - Most of the numeric features are surprisingly nicely distributed, with many showing bell-like gaussianity. `VMail Message` is a notable exception." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8eb8b330", + "metadata": {}, + "outputs": [], + "source": [ + "churn = churn.drop(\"Phone\", axis=1)\n", + "churn[\"Area Code\"] = churn[\"Area Code\"].astype(object)" + ] + }, + { + "cell_type": "markdown", + "id": "a2ceceb7", + "metadata": {}, + "source": [ + "Next let’s look at the relationship between each of the features and our target variable." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "06492d81", + "metadata": {}, + "outputs": [], + "source": [ + "for column in churn.select_dtypes(include=[\"object\"]).columns:\n", + " if column != \"Churn?\":\n", + " display(pd.crosstab(index=churn[column], columns=churn[\"Churn?\"], normalize=\"columns\"))\n", + "\n", + "for column in churn.select_dtypes(exclude=[\"object\"]).columns:\n", + " print(column)\n", + " hist = churn[[column, \"Churn?\"]].hist(by=\"Churn?\", bins=30)\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "9c35cecc", + "metadata": {}, + "source": [ + "We convert the target attribute to binary value and move it to the first column of the dataset to meet requirements of SageMaker built-in tabular algorithms (For an example, see [SageMaker LightGBM documentation](https://docs.aws.amazon.com/sagemaker/latest/dg/lightgbm.html))." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "565c809d", + "metadata": {}, + "outputs": [], + "source": [ + "churn[\"target\"] = churn[\"Churn?\"].map({\"True.\": 1, \"False.\": 0})\n", + "churn.drop([\"Churn?\"], axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a231510", + "metadata": {}, + "outputs": [], + "source": [ + "churn = churn[[\"target\"] + churn.columns.tolist()[:-1]]" + ] + }, + { + "cell_type": "markdown", + "id": "14a15e08", + "metadata": {}, + "source": [ + "We identify the column indexes of the categorical attribute, which is required by LightGBM, CatBoost, and TabTransformer algorithm (AutoGluon-Tabular has built-in feature engineering to identify the categorical attribute automatically, and thus does not require such input)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd4c11ed", + "metadata": {}, + "outputs": [], + "source": [ + "cat_columns = [\n", + " \"State\",\n", + " \"Account Length\",\n", + " \"Area Code\",\n", + " \"Phone\",\n", + " \"Int'l Plan\",\n", + " \"VMail Plan\",\n", + " \"VMail Message\",\n", + " \"Day Calls\",\n", + " \"Eve Calls\",\n", + " \"Night Calls\",\n", + " \"Intl Calls\",\n", + " \"CustServ Calls\",\n", + "]\n", + "\n", + "cat_idx = []\n", + "for idx, col_name in enumerate(churn.columns.tolist()):\n", + " if col_name in cat_columns:\n", + " cat_idx.append(idx)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "466d84a3", + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"cat_idx.json\", \"w\") as outfile:\n", + " json.dump({\"cat_idx\": cat_idx}, outfile)" + ] + }, + { + "cell_type": "markdown", + "id": "ea910d97", + "metadata": {}, + "source": [ + "[LightGBM official documentation](https://lightgbm.readthedocs.io/en/latest/Advanced-Topics.html#categorical-feature-support) requires that all categorical features should be encoded as non-negative integers. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6bb951cf", + "metadata": {}, + "outputs": [], + "source": [ + "for idx, col_name in enumerate(churn.columns.tolist()):\n", + " if col_name in cat_columns:\n", + " le = preprocessing.LabelEncoder()\n", + " churn[col_name] = le.fit_transform(churn[col_name])" + ] + }, + { + "cell_type": "markdown", + "id": "a5ff6f07", + "metadata": {}, + "source": [ + "We split the churn dataset into train, validation, and test set using stratified sampling. Validation set is used for early stopping and AMT. Test set is used for performance evaluations in the end. Next, we upload them into a S3 path for training.\n", + "\n", + "The structure of the S3 path for training should be structured as below.\n", + "\n", + "* The supported input data format for training is `csv`. 
You can put more than one data file under both the train and validation channels. A data file can have any name as long as it ends with `.csv`.\n", + "* The first column corresponds to the target and the rest of the columns correspond to features. This follows the convention of the [SageMaker XGBoost algorithm](https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html). \n", + "* The `cat_idx.json` file holds the categorical column indexes. It contains a dictionary with a single key-value pair. The key can be any string. The value is the list of column indexes of the categorical features. The index starts at 1, because index 0 corresponds to the target variable. Please see the example above for how to format `cat_idx.json`.\n", + "* For the validation data, we encourage you to include only one data file under its channel so that all of the validation data points are assigned to one machine. The validation score then covers all of the validation data points and can be easily parsed by AMT for hyperparameter optimization.\n", + "* Distributed training currently supports CPU instances only.\n", + "\n", + "-- `train`
\n", + "      -- `data_1.csv`
\n", + "      -- `data_2.csv`
\n", + "      -- `data_3.csv`
\n", + "      -- `cat_idx.json`\n", + " \n", + "-- `validation`
\n", + "      -- `data.csv` \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fede804a", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "train, val_n_test = train_test_split(\n", + " churn, test_size=0.3, random_state=42, stratify=churn[\"target\"]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43d49126", + "metadata": {}, + "outputs": [], + "source": [ + "val, test = train_test_split(\n", + " val_n_test, test_size=0.3, random_state=42, stratify=val_n_test[\"target\"]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3ef7194c", + "metadata": {}, + "outputs": [], + "source": [ + "train.to_csv(\"train.csv\", header=False, index=False)\n", + "val.to_csv(\"validation.csv\", header=False, index=False)\n", + "test.to_csv(\"test.csv\", header=False, index=False)" + ] + }, + { + "cell_type": "markdown", + "id": "f9e4fe64", + "metadata": {}, + "source": [ + "To demonstrate including multiple files under the training channel, we simply duplicate the training data multiple times as shown below." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22bb7a84", + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm import tqdm\n", + "\n", + "for i in tqdm(range(200)):\n", + " boto3.Session().resource(\"s3\").Bucket(bucket).Object(\n", + " os.path.join(prefix, f\"train/data_{i}.csv\")\n", + " ).upload_file(\"train.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a358d0d", + "metadata": {}, + "outputs": [], + "source": [ + "boto3.Session().resource(\"s3\").Bucket(bucket).Object(\n", + " os.path.join(prefix, \"validation/data.csv\")\n", + ").upload_file(\"validation.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "54c33e88", + "metadata": {}, + "outputs": [], + "source": [ + "boto3.Session().resource(\"s3\").Bucket(bucket).Object(\n", + " os.path.join(prefix, \"test/data.csv\")\n", + ").upload_file(\"test.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8c71dcd", + "metadata": {}, + "outputs": [], + "source": [ + "boto3.Session().resource(\"s3\").Bucket(bucket).Object(\n", + " os.path.join(prefix, \"train/cat_idx.json\")\n", + ").upload_file(\"cat_idx.json\")" + ] + }, + { + "cell_type": "markdown", + "id": "e0418e02", + "metadata": {}, + "source": [ + "## 3. Distributedly Train A SageMaker LightGBM Model with AMT" + ] + }, + { + "cell_type": "markdown", + "id": "20512887", + "metadata": {}, + "source": [ + "### 3.1. Retrieve Training Artifacts\n", + "\n", + "___\n", + "\n", + "Here, we retrieve the training docker container, the training algorithm source, and the tabular algorithm. 
Note that `model_version=\"*\"` fetches the latest model.\n", + "\n", + "For the training algorithm, we use LightGBM in this demonstration for the classification task.\n", + "* [LightGBM](https://lightgbm.readthedocs.io/en/latest/): To use this algorithm, specify `train_model_id` as `lightgbm-classification-model` in the cell below.\n", + "\n", + "For a regression task, the `train_model_id` is `lightgbm-regression-model`.\n", + "\n", + "\n", + "---" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8565b382", + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker import image_uris, model_uris, script_uris\n", + "\n", + "train_model_id, train_model_version, train_scope = \"lightgbm-classification-model\", \"*\", \"training\"\n", + "training_instance_type = \"ml.m5.4xlarge\"\n", + "\n", + "# Retrieve the docker image\n", + "train_image_uri = image_uris.retrieve(\n", + " region=None,\n", + " framework=None,\n", + " model_id=train_model_id,\n", + " model_version=train_model_version,\n", + " image_scope=train_scope,\n", + " instance_type=training_instance_type,\n", + ")\n", + "# Retrieve the training script\n", + "train_source_uri = script_uris.retrieve(\n", + " model_id=train_model_id, model_version=train_model_version, script_scope=train_scope\n", + ")\n", + "# Retrieve the pre-trained model tarball to further fine-tune\n", + "train_model_uri = model_uris.retrieve(\n", + " model_id=train_model_id, model_version=train_model_version, model_scope=train_scope\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "99e958e5", + "metadata": {}, + "source": [ + "### 3.2. Set Training Parameters\n", + "\n", + "---\n", + "\n", + "Now that we are done with all the setup that is needed, we are ready to train our tabular algorithm. To begin, let us create a [``sagemaker.estimator.Estimator``](https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html) object. This estimator will launch the training job. 
\n", + "\n", + "There are two kinds of parameters that need to be set for training. The first kind is the set of parameters for the training job. These include: (i) Training data path: the S3 folder in which the input data is stored. (ii) Output path: the S3 folder in which the training output is stored. (iii) Training instance type: the type of machine on which to run the training.\n", + "\n", + "The second kind is the set of algorithm-specific training hyperparameters. \n", + "\n", + "---" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de5c2074", + "metadata": {}, + "outputs": [], + "source": [ + "training_dataset_s3_path = f\"s3://{bucket}/{prefix}/train\"\n", + "validation_dataset_s3_path = f\"s3://{bucket}/{prefix}/validation\"\n", + "\n", + "output_prefix = \"jumpstart-example-tabular-training\"\n", + "s3_output_location = f\"s3://{bucket}/{output_prefix}/output_lgb\"" + ] + }, + { + "cell_type": "markdown", + "id": "74273473", + "metadata": {}, + "source": [ + "---\n", + "For the algorithm-specific hyperparameters, we start by fetching a Python dictionary of the training hyperparameters that the algorithm accepts, with their default values. These can then be overridden with custom values. For the evaluation metric that is used by early stopping and automatic model tuning, we choose the `auc` score. Note: LightGBM does not have a built-in F1 score metric. 
See [LightGBM documentation](https://lightgbm.readthedocs.io/en/latest/Parameters.html#metric-parameters).\n", + "\n", + "---" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1c5b649", + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker import hyperparameters\n", + "\n", + "# Retrieve the default hyper-parameters for fine-tuning the model\n", + "hyperparameters = hyperparameters.retrieve_default(\n", + " model_id=train_model_id, model_version=train_model_version\n", + ")\n", + "\n", + "# [Optional] Override default hyperparameters with custom values\n", + "hyperparameters[\"num_boost_round\"] = \"200\"\n", + "\n", + "\n", + "hyperparameters[\"metric\"] = \"auc\"\n", + "hyperparameters[\"tree_learner\"] = \"voting\" # use AllReduce method for distributed training\n", + "\n", + "del hyperparameters[\n", + " \"early_stopping_rounds\"\n", + "] # current distributed training with early stopping has some issues. See https://github.com/microsoft/SynapseML/issues/728#issuecomment-1221599961\n", + "# thus it is disabled for distributed training.\n", + "print(hyperparameters)" + ] + }, + { + "cell_type": "markdown", + "id": "2e9b3d87", + "metadata": {}, + "source": [ + "### 3.3. Train with Automatic Model Tuning \n", + "\n", + "\n", + "Amazon SageMaker automatic model tuning, also known as hyperparameter tuning, finds the best version of a model by running many training jobs on your dataset using the algorithm and ranges of hyperparameters that you specify. It then chooses the hyperparameter values that result in a model that performs the best, as measured by a metric that you choose. We will use a HyperparameterTuner object to interact with Amazon SageMaker hyperparameter tuning APIs.\n", + "\n", + "* Note: In this notebook, we set the AMT budget (total tuning jobs, `max_jobs`) to 20 for each tabular algorithm except AutoGluon-Tabular. 
[AutoGluon-Tabular](https://arxiv.org/abs/2003.06505) succeeds by ensembling multiple models and stacking them in multiple layers. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7c8d7d6", + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.tuner import ContinuousParameter, IntegerParameter, HyperparameterTuner\n", + "\n", + "use_amt = True\n", + "\n", + "hyperparameter_ranges_lgb = {\n", + " \"learning_rate\": ContinuousParameter(1e-4, 1, scaling_type=\"Logarithmic\"),\n", + " \"num_boost_round\": IntegerParameter(2, 30),\n", + " \"num_leaves\": IntegerParameter(10, 50),\n", + " \"feature_fraction\": ContinuousParameter(0.1, 1),\n", + " \"bagging_fraction\": ContinuousParameter(0.1, 1),\n", + " \"bagging_freq\": IntegerParameter(1, 10),\n", + " \"max_depth\": IntegerParameter(5, 30),\n", + " \"min_data_in_leaf\": IntegerParameter(5, 50),\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "8f60b812", + "metadata": {}, + "source": [ + "### 3.4. Start Training" + ] + }, + { + "cell_type": "markdown", + "id": "8d427761", + "metadata": {}, + "source": [ + "---\n", + "We start by creating the estimator object with all the required assets and then launch the training job. \n", + "\n", + "* To enable distributed training, you only need to set `instance_count` to a value greater than 1.\n", + "* You might need to increase the `volume_size` argument if your dataset is larger than the default value (30 GB). 
Otherwise, you may see insufficient disk memory error.\n", + "\n", + "---" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "46cfd39e", + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.estimator import Estimator\n", + "import random\n", + "\n", + "training_job_name = \"jumpstart-example-distri-lgb-g\" + str(random.randint(0, 100))\n", + "\n", + "# Create SageMaker Estimator instance\n", + "tabular_estimator = Estimator(\n", + " role=aws_role,\n", + " image_uri=train_image_uri,\n", + " source_dir=train_source_uri,\n", + " model_uri=train_model_uri,\n", + " entry_point=\"transfer_learning.py\",\n", + " instance_count=4, ### select the instance count you would like to use for distributed training\n", + " volume_size=30, ### volume_size (int or PipelineVariable): Size in GB of the storage volume to use for storing input and output data during training (default: 30).\n", + " instance_type=training_instance_type,\n", + " max_run=360000,\n", + " hyperparameters=hyperparameters,\n", + " output_path=s3_output_location,\n", + ")\n", + "\n", + "if use_amt:\n", + "\n", + " tuner = HyperparameterTuner(\n", + " tabular_estimator,\n", + " \"auc\",\n", + " hyperparameter_ranges_lgb,\n", + " [{\"Name\": \"auc\", \"Regex\": \"auc: ([0-9\\\\.]+)\"}],\n", + " max_jobs=20,\n", + " max_parallel_jobs=5,\n", + " objective_type=\"Maximize\",\n", + " )\n", + "\n", + " tuner.fit(\n", + " {\n", + " \"train\": training_dataset_s3_path,\n", + " \"validation\": validation_dataset_s3_path,\n", + " },\n", + " logs=True,\n", + " job_name=training_job_name,\n", + " )\n", + "else:\n", + " # Launch a SageMaker Training job by passing s3 path of the training data\n", + " tabular_estimator.fit(\n", + " {\n", + " \"train\": training_dataset_s3_path,\n", + " \"validation\": validation_dataset_s3_path,\n", + " },\n", + " logs=True,\n", + " job_name=training_job_name,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "2199060d", + "metadata": {}, + "source": 
[ + "### 3.5. Deploy and Run Inference on the Trained Tabular Model\n", + "\n", + "---\n", + "\n", + "In this section, you learn how to query an existing endpoint and make predictions of the examples you input. For each example, the model will output the probability of the sample for each class in the model. \n", + "Next, the predicted class label is obtained by taking the class label with the maximum probability over others.\n", + "\n", + "\n", + "We start by retrieving the artifacts and deploy the `tabular_estimator` that we trained.\n", + "\n", + "---" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f480f9a5", + "metadata": {}, + "outputs": [], + "source": [ + "inference_instance_type = \"ml.m5.4xlarge\"\n", + "\n", + "# Retrieve the inference docker container uri\n", + "deploy_image_uri = image_uris.retrieve(\n", + " region=None,\n", + " framework=None,\n", + " image_scope=\"inference\",\n", + " model_id=train_model_id,\n", + " model_version=train_model_version,\n", + " instance_type=inference_instance_type,\n", + ")\n", + "# Retrieve the inference script uri\n", + "deploy_source_uri = script_uris.retrieve(\n", + " model_id=train_model_id, model_version=train_model_version, script_scope=\"inference\"\n", + ")\n", + "\n", + "endpoint_name = \"jumpstart-example-distri-lgb-g\" + str(random.randint(0, 100))\n", + "\n", + "# Use the estimator from the previous step to deploy to a SageMaker endpoint\n", + "predictor = (tuner if use_amt else tabular_estimator).deploy(\n", + " initial_instance_count=1,\n", + " instance_type=inference_instance_type,\n", + " entry_point=\"inference.py\",\n", + " image_uri=deploy_image_uri,\n", + " source_dir=deploy_source_uri,\n", + " endpoint_name=endpoint_name,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "0c7a8372", + "metadata": {}, + "source": [ + "---\n", + "Next, we read the customer churn test data into pandas data frame, prepare the ground truth target and predicting features to send into the 
endpoint. \n", + "\n", + "Below is the screenshot of the first 5 examples in the test set.\n", + "\n", + "---" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22675cea", + "metadata": {}, + "outputs": [], + "source": [ + "newline, bold, unbold = \"\\n\", \"\\033[1m\", \"\\033[0m\"\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.metrics import accuracy_score, f1_score, roc_auc_score\n", + "from sklearn.metrics import confusion_matrix\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# read the data\n", + "test_data_file_name = \"test.csv\"\n", + "test_data = pd.read_csv(test_data_file_name, header=None)\n", + "test_data.columns = [\"Target\"] + [f\"Feature_{i}\" for i in range(1, test_data.shape[1])]\n", + "\n", + "num_examples, num_columns = test_data.shape\n", + "print(\n", + " f\"{bold}The test dataset contains {num_examples} examples and {num_columns} columns.{unbold}\\n\"\n", + ")\n", + "\n", + "# prepare the ground truth target and predicting features to send into the endpoint.\n", + "ground_truth_label, features = test_data.iloc[:, :1], test_data.iloc[:, 1:]\n", + "\n", + "print(f\"{bold}The first 5 observations of the data: {unbold} \\n\")\n", + "test_data.head(5)" + ] + }, + { + "cell_type": "markdown", + "id": "5bbda7e4", + "metadata": {}, + "source": [ + "---\n", + "The following code queries the endpoint you have created to get the prediction for each test example. \n", + "The `query_endpoint()` function returns an array-like of shape (num_examples, num_classes), where each row indicates \n", + "the probability of the example for each class in the model. The num_classes is 2 in above test data. \n", + "Next, the predicted class label is obtained by taking the class label with the maximum probability over others for each example. 
\n", + "\n", + "---" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b7b0bebf", + "metadata": {}, + "outputs": [], + "source": [ + "content_type = \"text/csv\"\n", + "\n", + "\n", + "def query_endpoint(encoded_tabular_data, endpoint_name):\n", + " client = boto3.client(\"runtime.sagemaker\")\n", + " response = client.invoke_endpoint(\n", + " EndpointName=endpoint_name,\n", + " ContentType=content_type,\n", + " Body=encoded_tabular_data,\n", + " )\n", + " return response\n", + "\n", + "\n", + "def parse_response(query_response):\n", + " model_predictions = json.loads(query_response[\"Body\"].read())\n", + " predicted_probabilities = model_predictions[\"probabilities\"]\n", + " return np.array(predicted_probabilities)\n", + "\n", + "\n", + "# split the test data into smaller size of batches to query the endpoint if test data has large size.\n", + "batch_size = 1500\n", + "predict_prob = []\n", + "for i in np.arange(0, num_examples, step=batch_size):\n", + " query_response_batch = query_endpoint(\n", + " features.iloc[i : (i + batch_size), :].to_csv(header=False, index=False).encode(\"utf-8\"),\n", + " endpoint_name,\n", + " )\n", + " predict_prob_batch = parse_response(query_response_batch) # prediction probability per batch\n", + " predict_prob.append(predict_prob_batch)\n", + "\n", + "\n", + "predict_prob = np.concatenate(predict_prob, axis=0)\n", + "predict_label = np.argmax(predict_prob, axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "cd7c94ae", + "metadata": {}, + "source": [ + "## 3.6. 
Evaluate the Prediction Results Returned from the Endpoint\n", + "\n", + "---\n", + "We evaluate the predictions results returned from the endpoint by following two ways.\n", + "\n", + "* Visualize the predictions results by plotting the confusion matrix.\n", + "\n", + "* Measure the prediction results quantitatively.\n", + "\n", + "---" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "94720689", + "metadata": {}, + "outputs": [], + "source": [ + "# Visualize the predictions results by plotting the confusion matrix.\n", + "conf_matrix = confusion_matrix(y_true=ground_truth_label.values, y_pred=predict_label)\n", + "fig, ax = plt.subplots(figsize=(7.5, 7.5))\n", + "ax.matshow(conf_matrix, cmap=plt.cm.Blues, alpha=0.3)\n", + "for i in range(conf_matrix.shape[0]):\n", + " for j in range(conf_matrix.shape[1]):\n", + " ax.text(x=j, y=i, s=conf_matrix[i, j], va=\"center\", ha=\"center\", size=\"xx-large\")\n", + "\n", + "plt.xlabel(\"Predictions\", fontsize=18)\n", + "plt.ylabel(\"Actuals\", fontsize=18)\n", + "plt.title(\"Confusion Matrix\", fontsize=18)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79bb9864", + "metadata": {}, + "outputs": [], + "source": [ + "# Measure the prediction results quantitatively.\n", + "eval_accuracy = accuracy_score(ground_truth_label.values, predict_label)\n", + "eval_f1 = f1_score(ground_truth_label.values, predict_label)\n", + "eval_auc = roc_auc_score(ground_truth_label.values, predict_prob[:, 1])\n", + "\n", + "lgb_results = pd.DataFrame.from_dict(\n", + " {\n", + " \"Accuracy\": eval_accuracy,\n", + " \"F1\": eval_f1,\n", + " \"AUC\": eval_auc,\n", + " },\n", + " orient=\"index\",\n", + " columns=[\"LightGBM with AMT\"],\n", + ")\n", + "\n", + "lgb_results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5bacee7c", + "metadata": {}, + "outputs": [], + "source": [ + "# Delete the SageMaker endpoint and the attached resources\n", + 
"predictor.delete_model()\n", + "predictor.delete_endpoint()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_python3", + "language": "python", + "name": "conda_python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From a19f87fcb64a32001349bed57ca1d3dc995ee212 Mon Sep 17 00:00:00 2001 From: hxgy610 Date: Wed, 18 Jan 2023 10:48:24 -0500 Subject: [PATCH 2/2] distributed training of sagemaker lightgbm using dask --- .../README.md | 1 + ...r-lightgbm-distributed-training-dask.ipynb | 959 ++++++++++++++++++ 2 files changed, 960 insertions(+) create mode 100644 introduction_to_applying_machine_learning/sagemaker_lightgbm_distributed_training_dask/sagemaker-lightgbm-distributed-training-dask.ipynb diff --git a/introduction_to_applying_machine_learning/README.md b/introduction_to_applying_machine_learning/README.md index c32312fa94..6d23dda79e 100644 --- a/introduction_to_applying_machine_learning/README.md +++ b/introduction_to_applying_machine_learning/README.md @@ -4,6 +4,7 @@ These examples provide a gentle introduction to machine learning concepts as they are applied in practical use cases across a variety of sectors. +- [LightGBM_Distributed_Training_Dask](sagemaker_lightgbm_distributed_training_dask) demonstrates the distributed training of Amazon SageMaker's implementation of [LightGBM](https://lightgbm.readthedocs.io/en/latest/) using [Dask](https://www.dask.org/). - [Predicting Customer Churn](xgboost_customer_churn) uses customer interaction and service usage data to find those most likely to churn, and then walks through the cost/benefit trade-offs of providing retention incentives. 
This uses Amazon SageMaker's implementation of [XGBoost](https://github.com/dmlc/xgboost) to create a highly predictive model. - [Predicting Customer Churn](lightgbm_catboost_tabtransformer_autogluon_churn) uses Amazon SageMaker's implementation of [LightGBM](https://lightgbm.readthedocs.io/en/latest/), [CatBoost](https://catboost.ai/), [TabTransformer](https://arxiv.org/abs/2012.06678), and [AutoGluon-Tabular](https://auto.gluon.ai/stable/index.html) with [SageMaker Automatic Model Tuning](https://docs.aws.amazon.com/sagemaker/latest/dg/automatic-model-tuning.html) to create four predictive models on customer churn dataset, and evaluate their performance on the same test data. - [Cancer Prediction](breast_cancer_prediction) predicts Breast Cancer based on features derived from images, using SageMaker's Linear Learner. diff --git a/introduction_to_applying_machine_learning/sagemaker_lightgbm_distributed_training_dask/sagemaker-lightgbm-distributed-training-dask.ipynb b/introduction_to_applying_machine_learning/sagemaker_lightgbm_distributed_training_dask/sagemaker-lightgbm-distributed-training-dask.ipynb new file mode 100644 index 0000000000..9bb1d418ba --- /dev/null +++ b/introduction_to_applying_machine_learning/sagemaker_lightgbm_distributed_training_dask/sagemaker-lightgbm-distributed-training-dask.ipynb @@ -0,0 +1,959 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "270633d3", + "metadata": {}, + "source": [ + "# Amazon SageMaker LightGBM Distributed training using Dask" + ] + }, + { + "cell_type": "markdown", + "id": "bb41dbfb", + "metadata": {}, + "source": [ + "---\n", + "Losing customers is costly for any business. Identifying unhappy customers early on gives you a chance to offer them incentives to stay. This notebook describes using machine learning (ML) for the automated identification of unhappy customers, also known as customer churn prediction. 
ML models rarely give perfect predictions though, so this notebook is also about how to incorporate the relative costs of prediction mistakes when determining the financial outcome of using ML.\n", + "\n", + "This notebook demonstrates the use of distributed training for Amazon SageMaker’s implementation of the [LightGBM](https://lightgbm.readthedocs.io/en/latest/) with Dask.\n", + "\n", + "In this notebook, we demonstrate two use cases:\n", + "\n", + "* How to distributedly train a tabular model using Dask on the customer churn dataset.\n", + "* How to use the trained tabular model to perform inference, i.e., classifying new samples.\n", + "\n", + "\n", + "Note: This notebook was tested in Amazon SageMaker Studio on ml.t3.medium instance with Python 3 (Data Science) kernel.\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "id": "12c6b9ec", + "metadata": {}, + "source": [ + "## 1. Set Up\n", + "\n", + "---\n", + "Before executing the notebook, there are some initial steps required for setup. This notebook requires latest version of sagemaker and ipywidgets.\n", + "\n", + "---" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "da04d4ef", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install sagemaker ipywidgets --upgrade" + ] + }, + { + "cell_type": "markdown", + "id": "cb779656", + "metadata": {}, + "source": [ + "\n", + "---\n", + "To train and host on Amazon SageMaker, we need to setup and authenticate the use of AWS services. Here, we use the execution role associated with the current notebook instance as the AWS account role with SageMaker access. 
It has necessary permissions, including access to your data in S3.\n", + "\n", + "---" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56d15a71", + "metadata": {}, + "outputs": [], + "source": [ + "import sagemaker, boto3, json\n", + "from sagemaker import get_execution_role\n", + "\n", + "aws_role = get_execution_role()\n", + "aws_region = boto3.Session().region_name\n", + "sess = sagemaker.Session()\n", + "\n", + "bucket = sess.default_bucket()\n", + "prefix = \"sagemaker/DEMO-churn-dt\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "adf9b9ed", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import io\n", + "import os\n", + "import sys\n", + "import time\n", + "import json\n", + "from IPython.display import display\n", + "from time import strftime, gmtime\n", + "from sagemaker.inputs import TrainingInput\n", + "from sagemaker.serializers import CSVSerializer\n", + "from sklearn import preprocessing" + ] + }, + { + "cell_type": "markdown", + "id": "40198d95", + "metadata": {}, + "source": [ + "## 2. Data Preparation and Visualization\n", + "\n", + "Mobile operators have historical records on which customers ultimately ended up churning and which continued using the service. We can use this historical information to construct an ML model of one mobile operator’s churn using a process called training. After training the model, we can pass the profile information of an arbitrary customer (the same profile information that we used to train the model) to the model, and have the model predict whether this customer is going to churn. Of course, we expect the model to make mistakes. After all, predicting the future is tricky business! 
But we’ll learn how to deal with prediction errors.\n", + "\n", + "The dataset we use is publicly available and was mentioned in the book [Discovering Knowledge in Data](https://www.amazon.com/dp/0470908742/) by Daniel T. Larose. It is attributed by the author to the University of California Irvine Repository of Machine Learning Datasets. Let’s download and read that dataset in now:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "687a827d", + "metadata": {}, + "outputs": [], + "source": [ + "s3 = boto3.client(\"s3\")\n", + "s3.download_file(f\"sagemaker-sample-files\", \"datasets/tabular/synthetic/churn.txt\", \"churn.txt\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7df8cfb8", + "metadata": {}, + "outputs": [], + "source": [ + "churn = pd.read_csv(\"./churn.txt\")\n", + "pd.set_option(\"display.max_columns\", 500)\n", + "churn.head(5)" + ] + }, + { + "cell_type": "markdown", + "id": "fe5f8234", + "metadata": {}, + "source": [ + "By modern standards, it’s a relatively small dataset, with only 5,000 records, where each record uses 21 attributes to describe the profile of a customer of an unknown US mobile operator. 
The attributes are:\n", + "\n", + "`State`: the US state in which the customer resides, indicated by a two-letter abbreviation; for example, OH or NJ\n", + "\n", + "`Account Length`: the number of days that this account has been active\n", + "\n", + "`Area Code`: the three-digit area code of the corresponding customer’s phone number\n", + "\n", + "`Phone`: the remaining seven-digit phone number\n", + "\n", + "`Int’l Plan`: whether the customer has an international calling plan: yes/no\n", + "\n", + "`VMail Plan`: whether the customer has a voice mail feature: yes/no\n", + "\n", + "`VMail Message`: the average number of voice mail messages per month\n", + "\n", + "`Day Mins`: the total number of calling minutes used during the day\n", + "\n", + "`Day Calls`: the total number of calls placed during the day\n", + "\n", + "`Day Charge`: the billed cost of daytime calls\n", + "\n", + "`Eve Mins`, `Eve Calls`, `Eve Charge`: the billed cost for calls placed during the evening\n", + "\n", + "`Night Mins`, `Night Calls`, `Night Charge`: the billed cost for calls placed during nighttime\n", + "\n", + "`Intl Mins`, `Intl Calls`, `Intl Charge`: the billed cost for international calls\n", + "\n", + "`CustServ Calls`: the number of calls placed to Customer Service\n", + "\n", + "`Churn?`: whether the customer left the service: true/false\n", + "\n", + "The last attribute, `Churn?`, is known as the target attribute: the attribute that we want the ML model to predict. 
Because the target attribute is binary, our model will be performing binary prediction, also known as binary classification.\n", + "\n", + "Let’s begin exploring the data:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a27af5ed", + "metadata": {}, + "outputs": [], + "source": [ + "# Histograms for each numeric features\n", + "display(churn.describe())\n", + "%matplotlib inline\n", + "hist = churn.hist(bins=30, sharey=True, figsize=(10, 10))" + ] + }, + { + "cell_type": "markdown", + "id": "c987d494", + "metadata": {}, + "source": [ + "We can see immediately that: - `State` appears to be quite evenly distributed. - `Phone` takes on too many unique values to be of any practical use. It’s possible that parsing out the prefix could have some value, but without more context on how these are allocated, we should avoid using it. - Most of the numeric features are surprisingly nicely distributed, with many showing bell-like gaussianity. `VMail Message` is a notable exception." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8eb8b330", + "metadata": {}, + "outputs": [], + "source": [ + "churn = churn.drop(\"Phone\", axis=1)\n", + "churn[\"Area Code\"] = churn[\"Area Code\"].astype(object)" + ] + }, + { + "cell_type": "markdown", + "id": "a2ceceb7", + "metadata": {}, + "source": [ + "Next let’s look at the relationship between each of the features and our target variable." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "06492d81", + "metadata": {}, + "outputs": [], + "source": [ + "for column in churn.select_dtypes(include=[\"object\"]).columns:\n", + " if column != \"Churn?\":\n", + " display(pd.crosstab(index=churn[column], columns=churn[\"Churn?\"], normalize=\"columns\"))\n", + "\n", + "for column in churn.select_dtypes(exclude=[\"object\"]).columns:\n", + " print(column)\n", + " hist = churn[[column, \"Churn?\"]].hist(by=\"Churn?\", bins=30)\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "9c35cecc", + "metadata": {}, + "source": [ + "We convert the target attribute to binary value and move it to the first column of the dataset to meet requirements of SageMaker built-in tabular algorithms (For an example, see [SageMaker LightGBM documentation](https://docs.aws.amazon.com/sagemaker/latest/dg/lightgbm.html))." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "565c809d", + "metadata": {}, + "outputs": [], + "source": [ + "churn[\"target\"] = churn[\"Churn?\"].map({\"True.\": 1, \"False.\": 0})\n", + "churn.drop([\"Churn?\"], axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a231510", + "metadata": {}, + "outputs": [], + "source": [ + "churn = churn[[\"target\"] + churn.columns.tolist()[:-1]]" + ] + }, + { + "cell_type": "markdown", + "id": "14a15e08", + "metadata": {}, + "source": [ + "We identify the column indexes of the categorical attribute, which is required by LightGBM, CatBoost, and TabTransformer algorithm (AutoGluon-Tabular has built-in feature engineering to identify the categorical attribute automatically, and thus does not require such input)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd4c11ed", + "metadata": {}, + "outputs": [], + "source": [ + "cat_columns = [\n", + " \"State\",\n", + " \"Account Length\",\n", + " \"Area Code\",\n", + " \"Phone\",\n", + " \"Int'l Plan\",\n", + " \"VMail Plan\",\n", + " \"VMail Message\",\n", + " \"Day Calls\",\n", + " \"Eve Calls\",\n", + " \"Night Calls\",\n", + " \"Intl Calls\",\n", + " \"CustServ Calls\",\n", + "]\n", + "\n", + "cat_idx = []\n", + "for idx, col_name in enumerate(churn.columns.tolist()):\n", + " if col_name in cat_columns:\n", + " cat_idx.append(idx)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "466d84a3", + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"cat_idx.json\", \"w\") as outfile:\n", + " json.dump({\"cat_idx\": cat_idx}, outfile)" + ] + }, + { + "cell_type": "markdown", + "id": "ea910d97", + "metadata": {}, + "source": [ + "[LightGBM official documentation](https://lightgbm.readthedocs.io/en/latest/Advanced-Topics.html#categorical-feature-support) requires that all categorical features should be encoded as non-negative integers. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6bb951cf", + "metadata": {}, + "outputs": [], + "source": [ + "for idx, col_name in enumerate(churn.columns.tolist()):\n", + " if col_name in cat_columns:\n", + " le = preprocessing.LabelEncoder()\n", + " churn[col_name] = le.fit_transform(churn[col_name])" + ] + }, + { + "cell_type": "markdown", + "id": "a5ff6f07", + "metadata": {}, + "source": [ + "We split the churn dataset into train, validation, and test set using stratified sampling. Validation set is used for early stopping and AMT. Test set is used for performance evaluations in the end. Next, we upload them into a S3 path for training.\n", + "\n", + "The structure of the S3 path for training should be structured as below.\n", + "\n", + "* The supported input data format for training is `csv`. 
You are allowed to put more than one data file under both the train and validation channels. The data files can have any name as long as it ends with `.csv`.\n", + "* The first column corresponds to the target and the rest of the columns correspond to features. This follows the convention of [SageMaker XGBoost algorithm](https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html). \n", + "* The `cat_idx.json` file lists the categorical column indexes. It contains a dictionary with a single key-value pair. The key can be any string. The value is the list of column indexes of categorical features. The index starts with value 1 as value 0 corresponds to the target variable. Please see the example above to format the `cat_idx.json`.\n", + "* For the validation data, we encourage you to include one data file under its channel so that all of the validation data points can be assigned to one machine. Thus, the validation score is for all of the validation data points and can be easily parsed by the AMT for hyperparameter optimization.\n", + "* Current distributed training only supports CPU.\n", + "\n", + "-- `train`
\n", + "      -- `data_1.csv`
\n", + "      -- `data_2.csv`
\n", + "      -- `data_3.csv`
\n", + "      -- `cat_idx.json`\n", + " \n", + "-- `validation`
\n", + "      -- `data.csv` \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fede804a", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "train, val_n_test = train_test_split(\n", + " churn, test_size=0.3, random_state=42, stratify=churn[\"target\"]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43d49126", + "metadata": {}, + "outputs": [], + "source": [ + "val, test = train_test_split(\n", + " val_n_test, test_size=0.3, random_state=42, stratify=val_n_test[\"target\"]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3ef7194c", + "metadata": {}, + "outputs": [], + "source": [ + "train.to_csv(\"train.csv\", header=False, index=False)\n", + "val.to_csv(\"validation.csv\", header=False, index=False)\n", + "test.to_csv(\"test.csv\", header=False, index=False)" + ] + }, + { + "cell_type": "markdown", + "id": "f9e4fe64", + "metadata": {}, + "source": [ + "For demonstartion purpose on including multiple files under the training channel, we simply duplicate the training data multiple times as shown below." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22bb7a84", + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm import tqdm\n", + "\n", + "for i in tqdm(range(200)):\n", + " boto3.Session().resource(\"s3\").Bucket(bucket).Object(\n", + " os.path.join(prefix, f\"train/data_{i}.csv\")\n", + " ).upload_file(\"train.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a358d0d", + "metadata": {}, + "outputs": [], + "source": [ + "boto3.Session().resource(\"s3\").Bucket(bucket).Object(\n", + " os.path.join(prefix, \"validation/data.csv\")\n", + ").upload_file(\"validation.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "54c33e88", + "metadata": {}, + "outputs": [], + "source": [ + "boto3.Session().resource(\"s3\").Bucket(bucket).Object(\n", + " os.path.join(prefix, \"test/data.csv\")\n", + ").upload_file(\"test.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8c71dcd", + "metadata": {}, + "outputs": [], + "source": [ + "boto3.Session().resource(\"s3\").Bucket(bucket).Object(\n", + " os.path.join(prefix, \"train/cat_idx.json\")\n", + ").upload_file(\"cat_idx.json\")" + ] + }, + { + "cell_type": "markdown", + "id": "e0418e02", + "metadata": {}, + "source": [ + "## 3. Distributedly Train A SageMaker LightGBM Model with AMT" + ] + }, + { + "cell_type": "markdown", + "id": "20512887", + "metadata": {}, + "source": [ + "### 3.1. Retrieve Training Artifacts\n", + "\n", + "___\n", + "\n", + "Here, we retrieve the training docker container, the training algorithm source, and the tabular algorithm. 
Note that model_version=\"*\" fetches the latest model.\n", + "\n", + "For the training algorithm, we use LightGBM in this demonstration for the classification task.\n", + "* [LightGBM](https://lightgbm.readthedocs.io/en/latest/): To use this algorithm, specify `train_model_id` as `lightgbm-classification-model` in the cell below.\n", + "\n", + "For a regression task, the `train_model_id` is `lightgbm-regression-model`.\n", + "\n", + "\n", + "---" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8565b382", + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker import image_uris, model_uris, script_uris\n", + "\n", + "train_model_id, train_model_version, train_scope = \"lightgbm-classification-model\", \"*\", \"training\"\n", + "training_instance_type = \"ml.m5.4xlarge\"\n", + "\n", + "# Retrieve the docker image\n", + "train_image_uri = image_uris.retrieve(\n", + " region=None,\n", + " framework=None,\n", + " model_id=train_model_id,\n", + " model_version=train_model_version,\n", + " image_scope=train_scope,\n", + " instance_type=training_instance_type,\n", + ")\n", + "# Retrieve the training script\n", + "train_source_uri = script_uris.retrieve(\n", + " model_id=train_model_id, model_version=train_model_version, script_scope=train_scope\n", + ")\n", + "# Retrieve the pre-trained model tarball to further fine-tune\n", + "train_model_uri = model_uris.retrieve(\n", + " model_id=train_model_id, model_version=train_model_version, model_scope=train_scope\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "99e958e5", + "metadata": {}, + "source": [ + "### 3.2. Set Training Parameters\n", + "\n", + "---\n", + "\n", + "Now that we are done with all the setup that is needed, we are ready to train our tabular algorithm. To begin, let us create a [``sagemaker.estimator.Estimator``](https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html) object. This estimator will launch the training job.
\n", + "\n", + "There are two kinds of parameters that need to be set for training. The first kind are the parameters for the training job. These include: (i) Training data path: This is the S3 folder in which the input data is stored. (ii) Output path: This is the S3 folder in which the training output is stored. (iii) Training instance type: This indicates the type of machine on which to run the training.\n", + "\n", + "The second kind are algorithm-specific training hyper-parameters. \n", + "\n", + "---" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de5c2074", + "metadata": {}, + "outputs": [], + "source": [ + "training_dataset_s3_path = f\"s3://{bucket}/{prefix}/train\"\n", + "validation_dataset_s3_path = f\"s3://{bucket}/{prefix}/validation\"\n", + "\n", + "output_prefix = \"jumpstart-example-tabular-training\"\n", + "s3_output_location = f\"s3://{bucket}/{output_prefix}/output_lgb\"" + ] + }, + { + "cell_type": "markdown", + "id": "74273473", + "metadata": {}, + "source": [ + "---\n", + "For algorithm-specific hyper-parameters, we start by fetching a Python dictionary of the training hyper-parameters that the algorithm accepts with their default values. These can then be overridden with custom values. For the evaluation metric that is used by early stopping and automatic model tuning, we choose the `auc` score. Note: LightGBM does not have a built-in F1 score metric.
See [LightGBM documentation](https://lightgbm.readthedocs.io/en/latest/Parameters.html#metric-parameters).\n", + "\n", + "---" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1c5b649", + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker import hyperparameters\n", + "\n", + "# Retrieve the default hyper-parameters for fine-tuning the model\n", + "hyperparameters = hyperparameters.retrieve_default(\n", + " model_id=train_model_id, model_version=train_model_version\n", + ")\n", + "\n", + "# [Optional] Override default hyperparameters with custom values\n", + "hyperparameters[\"num_boost_round\"] = \"200\"\n", + "\n", + "\n", + "hyperparameters[\"metric\"] = \"auc\"\n", + "hyperparameters[\"tree_learner\"] = \"voting\" # use AllReduce method for distributed training\n", + "\n", + "del hyperparameters[\n", + " \"early_stopping_rounds\"\n", + "] # current distributed training with early stopping has some issues. See https://github.com/microsoft/SynapseML/issues/728#issuecomment-1221599961\n", + "# thus it is disabled for distributed training.\n", + "print(hyperparameters)" + ] + }, + { + "cell_type": "markdown", + "id": "2e9b3d87", + "metadata": {}, + "source": [ + "### 3.3. Train with Automatic Model Tuning \n", + "\n", + "\n", + "Amazon SageMaker automatic model tuning, also known as hyperparameter tuning, finds the best version of a model by running many training jobs on your dataset using the algorithm and ranges of hyperparameters that you specify. It then chooses the hyperparameter values that result in a model that performs the best, as measured by a metric that you choose. We will use a HyperparameterTuner object to interact with Amazon SageMaker hyperparameter tuning APIs.\n", + "\n", + "* Note. In this notebook, we set AMT budget (total tuning jobs) as 10 for each of the tabular algorithm except AutoGluon-Tabular. 
For [AutoGluon-Tabular](https://arxiv.org/abs/2003.06505), it succeeds by ensembling multiple models and stacking them in multiple layers. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7c8d7d6", + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.tuner import ContinuousParameter, IntegerParameter, HyperparameterTuner\n", + "\n", + "use_amt = True\n", + "\n", + "hyperparameter_ranges_lgb = {\n", + " \"learning_rate\": ContinuousParameter(1e-4, 1, scaling_type=\"Logarithmic\"),\n", + " \"num_boost_round\": IntegerParameter(2, 30),\n", + " \"num_leaves\": IntegerParameter(10, 50),\n", + " \"feature_fraction\": ContinuousParameter(0.1, 1),\n", + " \"bagging_fraction\": ContinuousParameter(0.1, 1),\n", + " \"bagging_freq\": IntegerParameter(1, 10),\n", + " \"max_depth\": IntegerParameter(5, 30),\n", + " \"min_data_in_leaf\": IntegerParameter(5, 50),\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "8f60b812", + "metadata": {}, + "source": [ + "### 3.4. Start Training" + ] + }, + { + "cell_type": "markdown", + "id": "8d427761", + "metadata": {}, + "source": [ + "---\n", + "We start by creating the estimator object with all the required assets and then launch the training job. \n", + "\n", + "* To enable distributed training, you only need to specify the number of instances to be more than 1.\n", + "* You might need to increase the argument `volume_size` if your dataset size is larger than the default value (30GB).
Otherwise, you may see insufficient disk memory error.\n", + "\n", + "---" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "46cfd39e", + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.estimator import Estimator\n", + "import random\n", + "\n", + "training_job_name = \"jumpstart-example-distri-lgb-g\" + str(random.randint(0, 100))\n", + "\n", + "# Create SageMaker Estimator instance\n", + "tabular_estimator = Estimator(\n", + " role=aws_role,\n", + " image_uri=train_image_uri,\n", + " source_dir=train_source_uri,\n", + " model_uri=train_model_uri,\n", + " entry_point=\"transfer_learning.py\",\n", + " instance_count=4, ### select the instance count you would like to use for distributed training\n", + " volume_size=30, ### volume_size (int or PipelineVariable): Size in GB of the storage volume to use for storing input and output data during training (default: 30).\n", + " instance_type=training_instance_type,\n", + " max_run=360000,\n", + " hyperparameters=hyperparameters,\n", + " output_path=s3_output_location,\n", + ")\n", + "\n", + "if use_amt:\n", + "\n", + " tuner = HyperparameterTuner(\n", + " tabular_estimator,\n", + " \"auc\",\n", + " hyperparameter_ranges_lgb,\n", + " [{\"Name\": \"auc\", \"Regex\": \"auc: ([0-9\\\\.]+)\"}],\n", + " max_jobs=20,\n", + " max_parallel_jobs=5,\n", + " objective_type=\"Maximize\",\n", + " )\n", + "\n", + " tuner.fit(\n", + " {\n", + " \"train\": training_dataset_s3_path,\n", + " \"validation\": validation_dataset_s3_path,\n", + " },\n", + " logs=True,\n", + " job_name=training_job_name,\n", + " )\n", + "else:\n", + " # Launch a SageMaker Training job by passing s3 path of the training data\n", + " tabular_estimator.fit(\n", + " {\n", + " \"train\": training_dataset_s3_path,\n", + " \"validation\": validation_dataset_s3_path,\n", + " },\n", + " logs=True,\n", + " job_name=training_job_name,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "2199060d", + "metadata": {}, + "source": 
[ + "### 3.5. Deploy and Run Inference on the Trained Tabular Model\n", + "\n", + "---\n", + "\n", + "In this section, you learn how to query an existing endpoint and make predictions of the examples you input. For each example, the model will output the probability of the sample for each class in the model. \n", + "Next, the predicted class label is obtained by taking the class label with the maximum probability over others.\n", + "\n", + "\n", + "We start by retrieving the artifacts and deploy the `tabular_estimator` that we trained.\n", + "\n", + "---" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f480f9a5", + "metadata": {}, + "outputs": [], + "source": [ + "inference_instance_type = \"ml.m5.4xlarge\"\n", + "\n", + "# Retrieve the inference docker container uri\n", + "deploy_image_uri = image_uris.retrieve(\n", + " region=None,\n", + " framework=None,\n", + " image_scope=\"inference\",\n", + " model_id=train_model_id,\n", + " model_version=train_model_version,\n", + " instance_type=inference_instance_type,\n", + ")\n", + "# Retrieve the inference script uri\n", + "deploy_source_uri = script_uris.retrieve(\n", + " model_id=train_model_id, model_version=train_model_version, script_scope=\"inference\"\n", + ")\n", + "\n", + "endpoint_name = \"jumpstart-example-distri-lgb-g\" + str(random.randint(0, 100))\n", + "\n", + "# Use the estimator from the previous step to deploy to a SageMaker endpoint\n", + "predictor = (tuner if use_amt else tabular_estimator).deploy(\n", + " initial_instance_count=1,\n", + " instance_type=inference_instance_type,\n", + " entry_point=\"inference.py\",\n", + " image_uri=deploy_image_uri,\n", + " source_dir=deploy_source_uri,\n", + " endpoint_name=endpoint_name,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "0c7a8372", + "metadata": {}, + "source": [ + "---\n", + "Next, we read the customer churn test data into pandas data frame, prepare the ground truth target and predicting features to send into the 
endpoint. \n", + "\n", + "Below is the screenshot of the first 5 examples in the test set.\n", + "\n", + "---" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22675cea", + "metadata": {}, + "outputs": [], + "source": [ + "newline, bold, unbold = \"\\n\", \"\\033[1m\", \"\\033[0m\"\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.metrics import accuracy_score, f1_score, roc_auc_score\n", + "from sklearn.metrics import confusion_matrix\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# read the data\n", + "test_data_file_name = \"test.csv\"\n", + "test_data = pd.read_csv(test_data_file_name, header=None)\n", + "test_data.columns = [\"Target\"] + [f\"Feature_{i}\" for i in range(1, test_data.shape[1])]\n", + "\n", + "num_examples, num_columns = test_data.shape\n", + "print(\n", + " f\"{bold}The test dataset contains {num_examples} examples and {num_columns} columns.{unbold}\\n\"\n", + ")\n", + "\n", + "# prepare the ground truth target and predicting features to send into the endpoint.\n", + "ground_truth_label, features = test_data.iloc[:, :1], test_data.iloc[:, 1:]\n", + "\n", + "print(f\"{bold}The first 5 observations of the data: {unbold} \\n\")\n", + "test_data.head(5)" + ] + }, + { + "cell_type": "markdown", + "id": "5bbda7e4", + "metadata": {}, + "source": [ + "---\n", + "The following code queries the endpoint you have created to get the prediction for each test example. \n", + "The `query_endpoint()` function returns an array-like of shape (num_examples, num_classes), where each row indicates \n", + "the probability of the example for each class in the model. The num_classes is 2 in above test data. \n", + "Next, the predicted class label is obtained by taking the class label with the maximum probability over others for each example. 
\n", + "\n", + "---" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b7b0bebf", + "metadata": {}, + "outputs": [], + "source": [ + "content_type = \"text/csv\"\n", + "\n", + "# Create the runtime client once and reuse it for every batch instead of rebuilding it per request.\n", + "sagemaker_runtime_client = boto3.client(\"runtime.sagemaker\")\n", + "\n", + "\n", + "def query_endpoint(encoded_tabular_data, endpoint_name):\n", + "    \"\"\"Send one encoded batch to `endpoint_name` and return the raw `invoke_endpoint` response.\"\"\"\n", + "    return sagemaker_runtime_client.invoke_endpoint(\n", + "        EndpointName=endpoint_name,\n", + "        ContentType=content_type,\n", + "        Body=encoded_tabular_data,\n", + "    )\n", + "\n", + "\n", + "def parse_response(query_response):\n", + "    \"\"\"Decode the JSON response body and return its `probabilities` entry as a NumPy array.\"\"\"\n", + "    model_predictions = json.loads(query_response[\"Body\"].read())\n", + "    return np.array(model_predictions[\"probabilities\"])\n", + "\n", + "\n", + "# Split the test data into batches so a large test set is sent to the endpoint in smaller requests.\n", + "batch_size = 1500\n", + "predict_prob_batches = []  # one probability array per batch, concatenated below\n", + "for batch_start in range(0, num_examples, batch_size):\n", + "    features_batch = features.iloc[batch_start : batch_start + batch_size, :]\n", + "    query_response_batch = query_endpoint(\n", + "        features_batch.to_csv(header=False, index=False).encode(\"utf-8\"),\n", + "        endpoint_name,\n", + "    )\n", + "    predict_prob_batches.append(parse_response(query_response_batch))\n", + "\n", + "\n", + "predict_prob = np.concatenate(predict_prob_batches, axis=0)\n", + "predict_label = np.argmax(predict_prob, axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "cd7c94ae", + "metadata": {}, + "source": [ + "### 3.6. 
Evaluate the Prediction Results Returned from the Endpoint\n", + "\n", + "---\n", + "We evaluate the prediction results returned from the endpoint in the following two ways.\n", + "\n", + "* Visualize the prediction results by plotting the confusion matrix.\n", + "\n", + "* Measure the prediction results quantitatively.\n", + "\n", + "---" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "94720689", + "metadata": {}, + "outputs": [], + "source": [ + "# Visualize the prediction results by plotting the confusion matrix.\n", + "conf_matrix = confusion_matrix(y_true=ground_truth_label.values, y_pred=predict_label)\n", + "fig, ax = plt.subplots(figsize=(7.5, 7.5))\n", + "ax.matshow(conf_matrix, cmap=plt.cm.Blues, alpha=0.3)\n", + "# Write each count into its cell of the matrix.\n", + "for i in range(conf_matrix.shape[0]):\n", + "    for j in range(conf_matrix.shape[1]):\n", + "        ax.text(x=j, y=i, s=conf_matrix[i, j], va=\"center\", ha=\"center\", size=\"xx-large\")\n", + "\n", + "plt.xlabel(\"Predictions\", fontsize=18)\n", + "plt.ylabel(\"Actuals\", fontsize=18)\n", + "plt.title(\"Confusion Matrix\", fontsize=18)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79bb9864", + "metadata": {}, + "outputs": [], + "source": [ + "# Measure the prediction results quantitatively.\n", + "eval_accuracy = accuracy_score(ground_truth_label.values, predict_label)\n", + "eval_f1 = f1_score(ground_truth_label.values, predict_label)\n", + "eval_auc = roc_auc_score(ground_truth_label.values, predict_prob[:, 1])\n", + "\n", + "# Label the results column to match the model that was deployed (tuner when use_amt is set,\n", + "# otherwise the plain estimator) so the table is not mislabeled when AMT was not used.\n", + "results_column = \"LightGBM with AMT\" if use_amt else \"LightGBM\"\n", + "\n", + "lgb_results = pd.DataFrame.from_dict(\n", + "    {\n", + "        \"Accuracy\": eval_accuracy,\n", + "        \"F1\": eval_f1,\n", + "        \"AUC\": eval_auc,\n", + "    },\n", + "    orient=\"index\",\n", + "    columns=[results_column],\n", + ")\n", + "\n", + "lgb_results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5bacee7c", + "metadata": {}, + "outputs": [], + "source": [ + "# Delete the SageMaker endpoint and the attached resources\n", + 
"predictor.delete_model()\n", + "predictor.delete_endpoint()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_python3", + "language": "python", + "name": "conda_python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}