diff --git a/sagemaker-clarify/clarify-explainability-inference-pipelines/README.md b/sagemaker-clarify/clarify-explainability-inference-pipelines/README.md
new file mode 100644
index 0000000000..677cb46881
--- /dev/null
+++ b/sagemaker-clarify/clarify-explainability-inference-pipelines/README.md
@@ -0,0 +1,25 @@
+## Credit risk prediction and explainability with Amazon SageMaker
+
+This example shows how to use SageMaker Clarify to run explainability jobs on a SageMaker hosted inference pipeline.
+
+Below is the architecture diagram of the solution:
+
+![Architecture diagram](clarify_inf_pipeline_arch.png)
+
+
+The notebook performs the following steps:
+
+1. Prepare raw training and test data
+2. Create a SageMaker Processing job that preprocesses the raw training data and also produces an SKlearn model which is reused for deployment
+3. Train an XGBoost model on the processed data using SageMaker's built-in XGBoost container
+4. Create a SageMaker Inference pipeline containing the SKlearn and XGBoost models in series
+5. Perform inference by supplying raw test data
+6. Set up and run an explainability job powered by SageMaker Clarify
+7. Use the open-source SHAP library to create summary and waterfall plots to better understand feature importance
+8. Run bias analysis jobs
+9. Clean up
+
+
+The attached notebook can be run in Amazon SageMaker Studio.
+
+
diff --git a/sagemaker-clarify/clarify-explainability-inference-pipelines/clarify_inf_pipeline_arch.png b/sagemaker-clarify/clarify-explainability-inference-pipelines/clarify_inf_pipeline_arch.png
new file mode 100644
index 0000000000..7e0e19672e
Binary files /dev/null and b/sagemaker-clarify/clarify-explainability-inference-pipelines/clarify_inf_pipeline_arch.png differ
diff --git a/sagemaker-clarify/clarify-explainability-inference-pipelines/credit_risk_explainability_inference_pipelines.ipynb b/sagemaker-clarify/clarify-explainability-inference-pipelines/credit_risk_explainability_inference_pipelines.ipynb
new file mode 100644
index 0000000000..3bcc87cf56
--- /dev/null
+++ b/sagemaker-clarify/clarify-explainability-inference-pipelines/credit_risk_explainability_inference_pipelines.ipynb
@@ -0,0 +1,1484 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Credit risk prediction and explainability with Amazon SageMaker"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "![Credit risk explainability use case](credit_risk_prediction.png)\n",
    "\n",
    "1. [Overview](#Overview)\n",
    "1. [Prerequisites and Data](#Prerequisites-and-Data)\n",
    "    1. [Initialize SageMaker](#Initialize-SageMaker)\n",
    "    1. [Download data](#Download-data)\n",
    "    1. [Loading the data: German credit (UPDATE) Dataset](#Loading-the-data:-German-credit-Dataset)  \n",
    "    1. [Data inspection](#Data-inspection)  \n",
    "    1. [Data preprocessing and upload to S3](#Preprocess-and-Upload-Training-Data)  \n",
    "1. [Train XGBoost Model](#Train-XGBoost-Model)\n",
    "    1. [Train Model](#Train-Model)\n",
    "1. [Deploy Model](#Deploy-Model)\n",
    "1. [Amazon SageMaker Clarify](#Amazon-SageMaker-Clarify)\n",
    "    1. [Explaining Predictions](#Explaining-Predictions)\n",
    "        1. [Viewing the Explainability Report](#Viewing-the-Explainability-Report)\n",
    "        2. [Explaining individual bad credit prediction example](#Explaining-individual-prediction)\n",
    "    2. [Understanding Bias](#Bias-Detection)\n",
    "        1. [Pre-training bias metrics](#pre-training)\n",
    "        2. [Post-training bias metrics](#post-training)\n",
    "1. [Clean Up](#Clean-Up)\n",
    "\n",
    "## Overview\n",
    "Amazon SageMaker helps data scientists and developers to prepare, build, train, and deploy high-quality machine learning (ML) models quickly by bringing together a broad set of capabilities purpose-built for ML.\n",
    "\n",
    "[Amazon SageMaker Clarify](https://aws.amazon.com/sagemaker/clarify/) helps improve your machine learning models by detecting potential bias and helping explain how these models make predictions. The fairness and explainability functionality provided by SageMaker Clarify takes a step towards enabling AWS customers to build trustworthy and understandable machine learning models. \n",
    "\n",
    "Amazon SageMaker provides pre-made images for supported machine learning and deep learning frameworks such as Scikit-Learn, XGBoost, TensorFlow, PyTorch, MXNet, or Chainer. These are preloaded with the corresponding framework and some additional Python packages, such as Pandas and NumPy, so you can write your own code for model training. See [here](https://docs.aws.amazon.com/sagemaker/latest/dg/algorithms-choose.html#supported-frameworks-benefits) for more information.\n",
    "\n",
    "\n",
    "[Amazon SageMaker Studio](https://aws.amazon.com/sagemaker/studio/) provides a single, web-based visual interface where you can perform all ML development activities, including notebooks, experiment management, automatic model creation, debugging, and model and data drift detection.\n",
    "\n",
    "In this SageMaker Studio notebook, we highlight how you can use SageMaker to train models, host them as an inference pipeline, and provide bias detection and explainability to analyze data and understand prediction outcomes from the model.\n",
    "This sample notebook walks you through: \n",
    "\n",
    "1. Downloading and exploring the credit risk dataset - [South German Credit (UPDATE) Data Set](https://archive.ics.uci.edu/ml/datasets/South+German+Credit+%28UPDATE%29)\n",
    "2. Preprocessing the data with sklearn\n",
    "3. Training a GBM model with XGBoost\n",
    "4. Building an inference pipeline model (sklearn model and XGBoost model in series) to preprocess input data and produce a prediction outcome per instance\n",
    "5. Hosting and scoring the pipeline model (optional)\n",
    "6. Running a single SageMaker Clarify job to provide Kernel SHAP values for the SageMaker model on the training and test datasets\n",
    "\n",
    "![Credit risk explainability model inference](clarify_inf_pipeline_arch.png)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prerequisites and Data\n",
    "### Initialize SageMaker"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from io import StringIO\n",
    "import os\n",
    "import time\n",
    "import sys\n",
    "import IPython\n",
    "from time import gmtime, strftime\n",
    "\n",
    "import boto3\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import urllib\n",
    "\n",
    "import sagemaker\n",
    "from sagemaker.s3 import S3Uploader\n",
    "from sagemaker.s3 import S3Downloader\n",
    "from sagemaker.processing import ProcessingInput, ProcessingOutput\n",
    "from sagemaker.sklearn.processing import SKLearnProcessor\n",
    "from sagemaker.inputs import TrainingInput\n",
    "from sagemaker.xgboost import XGBoost\n",
    "from sagemaker import Session\n",
    "from sagemaker import get_execution_role\n",
    "from sagemaker.xgboost import XGBoostModel\n",
    "from sagemaker.sklearn import SKLearnModel\n",
    "from sagemaker.pipeline import PipelineModel\n",
    "\n",
    "\n",
    "session = Session()\n",
    "bucket = session.default_bucket()\n",
    "prefix = \"sagemaker/sagemaker-clarify-credit-risk-model\"\n",
    "region = session.boto_region_name\n",
    "\n",
    "# Define IAM role\n",
    "role = get_execution_role()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Download data\n",
    "\n",
    "First, __download__ the data and save it in the `data` folder.\n",
    "\n",
    "Dataset credit: Ulrike Grömping, Beuth University of Applied Sciences Berlin. Website with contact information: https://prof.beuth-hochschule.de/groemping/."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "S3Downloader.download(\n",
    "    \"s3://sagemaker-sample-files/datasets/tabular/uci_statlog_german_credit_data/SouthGermanCredit.asc\",\n",
    "    \"data\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "credit_columns = [\n",
    "    \"status\",\n",
    "    \"duration\",\n",
    "    \"credit_history\",\n",
    "    \"purpose\",\n",
    "    \"amount\",\n",
    "    \"savings\",\n",
    "    \"employment_duration\",\n",
    "    \"installment_rate\",\n",
    "    \"personal_status_sex\",\n",
    "    \"other_debtors\",\n",
    "    \"present_residence\",\n",
    "    \"property\",\n",
    "    \"age\",\n",
    "    \"other_installment_plans\",\n",
    "    \"housing\",\n",
    "    \"number_credits\",\n",
    "    \"job\",\n",
    "    \"people_liable\",\n",
    "    \"telephone\",\n",
    "    \"foreign_worker\",\n",
    "    \"credit_risk\",\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "$`laufkont = status`\n",
    " \n",
    "     1 : no checking account \n",
    "     2 : ... < 0 DM \n",
    "     3 : 0<= ... < 200 DM \n",
    "     4 : ... 
>= 200 DM / salary for at least 1 year\n", + "\n", + "$`laufzeit = duration`\n", + " \n", + "\n", + "$`moral = credit_history`\n", + " \n", + " 0 : delay in paying off in the past \n", + " 1 : critical account/other credits elsewhere \n", + " 2 : no credits taken/all credits paid back duly\n", + " 3 : existing credits paid back duly till now \n", + " 4 : all credits at this bank paid back duly \n", + "\n", + "$`verw = purpose`\n", + " \n", + " 0 : others \n", + " 1 : car (new) \n", + " 2 : car (used) \n", + " 3 : furniture/equipment\n", + " 4 : radio/television \n", + " 5 : domestic appliances\n", + " 6 : repairs \n", + " 7 : education \n", + " 8 : vacation \n", + " 9 : retraining \n", + " 10 : business \n", + "\n", + "$`hoehe = amount`\n", + " \n", + "\n", + "$`sparkont = savings`\n", + " \n", + " 1 : unknown/no savings account\n", + " 2 : ... < 100 DM \n", + " 3 : 100 <= ... < 500 DM \n", + " 4 : 500 <= ... < 1000 DM \n", + " 5 : ... >= 1000 DM \n", + "\n", + "$`beszeit = employment_duration`\n", + " \n", + " 1 : unemployed \n", + " 2 : < 1 yr \n", + " 3 : 1 <= ... < 4 yrs\n", + " 4 : 4 <= ... < 7 yrs\n", + " 5 : >= 7 yrs \n", + "\n", + "$`rate = installment_rate`\n", + " \n", + " 1 : >= 35 \n", + " 2 : 25 <= ... < 35\n", + " 3 : 20 <= ... < 25\n", + " 4 : < 20 \n", + "\n", + "$`famges = personal_status_sex`\n", + " \n", + " 1 : male : divorced/separated \n", + " 2 : female : non-single or male : single\n", + " 3 : male : married/widowed \n", + " 4 : female : single \n", + "\n", + "$`buerge = other_debtors`\n", + " \n", + " 1 : none \n", + " 2 : co-applicant\n", + " 3 : guarantor \n", + "\n", + "$`wohnzeit = present_residence`\n", + " \n", + " 1 : < 1 yr \n", + " 2 : 1 <= ... < 4 yrs\n", + " 3 : 4 <= ... < 7 yrs\n", + " 4 : >= 7 yrs \n", + "\n", + "$`verm = property`\n", + " \n", + " 1 : unknown / no property \n", + " 2 : car or other \n", + " 3 : building soc. savings agr./life insurance\n", + " 4 : real estate \n", + "\n", + "$`alter = age`\n", + " \n", + "\n", + "$`weitkred = other_installment_plans`\n", + " \n", + " 1 : bank \n", + " 2 : stores\n", + " 3 : none \n", + "\n", + "$`wohn = housing`\n", + " \n", + " 1 : for free\n", + " 2 : rent \n", + " 3 : own \n", + "\n", + "$`bishkred = number_credits`\n", + " \n", + " 1 : 1 \n", + " 2 : 2-3 \n", + " 3 : 4-5 \n", + " 4 : >= 6\n", + "\n", + "$`beruf = job`\n", + " \n", + " 1 : unemployed/unskilled - non-resident \n", + " 2 : unskilled - resident \n", + " 3 : skilled employee/official \n", + " 4 : manager/self-empl./highly qualif. employee\n", + "\n", + "$`pers = people_liable`\n", + " \n", + " 1 : 3 or more\n", + " 2 : 0 to 2 \n", + "\n", + "$`telef = telephone`\n", + " \n", + " 1 : no \n", + " 2 : yes (under customer name)\n", + "\n", + "$`gastarb = foreign_worker`\n", + " \n", + " 1 : yes\n", + " 2 : no \n", + "\n", + "$`kredit = credit_risk`\n", + " \n", + " 0 : bad \n", + " 1 : good\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "training_data = pd.read_csv(\n", + " \"data/SouthGermanCredit.asc\",\n", + " names=credit_columns,\n", + " header=0,\n", + " sep=r\" \",\n", + " engine=\"python\",\n", + " na_values=\"?\",\n", + ").dropna()\n", + "\n", + "print(training_data.head())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data inspection\n", + "Plotting histograms for the distribution of the different features is a good way to visualize the data. 
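For example, a quick sketch of histograms for a few of the numeric features (the column selection here is just an illustration; it assumes the `training_data` dataframe loaded above):

```python
import matplotlib.pyplot as plt

# Distribution of a few numeric features from the credit dataset
training_data[["duration", "amount", "age"]].hist(bins=20, figsize=(12, 3), layout=(1, 3))
plt.tight_layout()
plt.show()
```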
\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "training_data[\"credit_risk\"].value_counts().sort_values().plot(\n", + " kind=\"bar\", title=\"Counts of Target\", rot=0\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create the raw training and test CSV files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# prepare raw test data\n", + "test_data = training_data.sample(frac=0.1)\n", + "test_data = test_data.drop([\"credit_risk\"], axis=1)\n", + "test_filename = \"test.csv\"\n", + "test_columns = [\n", + " \"status\",\n", + " \"duration\",\n", + " \"credit_history\",\n", + " \"purpose\",\n", + " \"amount\",\n", + " \"savings\",\n", + " \"employment_duration\",\n", + " \"installment_rate\",\n", + " \"personal_status_sex\",\n", + " \"other_debtors\",\n", + " \"present_residence\",\n", + " \"property\",\n", + " \"age\",\n", + " \"other_installment_plans\",\n", + " \"housing\",\n", + " \"number_credits\",\n", + " \"job\",\n", + " \"people_liable\",\n", + " \"telephone\",\n", + " \"foreign_worker\",\n", + "]\n", + "test_data.to_csv(test_filename, index=False, header=True, columns=test_columns, sep=\",\")\n", + "\n", + "# prepare raw training data\n", + "train_filename = \"train.csv\"\n", + "training_data.to_csv(train_filename, index=False, header=True, columns=credit_columns, sep=\",\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Encode and Upload Data\n", + "Here we encode the training and test data. Encoding input data is not necessary for SageMaker Clarify, but is necessary for XGBoost models." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_raw = S3Uploader.upload(test_filename, \"s3://{}/{}/data/test\".format(bucket, prefix))\n", + "print(test_raw)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_raw = S3Uploader.upload(train_filename, \"s3://{}/{}/data/train\".format(bucket, prefix))\n", + "print(train_raw)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using SageMaker Processing jobs for preprocessing\n", + "\n", + "We will use SageMaker Processing jobs to perform the preprocessing on the raw data. SageMaker Processing provides prebuilt container for SKlearn which we will use here. We will output a sklearn model that can be used for preprocessing inference requests. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sklearn_processor = SKLearnProcessor(\n", + " role=role,\n", + " base_job_name=\"sagemaker-clarify-credit-risk-processing-job\",\n", + " instance_type=\"ml.m5.large\",\n", + " instance_count=1,\n", + " framework_version=\"0.20.0\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let us have a look at the preprocessing script prepared to run in the processing job" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pygmentize processing/preprocessor.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_data_path = \"s3://{0}/{1}/data/train/\".format(bucket, prefix)\n", + "train_data_path = \"s3://{0}/{1}/data/preprocessed/train/\".format(bucket, prefix)\n", + "val_data_path = \"s3://{0}/{1}/data/preprocessed/val/\".format(bucket, prefix)\n", + "model_path = \"s3://{0}/{1}/sklearn/\".format(bucket, prefix)\n", + "\n", + "\n", + "sklearn_processor.run(\n", + " code=\"processing/preprocessor.py\",\n", + " inputs=[\n", + " ProcessingInput(\n", + " input_name=\"raw_data\", source=raw_data_path, destination=\"/opt/ml/processing/input\"\n", + " )\n", + " ],\n", + " outputs=[\n", + " ProcessingOutput(\n", + " output_name=\"train_data\", source=\"/opt/ml/processing/train\", destination=train_data_path\n", + " ),\n", + " ProcessingOutput(\n", + " output_name=\"val_data\", source=\"/opt/ml/processing/val\", destination=val_data_path\n", + " ),\n", + " ProcessingOutput(\n", + " output_name=\"model\", source=\"/opt/ml/processing/model\", destination=model_path\n", + " ),\n", + " ],\n", + " arguments=[\"--train-test-split-ratio\", \"0.2\"],\n", + " logs=False,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Train XGBoost Model\n", + "In this step, we will train an XGBoost model on the preprocessed data. We will use our own training script with the built-in XGBoost container provided by SageMaker.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pygmentize training/train_xgboost.py" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Set up XGBoost Estimator\n", + "\n", + "Next, let us set up: \n", + " 1. Pre-defined values for Hyperparameters for XGBoost algorithm\n", + " 1. 
    "  1. An XGBoost Estimator for SageMaker\n",
    "\n",
    "  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "hyperparameters = {\n",
    "    \"max_depth\": \"5\",\n",
    "    \"eta\": \"0.1\",\n",
    "    \"gamma\": \"4\",\n",
    "    \"min_child_weight\": \"6\",\n",
    "    \"silent\": \"1\",\n",
    "    \"objective\": \"binary:logistic\",\n",
    "    \"num_round\": \"100\",\n",
    "    \"subsample\": \"0.8\",\n",
    "    \"eval_metric\": \"auc\",\n",
    "    \"early_stopping_rounds\": \"20\",\n",
    "}\n",
    "\n",
    "entry_point = \"train_xgboost.py\"\n",
    "source_dir = \"training/\"\n",
    "output_path = \"s3://{0}/{1}/{2}\".format(bucket, prefix, \"xgb_model\")\n",
    "code_location = \"s3://{0}/{1}/code\".format(bucket, prefix)\n",
    "\n",
    "estimator = XGBoost(\n",
    "    entry_point=entry_point,\n",
    "    source_dir=source_dir,\n",
    "    output_path=output_path,\n",
    "    code_location=code_location,\n",
    "    hyperparameters=hyperparameters,\n",
    "    instance_type=\"ml.c5.xlarge\",\n",
    "    instance_count=1,\n",
    "    framework_version=\"0.90-2\",\n",
    "    py_version=\"py3\",\n",
    "    role=role,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Training\n",
    "\n",
    "Now it's time to start the training."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "job_name = f\"credit-risk-xgb-{strftime('%Y-%m-%d-%H-%M-%S', gmtime())}\"\n",
    "\n",
    "train_input = TrainingInput(\n",
    "    \"s3://{0}/{1}/data/preprocessed/train/\".format(bucket, prefix), content_type=\"csv\"\n",
    ")\n",
    "val_input = TrainingInput(\n",
    "    \"s3://{0}/{1}/data/preprocessed/val/\".format(bucket, prefix), content_type=\"csv\"\n",
    ")\n",
    "\n",
    "inputs = {\"train\": train_input, \"validation\": val_input}\n",
    "\n",
    "estimator.fit(inputs, job_name=job_name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create an Inference Pipeline\n",
    "\n",
    "We will be deploying a SageMaker inference pipeline which will:\n",
    "  1. Accept raw data as input\n",
    "  1. Preprocess the data with the SKlearn model we built earlier\n",
    "  1. Pass the output of the SKlearn model as an input to the XGBoost model automatically\n",
    "  1. Deliver the final inference result from the XGBoost model\n",
    "  \n",
    "\n",
    "To know more, check out the documentation on inference pipelines: https://docs.aws.amazon.com/sagemaker/latest/dg/inference-pipelines.html"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "### Retrieve model artifacts\n",
    "\n",
    "First, we need to create two Amazon SageMaker Model objects, which associate the artifacts of training (serialized model artifacts in Amazon S3) with the Docker containers used for inference. In order to do that, we need to get the paths to our serialized models in Amazon S3. We define the model data locations of the SKlearn and XGBoost models here."
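As an alternative to constructing the XGBoost artifact path by hand, the path can also be read back from the fitted estimator (a small sketch; `estimator` is the XGBoost estimator trained above):

```python
# The fitted estimator exposes the S3 URI of its serialized model artifact.
print(estimator.model_data)
```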
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "preprocessor_model_data = \"s3://{}/{}/{}\".format(bucket, prefix, \"sklearn\") + \"/model.tar.gz\"\n",
    "\n",
    "xgboost_model_data = (\n",
    "    \"s3://{}/{}/{}/{}\".format(bucket, prefix, \"xgb_model\", job_name) + \"/output/model.tar.gz\"\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create SKlearn Model Object\n",
    "\n",
    "The next step is to create an `SKLearnModel` object, which will contain the following important information:\n",
    "  1. the location of the sklearn model data\n",
    "  1. our custom inference code\n",
    "  1. the SKlearn version to use (ensure this is the same as the one used during preprocessing)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For hosting this model, we provide a custom inference script that is used to process the inputs and outputs and execute the transform.\n",
    "\n",
    "The inference script is implemented in the `inference/sklearn/inference.py` file. The custom script defines:\n",
    "\n",
    "- a custom `input_fn` for pre-processing inference requests. Our input function accepts only CSV input, loads the input in a Pandas dataframe and assigns feature column names to the dataframe\n",
    "- a custom `predict_fn` for running the transform over the inputs\n",
    "- a custom `model_fn` for deserializing the model\n",
    "\n",
    "We will be using the default implementation of `output_fn` provided by the SageMaker SKlearn container. To know more, check out: https://github.com/aws/sagemaker-scikit-learn-container\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pygmentize inference/sklearn/inference.py"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "\n",
    "Now, let us define the SKLearnModel object"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sklearn_inference_code_location = \"s3://{}/{}/{}/code\".format(bucket, prefix, \"sklearn\")\n",
    "\n",
    "sklearn_model = SKLearnModel(\n",
    "    name=\"sklearn-model-{0}\".format(str(int(time.time()))),\n",
    "    model_data=preprocessor_model_data,\n",
    "    entry_point=\"inference.py\",\n",
    "    source_dir=\"inference/sklearn/\",\n",
    "    code_location=sklearn_inference_code_location,\n",
    "    role=role,\n",
    "    sagemaker_session=session,\n",
    "    framework_version=\"0.20.0\",\n",
    "    py_version=\"py3\",\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### XGBoost Model\n",
    "\n",
    "Similarly to the previous steps, we can create an XGBoost model object. Here too, we have to provide a custom inference script.\n",
    "\n",
    "The inference script is implemented in the `inference/xgboost/inference.py` file. The custom script defines:\n",
    "\n",
    "- a custom `input_fn` for pre-processing inference requests. This input function is able to handle JSON requests, plus all content types supported by the default XGBoost container. For additional information please visit: https://github.com/aws/sagemaker-xgboost-container/blob/master/src/sagemaker_xgboost_container/encoder.py. The reason for adding the JSON content type is that the container-to-container default request content type in an inference pipeline is JSON.\n",
    "\n",
    "- a custom `model_fn` for deserializing the model\n",
    "\n",
    "Let us have a look at the inference script.\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pygmentize inference/xgboost/inference.py"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now, let us define the XGBoost model object\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "xgboost_inference_code_location = \"s3://{}/{}/{}/code\".format(bucket, prefix, \"xgb_model\")\n",
    "\n",
    "xgboost_model = XGBoostModel(\n",
    "    name=\"xgb-model-{0}\".format(str(int(time.time()))),\n",
    "    model_data=xgboost_model_data,\n",
    "    entry_point=\"inference.py\",\n",
    "    source_dir=\"inference/xgboost/\",\n",
    "    code_location=xgboost_inference_code_location,\n",
    "    framework_version=\"0.90-2\",\n",
    "    py_version=\"py3\",\n",
    "    role=role,\n",
    "    sagemaker_session=session,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "## Pipeline Model\n",
    "\n",
    "Once we have the models ready, we can deploy them in a pipeline by building a PipelineModel object and calling the deploy() method."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pipeline_model_name = \"credit-risk-inference-pipeline-{0}\".format(str(int(time.time())))\n",
    "\n",
    "pipeline_model = PipelineModel(\n",
    "    name=pipeline_model_name,\n",
    "    role=role,\n",
    "    models=[sklearn_model, xgboost_model],\n",
    "    sagemaker_session=session,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Take note of the `model name`, as it will be required while setting up the explainability job."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pipeline_model.name"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Deploy Model (optional - Not needed for Clarify)\n",
    "\n",
    "Let's deploy the model and test the inference pipeline."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "endpoint_name = \"credit-risk-pipeline-endpoint-{0}\".format(str(int(time.time())))\n",
    "print(endpoint_name)\n",
    "\n",
    "pipeline_model.deploy(\n",
    "    initial_instance_count=1, instance_type=\"ml.m5.xlarge\", endpoint_name=endpoint_name\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Inference (optional - Not needed for Clarify)\n",
    "\n",
    "Now that the model has been deployed, let us optionally test it against the raw test data we created earlier in this notebook."
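The next cells use the high-level `Predictor` class. Equivalently, the endpoint can be invoked with the low-level SageMaker runtime client (a sketch; assumes the endpoint deployed above is in service):

```python
import boto3

runtime = boto3.client("sagemaker-runtime")
response = runtime.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType="text/csv",
    Body=S3Downloader.read_file(test_raw),
)
print(response["Body"].read().decode("utf-8")[:200])  # first few prediction scores
```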
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_dataset = S3Downloader.read_file(test_raw)\n",
    "\n",
    "predictor = sagemaker.predictor.Predictor(\n",
    "    endpoint_name,\n",
    "    session,\n",
    "    serializer=sagemaker.serializers.CSVSerializer(),\n",
    "    deserializer=sagemaker.deserializers.CSVDeserializer(),\n",
    ")\n",
    "\n",
    "predictions = predictor.predict(test_dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "predictions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Amazon SageMaker Clarify\n",
    "Now that you have your model set up, let's say hello to SageMaker Clarify!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sagemaker import clarify\n",
    "\n",
    "clarify_processor = clarify.SageMakerClarifyProcessor(\n",
    "    role=role, instance_count=1, instance_type=\"ml.c4.xlarge\", sagemaker_session=session\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Explaining Predictions\n",
    "There are expanding business needs and legislative regulations that require explanations of _why_ a model made the decision it did. SageMaker Clarify uses the [SHAP library](https://github.com/slundberg/shap) to explain the contribution that each input feature makes to the final decision. SageMaker Clarify uses a scalable and efficient implementation of [Kernel SHAP](https://github.com/slundberg/shap#model-agnostic-example-with-kernelexplainer-explains-any-function), with an option to use Spark-based parallelization across multiple processing instances. "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create a baseline for SHAP"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a contrastive explainability technique, SHAP values are calculated by evaluating the model on synthetic data generated against a baseline sample. The explanations of the same case can differ depending on the choice of this baseline sample. \n",
    "\n",
    "We are interested in explaining bad credit predictions. Hence, we would like the baseline choice to have E(x) closer to 1 (belonging to the good credit class). \n",
    "\n",
    "We use the [mode](https://en.wikipedia.org/wiki/Mode_(statistics)) statistic to create the baseline. The mode is a good choice for categorical variables. We observe that the model prediction for the baseline has a high probability for the good credit class, and hence it satisfies our requirement for the baseline. 
\n", + "\n", + "For more information on selecting informative vs non-informative baselines, see [SHAP Baselines for Explainability ](https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-feature-attribute-shap-baselines.html)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# load the raw training data in a data frame\n", + "raw_train_df = pd.read_csv(\"train.csv\", header=0, names=None, sep=\",\")\n", + "\n", + "# drop the target column\n", + "baseline = raw_train_df.drop([\"credit_risk\"], axis=1).mode().iloc[0].values.astype(\"int\").tolist()\n", + "\n", + "print(baseline)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# check baseline prediction E[(x)]\n", + "pred_baseline = predictor.predict(baseline)\n", + "print(pred_baseline)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Setup configurations for Clarify" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, setup some more configurations to start the explainability analysis by Clarify. We need to set up the following:\n", + " 1. **SHAPConfig**: to create the baseline. In this example, the mean_abs is the mean of absolute SHAP values for all instances, specified as the baseline\n", + " \n", + " \n", + " 1. **DataConfig**: to provide some basic information about data I/O to SageMaker Clarify. We specify where to find the input dataset, where to store the output, the header names, and the dataset type.\n", + " 1. **ModelConfig**: to specify information about the trained model here we re-use the model name created earlier\n", + " \n", + "To know more about what these configurations mean for Clarify, check out the documentation here: https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-configure-processing-jobs.html\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "shap_config = clarify.SHAPConfig(\n", + " baseline=[baseline],\n", + " num_samples=2000, # num_samples are permutations from your features, so should be large enough as compared to number of input features, for example, 2k + 2* num_features\n", + " agg_method=\"mean_abs\",\n", + " use_logit=True,\n", + ") # we want the shap values to have log-odds units so that the equation 'shap values + expected probability = predicted probability' for each instance record )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "explainability_output_path = \"s3://{}/{}/clarify-explainability\".format(bucket, prefix)\n", + "\n", + "explainability_data_config = clarify.DataConfig(\n", + " s3_data_input_path=test_raw,\n", + " s3_output_path=explainability_output_path,\n", + " # label='credit_risk', # target column is not present in the test dataset\n", + " headers=test_columns,\n", + " dataset_type=\"text/csv\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_config = clarify.ModelConfig(\n", + " model_name=pipeline_model.name, # specify the inference pipeline model name\n", + " instance_type=\"ml.c5.xlarge\",\n", + " instance_count=1,\n", + " accept_type=\"text/csv\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Run SageMaker Clarify Explainability job\n", + "\n", + "All the configurations are in place. 
    "All the configurations are in place. Let's start the explainability job. This will spin up an ephemeral SageMaker endpoint, perform inference, and calculate explanations on that endpoint. It does not use any existing production endpoint deployments."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "clarify_processor.run_explainability(\n",
    "    data_config=explainability_data_config,\n",
    "    model_config=model_config,\n",
    "    explainability_config=shap_config,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Viewing the Explainability Report\n",
    "\n",
    "Once the job is complete, you can view the explainability report in Studio under the 'Experiments and trials' tab.\n",
    "\n",
    "Look for a trial component whose name starts with 'clarify-explainability-' and see its Explainability tab. \n",
    "\n",
    "If you're not a Studio user yet, you can access this report at the following S3 bucket.\n",
    "\n",
    "The report contains global explanations of the model for the input dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "explainability_output_path"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Analyze the results of Clarify \n",
    "\n",
    "In this section, we will analyze and understand the local explainability results for each individual prediction produced by Clarify. Clarify produces a CSV file which contains the SHAP value for each feature per prediction. Let us download the CSV."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sagemaker.s3 import S3Downloader\n",
    "import json\n",
    "import io\n",
    "\n",
    "# read the shap values\n",
    "S3Downloader.download(s3_uri=explainability_output_path + \"/explanations_shap\", local_path=\"output\")\n",
    "shap_values_df = pd.read_csv(\"output/out.csv\")\n",
    "print(shap_values_df.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create a dataframe containing the model predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pandas import DataFrame\n",
    "\n",
    "predictions_df = DataFrame(predictions, columns=[\"probability_score\"])\n",
    "\n",
    "predictions_df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note that by default SHAP explains classifier models in terms of their margin output, before the logistic link function. That means the units of the SHAP output are log-odds units, so negative values imply probabilities of less than 0.5, meaning the bad credit class (class 0). 
\n", + "\n", + "E(y) is the log-odd (logit) unit for the prediction on the input baseline\n", + "\n", + "y is the log-odd (logit) unit for the prediction output\n", + "\n", + "SHAP values are in log-odd units as well \n", + "\n", + "The following is expected to hold true for every individual prediction : \n", + "\n", + "sum(SHAP values) + E(y)) == model_prediction_logit\n", + "\n", + "logistic(model_prediction_logit) = model_prediction_probability\n", + "\n", + "E(y) < 0 implies baseline probability less than 0.5 (bad credit baseline)\n", + "\n", + "E(y) > 0 implies baseline probability greater than 0.5 (good credit baseline)\n", + "\n", + "y < 0 implies predicted probability less than 0.5 (bad credit)\n", + "\n", + "y > 0 implies predicted probability greater than 0.5 (good credit) \n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can retrieve E(y) , the log-odd unit of the prediction for the baseline input" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get the base expected value to be used to plot SHAP values\n", + "S3Downloader.download(s3_uri=explainability_output_path + \"/analysis.json\", local_path=\"output\")\n", + "\n", + "with open(\"output/analysis.json\") as json_file:\n", + " data = json.load(json_file)\n", + " base_value = data[\"explanations\"][\"kernel_shap\"][\"label0\"][\"expected_value\"]\n", + "\n", + "print(\"base value: \", base_value)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "E(y) > 0 implies baseline probability greater than 0.5 (good credit baseline)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Join the predictions, SHAP value and test data\n", + "\n", + "Now, we create a single dataframe containing all test data rows, with their corresponding SHAP values and prediction score." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# join the probability score and shap values together in a single data frame\n", + "predictions_df.reset_index(drop=True, inplace=True)\n", + "shap_values_df.reset_index(drop=True, inplace=True)\n", + "test_data.reset_index(drop=True, inplace=True)\n", + "\n", + "prediction_shap_df = pd.concat([predictions_df, shap_values_df, test_data], axis=1)\n", + "prediction_shap_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There is a need to downcast the probability score as large precision values are not very useful in analysis." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "prediction_shap_df[\"probability_score\"] = pd.to_numeric(\n", + " prediction_shap_df[\"probability_score\"], downcast=\"float\"\n", + ")\n", + "\n", + "prediction_shap_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Convert the probability score to binary prediction\n", + "\n", + "Now, convert the probability scores to a binary value(1/0), based on a threshold(0.5), where probability scores greater than 0.5 are positive outcomes and lesser are negative outcomes." 
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create a new column 'Prediction' by converting the probability score to either 1 or 0\n",
    "prediction_shap_df.insert(\n",
    "    0, \"Prediction\", (prediction_shap_df[\"probability_score\"] > 0.5).astype(int)\n",
    ")\n",
    "\n",
    "prediction_shap_df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Filter out bad predictions\n",
    "\n",
    "Since we are interested in explaining only negative outcomes (bad credit predictions) in this exercise, we filter the records to keep only those with a prediction of 0."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bad_credit_outcomes_df = prediction_shap_df[prediction_shap_df.iloc[:, 0] == 0]\n",
    "bad_credit_outcomes_df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create SHAP plots \n",
    "\n",
    "Now we create some SHAP plots to understand how much the different features contributed to the negative outcomes."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Install the open-source SHAP library for more visualizations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!conda install -c conda-forge shap -y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import shap"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### SHAP summary plot for all individual bad credit prediction instances in the dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "shap.summary_plot(\n",
    "    bad_credit_outcomes_df.iloc[:, 2:22].to_numpy(),\n",
    "    bad_credit_outcomes_df.iloc[:, 22:42].to_numpy(),\n",
    "    feature_names=test_data.columns,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### SHAP explanation plot for a single bad credit ensemble prediction instance "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "min_index = prediction_shap_df[\"probability_score\"].idxmin()\n",
    "print(min_index)\n",
    "print(\"mean probability of dataset\")\n",
    "print(prediction_shap_df[[\"probability_score\"]].mean())\n",
    "print(\"individual probability\")\n",
    "print(prediction_shap_df.iloc[min_index, 1])\n",
    "print(\"sum of shap values\")\n",
    "print(prediction_shap_df.iloc[min_index, 2:22].sum())\n",
    "print(\"base value from analysis.json\")\n",
    "print(base_value)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Example 'bad credit' prediction SHAP values.\n",
    "\n",
    "In the chart below, f(x) is the prediction of this particular individual instance in log-odds units. If it is negative, it is a bad credit prediction. \n",
    "\n",
    "In the chart below, E(f(x)) is the prediction for the baseline input in log-odds units. It is positive, which means it belongs to the good credit class. \n",
    "\n",
    "The individual example is contrasted against the good credit baseline, so the features with negative SHAP values drive the final negative decision away from the initial positive baseline value.\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### In the example below, the input features (status = 1), (purpose = 0) and (personal_status_sex = 2) are the top 3 features driving the negative decision. "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can refer to the data description to understand the mapping of these values to logical categories. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "explanation_obj = shap._explanation.Explanation(\n",
    "    values=prediction_shap_df.iloc[min_index, 2:22].to_numpy(),\n",
    "    base_values=base_value,\n",
    "    data=test_data.iloc[min_index].to_numpy(),\n",
    "    feature_names=test_data.columns,\n",
    ")\n",
    "shap.plots.waterfall(shap_values=explanation_obj, max_display=20, show=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Feel free to change `min_index` in the plot above to explain the predictions of other individual instances."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Extra Exercise - Calculate Bias metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bias_report_output_path = \"s3://{}/{}/clarify-bias\".format(bucket, prefix)\n",
    "bias_data_config = clarify.DataConfig(\n",
    "    s3_data_input_path=train_raw,\n",
    "    s3_output_path=bias_report_output_path,\n",
    "    label=\"credit_risk\",\n",
    "    headers=training_data.columns.to_list(),\n",
    "    dataset_type=\"text/csv\",\n",
    ")\n",
    "predictions_config = clarify.ModelPredictedLabelConfig(label=None, probability=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bias_config = clarify.BiasConfig(\n",
    "    label_values_or_threshold=[1],\n",
    "    facet_name=\"age\",\n",
    "    facet_values_or_threshold=[40],\n",
    "    group_name=\"personal_status_sex\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "clarify_processor.run_bias(\n",
    "    data_config=bias_data_config,\n",
    "    bias_config=bias_config,\n",
    "    model_config=model_config,\n",
    "    model_predicted_label_config=predictions_config,\n",
    "    pre_training_methods=\"all\",\n",
    "    post_training_methods=\"all\",\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Viewing the Bias detection Report\n",
    "You can view the bias detection report in Studio under the experiments tab. \n",
    "\n",
    "If you're not a Studio user yet, you can access this report at the following S3 bucket."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bias_report_output_path"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Clean Up\n",
    "Finally, don't forget to clean up the resources we set up and used for this demo!"
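The cells below delete the endpoint and the pipeline model. The optional deployment also created an endpoint configuration, which can be removed as well (a sketch; assumes the endpoint configuration shares the endpoint's name, the default when deploying through `deploy()`):

```python
session.delete_endpoint_config(endpoint_name)
```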
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "session.delete_endpoint(endpoint_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "session.delete_model(pipeline_model.name)" + ] + } + ], + "metadata": { + "instance_type": "ml.t3.medium", + "kernelspec": { + "display_name": "Python 3 (Data Science)", + "language": "python", + "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:eu-west-1:470317259841:image/datascience-1.0" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/sagemaker-clarify/clarify-explainability-inference-pipelines/credit_risk_prediction.png b/sagemaker-clarify/clarify-explainability-inference-pipelines/credit_risk_prediction.png new file mode 100644 index 0000000000..aab7849d28 Binary files /dev/null and b/sagemaker-clarify/clarify-explainability-inference-pipelines/credit_risk_prediction.png differ diff --git a/sagemaker-clarify/clarify-explainability-inference-pipelines/data/codetable.txt b/sagemaker-clarify/clarify-explainability-inference-pipelines/data/codetable.txt new file mode 100644 index 0000000000..df39538a98 --- /dev/null +++ b/sagemaker-clarify/clarify-explainability-inference-pipelines/data/codetable.txt @@ -0,0 +1,133 @@ +$`laufkont = status` + + 1 : no checking account + 2 : ... < 0 DM + 3 : 0<= ... < 200 DM + 4 : ... >= 200 DM / salary for at least 1 year + +$`laufzeit = duration` + + +$`moral = credit_history` + + 0 : delay in paying off in the past + 1 : critical account/other credits elsewhere + 2 : no credits taken/all credits paid back duly + 3 : existing credits paid back duly till now + 4 : all credits at this bank paid back duly + +$`verw = purpose` + + 0 : others + 1 : car (new) + 2 : car (used) + 3 : furniture/equipment + 4 : radio/television + 5 : domestic appliances + 6 : repairs + 7 : education + 8 : vacation + 9 : retraining + 10 : business + +$`hoehe = amount` + + +$`sparkont = savings` + + 1 : unknown/no savings account + 2 : ... < 100 DM + 3 : 100 <= ... < 500 DM + 4 : 500 <= ... < 1000 DM + 5 : ... >= 1000 DM + +$`beszeit = employment_duration` + + 1 : unemployed + 2 : < 1 yr + 3 : 1 <= ... < 4 yrs + 4 : 4 <= ... < 7 yrs + 5 : >= 7 yrs + +$`rate = installment_rate` + + 1 : >= 35 + 2 : 25 <= ... < 35 + 3 : 20 <= ... < 25 + 4 : < 20 + +$`famges = personal_status_sex` + + 1 : male : divorced/separated + 2 : female : non-single or male : single + 3 : male : married/widowed + 4 : female : single + +$`buerge = other_debtors` + + 1 : none + 2 : co-applicant + 3 : guarantor + +$`wohnzeit = present_residence` + + 1 : < 1 yr + 2 : 1 <= ... < 4 yrs + 3 : 4 <= ... < 7 yrs + 4 : >= 7 yrs + +$`verm = property` + + 1 : unknown / no property + 2 : car or other + 3 : building soc. savings agr./life insurance + 4 : real estate + +$`alter = age` + + +$`weitkred = other_installment_plans` + + 1 : bank + 2 : stores + 3 : none + +$`wohn = housing` + + 1 : for free + 2 : rent + 3 : own + +$`bishkred = number_credits` + + 1 : 1 + 2 : 2-3 + 3 : 4-5 + 4 : >= 6 + +$`beruf = job` + + 1 : unemployed/unskilled - non-resident + 2 : unskilled - resident + 3 : skilled employee/official + 4 : manager/self-empl./highly qualif. 
employee
+
+$`pers = people_liable`
+
+ 1 : 3 or more
+ 2 : 0 to 2 
+
+$`telef = telephone`
+
+ 1 : no 
+ 2 : yes (under customer name)
+
+$`gastarb = foreign_worker`
+
+ 1 : yes
+ 2 : no 
+
+$`kredit = credit_risk`
+
+ 0 : bad 
+ 1 : good
diff --git a/sagemaker-clarify/clarify-explainability-inference-pipelines/inference/sklearn/inference.py b/sagemaker-clarify/clarify-explainability-inference-pipelines/inference/sklearn/inference.py
new file mode 100644
index 0000000000..640d0e2bde
--- /dev/null
+++ b/sagemaker-clarify/clarify-explainability-inference-pipelines/inference/sklearn/inference.py
@@ -0,0 +1,69 @@
+from __future__ import print_function
+
+
+from io import StringIO
+import os
+
+
+import pandas as pd
+
+
+from sklearn.externals import joblib
+
+
+feature_columns_names = [
+    "status",
+    "duration",
+    "credit_history",
+    "purpose",
+    "amount",
+    "savings",
+    "employment_duration",
+    "installment_rate",
+    "personal_status_sex",
+    "other_debtors",
+    "present_residence",
+    "property",
+    "age",
+    "other_installment_plans",
+    "housing",
+    "number_credits",
+    "job",
+    "people_liable",
+    "telephone",
+    "foreign_worker",
+]
+
+
+def input_fn(input_data, content_type):
+
+    if content_type == "text/csv":
+        df = pd.read_csv(StringIO(input_data), header=None, index_col=False, sep=",")
+
+        first_row = df.iloc[0:1].values[0].tolist()
+
+        if len(df.columns) == len(feature_columns_names):
+            print("column length is correct")
+
+        if set(first_row) == set(feature_columns_names):
+            print("the first row contains the header, removing the row")
+            df = df.iloc[1:]
+            df.reset_index(drop=True, inplace=True)
+
+        df.columns = feature_columns_names
+
+        return df
+    else:
+        raise ValueError("{} not supported by script!".format(content_type))
+
+
+def predict_fn(input_data, model):
+    features = model.transform(input_data)
+    print("successful sklearn inference", features)
+    return features
+
+
+def model_fn(model_dir):
+    preprocessor = joblib.load(os.path.join(model_dir, "model.joblib"))
+    return preprocessor
diff --git a/sagemaker-clarify/clarify-explainability-inference-pipelines/inference/xgboost/inference.py b/sagemaker-clarify/clarify-explainability-inference-pipelines/inference/xgboost/inference.py
new file mode 100644
index 0000000000..9e5af73bf6
--- /dev/null
+++ b/sagemaker-clarify/clarify-explainability-inference-pipelines/inference/xgboost/inference.py
@@ -0,0 +1,26 @@
+import pickle as pkl
+import json
+import numpy as np
+import xgboost as xgb
+
+from sagemaker_containers.beta.framework import content_types
+from sagemaker_xgboost_container import encoder as xgb_encoders
+
+
+def input_fn(input_data, content_type):
+    if content_type == content_types.JSON:
+        print("Received content type is JSON")
+        print("input_data is", input_data)
+        obj = json.loads(input_data)
+        print("obj", obj)
+        array = np.array(obj)
+        return xgb.DMatrix(array)
+    else:
+        print("content type is not JSON")
+        return xgb_encoders.decode(input_data, content_type)
+
+
+def model_fn(model_dir):
+    model_file = model_dir + "/model.bin"
+    model = pkl.load(open(model_file, "rb"))
+    return model
diff --git a/sagemaker-clarify/clarify-explainability-inference-pipelines/processing/preprocessor.py b/sagemaker-clarify/clarify-explainability-inference-pipelines/processing/preprocessor.py
new file mode 100644
index 0000000000..d0b364f95e
--- /dev/null
+++ b/sagemaker-clarify/clarify-explainability-inference-pipelines/processing/preprocessor.py
@@ -0,0 +1,123 @@
+import argparse
+import os
+import warnings
+
+import pandas as pd
+import numpy as np
+import tarfile
+import sklearn
+from sklearn.externals import joblib
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import OneHotEncoder, LabelEncoder
+from sklearn.compose import make_column_transformer
+
+from sklearn.exceptions import DataConversionWarning
+
+warnings.filterwarnings(action="ignore", category=DataConversionWarning)
+
+columns = [
+    "status",
+    "duration",
+    "credit_history",
+    "purpose",
+    "amount",
+    "savings",
+    "employment_duration",
+    "installment_rate",
+    "personal_status_sex",
+    "other_debtors",
+    "present_residence",
+    "property",
+    "age",
+    "other_installment_plans",
+    "housing",
+    "number_credits",
+    "job",
+    "people_liable",
+    "telephone",
+    "foreign_worker",
+    "credit_risk",
+]
+
+if __name__ == "__main__":
+
+    # Read the arguments passed to the script.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--train-test-split-ratio", type=float, default=0.3)
+    args, _ = parser.parse_known_args()
+
+    print("Received arguments {}".format(args))
+
+    # Read input data into a Pandas dataframe.
+    input_data_path = os.path.join("/opt/ml/processing/input", "train.csv")
+    print("Reading input data from {}".format(input_data_path))
+    df = pd.read_csv(input_data_path, names=None, header=0, sep=",")
+
+    # Defining one-hot encoders.
+    print("performing one hot encoding")
+    transformer = make_column_transformer(
+        (
+            [
+                "credit_history",
+                "purpose",
+                "personal_status_sex",
+                "other_debtors",
+                "property",
+                "other_installment_plans",
+                "housing",
+                "job",
+                "telephone",
+                "foreign_worker",
+            ],
+            OneHotEncoder(sparse=False),
+        ),
+        remainder="passthrough",
+    )
+
+    print("preparing the features and labels")
+    X = df.drop("credit_risk", axis=1)
+    y = df["credit_risk"]
+
+    print("building sklearn transformer")
+    featurizer_model = transformer.fit(X)
+    features = featurizer_model.transform(X)
+    labels = LabelEncoder().fit_transform(y)
+
+    # Splitting.
+    split_ratio = args.train_test_split_ratio
+    print("Splitting data into train and validation sets with ratio {}".format(split_ratio))
+    X_train, X_val, y_train, y_val = train_test_split(
+        features, labels, test_size=split_ratio, random_state=0
+    )
+
+    print("Train features shape after preprocessing: {}".format(X_train.shape))
+    print("Validation features shape after preprocessing: {}".format(X_val.shape))
+
+    # Saving outputs.
+    train_features_output_path = os.path.join("/opt/ml/processing/train", "train_features.csv")
+    train_labels_output_path = os.path.join("/opt/ml/processing/train", "train_labels.csv")
+
+    val_features_output_path = os.path.join("/opt/ml/processing/val", "val_features.csv")
+    val_labels_output_path = os.path.join("/opt/ml/processing/val", "val_labels.csv")
+
+    print("Saving training features to {}".format(train_features_output_path))
+    pd.DataFrame(X_train).to_csv(train_features_output_path, header=False, index=False)
+
+    print("Saving training labels to {}".format(train_labels_output_path))
+    pd.DataFrame(y_train).to_csv(train_labels_output_path, header=False, index=False)
+
+    print("Saving validation features to {}".format(val_features_output_path))
+    pd.DataFrame(X_val).to_csv(val_features_output_path, header=False, index=False)
+
+    print("Saving validation labels to {}".format(val_labels_output_path))
+    pd.DataFrame(y_val).to_csv(val_labels_output_path, header=False, index=False)
+
+    # Saving model.
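+    # Note: SageMaker model artifacts must be packaged as a gzipped tar archive
+    # (model.tar.gz). The SKLearnModel object created in the notebook expects to
+    # find model.joblib inside that archive, so we tar the joblib file below.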
+    model_path = os.path.join("/opt/ml/processing/model", "model.joblib")
+    model_output_path = os.path.join("/opt/ml/processing/model", "model.tar.gz")
+
+    print("Saving featurizer model to {}".format(model_output_path))
+    joblib.dump(featurizer_model, model_path)
+    tar = tarfile.open(model_output_path, "w:gz")
+    tar.add(model_path, arcname="model.joblib")
+    tar.close()
diff --git a/sagemaker-clarify/clarify-explainability-inference-pipelines/training/train_xgboost.py b/sagemaker-clarify/clarify-explainability-inference-pipelines/training/train_xgboost.py
new file mode 100644
index 0000000000..88e5e990a7
--- /dev/null
+++ b/sagemaker-clarify/clarify-explainability-inference-pipelines/training/train_xgboost.py
@@ -0,0 +1,86 @@
+import argparse
+import os
+import pandas as pd
+import pickle as pkl
+
+import xgboost
+
+
+def parse_args():
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--max_depth", type=int, default=5)
+    parser.add_argument("--eta", type=float, default=0.05)
+    parser.add_argument("--gamma", type=int, default=4)
+    parser.add_argument("--min_child_weight", type=int, default=6)
+    parser.add_argument("--silent", type=int, default=0)
+    parser.add_argument("--objective", type=str, default="binary:logistic")
+    parser.add_argument("--eval_metric", type=str, default="auc")
+    parser.add_argument("--num_round", type=int, default=100)
+    parser.add_argument("--subsample", type=float, default=0.8)
+    parser.add_argument("--early_stopping_rounds", type=int, default=20)
+
+    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
+    parser.add_argument("--validation", type=str, default=os.environ.get("SM_CHANNEL_VALIDATION"))
+
+    args = parser.parse_args()
+
+    return args
+
+
+def main():
+
+    args = parse_args()
+
+    train_features_path = os.path.join(args.train, "train_features.csv")
+    train_labels_path = os.path.join(args.train, "train_labels.csv")
+
+    val_features_path = os.path.join(args.validation, "val_features.csv")
+    val_labels_path = os.path.join(args.validation, "val_labels.csv")
+
+    print("Loading training dataframes...")
+    # The preprocessing job writes these CSV files without headers, so read
+    # them with header=None to avoid silently dropping the first row.
+    df_train_features = pd.read_csv(train_features_path, header=None)
+    df_train_labels = pd.read_csv(train_labels_path, header=None)
+
+    print("Loading validation dataframes...")
+    df_val_features = pd.read_csv(val_features_path, header=None)
+    df_val_labels = pd.read_csv(val_labels_path, header=None)
+
+    X = df_train_features.values
+    y = df_train_labels.values
+
+    val_X = df_val_features.values
+    val_y = df_val_labels.values
+
+    dtrain = xgboost.DMatrix(X, label=y)
+    dval = xgboost.DMatrix(val_X, label=val_y)
+
+    watchlist = [(dtrain, "train"), (dval, "validation")]
+
+    params = {
+        "max_depth": args.max_depth,
+        "eta": args.eta,
+        "gamma": args.gamma,
+        "min_child_weight": args.min_child_weight,
+        "silent": args.silent,
+        "objective": args.objective,
+        "subsample": args.subsample,
+        "eval_metric": args.eval_metric,
+        "early_stopping_rounds": args.early_stopping_rounds,
+    }
+
+    bst = xgboost.train(
+        params=params, dtrain=dtrain, evals=watchlist, num_boost_round=args.num_round
+    )
+
+    model_dir = os.environ.get("SM_MODEL_DIR")
+    pkl.dump(bst, open(model_dir + "/model.bin", "wb"))
+
+
+if __name__ == "__main__":
+    main()