From f2112bee3864679bf832447e16b26f38555f0bfa Mon Sep 17 00:00:00 2001 From: atqy <95724753+atqy@users.noreply.github.com> Date: Fri, 29 Apr 2022 10:07:30 -0700 Subject: [PATCH] Refactor sequential notebooks into independent notebooks for use-case/retail-recommend (#3363) * refactor sequential notebooks into independent notebooks * cleanup * reformat * make pandas version compatible * reformat * cleanup * dleete instance type * edit links Co-authored-by: EC2 Default User --- use-cases/index.rst | 5 +- .../1_retail_recommend_dataprep.ipynb | 715 ------------- ...rain_tune.ipynb => retail_recommend.ipynb} | 961 ++++++++++-------- ....ipynb => retail_recommend_pipeline.ipynb} | 105 +- 4 files changed, 588 insertions(+), 1198 deletions(-) delete mode 100644 use-cases/retail_recommend/1_retail_recommend_dataprep.ipynb rename use-cases/retail_recommend/{2_retail_recommend_train_tune.ipynb => retail_recommend.ipynb} (65%) rename use-cases/retail_recommend/{3_retail_recommend_pipeline.ipynb => retail_recommend_pipeline.ipynb} (96%) diff --git a/use-cases/index.rst b/use-cases/index.rst index 9ab084bbf6..5f406a8668 100644 --- a/use-cases/index.rst +++ b/use-cases/index.rst @@ -27,9 +27,8 @@ E-Commerce Personalization .. toctree:: :maxdepth: 1 - retail_recommend/1_retail_recommend_dataprep - retail_recommend/2_retail_recommend_train_tune - retail_recommend/3_retail_recommend_pipeline + retail_recommend/retail_recommend + retail_recommend/retail_recommend_pipeline Computer Vision for Medical Imaging diff --git a/use-cases/retail_recommend/1_retail_recommend_dataprep.ipynb b/use-cases/retail_recommend/1_retail_recommend_dataprep.ipynb deleted file mode 100644 index 6b4c30b5e4..0000000000 --- a/use-cases/retail_recommend/1_retail_recommend_dataprep.ipynb +++ /dev/null @@ -1,715 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Recommendation Engine for E-Commerce Sales: Part 1. 
Data Preparation\n", - "\n", - "This notebook gives an overview of techniques and services offer by SageMaker to build and deploy a personalized recommendation engine.\n", - "\n", - "## Dataset\n", - "\n", - "The dataset for this demo comes from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Online+Retail). It contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retail. The company mainly sells unique all-occasion gifts. The following attributes are included in our dataset:\n", - "+ InvoiceNo: Invoice number. Nominal, a 6-digit integral number uniquely assigned to each transaction. If this code starts with letter 'c', it indicates a cancellation.\n", - "+ StockCode: Product (item) code. Nominal, a 5-digit integral number uniquely assigned to each distinct product.\n", - "+ Description: Product (item) name. Nominal.\n", - "+ Quantity: The quantities of each product (item) per transaction. Numeric.\n", - "+ InvoiceDate: Invice Date and time. Numeric, the day and time when each transaction was generated.\n", - "+ UnitPrice: Unit price. Numeric, Product price per unit in sterling.\n", - "+ CustomerID: Customer number. Nominal, a 5-digit integral number uniquely assigned to each customer.\n", - "+ Country: Country name. Nominal, the name of the country where each customer resides. \n", - "\n", - "Citation: Daqing Chen, Sai Liang Sain, and Kun Guo, Data mining for the online retail industry: A case study of RFM model-based customer segmentation using data mining, Journal of Database Marketing and Customer Strategy Management, Vol. 19, No. 3, pp. 197–208, 2012 (Published online before print: 27 August 2012. 
doi: 10.1057/dbm.2012.17)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Solution Architecture\n", - "----\n", - "![Architecture](./images/retail_rec_dataprep.png)" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "!pip install -Uq sagemaker boto3" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stored variables and their in-db values:\n" - ] - } - ], - "source": [ - "%store -r\n", - "%store" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import sagemaker\n", - "import sagemaker.amazon.common as smac\n", - "import boto3\n", - "\n", - "import io\n", - "import json\n", - "import numpy as np\n", - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "\n", - "from scipy.sparse import csr_matrix, hstack, save_npz\n", - "from sklearn.preprocessing import OneHotEncoder\n", - "from sklearn.feature_extraction.text import TfidfVectorizer\n", - "from sklearn.model_selection import train_test_split" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "assert sagemaker.__version__ >= \"2.21.0\"" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "region = boto3.Session().region_name\n", - "boto3.setup_default_session(region_name=region)\n", - "boto_session = boto3.Session(region_name=region)\n", - "\n", - "s3_client = boto3.client(\"s3\", region_name=region)\n", - "\n", - "sagemaker_boto_client = boto_session.client(\"sagemaker\")\n", - "sagemaker_session = sagemaker.session.Session(\n", - " boto_session=boto_session, sagemaker_client=sagemaker_boto_client\n", - ")\n", - "sagemaker_role = sagemaker.get_execution_role()\n", - "\n", - "bucket = 
sagemaker_session.default_bucket()\n", - "print(f\"using bucket{bucket} in region {region} \\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Read the data" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(541909, 8)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
InvoiceNoStockCodeDescriptionQuantityInvoiceDateUnitPriceCustomerIDCountry
053636585123AWHITE HANGING HEART T-LIGHT HOLDER62010-12-01 08:26:002.5517850.0United Kingdom
153636571053WHITE METAL LANTERN62010-12-01 08:26:003.3917850.0United Kingdom
253636584406BCREAM CUPID HEARTS COAT HANGER82010-12-01 08:26:002.7517850.0United Kingdom
353636584029GKNITTED UNION FLAG HOT WATER BOTTLE62010-12-01 08:26:003.3917850.0United Kingdom
453636584029ERED WOOLLY HOTTIE WHITE HEART.62010-12-01 08:26:003.3917850.0United Kingdom
\n", - "
" - ], - "text/plain": [ - " InvoiceNo StockCode Description Quantity \\\n", - "0 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 \n", - "1 536365 71053 WHITE METAL LANTERN 6 \n", - "2 536365 84406B CREAM CUPID HEARTS COAT HANGER 8 \n", - "3 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6 \n", - "4 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 \n", - "\n", - " InvoiceDate UnitPrice CustomerID Country \n", - "0 2010-12-01 08:26:00 2.55 17850.0 United Kingdom \n", - "1 2010-12-01 08:26:00 3.39 17850.0 United Kingdom \n", - "2 2010-12-01 08:26:00 2.75 17850.0 United Kingdom \n", - "3 2010-12-01 08:26:00 3.39 17850.0 United Kingdom \n", - "4 2010-12-01 08:26:00 3.39 17850.0 United Kingdom " - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = pd.read_csv(\"data/Online Retail.csv\")\n", - "print(df.shape)\n", - "df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Preprocessing\n", - "\n", - "First, we check for any null (i.e. missing) values." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "InvoiceNo 0\n", - "StockCode 0\n", - "Description 1454\n", - "Quantity 0\n", - "InvoiceDate 0\n", - "UnitPrice 0\n", - "CustomerID 135080\n", - "Country 0\n", - "dtype: int64" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.isna().sum()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Drop any records with a missing CustomerID. If we do not know who the customer is, then it is not helpful to us when we make recommendations." 
- ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(406829, 8)\n" - ] - } - ], - "source": [ - "df.dropna(subset=[\"CustomerID\"], inplace=True)\n", - "df[\"Description\"] = df[\"Description\"].apply(lambda x: x.strip())\n", - "print(df.shape)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.figure(figsize=(10, 5))\n", - "sns.distplot(df[\"Quantity\"], kde=True)\n", - "plt.title(\"Distribution of Quantity\")\n", - "plt.xlabel(\"Quantity\");" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Most of our quantities are realteively small (positive) numbers, but there are also some negative quantities as well as extreme outliers (both postiive and negative outliers). " - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.figure(figsize=(10, 5))\n", - "sns.distplot(df[\"UnitPrice\"], kde=True)\n", - "plt.title(\"Distribution of Unit Prices\")\n", - "plt.xlabel(\"Price\");" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "There are no negative prices, which is good, but we can see some extreme outliers." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
QuantityUnitPriceCustomerID
count406829.000000406829.000000406829.000000
mean12.0613033.46047115287.690570
std248.69337069.3151621713.600303
min-80995.0000000.00000012346.000000
25%2.0000001.25000013953.000000
50%5.0000001.95000015152.000000
75%12.0000003.75000016791.000000
max80995.00000038970.00000018287.000000
\n", - "
" - ], - "text/plain": [ - " Quantity UnitPrice CustomerID\n", - "count 406829.000000 406829.000000 406829.000000\n", - "mean 12.061303 3.460471 15287.690570\n", - "std 248.693370 69.315162 1713.600303\n", - "min -80995.000000 0.000000 12346.000000\n", - "25% 2.000000 1.250000 13953.000000\n", - "50% 5.000000 1.950000 15152.000000\n", - "75% 12.000000 3.750000 16791.000000\n", - "max 80995.000000 38970.000000 18287.000000" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(274399, 6)" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = df.groupby([\"StockCode\", \"Description\", \"CustomerID\", \"Country\", \"UnitPrice\"])[\n", - " \"Quantity\"\n", - "].sum()\n", - "df = df.loc[df > 0].reset_index()\n", - "df.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "def loadDataset(dataframe):\n", - " enc = OneHotEncoder(handle_unknown=\"ignore\")\n", - " onehot_cols = [\"StockCode\", \"CustomerID\", \"Country\"]\n", - " ohe_output = enc.fit_transform(dataframe[onehot_cols])\n", - "\n", - " vectorizer = TfidfVectorizer(min_df=2)\n", - " unique_descriptions = dataframe[\"Description\"].unique()\n", - " vectorizer.fit(unique_descriptions)\n", - " tfidf_output = vectorizer.transform(dataframe[\"Description\"])\n", - "\n", - " row = range(len(dataframe))\n", - " col = [0] * len(dataframe)\n", - " unit_price = csr_matrix((dataframe[\"UnitPrice\"].values, (row, col)), dtype=\"float32\")\n", - "\n", - " X = hstack([ohe_output, tfidf_output, unit_price], format=\"csr\", dtype=\"float32\")\n", - "\n", - " y = dataframe[\"Quantity\"].values.astype(\"float32\")\n", - "\n", - " return X, y" - ] - }, - { - "cell_type": "code", - "execution_count": 
14, - "metadata": {}, - "outputs": [], - "source": [ - "X, y = loadDataset(df)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.9991284988048746" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# display sparsity\n", - "total_cells = X.shape[0] * X.shape[1]\n", - "(total_cells - X.nnz) / total_cells" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Our data is over 99.9% sparse. Because of this high sparsity, the sparse matrix data type allows us to represent our data using only a small fraction of the memory that a dense matrix would require." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prepare Data For Modeling\n", - "\n", - "+ Split the data into training and testing sets\n", - "+ Write the data to protobuf recordIO format for Pipe mode. [Read more](https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html) about protobuf recordIO format." 
- ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "((219519, 9284), (54880, 9284), (219519,), (54880,))" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", - "\n", - "X_train.shape, X_test.shape, y_train.shape, y_test.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Save numpy arrays to local storage in /data folder\n" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "df.to_csv(\"data/online_retail_preprocessed.csv\", index=False)\n", - "save_npz(\"data/X_train.npz\", X_train)\n", - "save_npz(\"data/X_test.npz\", X_test)\n", - "np.savez(\"data/y_train.npz\", y_train)\n", - "np.savez(\"data/y_test.npz\", y_test)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "prefix = \"personalization\"\n", - "\n", - "train_key = \"train.protobuf\"\n", - "train_prefix = f\"{prefix}/train\"\n", - "\n", - "test_key = \"test.protobuf\"\n", - "test_prefix = f\"{prefix}/test\"\n", - "\n", - "output_prefix = f\"s3://{bucket}/{prefix}/output\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def writeDatasetToProtobuf(X, y, bucket, prefix, key):\n", - " buf = io.BytesIO()\n", - " smac.write_spmatrix_to_sparse_tensor(buf, X, y)\n", - " buf.seek(0)\n", - " obj = \"{}/{}\".format(prefix, key)\n", - " boto3.resource(\"s3\").Bucket(bucket).Object(obj).upload_fileobj(buf)\n", - " return \"s3://{}/{}\".format(bucket, obj)\n", - "\n", - "\n", - "train_data_location = writeDatasetToProtobuf(X_train, y_train, bucket, train_prefix, train_key)\n", - "test_data_location = writeDatasetToProtobuf(X_test, y_test, bucket, test_prefix, test_key)\n", - 
"\n", - "print(train_data_location)\n", - "print(test_data_location)\n", - "print(\"Output: {}\".format(output_prefix))" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stored 'train_data_location' (str)\n", - "Stored 'test_data_location' (str)\n" - ] - } - ], - "source": [ - "%store train_data_location\n", - "%store test_data_location" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the next notebook we will explore training and tuning." - ] - } - ], - "metadata": { - "instance_type": "ml.t3.medium", - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.5" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/use-cases/retail_recommend/2_retail_recommend_train_tune.ipynb b/use-cases/retail_recommend/retail_recommend.ipynb similarity index 65% rename from use-cases/retail_recommend/2_retail_recommend_train_tune.ipynb rename to use-cases/retail_recommend/retail_recommend.ipynb index 3bb6535cf2..ef3775439e 100644 --- a/use-cases/retail_recommend/2_retail_recommend_train_tune.ipynb +++ b/use-cases/retail_recommend/retail_recommend.ipynb @@ -4,14 +4,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Recommendation Engine for E-Commerce Sales: Part 2. 
Train and Make Predictions\n", + "# Recommendation Engine for E-Commerce Sales\n", "\n", "This notebook gives an overview of techniques and services offer by SageMaker to build and deploy a personalized recommendation engine.\n", "\n", "## Dataset\n", "\n", "The dataset for this demo comes from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Online+Retail). It contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retail. The company mainly sells unique all-occasion gifts. The following attributes are included in our dataset:\n", - "\n", "+ InvoiceNo: Invoice number. Nominal, a 6-digit integral number uniquely assigned to each transaction. If this code starts with letter 'c', it indicates a cancellation.\n", "+ StockCode: Product (item) code. Nominal, a 5-digit integral number uniquely assigned to each distinct product.\n", "+ Description: Product (item) name. Nominal.\n", @@ -28,9 +27,225 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Solution Architecture\n", + "## Part 1: Data Preparation\n", "----\n", - "![Architecture](./images/retail_rec_train_reg_deploy.png)" + "The first part of the notebook will focus on preparing the data for training.\n", + "\n", + "### Solution Architecture\n", + "![Architecture](./images/retail_rec_dataprep.png)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install --upgrade sagemaker" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sagemaker\n", + "import sagemaker.amazon.common as smac\n", + "import boto3\n", + "\n", + "import io\n", + "import json\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "from scipy.sparse import csr_matrix, hstack, save_npz\n", + "from sklearn.preprocessing import 
OneHotEncoder\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert sagemaker.__version__ >= \"2.21.0\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "region = boto3.Session().region_name\n", + "boto3.setup_default_session(region_name=region)\n", + "boto_session = boto3.Session(region_name=region)\n", + "\n", + "s3_client = boto3.client(\"s3\", region_name=region)\n", + "\n", + "sagemaker_boto_client = boto_session.client(\"sagemaker\")\n", + "sagemaker_session = sagemaker.session.Session(\n", + " boto_session=boto_session, sagemaker_client=sagemaker_boto_client\n", + ")\n", + "sagemaker_role = sagemaker.get_execution_role()\n", + "\n", + "bucket = sagemaker_session.default_bucket()\n", + "print(f\"using bucket{bucket} in region {region} \\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Read the data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(\"data/Online Retail.csv\")\n", + "print(df.shape)\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data Preprocessing\n", + "\n", + "First, we check for any null (i.e. missing) values." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.isna().sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Drop any records with a missing CustomerID. If we do not know who the customer is, then it is not helpful to us when we make recommendations." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.dropna(subset=[\"CustomerID\"], inplace=True)\n", + "df[\"Description\"] = df[\"Description\"].apply(lambda x: x.strip())\n", + "print(df.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(10, 5))\n", + "sns.distplot(df[\"Quantity\"], kde=True)\n", + "plt.title(\"Distribution of Quantity\")\n", + "plt.xlabel(\"Quantity\");" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Most of our quantities are relatively small (positive) numbers, but there are also some negative quantities as well as extreme outliers (both positive and negative outliers). " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(10, 5))\n", + "sns.distplot(df[\"UnitPrice\"], kde=True)\n", + "plt.title(\"Distribution of Unit Prices\")\n", + "plt.xlabel(\"Price\");" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There are no negative prices, which is good, but we can see some extreme outliers." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = df.groupby([\"StockCode\", \"Description\", \"CustomerID\", \"Country\", \"UnitPrice\"])[\n", + " \"Quantity\"\n", + "].sum()\n", + "df = df.loc[df > 0].reset_index()\n", + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def loadDataset(dataframe):\n", + " enc = OneHotEncoder(handle_unknown=\"ignore\")\n", + " onehot_cols = [\"StockCode\", \"CustomerID\", \"Country\"]\n", + " ohe_output = enc.fit_transform(dataframe[onehot_cols])\n", + "\n", + " vectorizer = TfidfVectorizer(min_df=2)\n", + " unique_descriptions = dataframe[\"Description\"].unique()\n", + " vectorizer.fit(unique_descriptions)\n", + " tfidf_output = vectorizer.transform(dataframe[\"Description\"])\n", + "\n", + " row = range(len(dataframe))\n", + " col = [0] * len(dataframe)\n", + " unit_price = csr_matrix((dataframe[\"UnitPrice\"].values, (row, col)), dtype=\"float32\")\n", + "\n", + " X = hstack([ohe_output, tfidf_output, unit_price], format=\"csr\", dtype=\"float32\")\n", + "\n", + " y = dataframe[\"Quantity\"].values.astype(\"float32\")\n", + "\n", + " return X, y" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X, y = loadDataset(df)" ] }, { @@ -38,22 +253,58 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# display sparsity\n", + "total_cells = X.shape[0] * X.shape[1]\n", + "(total_cells - X.nnz) / total_cells" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Our data is over 99.9% sparse. 
Because of this high sparsity, the sparse matrix data type allows us to represent our data using only a small fraction of the memory that a dense matrix would require." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prepare Data For Modeling\n", + "\n", + "+ Split the data into training and testing sets\n", + "+ Write the data to protobuf recordIO format for Pipe mode. [Read more](https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html) about protobuf recordIO format." + ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", + "\n", + "X_train.shape, X_test.shape, y_train.shape, y_test.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Save numpy arrays to local storage in /data folder\n" + ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "!pip install -Uq sagemaker boto3" + "df.to_csv(\"data/online_retail_preprocessed.csv\", index=False)\n", + "save_npz(\"data/X_train.npz\", X_train)\n", + "save_npz(\"data/X_test.npz\", X_test)\n", + "np.savez(\"data/y_train.npz\", y_train)\n", + "np.savez(\"data/y_test.npz\", y_test)" ] }, { @@ -62,13 +313,55 @@ "metadata": {}, "outputs": [], "source": [ - "%store -r\n", - "%store" + "prefix = \"personalization\"\n", + "\n", + "train_key = \"train.protobuf\"\n", + "train_prefix = f\"{prefix}/train\"\n", + "\n", + "test_key = \"test.protobuf\"\n", + "test_prefix = f\"{prefix}/test\"\n", + "\n", + "output_prefix = f\"s3://{bucket}/{prefix}/output\"" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def writeDatasetToProtobuf(X, y, bucket, prefix, key):\n", + " buf = io.BytesIO()\n", + " smac.write_spmatrix_to_sparse_tensor(buf, X, y)\n", + " 
buf.seek(0)\n", + " obj = \"{}/{}\".format(prefix, key)\n", + " boto3.resource(\"s3\").Bucket(bucket).Object(obj).upload_fileobj(buf)\n", + " return \"s3://{}/{}\".format(bucket, obj)\n", + "\n", + "\n", + "train_data_location = writeDatasetToProtobuf(X_train, y_train, bucket, train_prefix, train_key)\n", + "test_data_location = writeDatasetToProtobuf(X_test, y_test, bucket, test_prefix, test_key)\n", + "\n", + "print(train_data_location)\n", + "print(test_data_location)\n", + "print(\"Output: {}\".format(output_prefix))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Part 2: Train, Tune, and Deploy Model\n", + "----\n", + "This second part will focus on training, tuning, and deploying a model trained on the data prepared in part 1.\n", + "\n", + "### Solution Architecture\n", + "![Architecture](./images/retail_rec_train_reg_deploy.png)" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -90,16 +383,7 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "assert sagemaker.__version__ >= \"2.21.0\"" - ] - }, - { - "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -126,158 +410,261 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Read the data" + "### Prepare Data For Modeling\n", + "\n", + "+ Split the data into training and testing sets\n", + "+ Write the data to protobuf recordIO format for Pipe mode. [Read more](https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html) about protobuf recordIO format." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# load array\n", + "X_train = load_npz(\"./data/X_train.npz\")\n", + "X_test = load_npz(\"./data/X_test.npz\")\n", + "y_train_npzfile = np.load(\"./data/y_train.npz\")\n", + "y_test_npzfile = np.load(\"./data/y_test.npz\")\n", + "y_train = y_train_npzfile.f.arr_0\n", + "y_test = y_test_npzfile.f.arr_0\n", + "\n", + "X_train.shape, X_test.shape, y_train.shape, y_test.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "input_dims = X_train.shape[1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "container = sagemaker.image_uris.retrieve(\"factorization-machines\", region=boto_session.region_name)\n", + "\n", + "fm = sagemaker.estimator.Estimator(\n", + " container,\n", + " sagemaker_role,\n", + " instance_count=1,\n", + " instance_type=\"ml.c5.xlarge\",\n", + " output_path=output_prefix,\n", + " sagemaker_session=sagemaker_session,\n", + ")\n", + "\n", + "fm.set_hyperparameters(\n", + " feature_dim=input_dims,\n", + " predictor_type=\"regressor\",\n", + " mini_batch_size=1000,\n", + " num_factors=64,\n", + " epochs=20,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if \"training_job_name\" not in locals():\n", + "\n", + " fm.fit({\"train\": train_data_location, \"test\": test_data_location})\n", + " training_job_name = fm.latest_training_job.job_name\n", + "\n", + "else:\n", + " print(f\"Using previous training job: {training_job_name}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Prepare Data For Modeling\n", + "## Make Predictions\n", "\n", - "+ Split the data into training and testing sets\n", - "+ Write the data to protobuf recordIO format for Pipe mode. 
[Read more](https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html) about protobuf recordIO format." + "Now that we've trained our model, we can deploy it behind an Amazon SageMaker real-time hosted endpoint. This will allow us to make predictions (or inference) from the model dynamically.\n", + "\n", + "Note, Amazon SageMaker allows you the flexibility of importing models trained elsewhere, as well as the choice of not importing models if the target of model creation is AWS Lambda, AWS Greengrass, Amazon Redshift, Amazon Athena, or other deployment target.\n", + "\n", + "Here we will take the top customer, the customer who spent the most money, and try to find which items to recommend to them." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.deserializers import JSONDeserializer\n", + "from sagemaker.serializers import JSONSerializer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class FMSerializer(JSONSerializer):\n", + " def serialize(self, data):\n", + " js = {\"instances\": []}\n", + " for row in data:\n", + " js[\"instances\"].append({\"features\": row.tolist()})\n", + " return json.dumps(js)\n", + "\n", + "\n", + "fm_predictor = fm.deploy(\n", + " initial_instance_count=1,\n", + " instance_type=\"ml.m4.xlarge\",\n", + " serializer=FMSerializer(),\n", + " deserializer=JSONDeserializer(),\n", + ")" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# load array\n", - "X_train = load_npz(\"./data/X_train.npz\")\n", - "X_test = load_npz(\"./data/X_test.npz\")\n", - "y_train_npzfile = np.load(\"./data/y_train.npz\")\n", - "y_test_npzfile = np.load(\"./data/y_test.npz\")\n", - "y_train = y_train_npzfile.f.arr_0\n", - "y_test = y_test_npzfile.f.arr_0" + "# find customer who spent the most money\n", + "df = 
pd.read_csv(\"data/online_retail_preprocessed.csv\")\n", + "\n", + "df[\"invoice_amount\"] = df[\"Quantity\"] * df[\"UnitPrice\"]\n", + "top_customer = (\n", + " df.groupby(\"CustomerID\").sum()[\"invoice_amount\"].sort_values(ascending=False).index[0]\n", + ")" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "((219519, 9284), (54880, 9284), (219519,), (54880,))" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "X_train.shape, X_test.shape, y_train.shape, y_test.shape" + "def get_recommendations(df, customer_id, n_recommendations, n_ranks=100):\n", + " popular_items = (\n", + " df.groupby([\"StockCode\", \"UnitPrice\"])\n", + " .nunique()[\"CustomerID\"]\n", + " .sort_values(ascending=False)\n", + " .reset_index()\n", + " )\n", + " top_n_items = popular_items[\"StockCode\"].iloc[:n_ranks].values\n", + " top_n_prices = popular_items[\"UnitPrice\"].iloc[:n_ranks].values\n", + "\n", + " # stock codes can have multiple descriptions, so we will choose whichever description is most common\n", + " item_map = df.groupby(\"StockCode\").agg(lambda x: x.value_counts().index[0])[\"Description\"]\n", + "\n", + " # find customer's country\n", + " df_subset = df.loc[df[\"CustomerID\"] == customer_id]\n", + " country = df_subset[\"Country\"].value_counts().index[0]\n", + "\n", + " data = []\n", + " flattened_item_map = [item_map[i] for i in top_n_items]\n", + " for idx in range(len(top_n_items)):\n", + " data.append(\n", + " {\n", + " \"StockCode\": top_n_items[idx],\n", + " \"Description\": flattened_item_map[idx],\n", + " \"CustomerID\": customer_id,\n", + " \"Country\": country,\n", + " \"UnitPrice\": top_n_prices[idx],\n", + " }\n", + " )\n", + "\n", + " df_inference = pd.DataFrame(data)\n", + "\n", + " # we need to build the data set similar to how we built it for training\n", + " # it should have the 
same number of features as the training data\n", + " enc = OneHotEncoder(handle_unknown=\"ignore\")\n", + " onehot_cols = [\"StockCode\", \"CustomerID\", \"Country\"]\n", + " enc.fit(df[onehot_cols])\n", + " onehot_output = enc.transform(df_inference[onehot_cols])\n", + "\n", + " vectorizer = TfidfVectorizer(min_df=2)\n", + " unique_descriptions = df[\"Description\"].unique()\n", + " vectorizer.fit(unique_descriptions)\n", + " tfidf_output = vectorizer.transform(df_inference[\"Description\"])\n", + "\n", + " row = range(len(df_inference))\n", + " col = [0] * len(df_inference)\n", + " unit_price = csr_matrix((df_inference[\"UnitPrice\"].values, (row, col)), dtype=\"float32\")\n", + "\n", + " X_inference = hstack([onehot_output, tfidf_output, unit_price], format=\"csr\")\n", + "\n", + " result = fm_predictor.predict(X_inference.toarray())\n", + " preds = [i[\"score\"] for i in result[\"predictions\"]]\n", + " index_array = np.array(preds).argsort()\n", + " items = enc.inverse_transform(onehot_output)[:, 0]\n", + " top_recs = np.take_along_axis(items, index_array, axis=0)[: -n_recommendations - 1 : -1]\n", + " recommendations = [[i, item_map[i]] for i in top_recs]\n", + " return recommendations" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stored 'input_dims' (int)\n" - ] - } - ], + "outputs": [], "source": [ - "input_dims = X_train.shape[1]\n", - "%store input_dims" + "print(\"Top 5 recommended products:\")\n", + "get_recommendations(df, top_customer, n_recommendations=5, n_ranks=100)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Train the factorization machine model\n", - "\n", - "Once we have the data preprocessed and available in the correct format for training, the next step is to actually train the model using the data. 
\n", - "\n", - "We'll use the Amazon SageMaker Python SDK to kick off training and monitor status until it is completed. In this example that takes only a few minutes. Despite the model only need 1-2 minutes to train, there is some extra time required upfront to provision hardware and load the algorithm container.\n", - "\n", - "First, let's specify our containers. To find the rigth container, we'll create a small lookup. More details on algorithm containers can be found in [AWS documentation.](https://docs-aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html)" + "Once you are done with the endpoint, you should delete the endpoint to save cost and free resources." ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "container = sagemaker.image_uris.retrieve(\"factorization-machines\", region=boto_session.region_name)\n", - "\n", - "fm = sagemaker.estimator.Estimator(\n", - " container,\n", - " sagemaker_role,\n", - " instance_count=1,\n", - " instance_type=\"ml.c5.xlarge\",\n", - " output_path=output_prefix,\n", - " sagemaker_session=sagemaker_session,\n", - ")\n", - "\n", - "fm.set_hyperparameters(\n", - " feature_dim=input_dims,\n", - " predictor_type=\"regressor\",\n", - " mini_batch_size=1000,\n", - " num_factors=64,\n", - " epochs=20,\n", - ")" + "fm_predictor.delete_model()\n", + "fm_predictor.delete_endpoint()" ] }, { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], + "cell_type": "markdown", + "metadata": {}, "source": [ - "if 'training_job_name' not in locals():\n", - " \n", - " fm.fit({'train': train_data_location, 'test': test_data_location})\n", - " training_job_name = fm.latest_training_job.job_name\n", - " %store training_job_name\n", - " \n", - "else:\n", - " print(f'Using previous training job: {training_job_name}')" + "## Optional Part: Registering the Model in SageMaker Model Registry\n", + "\n", 
+ "Once a useful model has been trained, you have the option to register the model for future reference and possible deployment. To do so, we must first properly associate the artifacts of the model." ] }, { - "cell_type": "code", - "execution_count": 11, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "training_job_info = sagemaker_boto_client.describe_training_job(TrainingJobName=training_job_name)" + "### Training data artifact" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "### Training data artifact" + "training_job_info = sagemaker_boto_client.describe_training_job(TrainingJobName=training_job_name)" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Using existing artifact: arn:aws:sagemaker:us-east-2:645431112437:artifact/cdd7fbecb4eefa22c43b2ad48140acc2\n" - ] - } - ], + "outputs": [], "source": [ "training_data_s3_uri = training_job_info[\"InputDataConfig\"][0][\"DataSource\"][\"S3DataSource\"][\n", " \"S3Uri\"\n", @@ -318,17 +705,9 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Using existing artifact: arn:aws:sagemaker:us-east-2:645431112437:artifact/3acde2fc029adeff9c767be68feac3a7\n" - ] - } - ], + "outputs": [], "source": [ "trained_model_s3_uri = training_job_info[\"ModelArtifacts\"][\"S3ModelArtifacts\"]\n", "\n", @@ -358,7 +737,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -377,18 +756,9 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Association already exists with DataSet\n", 
- "Association with Model: SUCCEESFUL\n" - ] - } - ], + "outputs": [], "source": [ "artifact_list = [[training_data_artifact, \"ContributedTo\"], [model_artifact, \"Produced\"]]\n", "\n", @@ -430,41 +800,27 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## SageMaker Model Registry\n", - "\n", - "Once a useful model has been trained and its artifacts properly associated, the next step is to register the model for future reference and possible deployment.\n", - "\n", "### Create Model Package Group\n", "\n", - "A Model Package Groups holds multiple versions or iterations of a model. Though it is not required to create them for every model in the registry, they help organize various models which all have the same purpose and provide autiomatic versioning." + "After associating all the relevant artifacts, the Model Package Group can now be created. A Model Package Groups holds multiple versions or iterations of a model. Though it is not required to create them for every model in the registry, they help organize various models which all have the same purpose and provide autiomatic versioning." 
] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stored 'mpg_name' (str)\n", - "Model Package Group name: retail-recommendation-2021-03-01-21-41\n" - ] - } - ], + "outputs": [], "source": [ - "if 'mpg_name' not in locals():\n", - " timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M')\n", - " mpg_name = f'retail-recommendation-{timestamp}'\n", - " %store mpg_name\n", + "if \"mpg_name\" not in locals():\n", + " timestamp = datetime.datetime.now().strftime(\"%Y-%m-%d-%H-%M\")\n", + " mpg_name = f\"retail-recommendation-{timestamp}\"\n", "\n", - "print(f'Model Package Group name: {mpg_name}')" + "print(f\"Model Package Group name: {mpg_name}\")" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -493,7 +849,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -519,7 +875,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -546,7 +902,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -562,7 +918,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -586,17 +942,9 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "model package status: Completed\n" - ] - } - ], + "outputs": [], "source": [ "mp_info = sagemaker_boto_client.describe_model_package(\n", " ModelPackageName=mp_response[\"ModelPackageArn\"]\n", @@ -615,7 +963,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": {}, "outputs": [], 
"source": [ @@ -630,277 +978,24 @@ "update_response = sagemaker_boto_client.update_model_package(**model_package_update)" ] }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Name/SourceDirectionTypeAssociation TypeLineage Type
0s3://...1-03-01-21-36-56-437/output/model.tar.gzInputModelProducedartifact
1s3://...12437/personalization/test/test.protobufInputDataSetContributedToartifact
2s3://...437/personalization/train/train.protobufInputDataSetContributedToartifact
340461...2.amazonaws.com/factorization-machines:1InputImageContributedToartifact
4s3://...1-03-01-21-36-56-437/output/model.tar.gzOutputModelProducedartifact
\n", - "
" - ], - "text/plain": [ - " Name/Source Direction Type \\\n", - "0 s3://...1-03-01-21-36-56-437/output/model.tar.gz Input Model \n", - "1 s3://...12437/personalization/test/test.protobuf Input DataSet \n", - "2 s3://...437/personalization/train/train.protobuf Input DataSet \n", - "3 40461...2.amazonaws.com/factorization-machines:1 Input Image \n", - "4 s3://...1-03-01-21-36-56-437/output/model.tar.gz Output Model \n", - "\n", - " Association Type Lineage Type \n", - "0 Produced artifact \n", - "1 ContributedTo artifact \n", - "2 ContributedTo artifact \n", - "3 ContributedTo artifact \n", - "4 Produced artifact " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from sagemaker.lineage.visualizer import LineageTableVisualizer\n", - "\n", - "viz = LineageTableVisualizer(sagemaker_session)\n", - "display(viz.show(training_job_name=training_job_name))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Make Predictions\n", - "\n", - "Now that we've trained our model, we can deploy it behind an Amazon SageMaker real-time hosted endpoint. This will allow out to make predictions (or inference) from the model dyanamically.\n", - "\n", - "Note, Amazon SageMaker allows you the flexibility of importing models trained elsewhere, as well as the choice of not importing models if the target of model creation is AWS Lambda, AWS Greengrass, Amazon Redshift, Amazon Athena, or other deployment target.\n", - "\n", - "Here we will take the top customer, the customer who spent the most money, and try to find which items to recommend to them." 
- ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "from sagemaker.deserializers import JSONDeserializer\n", - "from sagemaker.serializers import JSONSerializer" - ] - }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "class FMSerializer(JSONSerializer):\n", - " def serialize(self, data):\n", - " js = {\"instances\": []}\n", - " for row in data:\n", - " js[\"instances\"].append({\"features\": row.tolist()})\n", - " return json.dumps(js)\n", - "\n", - "\n", - "fm_predictor = fm.deploy(\n", - " initial_instance_count=1,\n", - " instance_type=\"ml.m4.xlarge\",\n", - " serializer=FMSerializer(),\n", - " deserializer=JSONDeserializer(),\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [], - "source": [ - "# find customer who spent the most money\n", - "df = pd.read_csv(\"data/online_retail_preprocessed.csv\")\n", - "\n", - "df[\"invoice_amount\"] = df[\"Quantity\"] * df[\"UnitPrice\"]\n", - "top_customer = (\n", - " df.groupby(\"CustomerID\").sum()[\"invoice_amount\"].sort_values(ascending=False).index[0]\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [], - "source": [ - "def get_recommendations(df, customer_id, n_recommendations, n_ranks=100):\n", - " popular_items = (\n", - " df.groupby([\"StockCode\", \"UnitPrice\"])\n", - " .nunique()[\"CustomerID\"]\n", - " .sort_values(ascending=False)\n", - " .reset_index()\n", - " )\n", - " top_n_items = popular_items[\"StockCode\"].iloc[:n_ranks].values\n", - " top_n_prices = popular_items[\"UnitPrice\"].iloc[:n_ranks].values\n", - "\n", - " # stock codes can have multiple descriptions, so we will choose whichever description is most common\n", - " item_map = df.groupby(\"StockCode\").agg(lambda x: x.value_counts().index[0])[\"Description\"]\n", - "\n", - " # find customer's country\n", - " df_subset = 
df.loc[df[\"CustomerID\"] == customer_id]\n", - " country = df_subset[\"Country\"].value_counts().index[0]\n", - "\n", - " data = {\n", - " \"StockCode\": top_n_items,\n", - " \"Description\": [item_map[i] for i in top_n_items],\n", - " \"CustomerID\": customer_id,\n", - " \"Country\": country,\n", - " \"UnitPrice\": top_n_prices,\n", - " }\n", - "\n", - " df_inference = pd.DataFrame(data)\n", - "\n", - " # we need to build the data set similar to how we built it for training\n", - " # it should have the same number of features as the training data\n", - " enc = OneHotEncoder(handle_unknown=\"ignore\")\n", - " onehot_cols = [\"StockCode\", \"CustomerID\", \"Country\"]\n", - " enc.fit(df[onehot_cols])\n", - " onehot_output = enc.transform(df_inference[onehot_cols])\n", - "\n", - " vectorizer = TfidfVectorizer(min_df=2)\n", - " unique_descriptions = df[\"Description\"].unique()\n", - " vectorizer.fit(unique_descriptions)\n", - " tfidf_output = vectorizer.transform(df_inference[\"Description\"])\n", - "\n", - " row = range(len(df_inference))\n", - " col = [0] * len(df_inference)\n", - " unit_price = csr_matrix((df_inference[\"UnitPrice\"].values, (row, col)), dtype=\"float32\")\n", - "\n", - " X_inference = hstack([onehot_output, tfidf_output, unit_price], format=\"csr\")\n", + "from sagemaker.lineage.visualizer import LineageTableVisualizer\n", "\n", - " result = fm_predictor.predict(X_inference.toarray())\n", - " preds = [i[\"score\"] for i in result[\"predictions\"]]\n", - " index_array = np.array(preds).argsort()\n", - " items = enc.inverse_transform(onehot_output)[:, 0]\n", - " top_recs = np.take_along_axis(items, index_array, axis=0)[: -n_recommendations - 1 : -1]\n", - " recommendations = [[i, item_map[i]] for i in top_recs]\n", - " return recommendations" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Top 5 recommended products:\n" - ] - }, - { - 
"data": { - "text/plain": [ - "[['22423', 'REGENCY CAKESTAND 3 TIER'],\n", - " ['22776', 'SWEETHEART CAKESTAND 3 TIER'],\n", - " ['22624', 'IVORY KITCHEN SCALES'],\n", - " ['85123A', 'WHITE HANGING HEART T-LIGHT HOLDER'],\n", - " ['85099B', 'JUMBO BAG RED RETROSPOT']]" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "print(\"Top 5 recommended products:\")\n", - "get_recommendations(df, top_customer, n_recommendations=5, n_ranks=100)" + "viz = LineageTableVisualizer(sagemaker_session)\n", + "display(viz.show(training_job_name=training_job_name))" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { - "instance_type": "ml.t3.medium", "kernelspec": { - "display_name": "Python 3", + "display_name": "conda_python3", "language": "python", - "name": "python3" + "name": "conda_python3" }, "language_info": { "codemirror_mode": { @@ -912,7 +1007,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.6.13" } }, "nbformat": 4, diff --git a/use-cases/retail_recommend/3_retail_recommend_pipeline.ipynb b/use-cases/retail_recommend/retail_recommend_pipeline.ipynb similarity index 96% rename from use-cases/retail_recommend/3_retail_recommend_pipeline.ipynb rename to use-cases/retail_recommend/retail_recommend_pipeline.ipynb index 5d9e3b09ae..3e5bc3b221 100644 --- a/use-cases/retail_recommend/3_retail_recommend_pipeline.ipynb +++ b/use-cases/retail_recommend/retail_recommend_pipeline.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Recommendation Engine for E-Commerce Sales: Part 3. 
Build Pipeline\n", + "# Recommendation Engine for E-Commerce Sales - Pipeline Mode\n", "\n", "This notebook gives an overview of techniques and services offer by SageMaker to build and deploy a personalized recommendation engine.\n", "\n", @@ -32,28 +32,18 @@ "![Architecture](./images/retail_rec_pipeline.png)" ] }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "!pip install -Uq sagemaker boto3" - ] - }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "%store -r\n", - "%store" + "! pip install --upgrade sagemaker" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -63,13 +53,16 @@ "from sagemaker.workflow.step_collections import RegisterModel\n", "from sagemaker.sklearn.processing import SKLearnProcessor\n", "from sagemaker.workflow.parameters import ParameterInteger, ParameterFloat, ParameterString\n", + "import datetime\n", "import boto3\n", - "import time" + "import time\n", + "import pandas as pd\n", + "from preprocessing import loadDataset" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -78,7 +71,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -108,9 +101,41 @@ "## Define Estimator" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, the number of feature dimensions must be calculated as it is a hyperparameter of the estimator. The feature dimensions are calculated by looking at the dataset, cleaning and preprocessing it as defined in the first part of [Recommendation Engine for E-Commerce Sales](retail_recommend.ipynb), and then counting the number of feature dimensions are in the processed dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(\"data/Online Retail.csv\")\n", + "df.dropna(subset=[\"CustomerID\"], inplace=True)\n", + "df[\"Description\"] = df[\"Description\"].apply(lambda x: x.strip())\n", + "df = df.groupby([\"StockCode\", \"Description\", \"CustomerID\", \"Country\", \"UnitPrice\"])[\n", + " \"Quantity\"\n", + "].sum()\n", + "df = df.loc[df > 0].reset_index()\n", + "X, y = loadDataset(df)\n", + "input_dims = X.shape[1]\n", + "input_dims" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After calculating all the hyperparameters that are needed, the estimator is created." + ] + }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -145,7 +170,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -159,7 +184,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -170,7 +195,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -209,7 +234,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -233,13 +258,13 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model = sagemaker.model.Model(\n", " name=\"retail-personalization-factorization-machine\",\n", - " image_uri=train_step.properties.AlgorithmSpecification.TrainingImage,\n", + " image_uri=container,\n", " model_data=train_step.properties.ModelArtifacts.S3ModelArtifacts,\n", " sagemaker_session=sagemaker_session,\n", " role=sagemaker_role,\n", @@ -252,10 +277,13 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": 
{}, "outputs": [], "source": [ + "timestamp = datetime.datetime.now().strftime(\"%Y-%m-%d-%H-%M\")\n", + "mpg_name = f\"retail-recommendation-{timestamp}\"\n", + "\n", "register_step = RegisterModel(\n", " name=\"RegisterModel\",\n", " estimator=fm,\n", @@ -271,7 +299,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -306,20 +334,11 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stored 'pipeline_name' (str)\n" - ] - } - ], + "outputs": [], "source": [ "pipeline_name = f\"PersonalizationDemo\"\n", - "%store pipeline_name\n", "\n", "pipeline = Pipeline(\n", " name=pipeline_name,\n", @@ -376,21 +395,13 @@ " display(viz.show(pipeline_execution_step=execution_step))\n", " time.sleep(5)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { - "instance_type": "ml.t3.medium", "kernelspec": { - "display_name": "Python 3", + "display_name": "conda_python3", "language": "python", - "name": "python3" + "name": "conda_python3" }, "language_info": { "codemirror_mode": { @@ -402,7 +413,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.6.13" } }, "nbformat": 4,