diff --git a/LICENSE.txt b/LICENSE.txt
index d645695673..6ff2c6fd00 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -200,3 +200,21 @@
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
+
+ ======================================================================================
+ Amazon SageMaker Examples Subcomponents:
+
+ The Amazon SageMaker Examples project contains subcomponents with separate
+ copyright notices and license terms. Your use of the source code for
+ these subcomponents is subject to the terms and conditions of the following
+ licenses. See licenses/ for text of these licenses.
+
+ If a folder hierarchy is listed as subcomponent, separate listings of
+ further subcomponents (files or folder hierarchies) part of the hierarchy
+ take precedence.
+
+ =======================================================================================
+ 2-clause BSD license
+ =======================================================================================
+ _static/kendrasearchtools.js
+ _templates/search.html
diff --git a/README.md b/README.md
index 14d69da747..ff96af648a 100644
--- a/README.md
+++ b/README.md
@@ -75,6 +75,7 @@ These examples introduce SageMaker Autopilot. Autopilot automatically performs f
- [Customer Churn AutoML](autopilot/) shows how to use SageMaker Autopilot to automatically train a model for the [Predicting Customer Churn](introduction_to_applying_machine_learning/xgboost_customer_churn) task.
- [Targeted Direct Marketing AutoML](autopilot/) shows how to use SageMaker Autopilot to automatically train a model.
- [Housing Prices AutoML](sagemaker-autopilot/housing_prices) shows how to use SageMaker Autopilot for a linear regression problem (predict housing prices).
+- [Portfolio Churn Prediction with Amazon SageMaker Autopilot and Neo4j](autopilot/sagemaker_autopilot_neo4j_portfolio_churn.ipynb) shows how to use SageMaker Autopilot with graph embeddings to predict investment portfolio churn.
### Introduction to Amazon Algorithms
diff --git a/_static/kendrasearchtools.js b/_static/kendrasearchtools.js
new file mode 100644
index 0000000000..4920607010
--- /dev/null
+++ b/_static/kendrasearchtools.js
@@ -0,0 +1,700 @@
+/*
+ * kendrasearchtools.js
+ * ~~~~~~~~~~~~~~~~
+ *
+ * A modification of searchtools.js (https://github.com/sphinx-doc/sphinx/blob/275d9/sphinx/themes/basic/static/searchtools.js)
+ * where the default full-text search implemented in searchtools.js is replaced with AWS Kendra searching over multiple
+ * websites. The default full-text search is still kept and implemented as a fallback in the case that the Kendra search doesn't work.
+ *
+ * :copyright: Copyright 2007-2021 by the Sphinx team, see AUTHORS.
+ * :license: BSD, see LICENSE for details.
+ *
+ */
+
+if (!Scorer) {
+ /**
+ * Simple result scoring code.
+ *
+ * Default weights for ranking search results. They are installed only when
+ * no truthy `Scorer` already exists: the `var` below is hoisted, so the
+ * `!Scorer` test is safe even if nothing defined `Scorer` beforehand.
+ */
+ var Scorer = {
+ // Implement the following function to further tweak the score for each result
+ // The function takes a result array [filename, title, anchor, descr, score]
+ // and returns the new score.
+ /*
+ score: function(result) {
+ return result[4];
+ },
+ */
+
+ // query matches the full name of an object
+ objNameMatch: 11,
+ // or matches in the last dotted part of the object name
+ objPartialMatch: 6,
+ // Additive scores depending on the priority of the object
+ objPrio: {0: 15, // used to be importantResults
+ 1: 5, // used to be objectResults
+ 2: -5}, // used to be unimportantResults
+ // Used when the priority is not in the mapping.
+ objPrioDefault: 0,
+
+ // query found in title
+ title: 15,
+ partialTitle: 7,
+ // query found in terms
+ term: 5,
+ partialTerm: 2
+ };
+}
+
+if (!splitQuery) {
+ // Fallback tokenizer: splits the query string on runs of whitespace.
+ // NOTE(review): defining a function inside this `if` relies on non-strict
+ // (sloppy-mode) handling of block-level function declarations; under strict
+ // mode the `!splitQuery` test would throw when nothing defined it earlier.
+ function splitQuery(query) {
+ return query.split(/\s+/);
+ }
+}
+
+/**
+ * default rtd search (used as fallback)
+ */
+var Search = {
+
+ _index : null,
+ _queued_query : null,
+ _pulse_status : -1,
+
+ htmlToText : function(htmlString) {
+ var virtualDocument = document.implementation.createHTMLDocument('virtual');
+ var htmlElement = $(htmlString, virtualDocument);
+ htmlElement.find('.headerlink').remove();
+ docContent = htmlElement.find('[role=main]')[0];
+ if(docContent === undefined) {
+ console.warn("Content block not found. Sphinx search tries to obtain it " +
+ "via '[role=main]'. Could you check your theme or template.");
+ return "";
+ }
+ return docContent.textContent || docContent.innerText;
+ },
+
+ init : function() {
+ var params = $.getQueryParameters();
+ if (params.q) {
+ var query = params.q[0];
+ $('input[name="q"]')[0].value = query;
+ // this.performSearch(query);
+ }
+ },
+
+ loadIndex : function(url) {
+ $.ajax({type: "GET", url: url, data: null,
+ dataType: "script", cache: true,
+ complete: function(jqxhr, textstatus) {
+ if (textstatus != "success") {
+ document.getElementById("searchindexloader").src = url;
+ }
+ }});
+ },
+
+ setIndex : function(index) {
+ var q;
+ this._index = index;
+ if ((q = this._queued_query) !== null) {
+ this._queued_query = null;
+ Search.query(q);
+ }
+ },
+
+ hasIndex : function() {
+ return this._index !== null;
+ },
+
+ deferQuery : function(query) {
+ this._queued_query = query;
+ },
+
+ // Marks the progress animation as stopped by resetting the counter to 0.
+ // NOTE(review): pulse() in startPulse only stops rescheduling itself when
+ // _pulse_status is not > -1, which the value 0 set here never satisfies -
+ // confirm whether the timer is meant to keep running after this call.
+ stopPulse : function() {
+ this._pulse_status = 0;
+ },
+
+ // Starts the animated "..." progress indicator. Does nothing when
+ // _pulse_status is already >= 0 (i.e. after a previous start or a stop).
+ startPulse : function() {
+ if (this._pulse_status >= 0)
+ return;
+ // One animation step: cycle the counter through 0..3, render that many
+ // dots into Search.dots (presumably assigned elsewhere - not set in the
+ // visible code), then schedule the next step in 500 ms.
+ function pulse() {
+ var i;
+ Search._pulse_status = (Search._pulse_status + 1) % 4;
+ var dotString = '';
+ for (i = 0; i < Search._pulse_status; i++)
+ dotString += '.';
+ Search.dots.text(dotString);
+ // For the values assigned in this object (-1 initially, then 0..3) this
+ // check is always true, so the timer keeps rescheduling.
+ if (Search._pulse_status > -1)
+ window.setTimeout(pulse, 500);
+ }
+ pulse();
+ },
+
+ /**
+ * perform a search for something (or wait until index is loaded)
+ */
+ performSearch : function(query) {
+ // create the required interface elements
+ this.out = $('#search-results');
+ this.title = $('#search-results h2:first'); // $('
').appendTo(this.out);
+ this.out.css("margin", "auto");
+
+ $('#search-progress').text(_('Preparing search...'));
+ this.startPulse();
+
+ this.query(query, 1, pageSize=10, filters=filters)
+ },
+
+};
+
+$(document).ready(function() {
+ KendraSearch.init();
+});
diff --git a/_static/pagination.css b/_static/pagination.css
new file mode 100644
index 0000000000..7584510574
--- /dev/null
+++ b/_static/pagination.css
@@ -0,0 +1,17 @@
+/* Container for the page links. */
+.pagination {
+ display: inline-block;
+}
+
+/* Individual page link, floated left. */
+.pagination a {
+ color: black;
+ float: left;
+ padding: 8px 16px;
+ text-decoration: none;
+}
+
+/* Link carrying the "active" class. */
+.pagination a.active {
+ background-color: #2a80b9;
+ color: white;
+}
+
+/* Hover highlight for every link without the "active" class. */
+.pagination a:hover:not(.active) {background-color: #ddd;}
\ No newline at end of file
diff --git a/_static/search_accessories.css b/_static/search_accessories.css
new file mode 100644
index 0000000000..c7e09e1f06
--- /dev/null
+++ b/_static/search_accessories.css
@@ -0,0 +1,29 @@
+/* Shared pill styling for all search-result badges. */
+.example-badge,
+.aws-doc-badge,
+.sdk-doc-badge {
+  color: white;
+  padding: 0.25rem 0.5rem;
+  text-align: center;
+  border-radius: 5px;
+  font-size: 0.8rem;
+  display: inline-block;
+}
+
+/* Per-badge background colours. */
+.example-badge {
+  background-color: #c63340;
+}
+
+.aws-doc-badge {
+  background-color: #e18b50;
+}
+
+.sdk-doc-badge {
+  background-color: #4c968f;
+}
\ No newline at end of file
diff --git a/_templates/search.html b/_templates/search.html
new file mode 100644
index 0000000000..93c01c7799
--- /dev/null
+++ b/_templates/search.html
@@ -0,0 +1,56 @@
+{#
+ basic/search.html
+ ~~~~~~~~~~~~~~~~~
+
+ Template for the search page.
+
+ :copyright: Copyright 2007-2013 by the Sphinx team, see AUTHORS.
+ :license: BSD, see https://github.com/sphinx-doc/sphinx/blob/master/LICENSE for details.
+#}
+{%- extends "layout.html" %}
+{% set title = _('Search') %}
+{% set display_vcs_links = False %}
+{%- block scripts %}
+ {{ super() }}
+
+
+{%- endblock %}
+{% block footer %}
+{# this is used when loading the search index using $.ajax fails,
+ such as on Chrome for documents on localhost #}
+
+
+{{ super() }}
+{% endblock %}
+{% block body %}
+
+
+{% if search_performed %}
+ {# Translators: Search is a noun, not a verb #}
+
{{ _('Search Results') }}
+ {% if not search_results %}
+
{{ _('Your search did not match any documents. Please make sure that all words are spelled correctly and that you\'ve used the correct terminology.') }}
+ {% endif %}
+{% endif %}
+
+{% if search_results %}
+
+ {% for href, caption, context in search_results %}
+
\ No newline at end of file
diff --git a/advanced_functionality/causal-inference/causal-inference-container.ipynb b/advanced_functionality/causal-inference/causal-inference-container.ipynb
index 7dbd3a41fe..208549e10a 100644
--- a/advanced_functionality/causal-inference/causal-inference-container.ipynb
+++ b/advanced_functionality/causal-inference/causal-inference-container.ipynb
@@ -108,7 +108,7 @@
"fi\n",
"\n",
"# Get the login command from ECR and execute it directly\n",
- "aws ecr get-login-password --region ${region}|docker login --username AWS --password-stdin ${fullname}\n",
+ "$(aws ecr get-login --region $region --registry-ids $account --no-include-email)\n",
"\n",
"# Build the docker image locally with the image name and then push it to ECR\n",
"# with the full name.\n",
@@ -176,6 +176,8 @@
"metadata": {},
"outputs": [],
"source": [
+ "! mkdir data\n",
+ "\n",
"# S3 bucket where the training data is located.\n",
"data_bucket = f\"sagemaker-sample-files\"\n",
"data_prefix = \"datasets/tabular/uci_heart_failure/\"\n",
diff --git a/advanced_functionality/multi_model_catboost/container/Dockerfile b/advanced_functionality/multi_model_catboost/container/Dockerfile
index 089390df06..4e05fca116 100644
--- a/advanced_functionality/multi_model_catboost/container/Dockerfile
+++ b/advanced_functionality/multi_model_catboost/container/Dockerfile
@@ -16,7 +16,7 @@ RUN apt-get update && \
python3 \
vim \
&& rm -rf /var/lib/apt/lists/* \
- && curl -O https://bootstrap.pypa.io/pip/3.7/get-pip.py \
+ && curl -O https://bootstrap.pypa.io/pip/3.6/get-pip.py \
&& python3 get-pip.py
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 1
diff --git a/advanced_functionality/multi_model_catboost/multi_model_catboost.ipynb b/advanced_functionality/multi_model_catboost/multi_model_catboost.ipynb
index cc9c7f91a5..233f0966cf 100644
--- a/advanced_functionality/multi_model_catboost/multi_model_catboost.ipynb
+++ b/advanced_functionality/multi_model_catboost/multi_model_catboost.ipynb
@@ -469,7 +469,7 @@
"metadata": {},
"source": [
"### Invoke just one of models 1000 times \n",
- "Since the models will be in memory and loaded, these invocations will not have any latency \n"
+ "Since the models are already loaded in memory, these invocations should not incur any model-loading latency \n"
]
},
{
diff --git a/advanced_functionality/pipe_bring_your_own/pipe_bring_your_own.ipynb b/advanced_functionality/pipe_bring_your_own/pipe_bring_your_own.ipynb
index 32e05c3714..746a55d1ef 100644
--- a/advanced_functionality/pipe_bring_your_own/pipe_bring_your_own.ipynb
+++ b/advanced_functionality/pipe_bring_your_own/pipe_bring_your_own.ipynb
@@ -203,6 +203,7 @@
"%%sh\n",
"REGION=$(aws configure get region)\n",
"account=$(aws sts get-caller-identity --query Account --output text)\n",
+ "docker login --username AWS --password $(aws ecr get-login-password --region us-west-2) 763104351884.dkr.ecr.us-west-2.amazonaws.com\n",
"aws ecr get-login-password --region ${REGION} | docker login --username AWS --password-stdin ${account}.dkr.ecr.${REGION}.amazonaws.com"
]
},
diff --git a/advanced_functionality/scikit_bring_your_own/container/decision_trees/train b/advanced_functionality/scikit_bring_your_own/container/decision_trees/train
index 8654139ed8..1be2e1eea6 100755
--- a/advanced_functionality/scikit_bring_your_own/container/decision_trees/train
+++ b/advanced_functionality/scikit_bring_your_own/container/decision_trees/train
@@ -44,7 +44,7 @@ def train():
'This usually indicates that the channel ({}) was incorrectly specified,\n' +
'the data specification in S3 was incorrectly specified or the role specified\n' +
'does not have permission to access the data.').format(training_path, channel_name))
- raw_data = [ pd.read_csv(file, header=None) for file in input_files ]
+ raw_data = [ pd.read_csv(file, header=None) for file in input_files if file.endswith(".csv")]
train_data = pd.concat(raw_data)
# labels are in the first column
diff --git a/advanced_functionality/scikit_bring_your_own/scikit_bring_your_own.ipynb b/advanced_functionality/scikit_bring_your_own/scikit_bring_your_own.ipynb
index 9fab0f8d5b..d6c046ce83 100644
--- a/advanced_functionality/scikit_bring_your_own/scikit_bring_your_own.ipynb
+++ b/advanced_functionality/scikit_bring_your_own/scikit_bring_your_own.ipynb
@@ -276,7 +276,7 @@
"# Build the docker image locally with the image name and then push it to ECR\n",
"# with the full name.\n",
"\n",
- "docker build -t ${algorithm_name} .\n",
+ "docker build -t ${algorithm_name} .\n",
"docker tag ${algorithm_name} ${fullname}\n",
"\n",
"docker push ${fullname}"
@@ -315,9 +315,7 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {
- "collapsed": true
- },
+ "metadata": {},
"outputs": [],
"source": [
"# S3 prefix\n",
@@ -347,9 +345,7 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {
- "collapsed": true
- },
+ "metadata": {},
"outputs": [],
"source": [
"import sagemaker as sage\n",
diff --git a/autopilot/index.rst b/autopilot/index.rst
index b8afd1c000..d8e2f4a3f7 100644
--- a/autopilot/index.rst
+++ b/autopilot/index.rst
@@ -8,6 +8,7 @@ Get started with Autopilot
sagemaker_autopilot_direct_marketing
sagemaker_autopilot_abalone_parquet_input
+ sagemaker_autopilot_neo4j_portfolio_churn
Feature selection
diff --git a/autopilot/sagemaker_autopilot_neo4j_portfolio_churn.ipynb b/autopilot/sagemaker_autopilot_neo4j_portfolio_churn.ipynb
new file mode 100644
index 0000000000..3f8774acfe
--- /dev/null
+++ b/autopilot/sagemaker_autopilot_neo4j_portfolio_churn.ipynb
@@ -0,0 +1,944 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "4rQ6G65n6OxV"
+ },
+ "source": [
+ "# Portfolio Churn Prediction with Amazon SageMaker AutoPilot and Neo4j\n",
+ "This notebook describes how to use Neo4j and SageMaker together. In it you connect to a Neo4j instance, load data and compute an embedding. You then load that data into Amazon S3. Finally, you use SageMaker to train a model using the new embedding as an additional feature. \n",
+ "\n",
+ "The data set represents a binary classification problem based on data from the SEC's EDGAR database. It was scraped from the EDGAR system using the code [here](https://github.com/neo4j-partners/neo4j-sec-edgar-form13). The data set consists of Form 13 data, the quarterly filings of asset managers with $100M or more of assets under management (AUM).\n",
+ "\n",
+ "**Important:** This example notebook is for demonstrative purposes only. It is not financial advice and should not be relied on as financial or investment advice."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "IdMFRbqGzSqF"
+ },
+ "source": [
+ "## Deploy Neo4j\n",
+ "You're going to need a Neo4j deployment to run this lab. The easiest way to get that is via the [AWS Marketplace](https://aws.amazon.com/marketplace/seller-profile?id=23ec694a-d2af-4641-b4d3-b7201ab2f5f9). Select \"Neo4j Enterprise Edition\" and deploy that. Suggested parameters are:\n",
+ "\n",
+ "* Stack name - neo4j-ee\n",
+ "* Graph Database Version - 4.4.9\n",
+ "* Install Graph Data Science - True\n",
+ "* Graph Data Science License Key - None\n",
+ "* Install Bloom - False\n",
+ "* Bloom License Key - None\n",
+ "* Password - Enter something here\n",
+ "* Node Count - 1\n",
+ "* Instance Type - r6i.4xlarge\n",
+ "* Disk Size - 100\n",
+ "* SSH CIDR - 0.0.0.0/0\n",
+ "\n",
+ "The Marketplace listing deploys an Auto Scaling Group (ASG) and a Load Balancer (LB) in front of that. When deployment is complete, you can get the DNS name of your LB from the console and use that to connect. You can view deployed NLBs at [Load Balancer](https://console.aws.amazon.com/ec2/v2/home?#LoadBalancers:sort=loadBalancerName).\n",
+ "\n",
+ "If you need to change any parameters after you've deployed, you'll want to delete the stack and redeploy rather than attempting to update the stack."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "9MwTYwKk6OxX"
+ },
+ "source": [
+ "## Using the Neo4j API\n",
+ "Now that we have a Neo4j deployment, let's connect to Neo4j. First off, install the Neo4j Graph Data Science package."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "FT0KaLYj6OxX"
+ },
+ "outputs": [],
+ "source": [
+ "%pip install graphdatascience"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "sFokFbiL6OxY"
+ },
+ "source": [
+ "Now, you're going to need the connection string and credentials from the deployment you created above."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "P41l_P4zzSqF"
+ },
+ "outputs": [],
+ "source": [
+ "# Edit these variables!\n",
+ "DB_URL = \"neo4j://.amazonaws.com:7687\"\n",
+ "DB_PASS = \"\"\n",
+ "\n",
+ "# You can leave this default\n",
+ "DB_USER = \"neo4j\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "8lUkSvmozSqF"
+ },
+ "outputs": [],
+ "source": [
+ "from graphdatascience import GraphDataScience\n",
+ "\n",
+ "gds = GraphDataScience(DB_URL, auth=(DB_USER, DB_PASS))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "_7-MlyTU6OxZ"
+ },
+ "source": [
+ "## Load Data into Neo4j\n",
+ "Now that we've got our connection object, let's load the dataset into Neo4j.\n",
+ "\n",
+ "The dataset is pulled from the SEC's EDGAR database. These are public filings of something called Form 13. Asset managers with over \\$100m AUM are required to submit Form 13 quarterly. Those filings are then made available to the public over HTTP. The CSV files loaded below were pulled from EDGAR using the Python scripts linked above. We've filtered the data to only include filings over \\$10m in value.\n",
+ "\n",
+ "We're going to create constraints for our data."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "VxgUxjVQ6OxZ"
+ },
+ "outputs": [],
+ "source": [
+ "result = gds.run_cypher(\n",
+ " \"CREATE CONSTRAINT IF NOT EXISTS ON (p:Company) ASSERT (p.cusip) IS NODE KEY;\"\n",
+ ")\n",
+ "display(result)\n",
+ "\n",
+ "result = gds.run_cypher(\n",
+ " \"CREATE CONSTRAINT IF NOT EXISTS ON (p:Manager) ASSERT (p.filingManager) IS NODE KEY;\"\n",
+ ")\n",
+ "display(result)\n",
+ "\n",
+ "result = gds.run_cypher(\n",
+ " \"CREATE CONSTRAINT IF NOT EXISTS ON (p:Holding) ASSERT (p.filingManager, p.cusip, p.reportCalendarOrQuarter) IS NODE KEY;\"\n",
+ ")\n",
+ "display(result)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "BdKOItse6Oxa"
+ },
+ "source": [
+ "Now let's load the nodes."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "JgCgdkCt6Oxa"
+ },
+ "outputs": [],
+ "source": [
+ "result = gds.run_cypher(\n",
+ " \"\"\"\n",
+ " LOAD CSV WITH HEADERS FROM \"https://neo4j-dataset.s3.amazonaws.com/form13/2021.csv\" AS row\n",
+ " MERGE (c:Company {cusip:row.cusip})\n",
+ " ON CREATE SET\n",
+ " c.nameOfIssuer=row.nameOfIssuer\n",
+ " \"\"\"\n",
+ ")\n",
+ "display(result)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "MqJZYNES6Oxa"
+ },
+ "outputs": [],
+ "source": [
+ "result = gds.run_cypher(\n",
+ " \"\"\"\n",
+ " LOAD CSV WITH HEADERS FROM \"https://neo4j-dataset.s3.amazonaws.com/form13/2021.csv\" AS row\n",
+ " MERGE (m:Manager {filingManager:row.filingManager})\n",
+ " \"\"\"\n",
+ ")\n",
+ "display(result)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "rERDJtCi6Oxa"
+ },
+ "outputs": [],
+ "source": [
+ "result = gds.run_cypher(\n",
+ " \"\"\"\n",
+ " LOAD CSV WITH HEADERS FROM \"https://neo4j-dataset.s3.amazonaws.com/form13/2021.csv\" AS row\n",
+ " MERGE (h:Holding {filingManager:row.filingManager, cusip:row.cusip, reportCalendarOrQuarter:row.reportCalendarOrQuarter})\n",
+ " ON CREATE SET\n",
+ " h.value=row.value, \n",
+ " h.shares=row.shares,\n",
+ " h.target=row.target,\n",
+ " h.nameOfIssuer=row.nameOfIssuer\n",
+ " \"\"\"\n",
+ ")\n",
+ "display(result)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "vzdC3x316Oxa"
+ },
+ "source": [
+ "Now let's create relationships between those nodes."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "rggD5Yho6Oxa"
+ },
+ "outputs": [],
+ "source": [
+ "result = gds.run_cypher(\n",
+ " \"\"\"\n",
+ " LOAD CSV WITH HEADERS FROM \"https://neo4j-dataset.s3.amazonaws.com/form13/2021.csv\" AS row\n",
+ " MATCH (m:Manager {filingManager:row.filingManager})\n",
+ " MATCH (h:Holding {filingManager:row.filingManager, cusip:row.cusip, reportCalendarOrQuarter:row.reportCalendarOrQuarter})\n",
+ " MERGE (m)-[r:OWNS]->(h)\n",
+ " \"\"\"\n",
+ ")\n",
+ "display(result)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "rpsRbdhe6Oxb"
+ },
+ "outputs": [],
+ "source": [
+ "result = gds.run_cypher(\n",
+ " \"\"\"\n",
+ " LOAD CSV WITH HEADERS FROM \"https://neo4j-dataset.s3.amazonaws.com/form13/2021.csv\" AS row\n",
+ " MATCH (h:Holding {filingManager:row.filingManager, cusip:row.cusip, reportCalendarOrQuarter:row.reportCalendarOrQuarter})\n",
+ " MATCH (c:Company {cusip:row.cusip})\n",
+ " MERGE (h)-[r:PARTOF]->(c)\n",
+ " \"\"\"\n",
+ ")\n",
+ "display(result)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ZtJy4eO_zSqF"
+ },
+ "source": [
+ "## Graph Data Science\n",
+ "Now we're going to use Neo4j Graph Data Science to create an in-memory graph representation of the data. We'll enhance that representation with features we engineer using a graph embedding."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "x76ZEtR16Oxb"
+ },
+ "outputs": [],
+ "source": [
+ "result = gds.run_cypher(\n",
+ " \"\"\"\n",
+ " CALL gds.graph.project(\n",
+ " \"mygraph\",\n",
+ " [\"Company\", \"Manager\", \"Holding\"],\n",
+ " {\n",
+ " OWNS: {orientation: \"UNDIRECTED\"},\n",
+ " PARTOF: {orientation: \"UNDIRECTED\"}\n",
+ " }\n",
+ " )\n",
+ " YIELD\n",
+ " graphName AS graph,\n",
+ " relationshipProjection AS readProjection,\n",
+ " nodeCount AS nodes,\n",
+ " relationshipCount AS rels\n",
+ " \"\"\"\n",
+ ")\n",
+ "display(result)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "HiwL552u6Oxb"
+ },
+ "source": [
+ "If you get an error saying the graph already exists, that's probably because you ran this code before. You can destroy it using this command:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "EPZIIIJc6Oxb"
+ },
+ "outputs": [],
+ "source": [
+ "# result = gds.run_cypher(\n",
+ "# \"\"\"\n",
+ "# CALL gds.graph.drop(\"mygraph\")\n",
+ "# \"\"\"\n",
+ "# )\n",
+ "# display(result)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "zG1novOj6Oxb"
+ },
+ "source": [
+ "Now, let's list the details of the graph to make sure the projection was created as we want."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "yyaw5itE6Oxb"
+ },
+ "outputs": [],
+ "source": [
+ "result = gds.run_cypher(\n",
+ " \"\"\"\n",
+ " CALL gds.graph.list()\n",
+ " \"\"\"\n",
+ ")\n",
+ "display(result)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "XEQAChAa6Oxb"
+ },
+ "source": [
+ "Now we can generate an embedding from that graph. This is a new feature we can use in our predictions. We're using FastRP, which is a more full-featured and higher-performance alternative to Node2Vec. You can learn more about that at the\n",
+ "[Fast Random Projection](https://neo4j.com/docs/graph-data-science/current/algorithms/fastrp/) documentation page.\n",
+ "\n",
+ "There are a bunch of parameters we could adjust in this. One of the most obvious is the embeddingDimension. The documentation covers many more."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "qLFxuPb66Oxc"
+ },
+ "outputs": [],
+ "source": [
+ "result = gds.run_cypher(\n",
+ " \"\"\"\n",
+ " CALL gds.fastRP.mutate(\"mygraph\",{\n",
+ " embeddingDimension: 16,\n",
+ " randomSeed: 1,\n",
+ " mutateProperty:\"embedding\"\n",
+ " })\n",
+ " \"\"\"\n",
+ ")\n",
+ "display(result)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "iRpgM-NV6Oxc"
+ },
+ "source": [
+ "That creates an embedding for each node type. However, we only want the embedding on the nodes of type holding.\n",
+ "\n",
+ "We're going to take the embedding from our projection and write it to the holding nodes in the underlying database."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "3dBS16zD6Oxc"
+ },
+ "outputs": [],
+ "source": [
+ "result = gds.run_cypher(\n",
+ " \"\"\"\n",
+ " CALL gds.graph.writeNodeProperties(\"mygraph\", [\"embedding\"], [\"Holding\"])\n",
+ " YIELD writeMillis\n",
+ " \"\"\"\n",
+ ")\n",
+ "display(result)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "mK6LeBne6Oxc"
+ },
+ "outputs": [],
+ "source": [
+ "result = gds.run_cypher(\n",
+ " \"\"\"\n",
+ " MATCH (n:Holding) RETURN n\n",
+ " \"\"\"\n",
+ ")\n",
+ "display(result)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "1N_x38Ci6Oxc"
+ },
+ "source": [
+ "Note that the query above can take 2-3 minutes to run, as it's grabbing nearly half a million nodes along with all their properties and our new embedding."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "197ZaAH16Oxc"
+ },
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "df = pd.DataFrame([dict(record.items()) for record in result[\"n\"]])\n",
+ "df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "A3esUO8s6Oxc"
+ },
+ "source": [
+ "Note that each value in the embedding column is an array. To make this dataset more consumable, we should flatten that out into multiple individual features: embedding_0, embedding_1, ... embedding_n.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "-i0_txCB6Oxc"
+ },
+ "outputs": [],
+ "source": [
+ "embeddings = pd.DataFrame(df[\"embedding\"].values.tolist()).add_prefix(\"embedding_\")\n",
+ "merged = df.drop(columns=[\"embedding\"]).merge(embeddings, left_index=True, right_index=True)\n",
+ "merged"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "4Zb7lH366Oxc"
+ },
+ "source": [
+ "Now that we have the data formatted properly, let's split it into training, testing and validation sets. We'll write those to disk.\n",
+ "\n",
+ "Our data is, in some sense, a time series. We're going to window over three quarters. Q4 of 2021 is used to generate labels, so it's not present in the data set. That leaves Q3 as our validation data set. Q2 becomes test and Q1 is for training.\n",
+ "\n",
+ "We take this approach rather than generating random folds or similar to avoid time based leakage."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "uLg34zlu6Oxc"
+ },
+ "outputs": [],
+ "source": [
+ "df = merged\n",
+ "\n",
+ "train = df.loc[df[\"reportCalendarOrQuarter\"] == \"03-31-2021\"]\n",
+ "train.to_csv(\"train.csv\", index=False)\n",
+ "\n",
+ "test = df.loc[df[\"reportCalendarOrQuarter\"] == \"06-30-2021\"]\n",
+ "test = test.drop([\"target\"], axis=1)\n",
+ "test.to_csv(\"test.csv\", index=False)\n",
+ "\n",
+ "validate = df.loc[df[\"reportCalendarOrQuarter\"] == \"09-30-2021\"]\n",
+ "validate = validate.drop([\"target\"], axis=1)\n",
+ "validate.to_csv(\"validate.csv\", index=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## SageMaker Connection\n",
+ "Let's set up our SageMaker connection."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sagemaker\n",
+ "import boto3\n",
+ "\n",
+ "region = boto3.Session().region_name\n",
+ "\n",
+ "session = sagemaker.Session()\n",
+ "bucket = session.default_bucket()\n",
+ "prefix = \"sagemaker/form13\"\n",
+ "\n",
+ "role = sagemaker.get_execution_role()\n",
+ "\n",
+ "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Upload to Amazon S3\n",
+ "Now we're going to upload the training, testing and validation data to our default SageMaker bucket."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_data_s3_path = session.upload_data(path=\"train.csv\", key_prefix=prefix + \"/train\")\n",
+ "print(\"Training data uploaded to: \" + train_data_s3_path)\n",
+ "\n",
+ "test_data_s3_path = session.upload_data(path=\"test.csv\", key_prefix=prefix + \"/test\")\n",
+ "print(\"Testing data uploaded to: \" + test_data_s3_path)\n",
+ "\n",
+ "validation_data_s3_path = session.upload_data(path=\"validate.csv\", key_prefix=prefix + \"/validate\")\n",
+ "print(\"Validation data uploaded to: \" + validation_data_s3_path)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Setting up the SageMaker AutoPilot Job\n",
+ "After uploading the dataset to Amazon S3, you can invoke AutoPilot to find the best ML pipeline to train a model on this dataset."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "auto_ml_job_config = {\"CompletionCriteria\": {\"MaxCandidates\": 3}}\n",
+ "\n",
+ "input_data_config = [\n",
+ " {\n",
+ " \"DataSource\": {\n",
+ " \"S3DataSource\": {\n",
+ " \"S3DataType\": \"S3Prefix\",\n",
+ " \"S3Uri\": \"s3://{}/{}/train\".format(bucket, prefix),\n",
+ " }\n",
+ " },\n",
+ " \"TargetAttributeName\": \"target\",\n",
+ " }\n",
+ "]\n",
+ "\n",
+ "output_data_config = {\"S3OutputPath\": \"s3://{}/{}/output\".format(bucket, prefix)}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Launching the SageMaker AutoPilot Job\n",
+ "You can now launch the AutoPilot job by calling the `create_auto_ml_job` method."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from time import gmtime, strftime, sleep\n",
+ "\n",
+ "timestamp_suffix = strftime(\"%d-%H-%M-%S\", gmtime())\n",
+ "\n",
+ "auto_ml_job_name = \"automl-form13-\" + timestamp_suffix\n",
+ "print(\"AutoMLJobName: \" + auto_ml_job_name)\n",
+ "\n",
+ "sm.create_auto_ml_job(\n",
+ " AutoMLJobName=auto_ml_job_name,\n",
+ " InputDataConfig=input_data_config,\n",
+ " OutputDataConfig=output_data_config,\n",
+ " AutoMLJobConfig=auto_ml_job_config,\n",
+ " RoleArn=role,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Tracking SageMaker AutoPilot job progress\n",
+ "A SageMaker AutoPilot job consists of the following high-level steps:\n",
+ "\n",
+ "* Analyzing Data, where the dataset is analyzed and AutoPilot comes up with a list of ML pipelines that should be tried out on the dataset. The dataset is also split into train and validation sets. \n",
+ "* Feature Engineering, where AutoPilot performs feature transformation on individual features of the dataset as well as at an aggregate level. \n",
+ "* Model Tuning, where the top performing pipeline is selected along with the optimal hyperparameters for the training algorithm (the last stage of the pipeline).\n",
+ "\n",
+ "This job typically takes 20-80 minutes to run. That time presumably varies based on the underlying ML algorithm in AutoPilot as well as provisioning times for components of AutoPilot."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(\"JobStatus - Secondary Status\")\n",
+ "print(\"----------------------------\")\n",
+ "\n",
+ "describe_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n",
+ "print(describe_response[\"AutoMLJobStatus\"] + \" - \" + describe_response[\"AutoMLJobSecondaryStatus\"])\n",
+ "job_run_status = describe_response[\"AutoMLJobStatus\"]\n",
+ "\n",
+ "while job_run_status not in (\"Failed\", \"Completed\", \"Stopped\"):\n",
+ " describe_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)\n",
+ " job_run_status = describe_response[\"AutoMLJobStatus\"]\n",
+ "\n",
+ " print(\n",
+ " describe_response[\"AutoMLJobStatus\"] + \" - \" + describe_response[\"AutoMLJobSecondaryStatus\"]\n",
+ " )\n",
+ " sleep(30)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Results\n",
+ "Now use the describe_auto_ml_job API to look up the best candidate selected by the SageMaker AutoPilot job."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pprint\n",
+ "\n",
+ "best_candidate = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)[\"BestCandidate\"]\n",
+ "best_candidate_name = best_candidate[\"CandidateName\"]\n",
+ "\n",
+ "print(\"CandidateName: \" + best_candidate_name)\n",
+ "print(\n",
+ " \"FinalAutoMLJobObjectiveMetricName: \"\n",
+ " + best_candidate[\"FinalAutoMLJobObjectiveMetric\"][\"MetricName\"]\n",
+ ")\n",
+ "print(\n",
+ " \"FinalAutoMLJobObjectiveMetricValue: \"\n",
+ " + str(best_candidate[\"FinalAutoMLJobObjectiveMetric\"][\"Value\"])\n",
+ ")\n",
+ "print()\n",
+ "pprint.pprint(best_candidate)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Batch Inference\n",
+ "Now that we completed the SageMaker AutoPilot job on the dataset, let's create a model from the best candidate with Inference Pipelines."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model_name = \"automl-form13-model-\" + timestamp_suffix\n",
+ "model = sm.create_model(\n",
+ " Containers=best_candidate[\"InferenceContainers\"], ModelName=model_name, ExecutionRoleArn=role\n",
+ ")\n",
+ "print(\"Model ARN corresponding to the best candidate is: {}\".format(model[\"ModelArn\"]))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We can use batch inference through Amazon SageMaker batch transform. The same model can also be deployed to perform online inference using Amazon SageMaker hosting."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "transform_job_name = \"automl-form13-transform-\" + timestamp_suffix\n",
+ "\n",
+ "transform_input = {\n",
+ " \"DataSource\": {\"S3DataSource\": {\"S3DataType\": \"S3Prefix\", \"S3Uri\": test_data_s3_path}},\n",
+ " \"ContentType\": \"text/csv\",\n",
+ " \"CompressionType\": \"None\",\n",
+ " \"SplitType\": \"Line\",\n",
+ "}\n",
+ "\n",
+ "transform_output = {\n",
+ " \"S3OutputPath\": \"s3://{}/{}/inference-results\".format(bucket, prefix),\n",
+ "}\n",
+ "\n",
+ "transform_resources = {\"InstanceType\": \"ml.m5.4xlarge\", \"InstanceCount\": 1}\n",
+ "\n",
+ "sm.create_transform_job(\n",
+ " TransformJobName=transform_job_name,\n",
+ " ModelName=model_name,\n",
+ " TransformInput=transform_input,\n",
+ " TransformOutput=transform_output,\n",
+ " TransformResources=transform_resources,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now we can watch the transform job for completion. That takes approximately 20 minutes."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(\"JobStatus\")\n",
+ "print(\"---------\")\n",
+ "\n",
+ "describe_response = sm.describe_transform_job(TransformJobName=transform_job_name)\n",
+ "job_run_status = describe_response[\"TransformJobStatus\"]\n",
+ "print(job_run_status)\n",
+ "\n",
+ "while job_run_status not in (\"Failed\", \"Completed\", \"Stopped\"):\n",
+ " describe_response = sm.describe_transform_job(TransformJobName=transform_job_name)\n",
+ " job_run_status = describe_response[\"TransformJobStatus\"]\n",
+ " print(job_run_status)\n",
+ " sleep(30)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now let’s get the URL of the transform job results. You can open this in S3."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Location of the batch transform output for test_data.csv in the session's default bucket.\n",
+ "bucket = session.default_bucket()\n",
+ "key = \"{}/inference-results/test_data.csv.out\".format(prefix)\n",
+ "# Bucket and key must be separated by \"/\" to form a valid S3 URI.\n",
+ "url = \"s3://{}/{}\".format(bucket, key)\n",
+ "\n",
+ "print(url)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## View All Candidates\n",
+ "You can view all the candidates (pipeline evaluations with different hyperparameter combinations) that were explored by SageMaker AutoPilot and sort them by their final performance metric."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "candidates = sm.list_candidates_for_auto_ml_job(\n",
+ " AutoMLJobName=auto_ml_job_name, SortBy=\"FinalObjectiveMetricValue\"\n",
+ ")[\"Candidates\"]\n",
+ "index = 0\n",
+ "for candidate in candidates:\n",
+ " print(\n",
+ " str(index)\n",
+ " + \" \"\n",
+ " + candidate[\"CandidateName\"]\n",
+ " + \" \"\n",
+ " + str(candidate[\"FinalAutoMLJobObjectiveMetric\"][\"Value\"])\n",
+ " )\n",
+ " index += 1"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Candidate Generation Notebook\n",
+ "SageMaker AutoPilot also auto-generates a Candidate Definitions notebook. This notebook can be used to interactively step through the various steps taken by the SageMaker AutoPilot to arrive at the best candidate. This notebook can also be used to override various runtime parameters like parallelism, hardware used, algorithms explored, feature extraction scripts and more."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This code downloads a file from our SageMaker bucket using the SageMaker session."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def downloadNotebook(s3_path):\n",
+ " session = sagemaker.Session()\n",
+ " role = sagemaker.get_execution_role()\n",
+ "\n",
+ " # reformat the s3 URL into something boto3 can handle\n",
+ " s3_path_parts = s3_path.replace(\"s3://\", \"\").split(\"/\")\n",
+ " bucket, key, file = s3_path_parts[0], \"/\".join(s3_path_parts[1:]), s3_path_parts[-1]\n",
+ "\n",
+ " print(bucket)\n",
+ " print(key)\n",
+ " print(file)\n",
+ "\n",
+ " print(\"file\" + file)\n",
+ " notebook = session.read_s3_file(bucket, key)\n",
+ " with open(file, \"w\") as text_file:\n",
+ " text_file.write(notebook)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We can download the notebook with the command:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "notebook_s3_path = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)[\"AutoMLJobArtifacts\"][\n",
+ " \"CandidateDefinitionNotebookLocation\"\n",
+ "]\n",
+ "downloadNotebook(notebook_s3_path)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Data Exploration Notebook\n",
+ "SageMaker Autopilot also auto-generates a Data Exploration notebook. This code will download that notebook:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "notebook_s3_path = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)[\"AutoMLJobArtifacts\"][\n",
+ " \"DataExplorationNotebookLocation\"\n",
+ "]\n",
+ "downloadNotebook(notebook_s3_path)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Cleanup\n",
+ "SageMaker stores its data in an Amazon S3 bucket. You may want to delete the results of our job from that bucket once you're done working with it.\n",
+ "\n",
+ "The AWS Marketplace listing we deployed Neo4j Enterprise Edition with created a stack. To delete the deployment, you would navigate to Amazon [CloudFormation](https://console.aws.amazon.com/cloudformation) in the console and delete the stack there. Be sure to delete the entire stack as that will delete all the subcomponents of the stack."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Conclusion\n",
+ "In this notebook, you deployed Neo4j Enterprise Edition. Within SageMaker Studio, you then loaded a data set in Neo4j Graph Database. You used Neo4j Graph Data Science to compute a graph embedding on that dataset. Using that embedding, you ran a SageMaker AutoPilot job and inspected the output.\n",
+ "\n",
+ "This same flow can be repurposed to add graph embeddings to your own machine learning jobs. Graph embeddings are just one sort of graph feature that can be used in machine learning. The approach we used here would apply to incorporating other features like betweenness or neighborhood as well."
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "name": "embedding.ipynb",
+ "provenance": [],
+ "toc_visible": true
+ },
+ "kernelspec": {
+ "display_name": "Python 3.9.5 64-bit",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.5"
+ },
+ "vscode": {
+ "interpreter": {
+ "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/conf.py b/conf.py
index bdef605979..05259a9c13 100644
--- a/conf.py
+++ b/conf.py
@@ -63,3 +63,8 @@
html_js_files = [
"https://a0.awsstatic.com/s_code/js/3.0/awshome_s_code.js",
]
+
+html_css_files = [
+ 'pagination.css',
+ 'search_accessories.css',
+]
diff --git a/hyperparameter_tuning/r_bring_your_own/Dockerfile b/hyperparameter_tuning/r_bring_your_own/Dockerfile
index 8f441f5113..5ca1d8798a 100644
--- a/hyperparameter_tuning/r_bring_your_own/Dockerfile
+++ b/hyperparameter_tuning/r_bring_your_own/Dockerfile
@@ -8,7 +8,31 @@ RUN apt-get -y update && apt-get install -y --no-install-recommends \
r-base-dev \
ca-certificates
-RUN R -e "install.packages(c('mda', 'plumber'), repos='https://cloud.r-project.org')"
+RUN R -e "install.packages(c('Rcpp', 'BH', 'R6', 'jsonlite', 'crayon'), repos='https://cloud.r-project.org')"
+
+RUN wget http://cran.r-project.org/src/contrib/Archive/stringi/stringi_1.2.4.tar.gz
+RUN R CMD INSTALL stringi_1.2.4.tar.gz
+
+RUN wget http://cran.r-project.org/src/contrib/Archive/rlang/rlang_0.2.2.tar.gz
+RUN R CMD INSTALL rlang_0.2.2.tar.gz
+
+RUN wget http://cran.r-project.org/src/contrib/Archive/magrittr/magrittr_1.5.tar.gz
+RUN R CMD INSTALL magrittr_1.5.tar.gz
+
+RUN wget http://cran.r-project.org/src/contrib/Archive/later/later_0.7.5.tar.gz
+RUN R CMD INSTALL later_0.7.5.tar.gz
+
+RUN wget http://cran.r-project.org/src/contrib/Archive/promises/promises_1.0.1.tar.gz
+RUN R CMD INSTALL promises_1.0.1.tar.gz
+
+RUN wget http://cran.r-project.org/src/contrib/Archive/httpuv/httpuv_1.4.4.2.tar.gz
+RUN R CMD INSTALL httpuv_1.4.4.2.tar.gz
+
+RUN wget http://cran.r-project.org/src/contrib/Archive/mda/mda_0.4-10.tar.gz
+RUN R CMD INSTALL mda_0.4-10.tar.gz
+
+RUN wget http://cran.r-project.org/src/contrib/Archive/plumber/plumber_0.4.6.tar.gz
+RUN R CMD INSTALL plumber_0.4.6.tar.gz
COPY mars.R /opt/ml/mars.R
COPY plumber.R /opt/ml/plumber.R
diff --git a/hyperparameter_tuning/r_bring_your_own/tune_r_bring_your_own.ipynb b/hyperparameter_tuning/r_bring_your_own/tune_r_bring_your_own.ipynb
index 7a8c873773..072a57b9a7 100644
--- a/hyperparameter_tuning/r_bring_your_own/tune_r_bring_your_own.ipynb
+++ b/hyperparameter_tuning/r_bring_your_own/tune_r_bring_your_own.ipynb
@@ -225,7 +225,7 @@
")\n",
"\n",
"estimator = sagemaker.estimator.Estimator(\n",
- " image_name=\"{}.dkr.ecr.{}.{}/rmars:latest\".format(account, region, domain),\n",
+ " image_uri=\"{}.dkr.ecr.{}.{}/rmars:latest\".format(account, region, domain),\n",
" role=role,\n",
" train_instance_count=1,\n",
" train_instance_type=\"ml.m4.xlarge\",\n",
diff --git a/hyperparameter_tuning/rapids_bring_your_own/code/Dockerfile b/hyperparameter_tuning/rapids_bring_your_own/code/Dockerfile
index 212059a288..382f74e465 100644
--- a/hyperparameter_tuning/rapids_bring_your_own/code/Dockerfile
+++ b/hyperparameter_tuning/rapids_bring_your_own/code/Dockerfile
@@ -8,14 +8,18 @@ ENV CV_FOLDS="3"
# ensure printed output/log-messages retain correct order
ENV PYTHONUNBUFFERED=True
+# delete expired nvidia keys and fetch new ones
+RUN apt-key del 7fa2af80
+RUN rm /etc/apt/sources.list.d/cuda.list
+RUN rm /etc/apt/sources.list.d/nvidia-ml.list
+RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb && dpkg -i cuda-keyring_1.0-1_all.deb
+
# add sagemaker-training-toolkit [ requires build tools ], flask [ serving ], and dask-ml
RUN apt-get update && apt-get install -y --no-install-recommends build-essential \
- && source activate rapids && pip3 install sagemaker-training \
- && conda install -c anaconda flask \
- && conda install -c conda-forge dask-ml
+ && source activate rapids && pip3 install sagemaker-training dask-ml flask
# path where SageMaker looks for code when container runs in the cloud
-ENV CLOUD_PATH="/opt/ml/code"
+ENV CLOUD_PATH "/opt/ml/code"
# copy our latest [local] code into the container
COPY . $CLOUD_PATH
diff --git a/hyperparameter_tuning/rapids_bring_your_own/rapids_sagemaker_hpo.ipynb b/hyperparameter_tuning/rapids_bring_your_own/rapids_sagemaker_hpo.ipynb
index fb466c70de..12f4137ffc 100644
--- a/hyperparameter_tuning/rapids_bring_your_own/rapids_sagemaker_hpo.ipynb
+++ b/hyperparameter_tuning/rapids_bring_your_own/rapids_sagemaker_hpo.ipynb
@@ -704,14 +704,18 @@
"# ensure printed output/log-messages retain correct order\n",
"ENV PYTHONUNBUFFERED=True\n",
"\n",
+ "# delete expired nvidia keys and fetch new ones\n",
+ "RUN apt-key del 7fa2af80\n",
+ "RUN rm /etc/apt/sources.list.d/cuda.list\n",
+ "RUN rm /etc/apt/sources.list.d/nvidia-ml.list\n",
+ "RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb && dpkg -i cuda-keyring_1.0-1_all.deb \n",
+ "\n",
"# add sagemaker-training-toolkit [ requires build tools ], flask [ serving ], and dask-ml\n",
"RUN apt-get update && apt-get install -y --no-install-recommends build-essential \\ \n",
- " && source activate rapids && pip3 install sagemaker-training \\\n",
- " && conda install -c anaconda flask \\\n",
- " && conda install -c conda-forge dask-ml\n",
+ " && source activate rapids && pip3 install sagemaker-training dask-ml flask\n",
"\n",
"# path where SageMaker looks for code when container runs in the cloud\n",
- "ENV CLOUD_PATH=\"/opt/ml/code\"\n",
+ "ENV CLOUD_PATH \"/opt/ml/code\"\n",
"\n",
"# copy our latest [local] code into the container \n",
"COPY . $CLOUD_PATH\n",
diff --git a/introduction_to_amazon_algorithms/image_classification_tensorflow/Amazon_TensorFlow_Image_Classification.ipynb b/introduction_to_amazon_algorithms/image_classification_tensorflow/Amazon_TensorFlow_Image_Classification.ipynb
new file mode 100644
index 0000000000..acb43d9117
--- /dev/null
+++ b/introduction_to_amazon_algorithms/image_classification_tensorflow/Amazon_TensorFlow_Image_Classification.ipynb
@@ -0,0 +1,914 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "491b904e",
+ "metadata": {},
+ "source": [
+ "# Introduction to SageMaker TensorFlow - Image Classification"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e8df953a",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "Welcome to [Amazon SageMaker Built-in Algorithms](https://sagemaker.readthedocs.io/en/stable/algorithms/index.html)! You can use SageMaker Built-in algorithms to solve many Machine Learning tasks through [SageMaker Python SDK](https://sagemaker.readthedocs.io/en/stable/overview.html). You can also use these algorithms through one-click in SageMaker Studio via [JumpStart](https://docs.aws.amazon.com/sagemaker/latest/dg/studio-jumpstart.html).\n",
+ "\n",
+ "In this demo notebook, we demonstrate how to use the TensorFlow Image Classification algorithm. Image Classification refers to classifying an image to one of the class labels of the training dataset. We demonstrate two use cases of TensorFlow Image Classification models:\n",
+ "\n",
+ "* How to use a model pre-trained on ImageNet dataset to classify an image. [ImageNetLabels](https://storage.googleapis.com/download.tensorflow.org/data/ImageNetLabels.txt).\n",
+ "* How to fine-tune a pre-trained model to a custom dataset, and then run inference on the fine-tuned model.\n",
+ "\n",
+ "Note: This notebook was tested on ml.t3.medium instance in Amazon SageMaker Studio with Python 3 (Data Science) kernel and in Amazon SageMaker Notebook instance with conda_python3 kernel.\n",
+ "\n",
+ "---"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3acf3836",
+ "metadata": {},
+ "source": [
+ "1. [Set Up](#1.-Set-Up)\n",
+ "2. [Select a pre-trained model](#2.-Select-a-pre-trained-model)\n",
+ "3. [Run inference on the pre-trained model](#3.-Run-inference-on-the-pre-trained-model)\n",
+ " * [Retrieve Artifacts & Deploy an Endpoint](#3.1.-Retrieve-Artifacts-&-Deploy-an-Endpoint)\n",
+ " * [Download example images for inference](#3.2.-Download-example-images-for-inference)\n",
+ " * [Query endpoint and parse response](#3.3.-Query-endpoint-and-parse-response)\n",
+ " * [Clean up the endpoint](#3.4.-Clean-up-the-endpoint)\n",
+ "4. [Fine-tune the pre-trained model on a custom dataset](#4.-Fine-tune-the-pre-trained-model-on-a-custom-dataset)\n",
+ " * [Retrieve Training artifacts](#4.1.-Retrieve-Training-artifacts)\n",
+ " * [Set Training parameters](#4.2.-Set-Training-parameters)\n",
+ " * [Train with Automatic Model Tuning (HPO)](#AMT)\n",
+ " * [Start Training](#4.4.-Start-Training)\n",
+ " * [Extract Training performance metrics](#4.5.-Extract-Training-performance-metrics)\n",
+ " * [Deploy & run Inference on the fine-tuned model](#4.6.-Deploy-&-run-Inference-on-the-fine-tuned-model)\n",
+ " * [Incrementally train the fine-tuned model](#4.7.-Incrementally-train-the-fine-tuned-model)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "99e04731",
+ "metadata": {},
+ "source": [
+ "## 1. Set Up\n",
+ "***\n",
+ "Before executing the notebook, there are some initial steps required for setup. This notebook requires the latest version of sagemaker and ipywidgets.\n",
+ "***"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a536a5dd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install sagemaker ipywidgets --upgrade --quiet"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "951e8b8a",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "\n",
+ "To train and host on Amazon SageMaker, we need to set up and authenticate the use of AWS services. Here, we use the execution role associated with the current notebook instance as the AWS account role with SageMaker access. It has necessary permissions, including access to your data in S3. \n",
+ "\n",
+ "---"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0ab99140",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sagemaker, boto3, json\n",
+ "from sagemaker.session import Session\n",
+ "\n",
+ "sagemaker_session = Session()\n",
+ "aws_role = sagemaker_session.get_caller_identity_arn()\n",
+ "aws_region = boto3.Session().region_name\n",
+ "sess = sagemaker.Session()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "634dd01d",
+ "metadata": {},
+ "source": [
+ "## 2. Select a pre-trained model\n",
+ "***\n",
+ "You can continue with the default model, or can choose a different model from the dropdown generated upon running the next cell. A complete list of SageMaker pre-trained models can also be accessed at [Sagemaker pre-trained Models](https://sagemaker.readthedocs.io/en/stable/doc_utils/pretrainedmodels.html#).\n",
+ "***"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e3f1d777",
+ "metadata": {
+ "jumpStartAlterations": [
+ "modelIdVersion"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "model_id, model_version = \"tensorflow-ic-imagenet-mobilenet-v2-100-224-classification-4\", \"*\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "772154b7",
+ "metadata": {},
+ "source": [
+ "***\n",
+ "[Optional] Select a different Sagemaker pre-trained model. Here, we download the model_manifest file from the Built-In Algorithms s3 bucket, filter-out all the Image Classification models and select a model for inference.\n",
+ "***"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d7cb33f6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import IPython\n",
+ "from ipywidgets import Dropdown\n",
+ "from sagemaker.jumpstart.notebook_utils import list_jumpstart_models\n",
+ "from sagemaker.jumpstart.filters import And\n",
+ "\n",
+ "# Retrieves all TensorFlow Image Classification models made available by SageMaker Built-In Algorithms.\n",
+ "filter_value = And(\"task == ic\", \"framework == tensorflow\")\n",
+ "ic_models = list_jumpstart_models(filter=filter_value)\n",
+ "\n",
+ "# display the model-ids in a dropdown, for user to select a model.\n",
+ "dropdown = Dropdown(\n",
+ " options=ic_models,\n",
+ " value=model_id,\n",
+ " description=\"SageMaker Built-In TensorFlow Image Classification Models:\",\n",
+ " style={\"description_width\": \"initial\"},\n",
+ " layout={\"width\": \"max-content\"},\n",
+ ")\n",
+ "display(IPython.display.Markdown(\"## Select a SageMaker pre-trained model from the dropdown below\"))\n",
+ "display(dropdown)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c50ca21f",
+ "metadata": {},
+ "source": [
+ "## 3. Run inference on the pre-trained model\n",
+ "***\n",
+ "Using SageMaker, we can perform inference on the pre-trained model, even without fine-tuning it first on a custom dataset. For this example, that means on an input image, predicting the [class label from one of the 1000 classes of the ImageNet dataset](https://storage.googleapis.com/download.tensorflow.org/data/ImageNetLabels.txt).\n",
+ "***"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "25d49542",
+ "metadata": {},
+ "source": [
+ "### 3.1. Retrieve Artifacts & Deploy an Endpoint\n",
+ "***\n",
+ "We retrieve the deploy_image_uri, deploy_source_uri, and base_model_uri for the pre-trained model. To host the pre-trained base-model, we create an instance of [`sagemaker.model.Model`](https://sagemaker.readthedocs.io/en/stable/api/inference/model.html) and deploy it.\n",
+ "***"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6e0b50d5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker import image_uris, model_uris, script_uris\n",
+ "from sagemaker.model import Model\n",
+ "from sagemaker.predictor import Predictor\n",
+ "from sagemaker.utils import name_from_base\n",
+ "\n",
+ "# model_version=\"*\" fetches the latest version of the model.\n",
+ "infer_model_id, infer_model_version = dropdown.value, \"*\"\n",
+ "\n",
+ "endpoint_name = name_from_base(f\"jumpstart-example-{infer_model_id}\")\n",
+ "\n",
+ "inference_instance_type = \"ml.p2.xlarge\"\n",
+ "\n",
+ "# Retrieve the inference docker container uri.\n",
+ "deploy_image_uri = image_uris.retrieve(\n",
+ " region=None,\n",
+ " framework=None,\n",
+ " image_scope=\"inference\",\n",
+ " model_id=infer_model_id,\n",
+ " model_version=infer_model_version,\n",
+ " instance_type=inference_instance_type,\n",
+ ")\n",
+ "# Retrieve the inference script uri.\n",
+ "deploy_source_uri = script_uris.retrieve(\n",
+ " model_id=infer_model_id, model_version=infer_model_version, script_scope=\"inference\"\n",
+ ")\n",
+ "# Retrieve the base model uri.\n",
+ "base_model_uri = model_uris.retrieve(\n",
+ " model_id=infer_model_id, model_version=infer_model_version, model_scope=\"inference\"\n",
+ ")\n",
+ "# Create the SageMaker model instance. Note that we need to pass Predictor class when we deploy model through Model class,\n",
+ "# for being able to run inference through the sagemaker API.\n",
+ "model = Model(\n",
+ " image_uri=deploy_image_uri,\n",
+ " source_dir=deploy_source_uri,\n",
+ " model_data=base_model_uri,\n",
+ " entry_point=\"inference.py\",\n",
+ " role=aws_role,\n",
+ " predictor_cls=Predictor,\n",
+ " name=endpoint_name,\n",
+ ")\n",
+ "# deploy the Model.\n",
+ "base_model_predictor = model.deploy(\n",
+ " initial_instance_count=1,\n",
+ " instance_type=inference_instance_type,\n",
+ " endpoint_name=endpoint_name,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2ea5496e",
+ "metadata": {},
+ "source": [
+ "### 3.2. Download example images for inference\n",
+ "***\n",
+ "We download example images from a public S3 bucket.\n",
+ "***"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "60b98034",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "s3_bucket = f\"jumpstart-cache-prod-{aws_region}\"\n",
+ "key_prefix = \"inference-notebook-assets\"\n",
+ "\n",
+ "\n",
+ "def download_from_s3(images):\n",
+ " for filename, image_key in images.items():\n",
+ " boto3.client(\"s3\").download_file(s3_bucket, f\"{key_prefix}/{image_key}\", filename)\n",
+ "\n",
+ "\n",
+ "images = {\"img1.jpg\": \"cat.jpg\", \"img2.jpg\": \"dog.jpg\"}\n",
+ "download_from_s3(images)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a435058a",
+ "metadata": {},
+ "source": [
+ "### 3.3. Query endpoint and parse response\n",
+ "***\n",
+ "Input to the endpoint is a single image in binary format. Response from the endpoint is a dictionary containing the top-1 predicted class label, and a list of class probabilities.\n",
+ "***"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2306b489",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from IPython.core.display import HTML\n",
+ "\n",
+ "\n",
+ "def predict_top_k_labels(probabilities, labels, k):\n",
+ " topk_prediction_ids = sorted(\n",
+ " range(len(probabilities)), key=lambda index: probabilities[index], reverse=True\n",
+ " )[:k]\n",
+ " topk_class_labels = \", \".join([labels[id] for id in topk_prediction_ids])\n",
+ " return topk_class_labels\n",
+ "\n",
+ "\n",
+ "for image_filename in images.keys():\n",
+ " with open(image_filename, \"rb\") as file:\n",
+ " img = file.read()\n",
+ " query_response = base_model_predictor.predict(\n",
+ " img, {\"ContentType\": \"application/x-image\", \"Accept\": \"application/json;verbose\"}\n",
+ " )\n",
+ " model_predictions = json.loads(query_response)\n",
+ " labels, probabilities = model_predictions[\"labels\"], model_predictions[\"probabilities\"]\n",
+ " top5_class_labels = predict_top_k_labels(probabilities, labels, 5)\n",
+ " display(\n",
+ " HTML(\n",
+ " f''\n",
+ " f\"Top-5 predictions: {top5_class_labels} \"\n",
+ " )\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "797169e7",
+ "metadata": {},
+ "source": [
+ "### 3.4. Clean up the endpoint"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "835e888b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Delete the SageMaker endpoint and the attached resources\n",
+ "base_model_predictor.delete_model()\n",
+ "base_model_predictor.delete_endpoint()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "504466ea",
+ "metadata": {},
+ "source": [
+ "## 4. Fine-tune the pre-trained model on a custom dataset\n",
+ "***\n",
+ "Previously, we saw how to run inference on a pre-trained model. Next, we discuss how a model can be fine-tuned to a custom dataset with any number of classes. \n",
+ "\n",
+ "The model available for fine-tuning attaches a classification layer to the corresponding feature extractor model available on TensorFlow/PyTorch hub, and initializes the layer parameters to random values. The output dimension of the classification layer is determined based on the number of classes in the input data. The fine-tuning step fine-tunes the model parameters. The objective is to minimize classification error on the input data. The model returned by fine-tuning can be further deployed for inference. Below are the instructions for how the training data should be formatted for input to the model.\n",
+ "\n",
+ "- **Input:** A directory with as many sub-directories as the number of classes. \n",
+ " - Each sub-directory should have images belonging to that class in .jpg format. \n",
+ "- **Output:** A trained model that can be deployed for inference. \n",
+ " - A label mapping file is saved along with the trained model file on the s3 bucket. \n",
+ " \n",
+ "The input directory should look like below if the training data contains images from two classes: roses and dandelion. The s3 path should look like `s3://bucket_name/input_directory/`. Note the trailing `/` is required. The folder names ('roses', 'dandelion') and the .jpg filenames can be anything. The label mapping file that is saved along with the trained model on the s3 bucket maps the folder names 'roses' and 'dandelion' to the indices in the list of class probabilities the model outputs. The mapping follows alphabetical ordering of the folder names. In the example below, index 0 in the model output list would correspond to 'dandelion' and index 1 would correspond to 'roses'.\n",
+ "\n",
+ " input_directory\n",
+ " |--roses\n",
+ " |--abc.jpg\n",
+ " |--def.jpg\n",
+ " |--dandelion\n",
+ " |--ghi.jpg\n",
+ " |--jkl.jpg\n",
+ "\n",
+ "We provide the tf_flowers dataset as a default dataset for fine-tuning the model. tf_flowers comprises images of five types of flowers. The dataset has been downloaded from [TensorFlow](https://www.tensorflow.org/datasets/catalog/tf_flowers) under [Apache 2.0 License](https://jumpstart-cache-prod-us-west-2.s3-us-west-2.amazonaws.com/licenses/Apache-License/LICENSE-2.0.txt).\n",
+ "***"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bbe2c89a",
+ "metadata": {},
+ "source": [
+ "### 4.1. Retrieve Training artifacts\n",
+ "***\n",
+ "Here, for the selected model, we retrieve the training docker container, the training algorithm source, the pre-trained base model, and a python dictionary of the training hyper-parameters that the algorithm accepts with their default values. Note that the model_version=\"*\" fetches the latest model. Also, we do need to specify the training_instance_type to fetch train_image_uri.\n",
+ "***"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d47fb1e8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker import image_uris, model_uris, script_uris, hyperparameters\n",
+ "\n",
+ "model_id, model_version = dropdown.value, \"*\"\n",
+ "training_instance_type = \"ml.p3.2xlarge\"\n",
+ "\n",
+ "# Retrieve the docker image\n",
+ "train_image_uri = image_uris.retrieve(\n",
+ " region=None,\n",
+ " framework=None,\n",
+ " model_id=model_id,\n",
+ " model_version=model_version,\n",
+ " image_scope=\"training\",\n",
+ " instance_type=training_instance_type,\n",
+ ")\n",
+ "# Retrieve the training script\n",
+ "train_source_uri = script_uris.retrieve(\n",
+ " model_id=model_id, model_version=model_version, script_scope=\"training\"\n",
+ ")\n",
+ "# Retrieve the pre-trained model tarball to further fine-tune\n",
+ "train_model_uri = model_uris.retrieve(\n",
+ " model_id=model_id, model_version=model_version, model_scope=\"training\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "483cbb5b",
+ "metadata": {},
+ "source": [
+ "### 4.2. Set Training parameters\n",
+ "***\n",
+ "Now that we are done with all the setup that is needed, we are ready to fine-tune our Image Classification model. To begin, let us create a [``sagemaker.estimator.Estimator``](https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html) object. This estimator will launch the training job. \n",
+ "\n",
+ "There are two kinds of parameters that need to be set for training. \n",
+ "\n",
+ "The first set contains the parameters for the training job. These include: (i) Training data path: this is the S3 folder in which the input data is stored. (ii) Output path: this is the S3 folder in which the training output is stored. (iii) Training instance type: this indicates the type of machine on which to run the training. Typically, we use GPU instances for training. We defined the training instance type above to fetch the correct train_image_uri. \n",
+ "\n",
+ "The second set of parameters are algorithm specific training hyper-parameters.\n",
+ "***"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4a6897f2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sample training data is available in this bucket\n",
+ "training_data_bucket = f\"jumpstart-cache-prod-{aws_region}\"\n",
+ "training_data_prefix = \"training-datasets/tf_flowers/\"\n",
+ "\n",
+ "training_dataset_s3_path = f\"s3://{training_data_bucket}/{training_data_prefix}\"\n",
+ "\n",
+ "output_bucket = sess.default_bucket()\n",
+ "output_prefix = \"jumpstart-example-ic-training\"\n",
+ "\n",
+ "s3_output_location = f\"s3://{output_bucket}/{output_prefix}/output\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "410123e7",
+ "metadata": {},
+ "source": [
+ "***\n",
+ "For algorithm-specific hyper-parameters, we start by fetching a Python dictionary of the training hyper-parameters that the algorithm accepts with their default values. These can then be overridden with custom values.\n",
+ "***"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d4265a8f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker import hyperparameters\n",
+ "\n",
+ "# Retrieve the default hyper-parameters for fine-tuning the model\n",
+ "hyperparameters = hyperparameters.retrieve_default(model_id=model_id, model_version=model_version)\n",
+ "\n",
+ "# [Optional] Override default hyperparameters with custom values\n",
+ "hyperparameters[\"epochs\"] = \"5\"\n",
+ "print(hyperparameters)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0095df25",
+ "metadata": {},
+ "source": [
+ "### 4.3. Train with Automatic Model Tuning ([HPO](https://docs.aws.amazon.com/sagemaker/latest/dg/automatic-model-tuning.html)) \n",
+ "***\n",
+ "Amazon SageMaker automatic model tuning, also known as hyperparameter tuning, finds the best version of a model by running many training jobs on your dataset using the algorithm and ranges of hyperparameters that you specify. It then chooses the hyperparameter values that result in a model that performs the best, as measured by a metric that you choose. We will use a [HyperparameterTuner](https://sagemaker.readthedocs.io/en/stable/api/training/tuner.html) object to interact with Amazon SageMaker hyperparameter tuning APIs.\n",
+ "***"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "147d2dc8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker.tuner import ContinuousParameter\n",
+ "\n",
+ "# Use AMT for tuning and selecting the best model\n",
+ "use_amt = False\n",
+ "\n",
+ "# Define objective metric per framework, based on which the best model will be selected.\n",
+ "amt_metric_definitions = {\n",
+ " \"metrics\": [{\"Name\": \"val_accuracy\", \"Regex\": \"val_accuracy: ([0-9\\\\.]+)\"}],\n",
+ " \"type\": \"Maximize\",\n",
+ "}\n",
+ "\n",
+ "# You can select from the hyperparameters supported by the model, and configure ranges of values to be searched for training the optimal model.(https://docs.aws.amazon.com/sagemaker/latest/dg/automatic-model-tuning-define-ranges.html)\n",
+ "hyperparameter_ranges = {\n",
+ " \"adam-learning-rate\": ContinuousParameter(0.0001, 0.1, scaling_type=\"Logarithmic\")\n",
+ "}\n",
+ "\n",
+ "# Increase the total number of training jobs run by AMT, for increased accuracy (and training time).\n",
+ "max_jobs = 6\n",
+ "# Change parallel training jobs run by AMT to reduce total training time, constrained by your account limits.\n",
+ "# if max_jobs=max_parallel_jobs then Bayesian search turns to Random.\n",
+ "max_parallel_jobs = 2"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c3011dd2",
+ "metadata": {},
+ "source": [
+ "### 4.4. Start Training\n",
+ "***\n",
+ "We start by creating the estimator object with all the required assets and then launch the training job.\n",
+ "***"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5068463b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker.estimator import Estimator\n",
+ "from sagemaker.utils import name_from_base\n",
+ "from sagemaker.tuner import HyperparameterTuner\n",
+ "\n",
+ "training_job_name = name_from_base(f\"jumpstart-example-{model_id}-transfer-learning\")\n",
+ "\n",
+ "training_metric_definitions = [\n",
+ " {\"Name\": \"val_accuracy\", \"Regex\": \"val_accuracy: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"val_loss\", \"Regex\": \"val_loss: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"train_accuracy\", \"Regex\": \"- accuracy: ([0-9\\\\.]+)\"},\n",
+ " {\"Name\": \"train_loss\", \"Regex\": \"- loss: ([0-9\\\\.]+)\"},\n",
+ "]\n",
+ "\n",
+ "# Create SageMaker Estimator instance\n",
+ "ic_estimator = Estimator(\n",
+ " role=aws_role,\n",
+ " image_uri=train_image_uri,\n",
+ " source_dir=train_source_uri,\n",
+ " model_uri=train_model_uri,\n",
+ " entry_point=\"transfer_learning.py\",\n",
+ " instance_count=1,\n",
+ " instance_type=training_instance_type,\n",
+ " max_run=360000,\n",
+ " hyperparameters=hyperparameters,\n",
+ " output_path=s3_output_location,\n",
+ " base_job_name=training_job_name,\n",
+ " metric_definitions=training_metric_definitions,\n",
+ ")\n",
+ "\n",
+ "if use_amt:\n",
+ "\n",
+ " hp_tuner = HyperparameterTuner(\n",
+ " ic_estimator,\n",
+ " amt_metric_definitions[\"metrics\"][0][\"Name\"],\n",
+ " hyperparameter_ranges,\n",
+ " amt_metric_definitions[\"metrics\"],\n",
+ " max_jobs=max_jobs,\n",
+ " max_parallel_jobs=max_parallel_jobs,\n",
+ " objective_type=amt_metric_definitions[\"type\"],\n",
+ " base_tuning_job_name=training_job_name,\n",
+ " )\n",
+ "\n",
+ " # Launch a SageMaker Tuning job to search for the best hyperparameters\n",
+ " hp_tuner.fit({\"training\": training_dataset_s3_path})\n",
+ "else:\n",
+ " # Launch a SageMaker Training job by passing s3 path of the training data\n",
+ " ic_estimator.fit({\"training\": training_dataset_s3_path}, logs=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6e75e44c",
+ "metadata": {},
+ "source": [
+ "### 4.5. Extract Training performance metrics\n",
+ "***\n",
+ "Performance metrics such as training accuracy/loss and validation accuracy/loss can be accessed through CloudWatch while the training job is running. The code below provides the CloudWatch link where these metrics can be found. \n",
+ "\n",
+ "Note that the default resolution in Amazon CloudWatch is one minute, i.e., it averages the metrics logged within a single minute interval. Amazon CloudWatch also supports [high-resolution custom metrics](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/publishingMetrics.html), and its finest resolution is 1 second. However, the finer the resolution, the shorter the lifespan of the CloudWatch metrics. For the 1-second frequency resolution, the CloudWatch metrics are available for 3 hours. For more information about the resolution and the lifespan of the CloudWatch metrics, see [GetMetricStatistics](https://docs.aws.amazon.com/AmazonCloudWatch/latest/APIReference/API_GetMetricStatistics.html) in the Amazon CloudWatch API Reference.\n",
+ "***"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6120c260",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "if use_amt:\n",
+ " training_job_name = hp_tuner.best_training_job()\n",
+ "else:\n",
+ " training_job_name = ic_estimator.latest_training_job.job_name"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "422ac8fc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sagemaker\n",
+ "from IPython.core.display import Markdown\n",
+ "\n",
+ "sagemaker_session = sagemaker.Session()\n",
+ "\n",
+ "link = (\n",
+ " \"https://console.aws.amazon.com/cloudwatch/home?region=\"\n",
+ " + sagemaker_session.boto_region_name\n",
+ " + \"#metricsV2:query=%7B/aws/sagemaker/TrainingJobs,TrainingJobName%7D%20\"\n",
+ " + training_job_name\n",
+ ")\n",
+ "display(Markdown(\"CloudWatch metrics: [link](\" + link + \")\"))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cd15c4bb",
+ "metadata": {},
+ "source": [
+ "***\n",
+ "Alternatively, we can also fetch these metrics and analyze them within the notebook.\n",
+ "***"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d915b42b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker import TrainingJobAnalytics\n",
+ "\n",
+ "df = TrainingJobAnalytics(training_job_name=training_job_name).dataframe()\n",
+ "\n",
+ "df.head(10)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dd0ab950",
+ "metadata": {},
+ "source": [
+ "***\n",
+ "We can filter out different metrics by names as well.\n",
+ "***"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "df44f7f0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "metric_names = [metric[\"Name\"] for metric in training_metric_definitions]\n",
+ "\n",
+ "metrics_df = {\n",
+ " metric_name: df.query(f\"metric_name == '{metric_name}'\") for metric_name in metric_names\n",
+ "}\n",
+ "\n",
+ "metrics_df[\"val_loss\"]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "08072894",
+ "metadata": {},
+ "source": [
+ "### 4.6. Deploy & run Inference on the fine-tuned model\n",
+ "***\n",
+ "A trained model does nothing on its own. We now want to use the model to perform inference. For this example, that means predicting the class label of an image. We follow the same steps as in the [Section 3 - Run inference on the pre-trained model](#3.-Run-inference-on-the-pre-trained-model). We start by retrieving the artifacts for deploying an endpoint. However, instead of base_predictor, we deploy the `ic_estimator` that we fine-tuned.\n",
+ "***"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7915265a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "inference_instance_type = \"ml.p2.xlarge\"\n",
+ "\n",
+ "# Retrieve the inference docker container uri\n",
+ "deploy_image_uri = image_uris.retrieve(\n",
+ " region=None,\n",
+ " framework=None,\n",
+ " image_scope=\"inference\",\n",
+ " model_id=model_id,\n",
+ " model_version=model_version,\n",
+ " instance_type=inference_instance_type,\n",
+ ")\n",
+ "# Retrieve the inference script uri\n",
+ "deploy_source_uri = script_uris.retrieve(\n",
+ " model_id=model_id, model_version=model_version, script_scope=\"inference\"\n",
+ ")\n",
+ "\n",
+ "endpoint_name = name_from_base(f\"jumpstart-example-FT-{model_id}-\")\n",
+ "\n",
+ "# Use the estimator from the previous step to deploy to a SageMaker endpoint\n",
+ "finetuned_predictor = (hp_tuner if use_amt else ic_estimator).deploy(\n",
+ " initial_instance_count=1,\n",
+ " instance_type=inference_instance_type,\n",
+ " entry_point=\"inference.py\",\n",
+ " image_uri=deploy_image_uri,\n",
+ " source_dir=deploy_source_uri,\n",
+ " endpoint_name=endpoint_name,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bccf7925",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "Next, we download example images of a rose and a sunflower from the S3 bucket for inference.\n",
+ "\n",
+ "---"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f6eb2261",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "s3_bucket = f\"jumpstart-cache-prod-{aws_region}\"\n",
+ "key_prefix = \"training-datasets/tf_flowers\"\n",
+ "\n",
+ "\n",
+ "def download_from_s3(images):\n",
+ " for filename, image_key in images.items():\n",
+ " boto3.client(\"s3\").download_file(s3_bucket, f\"{key_prefix}/{image_key}\", filename)\n",
+ "\n",
+ "\n",
+ "flower_images = {\n",
+ " \"img1.jpg\": \"roses/10503217854_e66a804309.jpg\",\n",
+ " \"img2.jpg\": \"sunflowers/1008566138_6927679c8a.jpg\",\n",
+ "}\n",
+ "download_from_s3(flower_images)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2a3f382f",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "Next, we query the fine-tuned model, parse the response and display the predictions.\n",
+ "\n",
+ "---"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "94dc4f40",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from IPython.core.display import HTML\n",
+ "\n",
+ "for image_filename in flower_images.keys():\n",
+ " with open(image_filename, \"rb\") as file:\n",
+ " img = file.read()\n",
+ " query_response = finetuned_predictor.predict(\n",
+ " img, {\"ContentType\": \"application/x-image\", \"Accept\": \"application/json;verbose\"}\n",
+ " )\n",
+ " model_predictions = json.loads(query_response)\n",
+ " predicted_label = model_predictions[\"predicted_label\"]\n",
+ " display(\n",
+ " HTML(\n",
+ " f''\n",
+ " f\"Predicted Label: {predicted_label}\"\n",
+ " )\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8c672f19",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "Next, we clean up the deployed endpoint.\n",
+ "\n",
+ "---"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ad9f7b01",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Delete the SageMaker endpoint and the attached resources\n",
+ "finetuned_predictor.delete_model()\n",
+ "finetuned_predictor.delete_endpoint()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "25ae6c5d",
+ "metadata": {},
+ "source": [
+ "### 4.7. Incrementally train the fine-tuned model\n",
+ "\n",
+ "***\n",
+ "Incremental training allows you to train a new model using an expanded dataset that contains an underlying pattern that was not accounted for in the previous training and which resulted in poor model performance. You can use the artifacts from an existing model and use an expanded dataset to train a new model. Incremental training saves both time and resources as you don’t need to retrain a model from scratch.\n",
+ "\n",
+ "One may use any dataset (old or new) as long as the dataset format remains the same (set of classes). The incremental training step is similar to the fine-tuning step discussed above, with the following difference: in fine-tuning above, we start with a pre-trained model, whereas in incremental training, we start with an existing fine-tuned model.\n",
+ "***"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3e55c2b0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Identify the previously trained model path based on the output location where artifacts are stored previously and the training job name.\n",
+ "\n",
+ "if use_amt: # If using amt, select the model for the best training job.\n",
+ " sage_client = boto3.Session().client(\"sagemaker\")\n",
+ " tuning_job_result = sage_client.describe_hyper_parameter_tuning_job(\n",
+ " HyperParameterTuningJobName=hp_tuner._current_job_name\n",
+ " )\n",
+ " last_training_job_name = tuning_job_result[\"BestTrainingJob\"][\"TrainingJobName\"]\n",
+ "else:\n",
+ " last_training_job_name = ic_estimator._current_job_name\n",
+ "\n",
+ "last_trained_model_path = f\"{s3_output_location}/{last_training_job_name}/output/model.tar.gz\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "83d48880",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "incremental_train_output_prefix = \"jumpstart-example-ic-incremental-training\"\n",
+ "\n",
+ "incremental_s3_output_location = f\"s3://{output_bucket}/{incremental_train_output_prefix}/output\"\n",
+ "\n",
+ "incremental_training_job_name = name_from_base(f\"jumpstart-example-{model_id}-incremental-training\")\n",
+ "\n",
+ "incremental_train_estimator = Estimator(\n",
+ " role=aws_role,\n",
+ " image_uri=train_image_uri,\n",
+ " source_dir=train_source_uri,\n",
+ " model_uri=last_trained_model_path,\n",
+ " entry_point=\"transfer_learning.py\",\n",
+ " instance_count=1,\n",
+ " instance_type=training_instance_type,\n",
+ " max_run=360000,\n",
+ " hyperparameters=hyperparameters,\n",
+ " output_path=incremental_s3_output_location,\n",
+ " base_job_name=incremental_training_job_name,\n",
+ " metric_definitions=training_metric_definitions,\n",
+ ")\n",
+ "\n",
+ "incremental_train_estimator.fit({\"training\": training_dataset_s3_path}, logs=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ceb937a0",
+ "metadata": {},
+ "source": [
+ "Once trained, we can use the same steps as in [Deploy & run Inference on the fine-tuned model](#4.6.-Deploy-&-run-Inference-on-the-fine-tuned-model) to deploy the model."
+ ]
+ }
+ ],
+ "metadata": {
+ "instance_type": "ml.t3.medium",
+ "kernelspec": {
+ "display_name": "conda_python3",
+ "language": "python",
+ "name": "conda_python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/introduction_to_amazon_algorithms/image_classification_tensorflow/README.md b/introduction_to_amazon_algorithms/image_classification_tensorflow/README.md
new file mode 100644
index 0000000000..600c76de0f
--- /dev/null
+++ b/introduction_to_amazon_algorithms/image_classification_tensorflow/README.md
@@ -0,0 +1,2 @@
+### SageMaker TensorFlow Image classification Training & Deployment
+This notebook `Amazon_TensorFlow_Image_Classification.ipynb` demos how to fine-tune and deploy a pre-trained image classification model using SageMaker API. It shows how to select a pre-trained TensorFlow image classification model and fine-tune it on an example dataset containing raw .jpg/.png images, while varying training hyperparameters such as learning rate, batch-size and number of epochs. AMT (Automatic Model Tuning) is used to search for the best hyperparameters. Once the training is complete, the notebook shows how to host the trained model for inference. It also shows how to host the pre-trained model as is, without first fine-tuning it.
diff --git a/introduction_to_applying_machine_learning/README.md b/introduction_to_applying_machine_learning/README.md
index 9b40687dc6..45416e7ddb 100644
--- a/introduction_to_applying_machine_learning/README.md
+++ b/introduction_to_applying_machine_learning/README.md
@@ -5,6 +5,7 @@
These examples provide a gentle introduction to machine learning concepts as they are applied in practical use cases across a variety of sectors.
- [Predicting Customer Churn](xgboost_customer_churn) uses customer interaction and service usage data to find those most likely to churn, and then walks through the cost/benefit trade-offs of providing retention incentives. This uses Amazon SageMaker's implementation of [XGBoost](https://github.com/dmlc/xgboost) to create a highly predictive model.
+- [Predicting Customer Churn](lightgbm_catboost_tabtransformer_autogluon_churn) uses Amazon SageMaker's implementation of [LightGBM](https://lightgbm.readthedocs.io/en/latest/), [CatBoost](https://catboost.ai/), [TabTransformer](https://arxiv.org/abs/2012.06678), and [AutoGluon-Tabular](https://auto.gluon.ai/stable/index.html) with [SageMaker Automatic Model Tuning](https://docs.aws.amazon.com/sagemaker/latest/dg/automatic-model-tuning.html) to create four predictive models on customer churn dataset, and evaluate their performance on the same test data.
- [Cancer Prediction](breast_cancer_prediction) predicts Breast Cancer based on features derived from images, using SageMaker's Linear Learner.
- [Ensembling](ensemble_modeling) predicts income using two Amazon SageMaker models to show the advantages in ensembling.
- [Video Game Sales](video_game_sales) develops a binary prediction model for the success of video games based on review scores.
diff --git a/introduction_to_applying_machine_learning/lightgbm_catboost_tabtransformer_autogluon_churn/churn-prediction-lightgbm-catboost-tabtransformer-autogluon.ipynb b/introduction_to_applying_machine_learning/lightgbm_catboost_tabtransformer_autogluon_churn/churn-prediction-lightgbm-catboost-tabtransformer-autogluon.ipynb
new file mode 100644
index 0000000000..954455049b
--- /dev/null
+++ b/introduction_to_applying_machine_learning/lightgbm_catboost_tabtransformer_autogluon_churn/churn-prediction-lightgbm-catboost-tabtransformer-autogluon.ipynb
@@ -0,0 +1,1857 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "7ef64164",
+ "metadata": {},
+ "source": [
+ "# Customer Churn Prediction using Amazon SageMaker LightGBM, CatBoost, TabTransformer, and AutoGluon-Tabular with SageMaker AMT (Automatic Model Tuning)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0ca3e116",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "Losing customers is costly for any business. Identifying unhappy customers early on gives you a chance to offer them incentives to stay. This notebook describes using machine learning (ML) for the automated identification of unhappy customers, also known as customer churn prediction. ML models rarely give perfect predictions though, so this notebook is also about how to incorporate the relative costs of prediction mistakes when determining the financial outcome of using ML.\n",
+ "\n",
+ "This notebook demonstrates the use of Amazon SageMaker’s implementation of the [LightGBM](https://lightgbm.readthedocs.io/en/latest/), [CatBoost](https://catboost.ai/en/docs/), [TabTransformer](https://arxiv.org/abs/2012.06678), and [AutoGluon-Tabular](https://auto.gluon.ai/stable/tutorials/tabular_prediction/index.html) algorithms to train and host a customer churn prediction model with [SageMaker AMT](https://docs.aws.amazon.com/sagemaker/latest/dg/automatic-model-tuning.html) (Automatic Model Tuning).\n",
+ "\n",
+ "In this notebook, we demonstrate two use cases for each algorithm:\n",
+ "\n",
+ "* How to train a tabular model on the customer churn dataset with AMT.\n",
+ "* How to use the trained tabular model to perform inference, i.e., classifying new samples.\n",
+ "\n",
+ "In the end, we compare the performance of four models trained with AMT on the same test data.\n",
+ "\n",
+ "Note: This notebook was tested in Amazon SageMaker Studio on ml.t3.medium instance with Python 3 (Data Science) kernel.\n",
+ "\n",
+ "---"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5291f501",
+ "metadata": {},
+ "source": [
+ "1. [Set Up](#1.-Set-Up)\n",
+ "2. [Data Preparation and Visualization](#2.-Data-Preparation-and-Visualization)\n",
+ "3. [Train A LightGBM Model with AMT](#3.-Train-A-LightGBM-Model-with-AMT)\n",
+ " * [Retrieve Training Artifacts](#3.1.-Retrieve-Training-Artifacts)\n",
+ " * [Set Training Parameters](#3.2.-Set-Training-Parameters)\n",
+ " * [Train with Automatic Model Tuning](#3.3.-Train-with-Automatic-Model-Tuning) \n",
+ " * [Start Training](#3.4.-Start-Training)\n",
+ " * [Deploy and Run Inference on the Trained Tabular Model](#3.5.-Deploy-and-Run-Inference-on-the-Trained-Tabular-Model)\n",
+ " * [Evaluate the Prediction Results Returned from the Endpoint](#3.6.-Evaluate-the-Prediction-Results-Returned-from-the-Endpoint)\n",
+ "4. [Train A CatBoost model with AMT](#4.-Train-A-CatBoost-model-with-AMT)\n",
+ " * [Train with Automatic Model Tuning](#4.1.-Train-with-Automatic-Model-Tuning) \n",
+ " * [Deploy and Run Inference on the Trained Tabular Model](#4.2.-Deploy-and-Run-Inference-on-the-Trained-Tabular-Model)\n",
+ "5. [Train A TabTransformer model with AMT](#5.-Train-A-TabTransformer-model-with-AMT)\n",
+ " * [Train with Automatic Model Tuning](#5.1.-Train-with-Automatic-Model-Tuning) \n",
+ " * [Deploy and Run Inference on the Trained Tabular Model](#5.2.-Deploy-and-Run-Inference-on-the-Trained-Tabular-Model)\n",
+ "6. [Train An AutoGluon-Tabular model](#6.-Train-An-AutoGluon-Tabular-model)\n",
+ " * [Train with AutoGluon-Tabular model](#6.1.-Train-with-AutoGluon-Tabular-model) \n",
+ " * [Deploy and Run Inference on the Trained Tabular Model](#6.2.-Deploy-and-Run-Inference-on-the-Trained-Tabular-Model)\n",
+ "7. [Compare Prediction Results of Four Trained Models on the Same Test Data](#7.-Compare-Prediction-Results-of-Four-Trained-Models-on-the-Same-Test-Data)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "62af3c2e",
+ "metadata": {},
+ "source": [
+ "## 1. Set Up\n",
+ "\n",
+ "---\n",
+ "Before executing the notebook, there are some initial steps required for setup. This notebook requires the latest versions of sagemaker and ipywidgets.\n",
+ "\n",
+ "---"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "def1e09f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install sagemaker ipywidgets --upgrade --quiet"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "26a8ccde",
+ "metadata": {},
+ "source": [
+ "\n",
+ "---\n",
+ "To train and host on Amazon SageMaker, we need to set up and authenticate the use of AWS services. Here, we use the execution role associated with the current notebook instance as the AWS account role with SageMaker access. It has the necessary permissions, including access to your data in S3.\n",
+ "\n",
+ "---"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7516a221",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sagemaker, boto3, json\n",
+ "from sagemaker import get_execution_role\n",
+ "\n",
+ "aws_role = get_execution_role()\n",
+ "aws_region = boto3.Session().region_name\n",
+ "sess = sagemaker.Session()\n",
+ "\n",
+ "bucket = sess.default_bucket()\n",
+ "prefix = \"sagemaker/DEMO-churn\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6087cdb0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "import io\n",
+ "import os\n",
+ "import sys\n",
+ "import time\n",
+ "import json\n",
+ "from IPython.display import display\n",
+ "from time import strftime, gmtime\n",
+ "from sagemaker.inputs import TrainingInput\n",
+ "from sagemaker.serializers import CSVSerializer\n",
+ "from sklearn import preprocessing"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "efe1573f",
+ "metadata": {},
+ "source": [
+ "## 2. Data Preparation and Visualization\n",
+ "\n",
+ "Mobile operators have historical records on which customers ultimately ended up churning and which continued using the service. We can use this historical information to construct an ML model of one mobile operator’s churn using a process called training. After training the model, we can pass the profile information of an arbitrary customer (the same profile information that we used to train the model) to the model, and have the model predict whether this customer is going to churn. Of course, we expect the model to make mistakes. After all, predicting the future is tricky business! But we’ll learn how to deal with prediction errors.\n",
+ "\n",
+ "The dataset we use is publicly available and was mentioned in the book [Discovering Knowledge in Data](https://www.amazon.com/dp/0470908742/) by Daniel T. Larose. It is attributed by the author to the University of California Irvine Repository of Machine Learning Datasets. Let’s download and read that dataset in now:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "985aeaf4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "s3 = boto3.client(\"s3\")\n",
+ "s3.download_file(f\"sagemaker-sample-files\", \"datasets/tabular/synthetic/churn.txt\", \"churn.txt\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "47abdc80",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "churn = pd.read_csv(\"./churn.txt\")\n",
+ "pd.set_option(\"display.max_columns\", 500)\n",
+ "churn.head(5)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f41f0f8a",
+ "metadata": {},
+ "source": [
+ "By modern standards, it’s a relatively small dataset, with only 5,000 records, where each record uses 21 attributes to describe the profile of a customer of an unknown US mobile operator. The attributes are:\n",
+ "\n",
+ "`State`: the US state in which the customer resides, indicated by a two-letter abbreviation; for example, OH or NJ\n",
+ "\n",
+ "`Account Length`: the number of days that this account has been active\n",
+ "\n",
+ "`Area Code`: the three-digit area code of the corresponding customer’s phone number\n",
+ "\n",
+ "`Phone`: the remaining seven-digit phone number\n",
+ "\n",
+ "`Int’l Plan`: whether the customer has an international calling plan: yes/no\n",
+ "\n",
+ "`VMail Plan`: whether the customer has a voice mail feature: yes/no\n",
+ "\n",
+ "`VMail Message`: the average number of voice mail messages per month\n",
+ "\n",
+ "`Day Mins`: the total number of calling minutes used during the day\n",
+ "\n",
+ "`Day Calls`: the total number of calls placed during the day\n",
+ "\n",
+ "`Day Charge`: the billed cost of daytime calls\n",
+ "\n",
+ "`Eve Mins`, `Eve Calls`, `Eve Charge`: the minutes used, number of calls placed, and billed cost for calls placed during the evening\n",
+ "\n",
+ "`Night Mins`, `Night Calls`, `Night Charge`: the minutes used, number of calls placed, and billed cost for calls placed during nighttime\n",
+ "\n",
+ "`Intl Mins`, `Intl Calls`, `Intl Charge`: the minutes used, number of calls placed, and billed cost for international calls\n",
+ "\n",
+ "`CustServ Calls`: the number of calls placed to Customer Service\n",
+ "\n",
+ "`Churn?`: whether the customer left the service: true/false\n",
+ "\n",
+ "The last attribute, `Churn?`, is known as the target attribute: the attribute that we want the ML model to predict. Because the target attribute is binary, our model will be performing binary prediction, also known as binary classification.\n",
+ "\n",
+ "Let’s begin exploring the data:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ddb61970",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Histograms for each numeric features\n",
+ "display(churn.describe())\n",
+ "%matplotlib inline\n",
+ "hist = churn.hist(bins=30, sharey=True, figsize=(10, 10))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a2339e7e",
+ "metadata": {},
+ "source": [
+ "We can see immediately that:\n\n- `State` appears to be quite evenly distributed.\n- `Phone` takes on too many unique values to be of any practical use. It’s possible that parsing out the prefix could have some value, but without more context on how these are allocated, we should avoid using it.\n- Most of the numeric features are surprisingly nicely distributed, with many showing bell-like gaussianity. `VMail Message` is a notable exception."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cfb7d029",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "churn = churn.drop(\"Phone\", axis=1)\n",
+ "churn[\"Area Code\"] = churn[\"Area Code\"].astype(object)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7100fb95",
+ "metadata": {},
+ "source": [
+ "Next let’s look at the relationship between each of the features and our target variable."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c5f5b300",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for column in churn.select_dtypes(include=[\"object\"]).columns:\n",
+ " if column != \"Churn?\":\n",
+ " display(pd.crosstab(index=churn[column], columns=churn[\"Churn?\"], normalize=\"columns\"))\n",
+ "\n",
+ "for column in churn.select_dtypes(exclude=[\"object\"]).columns:\n",
+ " print(column)\n",
+ " hist = churn[[column, \"Churn?\"]].hist(by=\"Churn?\", bins=30)\n",
+ " plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ead0fead",
+ "metadata": {},
+ "source": [
+ "We convert the target attribute to a binary value and move it to the first column of the dataset to meet the requirements of SageMaker built-in tabular algorithms (for an example, see the [SageMaker LightGBM documentation](https://docs.aws.amazon.com/sagemaker/latest/dg/lightgbm.html))."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "df47dff8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "churn[\"target\"] = churn[\"Churn?\"].map({\"True.\": 1, \"False.\": 0})\n",
+ "churn.drop([\"Churn?\"], axis=1, inplace=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9769380f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "churn = churn[[\"target\"] + churn.columns.tolist()[:-1]]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "076403fe",
+ "metadata": {},
+ "source": [
+ "We identify the column indexes of the categorical attribute, which is required by LightGBM, CatBoost, and TabTransformer algorithm (AutoGluon-Tabular has built-in feature engineering to identify the categorical attribute automatically, and thus does not require such input)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0421ab18",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "cat_columns = [\n",
+ " \"State\",\n",
+ " \"Account Length\",\n",
+ " \"Area Code\",\n",
+ " \"Phone\",\n",
+ " \"Int'l Plan\",\n",
+ " \"VMail Plan\",\n",
+ " \"VMail Message\",\n",
+ " \"Day Calls\",\n",
+ " \"Eve Calls\",\n",
+ " \"Night Calls\",\n",
+ " \"Intl Calls\",\n",
+ " \"CustServ Calls\",\n",
+ "]\n",
+ "\n",
+ "cat_idx = []\n",
+ "for idx, col_name in enumerate(churn.columns.tolist()):\n",
+ " if col_name in cat_columns:\n",
+ " cat_idx.append(idx)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a865ba04",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with open(\"cat_idx.json\", \"w\") as outfile:\n",
+ " json.dump({\"cat_idx\": cat_idx}, outfile)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4092e255",
+ "metadata": {},
+ "source": [
+ "[LightGBM official documentation](https://lightgbm.readthedocs.io/en/latest/Advanced-Topics.html#categorical-feature-support) requires that all categorical features should be encoded as non-negative integers. We do it consistently for all the other algorithms."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "740e6b02",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for idx, col_name in enumerate(churn.columns.tolist()):\n",
+ " if col_name in cat_columns:\n",
+ " le = preprocessing.LabelEncoder()\n",
+ " churn[col_name] = le.fit_transform(churn[col_name])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0a11d76b",
+ "metadata": {},
+ "source": [
+ "We split the churn dataset into train, validation, and test set using stratified sampling. Validation set is used for early stopping and AMT. Test set is used for performance evaluations in the end. Next, we upload them into a S3 path for training.\n",
+ "\n",
+ "The structure of the S3 path for training should be structured as below. The `cat_idx.json` is categorical column indexes.\n",
+ "\n",
+ "-- `train` \n",
+ " -- `data.csv` \n",
+ "-- `validation` \n",
+ " -- `data.csv` \n",
+ "-- `cat_idx.json`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fee4296f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "train, val_n_test = train_test_split(\n",
+ " churn, test_size=0.3, random_state=42, stratify=churn[\"target\"]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "48080aca",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "val, test = train_test_split(\n",
+ " val_n_test, test_size=0.3, random_state=42, stratify=val_n_test[\"target\"]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1771b769",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train.to_csv(\"train.csv\", header=False, index=False)\n",
+ "val.to_csv(\"validation.csv\", header=False, index=False)\n",
+ "test.to_csv(\"test.csv\", header=False, index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c26e7053",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "boto3.Session().resource(\"s3\").Bucket(bucket).Object(\n",
+ " os.path.join(prefix, \"train/data.csv\")\n",
+ ").upload_file(\"train.csv\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c297dff2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "boto3.Session().resource(\"s3\").Bucket(bucket).Object(\n",
+ " os.path.join(prefix, \"validation/data.csv\")\n",
+ ").upload_file(\"validation.csv\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3cb55d7a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "boto3.Session().resource(\"s3\").Bucket(bucket).Object(\n",
+ " os.path.join(prefix, \"test/data.csv\")\n",
+ ").upload_file(\"test.csv\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "042b6f55",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "boto3.Session().resource(\"s3\").Bucket(bucket).Object(\n",
+ " os.path.join(prefix, \"cat_idx.json\")\n",
+ ").upload_file(\"cat_idx.json\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b278de2a",
+ "metadata": {},
+ "source": [
+ "## 3. Train A LightGBM Model with AMT"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "26d18ad9",
+ "metadata": {},
+ "source": [
+ "### 3.1. Retrieve Training Artifacts\n",
+ "\n",
+ "___\n",
+ "\n",
+ "Here, we retrieve the training docker container, the training algorithm source, and the tabular algorithm. Note that model_version=\"*\" fetches the latest model.\n",
+ "\n",
+ "For the training algorithm, we have four choices in this demonstration for classification task.\n",
+ "* [LightGBM](https://lightgbm.readthedocs.io/en/latest/): To use this algorithm, specify `train_model_id` as `lightgbm-classification-model` in the cell below.\n",
+ "* [CatBoost](https://catboost.ai/en/docs/): To use this algorithm, specify `train_model_id` as `catboost-classification-model` in the cell below.\n",
+ "* [TabTransformer](https://arxiv.org/abs/2012.06678): To use this algorithm, specify `train_model_id` as `pytorch-tabtransformerclassification-model` in the cell below.\n",
+ "* [AutoGluon Tabular](https://auto.gluon.ai/stable/tutorials/tabular_prediction/index.html): To use this algorithm, specify `train_model_id` as `autogluon-classification-ensemble` in the cell below.\n",
+ "\n",
+ "Note. [XGBoost](https://xgboost.readthedocs.io/en/latest/) (`train_model_id: xgboost-classification-model`) and [Linear Learner](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression) (`train_model_id: sklearn-classification-linear`) are the other choices in the tabular classification category. Since they have different input-format requirements, please check separate notebooks `xgboost_linear_learner_tabular/Amazon_Tabular_Classification_XGBoost_LinearLearner.ipynb`, `tabtransformer_tabular/Amazon_Tabular_Classification_TabTransformer.ipynb`, and `autogluon_tabular/Amazon_Tabular_Classification_AutoGluon.ipynb` for details.\n",
+ "\n",
+ "For regression task, you just need replace `classification` in the `train_model_id` with `regression`.\n",
+ "\n",
+ "\n",
+ "---"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0ad11b96",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker import image_uris, model_uris, script_uris\n",
+ "\n",
+ "train_model_id, train_model_version, train_scope = \"lightgbm-classification-model\", \"*\", \"training\"\n",
+ "training_instance_type = \"ml.m5.4xlarge\"\n",
+ "\n",
+ "# Retrieve the docker image\n",
+ "train_image_uri = image_uris.retrieve(\n",
+ " region=None,\n",
+ " framework=None,\n",
+ " model_id=train_model_id,\n",
+ " model_version=train_model_version,\n",
+ " image_scope=train_scope,\n",
+ " instance_type=training_instance_type,\n",
+ ")\n",
+ "# Retrieve the training script\n",
+ "train_source_uri = script_uris.retrieve(\n",
+ " model_id=train_model_id, model_version=train_model_version, script_scope=train_scope\n",
+ ")\n",
+ "# Retrieve the pre-trained model tarball to further fine-tune\n",
+ "train_model_uri = model_uris.retrieve(\n",
+ " model_id=train_model_id, model_version=train_model_version, model_scope=train_scope\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e8a4d3d3",
+ "metadata": {},
+ "source": [
+ "### 3.2. Set Training Parameters\n",
+ "\n",
+ "---\n",
+ "\n",
+ "Now that we are done with all the setup that is needed, we are ready to train our tabular algorithm. To begin, let us create a [``sageMaker.estimator.Estimator``](https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html) object. This estimator will launch the training job. \n",
+ "\n",
+ "There are two kinds of parameters that need to be set for training. The first one are the parameters for the training job. These include: (i) Training data path. This is S3 folder in which the input data is stored, (ii) Output path: This the s3 folder in which the training output is stored. (iii) Training instance type: This indicates the type of machine on which to run the training.\n",
+ "\n",
+ "The second set of parameters are algorithm specific training hyper-parameters. \n",
+ "\n",
+ "---"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7a1f8559",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "training_dataset_s3_path = f\"s3://{bucket}/{prefix}\"\n",
+ "\n",
+ "output_prefix = \"jumpstart-example-tabular-training\"\n",
+ "s3_output_location = f\"s3://{bucket}/{output_prefix}/output_lgb\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8828563c",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "For algorithm specific hyper-parameters, we start by fetching python dictionary of the training hyper-parameters that the algorithm accepts with their default values. This can then be overridden to custom values. For the evaluation metric that is used by early stopping and automatic model tuning, we choose `auc` score. Note. LightGBM does not have built-in F1 score supported. See [LightGBM documentation](https://lightgbm.readthedocs.io/en/latest/Parameters.html#metric-parameters).\n",
+ "\n",
+ "---"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8cd5d2fd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker import hyperparameters\n",
+ "\n",
+ "# Retrieve the default hyper-parameters for fine-tuning the model\n",
+ "hyperparameters = hyperparameters.retrieve_default(\n",
+ " model_id=train_model_id, model_version=train_model_version\n",
+ ")\n",
+ "\n",
+ "# [Optional] Override default hyperparameters with custom values\n",
+ "hyperparameters[\n",
+ " \"num_boost_round\"\n",
+ "] = \"500\" # The same hyperparameter is named as \"iterations\" for CatBoost\n",
+ "\n",
+ "\n",
+ "hyperparameters[\"metric\"] = \"auc\"\n",
+ "print(hyperparameters)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f43ec07c",
+ "metadata": {},
+ "source": [
+ "### 3.3. Train with Automatic Model Tuning \n",
+ "\n",
+ "\n",
+ "Amazon SageMaker automatic model tuning, also known as hyperparameter tuning, finds the best version of a model by running many training jobs on your dataset using the algorithm and ranges of hyperparameters that you specify. It then chooses the hyperparameter values that result in a model that performs the best, as measured by a metric that you choose. We will use a HyperparameterTuner object to interact with Amazon SageMaker hyperparameter tuning APIs.\n",
+ "\n",
+ "* Note. In this notebook, we set AMT budget (total tuning jobs) as 10 for each of the tabular algorithm except AutoGluon-Tabular. For [AutoGluon-Tabular](https://arxiv.org/abs/2003.06505), it succeeds by ensembling multiple models and stacking them in multiple layers. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b136b897",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker.tuner import ContinuousParameter, IntegerParameter, HyperparameterTuner\n",
+ "\n",
+ "use_amt = True\n",
+ "\n",
+ "hyperparameter_ranges_lgb = {\n",
+ " \"learning_rate\": ContinuousParameter(1e-4, 1, scaling_type=\"Logarithmic\"),\n",
+ " \"num_boost_round\": IntegerParameter(2, 30),\n",
+ " \"num_leaves\": IntegerParameter(10, 50),\n",
+ " \"feature_fraction\": ContinuousParameter(0, 1),\n",
+ " \"bagging_fraction\": ContinuousParameter(0, 1),\n",
+ " \"bagging_freq\": IntegerParameter(1, 10),\n",
+ " \"max_depth\": IntegerParameter(5, 30),\n",
+ " \"min_data_in_leaf\": IntegerParameter(5, 50),\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f209be30",
+ "metadata": {},
+ "source": [
+ "### 3.4. Start Training"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "caf86ae9",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "We start by creating the estimator object with all the required assets and then launch the training job.\n",
+ "\n",
+ "---"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6c6d9bab",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker.estimator import Estimator\n",
+ "from sagemaker.utils import name_from_base\n",
+ "\n",
+ "training_job_name = name_from_base(f\"jumpstart-{train_model_id}-train\")\n",
+ "\n",
+ "# Create SageMaker Estimator instance\n",
+ "tabular_estimator = Estimator(\n",
+ " role=aws_role,\n",
+ " image_uri=train_image_uri,\n",
+ " source_dir=train_source_uri,\n",
+ " model_uri=train_model_uri,\n",
+ " entry_point=\"transfer_learning.py\",\n",
+ " instance_count=1,\n",
+ " instance_type=training_instance_type,\n",
+ " max_run=360000,\n",
+ " hyperparameters=hyperparameters,\n",
+ " output_path=s3_output_location,\n",
+ ")\n",
+ "\n",
+ "if use_amt:\n",
+ "\n",
+ " tuner = HyperparameterTuner(\n",
+ " tabular_estimator,\n",
+ " \"auc\",\n",
+ " hyperparameter_ranges_lgb,\n",
+ " [{\"Name\": \"auc\", \"Regex\": \"auc: ([0-9\\\\.]+)\"}],\n",
+ " max_jobs=10,\n",
+ " max_parallel_jobs=5,\n",
+ " objective_type=\"Maximize\",\n",
+ " base_tuning_job_name=training_job_name,\n",
+ " )\n",
+ "\n",
+ " tuner.fit({\"training\": training_dataset_s3_path}, logs=True)\n",
+ "else:\n",
+ " # Launch a SageMaker Training job by passing s3 path of the training data\n",
+ " tabular_estimator.fit(\n",
+ " {\"training\": training_dataset_s3_path}, logs=True, job_name=training_job_name\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1f1c8f37",
+ "metadata": {},
+ "source": [
+ "### 3.5. Deploy and Run Inference on the Trained Tabular Model\n",
+ "\n",
+ "---\n",
+ "\n",
+ "In this section, you learn how to query an existing endpoint and make predictions of the examples you input. For each example, the model will output the probability of the sample for each class in the model. \n",
+ "Next, the predicted class label is obtained by taking the class label with the maximum probability over others.\n",
+ "\n",
+ "\n",
+ "We start by retrieving the artifacts and deploy the `tabular_estimator` that we trained.\n",
+ "\n",
+ "---"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d0d18d65",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "inference_instance_type = \"ml.m5.large\"\n",
+ "\n",
+ "# Retrieve the inference docker container uri\n",
+ "deploy_image_uri = image_uris.retrieve(\n",
+ " region=None,\n",
+ " framework=None,\n",
+ " image_scope=\"inference\",\n",
+ " model_id=train_model_id,\n",
+ " model_version=train_model_version,\n",
+ " instance_type=inference_instance_type,\n",
+ ")\n",
+ "# Retrieve the inference script uri\n",
+ "deploy_source_uri = script_uris.retrieve(\n",
+ " model_id=train_model_id, model_version=train_model_version, script_scope=\"inference\"\n",
+ ")\n",
+ "\n",
+ "endpoint_name = name_from_base(f\"jumpstart-lgb-churn-{train_model_id}-\")\n",
+ "\n",
+ "# Use the estimator from the previous step to deploy to a SageMaker endpoint\n",
+ "predictor = (tuner if use_amt else tabular_estimator).deploy(\n",
+ " initial_instance_count=1,\n",
+ " instance_type=inference_instance_type,\n",
+ " entry_point=\"inference.py\",\n",
+ " image_uri=deploy_image_uri,\n",
+ " source_dir=deploy_source_uri,\n",
+ " endpoint_name=endpoint_name,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "57a3c147",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "Next, we read the customer churn test data into pandas data frame, prepare the ground truth target and predicting features to send into the endpoint. \n",
+ "\n",
+ "Below is the screenshot of the first 5 examples in the test set.\n",
+ "\n",
+ "---"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a0f8fdb7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "newline, bold, unbold = \"\\n\", \"\\033[1m\", \"\\033[0m\"\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from sklearn.metrics import accuracy_score, f1_score, roc_auc_score\n",
+ "from sklearn.metrics import confusion_matrix\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "# read the data\n",
+ "test_data_file_name = \"test.csv\"\n",
+ "test_data = pd.read_csv(test_data_file_name, header=None)\n",
+ "test_data.columns = [\"Target\"] + [f\"Feature_{i}\" for i in range(1, test_data.shape[1])]\n",
+ "\n",
+ "num_examples, num_columns = test_data.shape\n",
+ "print(\n",
+ " f\"{bold}The test dataset contains {num_examples} examples and {num_columns} columns.{unbold}\\n\"\n",
+ ")\n",
+ "\n",
+ "# prepare the ground truth target and predicting features to send into the endpoint.\n",
+ "ground_truth_label, features = test_data.iloc[:, :1], test_data.iloc[:, 1:]\n",
+ "\n",
+ "print(f\"{bold}The first 5 observations of the data: {unbold} \\n\")\n",
+ "test_data.head(5)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4f628562",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "The following code queries the endpoint you have created to get the prediction for each test example. \n",
+ "The `query_endpoint()` function returns an array-like of shape (num_examples, num_classes), where each row indicates \n",
+ "the probability of the example for each class in the model. The num_classes is 2 in above test data. \n",
+ "Next, the predicted class label is obtained by taking the class label with the maximum probability over others for each example. \n",
+ "\n",
+ "---"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "da19a629",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "content_type = \"text/csv\"\n",
+ "\n",
+ "\n",
+ "def query_endpoint(encoded_tabular_data, endpoint_name):\n",
+ " client = boto3.client(\"runtime.sagemaker\")\n",
+ " response = client.invoke_endpoint(\n",
+ " EndpointName=endpoint_name,\n",
+ " ContentType=content_type,\n",
+ " Body=encoded_tabular_data,\n",
+ " )\n",
+ " return response\n",
+ "\n",
+ "\n",
+ "def parse_response(query_response):\n",
+ " model_predictions = json.loads(query_response[\"Body\"].read())\n",
+ " predicted_probabilities = model_predictions[\"probabilities\"]\n",
+ " return np.array(predicted_probabilities)\n",
+ "\n",
+ "\n",
+ "# split the test data into smaller size of batches to query the endpoint if test data has large size.\n",
+ "batch_size = 1500\n",
+ "predict_prob = []\n",
+ "for i in np.arange(0, num_examples, step=batch_size):\n",
+ " query_response_batch = query_endpoint(\n",
+ " features.iloc[i : (i + batch_size), :].to_csv(header=False, index=False).encode(\"utf-8\"),\n",
+ " endpoint_name,\n",
+ " )\n",
+ " predict_prob_batch = parse_response(query_response_batch) # prediction probability per batch\n",
+ " predict_prob.append(predict_prob_batch)\n",
+ "\n",
+ "\n",
+ "predict_prob = np.concatenate(predict_prob, axis=0)\n",
+ "predict_label = np.argmax(predict_prob, axis=1)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "aabdee3e",
+ "metadata": {},
+ "source": [
+ "## 3.6. Evaluate the Prediction Results Returned from the Endpoint\n",
+ "\n",
+ "---\n",
+ "We evaluate the predictions results returned from the endpoint by following two ways.\n",
+ "\n",
+ "* Visualize the predictions results by plotting the confusion matrix.\n",
+ "\n",
+ "* Measure the prediction results quantitatively.\n",
+ "\n",
+ "---"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1f3610bc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Visualize the predictions results by plotting the confusion matrix.\n",
+ "conf_matrix = confusion_matrix(y_true=ground_truth_label.values, y_pred=predict_label)\n",
+ "fig, ax = plt.subplots(figsize=(7.5, 7.5))\n",
+ "ax.matshow(conf_matrix, cmap=plt.cm.Blues, alpha=0.3)\n",
+ "for i in range(conf_matrix.shape[0]):\n",
+ " for j in range(conf_matrix.shape[1]):\n",
+ " ax.text(x=j, y=i, s=conf_matrix[i, j], va=\"center\", ha=\"center\", size=\"xx-large\")\n",
+ "\n",
+ "plt.xlabel(\"Predictions\", fontsize=18)\n",
+ "plt.ylabel(\"Actuals\", fontsize=18)\n",
+ "plt.title(\"Confusion Matrix\", fontsize=18)\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a59c801e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Measure the prediction results quantitatively.\n",
+ "eval_accuracy = accuracy_score(ground_truth_label.values, predict_label)\n",
+ "eval_f1 = f1_score(ground_truth_label.values, predict_label)\n",
+ "eval_auc = roc_auc_score(ground_truth_label.values, predict_prob[:, 1])\n",
+ "\n",
+ "lgb_results = pd.DataFrame.from_dict(\n",
+ " {\n",
+ " \"Accuracy\": eval_accuracy,\n",
+ " \"F1\": eval_f1,\n",
+ " \"AUC\": eval_auc,\n",
+ " },\n",
+ " orient=\"index\",\n",
+ " columns=[\"LightGBM with AMT\"],\n",
+ ")\n",
+ "\n",
+ "lgb_results"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ab7d2f6d",
+ "metadata": {},
+ "source": [
+ "## 4. Train A CatBoost model with AMT\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c49487ca",
+ "metadata": {},
+ "source": [
+ "### 4.1. Train with Automatic Model Tuning\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0e3350a3",
+ "metadata": {},
+ "source": [
+ "Retrieve Training Artifacts"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a67cce3a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker import image_uris, model_uris, script_uris\n",
+ "\n",
+ "train_model_id, train_model_version, train_scope = \"catboost-classification-model\", \"*\", \"training\"\n",
+ "training_instance_type = \"ml.m5.4xlarge\"\n",
+ "\n",
+ "# Retrieve the docker image\n",
+ "train_image_uri = image_uris.retrieve(\n",
+ " region=None,\n",
+ " framework=None,\n",
+ " model_id=train_model_id,\n",
+ " model_version=train_model_version,\n",
+ " image_scope=train_scope,\n",
+ " instance_type=training_instance_type,\n",
+ ")\n",
+ "# Retrieve the training script\n",
+ "train_source_uri = script_uris.retrieve(\n",
+ " model_id=train_model_id, model_version=train_model_version, script_scope=train_scope\n",
+ ")\n",
+ "# Retrieve the pre-trained model tarball to further fine-tune\n",
+ "train_model_uri = model_uris.retrieve(\n",
+ " model_id=train_model_id, model_version=train_model_version, model_scope=train_scope\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5798369b",
+ "metadata": {},
+ "source": [
+ "Set training parameters"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b7ada7ad",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker import hyperparameters\n",
+ "\n",
+ "# Retrieve the default hyper-parameters for fine-tuning the model\n",
+ "hyperparameters = hyperparameters.retrieve_default(\n",
+ " model_id=train_model_id, model_version=train_model_version\n",
+ ")\n",
+ "\n",
+ "# [Optional] Override default hyperparameters with custom values\n",
+ "hyperparameters[\"iterations\"] = \"500\"\n",
+ "\n",
+ "\n",
+ "hyperparameters[\"eval_metric\"] = \"AUC\"\n",
+ "print(hyperparameters)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0c02b4aa",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "s3_output_location_cat = f\"s3://{bucket}/{output_prefix}/output_cat\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c69d3f7a",
+ "metadata": {},
+ "source": [
+ "Train with Automatic Model Tuning"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0cd0ac41",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "hyperparameter_ranges_cat = {\n",
+ " \"learning_rate\": ContinuousParameter(0.00001, 0.1, scaling_type=\"Logarithmic\"),\n",
+ " \"iterations\": IntegerParameter(50, 1000),\n",
+ " \"depth\": IntegerParameter(1, 10),\n",
+ " \"l2_leaf_reg\": IntegerParameter(1, 10),\n",
+ " \"random_strength\": ContinuousParameter(0.01, 10, scaling_type=\"Logarithmic\"),\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "17053327",
+ "metadata": {},
+ "source": [
+ "Start training"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cb34ccbe",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker.estimator import Estimator\n",
+ "from sagemaker.utils import name_from_base\n",
+ "\n",
+ "training_job_name = name_from_base(f\"jumpstart-{train_model_id}-training\")\n",
+ "\n",
+ "# Create SageMaker Estimator instance\n",
+ "tabular_estimator_cat = Estimator(\n",
+ " role=aws_role,\n",
+ " image_uri=train_image_uri,\n",
+ " source_dir=train_source_uri,\n",
+ " model_uri=train_model_uri,\n",
+ " entry_point=\"transfer_learning.py\",\n",
+ " instance_count=1,\n",
+ " instance_type=training_instance_type,\n",
+ " max_run=360000,\n",
+ " hyperparameters=hyperparameters,\n",
+ " output_path=s3_output_location_cat,\n",
+ ")\n",
+ "\n",
+ "if use_amt:\n",
+ "\n",
+ " tuner_cat = HyperparameterTuner(\n",
+ " tabular_estimator_cat,\n",
+ " \"AUC\",\n",
+ " hyperparameter_ranges_cat,\n",
+ " [{\"Name\": \"AUC\", \"Regex\": \"bestTest = ([0-9\\\\.]+)\"}],\n",
+ " max_jobs=10,\n",
+ " max_parallel_jobs=5,\n",
+ " objective_type=\"Maximize\",\n",
+ " base_tuning_job_name=training_job_name,\n",
+ " )\n",
+ "\n",
+ " tuner_cat.fit({\"training\": training_dataset_s3_path}, logs=True)\n",
+ "else:\n",
+ " # Launch a SageMaker Training job by passing s3 path of the training data\n",
+ " tabular_estimator_cat.fit(\n",
+ " {\"training\": training_dataset_s3_path}, logs=True, job_name=training_job_name\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "33ad5e7a",
+ "metadata": {},
+ "source": [
+ "### 4.2. Deploy and Run Inference on the Trained Tabular Model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2159fc95",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "inference_instance_type = \"ml.m5.large\"\n",
+ "\n",
+ "# Retrieve the inference docker container uri\n",
+ "deploy_image_uri = image_uris.retrieve(\n",
+ " region=None,\n",
+ " framework=None,\n",
+ " image_scope=\"inference\",\n",
+ " model_id=train_model_id,\n",
+ " model_version=train_model_version,\n",
+ " instance_type=inference_instance_type,\n",
+ ")\n",
+ "# Retrieve the inference script uri\n",
+ "deploy_source_uri = script_uris.retrieve(\n",
+ " model_id=train_model_id, model_version=train_model_version, script_scope=\"inference\"\n",
+ ")\n",
+ "\n",
+ "endpoint_name_cat = name_from_base(f\"jumpstart-cat-churn-{train_model_id}-\")\n",
+ "\n",
+ "# Use the estimator from the previous step to deploy to a SageMaker endpoint\n",
+ "predictor_cat = (tuner_cat if use_amt else tabular_estimator_cat).deploy(\n",
+ " initial_instance_count=1,\n",
+ " instance_type=inference_instance_type,\n",
+ " entry_point=\"inference.py\",\n",
+ " image_uri=deploy_image_uri,\n",
+ " source_dir=deploy_source_uri,\n",
+ " endpoint_name=endpoint_name_cat,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fd36650b",
+ "metadata": {},
+ "source": [
+ "Query the endpoint"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fa560463",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# split the test data into smaller size of batches to query the endpoint if the test data has large size.\n",
+ "batch_size = 1500\n",
+ "predict_prob_cat = []\n",
+ "for i in np.arange(0, num_examples, step=batch_size):\n",
+ " query_response_batch = query_endpoint(\n",
+ " features.iloc[i : (i + batch_size), :].to_csv(header=False, index=False).encode(\"utf-8\"),\n",
+ " endpoint_name_cat,\n",
+ " )\n",
+ " predict_prob_batch = parse_response(query_response_batch) # prediction probability per batch\n",
+ " predict_prob_cat.append(predict_prob_batch)\n",
+ "\n",
+ "\n",
+ "predict_prob_cat = np.concatenate(predict_prob_cat, axis=0)\n",
+ "predict_label_cat = np.argmax(predict_prob_cat, axis=1)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3c62c458",
+ "metadata": {},
+ "source": [
+ "Evaluate the prediction results returned from the endpoint"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b012badc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Visualize the predictions results by plotting the confusion matrix.\n",
+ "conf_matrix = confusion_matrix(y_true=ground_truth_label.values, y_pred=predict_label_cat)\n",
+ "fig, ax = plt.subplots(figsize=(7.5, 7.5))\n",
+ "ax.matshow(conf_matrix, cmap=plt.cm.Blues, alpha=0.3)\n",
+ "for i in range(conf_matrix.shape[0]):\n",
+ " for j in range(conf_matrix.shape[1]):\n",
+ " ax.text(x=j, y=i, s=conf_matrix[i, j], va=\"center\", ha=\"center\", size=\"xx-large\")\n",
+ "\n",
+ "plt.xlabel(\"Predictions\", fontsize=18)\n",
+ "plt.ylabel(\"Actuals\", fontsize=18)\n",
+ "plt.title(\"Confusion Matrix\", fontsize=18)\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e1e6c3a0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Measure the prediction results quantitatively.\n",
+ "eval_accuracy_cat = accuracy_score(ground_truth_label.values, predict_label_cat)\n",
+ "eval_f1_cat = f1_score(ground_truth_label.values, predict_label_cat)\n",
+ "eval_auc_cat = roc_auc_score(ground_truth_label.values, predict_prob_cat[:, 1])\n",
+ "\n",
+ "cat_results = pd.DataFrame.from_dict(\n",
+ " {\n",
+ " \"Accuracy\": eval_accuracy_cat,\n",
+ " \"F1\": eval_f1_cat,\n",
+ " \"AUC\": eval_auc_cat,\n",
+ " },\n",
+ " orient=\"index\",\n",
+ " columns=[\"CatBoost with AMT\"],\n",
+ ")\n",
+ "\n",
+ "results_lab_cat = pd.concat([lgb_results, cat_results], axis=1)\n",
+ "results_lab_cat"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "026fc463",
+ "metadata": {},
+ "source": [
+ "## 5. Train A TabTransformer model with AMT"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a618e4af",
+ "metadata": {},
+ "source": [
+ "### 5.1. Train with Automatic Model Tuning"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f20e80bc",
+ "metadata": {},
+ "source": [
+ "Retrieve Training Artifacts"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f420b4d1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_model_id, train_model_version, train_scope = (\n",
+ " \"pytorch-tabtransformerclassification-model\",\n",
+ " \"*\",\n",
+ " \"training\",\n",
+ ")\n",
+ "training_instance_type = \"ml.p3.2xlarge\"\n",
+ "\n",
+ "# Retrieve the docker image\n",
+ "train_image_uri = image_uris.retrieve(\n",
+ " region=None,\n",
+ " framework=None,\n",
+ " model_id=train_model_id,\n",
+ " model_version=train_model_version,\n",
+ " image_scope=train_scope,\n",
+ " instance_type=training_instance_type,\n",
+ ")\n",
+ "# Retrieve the training script\n",
+ "train_source_uri = script_uris.retrieve(\n",
+ " model_id=train_model_id, model_version=train_model_version, script_scope=train_scope\n",
+ ")\n",
+ "# Retrieve the pre-trained model tarball to further fine-tune\n",
+ "train_model_uri = model_uris.retrieve(\n",
+ " model_id=train_model_id, model_version=train_model_version, model_scope=train_scope\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e133b1ed",
+ "metadata": {},
+ "source": [
+ "Set training parameters"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1e4348e5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker import hyperparameters\n",
+ "\n",
+ "# Retrieve the default hyper-parameters for fine-tuning the model\n",
+ "hyperparameters = hyperparameters.retrieve_default(\n",
+ " model_id=train_model_id, model_version=train_model_version\n",
+ ")\n",
+ "\n",
+ "# [Optional] Override default hyperparameters with custom values\n",
+ "hyperparameters[\"n_epochs\"] = 40 # The same hyperparameter is named as \"iterations\" for CatBoost\n",
+ "hyperparameters[\"patience\"] = 10\n",
+ "\n",
+ "print(hyperparameters)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0079c15c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "s3_output_location_tab = f\"s3://{bucket}/{output_prefix}/output_tab\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7bcce249",
+ "metadata": {},
+ "source": [
+ "Train with Automatic Model Tuning"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d9baa338",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker.tuner import (\n",
+ " ContinuousParameter,\n",
+ " IntegerParameter,\n",
+ " HyperparameterTuner,\n",
+ " CategoricalParameter,\n",
+ ")\n",
+ "\n",
+ "hyperparameter_ranges_tab = {\n",
+ " \"learning_rate\": ContinuousParameter(0.001, 0.01, scaling_type=\"Auto\"),\n",
+ " \"batch_size\": CategoricalParameter([64, 128, 256, 512]),\n",
+ " \"attn_dropout\": ContinuousParameter(0.0, 0.8, scaling_type=\"Auto\"),\n",
+ " \"mlp_dropout\": ContinuousParameter(0.0, 0.8, scaling_type=\"Auto\"),\n",
+ " \"input_dim\": CategoricalParameter([\"16\", \"32\", \"64\", \"128\", \"256\"]),\n",
+ " \"frac_shared_embed\": ContinuousParameter(0.0, 0.5, scaling_type=\"Auto\"),\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "edba0682",
+ "metadata": {},
+ "source": [
+ "Start training"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0c1b2be2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "training_job_name = name_from_base(f\"jumpstart-{train_model_id}-training\")\n",
+ "\n",
+ "# Create SageMaker Estimator instance\n",
+ "tabular_estimator_tab = Estimator(\n",
+ " role=aws_role,\n",
+ " image_uri=train_image_uri,\n",
+ " source_dir=train_source_uri,\n",
+ " model_uri=train_model_uri,\n",
+ " entry_point=\"transfer_learning.py\",\n",
+ " instance_count=1,\n",
+ " instance_type=training_instance_type,\n",
+ " max_run=360000,\n",
+ " hyperparameters=hyperparameters,\n",
+ " output_path=s3_output_location_tab,\n",
+ ")\n",
+ "\n",
+ "if use_amt:\n",
+ "\n",
+ " tuner_tab = HyperparameterTuner(\n",
+ " tabular_estimator_tab,\n",
+ " \"f1_score\", # Note, TabTransformer currently does not support AUC score, thus we use its default setting F1 score as an alternative evaluation metric.\n",
+ " hyperparameter_ranges_tab,\n",
+ " [{\"Name\": \"f1_score\", \"Regex\": \"metrics={'f1': (\\\\S+)}\"}],\n",
+ " max_jobs=10,\n",
+ " max_parallel_jobs=5, # reduce max_parallel_jobs number if the instance type is limited in your account\n",
+ " objective_type=\"Maximize\",\n",
+ " base_tuning_job_name=training_job_name,\n",
+ " )\n",
+ "\n",
+ " tuner_tab.fit({\"training\": training_dataset_s3_path}, logs=True)\n",
+ "else:\n",
+ " # Launch a SageMaker Training job by passing s3 path of the training data\n",
+ " tabular_estimator_tab.fit(\n",
+ " {\"training\": training_dataset_s3_path}, logs=True, job_name=training_job_name\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4f5a8b89",
+ "metadata": {},
+ "source": [
+ " \n",
+ "### 5.2. Deploy and Run Inference on the Trained Tabular Model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5d1d6afb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "inference_instance_type = \"ml.m5.2xlarge\"\n",
+ "\n",
+ "# Retrieve the inference docker container uri\n",
+ "deploy_image_uri = image_uris.retrieve(\n",
+ " region=None,\n",
+ " framework=None,\n",
+ " image_scope=\"inference\",\n",
+ " model_id=train_model_id,\n",
+ " model_version=train_model_version,\n",
+ " instance_type=inference_instance_type,\n",
+ ")\n",
+ "# Retrieve the inference script uri\n",
+ "deploy_source_uri = script_uris.retrieve(\n",
+ " model_id=train_model_id, model_version=train_model_version, script_scope=\"inference\"\n",
+ ")\n",
+ "\n",
+ "endpoint_name_tab = name_from_base(f\"jumpstart-tabtransformer-churn-{train_model_id}-\")\n",
+ "\n",
+ "# Use the estimator from the previous step to deploy to a SageMaker endpoint\n",
+ "predictor_tab = (tuner_tab if use_amt else tabular_estimator_tab).deploy(\n",
+ " initial_instance_count=1,\n",
+ " instance_type=inference_instance_type,\n",
+ " entry_point=\"inference.py\",\n",
+ " image_uri=deploy_image_uri,\n",
+ " source_dir=deploy_source_uri,\n",
+ " endpoint_name=endpoint_name_tab,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5e6d70ad",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Split the test data into smaller batches before querying the endpoint, in case the test set is large.\n",
+ "batch_size = 1500\n",
+ "predict_prob_tab = []\n",
+ "for i in np.arange(0, num_examples, step=batch_size):\n",
+ " query_response_batch = query_endpoint(\n",
+ " features.iloc[i : (i + batch_size), :].to_csv(header=False, index=False).encode(\"utf-8\"),\n",
+ " endpoint_name_tab,\n",
+ " )\n",
+ " predict_prob_batch = parse_response(query_response_batch) # prediction probability per batch\n",
+ " predict_prob_tab.append(predict_prob_batch)\n",
+ "\n",
+ "\n",
+ "predict_prob_tab = np.concatenate(predict_prob_tab, axis=0)\n",
+ "predict_label_tab = np.argmax(predict_prob_tab, axis=1)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c7533d36",
+ "metadata": {},
+ "source": [
+ "Evaluate the prediction results returned from the endpoint"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "641f8234",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Visualize the predictions results by plotting the confusion matrix.\n",
+ "conf_matrix = confusion_matrix(y_true=ground_truth_label.values, y_pred=predict_label_tab)\n",
+ "fig, ax = plt.subplots(figsize=(7.5, 7.5))\n",
+ "ax.matshow(conf_matrix, cmap=plt.cm.Blues, alpha=0.3)\n",
+ "for i in range(conf_matrix.shape[0]):\n",
+ " for j in range(conf_matrix.shape[1]):\n",
+ " ax.text(x=j, y=i, s=conf_matrix[i, j], va=\"center\", ha=\"center\", size=\"xx-large\")\n",
+ "\n",
+ "plt.xlabel(\"Predictions\", fontsize=18)\n",
+ "plt.ylabel(\"Actuals\", fontsize=18)\n",
+ "plt.title(\"Confusion Matrix\", fontsize=18)\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "17e29efd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Measure the prediction results quantitatively.\n",
+ "eval_accuracy_tab = accuracy_score(ground_truth_label.values, predict_label_tab)\n",
+ "eval_f1_tab = f1_score(ground_truth_label.values, predict_label_tab)\n",
+ "eval_auc_tab = roc_auc_score(ground_truth_label.values, predict_prob_tab[:, 1])\n",
+ "\n",
+ "tab_results = pd.DataFrame.from_dict(\n",
+ " {\n",
+ " \"Accuracy\": eval_accuracy_tab,\n",
+ " \"F1\": eval_f1_tab,\n",
+ " \"AUC\": eval_auc_tab,\n",
+ " },\n",
+ " orient=\"index\",\n",
+ " columns=[\"TabTransformer with AMT\"],\n",
+ ")\n",
+ "\n",
+ "results_lab_cat_tab = pd.concat([results_lab_cat, tab_results], axis=1)\n",
+ "results_lab_cat_tab"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ea964d81",
+ "metadata": {},
+ "source": [
+ "## 6. Train An AutoGluon-Tabular model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2c1fd4df",
+ "metadata": {},
+ "source": [
+ "### 6.1. Train with AutoGluon-Tabular model\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d9c6393b",
+ "metadata": {},
+ "source": [
+ "Retrieve Training Artifacts"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b247833f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker import image_uris, model_uris, script_uris\n",
+ "\n",
+ "# Manually select the JumpStart model to train. Here we use the AutoGluon-Tabular\n",
+ "# classification ensemble model.\n",
+ "train_model_id, train_model_version, train_scope = (\n",
+ " \"autogluon-classification-ensemble\",\n",
+ " \"*\",\n",
+ " \"training\",\n",
+ ")\n",
+ "training_instance_type = \"ml.g4dn.2xlarge\" # set a different GPU type to avoid instance insufficiency for p3 instance that is used by TabTransformer\n",
+ "\n",
+ "# Retrieve the docker image\n",
+ "train_image_uri = image_uris.retrieve(\n",
+ " region=None,\n",
+ " framework=None,\n",
+ " model_id=train_model_id,\n",
+ " model_version=train_model_version,\n",
+ " image_scope=train_scope,\n",
+ " instance_type=training_instance_type,\n",
+ ")\n",
+ "# Retrieve the training script\n",
+ "train_source_uri = script_uris.retrieve(\n",
+ " model_id=train_model_id, model_version=train_model_version, script_scope=train_scope\n",
+ ")\n",
+ "# Retrieve the pre-trained model tarball to further fine-tune\n",
+ "train_model_uri = model_uris.retrieve(\n",
+ " model_id=train_model_id, model_version=train_model_version, model_scope=train_scope\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c288f5a8",
+ "metadata": {},
+ "source": [
+ "Set training parameters"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "577586e1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker import hyperparameters\n",
+ "\n",
+ "# Retrieve the default hyper-parameters for fine-tuning the model\n",
+ "hyperparameters = hyperparameters.retrieve_default(\n",
+ " model_id=train_model_id, model_version=train_model_version\n",
+ ")\n",
+ "\n",
+ "hyperparameters[\"eval_metric\"] = \"roc_auc\"\n",
+ "print(hyperparameters)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "55b4b386",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "s3_output_location_ag = f\"s3://{bucket}/{output_prefix}/output_ag\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "278f7178",
+ "metadata": {},
+ "source": [
+ "Start training\n",
+ "\n",
+ "Note: We do not perform automatic model tuning because AutoGluon-Tabular does not focus on hyperparameter selection. Instead, it ensembles multiple models and stacks them in multiple layers. For details, see the [AutoGluon-Tabular paper](https://arxiv.org/abs/2003.06505)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c07b6103",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker.estimator import Estimator\n",
+ "from sagemaker.utils import name_from_base\n",
+ "\n",
+ "training_job_name = name_from_base(f\"jumpstart-{train_model_id}-training\")\n",
+ "\n",
+ "# Create SageMaker Estimator instance\n",
+ "tabular_estimator_ag = Estimator(\n",
+ " role=aws_role,\n",
+ " image_uri=train_image_uri,\n",
+ " source_dir=train_source_uri,\n",
+ " model_uri=train_model_uri,\n",
+ " entry_point=\"transfer_learning.py\",\n",
+ " instance_count=1,\n",
+ " instance_type=training_instance_type,\n",
+ " max_run=360000,\n",
+ " hyperparameters=hyperparameters,\n",
+ " output_path=s3_output_location_ag,\n",
+ ")\n",
+ "\n",
+ "\n",
+ "# Launch a SageMaker Training job by passing s3 path of the training data\n",
+ "tabular_estimator_ag.fit(\n",
+ " {\"training\": training_dataset_s3_path}, logs=True, job_name=training_job_name\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d6b8361d",
+ "metadata": {},
+ "source": [
+ "### 6.2. Deploy and Run Inference on the Trained Tabular Model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f6dc44a3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "inference_instance_type = \"ml.m5.2xlarge\"\n",
+ "\n",
+ "# Retrieve the inference docker container uri\n",
+ "deploy_image_uri = image_uris.retrieve(\n",
+ " region=None,\n",
+ " framework=None,\n",
+ " image_scope=\"inference\",\n",
+ " model_id=train_model_id,\n",
+ " model_version=train_model_version,\n",
+ " instance_type=inference_instance_type,\n",
+ ")\n",
+ "# Retrieve the inference script uri\n",
+ "deploy_source_uri = script_uris.retrieve(\n",
+ " model_id=train_model_id, model_version=train_model_version, script_scope=\"inference\"\n",
+ ")\n",
+ "\n",
+ "endpoint_name_ag = name_from_base(f\"jumpstart-ag-churn-{train_model_id}-\")\n",
+ "\n",
+ "# Use the estimator from the previous step to deploy to a SageMaker endpoint\n",
+ "predictor_ag = tabular_estimator_ag.deploy(\n",
+ " initial_instance_count=1,\n",
+ " instance_type=inference_instance_type,\n",
+ " entry_point=\"inference.py\",\n",
+ " image_uri=deploy_image_uri,\n",
+ " source_dir=deploy_source_uri,\n",
+ " endpoint_name=endpoint_name_ag,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c5cf7b37",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Split the test data into smaller batches before querying the endpoint, in case the test set is large.\n",
+ "batch_size = 1500\n",
+ "predict_prob_ag = []\n",
+ "for i in np.arange(0, num_examples, step=batch_size):\n",
+ " query_response_batch = query_endpoint(\n",
+ " features.iloc[i : (i + batch_size), :].to_csv(header=False, index=False).encode(\"utf-8\"),\n",
+ " endpoint_name_ag,\n",
+ " )\n",
+ " predict_prob_batch = parse_response(query_response_batch) # prediction probability per batch\n",
+ " predict_prob_ag.append(predict_prob_batch)\n",
+ "\n",
+ "\n",
+ "predict_prob_ag = np.concatenate(predict_prob_ag, axis=0)\n",
+ "predict_label_ag = np.argmax(predict_prob_ag, axis=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4d86ccb2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Visualize the predictions results by plotting the confusion matrix.\n",
+ "conf_matrix = confusion_matrix(y_true=ground_truth_label.values, y_pred=predict_label_ag)\n",
+ "fig, ax = plt.subplots(figsize=(7.5, 7.5))\n",
+ "ax.matshow(conf_matrix, cmap=plt.cm.Blues, alpha=0.3)\n",
+ "for i in range(conf_matrix.shape[0]):\n",
+ " for j in range(conf_matrix.shape[1]):\n",
+ " ax.text(x=j, y=i, s=conf_matrix[i, j], va=\"center\", ha=\"center\", size=\"xx-large\")\n",
+ "\n",
+ "plt.xlabel(\"Predictions\", fontsize=18)\n",
+ "plt.ylabel(\"Actuals\", fontsize=18)\n",
+ "plt.title(\"Confusion Matrix\", fontsize=18)\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2bfbab51",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Measure the prediction results quantitatively.\n",
+ "eval_accuracy_ag = accuracy_score(ground_truth_label.values, predict_label_ag)\n",
+ "eval_f1_ag = f1_score(ground_truth_label.values, predict_label_ag)\n",
+ "eval_auc_ag = roc_auc_score(ground_truth_label.values, predict_prob_ag[:, 1])\n",
+ "\n",
+ "ag_results = pd.DataFrame.from_dict(\n",
+ " {\n",
+ " \"Accuracy\": eval_accuracy_ag,\n",
+ " \"F1\": eval_f1_ag,\n",
+ " \"AUC\": eval_auc_ag,\n",
+ " },\n",
+ " orient=\"index\",\n",
+ " columns=[\"AutoGluon-Tabular\"],\n",
+ ")\n",
+ "\n",
+ "results_lab_cat_tab_ag = pd.concat([results_lab_cat_tab, ag_results], axis=1)\n",
+ "results_lab_cat_tab_ag"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cbbfc102",
+ "metadata": {},
+ "source": [
+ "## 7. Compare Prediction Results of Four Trained Models on the Same Test Data\n",
+ "\n",
+ "For all three evaluation metrics (accuracy, F1 score, and ROC AUC), a larger value indicates a better result."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "25ebee1c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "results_lab_cat_tab_ag"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b7f3a1eb",
+ "metadata": {},
+ "source": [
+ "Now you can use this template to evaluate the performance of LightGBM, CatBoost, TabTransformer, and AutoGluon-Tabular on your own dataset."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "95194916",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "Next, we delete the endpoints and the associated models for all four trained models.\n",
+ "\n",
+ "---"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8491a547",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Delete every SageMaker endpoint created above, along with its attached model\n",
+ "predictor.delete_model()\n",
+ "predictor.delete_endpoint()\n",
+ "predictor_cat.delete_model()\n",
+ "predictor_cat.delete_endpoint()\n",
+ "predictor_tab.delete_model()\n",
+ "predictor_tab.delete_endpoint()\n",
+ "predictor_ag.delete_model()\n",
+ "predictor_ag.delete_endpoint()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "conda_python3",
+ "language": "python",
+ "name": "conda_python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/licenses/2-CLAUSE-BSD b/licenses/2-CLAUSE-BSD
new file mode 100644
index 0000000000..b79ea4a15c
--- /dev/null
+++ b/licenses/2-CLAUSE-BSD
@@ -0,0 +1,28 @@
+2-Clause BSD License
+=====================
+
+Copyright (c) 2007-2020 by the Sphinx team (see AUTHORS file).
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
diff --git a/r_examples/r_api_serving_examples/API Serving Examples.ipynb b/r_examples/r_api_serving_examples/API Serving Examples.ipynb
deleted file mode 100644
index cb85c5bac6..0000000000
--- a/r_examples/r_api_serving_examples/API Serving Examples.ipynb
+++ /dev/null
@@ -1,610 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# R API Serving Examples\n",
- "\n",
- "In this example, we demonstrate how to quickly compare the runtimes of three methods for serving a model from an R hosted REST API. The following SageMaker examples discuss each method in detail:\n",
- "\n",
- "* **Plumber**\n",
- " * Website: [https://www.rplumber.io/](https://www.rplumber.io)\n",
- " * SageMaker Example: [r_serving_with_plumber](../r_serving_with_plumber)\n",
- "* **RestRServe**\n",
- " * Website: [https://restrserve.org](https://restrserve.org)\n",
- " * SageMaker Example: [r_serving_with_restrserve](../r_serving_with_restrserve)\n",
- "* **FastAPI** (reticulated from Python)\n",
- " * Website: [https://fastapi.tiangolo.com](https://fastapi.tiangolo.com)\n",
- " * SageMaker Example: [r_serving_with_fastapi](../r_serving_with_fastapi)\n",
- " \n",
- "We will reuse the docker images from each of these examples. Each one is configured to serve a small XGBoost model which has already been trained on the classical Iris dataset."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Building Docker Images for Serving\n",
- "\n",
- "First, we will build each docker image from the provided SageMaker Examples."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Plumber Serving Image"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "scrolled": true
- },
- "outputs": [],
- "source": [
- "!cd .. && docker build -t r-plumber -f r_serving_with_plumber/Dockerfile r_serving_with_plumber"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### RestRServe Serving Image"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "scrolled": true
- },
- "outputs": [],
- "source": [
- "!cd .. && docker build -t r-restrserve -f r_serving_with_restrserve/Dockerfile r_serving_with_restrserve"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### FastAPI Serving Image"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "scrolled": true
- },
- "outputs": [],
- "source": [
- "!cd .. && docker build -t r-fastapi -f r_serving_with_fastapi/Dockerfile r_serving_with_fastapi"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Launch Serving Containers"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Next, we will launch each search container. The containers will be launch on the following ports to avoid port collisions on your local machine or SageMaker Notebook instance:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "ports = {\n",
- " \"plumber\": 5000,\n",
- " \"restrserve\": 5001,\n",
- " \"fastapi\": 5002,\n",
- "}"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "!bash launch.sh"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "!docker container list"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Define Simple Client"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import requests\n",
- "from tqdm import tqdm\n",
- "import pandas as pd"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "def get_predictions(examples, instance=requests, port=5000):\n",
- " payload = {\"features\": examples}\n",
- " return instance.post(f\"http://127.0.0.1:{port}/invocations\", json=payload)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "def get_health(instance=requests, port=5000):\n",
- " instance.get(f\"http://127.0.0.1:{port}/ping\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Define Example Inputs"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Next, we define a example inputs from the classical [Iris](https://archive.ics.uci.edu/ml/datasets/iris) dataset.\n",
- "* Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "column_names = [\"Sepal.Length\", \"Sepal.Width\", \"Petal.Length\", \"Petal.Width\", \"Label\"]\n",
- "iris = pd.read_csv(\n",
- " \"s3://sagemaker-sample-files/datasets/tabular/iris/iris.data\", names=column_names\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "iris_features = iris[[\"Sepal.Length\", \"Sepal.Width\", \"Petal.Length\", \"Petal.Width\"]]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "example = iris_features.values[:1].tolist()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "many_examples = iris_features.values[:100].tolist()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Testing\n",
- "\n",
- "Now it's time to test how each API server performs under stress."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We will test two use cases:\n",
- "* **New Requests**: In this scenario, we test how quickly the server can respond with predictions when each client request establishes a new connection with the server. This simulates the server's ability to handle real-time requests. We could make this more realistic by creating an asynchronous environment that tests the server's ability to fulfill concurrent rather than sequential requests.\n",
- "* **Keep Alive / Reuse Session**: In this scenario, we test how quickly the server can respond with predictions when each client request uses a session to keep its connection to the server alive between requests. This simulates the server's ability to handle sequential batch requests from the same client."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "For each of the two use cases, we will test the performance on following situations:\n",
- "\n",
- "* 1000 requests of a single example\n",
- "* 1000 requests of 100 examples\n",
- "* 1000 pings for health status"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## New Requests"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Plumber"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# verify the prediction output\n",
- "get_predictions(example, port=ports[\"plumber\"]).json()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "for i in tqdm(range(1000)):\n",
- " _ = get_predictions(example, port=ports[\"plumber\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "for i in tqdm(range(1000)):\n",
- " _ = get_predictions(many_examples, port=ports[\"plumber\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "for i in tqdm(range(1000)):\n",
- " get_health(port=ports[\"plumber\"])"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### RestRserve"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# verify the prediction output\n",
- "get_predictions(example, port=ports[\"restrserve\"]).json()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "for i in tqdm(range(1000)):\n",
- " _ = get_predictions(example, port=ports[\"restrserve\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "for i in tqdm(range(1000)):\n",
- " _ = get_predictions(many_examples, port=ports[\"restrserve\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "for i in tqdm(range(1000)):\n",
- " get_health(port=ports[\"restrserve\"])"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### FastAPI"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# verify the prediction output\n",
- "get_predictions(example, port=ports[\"fastapi\"]).json()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "for i in tqdm(range(1000)):\n",
- " _ = get_predictions(example, port=ports[\"fastapi\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "for i in tqdm(range(1000)):\n",
- " _ = get_predictions(many_examples, port=ports[\"fastapi\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "for i in tqdm(range(1000)):\n",
- " get_health(port=ports[\"fastapi\"])"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Keep Alive (Reuse Session)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Now, let's test how each one performs when each request reuses a session connection. "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# reuse the session for each post and get request\n",
- "instance = requests.Session()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Plumber"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "for i in tqdm(range(1000)):\n",
- " _ = get_predictions(example, instance=instance, port=ports[\"plumber\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "for i in tqdm(range(1000)):\n",
- " _ = get_predictions(many_examples, instance=instance, port=ports[\"plumber\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "for i in tqdm(range(1000)):\n",
- " get_health(instance=instance, port=ports[\"plumber\"])"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### RestRserve"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "for i in tqdm(range(1000)):\n",
- " _ = get_predictions(example, instance=instance, port=ports[\"restrserve\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "for i in tqdm(range(1000)):\n",
- " _ = get_predictions(many_examples, instance=instance, port=ports[\"restrserve\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "for i in tqdm(range(1000)):\n",
- " get_health(instance=instance, port=ports[\"restrserve\"])"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### FastAPI"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "for i in tqdm(range(1000)):\n",
- " _ = get_predictions(example, instance=instance, port=ports[\"fastapi\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "for i in tqdm(range(1000)):\n",
- " _ = get_predictions(many_examples, instance=instance, port=ports[\"fastapi\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "for i in tqdm(range(1000)):\n",
- " get_health(instance=instance, port=ports[\"fastapi\"])"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Stop All Serving Containers"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Finally, we will shut down the serving containers we launched for the tests."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "!docker kill $(docker ps -q)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Conclusion"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "In this example, we demonstrated how to conduct a simple performance benchmark across three R model serving solutions. We leave the choice of serving solution up to the reader since in some cases it might be appropriate to customize the benchmark in the following ways:\n",
- "\n",
- "* Update the serving example to serve a specific model\n",
- "* Perform the tests across multiple instances types\n",
- "* Modify the serving example and client to test asynchronous requests.\n",
- "* Deploy the serving examples to SageMaker Endpoints to test within an autoscaling environment.\n",
- "\n",
- "For more information on serving your models in custom containers on SageMaker, please see our [support documentation](https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-inference-main.html) for the latest updates and best practices."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "conda_python3",
- "language": "python",
- "name": "conda_python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.13"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/r_examples/r_api_serving_examples/iris.csv b/r_examples/r_api_serving_examples/iris.csv
deleted file mode 100644
index 8b6393099a..0000000000
--- a/r_examples/r_api_serving_examples/iris.csv
+++ /dev/null
@@ -1,151 +0,0 @@
-Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
-5.1,3.5,1.4,0.2,setosa
-4.9,3,1.4,0.2,setosa
-4.7,3.2,1.3,0.2,setosa
-4.6,3.1,1.5,0.2,setosa
-5,3.6,1.4,0.2,setosa
-5.4,3.9,1.7,0.4,setosa
-4.6,3.4,1.4,0.3,setosa
-5,3.4,1.5,0.2,setosa
-4.4,2.9,1.4,0.2,setosa
-4.9,3.1,1.5,0.1,setosa
-5.4,3.7,1.5,0.2,setosa
-4.8,3.4,1.6,0.2,setosa
-4.8,3,1.4,0.1,setosa
-4.3,3,1.1,0.1,setosa
-5.8,4,1.2,0.2,setosa
-5.7,4.4,1.5,0.4,setosa
-5.4,3.9,1.3,0.4,setosa
-5.1,3.5,1.4,0.3,setosa
-5.7,3.8,1.7,0.3,setosa
-5.1,3.8,1.5,0.3,setosa
-5.4,3.4,1.7,0.2,setosa
-5.1,3.7,1.5,0.4,setosa
-4.6,3.6,1,0.2,setosa
-5.1,3.3,1.7,0.5,setosa
-4.8,3.4,1.9,0.2,setosa
-5,3,1.6,0.2,setosa
-5,3.4,1.6,0.4,setosa
-5.2,3.5,1.5,0.2,setosa
-5.2,3.4,1.4,0.2,setosa
-4.7,3.2,1.6,0.2,setosa
-4.8,3.1,1.6,0.2,setosa
-5.4,3.4,1.5,0.4,setosa
-5.2,4.1,1.5,0.1,setosa
-5.5,4.2,1.4,0.2,setosa
-4.9,3.1,1.5,0.2,setosa
-5,3.2,1.2,0.2,setosa
-5.5,3.5,1.3,0.2,setosa
-4.9,3.6,1.4,0.1,setosa
-4.4,3,1.3,0.2,setosa
-5.1,3.4,1.5,0.2,setosa
-5,3.5,1.3,0.3,setosa
-4.5,2.3,1.3,0.3,setosa
-4.4,3.2,1.3,0.2,setosa
-5,3.5,1.6,0.6,setosa
-5.1,3.8,1.9,0.4,setosa
-4.8,3,1.4,0.3,setosa
-5.1,3.8,1.6,0.2,setosa
-4.6,3.2,1.4,0.2,setosa
-5.3,3.7,1.5,0.2,setosa
-5,3.3,1.4,0.2,setosa
-7,3.2,4.7,1.4,versicolor
-6.4,3.2,4.5,1.5,versicolor
-6.9,3.1,4.9,1.5,versicolor
-5.5,2.3,4,1.3,versicolor
-6.5,2.8,4.6,1.5,versicolor
-5.7,2.8,4.5,1.3,versicolor
-6.3,3.3,4.7,1.6,versicolor
-4.9,2.4,3.3,1,versicolor
-6.6,2.9,4.6,1.3,versicolor
-5.2,2.7,3.9,1.4,versicolor
-5,2,3.5,1,versicolor
-5.9,3,4.2,1.5,versicolor
-6,2.2,4,1,versicolor
-6.1,2.9,4.7,1.4,versicolor
-5.6,2.9,3.6,1.3,versicolor
-6.7,3.1,4.4,1.4,versicolor
-5.6,3,4.5,1.5,versicolor
-5.8,2.7,4.1,1,versicolor
-6.2,2.2,4.5,1.5,versicolor
-5.6,2.5,3.9,1.1,versicolor
-5.9,3.2,4.8,1.8,versicolor
-6.1,2.8,4,1.3,versicolor
-6.3,2.5,4.9,1.5,versicolor
-6.1,2.8,4.7,1.2,versicolor
-6.4,2.9,4.3,1.3,versicolor
-6.6,3,4.4,1.4,versicolor
-6.8,2.8,4.8,1.4,versicolor
-6.7,3,5,1.7,versicolor
-6,2.9,4.5,1.5,versicolor
-5.7,2.6,3.5,1,versicolor
-5.5,2.4,3.8,1.1,versicolor
-5.5,2.4,3.7,1,versicolor
-5.8,2.7,3.9,1.2,versicolor
-6,2.7,5.1,1.6,versicolor
-5.4,3,4.5,1.5,versicolor
-6,3.4,4.5,1.6,versicolor
-6.7,3.1,4.7,1.5,versicolor
-6.3,2.3,4.4,1.3,versicolor
-5.6,3,4.1,1.3,versicolor
-5.5,2.5,4,1.3,versicolor
-5.5,2.6,4.4,1.2,versicolor
-6.1,3,4.6,1.4,versicolor
-5.8,2.6,4,1.2,versicolor
-5,2.3,3.3,1,versicolor
-5.6,2.7,4.2,1.3,versicolor
-5.7,3,4.2,1.2,versicolor
-5.7,2.9,4.2,1.3,versicolor
-6.2,2.9,4.3,1.3,versicolor
-5.1,2.5,3,1.1,versicolor
-5.7,2.8,4.1,1.3,versicolor
-6.3,3.3,6,2.5,virginica
-5.8,2.7,5.1,1.9,virginica
-7.1,3,5.9,2.1,virginica
-6.3,2.9,5.6,1.8,virginica
-6.5,3,5.8,2.2,virginica
-7.6,3,6.6,2.1,virginica
-4.9,2.5,4.5,1.7,virginica
-7.3,2.9,6.3,1.8,virginica
-6.7,2.5,5.8,1.8,virginica
-7.2,3.6,6.1,2.5,virginica
-6.5,3.2,5.1,2,virginica
-6.4,2.7,5.3,1.9,virginica
-6.8,3,5.5,2.1,virginica
-5.7,2.5,5,2,virginica
-5.8,2.8,5.1,2.4,virginica
-6.4,3.2,5.3,2.3,virginica
-6.5,3,5.5,1.8,virginica
-7.7,3.8,6.7,2.2,virginica
-7.7,2.6,6.9,2.3,virginica
-6,2.2,5,1.5,virginica
-6.9,3.2,5.7,2.3,virginica
-5.6,2.8,4.9,2,virginica
-7.7,2.8,6.7,2,virginica
-6.3,2.7,4.9,1.8,virginica
-6.7,3.3,5.7,2.1,virginica
-7.2,3.2,6,1.8,virginica
-6.2,2.8,4.8,1.8,virginica
-6.1,3,4.9,1.8,virginica
-6.4,2.8,5.6,2.1,virginica
-7.2,3,5.8,1.6,virginica
-7.4,2.8,6.1,1.9,virginica
-7.9,3.8,6.4,2,virginica
-6.4,2.8,5.6,2.2,virginica
-6.3,2.8,5.1,1.5,virginica
-6.1,2.6,5.6,1.4,virginica
-7.7,3,6.1,2.3,virginica
-6.3,3.4,5.6,2.4,virginica
-6.4,3.1,5.5,1.8,virginica
-6,3,4.8,1.8,virginica
-6.9,3.1,5.4,2.1,virginica
-6.7,3.1,5.6,2.4,virginica
-6.9,3.1,5.1,2.3,virginica
-5.8,2.7,5.1,1.9,virginica
-6.8,3.2,5.9,2.3,virginica
-6.7,3.3,5.7,2.5,virginica
-6.7,3,5.2,2.3,virginica
-6.3,2.5,5,1.9,virginica
-6.5,3,5.2,2,virginica
-6.2,3.4,5.4,2.3,virginica
-5.9,3,5.1,1.8,virginica
diff --git a/r_examples/r_api_serving_examples/launch.sh b/r_examples/r_api_serving_examples/launch.sh
deleted file mode 100644
index e456602d35..0000000000
--- a/r_examples/r_api_serving_examples/launch.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/bin/bash
-
-echo "Launching Plumber"
-docker run -d --rm -p 5000:8080 r-plumber
-
-echo "Launching RestRServer"
-docker run -d --rm -p 5001:8080 r-restrserve
-
-echo "Launching FastAPI"
-docker run -d --rm -p 5002:8080 r-fastapi
-
diff --git a/r_examples/r_serving_with_fastapi/FastAPI Example.ipynb b/r_examples/r_serving_with_fastapi/FastAPI_Example.ipynb
similarity index 100%
rename from r_examples/r_serving_with_fastapi/FastAPI Example.ipynb
rename to r_examples/r_serving_with_fastapi/FastAPI_Example.ipynb
diff --git a/r_examples/r_serving_with_plumber/Plumber Example.ipynb b/r_examples/r_serving_with_plumber/Plumber_Example.ipynb
similarity index 100%
rename from r_examples/r_serving_with_plumber/Plumber Example.ipynb
rename to r_examples/r_serving_with_plumber/Plumber_Example.ipynb
diff --git a/r_examples/r_serving_with_restrserve/Dockerfile b/r_examples/r_serving_with_restrserve/Dockerfile
index 69dc88b8c2..5aaaf57689 100644
--- a/r_examples/r_serving_with_restrserve/Dockerfile
+++ b/r_examples/r_serving_with_restrserve/Dockerfile
@@ -2,7 +2,9 @@ FROM r-base:3.6.3
MAINTAINER Amazon SageMaker Examples
-RUN R -e "install.packages(c('RestRserve','xgboost','dplyr'), repos='https://cloud.r-project.org')"
+RUN R -e "install.packages(c('RestRserve','data.table', 'stringi', 'dplyr'), repos='https://cloud.r-project.org')"
+RUN wget http://cran.r-project.org/src/contrib/Archive/xgboost/xgboost_1.4.1.1.tar.gz
+RUN R CMD INSTALL xgboost_1.4.1.1.tar.gz
COPY xgb.model /opt/ml/xgb.model
COPY restrserve.R /opt/ml/restrserve.R
diff --git a/r_examples/r_serving_with_restrserve/RestRServe Example.ipynb b/r_examples/r_serving_with_restrserve/RestRServe_Example.ipynb
similarity index 100%
rename from r_examples/r_serving_with_restrserve/RestRServe Example.ipynb
rename to r_examples/r_serving_with_restrserve/RestRServe_Example.ipynb
diff --git a/sagemaker-inference-recommender/inference-recommender.ipynb b/sagemaker-inference-recommender/inference-recommender.ipynb
index bbbbf958fd..95eb54c7e8 100644
--- a/sagemaker-inference-recommender/inference-recommender.ipynb
+++ b/sagemaker-inference-recommender/inference-recommender.ipynb
@@ -24,7 +24,7 @@
"source": [
"## 2. Setup \n",
"\n",
- "Note that we are using the `conda_tensorflow2_p36` kernel in SageMaker Notebook Instances. This is running Python 3.6 and TensorFlow 2.1.3. If you'd like to use the same setup, in the AWS Management Console, go to the Amazon SageMaker console. Choose Notebook Instances, and click create a new notebook instance. Upload the current notebook and set the kernel. You can also run this in SageMaker Studio Notebooks with the `TensorFlow 2.1 Python 3.6 CPU Optimized` kernel.\n",
+ "Note that we are using the `conda_tensorflow2_p36` kernel in SageMaker Notebook Instances. This is running Python 3.6 and TensorFlow 2.1.3. If you'd like to use the same setup, in the AWS Management Console, go to the Amazon SageMaker console. Choose Notebook Instances, and click create a new notebook instance. Upload the current notebook and set the kernel. You can also run this in SageMaker Studio Notebooks with the `TensorFlow 2.6 Python 3.8 CPU Optimized` kernel.\n",
"\n",
"In the next steps, you'll import standard methods and libraries as well as set variables that will be used in this notebook. The `get_execution_role` function retrieves the AWS Identity and Access Management (IAM) role you created at the time of creating your notebook instance."
]
diff --git a/sagemaker-pipelines/tabular/abalone_build_train_deploy/sagemaker-pipelines-preprocess-train-evaluate-batch-transform_outputs.ipynb b/sagemaker-pipelines/tabular/abalone_build_train_deploy/sagemaker-pipelines-preprocess-train-evaluate-batch-transform_outputs.ipynb
index d20c0f6d83..21102fda26 100644
--- a/sagemaker-pipelines/tabular/abalone_build_train_deploy/sagemaker-pipelines-preprocess-train-evaluate-batch-transform_outputs.ipynb
+++ b/sagemaker-pipelines/tabular/abalone_build_train_deploy/sagemaker-pipelines-preprocess-train-evaluate-batch-transform_outputs.ipynb
@@ -272,9 +272,9 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\r\n",
- "\u001b[33mWARNING: You are using pip version 21.1.3; however, version 22.1.2 is available.\r\n",
- "You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.\u001b[0m\r\n"
+ "\u001B[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001B[0m\r\n",
+ "\u001B[33mWARNING: You are using pip version 21.1.3; however, version 22.1.2 is available.\r\n",
+ "You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.\u001B[0m\r\n"
]
}
],
@@ -3441,4 +3441,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
-}
+}
\ No newline at end of file
diff --git a/sagemaker-pipelines/tabular/custom_callback_pipelines_step/sagemaker-pipelines-callback-step.ipynb b/sagemaker-pipelines/tabular/custom_callback_pipelines_step/sagemaker-pipelines-callback-step.ipynb
index a36145affd..263936108a 100644
--- a/sagemaker-pipelines/tabular/custom_callback_pipelines_step/sagemaker-pipelines-callback-step.ipynb
+++ b/sagemaker-pipelines/tabular/custom_callback_pipelines_step/sagemaker-pipelines-callback-step.ipynb
@@ -899,7 +899,7 @@
"metadata": {},
"outputs": [],
"source": [
- "!pip install \"sagemaker==2.91.1\""
+ "!pip install \"sagemaker>=2.99.0\""
]
},
{
@@ -977,6 +977,7 @@
"outputs": [],
"source": [
"from sagemaker.workflow.callback_step import CallbackStep, CallbackOutput, CallbackOutputTypeEnum\n",
+ "from sagemaker.workflow.functions import Join\n",
"\n",
"callback1_output = CallbackOutput(\n",
" output_name=\"s3_data_out\", output_type=CallbackOutputTypeEnum.String\n",
@@ -987,7 +988,9 @@
" sqs_queue_url=queue_url,\n",
" inputs={\n",
" \"input_location\": f\"s3://{default_bucket}/{taxi_prefix}/\",\n",
- " \"output_location\": f\"s3://{default_bucket}/{taxi_prefix}_{id_out}/\",\n",
+ " \"output_location\": Join(\n",
+ " on=\"/\", values=[\"s3:/\", default_bucket, f\"{taxi_prefix}_output\", id_out]\n",
+ " ),\n",
" },\n",
" outputs=[callback1_output],\n",
")"
@@ -1000,9 +1003,9 @@
"source": [
"#### 2 - Training Step \n",
"\n",
- "Next, we'll configure the training step by first configuring the estimator for random cut forest. Then, we'll configure the training step. \n",
+ "Next, we'll configure the training step by first configuring the estimator for random cut forest. Then, we use the output of the estimator's .fit() method as arguments to the TrainingStep. By passing the pipeline_session to the sagemaker_session, calling .fit() does not launch the training job. Instead, it returns the arguments needed to run the job as a step in the pipeline.\n",
"\n",
- "The training step will accept the following **inputs**: \n",
+ "To generate the step arguments for the training step, it will accept the following **inputs**: \n",
" * S3 location of processed data to be used for model training\n",
" * ECR containing the training image for rcf\n",
" * Estimator configuration\n",
@@ -1018,6 +1021,8 @@
"metadata": {},
"outputs": [],
"source": [
+ "from sagemaker.workflow.pipeline_context import PipelineSession\n",
+ "\n",
"containers = {\n",
" \"us-west-2\": \"174872318107.dkr.ecr.us-west-2.amazonaws.com/randomcutforest:latest\",\n",
" \"us-east-1\": \"382416733822.dkr.ecr.us-east-1.amazonaws.com/randomcutforest:latest\",\n",
@@ -1028,7 +1033,7 @@
"container = containers[region_name]\n",
"model_prefix = \"model\"\n",
"\n",
- "session = sagemaker.Session()\n",
+ "pipeline_session = PipelineSession()\n",
"\n",
"rcf = sagemaker.estimator.Estimator(\n",
" container,\n",
@@ -1036,7 +1041,7 @@
" output_path=\"s3://{}/{}/output\".format(default_bucket, model_prefix),\n",
" instance_count=training_instance_count,\n",
" instance_type=\"ml.c5.xlarge\",\n",
- " sagemaker_session=session,\n",
+ " sagemaker_session=pipeline_session,\n",
")\n",
"\n",
"rcf.set_hyperparameters(num_samples_per_tree=200, num_trees=50, feature_dim=1)"
@@ -1052,9 +1057,7 @@
"from sagemaker.inputs import TrainingInput\n",
"from sagemaker.workflow.steps import TrainingStep\n",
"\n",
- "step_train = TrainingStep(\n",
- " name=\"TrainModel\",\n",
- " estimator=rcf,\n",
+ "train_step_args = rcf.fit(\n",
" inputs={\n",
" \"train\": TrainingInput(\n",
" # s3_data = Output of the previous call back step\n",
@@ -1063,6 +1066,10 @@
" distribution=\"ShardedByS3Key\",\n",
" ),\n",
" },\n",
+ ")\n",
+ "step_train = TrainingStep(\n",
+ " name=\"TrainModel\",\n",
+ " step_args=train_step_args,\n",
")"
]
},
@@ -1073,9 +1080,9 @@
"source": [
"#### 3 - Create Model\n",
"\n",
- "Next, we'll package the trained model for deployment. \n",
+ "Next, we'll package the trained model for deployment. To achieve this, we define the ModelStep by providing the return values from `model.create()` as the step arguments. Similarly, the `pipeline_session` is required when defining the model, which puts off the model creation to the pipeline execution time.\n",
"\n",
- "The create model step will accept the following **inputs**: \n",
+ "To generate the step arguments for the model step, it will accept the following **inputs**: \n",
" * S3 location of the trained model artifact\n",
" * ECR containing the inference image for rcf\n",
" \n",
@@ -1100,7 +1107,7 @@
"model = Model(\n",
" image_uri=image_uri,\n",
" model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,\n",
- " sagemaker_session=sagemaker_session,\n",
+ " sagemaker_session=pipeline_session,\n",
" role=role,\n",
")"
]
@@ -1112,19 +1119,14 @@
"metadata": {},
"outputs": [],
"source": [
- "from sagemaker.inputs import CreateModelInput\n",
- "from sagemaker.workflow.steps import CreateModelStep\n",
+ "from sagemaker.workflow.model_step import ModelStep\n",
"\n",
"\n",
- "inputs = CreateModelInput(\n",
+ "model_step_args = model.create(\n",
" instance_type=\"ml.m5.large\",\n",
")\n",
"\n",
- "create_model = CreateModelStep(\n",
- " name=\"TaxiModel\",\n",
- " model=model,\n",
- " inputs=inputs,\n",
- ")"
+ "create_model = ModelStep(name=\"TaxiModel\", step_args=model_step_args)"
]
},
{
@@ -1134,9 +1136,9 @@
"source": [
"#### 4 - Batch Transform\n",
"\n",
- "Next, we'll deploy the model using batch transform then do a quick evaluation with our data to compute anomaly scores for each of our data points on input. \n",
+ "Next, we'll deploy the model using batch transform then do a quick evaluation with our data to compute anomaly scores for each of our data points on input.\n",
"\n",
- "The batch transform step will accept the following **inputs**: \n",
+ "To generate the step arguments for the batch transform step, it will accept the following **inputs**: \n",
" * SageMaker packaged model\n",
" * S3 location of the input data\n",
" * ECR containing the inference image for rcf\n",
@@ -1164,6 +1166,7 @@
" accept=\"text/csv\",\n",
" instance_count=1,\n",
" output_path=f\"s3://{default_bucket}/{output_prefix}/\",\n",
+ " sagemaker_session=pipeline_session,\n",
")"
]
},
@@ -1179,17 +1182,18 @@
"\n",
"batch_data = step_callback_data.properties.Outputs[\"s3_data_out\"]\n",
"\n",
+ "transform_step_args = transformer.transform(\n",
+ " data=batch_data,\n",
+ " content_type=\"text/csv\",\n",
+ " split_type=\"Line\",\n",
+ " input_filter=\"$[0]\",\n",
+ " join_source=\"Input\",\n",
+ " output_filter=\"$[0,-1]\",\n",
+ ")\n",
+ "\n",
"step_transform = TransformStep(\n",
" name=\"TaxiTransform\",\n",
- " transformer=transformer,\n",
- " inputs=TransformInput(\n",
- " data=batch_data,\n",
- " content_type=\"text/csv\",\n",
- " split_type=\"Line\",\n",
- " input_filter=\"$[0]\",\n",
- " join_source=\"Input\",\n",
- " output_filter=\"$[0,-1]\",\n",
- " ),\n",
+ " step_args=transform_step_args,\n",
")"
]
},
@@ -1201,19 +1205,6 @@
"### Configure Pipeline Using Created Steps"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "e646229c",
- "metadata": {},
- "outputs": [],
- "source": [
- "import uuid\n",
- "\n",
- "id_out = uuid.uuid4().hex\n",
- "print(\"Unique ID:\", id_out)"
- ]
- },
{
"cell_type": "code",
"execution_count": null,
@@ -1222,8 +1213,9 @@
"outputs": [],
"source": [
"from sagemaker.workflow.pipeline import Pipeline\n",
+ "from sagemaker.utils import unique_name_from_base\n",
"\n",
- "pipeline_name = f\"GluePipeline-{id_out}\"\n",
+ "pipeline_name = unique_name_from_base(\"GluePipeline\")\n",
"pipeline = Pipeline(\n",
" name=pipeline_name,\n",
" parameters=[\n",
@@ -1318,9 +1310,9 @@
"metadata": {
"instance_type": "ml.t3.medium",
"kernelspec": {
- "display_name": "Python 3 (Data Science)",
+ "display_name": "Python 3",
"language": "python",
- "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-2:429704687514:image/datascience-1.0"
+ "name": "python3"
},
"language_info": {
"codemirror_mode": {
@@ -1332,9 +1324,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.7.10"
+ "version": "3.6.14"
}
},
"nbformat": 4,
"nbformat_minor": 5
-}
\ No newline at end of file
+}
diff --git a/sagemaker-pipelines/tabular/custom_callback_pipelines_step/setup_iam_roles.py b/sagemaker-pipelines/tabular/custom_callback_pipelines_step/setup_iam_roles.py
index ef5ae80d1b..2da54d19d5 100644
--- a/sagemaker-pipelines/tabular/custom_callback_pipelines_step/setup_iam_roles.py
+++ b/sagemaker-pipelines/tabular/custom_callback_pipelines_step/setup_iam_roles.py
@@ -1,241 +1,212 @@
import json
import boto3
-iam = boto3.client('iam')
+iam = boto3.client("iam")
def create_ecs_task_role(role_name):
try:
response = iam.create_role(
- RoleName = role_name,
- AssumeRolePolicyDocument = json.dumps({
- "Version": "2012-10-17",
- "Statement": [
- {
- "Effect": "Allow",
- "Principal": {
- "Service": "ecs-tasks.amazonaws.com"
- },
- "Action": "sts:AssumeRole"
- }
- ]
- }),
- Description='Role for ECS task execution'
+ RoleName=role_name,
+ AssumeRolePolicyDocument=json.dumps(
+ {
+ "Version": "2012-10-17",
+ "Statement": [
+ {
+ "Effect": "Allow",
+ "Principal": {"Service": "ecs-tasks.amazonaws.com"},
+ "Action": "sts:AssumeRole",
+ }
+ ],
+ }
+ ),
+ Description="Role for ECS task execution",
)
- role_arn = response['Role']['Arn']
+ role_arn = response["Role"]["Arn"]
response = iam.attach_role_policy(
RoleName=role_name,
- PolicyArn='arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy'
+ PolicyArn="arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy",
)
-
+
response = iam.put_role_policy(
RoleName=role_name,
- PolicyName='create_log_group',
- PolicyDocument='{"Version":"2012-10-17","Statement":{"Effect":"Allow","Action":"logs:CreateLogGroup","Resource":"*"}}'
+ PolicyName="create_log_group",
+ PolicyDocument='{"Version":"2012-10-17","Statement":{"Effect":"Allow","Action":"logs:CreateLogGroup","Resource":"*"}}',
)
-
+
return role_arn
-
+
except iam.exceptions.EntityAlreadyExistsException:
- print(f'Using ARN from existing role: {role_name}')
+ print(f"Using ARN from existing role: {role_name}")
response = iam.get_role(RoleName=role_name)
- return response['Role']['Arn']
+ return response["Role"]["Arn"]
def create_task_runner_role(role_name):
try:
response = iam.create_role(
- RoleName = role_name,
- AssumeRolePolicyDocument = json.dumps({
+ RoleName=role_name,
+ AssumeRolePolicyDocument=json.dumps(
+ {
+ "Version": "2012-10-17",
+ "Statement": [
+ {
+ "Effect": "Allow",
+ "Principal": {"Service": "ecs-tasks.amazonaws.com"},
+ "Action": "sts:AssumeRole",
+ }
+ ],
+ }
+ ),
+ Description="Role for ECS tasks",
+ )
+
+ role_arn = response["Role"]["Arn"]
+
+ role_policy_document = json.dumps(
+ {
"Version": "2012-10-17",
"Statement": [
+ {"Effect": "Allow", "Action": "sagemaker:*", "Resource": "*"},
{
"Effect": "Allow",
- "Principal": {
- "Service": "ecs-tasks.amazonaws.com"
- },
- "Action": "sts:AssumeRole"
- }
- ]
- }),
- Description='Role for ECS tasks'
+ "Action": ["glue:StartJobRun", "glue:GetJobRun"],
+ "Resource": "*",
+ },
+ {"Effect": "Allow", "Action": "logs:CreateLogGroup", "Resource": "*"},
+ ],
+ }
)
- role_arn = response['Role']['Arn']
-
- role_policy_document = json.dumps({
- "Version": "2012-10-17",
- "Statement": [
- {
- "Effect": "Allow",
- "Action": "sagemaker:*",
- "Resource": "*"
- },
- {
- "Effect": "Allow",
- "Action": [
- "glue:StartJobRun",
- "glue:GetJobRun"
- ],
- "Resource": "*"
- },
- {
- "Effect": "Allow",
- "Action": "logs:CreateLogGroup",
- "Resource": "*"
- }
- ]
- })
-
response = iam.put_role_policy(
RoleName=role_name,
- PolicyName='glue_logs_sagemaker',
- PolicyDocument=role_policy_document
+ PolicyName="glue_logs_sagemaker",
+ PolicyDocument=role_policy_document,
)
-
+
response = iam.put_role_policy(
RoleName=role_name,
- PolicyName='create_log_group',
- PolicyDocument='{"Version":"2012-10-17","Statement":{"Effect":"Allow","Action":"logs:CreateLogGroup","Resource":"*"}}'
+ PolicyName="create_log_group",
+ PolicyDocument='{"Version":"2012-10-17","Statement":{"Effect":"Allow","Action":"logs:CreateLogGroup","Resource":"*"}}',
)
-
+
return role_arn
except iam.exceptions.EntityAlreadyExistsException:
- print(f'Using ARN from existing role: {role_name}')
+ print(f"Using ARN from existing role: {role_name}")
response = iam.get_role(RoleName=role_name)
- return response['Role']['Arn']
+ return response["Role"]["Arn"]
def create_glue_pipeline_role(role_name, bucket):
try:
response = iam.create_role(
- RoleName = role_name,
- AssumeRolePolicyDocument = json.dumps({
- "Version": "2012-10-17",
- "Statement": [
- {
- "Effect": "Allow",
- "Principal": {
- "Service": "glue.amazonaws.com"
- },
- "Action": "sts:AssumeRole"
- }
- ]
- }),
- Description='Role for Glue ETL job'
+ RoleName=role_name,
+ AssumeRolePolicyDocument=json.dumps(
+ {
+ "Version": "2012-10-17",
+ "Statement": [
+ {
+ "Effect": "Allow",
+ "Principal": {"Service": "glue.amazonaws.com"},
+ "Action": "sts:AssumeRole",
+ }
+ ],
+ }
+ ),
+ Description="Role for Glue ETL job",
)
- role_arn = response['Role']['Arn']
+ role_arn = response["Role"]["Arn"]
response = iam.attach_role_policy(
- RoleName=role_name,
- PolicyArn='arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole'
+ RoleName=role_name, PolicyArn="arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole"
)
-
- role_policy_document = json.dumps({
- "Version": "2012-10-17",
- "Statement": [
- {
- "Effect": "Allow",
- "Action": "s3:*",
- "Resource": f"arn:aws:s3:::{bucket}"
- }
- ]
- })
-
+
+ role_policy_document = json.dumps(
+ {
+ "Version": "2012-10-17",
+ "Statement": [
+ {"Effect": "Allow", "Action": "s3:*", "Resource": f"arn:aws:s3:::{bucket}"}
+ ],
+ }
+ )
+
response = iam.put_role_policy(
- RoleName=role_name,
- PolicyName='glue_s3_bucket',
- PolicyDocument=role_policy_document
+ RoleName=role_name, PolicyName="glue_s3_bucket", PolicyDocument=role_policy_document
)
-
- role_policy_document = json.dumps({
- "Version": "2012-10-17",
- "Statement": [
- {
- "Effect": "Allow",
- "Action": "s3:*",
- "Resource": f"arn:aws:s3:::{bucket}/*"
- }
- ]
- })
-
+
+ role_policy_document = json.dumps(
+ {
+ "Version": "2012-10-17",
+ "Statement": [
+ {"Effect": "Allow", "Action": "s3:*", "Resource": f"arn:aws:s3:::{bucket}/*"}
+ ],
+ }
+ )
+
response = iam.put_role_policy(
- RoleName=role_name,
- PolicyName='glue_s3_objects',
- PolicyDocument=role_policy_document
+ RoleName=role_name, PolicyName="glue_s3_objects", PolicyDocument=role_policy_document
)
-
+
return role_arn
except iam.exceptions.EntityAlreadyExistsException:
- print(f'Using ARN from existing role: {role_name}')
+ print(f"Using ARN from existing role: {role_name}")
response = iam.get_role(RoleName=role_name)
- return response['Role']['Arn']
-
+ return response["Role"]["Arn"]
+
+
def create_lambda_sm_pipeline_role(role_name, ecs_role_arn, task_role_arn):
try:
response = iam.create_role(
- RoleName = role_name,
- AssumeRolePolicyDocument = json.dumps({
- "Version": "2012-10-17",
- "Statement": [
- {
- "Effect": "Allow",
- "Principal": {
- "Service": "lambda.amazonaws.com"
- },
- "Action": "sts:AssumeRole"
- }
- ]
- }),
- Description='Role for Lambda to call ECS Fargate task'
+ RoleName=role_name,
+ AssumeRolePolicyDocument=json.dumps(
+ {
+ "Version": "2012-10-17",
+ "Statement": [
+ {
+ "Effect": "Allow",
+ "Principal": {"Service": "lambda.amazonaws.com"},
+ "Action": "sts:AssumeRole",
+ }
+ ],
+ }
+ ),
+ Description="Role for Lambda to call ECS Fargate task",
)
- role_arn = response['Role']['Arn']
+ role_arn = response["Role"]["Arn"]
response = iam.attach_role_policy(
RoleName=role_name,
- PolicyArn='arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole'
+ PolicyArn="arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole",
)
- role_policy_document = json.dumps({
- "Version": "2012-10-17",
- "Statement": [
- {
- "Effect": "Allow",
- "Action": "ecs:RunTask",
- "Resource": ["*"]
- },
- {
- "Effect": "Allow",
- "Action": "sqs:*",
- "Resource": ["*"]
- },
- {
- "Effect": "Allow",
- "Action": "sagemaker:*",
- "Resource": ["*"]
- },
- {
- "Effect": "Allow",
- "Action": "iam:PassRole",
- "Resource": [ecs_role_arn, task_role_arn]
- },
- ]
- })
+ role_policy_document = json.dumps(
+ {
+ "Version": "2012-10-17",
+ "Statement": [
+ {"Effect": "Allow", "Action": "ecs:RunTask", "Resource": ["*"]},
+ {"Effect": "Allow", "Action": "sqs:*", "Resource": ["*"]},
+ {"Effect": "Allow", "Action": "sagemaker:*", "Resource": ["*"]},
+ {
+ "Effect": "Allow",
+ "Action": "iam:PassRole",
+ "Resource": [ecs_role_arn, task_role_arn],
+ },
+ ],
+ }
+ )
response = iam.put_role_policy(
- RoleName=role_name,
- PolicyName='ecs_sqs_sagemaker',
- PolicyDocument=role_policy_document
+ RoleName=role_name, PolicyName="ecs_sqs_sagemaker", PolicyDocument=role_policy_document
)
return role_arn
except iam.exceptions.EntityAlreadyExistsException:
- print(f'Using ARN from existing role: {role_name}')
+ print(f"Using ARN from existing role: {role_name}")
response = iam.get_role(RoleName=role_name)
- return response['Role']['Arn']
\ No newline at end of file
+ return response["Role"]["Arn"]
diff --git a/sagemaker-pipelines/tabular/lambda-step/sagemaker-pipelines-lambda-step.ipynb b/sagemaker-pipelines/tabular/lambda-step/sagemaker-pipelines-lambda-step.ipynb
index 695995a096..1036bce5cb 100644
--- a/sagemaker-pipelines/tabular/lambda-step/sagemaker-pipelines-lambda-step.ipynb
+++ b/sagemaker-pipelines/tabular/lambda-step/sagemaker-pipelines-lambda-step.ipynb
@@ -1051,4 +1051,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
-}
+}
\ No newline at end of file
diff --git a/sagemaker-pipelines/tabular/lambda-step/sagemaker-pipelines-lambda-step_outputs.ipynb b/sagemaker-pipelines/tabular/lambda-step/sagemaker-pipelines-lambda-step_outputs.ipynb
index fa431f8a44..1e10714587 100644
--- a/sagemaker-pipelines/tabular/lambda-step/sagemaker-pipelines-lambda-step_outputs.ipynb
+++ b/sagemaker-pipelines/tabular/lambda-step/sagemaker-pipelines-lambda-step_outputs.ipynb
@@ -329,9 +329,9 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\r\n",
- "\u001b[33mWARNING: You are using pip version 21.1.3; however, version 22.1.2 is available.\r\n",
- "You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.\u001b[0m\r\n"
+ "\u001B[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001B[0m\r\n",
+ "\u001B[33mWARNING: You are using pip version 21.1.3; however, version 22.1.2 is available.\r\n",
+ "You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.\u001B[0m\r\n"
]
}
],
@@ -1917,4 +1917,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
-}
+}
\ No newline at end of file
diff --git a/sagemaker-pipelines/tabular/local-mode/sagemaker-pipelines-local-mode.ipynb b/sagemaker-pipelines/tabular/local-mode/sagemaker-pipelines-local-mode.ipynb
new file mode 100644
index 0000000000..242a34de5e
--- /dev/null
+++ b/sagemaker-pipelines/tabular/local-mode/sagemaker-pipelines-local-mode.ipynb
@@ -0,0 +1,1489 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Use SageMaker Pipelines to Run Your Jobs Locally\n",
+ "\n",
+ "This notebook demonstrates how to orchestrate SageMaker jobs locally using SageMaker Pipelines. \n",
+ "\n",
+ "Using a `LocalPipelineSession` object, you can now run your pipelines on your local machine before running them in the cloud. \n",
+ "\n",
+ "The `LocalPipelineSession` object is used while defining each pipeline step and when defining the complete Pipeline object. To run this pipeline in the cloud, each step along with the Pipeline object must be redefined using `PipelineSession`.\n",
+ "\n",
+ "**Note**: This notebook will not run in SageMaker Studio. You can run this on SageMaker Classic Notebook instances OR your local IDE."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## SageMaker Pipelines Local Mode\n",
+ "\n",
+ "SageMaker Pipelines Local Mode supports the following activities, which are demonstrated in this notebook:\n",
+ "\n",
+ "* ProcessingStep\n",
+ "* TrainingStep\n",
+ "* ConditionStep\n",
+ "* ModelStep\n",
+ "* TransformStep\n",
+ "* FailStep"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "## Dataset\n",
+ "\n",
+ "The dataset you use is the [UCI Machine Learning Abalone Dataset](https://archive.ics.uci.edu/ml/datasets/abalone) [1]. The aim for this task is to determine the age of an abalone snail from its physical measurements. At the core, this is a regression problem.\n",
+ "\n",
+ "The dataset contains several features: length (the longest shell measurement), diameter (the diameter perpendicular to length), height (the height with meat in the shell), whole_weight (the weight of whole abalone), shucked_weight (the weight of meat), viscera_weight (the gut weight after bleeding), shell_weight (the weight after being dried), sex ('M', 'F', 'I' where 'I' is Infant), and rings (integer).\n",
+ "\n",
+ "The number of rings turns out to be a good approximation for age (age is rings + 1.5). However, obtaining this number requires cutting the shell through the cone, staining the section, and counting the number of rings through a microscope, which is a time-consuming task. The other physical measurements are easier to determine, so you use the dataset to build a predictive model of the variable rings from these other physical measurements.\n",
+ "\n",
+ "Before you upload the data to an S3 bucket, install the SageMaker Python SDK and gather some constants you can use later in this notebook.\n",
+ "\n",
+ "> [1] Dua, D. and Graff, C. (2019). [UCI Machine Learning Repository](http://archive.ics.uci.edu/ml). Irvine, CA: University of California, School of Information and Computer Science."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Install the latest version of the SageMaker Python SDK. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install 'sagemaker' --upgrade"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "\n",
+ "import boto3\n",
+ "import sagemaker\n",
+ "from sagemaker.workflow.pipeline_context import LocalPipelineSession\n",
+ "\n",
+ "# Create a `LocalPipelineSession` object so that each pipeline step will run locally\n",
+ "# To run this pipeline in the cloud, you must change `LocalPipelineSession()` to `PipelineSession()`\n",
+ "local_pipeline_session = LocalPipelineSession()\n",
+ "\n",
+ "region = local_pipeline_session.boto_region_name\n",
+ "\n",
+ "default_bucket = local_pipeline_session.default_bucket()\n",
+ "prefix = \"sagemaker-pipelines-local-mode-example\"\n",
+ "\n",
+ "role = None # Role is set below"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Please Note: Provide SageMaker Execution Role ARN if not running on SageMaker Notebook environment\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "💡 Set Execution Role for Permissions \n",
+ "If you are running this notebook from a local machine, as opposed to within the SageMaker Jupyter environment, you will need to add the code below, after filling in the name for a valid SageMaker Execution Role. \n",
+ " \n",
+ " Click here to lookup IAM SageMaker Execution Roles \n",
+ " The except block below will lookup the ARN from the role name.\n",
+ "\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# try:\n",
+ "# role = sagemaker.get_execution_role()\n",
+ "# except ValueError:\n",
+ "# iam = boto3.client('iam')\n",
+ "# role = iam.get_role(RoleName='')['Role']['Arn']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "if role is None:\n",
+ " role = sagemaker.get_execution_role()\n",
+ "\n",
+ "print(role)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now, upload the data into the default bucket. You can select your own dataset for the `input_data_uri` as appropriate."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!mkdir -p data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Pull the dataset from SageMaker's public S3 bucket and upload it to your own S3 bucket\n",
+ "\n",
+ "local_path = \"data/abalone-dataset.csv\"\n",
+ "\n",
+ "s3 = boto3.resource(\"s3\")\n",
+ "s3.Bucket(f\"sagemaker-sample-files\").download_file(\n",
+ " \"datasets/tabular/uci_abalone/abalone.csv\", local_path\n",
+ ")\n",
+ "\n",
+ "base_uri = f\"s3://{default_bucket}/{prefix}/abalone-data-set\"\n",
+ "input_data_uri = sagemaker.s3.S3Uploader.upload(\n",
+ " local_path=local_path,\n",
+ " desired_s3_uri=base_uri,\n",
+ ")\n",
+ "print(input_data_uri)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker.workflow.parameters import ParameterString, ParameterFloat\n",
+ "\n",
+ "processing_instance_count = 1\n",
+ "training_instance_count = 1\n",
+ "transform_instance_count = 1\n",
+ "instance_type = \"ml.m5.xlarge\"\n",
+ "\n",
+ "input_data = ParameterString(\n",
+ " name=\"InputData\",\n",
+ " default_value=input_data_uri,\n",
+ ")\n",
+ "\n",
+ "mse_threshold = ParameterFloat(name=\"MseThreshold\", default_value=7.0)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Define a Processing Step for Feature Engineering\n",
+ "\n",
+ "First, develop a preprocessing script that is specified in the Processing step.\n",
+ "\n",
+ "This notebook cell writes a file `code/preprocessing.py`, which contains the preprocessing script. You can update the script, and rerun this cell to overwrite. The preprocessing script uses `scikit-learn` to do the following:\n",
+ "\n",
+ "* Fill in missing sex category data and encode it so that it is suitable for training.\n",
+ "* Scale and normalize all numerical fields, aside from sex and rings numerical data.\n",
+ "* Split the data into training, validation, and test datasets.\n",
+ "\n",
+ "The Processing step executes the script on the input data. The Training step uses the preprocessed training features and labels to train a model. The Evaluation step uses the trained model and preprocessed test features and labels to evaluate the model."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!mkdir -p code"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%writefile code/preprocessing.py\n",
+ "import argparse\n",
+ "import os\n",
+ "import requests\n",
+ "import tempfile\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "\n",
+ "from sklearn.compose import ColumnTransformer\n",
+ "from sklearn.impute import SimpleImputer\n",
+ "from sklearn.pipeline import Pipeline\n",
+ "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
+ "\n",
+ "\n",
+ "# Since we get a headerless CSV file, we specify the column names here.\n",
+ "feature_columns_names = [\n",
+ " \"sex\",\n",
+ " \"length\",\n",
+ " \"diameter\",\n",
+ " \"height\",\n",
+ " \"whole_weight\",\n",
+ " \"shucked_weight\",\n",
+ " \"viscera_weight\",\n",
+ " \"shell_weight\",\n",
+ "]\n",
+ "label_column = \"rings\"\n",
+ "\n",
+ "feature_columns_dtype = {\n",
+ " \"sex\": str,\n",
+ " \"length\": np.float64,\n",
+ " \"diameter\": np.float64,\n",
+ " \"height\": np.float64,\n",
+ " \"whole_weight\": np.float64,\n",
+ " \"shucked_weight\": np.float64,\n",
+ " \"viscera_weight\": np.float64,\n",
+ " \"shell_weight\": np.float64,\n",
+ "}\n",
+ "label_column_dtype = {\"rings\": np.float64}\n",
+ "\n",
+ "\n",
+ "def merge_two_dicts(x, y):\n",
+ " z = x.copy()\n",
+ " z.update(y)\n",
+ " return z\n",
+ "\n",
+ "\n",
+ "if __name__ == \"__main__\":\n",
+ " base_dir = \"/opt/ml/processing\"\n",
+ "\n",
+ " df = pd.read_csv(\n",
+ " f\"{base_dir}/input/abalone-dataset.csv\",\n",
+ " header=None,\n",
+ " names=feature_columns_names + [label_column],\n",
+ " dtype=merge_two_dicts(feature_columns_dtype, label_column_dtype),\n",
+ " )\n",
+ " numeric_features = list(feature_columns_names)\n",
+ " numeric_features.remove(\"sex\")\n",
+ " numeric_transformer = Pipeline(\n",
+ " steps=[\n",
+ " (\"imputer\", SimpleImputer(strategy=\"median\")),\n",
+ " (\"scaler\", StandardScaler()),\n",
+ " ]\n",
+ " )\n",
+ "\n",
+ " categorical_features = [\"sex\"]\n",
+ " categorical_transformer = Pipeline(\n",
+ " steps=[\n",
+ " (\"imputer\", SimpleImputer(strategy=\"constant\", fill_value=\"missing\")),\n",
+ " (\"onehot\", OneHotEncoder(handle_unknown=\"ignore\")),\n",
+ " ]\n",
+ " )\n",
+ "\n",
+ " preprocess = ColumnTransformer(\n",
+ " transformers=[\n",
+ " (\"num\", numeric_transformer, numeric_features),\n",
+ " (\"cat\", categorical_transformer, categorical_features),\n",
+ " ]\n",
+ " )\n",
+ "\n",
+ " y = df.pop(\"rings\")\n",
+ " X_pre = preprocess.fit_transform(df)\n",
+ " y_pre = y.to_numpy().reshape(len(y), 1)\n",
+ "\n",
+ " X = np.concatenate((y_pre, X_pre), axis=1)\n",
+ "\n",
+ " np.random.shuffle(X)\n",
+ " train, validation, test = np.split(X, [int(0.7 * len(X)), int(0.85 * len(X))])\n",
+ "\n",
+ " pd.DataFrame(train).to_csv(f\"{base_dir}/train/train.csv\", header=False, index=False)\n",
+ " pd.DataFrame(validation).to_csv(\n",
+ " f\"{base_dir}/validation/validation.csv\", header=False, index=False\n",
+ " )\n",
+ " pd.DataFrame(test).to_csv(f\"{base_dir}/test/test.csv\", header=False, index=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Next, create an instance of a `SKLearnProcessor` processor and use that in our `ProcessingStep`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker.sklearn.processing import SKLearnProcessor\n",
+ "\n",
+ "framework_version = \"1.0-1\"\n",
+ "\n",
+ "sklearn_processor = SKLearnProcessor(\n",
+ " framework_version=framework_version,\n",
+ " instance_type=instance_type,\n",
+ " instance_count=processing_instance_count,\n",
+ " base_job_name=\"sklearn-abalone-process\",\n",
+ " role=role,\n",
+ " sagemaker_session=local_pipeline_session,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Finally, we take the output of the processor's `run` method and pass that as arguments to the `ProcessingStep`. By passing the `local_pipeline_session` to the `sagemaker_session`, calling `.run()` does not launch the processing job; instead, it returns the arguments needed to run the job as a step in the pipeline.\n",
+ "\n",
+ "Note the `\"train\"`, `\"validation\"`, and `\"test\"` named channels specified in the output configuration for the processing job. Step `Properties` can be used in subsequent steps and resolve to their runtime values at execution. Specifically, this usage is called out when you define the training step."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker.processing import ProcessingInput, ProcessingOutput\n",
+ "from sagemaker.workflow.steps import ProcessingStep\n",
+ "\n",
+ "processor_args = sklearn_processor.run(\n",
+ " inputs=[\n",
+ " ProcessingInput(source=input_data, destination=\"/opt/ml/processing/input\"),\n",
+ " ],\n",
+ " outputs=[\n",
+ " ProcessingOutput(output_name=\"train\", source=\"/opt/ml/processing/train\"),\n",
+ " ProcessingOutput(output_name=\"validation\", source=\"/opt/ml/processing/validation\"),\n",
+ " ProcessingOutput(output_name=\"test\", source=\"/opt/ml/processing/test\"),\n",
+ " ],\n",
+ " code=\"code/preprocessing.py\",\n",
+ ")\n",
+ "\n",
+ "step_process = ProcessingStep(name=\"AbaloneProcess\", step_args=processor_args)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%writefile code/abalone.py\n",
+ "\n",
+ "import argparse\n",
+ "import json\n",
+ "import logging\n",
+ "import os\n",
+ "import pathlib\n",
+ "import pickle as pkl\n",
+ "import tarfile\n",
+ "\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import xgboost as xgb\n",
+ "\n",
+ "logging.basicConfig(level=logging.INFO)\n",
+ "\n",
+ "TRAIN_VALIDATION_FRACTION = 0.2\n",
+ "RANDOM_STATE_SAMPLING = 200\n",
+ "\n",
+ "logging.basicConfig(level=logging.INFO)\n",
+ "\n",
+ "\n",
+ "def prepare_data(train_dir, validation_dir):\n",
+ " \"\"\"Read data from train and validation channel, and return predicting features and target variables.\n",
+ "\n",
+ " Args:\n",
+ " data_dir (str): directory which saves the training data.\n",
+ "\n",
+ " Returns:\n",
+ " Tuple of training features, training target, validation features, validation target.\n",
+ " \"\"\"\n",
+ " df_train = pd.read_csv(\n",
+ " os.path.join(train_dir, \"train.csv\"),\n",
+ " header=None,\n",
+ " )\n",
+ " df_train = df_train.iloc[np.random.permutation(len(df_train))]\n",
+ " df_train.columns = [\"target\"] + [f\"feature_{x}\" for x in range(df_train.shape[1] - 1)]\n",
+ "\n",
+ " try:\n",
+ " df_validation = pd.read_csv(\n",
+ " os.path.join(validation_dir, \"validation.csv\"),\n",
+ " header=None,\n",
+ " )\n",
+ " df_validation.columns = [\"target\"] + [\n",
+ " f\"feature_{x}\" for x in range(df_validation.shape[1] - 1)\n",
+ " ]\n",
+ "\n",
+ " except FileNotFoundError: # when validation data is not available in the directory\n",
+ " logging.info(\n",
+ " f\"Validation data is not found. {TRAIN_VALIDATION_FRACTION * 100}% of training data is \"\n",
+ " f\"randomly selected as validation data. The seed for random sampling is {RANDOM_STATE_SAMPLING}.\"\n",
+ " )\n",
+ " df_validation = df_train.sample(\n",
+ " frac=TRAIN_VALIDATION_FRACTION,\n",
+ " random_state=RANDOM_STATE_SAMPLING,\n",
+ " )\n",
+ " df_train.drop(df_validation.index, inplace=True)\n",
+ " df_validation.reset_index(drop=True, inplace=True)\n",
+ " df_train.reset_index(drop=True, inplace=True)\n",
+ "\n",
+ " X_train, y_train = df_train.iloc[:, 1:], df_train.iloc[:, :1]\n",
+ " X_val, y_val = df_validation.iloc[:, 1:], df_validation.iloc[:, :1]\n",
+ "\n",
+ " return X_train.values, y_train.values, X_val.values, y_val.values\n",
+ "\n",
+ "\n",
+ "def main():\n",
+ " \"\"\"Run training.\"\"\"\n",
+ " parser = argparse.ArgumentParser()\n",
+ "\n",
+ " parser.add_argument(\n",
+ " \"--max_depth\",\n",
+ " type=int,\n",
+ " )\n",
+ " parser.add_argument(\"--eta\", type=float)\n",
+ " parser.add_argument(\"--gamma\", type=int)\n",
+ " parser.add_argument(\"--min_child_weight\", type=int)\n",
+ " parser.add_argument(\"--subsample\", type=float)\n",
+ " parser.add_argument(\"--verbosity\", type=int)\n",
+ " parser.add_argument(\"--objective\", type=str)\n",
+ " parser.add_argument(\"--num_round\", type=int)\n",
+ " parser.add_argument(\"--tree_method\", type=str, default=\"auto\")\n",
+ " parser.add_argument(\"--predictor\", type=str, default=\"auto\")\n",
+ " parser.add_argument(\"--learning_rate\", type=str, default=\"auto\")\n",
+ " parser.add_argument(\"--output_data_dir\", type=str, default=os.environ.get(\"SM_OUTPUT_DATA_DIR\"))\n",
+ " parser.add_argument(\"--model_dir\", type=str, default=os.environ.get(\"SM_MODEL_DIR\"))\n",
+ " parser.add_argument(\"--train\", type=str, default=os.environ.get(\"SM_CHANNEL_TRAIN\"))\n",
+ " parser.add_argument(\"--validation\", type=str, default=os.environ.get(\"SM_CHANNEL_VALIDATION\"))\n",
+ " parser.add_argument(\"--sm_hosts\", type=str, default=os.environ.get(\"SM_HOSTS\"))\n",
+ " parser.add_argument(\"--sm_current_host\", type=str, default=os.environ.get(\"SM_CURRENT_HOST\"))\n",
+ "\n",
+ " args, _ = parser.parse_known_args()\n",
+ "\n",
+ " X_train, y_train, X_val, y_val = prepare_data(args.train, args.validation)\n",
+ "\n",
+ " # create datasets for xgboost\n",
+ " dtrain = xgb.DMatrix(data=X_train, label=y_train)\n",
+ " dval = xgb.DMatrix(data=X_val, label=y_val)\n",
+ " watchlist = [(dtrain, \"train\"), (dval, \"validation\")]\n",
+ "\n",
+ " # specify your configurations as a dict\n",
+ " params = {\n",
+ " \"booster\": \"gbtree\",\n",
+ " \"objective\": args.objective,\n",
+ " \"learning_rate\": args.learning_rate,\n",
+ " \"gamma\": args.gamma,\n",
+ " \"min_child_weight\": args.min_child_weight,\n",
+ " \"max_depth\": args.max_depth,\n",
+ " \"subsample\": args.subsample,\n",
+ " \"colsample_bytree\": 1,\n",
+ " \"reg_lambda\": 1,\n",
+ " \"reg_alpha\": 0,\n",
+ " \"eval_metric\": \"rmse\",\n",
+ " }\n",
+ "\n",
+ " bst = xgb.train(\n",
+ " params=params,\n",
+ " dtrain=dtrain,\n",
+ " num_boost_round=args.num_round,\n",
+ " evals=watchlist,\n",
+ " xgb_model=None,\n",
+ " )\n",
+ "\n",
+ " model_location = args.model_dir + \"/xgboost-model\"\n",
+ " pkl.dump(bst, open(model_location, \"wb\"))\n",
+ " logging.info(\"Stored trained model at {}\".format(model_location))\n",
+ "\n",
+ "\n",
+ "if __name__ == \"__main__\":\n",
+ " main()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker.estimator import Estimator\n",
+ "from sagemaker.inputs import TrainingInput\n",
+ "\n",
+ "model_path = f\"s3://{default_bucket}/{prefix}/model\"\n",
+ "image_uri = sagemaker.image_uris.retrieve(\n",
+ " framework=\"xgboost\",\n",
+ " region=region,\n",
+ " version=\"1.5-1\",\n",
+ " py_version=\"py3\",\n",
+ " instance_type=instance_type,\n",
+ ")\n",
+ "\n",
+ "xgb_train = Estimator(\n",
+ " image_uri=image_uri,\n",
+ " entry_point=\"code/abalone.py\",\n",
+ " instance_type=instance_type,\n",
+ " instance_count=training_instance_count,\n",
+ " output_path=model_path,\n",
+ " role=role,\n",
+ " sagemaker_session=local_pipeline_session,\n",
+ ")\n",
+ "\n",
+ "xgb_train.set_hyperparameters(\n",
+ " objective=\"reg:squarederror\",\n",
+ " learning_rate=0.01,\n",
+ " num_round=50,\n",
+ " max_depth=5,\n",
+ " eta=0.2,\n",
+ " gamma=4,\n",
+ " min_child_weight=6,\n",
+ " subsample=0.7,\n",
+ ")\n",
+ "\n",
+ "train_args = xgb_train.fit(\n",
+ " inputs={\n",
+ " \"train\": TrainingInput(\n",
+ " s3_data=step_process.properties.ProcessingOutputConfig.Outputs[\"train\"].S3Output.S3Uri,\n",
+ " content_type=\"text/csv\",\n",
+ " ),\n",
+ " \"validation\": TrainingInput(\n",
+ " s3_data=step_process.properties.ProcessingOutputConfig.Outputs[\n",
+ " \"validation\"\n",
+ " ].S3Output.S3Uri,\n",
+ " content_type=\"text/csv\",\n",
+ " ),\n",
+ " }\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Finally, we use the output of the estimator's `.fit()` method as arguments to the `TrainingStep`. By passing the `local_pipeline_session` to the `sagemaker_session`, calling `.fit()` does not launch the training job; instead, it returns the arguments needed to run the job as a step in the pipeline.\n",
+ "\n",
+ "Pass in the `S3Uri` of the `\"train\"` and `\"validation\"` output channels to the `.fit()` method. Also, use the `\"test\"` output channel for model evaluation in the pipeline. The `properties` attribute of a Pipeline step matches the object model of the corresponding response of a describe call. These properties can be referenced as placeholder values and are resolved at runtime. For example, the `ProcessingStep` `properties` attribute matches the object model of the [DescribeProcessingJob](https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_DescribeProcessingJob.html) response object."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker.inputs import TrainingInput\n",
+ "from sagemaker.workflow.steps import TrainingStep\n",
+ "\n",
+ "step_train = TrainingStep(\n",
+ " name=\"AbaloneTrain\",\n",
+ " step_args=train_args,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Define a Model Evaluation Step to Evaluate the Trained Model\n",
+ "\n",
+ "First, develop an evaluation script that is specified in a Processing step that performs the model evaluation.\n",
+ "\n",
+ "After pipeline execution, you can examine the resulting `evaluation.json` for analysis.\n",
+ "\n",
+ "The evaluation script uses `xgboost` to do the following:\n",
+ "\n",
+ "* Load the model.\n",
+ "* Read the test data.\n",
+ "* Issue predictions against the test data.\n",
+ "* Build a regression metrics report, including the prediction error and its standard deviation.\n",
+ "* Save the evaluation report to the evaluation directory."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%writefile code/evaluation.py\n",
+ "import json\n",
+ "import pathlib\n",
+ "import pickle\n",
+ "import tarfile\n",
+ "\n",
+ "import joblib\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import xgboost\n",
+ "import math\n",
+ "\n",
+ "from sklearn.metrics import mean_squared_error\n",
+ "\n",
+ "if __name__ == \"__main__\":\n",
+ " model_path = f\"/opt/ml/processing/model/model.tar.gz\"\n",
+ " with tarfile.open(model_path) as tar:\n",
+ " tar.extractall(path=\".\")\n",
+ "\n",
+ " model = pickle.load(open(\"xgboost-model\", \"rb\"))\n",
+ "\n",
+ " test_path = \"/opt/ml/processing/test/test.csv\"\n",
+ " df = pd.read_csv(test_path, header=None)\n",
+ " df.columns = [\"target\"] + [f\"feature_{x}\" for x in range(df.shape[1] - 1)]\n",
+ "\n",
+ " y_test = df.iloc[:, 0].to_numpy()\n",
+ " df.drop(df.columns[0], axis=1, inplace=True)\n",
+ "\n",
+ " X_test = xgboost.DMatrix(df.values)\n",
+ "\n",
+ " predictions = model.predict(X_test)\n",
+ "\n",
+ " mse = mean_squared_error(y_test, predictions)\n",
+ " std = np.std(y_test - predictions)\n",
+ " report_dict = {\n",
+ " \"regression_metrics\": {\n",
+ " \"mse\": {\"value\": math.sqrt(mse), \"standard_deviation\": std},\n",
+ " },\n",
+ " }\n",
+ "\n",
+ " output_dir = \"/opt/ml/processing/evaluation\"\n",
+ " pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)\n",
+ "\n",
+ " evaluation_path = f\"{output_dir}/evaluation.json\"\n",
+ " with open(evaluation_path, \"w\") as f:\n",
+ " f.write(json.dumps(report_dict))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Next, create an instance of a `ScriptProcessor` processor and use it in the `ProcessingStep`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker.processing import ScriptProcessor\n",
+ "\n",
+ "script_eval = ScriptProcessor(\n",
+ " image_uri=image_uri,\n",
+ " command=[\"python3\"],\n",
+ " instance_type=instance_type,\n",
+ " instance_count=processing_instance_count,\n",
+ " base_job_name=\"script-abalone-eval\",\n",
+ " role=role,\n",
+ " sagemaker_session=local_pipeline_session,\n",
+ ")\n",
+ "\n",
+ "eval_args = script_eval.run(\n",
+ " inputs=[\n",
+ " ProcessingInput(\n",
+ " source=step_train.properties.ModelArtifacts.S3ModelArtifacts,\n",
+ " destination=\"/opt/ml/processing/model\",\n",
+ " ),\n",
+ " ProcessingInput(\n",
+ " source=step_process.properties.ProcessingOutputConfig.Outputs[\"test\"].S3Output.S3Uri,\n",
+ " destination=\"/opt/ml/processing/test\",\n",
+ " ),\n",
+ " ],\n",
+ " outputs=[\n",
+ " ProcessingOutput(output_name=\"evaluation\", source=\"/opt/ml/processing/evaluation\"),\n",
+ " ],\n",
+ " code=\"code/evaluation.py\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Use the processor's arguments returned by `.run()` to construct a `ProcessingStep`, along with the input and output channels and the code that will be executed when the pipeline invokes pipeline execution. \n",
+ "\n",
+ "Specifically, the `S3ModelArtifacts` from the `step_train` `properties` and the `S3Uri` of the `\"test\"` output channel of the `step_process` `properties` are passed as inputs. The `TrainingStep` and `ProcessingStep` `properties` attribute matches the object model of the [DescribeTrainingJob](https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_DescribeTrainingJob.html) and [DescribeProcessingJob](https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_DescribeProcessingJob.html) response objects, respectively."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker.workflow.properties import PropertyFile\n",
+ "\n",
+ "evaluation_report = PropertyFile(\n",
+ " name=\"EvaluationReport\", output_name=\"evaluation\", path=\"evaluation.json\"\n",
+ ")\n",
+ "step_eval = ProcessingStep(\n",
+ " name=\"AbaloneEval\",\n",
+ " step_args=eval_args,\n",
+ " property_files=[evaluation_report],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Define a Create Model Step to Create a Model\n",
+ "\n",
+ "In order to perform batch transformation using the example model, create a SageMaker model. \n",
+ "\n",
+ "Specifically, pass in the `S3ModelArtifacts` from the `TrainingStep`, `step_train` properties. The `TrainingStep` `properties` attribute matches the object model of the [DescribeTrainingJob](https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_DescribeTrainingJob.html) response object.\n",
+ "\n",
+ "We provide a custom inference script that defines the logic for the batch transform job."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%writefile code/inference.py\n",
+ "\n",
+ "import json\n",
+ "import os\n",
+ "import pickle as pkl\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import sagemaker_xgboost_container.encoder as xgb_encoders\n",
+ "import xgboost as xgb\n",
+ "import io\n",
+ "import logging\n",
+ "\n",
+ "logging.basicConfig(level=logging.INFO)\n",
+ "\n",
+ "\n",
+ "def model_fn(model_dir):\n",
+ " \"\"\"\n",
+ " Deserialize and return fitted model.\n",
+ " \"\"\"\n",
+ " model_file = \"xgboost-model\"\n",
+ " booster = pkl.load(open(os.path.join(model_dir, model_file), \"rb\"))\n",
+ " return booster\n",
+ "\n",
+ "\n",
+ "def transform_fn(model, request_body, request_content_type, accept):\n",
+ " \"\"\" \"\"\"\n",
+ " if request_content_type == \"text/libsvm\":\n",
+ " input_data = xgb_encoders.libsvm_to_dmatrix(request_body)\n",
+ " if request_content_type == \"text/csv\":\n",
+ " df = pd.read_csv(io.StringIO(request_body.strip(\"\\n\")), header=None)\n",
+ " df.drop(0, axis=1, inplace=True)\n",
+ " input_data = xgb.DMatrix(data=df)\n",
+ "\n",
+ " else:\n",
+ " raise ValueError(\"Content type {} is not supported.\".format(request_content_type))\n",
+ "\n",
+ " prediction = model.predict(input_data)\n",
+ " feature_contribs = model.predict(input_data, pred_contribs=True, validate_features=False)\n",
+ " output = np.hstack((prediction[:, np.newaxis], feature_contribs))\n",
+ "\n",
+ " logging.info(\"Successfully completed transform job!\")\n",
+ "\n",
+ " return \",\".join(str(x) for x in output[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker.model import Model\n",
+ "\n",
+ "model = Model(\n",
+ " image_uri=image_uri,\n",
+ " model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,\n",
+ " source_dir=\"code\",\n",
+ " entry_point=\"inference.py\",\n",
+ " role=role,\n",
+ " sagemaker_session=local_pipeline_session,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Define the `ModelStep` by providing the return values from `model.create()` as the step arguments. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker.workflow.model_step import ModelStep\n",
+ "\n",
+ "step_create_model = ModelStep(\n",
+ " name=\"AbaloneCreateModel\", step_args=model.create(instance_type=instance_type)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Define a Transform Step to Perform Batch Transformation\n",
+ "\n",
+ "Now that a model instance is defined, create a `Transformer` instance with the appropriate model type, compute instance type, and desired output S3 URI.\n",
+ "\n",
+ "Specifically, pass in the `ModelName` from the `ModelStep`, `step_create_model` properties. The `ModelStep` `properties` attribute matches the object model of the [DescribeModel](https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_DescribeModel.html) response object."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker.transformer import Transformer\n",
+ "\n",
+ "\n",
+ "transformer = Transformer(\n",
+ " model_name=step_create_model.properties.ModelName,\n",
+ " instance_type=instance_type,\n",
+ " instance_count=transform_instance_count,\n",
+ " output_path=f\"s3://{default_bucket}/{prefix}/transform\",\n",
+ " sagemaker_session=local_pipeline_session,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Pass the S3 location of the processed test data (`test.csv` in the `\"test\"` output of the processing step) to the transformer's `.transform()` method, and use the returned arguments to define the `TransformStep`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker.inputs import TransformInput\n",
+ "from sagemaker.workflow.steps import TransformStep\n",
+ "from sagemaker.workflow.functions import Join\n",
+ "\n",
+ "transform_data = Join(\n",
+ " on=\"/\",\n",
+ " values=[\n",
+ " step_process.properties.ProcessingOutputConfig.Outputs[\"test\"].S3Output.S3Uri,\n",
+ " \"test.csv\",\n",
+ " ],\n",
+ ")\n",
+ "\n",
+ "transform_args = transformer.transform(transform_data, content_type=\"text/csv\")\n",
+ "\n",
+ "step_transform = TransformStep(name=\"AbaloneTransform\", step_args=transform_args)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker.workflow.fail_step import FailStep\n",
+ "\n",
+ "step_fail = FailStep(\n",
+ " name=\"AbaloneMSEFail\",\n",
+ " error_message=Join(on=\" \", values=[\"Execution failed due to MSE >\", mse_threshold]),\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Define a Condition Step to Check Accuracy and Conditionally Create a Model and Run a Batch Transformation Or Terminate the Execution in Failed State\n",
+ "\n",
+ "In this step, the model is created and the batch transformation is run only if the MSE of the model, as determined by the evaluation step `step_eval`, is less than or equal to the specified threshold (`MseThreshold`). Otherwise, the pipeline execution fails and terminates. A `ConditionStep` enables pipelines to support conditional execution in the pipeline DAG based on the conditions of the step properties.\n",
+ "\n",
+ "In the following section, you:\n",
+ "\n",
+ "* Define a `ConditionLessThanOrEqualTo` on the MSE value found in the output of the evaluation step, `step_eval`.\n",
+ "* Use the condition in the list of conditions in a `ConditionStep`.\n",
+ "* Pass the `CreateModelStep` and `TransformStep` steps into the `if_steps` of the `ConditionStep`, which are only executed if the condition evaluates to `True`.\n",
+ "* Pass the `FailStep` step into the `else_steps` of the `ConditionStep`, which is only executed if the condition evaluates to `False`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker.workflow.conditions import ConditionLessThanOrEqualTo\n",
+ "from sagemaker.workflow.condition_step import ConditionStep\n",
+ "from sagemaker.workflow.functions import JsonGet\n",
+ "\n",
+ "cond_lte = ConditionLessThanOrEqualTo(\n",
+ " left=JsonGet(\n",
+ " step_name=step_eval.name,\n",
+ " property_file=evaluation_report,\n",
+ " json_path=\"regression_metrics.mse.value\",\n",
+ " ),\n",
+ " right=mse_threshold,\n",
+ ")\n",
+ "\n",
+ "step_cond = ConditionStep(\n",
+ " name=\"AbaloneMSECond\",\n",
+ " conditions=[cond_lte],\n",
+ " if_steps=[step_create_model, step_transform],\n",
+ " else_steps=[step_fail],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Define a Pipeline using `LocalPipelineSession`\n",
+ "\n",
+ "In this section, combine the steps into a Pipeline so it can be executed. We provide a `LocalPipelineSession` object to the `Pipeline` so that when executed, all the steps in the pipeline will run locally on the machine. By switching the `LocalPipelineSession` to a `PipelineSession` object, you can switch the execution to run in the cloud on SageMaker instances."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker.workflow.pipeline import Pipeline\n",
+ "\n",
+ "pipeline_name = f\"LocalModelPipeline\"\n",
+ "pipeline = Pipeline(\n",
+ " name=pipeline_name,\n",
+ " parameters=[\n",
+ " input_data,\n",
+ " mse_threshold,\n",
+ " ],\n",
+ " steps=[step_process, step_train, step_eval, step_cond],\n",
+ " sagemaker_session=local_pipeline_session,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### (Optional) Examining the pipeline definition\n",
+ "\n",
+ "The JSON of the pipeline definition can be examined to confirm the pipeline is well-defined and the parameters and step properties resolve correctly."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "\n",
+ "definition = json.loads(pipeline.definition())\n",
+ "definition"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Submit the pipeline to SageMaker and start execution\n",
+ "\n",
+ "Submit the pipeline definition to the Pipeline service. The Pipeline service uses the role that is passed in to create all the jobs defined in the steps."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pipeline.upsert(role_arn=role)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Start the pipeline and accept all the default parameters."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "execution = pipeline.start()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "steps = execution.list_steps()\n",
+ "steps"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Get the step outputs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get output files from processing job\n",
+ "\n",
+ "processing_job_name = steps[\"PipelineExecutionSteps\"][0][\"Metadata\"][\"ProcessingJob\"][\"Arn\"]\n",
+ "outputs = local_pipeline_session.sagemaker_client.describe_processing_job(\n",
+ " ProcessingJobName=processing_job_name\n",
+ ")[\"ProcessingOutputConfig\"][\"Outputs\"]\n",
+ "for key in outputs:\n",
+ " print(outputs[key][\"S3Output\"][\"S3Uri\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get output from training job\n",
+ "\n",
+ "training_job_name = steps[\"PipelineExecutionSteps\"][1][\"Metadata\"][\"TrainingJob\"][\"Arn\"]\n",
+ "outputs = local_pipeline_session.sagemaker_client.describe_training_job(\n",
+ " TrainingJobName=training_job_name\n",
+ ")\n",
+ "print(\"Model location : \", outputs[\"ModelArtifacts\"][\"S3ModelArtifacts\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get output from model evaluation step (processing job)\n",
+ "\n",
+ "processing_job_name = steps[\"PipelineExecutionSteps\"][2][\"Metadata\"][\"ProcessingJob\"][\"Arn\"]\n",
+ "outputs = local_pipeline_session.sagemaker_client.describe_processing_job(\n",
+ " ProcessingJobName=processing_job_name\n",
+ ")[\"ProcessingOutputConfig\"][\"Outputs\"]\n",
+ "for key in outputs:\n",
+ " print(outputs[key][\"S3Output\"][\"S3Uri\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get output of ModelStep\n",
+ "import json\n",
+ "\n",
+ "model_name = steps[\"PipelineExecutionSteps\"][-1][\"Metadata\"][\"Model\"][\"Arn\"]\n",
+ "outputs = local_pipeline_session.sagemaker_client.describe_model(ModelName=model_name)\n",
+ "print(outputs)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get output from the TransformStep\n",
+ "\n",
+ "transform_job_name = steps[\"PipelineExecutionSteps\"][4][\"Metadata\"][\"TransformJob\"][\"Arn\"]\n",
+ "outputs = local_pipeline_session.sagemaker_client.describe_transform_job(\n",
+ " TransformJobName=transform_job_name\n",
+ ")\n",
+ "print(outputs)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Transition to running pipeline as SageMaker Managed Pipeline\n",
+ "\n",
+ "We will now use a non-local PipelineSession object to re-run the Pipeline steps via SageMaker as a managed service. This will run all pipeline steps as SageMaker-managed processes. This will also allow us to view and track the results directly in the SageMaker Studio UI."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker.workflow.pipeline_context import PipelineSession\n",
+ "\n",
+ "pipeline_session = PipelineSession()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Recreate the SKLearnProcessor with non-local session\n",
+ "\n",
+ "framework_version = \"1.0-1\"\n",
+ "\n",
+ "sklearn_processor = SKLearnProcessor(\n",
+ " framework_version=framework_version,\n",
+ " instance_type=instance_type,\n",
+ " instance_count=processing_instance_count,\n",
+ " base_job_name=\"sklearn-abalone-process\",\n",
+ " role=role,\n",
+ " sagemaker_session=pipeline_session, # use non-local session\n",
+ ")\n",
+ "\n",
+ "processor_args = sklearn_processor.run(\n",
+ " inputs=[\n",
+ " ProcessingInput(source=input_data, destination=\"/opt/ml/processing/input\"),\n",
+ " ],\n",
+ " outputs=[\n",
+ " ProcessingOutput(output_name=\"train\", source=\"/opt/ml/processing/train\"),\n",
+ " ProcessingOutput(output_name=\"validation\", source=\"/opt/ml/processing/validation\"),\n",
+ " ProcessingOutput(output_name=\"test\", source=\"/opt/ml/processing/test\"),\n",
+ " ],\n",
+ " code=\"code/preprocessing.py\",\n",
+ ")\n",
+ "\n",
+ "step_process = ProcessingStep(name=\"AbaloneProcess\", step_args=processor_args)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(f\"image_uri: {image_uri}\")\n",
+ "print(f\"model_path: {model_path}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Recreate the Estimator instance with non-local session\n",
+ "\n",
+ "xgb_train = Estimator(\n",
+ " image_uri=image_uri,\n",
+ " entry_point=\"code/abalone.py\",\n",
+ " instance_type=instance_type,\n",
+ " instance_count=training_instance_count,\n",
+ " output_path=model_path,\n",
+ " role=role,\n",
+ " sagemaker_session=pipeline_session, # use non-local session\n",
+ ")\n",
+ "\n",
+ "xgb_train.set_hyperparameters(\n",
+ " objective=\"reg:squarederror\",\n",
+ " learning_rate=0.01,\n",
+ " num_round=50,\n",
+ " max_depth=5,\n",
+ " eta=0.2,\n",
+ " gamma=4,\n",
+ " min_child_weight=6,\n",
+ " subsample=0.7,\n",
+ ")\n",
+ "\n",
+ "train_args = xgb_train.fit(\n",
+ " inputs={\n",
+ " \"train\": TrainingInput(\n",
+ " s3_data=step_process.properties.ProcessingOutputConfig.Outputs[\"train\"].S3Output.S3Uri,\n",
+ " content_type=\"text/csv\",\n",
+ " ),\n",
+ " \"validation\": TrainingInput(\n",
+ " s3_data=step_process.properties.ProcessingOutputConfig.Outputs[\n",
+ " \"validation\"\n",
+ " ].S3Output.S3Uri,\n",
+ " content_type=\"text/csv\",\n",
+ " ),\n",
+ " }\n",
+ ")\n",
+ "\n",
+ "step_train = TrainingStep(\n",
+ " name=\"AbaloneTrain\",\n",
+ " step_args=train_args,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Recreate the Script Processor instance with non-local session\n",
+ "\n",
+ "script_eval = ScriptProcessor(\n",
+ " image_uri=image_uri,\n",
+ " command=[\"python3\"],\n",
+ " instance_type=instance_type,\n",
+ " instance_count=processing_instance_count,\n",
+ " base_job_name=\"script-abalone-eval\",\n",
+ " role=role,\n",
+ " sagemaker_session=pipeline_session, # use non-local session\n",
+ ")\n",
+ "\n",
+ "eval_args = script_eval.run(\n",
+ " inputs=[\n",
+ " ProcessingInput(\n",
+ " source=step_train.properties.ModelArtifacts.S3ModelArtifacts,\n",
+ " destination=\"/opt/ml/processing/model\",\n",
+ " ),\n",
+ " ProcessingInput(\n",
+ " source=step_process.properties.ProcessingOutputConfig.Outputs[\"test\"].S3Output.S3Uri,\n",
+ " destination=\"/opt/ml/processing/test\",\n",
+ " ),\n",
+ " ],\n",
+ " outputs=[\n",
+ " ProcessingOutput(output_name=\"evaluation\", source=\"/opt/ml/processing/evaluation\"),\n",
+ " ],\n",
+ " code=\"code/evaluation.py\",\n",
+ ")\n",
+ "\n",
+ "evaluation_report = PropertyFile(\n",
+ " name=\"EvaluationReport\", output_name=\"evaluation\", path=\"evaluation.json\"\n",
+ ")\n",
+ "\n",
+ "step_eval = ProcessingStep(\n",
+ " name=\"AbaloneEval\",\n",
+ " step_args=eval_args,\n",
+ " property_files=[evaluation_report],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Recreate the Model instance with non-local session\n",
+ "\n",
+ "model = Model(\n",
+ " image_uri=image_uri,\n",
+ " model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,\n",
+ " source_dir=\"code\",\n",
+ " entry_point=\"inference.py\",\n",
+ " role=role,\n",
+ " sagemaker_session=pipeline_session, # use non-local session\n",
+ ")\n",
+ "\n",
+ "step_create_model = ModelStep(\n",
+ " name=\"AbaloneCreateModel\", step_args=model.create(instance_type=instance_type)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Recreate the Transformer instance with non-local session\n",
+ "\n",
+ "transformer = Transformer(\n",
+ " model_name=step_create_model.properties.ModelName,\n",
+ " instance_type=instance_type,\n",
+ " instance_count=transform_instance_count,\n",
+ " output_path=f\"s3://{default_bucket}/{prefix}/transform\",\n",
+ " sagemaker_session=pipeline_session, # use non-local session\n",
+ ")\n",
+ "\n",
+ "transform_data = Join(\n",
+ " on=\"/\",\n",
+ " values=[\n",
+ " step_process.properties.ProcessingOutputConfig.Outputs[\"test\"].S3Output.S3Uri,\n",
+ " \"test.csv\",\n",
+ " ],\n",
+ ")\n",
+ "\n",
+ "transform_args = transformer.transform(transform_data, content_type=\"text/csv\")\n",
+ "\n",
+ "step_transform = TransformStep(name=\"AbaloneTransform\", step_args=transform_args)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Recreate the Step condition with new step instances\n",
+ "\n",
+ "step_cond = ConditionStep(\n",
+ " name=\"AbaloneMSECond\",\n",
+ " conditions=[cond_lte],\n",
+ " if_steps=[step_create_model, step_transform],\n",
+ " else_steps=[step_fail],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Now that all the Steps are re-defined, we create a new Managed Pipeline\n",
+ "\n",
+ "We add each of the recreated steps to a new Pipeline instance that we will run as a managed (in-the-cloud) pipeline."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Re-define the Pipeline using non-local session\n",
+ "\n",
+ "pipeline_name = f\"SM-Managed-Pipeline\"\n",
+ "\n",
+ "sm_pipeline = Pipeline(\n",
+ " name=pipeline_name,\n",
+ " parameters=[\n",
+ " input_data,\n",
+ " mse_threshold,\n",
+ " ],\n",
+ " steps=[step_process, step_train, step_eval, step_cond],\n",
+ " sagemaker_session=pipeline_session, # non-local session\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sm_pipeline.upsert(role_arn=role)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# start execution of SageMaker-managed pipeline\n",
+ "sm_execution = sm_pipeline.start()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sm_execution.wait(delay=60, max_attempts=60)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sm_execution.list_steps()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### (Optional) After a Pipeline Step completes, you can view the CloudWatch Log output \n",
+ "\n",
+ "In SageMaker Studio, navigate to the Pipelines component and find the specific execution that just completed. Under the 'Graph' tab on the left panel, select a particular step, like Training (AbaloneTrain in this example), then click the 'Logs' tab on the right panel, and click the 'view logs in CloudWatch console' link. This will open a new tab/window showing the log output from the Training job."
+ ]
+ },
+ {
+ "attachments": {
+ "blog-pipeline-local-mode-AbaloneTrain-logs-link.png": {
+ "image/png": ""
+ }
+ },
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "![blog-pipeline-local-mode-AbaloneTrain-logs-link.png](attachment:blog-pipeline-local-mode-AbaloneTrain-logs-link.png)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "instance_type": "ml.t3.medium",
+ "kernelspec": {
+ "display_name": "conda_python3",
+ "language": "python",
+ "name": "conda_python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/sagemaker-pipelines/tabular/train-register-deploy-pipeline-model/train register and deploy a pipeline model.ipynb b/sagemaker-pipelines/tabular/train-register-deploy-pipeline-model/train register and deploy a pipeline model.ipynb
index 2fbf96183d..2b7f52f0ef 100644
--- a/sagemaker-pipelines/tabular/train-register-deploy-pipeline-model/train register and deploy a pipeline model.ipynb
+++ b/sagemaker-pipelines/tabular/train-register-deploy-pipeline-model/train register and deploy a pipeline model.ipynb
@@ -1130,7 +1130,7 @@
"data = data.drop(\"medianHouseValue\", axis=1)\n",
"\n",
"pred_count = 10\n",
- "payload = data.iloc[:pred_count].to_string(header=False, index=False).replace(\" \", \",\")\n",
+ "payload = data.iloc[:pred_count].to_csv(header=False, index=False)\n",
"p = predictor.predict(payload, initial_args={\"ContentType\": \"text/csv\"})\n",
"print(p.decode(\"utf-8\"))"
]
@@ -1231,4 +1231,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
-}
\ No newline at end of file
+}
diff --git a/sagemaker-pipelines/time_series_forecasting/amazon_forecast_pipeline/sm_pipeline_with_amazon_forecast.ipynb b/sagemaker-pipelines/time_series_forecasting/amazon_forecast_pipeline/sm_pipeline_with_amazon_forecast.ipynb
index 4904fbbfe7..c49389ab19 100644
--- a/sagemaker-pipelines/time_series_forecasting/amazon_forecast_pipeline/sm_pipeline_with_amazon_forecast.ipynb
+++ b/sagemaker-pipelines/time_series_forecasting/amazon_forecast_pipeline/sm_pipeline_with_amazon_forecast.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "markdown",
- "id": "0c9ee48f",
+ "id": "91ee6b6d",
"metadata": {},
"source": [
"# Creating an Amazon Forecast Predictor with SageMaker Pipelines\n",
@@ -27,7 +27,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "932dc4d0",
+ "id": "86a4678e",
"metadata": {},
"outputs": [],
"source": [
@@ -52,7 +52,7 @@
},
{
"cell_type": "markdown",
- "id": "de12d5a8",
+ "id": "b0125efc",
"metadata": {},
"source": [
"Finally, you will need the following trust policies."
@@ -61,7 +61,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "957d4d1f",
+ "id": "8050bbca",
"metadata": {},
"outputs": [],
"source": [
@@ -81,7 +81,7 @@
},
{
"cell_type": "markdown",
- "id": "45298f90",
+ "id": "9ed30cce",
"metadata": {},
"source": [
"## Prerequisites\n",
@@ -95,7 +95,17 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "9ab8df52",
+ "id": "1d137518",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "! pip install sagemaker==2.93.0"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b763c7b0",
"metadata": {},
"outputs": [],
"source": [
@@ -135,7 +145,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "51f2beea",
+ "id": "ad1e02a1",
"metadata": {},
"outputs": [],
"source": [
@@ -189,7 +199,7 @@
},
{
"cell_type": "markdown",
- "id": "fff1a8b5",
+ "id": "23cb4c19",
"metadata": {},
"source": [
"## Dataset\n",
@@ -200,7 +210,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "e4367c4a",
+ "id": "d71ab7f2",
"metadata": {},
"outputs": [],
"source": [
@@ -226,7 +236,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "5a4a1d4a",
+ "id": "8ce30a2a",
"metadata": {},
"outputs": [],
"source": [
@@ -243,7 +253,7 @@
},
{
"cell_type": "markdown",
- "id": "db24153a",
+ "id": "f58aea79",
"metadata": {},
"source": [
"The dataset happens to span January 01, 2011, to January 01, 2015. We are only going to use about two and a half week's of hourly data to train Amazon Forecast. \n",
@@ -253,7 +263,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "ac099d3c",
+ "id": "724eee5f",
"metadata": {},
"outputs": [],
"source": [
@@ -262,7 +272,7 @@
},
{
"cell_type": "markdown",
- "id": "d114bd69",
+ "id": "18b87844",
"metadata": {},
"source": [
"Next, we define parameters that can be set for the execution of the pipeline. They serve as variables. We define the following:\n",
@@ -286,7 +296,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "f1d86c83",
+ "id": "52ba0c45",
"metadata": {},
"outputs": [],
"source": [
@@ -294,7 +304,6 @@
"processing_instance_type = ParameterString(\n",
" name=\"ProcessingInstanceType\", default_value=\"ml.m5.large\"\n",
")\n",
- "training_instance_count = ParameterInteger(name=\"TrainingInstanceCount\", default_value=1)\n",
"training_instance_type = ParameterString(name=\"TrainingInstanceType\", default_value=\"ml.m5.large\")\n",
"\n",
"input_train = ParameterString(\n",
@@ -312,7 +321,7 @@
},
{
"cell_type": "markdown",
- "id": "eff2dad9",
+ "id": "3a2ee68c",
"metadata": {},
"source": [
"We use an updated [SKLearnProcessor](https://sagemaker.readthedocs.io/en/stable/frameworks/sklearn/sagemaker.sklearn.html#sagemaker.sklearn.processing.SKLearnProcessor) to run Python scripts to build a dataset group and train an Amazon Forecast predictor using `boto3`. In the next chunk, we instantiate an instance of `ScriptProcessor`, which is essentially an SKLearnProcessor with updated `boto3` and `botocore` (as built above) that we use in the next steps. "
@@ -321,7 +330,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "11e82c55",
+ "id": "130c2059",
"metadata": {},
"outputs": [],
"source": [
@@ -336,7 +345,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "88fb293a",
+ "id": "2abf7b80",
"metadata": {},
"outputs": [],
"source": [
@@ -353,7 +362,7 @@
},
{
"cell_type": "markdown",
- "id": "5d40d2b1",
+ "id": "26bd50c0",
"metadata": {},
"source": [
"First we preprocess the data using an Amazon SageMaker [ProcessingStep](https://sagemaker.readthedocs.io/en/stable/workflows/pipelines/sagemaker.workflow.pipelines.html?highlight=ProcessingStep#sagemaker.workflow.steps.ProcessingStep) that provides a containerized execution environment to run the `preprocess.py` script."
@@ -362,7 +371,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "b5d84ca3",
+ "id": "aa0259f4",
"metadata": {},
"outputs": [],
"source": [
@@ -383,7 +392,7 @@
},
{
"cell_type": "markdown",
- "id": "6d4b1540",
+ "id": "6e05150d",
"metadata": {},
"source": [
"The next step is to train and evaluate the forecasting model calling Amazon Forecast using `boto3`. We instantiate an instance of `SKLearn` estimator that we use in the next `TrainingStep` to run the script `train.py`. \n",
@@ -394,7 +403,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "b80ada3f",
+ "id": "95177b2f",
"metadata": {},
"outputs": [],
"source": [
@@ -425,7 +434,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "b52d154a",
+ "id": "cf10e258",
"metadata": {},
"outputs": [],
"source": [
@@ -433,7 +442,6 @@
" entry_point=\"train.py\",\n",
" role=role_arn,\n",
" image_uri=container_image_uri,\n",
- " instance_count=training_instance_count,\n",
" instance_type=training_instance_type,\n",
" sagemaker_session=sagemaker_session,\n",
" base_job_name=\"forecast-train\",\n",
@@ -446,7 +454,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "f5ddecce",
+ "id": "82f5536a",
"metadata": {},
"outputs": [],
"source": [
@@ -455,7 +463,7 @@
},
{
"cell_type": "markdown",
- "id": "29f0d4d4",
+ "id": "867f2daf",
"metadata": {},
"source": [
"The third step is an Amazon SageMaker ProcessingStep that deletes or keeps the Amazon Forecast model running using the script `conditional_delete.py`. If the error reported after training is higher than a threshold you specify for the metric you specify, this step deletes all the resources created by Amazon Forecast that are related to the pipeline's execution.\n",
@@ -465,7 +473,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "43c79816",
+ "id": "f6122249",
"metadata": {},
"outputs": [],
"source": [
@@ -492,7 +500,7 @@
},
{
"cell_type": "markdown",
- "id": "41ef4915",
+ "id": "991697b7",
"metadata": {},
"source": [
"Finally, we combine all the steps and define our pipeline."
@@ -501,7 +509,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "7cf7b196",
+ "id": "fdc925a3",
"metadata": {},
"outputs": [],
"source": [
@@ -513,7 +521,6 @@
" parameters=[\n",
" processing_instance_type,\n",
" processing_instance_count,\n",
- " training_instance_count,\n",
" training_instance_type,\n",
" input_train,\n",
" forecast_horizon,\n",
@@ -532,7 +539,7 @@
},
{
"cell_type": "markdown",
- "id": "c838b490",
+ "id": "681b8721",
"metadata": {},
"source": [
"Once the pipeline is successfully defined, we can start the execution."
@@ -541,7 +548,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "1cbe62f1",
+ "id": "5b375f45",
"metadata": {},
"outputs": [],
"source": [
@@ -551,7 +558,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "35e9c22d",
+ "id": "b2fec897",
"metadata": {},
"outputs": [],
"source": [
@@ -561,7 +568,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "ccb70b34",
+ "id": "72464cc3",
"metadata": {},
"outputs": [],
"source": [
@@ -571,7 +578,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "a20d8f39",
+ "id": "e66f34a3",
"metadata": {},
"outputs": [],
"source": [
@@ -580,7 +587,7 @@
},
{
"cell_type": "markdown",
- "id": "1e285dfd",
+ "id": "c5c56dff",
"metadata": {},
"source": [
"## Experiments Tracking\n",
@@ -602,7 +609,7 @@
},
{
"cell_type": "markdown",
- "id": "067b7888",
+ "id": "a0030897",
"metadata": {},
"source": [
"## Conclusion"
@@ -610,7 +617,7 @@
},
{
"cell_type": "markdown",
- "id": "40a6ba7e",
+ "id": "132ad067",
"metadata": {},
"source": [
"In this notebook we have seen how to create a SageMaker Pipeline to train an Amazon Forecast predictor on your own dataset with a target and related time series."
@@ -618,7 +625,7 @@
},
{
"cell_type": "markdown",
- "id": "93c99720",
+ "id": "d10d8baf",
"metadata": {},
"source": [
"## Clean up\n",
@@ -629,7 +636,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "bc956081",
+ "id": "c8665320",
"metadata": {},
"outputs": [],
"source": [
@@ -654,7 +661,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "4c2e6e8a",
+ "id": "6a234cbf",
"metadata": {},
"outputs": [],
"source": [
@@ -670,7 +677,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "e50ca583",
+ "id": "b269f192",
"metadata": {},
"outputs": [],
"source": [
@@ -680,7 +687,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "82e5928a",
+ "id": "a8828ef6",
"metadata": {},
"outputs": [],
"source": [
@@ -690,7 +697,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "44212447",
+ "id": "abeda944",
"metadata": {},
"outputs": [],
"source": [
@@ -708,7 +715,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "41649cfd",
+ "id": "d64d4ae5",
"metadata": {},
"outputs": [],
"source": [
@@ -720,7 +727,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "a9fd48dc",
+ "id": "33d2861f",
"metadata": {},
"outputs": [],
"source": [
@@ -733,7 +740,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "cc00b557",
+ "id": "d0e4bd9d",
"metadata": {},
"outputs": [],
"source": [
@@ -744,7 +751,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "336300e0",
+ "id": "d9968b15",
"metadata": {},
"outputs": [],
"source": [
@@ -756,7 +763,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "8eed39f2",
+ "id": "1440c84e",
"metadata": {},
"outputs": [],
"source": [
@@ -767,9 +774,9 @@
"metadata": {
"instance_type": "ml.t3.medium",
"kernelspec": {
- "display_name": "Python 3 (ipykernel)",
+ "display_name": "conda_python3",
"language": "python",
- "name": "python3"
+ "name": "conda_python3"
},
"language_info": {
"codemirror_mode": {
@@ -781,7 +788,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.7.11"
+ "version": "3.8.12"
}
},
"nbformat": 4,
diff --git a/sagemaker-python-sdk/paddlepaddle_sentiment_analysis_byo_mms/Bring Your Own DL Framework to Amazon Sagemaker with Model Server for Apache MXNet's (MMS) BYO container.ipynb b/sagemaker-python-sdk/paddlepaddle_sentiment_analysis_byo_mms/Bring Your Own DL Framework to Amazon Sagemaker with Model Server for Apache MXNet's (MMS) BYO container.ipynb
deleted file mode 100644
index ab9f3d330b..0000000000
--- a/sagemaker-python-sdk/paddlepaddle_sentiment_analysis_byo_mms/Bring Your Own DL Framework to Amazon Sagemaker with Model Server for Apache MXNet's (MMS) BYO container.ipynb
+++ /dev/null
@@ -1,539 +0,0 @@
-{
- "cells": [
- {
- "attachments": {
- "image.png": {
- "image/png": ""
- }
- },
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Introduction\n",
- "\n",
- "Deep Learning frameworks enable Machine Learning (ML) practitioners to build and train ML models. However, the process of deploying ML models in production to serve predictions (also known as inferences) in real time is more complex. It requires that ML practitioners build a scalable and performant model server, which can host these models and handle inference requests at scale. Model Server for Apache MXNet (MMS), was developed to address this hurdle. MMS is a highly scalable, production ready inference server. MMS was designed in a ML/DL framework agnostic way to host models trained in any ML/DL framework.\n",
- "\n",
- "In this blog post, we will showcase how anyone could use Model Server for Apache MXNet (MMS) to host their model trained using any Machine Learning/Deep Learning (ML/DL) framework or tool kit in production. We chose Amazon Sagemaker service for production hosting - this PaaS solution does a lot of heavy lifting to provide infrastructure and allows users to focus on their use cases. We will be using 'Bring your own Inference code with Amazon Sagemaker hosting' approach, where users could bring their models together with all necessary dependencies, libraries, frameworks and other components compiled inside of a single custom-built docker container and host it on Sagemaker. \n",
- "\n",
- "To showcase the true 'ML/DL framework agnostic architecture' of MMS, we chose to launch a model trained with 'PaddlePaddle' framework into production.\n",
- "\n",
- "The overall picture of steps involved to take a model trained on any ML/DL framework to Amazon Sagemaker using MMS BYO container looks as follows:\n",
- "\n",
- "![image.png](attachment:image.png)\n",
- "\n",
- "As shown in the picture above, in order to bring your own ML/DL framework to Amazon Sagemaker using MMS Bring Your Own (BYO) container, we need two main components\n",
- "\n",
- "1. **Model artifacts/Model Archive**: These are all the artifacts required to run your model on a given host. This contains the following:\n",
- " 1. **Model files**, which are usually symbols and weights. These are the artifacts of training a model.\n",
- " 2. **Custom Service File**: This file contains the entry-point which gets called every time when inference request is received and served by MMS. This file generally contains the logic to initialize the model in a particular ML/DL framework, preprocess the incoming request, run inference in a particular ML/DL framework and post-process logic which takes the data coming out of framework's inference method and converts it to end-user consumable data.\n",
- " 3. **MANIFEST File**: This is the interface between custom service file and the MMS. This file is generated by running a tool that comes as part of MMS distribution, called 'model-archiver'.\n",
- "2. **Container artifact**: To load and run a model written in a custom DL framework on Sagemaker, you need to bring a container that will be run on Sagemaker service. In this document we will show how to use MMS base container and extend it to support custom DL frameworks and other model dependencies. The MMS base container is a docker container that comes with a highly scalable and performant model-server which is readily launchable onto Sagemaker service.\n",
- "In the following sections, we will see each of the above components in detail.\n",
- "\n",
- "## Preparing a Model\n",
- "MMS container is completely ML/DL framework agnostic. Users can write models in a ML/DL framework of their choice and bring it to Sagemaker with MMS BYO container to get the features of scalability and performance. In this blogpost, we chose to showcase this by bringing in a model written for PaddlePaddle framework. Lets look at how to prepare a PaddlePaddle model in the following sections. The model artifact is readily available at <*TODO: Update this with the S3 link with model.tar.gz*>.\n",
- "\n",
- "### Preparing Model Artifacts\n",
- "We are going to use [Understand Sentiment](https://github.com/PaddlePaddle/book/tree/develop/06.understand_sentiment) example that is available and published in examples section of PaddlePaddle repository. First of all we need to create a model. In order to do that we followed instructions provided in [PaddlePaddle/book](https://github.com/PaddlePaddle/book) repository: downloaded container and ran training by the notebook that is provided as part of the example. We used 'Stacked Bidirectional LSTM' network for our training and trained the model for 100 epochs. At the end of this training exercise, we get the following list of trained model artifacts.\n",
- "\n",
- "```bash\n",
- "!ls\n",
- "embedding_0.w_0 fc_2.w_0 fc_5.w_0 learning_rate_0 lstm_3.b_0 moment_10 moment_18 moment_25 moment_32 moment_8\n",
- "embedding_1.w_0 fc_2.w_1 fc_5.w_1 learning_rate_1 lstm_3.w_0 moment_11 moment_19 moment_26 moment_33 moment_9\n",
- "fc_0.b_0 fc_3.b_0 fc_6.b_0 lstm_0.b_0 lstm_4.b_0 moment_12 moment_2 moment_27 moment_34\n",
- "fc_0.w_0 fc_3.w_0 fc_6.w_0 lstm_0.w_0 lstm_4.w_0 moment_13 moment_20 moment_28 moment_35\n",
- "fc_1.b_0 fc_3.w_1 fc_6.w_1 lstm_1.b_0 lstm_5.b_0 moment_14 moment_21 moment_29 moment_4\n",
- "fc_1.w_0 fc_4.b_0 fc_7.b_0 lstm_1.w_0 lstm_5.w_0 moment_15 moment_22 moment_3 moment_5\n",
- "fc_1.w_1 fc_4.w_0 fc_7.w_0 lstm_2.b_0 moment_0 moment_16 moment_23 moment_30 moment_6\n",
- "fc_2.b_0 fc_5.b_0 fc_7.w_1 lstm_2.w_0 moment_1 moment_17 moment_24 moment_31 moment_7\n",
- "```\n",
- "\n",
- "These artifacts constitute a PaddlePaddle model. We copy these artifacts from within training container to localhost so that it will be easier to begin preparation of the model for production hosting. To learn more on how to copy files from inside a docker container to location outside of it please refer to [Docker CLI](https://docs.docker.com/engine/reference/commandline/cp/).\n",
- "\n",
- "### Writing Custom Service Code\n",
- "We now have model files required to host the model in production. We can now define a custom service file which knows how to use these files and also knows how to 'preprocess' the raw request coming into the server and how to 'postprocess' the responses coming out of the PaddlePaddle framework's 'infer' method. For this, we modified the notebook example written to test the trained model **. Let's look at some code. \n",
- "\n",
- "We created a custom service file called 'paddle_sentiment_analysis.py'. Here, we first define a class called 'PaddleSentimentAnalysis' which contains methods to initialize the model and also defines pre-processing, post-processing and inference methods. Refer [Custom Service Code](https://github.com/awslabs/mxnet-model-server/blob/master/docs/custom_service.md) document to learn how to write your custom-service code. The skeleton of this file is as follows:\n",
- "\n",
- "```bash\n",
- "$ cat paddle_sentiment_analysis.py\n",
- "```\n",
- "```python\n",
- "\n",
- "from __future__ import print_function\n",
- "import paddle\n",
- "import paddle.fluid as fluid\n",
- "import paddle.dataset as dataset\n",
- "from functools import partial\n",
- "\n",
- " \n",
- "class PaddleSentimentAnalysis(object):\n",
- " def __init__(self):\n",
- " ...\n",
- "\n",
- " def initialize(self, context):\n",
- " \"\"\"\n",
- " This method is used to initialize the network and read other artifacts.\n",
- " \"\"\"\n",
- " ...\n",
- " \n",
- " def preprocess(self, data):\n",
- " \"\"\"\n",
- " This method is used to convert the string requests coming from client \n",
- " into tensors. \n",
- " \"\"\"\n",
- " ...\n",
- "\n",
- " def inference(self, input):\n",
- " \"\"\"\n",
- " This method runs the tensors created in preprocess method through the \n",
- " DL framework's infer method.\n",
- " \"\"\"\n",
- " ...\n",
- "\n",
- " def postprocess(self, output, data):\n",
- " \"\"\"\n",
- " Here the values returned from the inference method is converted to a \n",
- " human understandable response.\n",
- " \"\"\"\n",
- " ...\n",
- " \n",
- "\n",
- "_service = PaddleSentimentAnalysis()\n",
- "\n",
- "\n",
- "def handle(data, context):\n",
- "\"\"\"\n",
- "This method is the entrypoint \\\"handler\\\" method that is used by MMS.\n",
- "Any request coming in for this model will be sent to this method.\n",
- "\"\"\"\n",
- " if not _service.initialized:\n",
- " _service.initialize(context)\n",
- "\n",
- " if data is None:\n",
- " return None\n",
- "\n",
- " pre = _service.preprocess(data)\n",
- " inf = _service.inference(pre)\n",
- " ret = _service.postprocess(inf, data)\n",
- " return ret\n",
- "```"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Note about Permissions\n",
- "Running this notebook requires permissions in addition to the normal **SageMakerFullAccess** permissions. This is because we'll creating new repositories in Amazon ECR. The easiest way to add these permissions is simply to add the managed policy **AmazonEC2ContainerRegistryFullAccess** to the role that you used to start your notebook instance. There's no need to restart your notebook instance when you do this, the new permissions will be available immediately."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Creating Model artifact file to be hosted on sagemaker\n",
- "In order to load this model onto Sagemaker platform with MMS BYO container, we need to do the following:\n",
- "\n",
- "1. Create a MANIFEST file, which is used by MMS as a model's metadata to load and run the model.\n",
- "2. Add the above custom-service file and the trained model-artifacts, along with the MANIFEST file, to a .tar.gz file.\n",
- "\n",
- "Let's use 'model-archiver' tool, to accomplish the above points. Before we use the tool to create a ''.tar.gz' artifact, we need to collect all the model artifacts, including the custom-service-file mentioned above, into a separate folder. For ease of getting started, we have uploaded all the model artifacts onto an [S3 bucket](https://s3.amazonaws.com/model-server/blog_artifacts/PaddlePaddle_blog/sentiment.tar.gz). Lets run the following commands to get this artifact onto your host:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "!(curl https://s3.amazonaws.com/model-server/blog_artifacts/PaddlePaddle_blog/artifacts.tgz | tar zxvf -) 2>/dev/null"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "!ls -R artifacts/sentiment"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Now that we have the model artifacts, let's convert this to a model artifact that can be hosted on Sagemaker. \n",
- "\n",
- "### Prerequisites\n",
- "Before we proceed with preparing a Sagemaker model-artifact and endpoint, we need the following:\n",
- "#### Software packages and tools\n",
- "1. pip\n",
- "1. Docker\n",
- "1. Model-archiver tool\n",
- "1. Sagemaker SDK\n",
- "1. Boto3 \n",
- "\n",
- "#### AWS user account with following permissions\n",
- "We will need AWS account user with permissions to \n",
- "1. Create roles (or access to an already existing Sagemaker role)\n",
- "2. Create Sagemaker Endpoint\n",
- "3. Create an ECR repository and upload a container to the repository\n",
- "4. Create an S3 bucket and upload an artifact to S3 bucket"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We are now ready to create a sagemaker model artifact. For this, we use the \"model-archiver\" tool to create a Sagemaker model artifact. "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "!pip install -U mxnet-model-server"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "!model-archiver -f --model-name paddle_sentiment \\\n",
- "--handler paddle_sentiment_analysis:handle \\\n",
- "--model-path artifacts/sentiment --export-path . --archive-format tgz"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The above command would create an model artifact called `paddle_sentiment.tar.gz`, which we will use to host our endpoint. Let's verify if this model artifact is created."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "!ls"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Next let's take a look at how to build a container with it and bring it into Sagemaker.\n",
- "\n",
- "### Building your own BYO container with MMS\n",
- "\n",
- "In this section, we build our own MMS based container which can be brought onto Sagemaker (also known as BYO Container).\n",
- "\n",
- "To help with this process, every released version of MMS comes with a corresponding MMS base container, hosted on [DockerHub](https://hub.docker.com/r/awsdeeplearningteam/mxnet-model-server/tags) which can be hosted on the Sagemaker platform.\n",
- "\n",
- "For this example, we will use container tagged *awsdeeplearningteam/mxnet-model-server:base_cpu_py3.6*. To host the model created in the above section, we need to install 'PaddlePaddle' and 'numpy' packages in the container. This can be done by creating a Dockerfile which extends from the base MMS image and installs the above python packages. Here is how its content should look like:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "!cat artifacts/Dockerfile.paddle.mms"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Now that we have Dockerfile that describes our BYO container let's build it:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "!cd artifacts && docker build -t paddle-mms -f Dockerfile.paddle.mms ."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Creating Sagemaker endpoint with PaddlePaddle model\n",
- "Before we go on and create a Sagemaker endpoint for our model, we need to do some preparations:\n",
- "\n",
- "### Upload the Sagemaker model artifact to a S3 bucket\n",
- "Upload the model archive **sentiment.tar.gz** created above to a S3 bucket. Here we uploaded it to the S3 bucket called paddle_paddle. "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import boto3, os, uuid\n",
- "\n",
- "s3 = boto3.resource(\"s3\")\n",
- "s3_bucket_name = \"paddle-sentiment-model-\" + str(uuid.uuid1())\n",
- "local_model_artifact = s3_model_artifact = \"paddle_sentiment.tar.gz\""
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Now lets create a bucket called **paddle-sentiment-model**. Here is where we will copy the model, **paddle_sentiment.tar.gz**, that we had created above."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import sagemaker\n",
- "from sagemaker import get_execution_role\n",
- "import boto3\n",
- "from botocore.exceptions import ClientError\n",
- "import json\n",
- "\n",
- "sess = sagemaker.Session()\n",
- "account = sess.boto_session.client(\"sts\").get_caller_identity()[\"Account\"]\n",
- "region = sess.boto_session.region_name\n",
- "\n",
- "s3.create_bucket(Bucket=s3_bucket_name, CreateBucketConfiguration={\"LocationConstraint\": region})"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "s3.meta.client.upload_file(local_model_artifact, s3_bucket_name, s3_model_artifact)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We now have **paddle_sentiment.tar.gz** on S3 in our account. Now let's look at having the container that we built on ECR, so that we can go ahead and set up our Sagemaker Endpoint."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Upload the container image to ECR\n",
- "We had built an image called **paddle-mms** above. We need to upload this to a Amazon ECR in our account."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "%%sh\n",
- "\n",
- "# The name of our algorithm\n",
- "algorithm_name=paddle-mms\n",
- "\n",
- "account=$(aws sts get-caller-identity --query Account --output text)\n",
- "\n",
- "# Get the region defined in the current configuration (default to us-west-2 if none defined)\n",
- "region=$(aws configure get region)\n",
- "# specifically setting to us-east-1 since during the pre-release period, we support only that region.\n",
- "region=${region:-us-east-1}\n",
- "\n",
- "echo \"region is \" $region\n",
- "\n",
- "fullname=\"${account}.dkr.ecr.${region}.amazonaws.com/${algorithm_name}:latest\"\n",
- "\n",
- "echo $fullname\n",
- "# If the repository doesn't exist in ECR, create it.\n",
- "\n",
- "aws ecr describe-repositories --repository-names \"${algorithm_name}\" > /dev/null 2>&1\n",
- "\n",
- "if [ $? -ne 0 ]\n",
- "then\n",
- " aws ecr create-repository --repository-name \"${algorithm_name}\" > /dev/null\n",
- "fi\n",
- "\n",
- "# Get the login command from ECR and execute it directly\n",
- "$(aws ecr get-login --region ${region} --no-include-email)\n",
- "\n",
- "# Build the docker image locally with the image name and then push it to ECR\n",
- "# with the full name.\n",
- "\n",
- "docker tag ${algorithm_name}:latest ${fullname}\n",
- "\n",
- "docker push ${fullname}"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "This pushes the \"paddle-mms\" container to Amazon ECR in your account."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Creating Sagemaker Endpoint\n",
- "Now that the model and container artifacts are uploaded onto S3 and ECR respectively, we can go ahead and create Sagemaker endpoint. To do that we need to complete following steps\n",
- "\n",
- "\n",
- "#### Sagemaker role\n",
- "\n",
- "Before we go onto create an Sagemaker endpoint, we need to setup an IAM role which has **AmazonSageMakerFullAccess** and **AmazonS3FullAccess** and **AmazonEC2ContainerRegistryFullAccess** policy attached to it. "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import sagemaker\n",
- "from sagemaker import get_execution_role\n",
- "import boto3\n",
- "from botocore.exceptions import ClientError\n",
- "import json\n",
- "\n",
- "sess = sagemaker.Session()\n",
- "account = sess.boto_session.client(\"sts\").get_caller_identity()[\"Account\"]\n",
- "region = sess.boto_session.region_name\n",
- "# NOTE: If you already have a sagemaker execution role created with above attached policies, use it instead of calling get_execution_role()\n",
- "sm_role = get_execution_role()\n",
- "inference_image = \"{}.dkr.ecr.{}.amazonaws.com/paddle-mms:latest\".format(account, region)\n",
- "s3_url = \"s3://{}/{}\".format(s3_bucket_name, s3_model_artifact)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We created the role required to launch our Sagemaker endpoint above. Now let's use the Sagemaker SDK to launch an endpoint."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "inf_handler = None"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from sagemaker.model import Model\n",
- "\n",
- "endpoint = \"PaddleSentiment\"\n",
- "paddle_model = Model(model_data=s3_url, image=inference_image, role=sm_role)\n",
- "try:\n",
- " inf_handler = paddle_model.deploy(1, \"ml.m4.xlarge\", endpoint_name=endpoint)\n",
- "except ClientError as e:\n",
- " if \"ValidationException\" == e.response[\"Error\"][\"Code\"]:\n",
- " print('The endpoint \"{}\"already exists'.format(endpoint))\n",
- " pass\n",
- " else:\n",
- " raise"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "This creats an sagemaker endpoint using the model artifact \"paddle_sentiment.tar.gz\".\n",
- "\n",
- "### Testing the endpoint\n",
- "Let's test the endpoint. To do this, we will send a movie review to the endpoint \"paddle-sentiment\"."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from sagemaker.predictor import (\n",
- " json_serializer,\n",
- " csv_serializer,\n",
- " json_deserializer,\n",
- " RealTimePredictor,\n",
- ")\n",
- "\n",
- "predictor = RealTimePredictor(endpoint=endpoint, sagemaker_session=sess)\n",
- "\n",
- "message = \"This is an amazing movie.\"\n",
- "print(predictor.predict(message).decode(\"utf-8\"))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "You would get a response showing that the review was positive.\n",
- "### Delete Endpoint\n",
- "After testing your endpoint, you could delete the endpoint you created as follows."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sess.delete_endpoint(endpoint)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Conclusion\n",
- "We have just shown how to build and host PaddlePaddle model on Sagemaker using MMS BYO container. This flow can be reused with minor modifications in order to build BYO containers serving inference traffic on Sagemaker endpoints with MMS for models built using many ML/DL frameworks, not just PaddlePaddle."
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "conda_python3",
- "language": "python",
- "name": "conda_python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.5"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/sagemaker-python-sdk/pytorch_mnist/pytorch_mnist.ipynb b/sagemaker-python-sdk/pytorch_mnist/pytorch_mnist.ipynb
index 4c78865757..7c91e84339 100644
--- a/sagemaker-python-sdk/pytorch_mnist/pytorch_mnist.ipynb
+++ b/sagemaker-python-sdk/pytorch_mnist/pytorch_mnist.ipynb
@@ -69,7 +69,7 @@
"sagemaker_session = sagemaker.Session()\n",
"\n",
"bucket = sagemaker_session.default_bucket()\n",
- "prefix = 'sagemaker/DEMO-pytorch-mnist'\n",
+ "prefix = \"sagemaker/DEMO-pytorch-mnist\"\n",
"\n",
"role = sagemaker.get_execution_role()"
]
@@ -114,11 +114,11 @@
"MNIST.mirrors = [\"https://sagemaker-sample-files.s3.amazonaws.com/datasets/image/MNIST/\"]\n",
"\n",
"MNIST(\n",
- " 'data',\n",
+ " \"data\",\n",
" download=True,\n",
" transform=transforms.Compose(\n",
" [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]\n",
- " )\n",
+ " ),\n",
")"
]
},
@@ -144,8 +144,8 @@
}
],
"source": [
- "inputs = sagemaker_session.upload_data(path='data', bucket=bucket, key_prefix=prefix)\n",
- "print('input spec (in this case, just an S3 path): {}'.format(inputs))"
+ "inputs = sagemaker_session.upload_data(path=\"data\", bucket=bucket, key_prefix=prefix)\n",
+ "print(\"input spec (in this case, just an S3 path): {}\".format(inputs))"
]
},
{
@@ -202,16 +202,15 @@
"source": [
"from sagemaker.pytorch import PyTorch\n",
"\n",
- "estimator = PyTorch(entry_point='mnist.py',\n",
- " role=role,\n",
- " py_version='py3',\n",
- " framework_version='1.8.0',\n",
- " instance_count=2,\n",
- " instance_type='ml.c5.2xlarge',\n",
- " hyperparameters={\n",
- " 'epochs': 1,\n",
- " 'backend': 'gloo'\n",
- " })"
+ "estimator = PyTorch(\n",
+ " entry_point=\"mnist.py\",\n",
+ " role=role,\n",
+ " py_version=\"py38\",\n",
+ " framework_version=\"1.11.0\",\n",
+ " instance_count=2,\n",
+ " instance_type=\"ml.c5.2xlarge\",\n",
+ " hyperparameters={\"epochs\": 1, \"backend\": \"gloo\"},\n",
+ ")"
]
},
{
@@ -532,7 +531,7 @@
}
],
"source": [
- "estimator.fit({'training': inputs})"
+ "estimator.fit({\"training\": inputs})"
]
},
{
@@ -562,7 +561,7 @@
}
],
"source": [
- "predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')"
+ "predictor = estimator.deploy(initial_instance_count=1, instance_type=\"ml.m4.xlarge\")"
]
},
{
@@ -600,16 +599,16 @@
"metadata": {},
"outputs": [],
"source": [
- "import gzip \n",
+ "import gzip\n",
"import numpy as np\n",
"import random\n",
"import os\n",
"\n",
- "data_dir = 'data/MNIST/raw'\n",
+ "data_dir = \"data/MNIST/raw\"\n",
"with gzip.open(os.path.join(data_dir, \"t10k-images-idx3-ubyte.gz\"), \"rb\") as f:\n",
" images = np.frombuffer(f.read(), np.uint8, offset=16).reshape(-1, 28, 28).astype(np.float32)\n",
"\n",
- "mask = random.sample(range(len(images)), 16) # randomly select some of the test images\n",
+ "mask = random.sample(range(len(images)), 16) # randomly select some of the test images\n",
"mask = np.array(mask, dtype=np.int)\n",
"data = images[mask]"
]
@@ -710,9 +709,7 @@
"metadata": {},
"outputs": [],
"source": [
- "sagemaker_session.delete_endpoint(\n",
- " endpoint_name = predictor.endpoint_name\n",
- ")"
+ "sagemaker_session.delete_endpoint(endpoint_name=predictor.endpoint_name)"
]
}
],
diff --git a/sagemaker_batch_transform/introduction_to_batch_transform/Dockerfile b/sagemaker_batch_transform/introduction_to_batch_transform/Dockerfile
index c72b3e416f..9509a739c4 100644
--- a/sagemaker_batch_transform/introduction_to_batch_transform/Dockerfile
+++ b/sagemaker_batch_transform/introduction_to_batch_transform/Dockerfile
@@ -6,9 +6,33 @@ RUN apt-get -y update && apt-get install -y --no-install-recommends \
wget \
r-base \
r-base-dev \
- ca-certificates
+ ca-certificates
-RUN R -e "install.packages(c('dbscan', 'plumber'), repos='https://cloud.r-project.org')"
+RUN R -e "install.packages(c('Rcpp', 'BH', 'R6', 'jsonlite', 'crayon'), repos='https://cloud.r-project.org')"
+
+RUN wget http://cran.r-project.org/src/contrib/Archive/stringi/stringi_1.2.4.tar.gz
+RUN R CMD INSTALL stringi_1.2.4.tar.gz
+
+RUN wget http://cran.r-project.org/src/contrib/Archive/rlang/rlang_0.2.2.tar.gz
+RUN R CMD INSTALL rlang_0.2.2.tar.gz
+
+RUN wget http://cran.r-project.org/src/contrib/Archive/magrittr/magrittr_1.5.tar.gz
+RUN R CMD INSTALL magrittr_1.5.tar.gz
+
+RUN wget http://cran.r-project.org/src/contrib/Archive/later/later_0.7.5.tar.gz
+RUN R CMD INSTALL later_0.7.5.tar.gz
+
+RUN wget http://cran.r-project.org/src/contrib/Archive/promises/promises_1.0.1.tar.gz
+RUN R CMD INSTALL promises_1.0.1.tar.gz
+
+RUN wget http://cran.r-project.org/src/contrib/Archive/httpuv/httpuv_1.4.4.2.tar.gz
+RUN R CMD INSTALL httpuv_1.4.4.2.tar.gz
+
+RUN wget http://cran.r-project.org/src/contrib/Archive/dbscan/dbscan_1.1-2.tar.gz
+RUN R CMD INSTALL dbscan_1.1-2.tar.gz
+
+RUN wget http://cran.r-project.org/src/contrib/Archive/plumber/plumber_0.4.6.tar.gz
+RUN R CMD INSTALL plumber_0.4.6.tar.gz
COPY dbscan.R /opt/ml/dbscan.R
COPY plumber.R /opt/ml/plumber.R
diff --git a/sagemaker_batch_transform/introduction_to_batch_transform/batch_transform_pca_dbscan_movie_clusters.ipynb b/sagemaker_batch_transform/introduction_to_batch_transform/batch_transform_pca_dbscan_movie_clusters.ipynb
index dab8931db9..420935fe05 100644
--- a/sagemaker_batch_transform/introduction_to_batch_transform/batch_transform_pca_dbscan_movie_clusters.ipynb
+++ b/sagemaker_batch_transform/introduction_to_batch_transform/batch_transform_pca_dbscan_movie_clusters.ipynb
@@ -261,7 +261,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Now, we'll setup to split our dataset into train and test. Dimensionality reduction and clustering don't always require a holdout set to test accuracy, but it will allow us to illustrate how batch prediction might be used when new data arrives. In this case, our test dataset will be a simple 10% sample of items."
+ "Now, we'll set up a split of our dataset into train and test sets. Dimensionality reduction and clustering don't always require a holdout set to test accuracy, but it will allow us to illustrate how batch prediction might be used when new data arrives. In this case, our test dataset will be a simple 0.5% sample of items."
]
},
{
@@ -270,7 +270,7 @@
"metadata": {},
"outputs": [],
"source": [
- "test_products = products.sample(frac=0.1)\n",
+ "test_products = products.sample(frac=0.005)\n",
"train_products = products[~(products.index.isin(test_products.index))]"
]
},
diff --git a/sagemaker_batch_transform/introduction_to_batch_transform/dbscan.R b/sagemaker_batch_transform/introduction_to_batch_transform/dbscan.R
index c19cf3f5fe..7dba606c3f 100644
--- a/sagemaker_batch_transform/introduction_to_batch_transform/dbscan.R
+++ b/sagemaker_batch_transform/introduction_to_batch_transform/dbscan.R
@@ -69,7 +69,7 @@ parse_file <- function(file) {
# Second helper function for apply
parse_json <- function(line) {
if (validate(line)) {
- return(do.call(rbind, fromJSON(line)[['projections']][[1]]))}}
+ return(do.call(rbind, fromJSON(line)))}}
# Setup scoring function
diff --git a/sagemaker_batch_transform/introduction_to_batch_transform/plumber.R b/sagemaker_batch_transform/introduction_to_batch_transform/plumber.R
index 1a884f083c..6857c2e474 100644
--- a/sagemaker_batch_transform/introduction_to_batch_transform/plumber.R
+++ b/sagemaker_batch_transform/introduction_to_batch_transform/plumber.R
@@ -47,4 +47,4 @@ parse_file <- function(file) {
# Second helper function for apply
parse_json <- function(line) {
if (validate(line)) {
- return(do.call(rbind, fromJSON(line)[['projections']][[1]]))}}
+ return(do.call(rbind, fromJSON(line)))}}
diff --git a/sagemaker_batch_transform/tensorflow_open-images_tfrecord/tensorflow-serving-tfrecord.cli.ipynb b/sagemaker_batch_transform/tensorflow_open-images_tfrecord/tensorflow-serving-tfrecord.cli.ipynb
index 48f7427362..678a38c58c 100644
--- a/sagemaker_batch_transform/tensorflow_open-images_tfrecord/tensorflow-serving-tfrecord.cli.ipynb
+++ b/sagemaker_batch_transform/tensorflow_open-images_tfrecord/tensorflow-serving-tfrecord.cli.ipynb
@@ -270,7 +270,7 @@
"SPLIT_TYPE=\"TFRecord\"\n",
"BATCH_STRATEGY=\"SingleRecord\"\n",
"\n",
- "# Join outputs by newline characters. This will make JSONLines output, since each output is JSON.\n",
+ "# Join outputs by newline characters. This will make JSON Lines output, since each output is JSON.\n",
"ASSEMBLE_WITH=\"Line\"\n",
"\n",
"# The Data Source tells Batch to get all objects under the S3 prefix.\n",
diff --git a/sagemaker_batch_transform/working_with_tfrecords/working-with-tfrecords.ipynb b/sagemaker_batch_transform/working_with_tfrecords/working-with-tfrecords.ipynb
index ec20e480c9..ebe06ddfbe 100644
--- a/sagemaker_batch_transform/working_with_tfrecords/working-with-tfrecords.ipynb
+++ b/sagemaker_batch_transform/working_with_tfrecords/working-with-tfrecords.ipynb
@@ -50,7 +50,7 @@
"import sagemaker\n",
"import tensorflow as tf\n",
"\n",
- "bucket = \"\"\n",
+ "bucket = sagemaker.Session().default_bucket()\n",
"training_prefix = \"training\"\n",
"batch_input_prefix = \"batch_input\"\n",
"batch_output_prefix = \"batch_output\"\n",
@@ -292,7 +292,7 @@
"tf_serving_model = Model(\n",
" model_data=estimator.model_data,\n",
" role=sagemaker.get_execution_role(),\n",
- " image=estimator.image_name,\n",
+ " image_uri=estimator.image_uri,\n",
" framework_version=estimator.framework_version,\n",
" sagemaker_session=estimator.sagemaker_session,\n",
")\n",
diff --git a/sagemaker_processing/fairness_and_explainability/fairness_and_explainability.ipynb b/sagemaker_processing/fairness_and_explainability/fairness_and_explainability.ipynb
index 7831728e60..99fa1fe0bc 100644
--- a/sagemaker_processing/fairness_and_explainability/fairness_and_explainability.ipynb
+++ b/sagemaker_processing/fairness_and_explainability/fairness_and_explainability.ipynb
@@ -50,7 +50,7 @@
"1. Explaining the importance of the various input features on the model's decision\n",
"1. Accessing the reports through SageMaker Studio if you have an instance set up.\n",
"\n",
- "In doing so, the notebook first trains a [SageMaker XGBoost](https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html) model using training dataset, then use SageMaker Clarify to analyze a testing dataset in CSV format. SageMaker Clarify also supports analyzing dataset in [SageMaker JSONLines dense format](https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-inference.html#common-in-formats), which is illustrated in [another notebook](https://github.com/aws/amazon-sagemaker-examples/blob/master/sagemaker_processing/fairness_and_explainability/fairness_and_explainability_jsonlines_format.ipynb)."
+ "In doing so, the notebook first trains a [SageMaker XGBoost](https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html) model using the training dataset, then uses SageMaker Clarify to analyze a testing dataset in CSV format. SageMaker Clarify also supports analyzing datasets in [SageMaker JSON Lines dense format](https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-inference.html#common-in-formats), which is illustrated in [another notebook](https://github.com/aws/amazon-sagemaker-examples/blob/master/sagemaker_processing/fairness_and_explainability/fairness_and_explainability_jsonlines_format.ipynb)."
]
},
{
diff --git a/sagemaker_processing/fairness_and_explainability/fairness_and_explainability_byoc.ipynb b/sagemaker_processing/fairness_and_explainability/fairness_and_explainability_byoc.ipynb
index cfa55ef88d..df31b3060e 100644
--- a/sagemaker_processing/fairness_and_explainability/fairness_and_explainability_byoc.ipynb
+++ b/sagemaker_processing/fairness_and_explainability/fairness_and_explainability_byoc.ipynb
@@ -422,9 +422,9 @@
"* At container startup, the script initializes an estimator using the model file provided by the client side deploy() method. The model directory and model file name are the same as in the `train` script.\n",
"* Once started, the server is ready to serve inference requests. The logic resides in the `predict` method,\n",
" * Input validation. The example container supports the same MIME types as Clarify job does, i.e., `text/csv` and `application/jsonlines`.\n",
- " * Parse payload. Clarify job may send **batch requests** to the container for better efficiency, i.e., the payload can have multiple lines and each is a sample. So, the method decodes request payload and then split lines, then loads the lines according to the content type. For JSONLines content, the method uses a key \"features\" to extract the list of features from a JSON line. The key shall be the same as the one defined in your Clarify job analysis configuration `predictor.content_template`. It is a **contract** between the Clarify job and the container, here you can change it to something else, like \"attributes\", but remember to update the `predictor.content_template` configuration accordingly.\n",
+ " * Parse payload. Clarify job may send **batch requests** to the container for better efficiency, i.e., the payload can have multiple lines and each is a sample. So, the method decodes the request payload, splits it into lines, and then loads the lines according to the content type. For JSON Lines content, the method uses a key \"features\" to extract the list of features from a JSON line. The key shall be the same as the one defined in your Clarify job analysis configuration `predictor.content_template`. It is a **contract** between the Clarify job and the container; here you can change it to something else, like \"attributes\", but remember to update the `predictor.content_template` configuration accordingly.\n",
" * Do prediction. The method gets the probability scores instead of binary labels, because scores are better for feature explainability.\n",
- " * Format output. For a **batch request**, Clarify job expects the same number of result lines as the number of samples in the request. So, the method encodes each prediction and then join them by line-break. For JSONLines accept type, the method uses two keys \"predicted_label\" and \"score\" to indicate the prediction. The keys shall be the same as your Clarify job analysis configuration `predictor.label` and `predictor.probability`, and they are used by the Clarify job to extract predictions from container response payload. The keys are **contracts** between the Clarify job and the container, here you can change them to something else, but remember to update the analysis configuration accordingly.\n",
+ " * Format output. For a **batch request**, Clarify job expects the same number of result lines as the number of samples in the request. So, the method encodes each prediction and then joins them with line breaks. For JSON Lines accept type, the method uses two keys \"predicted_label\" and \"score\" to indicate the prediction. The keys shall be the same as your Clarify job analysis configuration `predictor.label` and `predictor.probability`, and they are used by the Clarify job to extract predictions from container response payload. The keys are **contracts** between the Clarify job and the container; here you can change them to something else, but remember to update the analysis configuration accordingly.\n",
"\n",
"Similarly, the script is built from scratch for demonstration purpose. In a real project, you can utilize [SageMaker Inference Toolkit](https://github.com/aws/sagemaker-inference-toolkit) which implements a model serving stack built on [Multi Model Server](https://github.com/awslabs/multi-model-server), and it can serve your own models or those you trained on SageMaker using Machine Learning frameworks with native SageMaker support."
]
@@ -473,7 +473,7 @@
"python serve --model_dir \n",
"```\n",
"\n",
- "Upon successful execution, the script should be listening on local host port `8080` for inference requests. The following cell generates a few CURL commands to send inference requests (both CSV and JSONLines) to the port. You can copy&paste them to your local terminal for execution, to hit the port and trigger the inference code. For a single sample request, the command should output only one result, and for a batch request, the command should output the same number of results (lines) as the number of samples in the request."
+ "Upon successful execution, the script should be listening on local host port `8080` for inference requests. The following cell generates a few CURL commands to send inference requests (both CSV and JSON Lines) to the port. You can copy&paste them to your local terminal for execution, to hit the port and trigger the inference code. For a single sample request, the command should output only one result, and for a batch request, the command should output the same number of results (lines) as the number of samples in the request."
]
},
{
@@ -923,13 +923,13 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "There are three scenarios where Clarify handles data types, and they all support both CSV (`text/csv`) and JSONLines (`application/jsonlines`).\n",
+ "There are three scenarios where Clarify handles data types, and they all support both CSV (`text/csv`) and JSON Lines (`application/jsonlines`).\n",
"\n",
"* dataset type: the MIME type of the dataset and SHAP baseline.\n",
"* content type: the MIME type of the shadow endpoint request payload\n",
"* accept type: the MIME type of the shadow endpoint response payload\n",
"\n",
- "The Clarify jobs in this notebook always uses CSV for dataset type, but you can choose for the other two. The following code chose JSONLines for both, but it is fine if you change one of them or both of them to CSV, because CSV and JSONLines are supported by the customer container as well."
+ "The Clarify jobs in this notebook always use CSV for dataset type, but you can choose for the other two. The following code chooses JSON Lines for both, but it is fine if you change one of them or both of them to CSV, because CSV and JSON Lines are supported by the customer container as well."
]
},
{
@@ -991,7 +991,7 @@
"A [ModelConfig](https://sagemaker.readthedocs.io/en/stable/api/training/processing.html#sagemaker.clarify.ModelConfig) object communicates information about your trained model. To avoid additional traffic to your production models, SageMaker Clarify sets up and tears down a dedicated endpoint when processing.\n",
"* `instance_type` and `instance_count` specify your preferred instance type and instance count used to run your model on during SageMaker Clarify's processing. The testing dataset is small so a single standard instance is good enough to run this example. If you have a large complex dataset, you may want to use a better instance type to speed up, or add more instances to enable Spark parallelization.\n",
"* `accept_type` denotes the endpoint response payload format, and `content_type` denotes the payload format of request to the endpoint.\n",
- "* `content_template` is used by SageMaker Clarify to compose the request payload if the content type is JSONLines. To be more specific, the placeholder `$features` will be replaced by the features list from samples. For example, the first sample of the test dataset is `25,2,226802,1,7,4,6,3,2,1,0,0,40,37`, so the corresponding request payload is `'{\"features\":[25,2,226802,1,7,4,6,3,2,1,0,0,40,37]}'`, which conforms to [SageMaker JSONLines dense format](https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-inference.html#common-in-formats)."
+ "* `content_template` is used by SageMaker Clarify to compose the request payload if the content type is JSON Lines. To be more specific, the placeholder `$features` will be replaced by the features list from samples. For example, the first sample of the test dataset is `25,2,226802,1,7,4,6,3,2,1,0,0,40,37`, so the corresponding request payload is `'{\"features\":[25,2,226802,1,7,4,6,3,2,1,0,0,40,37]}'`, which conforms to [SageMaker JSON Lines dense format](https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-inference.html#common-in-formats)."
]
},
{
@@ -1017,7 +1017,7 @@
"#### Writing ModelPredictedLabelConfig\n",
"\n",
"A [ModelPredictedLabelConfig](https://sagemaker.readthedocs.io/en/stable/api/training/processing.html#sagemaker.clarify.ModelPredictedLabelConfig) provides information on the format of your predictions.\n",
- "* `probability` is used by SageMaker Clarify to locate the probability score in endpoint response if the accept type is JSONLines. In this case, the response payload for a single sample request looks like `'{\"predicted_label\": 0, \"score\": 0.026494730307781475}'`, so SageMaker Clarify can find the score `0.026494730307781475` by JSONPath `'score'`.\n",
+ "* `probability` is used by SageMaker Clarify to locate the probability score in endpoint response if the accept type is JSON Lines. In this case, the response payload for a single sample request looks like `'{\"predicted_label\": 0, \"score\": 0.026494730307781475}'`, so SageMaker Clarify can find the score `0.026494730307781475` by JSONPath `'score'`.\n",
"* `probability_threshold` is used by SageMaker Clarify to convert the probability to binary labels for bias analysis. Prediction above the threshold is interpreted as label value 1 and below or equal as label value 0."
]
},
diff --git a/sagemaker_processing/fairness_and_explainability/fairness_and_explainability_jsonlines_format.ipynb b/sagemaker_processing/fairness_and_explainability/fairness_and_explainability_jsonlines_format.ipynb
index 4a7412b295..90b08ec30d 100644
--- a/sagemaker_processing/fairness_and_explainability/fairness_and_explainability_jsonlines_format.ipynb
+++ b/sagemaker_processing/fairness_and_explainability/fairness_and_explainability_jsonlines_format.ipynb
@@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "# Fairness and Explainability with SageMaker Clarify - JSONLines Format"
+ "# Fairness and Explainability with SageMaker Clarify - JSON Lines Format"
]
},
{
@@ -44,7 +44,7 @@
"1. Explaining the importance of the various input features on the model's decision\n",
"1. Accessing the reports through SageMaker Studio if you have an instance set up.\n",
"\n",
- "In doing so, the notebook will first train a [SageMaker Linear Learner](https://docs.aws.amazon.com/sagemaker/latest/dg/linear-learner.html) model using training dataset, then use SageMaker Clarify to analyze a testing dataset in [SageMaker JSONLines dense format](https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-inference.html#common-in-formats). SageMaker Clarify also supports analyzing CSV dataset, which is illustrated in [another notebook](https://github.com/aws/amazon-sagemaker-examples/blob/master/sagemaker_processing/fairness_and_explainability/fairness_and_explainability.ipynb)."
+ "In doing so, the notebook will first train a [SageMaker Linear Learner](https://docs.aws.amazon.com/sagemaker/latest/dg/linear-learner.html) model using the training dataset, then use SageMaker Clarify to analyze a testing dataset in [SageMaker JSON Lines dense format](https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-inference.html#common-in-formats). SageMaker Clarify also supports analyzing CSV datasets, which is illustrated in [another notebook](https://github.com/aws/amazon-sagemaker-examples/blob/master/sagemaker_processing/fairness_and_explainability/fairness_and_explainability.ipynb)."
]
},
{
@@ -247,7 +247,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Then save the testing dataset to a JSONLines file. The file conforms to [SageMaker JSONLines dense format](https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-inference.html#common-in-formats), with an additional field to hold the ground truth label."
+ "Then save the testing dataset to a JSON Lines file. The file conforms to [SageMaker JSON Lines dense format](https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-inference.html#common-in-formats), with an additional field to hold the ground truth label."
]
},
{
@@ -392,14 +392,14 @@
"#### Writing DataConfig and ModelConfig\n",
"A `DataConfig` object communicates some basic information about data I/O to SageMaker Clarify. We specify where to find the input dataset, where to store the output, the target column (`label`), the header names, and the dataset type.\n",
"\n",
- "Some special things to note about this configuration for the JSONLines dataset,\n",
+ "Some special things to note about this configuration for the JSON Lines dataset,\n",
"* Argument `features` or `label` is **NOT** header string. Instead, it is a [JSONPath string](https://jmespath.org/specification.html) to locate the features list or label in the dataset. For example, for a sample like below, `features` should be 'data.features.values', and `label` should be 'data.label'. \n",
"\n",
"```\n",
"{\"data\": {\"features\": {\"values\": [25, 2, 226802, 1, 7, 4, 6, 3, 2, 1, 0, 0, 40, 37]}, \"label\": 0}}\n",
"```\n",
"\n",
- "* SageMaker Clarify will load the JSONLines dataset into tabular representation for further analysis, and argument `headers` is the list of column names. The label header shall be the last one in the headers list, and the order of feature headers shall be the same as the order of features in a sample."
+ "* SageMaker Clarify will load the JSON Lines dataset into tabular representation for further analysis, and argument `headers` is the list of column names. The label header shall be the last one in the headers list, and the order of feature headers shall be the same as the order of features in a sample."
]
},
{
@@ -426,7 +426,7 @@
"A `ModelConfig` object communicates information about your trained model. To avoid additional traffic to your production models, SageMaker Clarify sets up and tears down a dedicated endpoint when processing.\n",
"* `instance_type` and `instance_count` specify your preferred instance type and instance count used to run your model on during SageMaker Clarify's processing. The testing dataset is small so a single standard instance is good enough to run this example. If your have a large complex dataset, you may want to use a better instance type to speed up, or add more instances to enable Spark parallelization.\n",
"* `accept_type` denotes the endpoint response payload format, and `content_type` denotes the payload format of request to the endpoint.\n",
- "* `content_template` is used by SageMaker Clarify to compose the request payload if the content type is JSONLines. To be more specific, the placeholder `$features` will be replaced by the features list from samples. The request payload of a sample from the testing dataset happens to be similar to the sample itself, like `'{\"features\": [25, 2, 226802, 1, 7, 4, 6, 3, 2, 1, 0, 0, 40, 37]}'`, because both the dataset and the model input conform to [SageMaker JSONLines dense format](https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-inference.html#common-in-formats)."
+ "* `content_template` is used by SageMaker Clarify to compose the request payload if the content type is JSON Lines. To be more specific, the placeholder `$features` will be replaced by the features list from samples. The request payload of a sample from the testing dataset happens to be similar to the sample itself, like `'{\"features\": [25, 2, 226802, 1, 7, 4, 6, 3, 2, 1, 0, 0, 40, 37]}'`, because both the dataset and the model input conform to [SageMaker JSON Lines dense format](https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-inference.html#common-in-formats)."
]
},
{
@@ -465,7 +465,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "If you are building your own model, then you may choose a different JSONLines format, as long as it has the key elements like label and features list, and request payload built using `content_template` is supported by the model (you can customize the template but the placeholder of features list must be `$features`). Also, `dataset_type`, `accept_type` and `content_type` don't have to be the same, for example, a use case may use CSV dataset and content type, but JSONLines accept type."
+ "If you are building your own model, then you may choose a different JSON Lines format, as long as it has the key elements like label and features list, and request payload built using `content_template` is supported by the model (you can customize the template but the placeholder of features list must be `$features`). Also, `dataset_type`, `accept_type` and `content_type` don't have to be the same, for example, a use case may use CSV dataset and content type, but JSON Lines accept type."
]
},
{
diff --git a/sagemaker_processing/fairness_and_explainability/fairness_and_explainability_outputs.ipynb b/sagemaker_processing/fairness_and_explainability/fairness_and_explainability_outputs.ipynb
index 289d3b8e5b..ca4cd45863 100644
--- a/sagemaker_processing/fairness_and_explainability/fairness_and_explainability_outputs.ipynb
+++ b/sagemaker_processing/fairness_and_explainability/fairness_and_explainability_outputs.ipynb
@@ -70,7 +70,7 @@
"1. Explaining the importance of the various input features on the model's decision\n",
"1. Accessing the reports through SageMaker Studio if you have an instance set up.\n",
"\n",
- "In doing so, the notebook first trains a [SageMaker XGBoost](https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html) model using training dataset, then use SageMaker Clarify to analyze a testing dataset in CSV format. SageMaker Clarify also supports analyzing dataset in [SageMaker JSONLines dense format](https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-inference.html#common-in-formats), which is illustrated in [another notebook](https://github.com/aws/amazon-sagemaker-examples/blob/master/sagemaker_processing/fairness_and_explainability/fairness_and_explainability_jsonlines_format.ipynb)."
+ "In doing so, the notebook first trains a [SageMaker XGBoost](https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html) model using the training dataset, then uses SageMaker Clarify to analyze a testing dataset in CSV format. SageMaker Clarify also supports analyzing datasets in [SageMaker JSON Lines dense format](https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-inference.html#common-in-formats), which is illustrated in [another notebook](https://github.com/aws/amazon-sagemaker-examples/blob/master/sagemaker_processing/fairness_and_explainability/fairness_and_explainability_jsonlines_format.ipynb)."
]
},
{
diff --git a/sagemaker_processing/scikit_learn_data_processing_and_model_evaluation/scikit_learn_data_processing_and_model_evaluation.ipynb b/sagemaker_processing/scikit_learn_data_processing_and_model_evaluation/scikit_learn_data_processing_and_model_evaluation.ipynb
index 212ea4bf70..d351b0cc54 100644
--- a/sagemaker_processing/scikit_learn_data_processing_and_model_evaluation/scikit_learn_data_processing_and_model_evaluation.ipynb
+++ b/sagemaker_processing/scikit_learn_data_processing_and_model_evaluation/scikit_learn_data_processing_and_model_evaluation.ipynb
@@ -461,14 +461,14 @@
"from sagemaker.processing import FrameworkProcessor\n",
"\n",
"est_cls = sagemaker.sklearn.estimator.SKLearn\n",
- "framework_version_str=\"0.20.0\"\n",
+ "framework_version_str = \"0.20.0\"\n",
"\n",
"script_processor = FrameworkProcessor(\n",
" role=role,\n",
" instance_count=1,\n",
" instance_type=\"ml.m5.xlarge\",\n",
" estimator_cls=est_cls,\n",
- " framework_version=framework_version_str\n",
+ " framework_version=framework_version_str,\n",
")"
]
},
@@ -542,8 +542,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "By default optional steps do not run automatically, set `run_optional_steps` to True if you want to \n",
- "execute optional steps"
+ "By default, optional steps run automatically; set `run_optional_steps` to False if you don't want to execute them."
]
},
{
@@ -552,7 +551,7 @@
"metadata": {},
"outputs": [],
"source": [
- "run_optional_steps = False"
+ "run_optional_steps = True"
]
},
{
@@ -697,9 +696,9 @@
"metadata": {
"instance_type": "ml.t3.medium",
"kernelspec": {
- "display_name": "Python 3 (Data Science)",
+ "display_name": "conda_python3",
"language": "python",
- "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-1:081325390199:image/datascience-1.0"
+ "name": "conda_python3"
},
"language_info": {
"codemirror_mode": {
@@ -711,7 +710,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.7.10"
+ "version": "3.6.13"
}
},
"nbformat": 4,