
Commit

Merge pull request #1 from awslabs/master
merge from master branch
Aloha106 authored Sep 18, 2018
2 parents 25ce707 + e8aa973 commit 8631fa2
Showing 20 changed files with 1,794 additions and 103 deletions.
@@ -50,7 +50,6 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"isConfigCell": true
},
"outputs": [],
@@ -74,7 +73,7 @@
"bucket='<s3-bucket>' # put your s3 bucket name here, and create s3 bucket\n",
"prefix = 'sagemaker/DEMO-kms'\n",
"# customize to your bucket where you have stored the data\n",
"bucket_path = 'https://s3-{}.amazonaws.com/{}'.format(region,bucket)"
"bucket_path = 's3://{}'.format(bucket)"
]
},
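
The configuration cell collapsed above presumably defines the region, the notebook's execution role, and the KMS key used for the uploads further down; a minimal sketch under that assumption (the names region, role, and kms_key_id are illustrative, not part of the commit):

    # Sketch of the collapsed setup cell; region, role, kms_key_id are assumed names.
    import boto3
    import sagemaker

    region = boto3.Session().region_name
    role = sagemaker.get_execution_role()   # IAM role assumed by the notebook instance
    kms_key_id = '<your-kms-key-id>'        # KMS key used for SSE-KMS on the S3 uploads below
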
{
@@ -93,9 +92,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"from sklearn.datasets import load_boston\n",
@@ -116,15 +113,13 @@
"source": [
"### Data preprocessing\n",
"\n",
"Now that we have the dataset, we need to split it into *train*, *validation*, and *test* datasets which we can use to evaluate the accuracy of the machine learning algorithm. We randomly split the dataset into 60% training, 20% validation and 20% test. Note that SageMaker Xgboost, expects the label column to be the first one in the datasets. So, we'll move the median value column (`MEDV`) from the last to the first position within the `write_file` method below. "
"Now that we have the dataset, we need to split it into *train*, *validation*, and *test* datasets which we can use to evaluate the accuracy of the machine learning algorithm. We'll also create a test dataset file with the labels removed so it can be fed into a batch transform job. We randomly split the dataset into 60% training, 20% validation and 20% test. Note that SageMaker Xgboost, expects the label column to be the first one in the datasets. So, we'll move the median value column (`MEDV`) from the last to the first position within the `write_file` method below. "
]
},
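
The body of the split cell below is collapsed in this view; a minimal sketch of the 60%/20%/20% split described above (the names X, y and the fixed random_state are assumptions, not the committed code):

    # Illustrative two-step split: 60% train, then the remaining 40% halved into validation and test.
    from sklearn.model_selection import train_test_split

    X_train, X_rest, y_train, y_rest = train_test_split(X, y, test_size=0.4, random_state=1)
    X_val, X_test, y_val, y_test = train_test_split(X_rest, y_rest, test_size=0.5, random_state=1)
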
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
@@ -135,37 +130,31 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"def write_file(X, y, fname):\n",
"def write_file(X, y, fname, include_labels=True):\n",
" feature_names = boston['feature_names']\n",
" data = pd.DataFrame(X, columns=feature_names)\n",
" target = pd.DataFrame(y, columns={'MEDV'})\n",
" data['MEDV'] = y\n",
" # bring this column to the front before writing the files\n",
" cols = data.columns.tolist()\n",
" cols = cols[-1:] + cols[:-1]\n",
" data = data[cols]\n",
" if include_labels:\n",
" data.insert(0, 'MEDV', y)\n",
" data.to_csv(fname, header=False, index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"train_file = 'train.csv'\n",
"validation_file = 'val.csv'\n",
"test_file = 'test.csv'\n",
"test_no_labels_file = 'test_no_labels.csv'\n",
"write_file(X_train, y_train, train_file)\n",
"write_file(X_val, y_val, validation_file)\n",
"write_file(X_test, y_test, test_file)"
"write_file(X_test, y_test, test_file)\n",
"write_file(X_test, y_test, test_no_labels_file, False)"
]
},
{
@@ -178,9 +167,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"s3 = boto3.client('s3')\n",
@@ -207,7 +194,19 @@
" ServerSideEncryption='aws:kms',\n",
" SSEKMSKeyId=kms_key_id)\n",
"\n",
"print(\"Done uploading the validation dataset\")"
"print(\"Done uploading the validation dataset\")\n",
"\n",
"data_test = open(test_no_labels_file, 'rb')\n",
"key_test = '{}/test/{}'.format(prefix,test_no_labels_file)\n",
"\n",
"print(\"Put object...\")\n",
"s3.put_object(Bucket=bucket,\n",
" Key=key_test,\n",
" Body=data_test,\n",
" ServerSideEncryption='aws:kms',\n",
" SSEKMSKeyId=kms_key_id)\n",
"\n",
"print(\"Done uploading the test dataset\")"
]
},
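
To confirm that an uploaded object really carries the intended SSE-KMS settings, a quick check along these lines can be run (a suggestion, not part of the committed notebook):

    # head_object reports the server-side encryption algorithm and the KMS key used for the object.
    resp = s3.head_object(Bucket=bucket, Key=key_test)
    print(resp.get('ServerSideEncryption'), resp.get('SSEKMSKeyId'))
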
{
@@ -222,9 +221,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"from sagemaker.amazon.amazon_estimator import get_image_uri\n",
@@ -234,9 +231,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
@@ -334,9 +329,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
@@ -375,9 +368,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"from time import gmtime, strftime\n",
@@ -401,15 +392,13 @@
"metadata": {},
"source": [
"### Create endpoint\n",
"Lastly, create the endpoint that serves up the model, through specifying the name and configuration defined above. The end result is an endpoint that can be validated and incorporated into production applications. This takes 9-11 minutes to complete."
"Create the endpoint that serves up the model, through specifying the name and configuration defined above. The end result is an endpoint that can be validated and incorporated into production applications. This takes 9-11 minutes to complete."
]
},
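
The creation call itself is collapsed in the cell below; with the boto3 SageMaker client it typically looks like the following sketch (the endpoint_config_name variable and the waiter usage are assumptions, not the committed code):

    # Illustrative create_endpoint call, then block until the endpoint is in service.
    client.create_endpoint(EndpointName=endpoint_name,
                           EndpointConfigName=endpoint_config_name)
    client.get_waiter('endpoint_in_service').wait(EndpointName=endpoint_name)
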
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
@@ -449,15 +438,13 @@
"metadata": {},
"source": [
"## Validate the model for use\n",
"Finally, you can now validate the model for use. They can obtain the endpoint from the client library using the result from previous operations, and generate classifications from the trained model using that endpoint.\n"
"You can now validate the model for use. Obtain the endpoint from the client library using the result from previous operations, and run a single prediction on the trained model using that endpoint.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"runtime_client = boto3.client('runtime.sagemaker')"
@@ -466,87 +453,125 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import math\n",
"def do_predict(data, endpoint_name, content_type):\n",
" payload = ''.join(data)\n",
" response = runtime_client.invoke_endpoint(EndpointName=endpoint_name, \n",
" ContentType=content_type, \n",
" Body=payload)\n",
" Body=data)\n",
" result = response['Body'].read()\n",
" result = result.decode(\"utf-8\")\n",
" result = result.split(',')\n",
" return result\n",
"\n",
"def batch_predict(data, batch_size, endpoint_name, content_type):\n",
" items = len(data)\n",
" arrs = []\n",
" \n",
" for offset in range(0, items, batch_size):\n",
" if offset+batch_size < items:\n",
" results = do_predict(data[offset:(offset+batch_size)], endpoint_name, content_type)\n",
" arrs.extend(results)\n",
" else:\n",
" arrs.extend(do_predict(data[offset:items], endpoint_name, content_type))\n",
" sys.stdout.write('.')\n",
" return(arrs)"
"# pull the first item from the test dataset\n",
"with open('test.csv') as f:\n",
" first_line = f.readline()\n",
" features = first_line.split(',')[1:]\n",
" feature_str = ','.join(features)\n",
"\n",
"prediction = do_predict(feature_str, endpoint_name, 'text/csv')\n",
"print('Prediction: ' + prediction)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The following helps us calculate the Median Absolute Percent Error (MdAPE) on the batch dataset. Note that the intent of this example is not to produce the most accurate regressor but to demonstrate how to handle KMS encrypted data with SageMaker. "
"### (Optional) Delete the Endpoint\n",
"\n",
"If you're ready to be done with this notebook, please run the delete_endpoint line in the cell below. This will remove the hosted endpoint you created and avoid any charges from a stray instance being left on."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"client.delete_endpoint(EndpointName=endpoint_name)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Run batch prediction using batch transform\n",
"Create a transform job to do batch prediction using the trained model. Similar to the training section above, the execution role assumed by this notebook must have permissions to encrypt and decrypt data with the KMS key (`kms_key_id`) used for S3 server-side encryption."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"import json\n",
"import numpy as np\n",
"\n",
"transform_job_name = 'DEMO-xgboost-batch-prediction' + strftime(\"%Y-%m-%d-%H-%M-%S\", gmtime())\n",
"print(\"Transform job\", transform_job_name)\n",
"\n",
"with open('test.csv') as f:\n",
" lines = f.readlines()\n",
"\n",
"#remove the labels\n",
"labels = [line.split(',')[0] for line in lines]\n",
"features = [line.split(',')[1:] for line in lines]\n",
"transform_params = \\\n",
"{\n",
" \"TransformJobName\": transform_job_name,\n",
" \"ModelName\": model_name,\n",
" \"TransformInput\": {\n",
" \"ContentType\": \"text/csv\",\n",
" \"DataSource\": {\n",
" \"S3DataSource\": {\n",
" \"S3DataType\": \"S3Prefix\",\n",
" \"S3Uri\": bucket_path + \"/\"+ prefix + '/test'\n",
" }\n",
" },\n",
" \"SplitType\": \"Line\"\n",
" },\n",
" \"TransformOutput\": {\n",
" \"AssembleWith\": \"Line\",\n",
" \"S3OutputPath\": bucket_path + \"/\"+ prefix + '/predict'\n",
" },\n",
" \"TransformResources\": {\n",
" \"InstanceCount\": 1,\n",
" \"InstanceType\": \"ml.c4.xlarge\"\n",
" }\n",
"}\n",
"\n",
"features_str = [','.join(row) for row in features]\n",
"preds = batch_predict(features_str, 100, endpoint_name, 'text/csv')\n",
"print('\\n Median Absolute Percent Error (MdAPE) = ', np.median(np.abs(np.asarray(labels, dtype=float) - np.asarray(preds, dtype=float)) / np.asarray(labels, dtype=float)))"
"client.create_transform_job(**transform_params)\n",
"\n",
"while True:\n",
" response = client.describe_transform_job(TransformJobName=transform_job_name)\n",
" status = response['TransformJobStatus']\n",
" if status == 'InProgress':\n",
" time.sleep(15)\n",
" elif status == 'Completed':\n",
" print(\"Transform job completed!\")\n",
" break\n",
" else:\n",
" print(\"Unexpected transform job status: \" + status)"
]
},
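
As an alternative to the polling loop above, boto3 also ships a built-in waiter for transform jobs; an equivalent wait (an option, not what the notebook uses) would be:

    # Blocks until the transform job completes or is stopped, polling on the caller's behalf.
    client.get_waiter('transform_job_completed_or_stopped').wait(
        TransformJobName=transform_job_name)
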
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### (Optional) Delete the Endpoint\n",
"### Evaluate the batch predictions\n",
"\n",
"If you're ready to be done with this notebook, please run the delete_endpoint line in the cell below. This will remove the hosted endpoint you created and avoid any charges from a stray instance being left on."
"The following helps us calculate the Median Absolute Percent Error (MdAPE) on the batch prediction output in S3. Note that the intent of this example is not to produce the most accurate regressor but to demonstrate how to handle KMS encrypted data with SageMaker."
]
},
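
MdAPE here is the median of the element-wise absolute percent errors, median(|y_true - y_pred| / y_true); a tiny worked example with made-up numbers (not notebook output):

    import numpy as np
    y_true = np.array([10.0, 20.0, 40.0])
    y_pred = np.array([11.0, 18.0, 40.0])
    print(np.median(np.abs(y_true - y_pred) / y_true))  # percent errors 0.1, 0.1, 0.0 -> MdAPE 0.1
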
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"client.delete_endpoint(EndpointName=endpoint_name)"
"print(\"Downloading prediction object...\")\n",
"s3.download_file(Bucket=bucket,\n",
" Key=prefix + '/predict/' + test_no_labels_file + '.out',\n",
" Filename='./predictions.csv')\n",
"\n",
"preds = np.loadtxt('predictions.csv')\n",
"print('\\nMedian Absolute Percent Error (MdAPE) = ', np.median(np.abs(y_test - preds) / y_test))"
]
}
],
2 changes: 1 addition & 1 deletion hyperparameter_tuning/keras_bring_your_own/main.py
@@ -80,7 +80,7 @@ def upload_training_data():

tensorflow_version_tag = get_tensorflow_version_tag(args.tf_version, args.instance_type)

image_name = get_image_name(args.ecr_repository, args.tensorflow_version_tag)
image_name = get_image_name(args.ecr_repository, tensorflow_version_tag)

build_image(image_name, tensorflow_version_tag)

@@ -176,6 +176,7 @@
"source": [
"estimator = TensorFlow(entry_point='mnist.py',\n",
" role=role,\n",
" framework_version='1.10.0',\n",
" training_steps=1000, \n",
" evaluation_steps=100,\n",
" train_instance_count=1,\n",