
Commit

Merge pull request #1 from awslabs/master
merge from master branch
Aloha106 authored Sep 18, 2018
2 parents 25ce707 + e8aa973 commit 8631fa2
Showing 20 changed files with 1,794 additions and 103 deletions.
@@ -50,7 +50,6 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"isConfigCell": true
},
"outputs": [],
@@ -74,7 +73,7 @@
"bucket='<s3-bucket>' # put your s3 bucket name here, and create s3 bucket\n",
"prefix = 'sagemaker/DEMO-kms'\n",
"# customize to your bucket where you have stored the data\n",
"bucket_path = 'https://s3-{}.amazonaws.com/{}'.format(region,bucket)"
"bucket_path = 's3://{}'.format(bucket)"
]
},
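
The configuration cell collapsed above presumably defines the region, the notebook's execution role, and the KMS key used for the uploads further down; a minimal sketch under that assumption (the names region, role, and kms_key_id are illustrative, not part of the commit):

    # Sketch of the collapsed setup cell; region, role, kms_key_id are assumed names.
    import boto3
    import sagemaker

    region = boto3.Session().region_name
    role = sagemaker.get_execution_role()   # IAM role assumed by the notebook instance
    kms_key_id = '<your-kms-key-id>'        # KMS key used for SSE-KMS on the S3 uploads below
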
{
@@ -93,9 +92,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"from sklearn.datasets import load_boston\n",
@@ -116,15 +113,13 @@
"source": [
"### Data preprocessing\n",
"\n",
"Now that we have the dataset, we need to split it into *train*, *validation*, and *test* datasets which we can use to evaluate the accuracy of the machine learning algorithm. We randomly split the dataset into 60% training, 20% validation and 20% test. Note that SageMaker Xgboost, expects the label column to be the first one in the datasets. So, we'll move the median value column (`MEDV`) from the last to the first position within the `write_file` method below. "
"Now that we have the dataset, we need to split it into *train*, *validation*, and *test* datasets which we can use to evaluate the accuracy of the machine learning algorithm. We'll also create a test dataset file with the labels removed so it can be fed into a batch transform job. We randomly split the dataset into 60% training, 20% validation and 20% test. Note that SageMaker Xgboost, expects the label column to be the first one in the datasets. So, we'll move the median value column (`MEDV`) from the last to the first position within the `write_file` method below. "
]
},
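
The body of the split cell below is collapsed in this view; a minimal sketch of the 60%/20%/20% split described above (the names X, y and the fixed random_state are assumptions, not the committed code):

    # Illustrative two-step split: 60% train, then the remaining 40% halved into validation and test.
    from sklearn.model_selection import train_test_split

    X_train, X_rest, y_train, y_rest = train_test_split(X, y, test_size=0.4, random_state=1)
    X_val, X_test, y_val, y_test = train_test_split(X_rest, y_rest, test_size=0.5, random_state=1)
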
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
@@ -135,37 +130,31 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"def write_file(X, y, fname):\n",
"def write_file(X, y, fname, include_labels=True):\n",
" feature_names = boston['feature_names']\n",
" data = pd.DataFrame(X, columns=feature_names)\n",
" target = pd.DataFrame(y, columns={'MEDV'})\n",
" data['MEDV'] = y\n",
" # bring this column to the front before writing the files\n",
" cols = data.columns.tolist()\n",
" cols = cols[-1:] + cols[:-1]\n",
" data = data[cols]\n",
" if include_labels:\n",
" data.insert(0, 'MEDV', y)\n",
" data.to_csv(fname, header=False, index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"train_file = 'train.csv'\n",
"validation_file = 'val.csv'\n",
"test_file = 'test.csv'\n",
"test_no_labels_file = 'test_no_labels.csv'\n",
"write_file(X_train, y_train, train_file)\n",
"write_file(X_val, y_val, validation_file)\n",
"write_file(X_test, y_test, test_file)"
"write_file(X_test, y_test, test_file)\n",
"write_file(X_test, y_test, test_no_labels_file, False)"
]
},
{
@@ -178,9 +167,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"s3 = boto3.client('s3')\n",
@@ -207,7 +194,19 @@
" ServerSideEncryption='aws:kms',\n",
" SSEKMSKeyId=kms_key_id)\n",
"\n",
"print(\"Done uploading the validation dataset\")"
"print(\"Done uploading the validation dataset\")\n",
"\n",
"data_test = open(test_no_labels_file, 'rb')\n",
"key_test = '{}/test/{}'.format(prefix,test_no_labels_file)\n",
"\n",
"print(\"Put object...\")\n",
"s3.put_object(Bucket=bucket,\n",
" Key=key_test,\n",
" Body=data_test,\n",
" ServerSideEncryption='aws:kms',\n",
" SSEKMSKeyId=kms_key_id)\n",
"\n",
"print(\"Done uploading the test dataset\")"
]
},
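
To confirm that an uploaded object really carries the intended SSE-KMS settings, a quick check along these lines can be run (a suggestion, not part of the committed notebook):

    # head_object reports the server-side encryption algorithm and the KMS key used for the object.
    resp = s3.head_object(Bucket=bucket, Key=key_test)
    print(resp.get('ServerSideEncryption'), resp.get('SSEKMSKeyId'))
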
{
@@ -222,9 +221,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"from sagemaker.amazon.amazon_estimator import get_image_uri\n",
@@ -234,9 +231,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
@@ -334,9 +329,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
@@ -375,9 +368,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"from time import gmtime, strftime\n",
@@ -401,15 +392,13 @@
"metadata": {},
"source": [
"### Create endpoint\n",
"Lastly, create the endpoint that serves up the model, through specifying the name and configuration defined above. The end result is an endpoint that can be validated and incorporated into production applications. This takes 9-11 minutes to complete."
"Create the endpoint that serves up the model, through specifying the name and configuration defined above. The end result is an endpoint that can be validated and incorporated into production applications. This takes 9-11 minutes to complete."
]
},
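
The creation call itself is collapsed in the cell below; with the boto3 SageMaker client it typically looks like the following sketch (the endpoint_config_name variable and the waiter usage are assumptions, not the committed code):

    # Illustrative create_endpoint call, then block until the endpoint is in service.
    client.create_endpoint(EndpointName=endpoint_name,
                           EndpointConfigName=endpoint_config_name)
    client.get_waiter('endpoint_in_service').wait(EndpointName=endpoint_name)
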
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
@@ -449,15 +438,13 @@
"metadata": {},
"source": [
"## Validate the model for use\n",
"Finally, you can now validate the model for use. They can obtain the endpoint from the client library using the result from previous operations, and generate classifications from the trained model using that endpoint.\n"
"You can now validate the model for use. Obtain the endpoint from the client library using the result from previous operations, and run a single prediction on the trained model using that endpoint.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"runtime_client = boto3.client('runtime.sagemaker')"
@@ -466,87 +453,125 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import math\n",
"def do_predict(data, endpoint_name, content_type):\n",
" payload = ''.join(data)\n",
" response = runtime_client.invoke_endpoint(EndpointName=endpoint_name, \n",
" ContentType=content_type, \n",
" Body=payload)\n",
" Body=data)\n",
" result = response['Body'].read()\n",
" result = result.decode(\"utf-8\")\n",
" result = result.split(',')\n",
" return result\n",
"\n",
"def batch_predict(data, batch_size, endpoint_name, content_type):\n",
" items = len(data)\n",
" arrs = []\n",
" \n",
" for offset in range(0, items, batch_size):\n",
" if offset+batch_size < items:\n",
" results = do_predict(data[offset:(offset+batch_size)], endpoint_name, content_type)\n",
" arrs.extend(results)\n",
" else:\n",
" arrs.extend(do_predict(data[offset:items], endpoint_name, content_type))\n",
" sys.stdout.write('.')\n",
" return(arrs)"
"# pull the first item from the test dataset\n",
"with open('test.csv') as f:\n",
" first_line = f.readline()\n",
" features = first_line.split(',')[1:]\n",
" feature_str = ','.join(features)\n",
"\n",
"prediction = do_predict(feature_str, endpoint_name, 'text/csv')\n",
"print('Prediction: ' + prediction)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The following helps us calculate the Median Absolute Percent Error (MdAPE) on the batch dataset. Note that the intent of this example is not to produce the most accurate regressor but to demonstrate how to handle KMS encrypted data with SageMaker. "
"### (Optional) Delete the Endpoint\n",
"\n",
"If you're ready to be done with this notebook, please run the delete_endpoint line in the cell below. This will remove the hosted endpoint you created and avoid any charges from a stray instance being left on."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"client.delete_endpoint(EndpointName=endpoint_name)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Run batch prediction using batch transform\n",
"Create a transform job to do batch prediction using the trained model. Similar to the training section above, the execution role assumed by this notebook must have permissions to encrypt and decrypt data with the KMS key (`kms_key_id`) used for S3 server-side encryption."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"import json\n",
"import numpy as np\n",
"\n",
"transform_job_name = 'DEMO-xgboost-batch-prediction' + strftime(\"%Y-%m-%d-%H-%M-%S\", gmtime())\n",
"print(\"Transform job\", transform_job_name)\n",
"\n",
"with open('test.csv') as f:\n",
" lines = f.readlines()\n",
"\n",
"#remove the labels\n",
"labels = [line.split(',')[0] for line in lines]\n",
"features = [line.split(',')[1:] for line in lines]\n",
"transform_params = \\\n",
"{\n",
" \"TransformJobName\": transform_job_name,\n",
" \"ModelName\": model_name,\n",
" \"TransformInput\": {\n",
" \"ContentType\": \"text/csv\",\n",
" \"DataSource\": {\n",
" \"S3DataSource\": {\n",
" \"S3DataType\": \"S3Prefix\",\n",
" \"S3Uri\": bucket_path + \"/\"+ prefix + '/test'\n",
" }\n",
" },\n",
" \"SplitType\": \"Line\"\n",
" },\n",
" \"TransformOutput\": {\n",
" \"AssembleWith\": \"Line\",\n",
" \"S3OutputPath\": bucket_path + \"/\"+ prefix + '/predict'\n",
" },\n",
" \"TransformResources\": {\n",
" \"InstanceCount\": 1,\n",
" \"InstanceType\": \"ml.c4.xlarge\"\n",
" }\n",
"}\n",
"\n",
"features_str = [','.join(row) for row in features]\n",
"preds = batch_predict(features_str, 100, endpoint_name, 'text/csv')\n",
"print('\\n Median Absolute Percent Error (MdAPE) = ', np.median(np.abs(np.asarray(labels, dtype=float) - np.asarray(preds, dtype=float)) / np.asarray(labels, dtype=float)))"
"client.create_transform_job(**transform_params)\n",
"\n",
"while True:\n",
" response = client.describe_transform_job(TransformJobName=transform_job_name)\n",
" status = response['TransformJobStatus']\n",
" if status == 'InProgress':\n",
" time.sleep(15)\n",
" elif status == 'Completed':\n",
" print(\"Transform job completed!\")\n",
" break\n",
" else:\n",
" print(\"Unexpected transform job status: \" + status)"
]
},
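
As an alternative to the polling loop above, boto3 also ships a built-in waiter for transform jobs; an equivalent wait (an option, not what the notebook uses) would be:

    # Blocks until the transform job completes or is stopped, polling on the caller's behalf.
    client.get_waiter('transform_job_completed_or_stopped').wait(
        TransformJobName=transform_job_name)
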
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### (Optional) Delete the Endpoint\n",
"### Evaluate the batch predictions\n",
"\n",
"If you're ready to be done with this notebook, please run the delete_endpoint line in the cell below. This will remove the hosted endpoint you created and avoid any charges from a stray instance being left on."
"The following helps us calculate the Median Absolute Percent Error (MdAPE) on the batch prediction output in S3. Note that the intent of this example is not to produce the most accurate regressor but to demonstrate how to handle KMS encrypted data with SageMaker."
]
},
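
MdAPE here is the median of the element-wise absolute percent errors, median(|y_true - y_pred| / y_true); a tiny worked example with made-up numbers (not notebook output):

    import numpy as np
    y_true = np.array([10.0, 20.0, 40.0])
    y_pred = np.array([11.0, 18.0, 40.0])
    print(np.median(np.abs(y_true - y_pred) / y_true))  # percent errors 0.1, 0.1, 0.0 -> MdAPE 0.1
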
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"client.delete_endpoint(EndpointName=endpoint_name)"
"print(\"Downloading prediction object...\")\n",
"s3.download_file(Bucket=bucket,\n",
" Key=prefix + '/predict/' + test_no_labels_file + '.out',\n",
" Filename='./predictions.csv')\n",
"\n",
"preds = np.loadtxt('predictions.csv')\n",
"print('\\nMedian Absolute Percent Error (MdAPE) = ', np.median(np.abs(y_test - preds) / y_test))"
]
}
],
2 changes: 1 addition & 1 deletion hyperparameter_tuning/keras_bring_your_own/main.py
@@ -80,7 +80,7 @@ def upload_training_data():

tensorflow_version_tag = get_tensorflow_version_tag(args.tf_version, args.instance_type)

image_name = get_image_name(args.ecr_repository, args.tensorflow_version_tag)
image_name = get_image_name(args.ecr_repository, tensorflow_version_tag)

build_image(image_name, tensorflow_version_tag)

@@ -176,6 +176,7 @@
"source": [
"estimator = TensorFlow(entry_point='mnist.py',\n",
" role=role,\n",
" framework_version='1.10.0',\n",
" training_steps=1000, \n",
" evaluation_steps=100,\n",
" train_instance_count=1,\n",