diff --git a/use-cases/computer_vision/2-metastases-detection-lineage-registry.ipynb b/use-cases/computer_vision/2-metastases-detection-lineage-registry.ipynb deleted file mode 100644 index 1da3ebd915..0000000000 --- a/use-cases/computer_vision/2-metastases-detection-lineage-registry.ipynb +++ /dev/null @@ -1,367 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Computer Vision for Medical Imaging: Part 2. Model Lineage and Model Registry\n", - "This notebook is part 2 of a 4-part series of techniques and services offer by SageMaker to build a model which predicts if an image of cells contains cancer. This notebook gives an overview of how to track model lineage, how to create a model registry, and how to store models into the registry." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Dataset\n", - "The dataset for this demo comes from the [Camelyon16 Challenge](https://camelyon16.grand-challenge.org/) made available under the CC0 licencse. The raw data provided by the challenge has been processed into 96x96 pixel tiles by [Bas Veeling](https://github.com/basveeling/pcam) and also made available under the CC0 license. For detailed information on each dataset please see the papers below:\n", - "* Ehteshami Bejnordi et al. Diagnostic Assessment of Deep Learning Algorithms for Detection of Lymph Node Metastases in Women With Breast Cancer. JAMA: The Journal of the American Medical Association, 318(22), 2199–2210. [doi:jama.2017.14585](https://doi.org/10.1001/jama.2017.14585)\n", - "* B. S. Veeling, J. Linmans, J. Winkens, T. Cohen, M. Welling. \"Rotation Equivariant CNNs for Digital Pathology\". [arXiv:1806.03962](http://arxiv.org/abs/1806.03962)\n", - "\n", - "The tiled dataset from Bas Veeling is over 6GB of data. In order to easily run this demo, the dataset has been pruned to the first 14,000 images of the tiled dataset and comes included in the repo with this notebook for convenience." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Update Sagemaker SDK and Boto3\n", - "\n", - "
\n", - "NOTE You may get an error from pip's dependency resolver; you can ignore this error.\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%store -r\n", - "%store" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Import Libraries" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import boto3\n", - "import sagemaker\n", - "import numpy as np\n", - "import cv2\n", - "\n", - "from inference_specification import InferenceSpecification" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Configure Boto3 Clients and Sessions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "region = \"us-west-2\" # Change region as needed\n", - "boto3.setup_default_session(region_name=region)\n", - "boto_session = boto3.Session(region_name=region)\n", - "\n", - "s3_client = boto3.client(\"s3\", region_name=region)\n", - "\n", - "sagemaker_boto_client = boto_session.client(\"sagemaker\")\n", - "sagemaker_session = sagemaker.session.Session(\n", - " boto_session=boto_session, sagemaker_client=sagemaker_boto_client\n", - ")\n", - "sagemaker_role = sagemaker.get_execution_role()\n", - "\n", - "bucket = sagemaker.Session().default_bucket()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Examine Lineage\n", - "Though you already know the training job details from the previous notebook, if we were just given the model uri, we could use SageMaker Lineage to retrieve the training job details which produced the model." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Data Lineage and Metrics for Best Model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sagemaker.lineage import context, artifact, association, action" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Training data artifact" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "results = sagemaker.analytics.HyperparameterTuningJobAnalytics(tuning_job_name)\n", - "results_df = results.dataframe()\n", - "best_training_job_summary = results.description()[\"BestTrainingJob\"]\n", - "best_training_job_details = sagemaker_boto_client.describe_training_job(\n", - " TrainingJobName=best_training_job_name\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data_artifact_list = []\n", - "for data_input in best_training_job_details[\"InputDataConfig\"]:\n", - " channel = data_input[\"ChannelName\"]\n", - " data_s3_uri = data_input[\"DataSource\"][\"S3DataSource\"][\"S3Uri\"]\n", - "\n", - " matching_artifacts = list(\n", - " artifact.Artifact.list(source_uri=data_s3_uri, sagemaker_session=sagemaker_session)\n", - " )\n", - "\n", - " if matching_artifacts:\n", - " data_artifact = matching_artifacts[0]\n", - " print(f\"Using existing artifact: {data_artifact.artifact_arn}\")\n", - " else:\n", - " data_artifact = artifact.Artifact.create(\n", - " artifact_name=channel,\n", - " source_uri=data_s3_uri,\n", - " artifact_type=\"DataSet\",\n", - " sagemaker_session=sagemaker_session,\n", - " )\n", - " print(f\"Create artifact {data_artifact.artifact_arn}: SUCCESSFUL\")\n", - " data_artifact_list.append(data_artifact)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Model artifact" - ] - }, - { - "cell_type": "code", - "execution_count": null, 
- "metadata": {}, - "outputs": [], - "source": [ - "trained_model_s3_uri = best_training_job_details[\"ModelArtifacts\"][\"S3ModelArtifacts\"]\n", - "\n", - "matching_artifacts = list(\n", - " artifact.Artifact.list(source_uri=trained_model_s3_uri, sagemaker_session=sagemaker_session)\n", - ")\n", - "\n", - "if matching_artifacts:\n", - " model_artifact = matching_artifacts[0]\n", - " print(f\"Using existing artifact: {model_artifact.artifact_arn}\")\n", - "else:\n", - " model_artifact = artifact.Artifact.create(\n", - " artifact_name=\"TrainedModel\",\n", - " source_uri=trained_model_s3_uri,\n", - " artifact_type=\"Model\",\n", - " sagemaker_session=sagemaker_session,\n", - " )\n", - " print(f\"Create artifact {model_artifact.artifact_arn}: SUCCESSFUL\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Set artifact associations" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "trial_component = sagemaker_boto_client.describe_trial_component(\n", - " TrialComponentName=best_training_job_summary[\"TrainingJobName\"] + \"-aws-training-job\"\n", - ")\n", - "trial_component_arn = trial_component[\"TrialComponentArn\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Store artifacts" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "artifact_list = data_artifact_list + [model_artifact]\n", - "\n", - "for artif in artifact_list:\n", - " if artif.artifact_type == \"DataSet\":\n", - " assoc = \"ContributedTo\"\n", - " else:\n", - " assoc = \"Produced\"\n", - " try:\n", - " association.Association.create(\n", - " source_arn=artif.artifact_arn,\n", - " destination_arn=trial_component_arn,\n", - " association_type=assoc,\n", - " sagemaker_session=sagemaker_session,\n", - " )\n", - " print(f\"Association with {artif.artifact_type}: SUCCESSFUL\")\n", - " except:\n", - " 
print(f\"Association already exists with {artif.artifact_type}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Model Registry" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mpg_name = prefix\n", - "\n", - "model_packages = sagemaker_boto_client.list_model_packages(ModelPackageGroupName=mpg_name)[\n", - " \"ModelPackageSummaryList\"\n", - "]\n", - "\n", - "if model_packages:\n", - " print(f\"Using existing Model Package Group: {mpg_name}\")\n", - "else:\n", - " mpg_input_dict = {\n", - " \"ModelPackageGroupName\": mpg_name,\n", - " \"ModelPackageGroupDescription\": \"Cancer metastasis detection\",\n", - " }\n", - "\n", - " mpg_response = sagemaker_boto_client.create_model_package_group(**mpg_input_dict)\n", - " print(f\"Create Model Package Group {mpg_name}: SUCCESSFUL\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%store mpg_name" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "training_jobs = results_df[\"TrainingJobName\"]\n", - "\n", - "for job_name in training_jobs:\n", - " job_data = sagemaker_boto_client.describe_training_job(TrainingJobName=job_name)\n", - " model_uri = job_data.get(\"ModelArtifacts\", {}).get(\"S3ModelArtifacts\")\n", - " training_image = job_data[\"AlgorithmSpecification\"][\"TrainingImage\"]\n", - "\n", - " mp_inference_spec = InferenceSpecification().get_inference_specification_dict(\n", - " ecr_image=training_image,\n", - " supports_gpu=False,\n", - " supported_content_types=[\"text/csv\"],\n", - " supported_mime_types=[\"text/csv\"],\n", - " )\n", - "\n", - " mp_inference_spec[\"InferenceSpecification\"][\"Containers\"][0][\"ModelDataUrl\"] = model_uri\n", - " mp_input_dict = {\n", - " \"ModelPackageGroupName\": mpg_name,\n", - " \"ModelPackageDescription\": \"SageMaker Image 
Classifier\",\n", - " \"ModelApprovalStatus\": \"PendingManualApproval\",\n", - " }\n", - "\n", - " mp_input_dict.update(mp_inference_spec)\n", - " mp_response = sagemaker_boto_client.create_model_package(**mp_input_dict)\n", - "\n", - "model_packages = sagemaker_boto_client.list_model_packages(\n", - " ModelPackageGroupName=mpg_name, MaxResults=6\n", - ")[\"ModelPackageSummaryList\"]\n", - "model_packages" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%store model_packages" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "instance_type": "ml.t3.medium", - "kernelspec": { - "display_name": "conda_mxnet_p36", - "language": "python", - "name": "conda_mxnet_p36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.13" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/use-cases/computer_vision/3-metastases-detection-deploy-predict.ipynb b/use-cases/computer_vision/3-metastases-detection-deploy-predict.ipynb deleted file mode 100644 index d51ffb0892..0000000000 --- a/use-cases/computer_vision/3-metastases-detection-deploy-predict.ipynb +++ /dev/null @@ -1,402 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Computer Vision for Medical Imaging: Part 3. Deploy Model & Make Predictions\n", - "This notebook is part 3 of a 4-part series of techniques and services offer by SageMaker to build a model which predicts if an image of cells contains cancer. This notebook demonstrates how to use SageMaker to deploy a model and how to make predictions using the deployed model." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Dataset\n", - "The dataset for this demo comes from the [Camelyon16 Challenge](https://camelyon16.grand-challenge.org/) made available under the CC0 licencse. The raw data provided by the challenge has been processed into 96x96 pixel tiles by [Bas Veeling](https://github.com/basveeling/pcam) and also made available under the CC0 license. For detailed information on each dataset please see the papers below:\n", - "* Ehteshami Bejnordi et al. Diagnostic Assessment of Deep Learning Algorithms for Detection of Lymph Node Metastases in Women With Breast Cancer. JAMA: The Journal of the American Medical Association, 318(22), 2199–2210. [doi:jama.2017.14585](https://doi.org/10.1001/jama.2017.14585)\n", - "* B. S. Veeling, J. Linmans, J. Winkens, T. Cohen, M. Welling. \"Rotation Equivariant CNNs for Digital Pathology\". [arXiv:1806.03962](http://arxiv.org/abs/1806.03962)\n", - "\n", - "The tiled dataset from Bas Veeling is over 6GB of data. In order to easily run this demo, the dataset has been pruned to the first 14,000 images of the tiled dataset and comes included in the repo with this notebook for convenience." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Update Sagemaker SDK and Boto3\n", - "\n", - "
\n", - "NOTE You may get an error from pip's dependency resolver; you can ignore this error.\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%store -r\n", - "%store" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Import Libraries" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import boto3\n", - "import sagemaker\n", - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", - "import zipfile\n", - "import h5py\n", - "import cv2" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Configure Boto3 Clients and Sessions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "region = \"us-west-2\" # Change region as needed\n", - "boto3.setup_default_session(region_name=region)\n", - "boto_session = boto3.Session(region_name=region)\n", - "\n", - "s3_client = boto3.client(\"s3\", region_name=region)\n", - "\n", - "sagemaker_boto_client = boto_session.client(\"sagemaker\")\n", - "sagemaker_session = sagemaker.session.Session(\n", - " boto_session=boto_session, sagemaker_client=sagemaker_boto_client\n", - ")\n", - "sagemaker_role = sagemaker.get_execution_role()\n", - "\n", - "bucket = sagemaker.Session().default_bucket()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create Model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "results = sagemaker.analytics.HyperparameterTuningJobAnalytics(tuning_job_name)\n", - "results_df = results.dataframe()\n", - "best_training_job_summary = results.description()[\"BestTrainingJob\"]\n", - "best_training_job_details = sagemaker_boto_client.describe_training_job(\n", - " TrainingJobName=best_training_job_name\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model_name = 
\"metastasis-detection\"\n", - "model_matches = sagemaker_boto_client.list_models(NameContains=model_name)[\"Models\"]\n", - "training_image = sagemaker.image_uris.retrieve(\"image-classification\", region)\n", - "\n", - "if not model_matches:\n", - " print(f\"Creating model {model_name}\")\n", - " sagemaker_session.create_model_from_job(\n", - " name=model_name,\n", - " training_job_name=best_training_job_summary[\"TrainingJobName\"],\n", - " role=sagemaker_role,\n", - " image_uri=training_image,\n", - " )\n", - "else:\n", - " print(f\"Model {model_name} already exists.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Deploy Model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "training_jobs = results_df[\"TrainingJobName\"]\n", - "best_model_index = np.where(training_jobs.values == best_training_job_summary[\"TrainingJobName\"])[\n", - " 0\n", - "][0]\n", - "best_model_info = sagemaker_boto_client.describe_model_package(\n", - " ModelPackageName=model_packages[best_model_index][\"ModelPackageArn\"]\n", - ")\n", - "best_model_container = best_model_info.get(\"InferenceSpecification\").get(\"Containers\")[0]\n", - "deploy_instance_type = best_model_info.get(\"InferenceSpecification\").get(\n", - " \"SupportedRealtimeInferenceInstanceTypes\"\n", - ")[0]\n", - "\n", - "best_model = sagemaker.Model(\n", - " image_uri=best_model_container.get(\"Image\"),\n", - " model_data=best_model_container.get(\"ModelDataUrl\"),\n", - " role=sagemaker.get_execution_role(),\n", - " name=mpg_name,\n", - ")\n", - "\n", - "best_model.deploy(\n", - " initial_instance_count=1, instance_type=deploy_instance_type, endpoint_name=mpg_name\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%store deploy_instance_type" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Inference\n", - "Finally, 
the we can now validate the model for use. You can obtain the endpoint from the client library using the result from previous operations, and generate classifications from the trained model using that endpoint." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.model_selection import train_test_split\n", - "\n", - "with h5py.File(\"data/camelyon16_tiles.h5\", \"r\") as hf:\n", - " X = hf[\"x\"][()]\n", - " y = hf[\"y\"][()]\n", - "\n", - "X_numpy = X[:]\n", - "y_numpy = y[:]\n", - "\n", - "X_train, X_test, y_train, y_test = train_test_split(\n", - " X_numpy, y_numpy, test_size=1000, random_state=0\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "# view test image\n", - "image = X_test[0]\n", - "label = y_test[0]\n", - "plt.imshow(image)\n", - "plt.axis(\"off\")\n", - "plt.title(f\"Label: {label}\");" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "from PIL import Image\n", - "\n", - "img = Image.fromarray(X_test[0])\n", - "file_name = \"data/test_image.jpg\"\n", - "img.save(file_name)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[0.010918906889855862, 0.9890810251235962]\n" - ] - } - ], - "source": [ - "import json\n", - "\n", - "runtime = boto3.Session().client(service_name=\"runtime.sagemaker\")\n", - "with open(file_name, \"rb\") as f:\n", - " payload = f.read()\n", - " payload = bytearray(payload)\n", - "\n", - "response = runtime.invoke_endpoint(\n", - " EndpointName=mpg_name, ContentType=\"application/x-image\", Body=payload\n", - ")\n", - "\n", - "result = response[\"Body\"].read()\n", - "\n", - "# result will be in json format and convert it to ndarray\n", - "result = json.loads(result)\n", - "print(result)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# the result will output the probabilities for all classes\n", - "# find the class with maximum probability and print the class index\n", - "index = np.argmax(result)\n", - "index" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "predictions = []\n", - "for i in range(len(X_test)):\n", - " img = Image.fromarray(X_test[i])\n", - " file_name = f\"/tmp/test_image.jpg\"\n", - " 
img.save(file_name)\n", - "\n", - " with open(file_name, \"rb\") as f:\n", - " payload = f.read()\n", - " payload = bytearray(payload)\n", - "\n", - " response = runtime.invoke_endpoint(\n", - " EndpointName=mpg_name, ContentType=\"application/x-image\", Body=payload\n", - " )\n", - "\n", - " result = response[\"Body\"].read()\n", - " result = json.loads(result)\n", - " index = np.argmax(result)\n", - " predictions.append(index)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Precision = 0.8927835051546392\n", - "Recall = 0.8523622047244095\n", - "F1-Score = 0.8721047331319234\n" - ] - } - ], - "source": [ - "from sklearn.metrics import precision_recall_fscore_support\n", - "\n", - "precision, recall, f1, _ = precision_recall_fscore_support(y_test, predictions)\n", - "print(f\"Precision = {precision[1]}\")\n", - "print(f\"Recall = {recall[1]}\")\n", - "print(f\"F1-Score = {f1[1]}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Clean up resources" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "best_model.sagemaker_session.delete_endpoint(mpg_name)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "instance_type": "ml.t3.medium", - "kernelspec": { - "display_name": "conda_mxnet_p36", - "language": "python", - "name": "conda_mxnet_p36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.13" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/use-cases/index.rst b/use-cases/index.rst index 5f406a8668..8e2cae295e 100644 --- a/use-cases/index.rst +++ 
b/use-cases/index.rst @@ -15,7 +15,6 @@ Fleet Predictive Maintenance .. toctree:: :maxdepth: 1 - predictive_maintenance/0_usecase_and_architecture_predmaint predictive_maintenance/1_dataprep_dw_job_predmaint predictive_maintenance/2_dataprep_predmaint predictive_maintenance/3_train_tune_predict_predmaint @@ -37,10 +36,8 @@ Computer Vision for Medical Imaging .. toctree:: :maxdepth: 1 - computer_vision/1-metastases-detection-train-model - computer_vision/2-metastases-detection-lineage-registry - computer_vision/3-metastases-detection-deploy-predict - computer_vision/4-metastases-detection-pipeline + computer_vision/metastases-detection + computer_vision/metastases-detection-pipeline Pipelines with NLP for Product Rating Prediction diff --git a/use-cases/predictive_maintenance/0_usecase_and_architecture_predmaint.ipynb b/use-cases/predictive_maintenance/0_usecase_and_architecture_predmaint.ipynb deleted file mode 100644 index 9853666c61..0000000000 --- a/use-cases/predictive_maintenance/0_usecase_and_architecture_predmaint.ipynb +++ /dev/null @@ -1,167 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Fleet Predictive Maintenance: Part 1. Introduction\n", - "\n", - "*Using SageMaker Studio to Predict Fault Classification*\n", - "\n", - "---\n", - "\n", - "## Contents\n", - "\n", - "1. [Background](#0_Background)\n", - "1. [Setup](#0_Setup)\n", - "1. [Architecure](#0_Architecture)\n", - "1. [Data Prep: Processing Job from Data Wrangler Output](./1_dataprep_dw_job_predmaint.ipynb)\n", - "1. [Data Prep: Featurization](./2_dataprep_predmaint.ipynb.ipynb)\n", - "1. 
[Train, Tune and Predict using Batch Transform](./3_train_tune_predict_predmaint.ipynb.ipynb)\n", - "\n", - "\n", - "\n", - "\n", - "---\n", - " \n", - "## Background\n", - "\n", - "The purpose of this notebook is to demonstrate a Predictive Maintenance (PrM) solution for automible fleet maintenance via Amazon SageMaker Studio so that business users have a quick path towards a PrM POC. In this notebook, we focus on preprocessing engine sensor data before feature engineering and buidling an inital model leveraging SageMaker's algorithms. This notebook will cover the following:\n", - "\n", - "* Setup for using SageMaker\n", - "* Basic data cleaning, analysis and preprocessing\n", - "* Converting datasets to format used by the Amazon SageMaker algorithms and uploading to S3 \n", - "* Training SageMaker's linear learner on the dataset\n", - "* Hyperparamter tuning using SageMaker Automatic Tuning\n", - "* Deploying and getting predictions using Batch Transform\n", - "\n", - "## Important Notes: \n", - "\n", - "* Due to cost consideration, the goal of this example is to show you how to use some of SageMaker Studio's features, not necessarily to achieve the best result. \n", - "* We use the built-in classification algorithm in this example, and a Python 3 (Data Science) Kernel is required.\n", - "* The nature of predictive maintenace solutions, requires a domain knowledge expert of the system or machinery. With this in mind, we will make assumptions here for certain elements of this solution with the acknowldgement that these assumptions should be informed by a domain expert and a main business stakeholder\n", - "\n", - "Please see the README.md for more information about this use case. 
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " \n", - "## Set up\n", - "\n", - "Let's start by:\n", - "\n", - "* Setting up or refreshing storemagic variables \n", - "* Install and Import any dependencies\n", - "* Instatiate SageMaker session\n", - "* Specifying the S3 bucket and prefix that you want to use for your training and model data. This should be within the same region as SageMaker training\n", - "* Define the IAM role used to give training access to your data\n", - " " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### View stored variables from previous session\n", - "\n", - "If you ran this notebook before, you may want to re-use the resources you aready created with AWS. Run the cell below to load any prevously created variables. You should see a print-out of the existing variables. If you don't see anything you may need to create them again or it may be your first time running this notebook." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "After you run the notebooks each in succession you will accrue a set of stored variables, stored gradually as you run each notebook:\n", - "Stored variables and their in-db values:\n", - "\n", - "\n", - "- create_date -> '2021-03-16-06-42-12'\n", - "- dw_output_path_prm -> 's3://sagemaker-us-east-2-1234567890/export-flow\n", - "- exp_prefix -> 'sagemaker-experiments/linear-learner-2021-03-16-0\n", - "- experiment_name -> 'll-failure-classification-2021-03-16-06-42-12'\n", - "- features_created_prm -> True\n", - "- path_to_test_data_prm -> 's3://sagemaker-us-east-2-1234567890/test/test.c\n", - "- path_to_test_x_data_prm -> 's3://sagemaker-us-east-2-1234567890/test/test_x\n", - "- path_to_train_data_prm -> 's3://sagemaker-us-east-2-1234567890/train/train\n", - "- path_to_valid_data_prm -> 's3://sagemaker-us-east-2-1234567890/validation/\n", - "- trial_name_1 -> 'linear-learner-lr-training-job-2021-03-16-06-42-1\n", - "- trial_name_2 
-> 'linear-learner-svm-2021-03-16-06-00-37'\n", - "- trial_name_3 -> 'linear-learner-svm-thresh-2021-03-16-06-00-37'\n", - "- trial_name_4 -> 'linear-learner-svm-balanced-2021-03-16-06-00-37'\n", - "- tune_trial_name -> 'll-svm-tuning-job-trial'\n", - "- tuning_job_name -> 'll-svm-tuning-job'\n", - " \n", - " \n", - "" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%store -r\n", - "%store" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note : The above output will be null in the very beginning. On subsequent runs, you will see the stored variables. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " \n", - "## Architecture\n", - "\n", - "![solution_arch_diagram](./images/solution_arch_diagram.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---\n", - " \n", - "## Next Notebook : Data Prep with DataWrangler" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "instance_type": "ml.t3.medium", - "kernelspec": { - "display_name": "conda_python3", - "language": "python", - "name": "conda_python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.13" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/use-cases/predictive_maintenance/1_dataprep_dw_job_predmaint.ipynb b/use-cases/predictive_maintenance/1_dataprep_dw_job_predmaint.ipynb index 20cf3b8dc0..3ea4987017 100644 --- a/use-cases/predictive_maintenance/1_dataprep_dw_job_predmaint.ipynb +++ b/use-cases/predictive_maintenance/1_dataprep_dw_job_predmaint.ipynb @@ -4,18 +4,45 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Fleet Predictive Maintenance: Part 
2. Data Preparation with Data Wrangler\n", + "# Fleet Predictive Maintenance: Part 1. Data Preparation with SageMaker Data Wrangler\n", "\n", - "1. [Architecure](0_usecase_and_architecture_predmaint.ipynb#0_Architecture)\n", - "1. [Data Prep: Processing Job from Data Wrangler Output](./1_dataprep_dw_job_predmaint.ipynb)\n", + "*Using SageMaker Studio to Predict Fault Classification*\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Background\n", + "\n", + "This notebook is part of a sequence of notebooks whose purpose is to demonstrate a Predictive Maintenance (PrM) solution for automobile fleet maintenance via Amazon SageMaker Studio so that business users have a quick path towards a PrM POC. In this notebook, we will be focusing on preprocessing engine sensor data. It is the first notebook in a series of notebooks. You can choose to run this notebook by itself or in sequence with the other notebooks listed below. Please see the [README.md](README.md) for more information about this use case and the implementation of this sequence of notebooks. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. [Data Prep: Processing Job from Data Wrangler Output](./1_dataprep_dw_job_predmaint.ipynb) (current notebook)\n", "1. [Data Prep: Featurization](./2_dataprep_predmaint.ipynb)\n", - "1. [Train, Tune and Predict using Batch Transform](./3_train_tune_predict_predmaint.ipynb.ipynb)" + "1. [Train, Tune and Predict using Batch Transform](./3_train_tune_predict_predmaint.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Important Notes: \n", + "\n", + "* Due to cost considerations, the goal of this example is to show you how to use some of SageMaker Studio's features, not necessarily to achieve the best result. 
\n", + "* We use the built-in classification algorithm in this example, and a Python 3 (Data Science) Kernel is required.\n", + "* The nature of predictive maintenance solutions requires a domain knowledge expert of the system or machinery. With this in mind, we will make assumptions here for certain elements of this solution with the acknowledgement that these assumptions should be informed by a domain expert and a main business stakeholder" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ + "----\n", "## SageMaker Data Wrangler Job Notebook\n", "\n", "This notebook uses the Data Wrangler .flow file to submit a SageMaker Data Wrangler Job\n", @@ -31,22 +58,17 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# SageMaker Python SDK version 2.x is required\n", - "import pkg_resources\n", - "import subprocess\n", - "import sys\n", - "\n", - "original_version = pkg_resources.get_distribution(\"sagemaker\").version\n", - "_ = subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"sagemaker==2.20.0\"])" + "# Upgrade SageMaker to the latest version\n", + "! pip install --upgrade sagemaker" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -70,7 +92,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -87,18 +109,15 @@ "\n", "iam_role = sagemaker.get_execution_role()\n", "\n", - "container_uri = (\n", - " \"415577184552.dkr.ecr.us-east-2.amazonaws.com/sagemaker-data-wrangler-container:1.2.1\"\n", - ")\n", - "\n", "# Processing Job Resources Configurations\n", "# Data wrangler processing job only supports 1 instance.\n", "instance_count = 1\n", "instance_type = \"ml.m5.4xlarge\"\n", "\n", - "# Processing Job Path URI Information\n", + "# Processing Job Path URI Information. 
This is where the output data from SageMaker Data Wrangler will be stored.\n", "output_prefix = f\"export-{flow_name}/output\"\n", "output_path = f\"s3://{bucket}/{output_prefix}\"\n", + "# Output name is auto-generated from the select node's ID + output name from the flow file, which specifies how the data will be transformed.\n", "output_name = \"ff586e7b-a02d-472b-91d4-da3dd05d7a30.default\"\n", "\n", "processing_job_name = f\"data-wrangler-flow-processing-{flow_id}\"\n", @@ -128,14 +147,16 @@ "metadata": {}, "outputs": [], "source": [ - "from demo_helpers import update_dw_s3uri, get_dw_container_for_region\n", + "from demo_helpers import update_dw_s3uri\n", "\n", "# update the flow file to change the s3 location to our bucket\n", "update_dw_s3uri(flow_file_name)\n", "\n", "# get the Data Wrangler container associated with our region\n", "region = boto3.Session().region_name\n", - "container_uri = get_dw_container_for_region(region)\n", + "container_uri = sagemaker.image_uris.retrieve(\n", + " \"data-wrangler\", sagemaker.Session().boto_region_name, version=\"1.0.1\"\n", + ")\n", "\n", "dw_output_path_prm = output_path\n", "print(\n", @@ -183,7 +204,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -333,6 +354,140 @@ ")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Cleaning with Data Wrangler\n", + "\n", + "#### Load, preparation, EDA and Preprocessing \n", + "\n", + "[contents](#2_Contents)\n", + "\n", + "For the initial data preparation and exploration, we will utilize SageMaker's new feature, Data Wrangler, to load data and do some data transformations. In the Data Wrangler GUI, we will perform the following steps. Note that because this data is generated, the data is relatively clean and there are few data cleaning steps needed. After completing these steps, you can uncomment and run the code below to inspect your cleaned data.\n", + "1. 
Load fleet sensor logs data from S3\n", + "1. Load fleet details data from S3\n", + "1. Change column data types \n", + "1. Change column headers \n", + "1. Check for Null/NA values (impute or drop)\n", + "1. Join sensor and details data\n", + "1. One-Hot Encode categorical features\n", + "1. Do preliminary analysis using built-in feature\n", + "1. Export recipe as SageMaker Data Wrangler job\n", + "1. Upload final cleaned data set to S3\n", + "\n", + "\n", + "\n", + "For our purposes, we will download the final cleaned data set from S3 into our SageMaker Studio instance, but for more information on how to load and preprocess tabular data follow this link: [Tabular Preprocessing Blog]().\n", + "For additional information on preprocessing for PrM, please refer to this blog, [On the relevance of preprocessing in predictive\n", + "maintenance for dynamic systems](https://bird.bcamath.org/bitstream/handle/20.500.11824/892/CernudaPREDICT2018S16.pdf?sequence=1&isAllowed=y)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# fleet = wr.s3.read_csv(path=dw_output_path_prm, dataset=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# # add in additional features and change data types\n", + "# fleet[\"datetime\"] = pd.to_datetime(fleet[\"datetime\"], format=\"%Y-%m-%d %H:%M:%S\")\n", + "# fleet[\"cycle\"] = fleet.groupby(\"vehicle_id\")[\"datetime\"].rank(\"dense\")\n", + "# fleet[\"make\"] = fleet[\"make\"].astype(\"category\")\n", + "# fleet[\"model\"] = fleet[\"model\"].astype(\"category\")\n", + "# fleet[\"vehicle_class\"] = fleet[\"vehicle_class\"].astype(\"category\")\n", + "# fleet[\"engine_type\"] = fleet[\"engine_type\"].astype(\"category\")\n", + "# fleet[\"engine_age\"] = fleet[\"datetime\"].dt.year - fleet[\"year\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + 
"# fleet = fleet[\n", + "# [\n", + "# \"target\",\n", + "# \"vehicle_id\",\n", + "# \"datetime\",\n", + "# \"make\",\n", + "# \"model\",\n", + "# \"year\",\n", + "# \"vehicle_class\",\n", + "# \"engine_type\",\n", + "# \"make_code_Make A\",\n", + "# \"make_code_Make B\",\n", + "# \"make_code_Make E\",\n", + "# \"make_code_Make C\",\n", + "# \"make_code_Make D\",\n", + "# \"model_code_Model E1\",\n", + "# \"model_code_Model A4\",\n", + "# \"model_code_Model B1\",\n", + "# \"model_code_Model B2\",\n", + "# \"model_code_Model A2\",\n", + "# \"model_code_Model A3\",\n", + "# \"model_code_Model B3\",\n", + "# \"model_code_Model C2\",\n", + "# \"model_code_Model A1\",\n", + "# \"model_code_Model A5\",\n", + "# \"model_code_Model A6\",\n", + "# \"model_code_Model C1\",\n", + "# \"model_code_Model D1\",\n", + "# \"model_code_Model E2\",\n", + "# \"vehicle_class_code_Truck-Tractor\",\n", + "# \"vehicle_class_code_Truck\",\n", + "# \"vehicle_class_code_Bus\",\n", + "# \"vehicle_class_code_Transport\",\n", + "# \"engine_type_code_Engine E\",\n", + "# \"engine_type_code_Engine C\",\n", + "# \"engine_type_code_Engine B\",\n", + "# \"engine_type_code_Engine F\",\n", + "# \"engine_type_code_Engine H\",\n", + "# \"engine_type_code_Engine D\",\n", + "# \"engine_type_code_Engine A\",\n", + "# \"engine_type_code_Engine G\",\n", + "# \"voltage\",\n", + "# \"current\",\n", + "# \"resistance\",\n", + "# \"cycle\",\n", + "# \"engine_age\",\n", + "# ]\n", + "# ]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# fleet.sort_values(by=[\"vehicle_id\", \"datetime\"], inplace=True)\n", + "# fleet.to_csv(\"fleet_data.csv\", index=False)\n", + "# fleet.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you followed the above steps correctly, you data should match that of the existing [fleet_data.csv](fleet_data.csv). 
It would also fit the following key observations:\n", + "\n", + "- There are 90 vehicles in the fleet\n", + "- Data has 9000 observations and 44 columns.\n", + "- Vehicles can be identified using the 'vehicle_id' column.\n", + "- The label column, called 'target', is an indicator of failure ('0' = No Failure; '1' = Failure).\n", + "- There are 4 numeric features available for prediction and 4 categorical features. We will expand upon these in the Feature Engineering notebook, Part 2 of this series. " + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -346,8 +501,7 @@ "It is important to note that the following XGBoost objective ['binary', 'regression',\n", "'multiclass'], hyperparameters, or content_type may not be suitable for the output data, and will\n", "require changes to train a proper model. Furthermore, for CSV training, the algorithm assumes that\n", - "the target variable is in the first column. For more information on SageMaker XGBoost, please see\n", - "https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html.\n", + "the target variable is in the first column. For more information on SageMaker XGBoost, please see [XGBoost Algorithm](https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html).\n", "\n", "### Find Training Data path\n", "\n", @@ -377,12 +531,12 @@ "metadata": {}, "source": [ "Next, the Training Job hyperparameters are set. For more information on XGBoost Hyperparameters,\n", - "see https://xgboost.readthedocs.io/en/latest/parameter.html." + "see [XGBoost Parameters](https://xgboost.readthedocs.io/en/latest/parameter.html)." 
] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -425,30 +579,6 @@ ")\n", "estimator.fit({\"train\": train_input})" ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Cleanup\n", - "\n", - "Uncomment the following code cell to revert the SageMaker Python SDK to the original version used\n", - "before running this notebook. This notebook upgrades the SageMaker Python SDK to 2.x, which may\n", - "cause other example notebooks to break. To learn more about the changes introduced in the\n", - "SageMaker Python SDK 2.x update, see\n", - "[Use Version 2.x of the SageMaker Python SDK.](https://sagemaker.readthedocs.io/en/stable/v2.html)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# _ = subprocess.check_call(\n", - "# [sys.executable, \"-m\", \"pip\", \"install\", f\"sagemaker=={original_version}\"]\n", - "# )" - ] } ], "metadata": { diff --git a/use-cases/predictive_maintenance/2_dataprep_predmaint.ipynb b/use-cases/predictive_maintenance/2_dataprep_predmaint.ipynb index 983bc2456b..1dca5c3a41 100644 --- a/use-cases/predictive_maintenance/2_dataprep_predmaint.ipynb +++ b/use-cases/predictive_maintenance/2_dataprep_predmaint.ipynb @@ -4,104 +4,73 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Fleet Predictive Maintenance: Part 3. Feature Engineering\n", + "# Fleet Predictive Maintenance: Part 2. Feature Engineering and Exploratory Data Visualization\n", "\n", - "## Data Preparation: Featurization and Exploratory Data Visualization\n", - "\n", - "*Using SageMaker Studio to Predict Fault Classification*\n", - "\n", - "1. [Architecure](0_usecase_and_architecture_predmaint.ipynb#0_Architecture)\n", - "1. [Data Prep: Processing Job from Data Wrangler Output](./1_dataprep_dw_job_predmaint.ipynb)\n", - "1. [Data Prep: Featurization](./2_dataprep_predmaint.ipynb)\n", - "1. 
[Train, Tune and Predict using Batch Transform](./3_train_tune_predict_predmaint.ipynb)" + "*Using SageMaker Studio to Predict Fault Classification*" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "---\n", - " \n", - "\n", - "## Contents\n", - "\n", - "1. [Background](#Background)\n", - "1. [Setup](#2_Setup)\n", - "1. [Data](#2_Data)\n", - "1. [Feature Engineering](#2_Features)\n", - "1. [Data Visualization](#2_Visualization)\n", - "\n", - "\n", - "---\n", - "\n", "## Background\n", "\n", - "The purpose of this notebook is to demonstrate a Predictive Maintenance (PrM) solution for automible fleet maintenance via Amazon SageMaker Studio so that business users have a quick path towards a PrM POC. In this notebook, we focus on preprocessing engine sensor data before feature engineering and buidling an inital model leveraging SageMaker's algorithms. This notebook will cover the following:\n", - "\n", - "* Setup for using SageMaker\n", - "* Basic data cleaning, analysis and preprocessing\n", - "* Converting datasets to format used by the Amazon SageMaker algorithms and uploading to S3 \n", - "* Training SageMaker's linear learner on the dataset\n", - "* Hyperparamter tuning using SageMaker Automatic Tuning\n", - "* Deploying and getting predictions using Batch Transform\n", - "\n", - "## Important Notes: \n", - "\n", - "* Due to cost consideration, the goal of this example is to show you how to use some of SageMaker Studio's features, not necessarily to achieve the best result. \n", - "* We use the built-in classification algorithm in this example, and a Python 3 (Data Science) Kernel is required.\n", - "* The nature of predictive maintenace solutions, requires a domain knowledge expert of the system or machinery. 
With this in mind, we will make assumptions here for certain elements of this solution with the acknowldgement that these assumptions should be informed by a domain expert and a main business stakeholder\n", - "\n", - "Please see the README.md for more information about this use case. " + "This notebook is part of a sequence of notebooks whose purpose is to demonstrate a Predictive Maintenance (PrM) solution for automobile fleet maintenance via Amazon SageMaker Studio so that business users have a quick path towards a PrM POC. In this notebook, we will be focusing on feature engineering. It is the second notebook in a series of notebooks. You can choose to run this notebook by itself or in sequence with the other notebooks listed below. Please see the [README.md](README.md) for more information about this use case implement of this sequence of notebooks. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "---\n", - " \n", - "## Set up\n", - "\n", - "[contents](#2_Contents)\n", - "\n", - "Let's start by:\n", - "\n", - "* Setting up or refreshing storemagic variables \n", - "* Install and Import any dependencies\n", - "* Instatiate SageMaker session\n", - "* Specifying the S3 bucket and prefix that you want to use for your training and model data. This should be within the same region as SageMaker training\n", - "* Define the IAM role used to give training access to your data\n", - " " + "1. [Data Prep: Processing Job from SageMaker Data Wrangler Output](./1_dataprep_dw_job_predmaint.ipynb)\n", + "1. [Data Prep: Featurization](./2_dataprep_predmaint.ipynb) (current notebook)\n", + "1. [Train, Tune and Predict using Batch Transform](./3_train_tune_predict_predmaint.ipynb)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### View stored variables from previous session\n", - "If you ran this notebook before, you may want to re-use the resources you aready created with AWS. Run the cell below to load any prevously created variables. 
You should see a print-out of the existing variables. If you don't see anything you may need to create them again or it may be your first time running this notebook." + "## Important Notes: \n", + "\n", + "* Due to cost consideration, the goal of this example is to show you how to use some of SageMaker Studio's features, not necessarily to achieve the best result. \n", + "* We use the built-in classification algorithm in this example, and a Python 3 (Data Science) Kernel is required.\n", + "* The nature of predictive maintenance solutions requires a domain knowledge expert of the system or machinery. With this in mind, we will make assumptions here for certain elements of this solution with the acknowledgement that these assumptions should be informed by a domain expert and a main business stakeholder" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "%store -r\n", - "%store" + "---\n", + " \n", + "\n", + "## Contents\n", + "\n", + "1. [Setup](#Setup)\n", + "1. [Feature Engineering](#Feature-Engineering)\n", + "1. [Visualization of the Data Distributions](#Visualization-of-the-Data-Distributions)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Note : dw_output_path_prm should appear above as a stored (restored) variable, whose value was set when you ran notebook 1_datapred_predmaint.ipynb" + "---\n", + "## Setup\n", + "\n", + "Let's start by:\n", + "\n", + "* Installing and importing any dependencies\n", + "* Instantiating SageMaker session\n", + "* Specifying the S3 bucket and prefix that you want to use for your training and model data. 
This should be within the same region as SageMaker training\n", + "* Defining the IAM role used to give training access to your data\n", + " " ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -111,7 +80,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -151,184 +120,39 @@ "metadata": {}, "source": [ "---\n", - " \n", - "## Data\n", - "\n", - "#### Load, preparation, EDA and Preprocessing \n", - "\n", - "[contents](#2_Contents)\n", - "\n", - "For the initial data preparation and exploration, we will utilize SageMaker's new feature, Data Wrangler, to load data and do some data transformations. In the Data Wrangler GUI, we will perform the following steps. Note that because this data is generated, the data is relatively clean and there are few data cleaning steps needed. \n", - "1. Load fleet sensor logs data from S3\n", - "1. Load fleet details data from S3\n", - "1. Change column data types \n", - "1. Change coulmn headers \n", - "1. Check for Null/NA values (impute or drop)\n", - "1. Join sensor and details data\n", - "1. One-Hot Encode categorical features\n", - "1. Do preliminar analysis using built-in feature\n", - "1. Export recipe as SageMaker Data Wrangler job\n", - "1. Upload final cleaned data set to S3\n", - "\n", + "## Feature Engineering \n", "\n", + "For PrM, feature selection, generation and engineering is extremely important and very dependent on domain expertise and understanding of the systems involved. 
For our solution, we will focus on the some simple features such as:\n", + "* lag features \n", + "* rolling average\n", + "* rolling standard deviation \n", + "* age of the engines \n", + "* categorical labels\n", "\n", - "For our purposes, we will download the final clened data set from S3 into our SageMaker Studio instance, but for more information on how to load and preprocess tabular data follow this link: [Tabular Preprocessing Blog]().\n", - "For additional information on preprocessing for PrM, please refer to this blog, [On the relevance of preprocessing in predictive\n", - "maintenance for dynamic systems](https://bird.bcamath.org/bitstream/handle/20.500.11824/892/CernudaPREDICT2018S16.pdf?sequence=1&isAllowed=y)." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "fleet = wr.s3.read_csv(path=dw_output_path_prm, dataset=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:numexpr.utils:NumExpr defaulting to 2 threads.\n" - ] - } - ], - "source": [ - "# add in additional features and change data types\n", - "fleet[\"datetime\"] = pd.to_datetime(fleet[\"datetime\"], format=\"%Y-%m-%d %H:%M:%S\")\n", - "fleet[\"cycle\"] = fleet.groupby(\"vehicle_id\")[\"datetime\"].rank(\"dense\")\n", - "fleet[\"make\"] = fleet[\"make\"].astype(\"category\")\n", - "fleet[\"model\"] = fleet[\"model\"].astype(\"category\")\n", - "fleet[\"vehicle_class\"] = fleet[\"vehicle_class\"].astype(\"category\")\n", - "fleet[\"engine_type\"] = fleet[\"engine_type\"].astype(\"category\")\n", - "fleet[\"engine_age\"] = fleet[\"datetime\"].dt.year - fleet[\"year\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "fleet = fleet[\n", - " [\n", - " \"target\",\n", - " \"vehicle_id\",\n", - " \"datetime\",\n", - " \"make\",\n", - " \"model\",\n", - " 
\"year\",\n", - " \"vehicle_class\",\n", - " \"engine_type\",\n", - " \"make_code_Make A\",\n", - " \"make_code_Make B\",\n", - " \"make_code_Make E\",\n", - " \"make_code_Make C\",\n", - " \"make_code_Make D\",\n", - " \"model_code_Model E1\",\n", - " \"model_code_Model A4\",\n", - " \"model_code_Model B1\",\n", - " \"model_code_Model B2\",\n", - " \"model_code_Model A2\",\n", - " \"model_code_Model A3\",\n", - " \"model_code_Model B3\",\n", - " \"model_code_Model C2\",\n", - " \"model_code_Model A1\",\n", - " \"model_code_Model A5\",\n", - " \"model_code_Model A6\",\n", - " \"model_code_Model C1\",\n", - " \"model_code_Model D1\",\n", - " \"model_code_Model E2\",\n", - " \"vehicle_class_code_Truck-Tractor\",\n", - " \"vehicle_class_code_Truck\",\n", - " \"vehicle_class_code_Bus\",\n", - " \"vehicle_class_code_Transport\",\n", - " \"engine_type_code_Engine E\",\n", - " \"engine_type_code_Engine C\",\n", - " \"engine_type_code_Engine B\",\n", - " \"engine_type_code_Engine F\",\n", - " \"engine_type_code_Engine H\",\n", - " \"engine_type_code_Engine D\",\n", - " \"engine_type_code_Engine A\",\n", - " \"engine_type_code_Engine G\",\n", - " \"voltage\",\n", - " \"current\",\n", - " \"resistance\",\n", - " \"cycle\",\n", - " \"engine_age\",\n", - " ]\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(9000, 44)" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fleet.sort_values(by=[\"vehicle_id\", \"datetime\"], inplace=True)\n", - "fleet.to_csv(\"fleet_data.csv\", index=False)\n", - "fleet.shape" + "These features serve as a small example of the potential features that could be created. Other features to consider are changes in the sensor values within a window, change from the initial value or number over a defined threshold. 
For additional guidance on Feature Engineering, visit the [SageMaker Tabular Feature Engineering guide](). " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "#### Key observations:\n", - "\n", - "- There are 90 vehicles in the fleet\n", - "- Data has 9000 observations and 44 columns.\n", - "- Vehicle can be identified useing the 'vehicle_id' column.\n", - "- The label column, called 'Target', is an indicator of failure ('0' = No Failure; '1' = Failure).\n", - "- There are 4 numeric features available for prediction and 4 categorical features. We will expand upon these later in the Feature Engineering section of this notebook. " + "First, we load up our cleaned dataset, which can be produced by following the steps in the notebook [Data Prep: Processing Job from SageMaker Data Wrangler Output](./1_dataprep_dw_job_predmaint.ipynb) (the first section in this notebook series). See the [Background](#Background) section at the beginning of the notebook for more information." ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# # run this cell to pick-up the new cleaned dataset\n", - "# fleet = pd.read_csv('fleet_data.csv')" + "fleet = pd.read_csv(\"fleet_data.csv\")" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "%matplotlib inline\n", "fig, axs = plt.subplots(3, 1, figsize=(20, 15))\n", @@ -346,20 +170,9 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "fig, axs = plt.subplots(3, 1, figsize=(20, 15))\n", "plot_fleet = fleet.loc[fleet[\"vehicle_id\"] == 2]\n", @@ -376,26 +189,9 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0 7238\n", - "1 1762\n", - "Name: target, dtype: int64\n", - "\n", - "Percent of failures in the dataset: 0.19577777777777777\n", - "Number of vehicles with 1+ failures: 49\n", - "\n", - "0 0.804222\n", - "1 0.195778\n", - "Name: target, dtype: float64\n" - ] - } - ], + "outputs": [], "source": [ "# let's look at the proportion of failures to non-failure\n", "print(fleet[\"target\"].value_counts())\n", @@ -422,38 +218,9 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " percentage of failures\n", - "vehicle_id \n", - "84 1.00\n", - "65 1.00\n", - "17 1.00\n", - "71 1.00\n", - "28 0.99\n", - "15 0.92\n", - "3 0.88\n", - "63 0.76\n", - "31 0.74\n", - "40 0.73\n", - "75 0.67\n", - "6 0.66\n", - "73 0.61\n", - "42 0.58\n", - "64 0.49\n", - "85 0.42\n", - "16 0.40\n", - "22 0.38\n", - "39 0.36\n", - "26 0.35\n" - ] - } - ], + "outputs": [], "source": [ "p = fleet.groupby([\"vehicle_id\"])[\"target\"].sum().rename(\"percentage of failures\")\n", "fail_percent = pd.DataFrame(p / 100)\n", @@ -463,123 +230,9 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "target 0\n", - "vehicle_id 0\n", - "datetime 0\n", - "make 0\n", - "model 0\n", - "year 0\n", - "vehicle_class 0\n", - "engine_type 0\n", - "make_code_Make A 0\n", - "make_code_Make B 0\n", - "make_code_Make E 0\n", - "make_code_Make C 0\n", - "make_code_Make D 0\n", - 
"model_code_Model E1 0\n", - "model_code_Model A4 0\n", - "model_code_Model B1 0\n", - "model_code_Model B2 0\n", - "model_code_Model A2 0\n", - "model_code_Model A3 0\n", - "model_code_Model B3 0\n", - "model_code_Model C2 0\n", - "model_code_Model A1 0\n", - "model_code_Model A5 0\n", - "model_code_Model A6 0\n", - "model_code_Model C1 0\n", - "model_code_Model D1 0\n", - "model_code_Model E2 0\n", - "vehicle_class_code_Truck-Tractor 0\n", - "vehicle_class_code_Truck 0\n", - "vehicle_class_code_Bus 0\n", - "vehicle_class_code_Transport 0\n", - "engine_type_code_Engine E 0\n", - "engine_type_code_Engine C 0\n", - "engine_type_code_Engine B 0\n", - "engine_type_code_Engine F 0\n", - "engine_type_code_Engine H 0\n", - "engine_type_code_Engine D 0\n", - "engine_type_code_Engine A 0\n", - "engine_type_code_Engine G 0\n", - "voltage 0\n", - "current 0\n", - "resistance 0\n", - "cycle 0\n", - "engine_age 0\n", - "dtype: int64\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
targetvehicle_iddatetimemakemodelyearvehicle_classengine_typemake_code_Make Amake_code_Make B...engine_type_code_Engine Fengine_type_code_Engine Hengine_type_code_Engine Dengine_type_code_Engine Aengine_type_code_Engine Gvoltagecurrentresistancecycleengine_age
\n", - "

0 rows × 44 columns

\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: [target, vehicle_id, datetime, make, model, year, vehicle_class, engine_type, make_code_Make A, make_code_Make B, make_code_Make E, make_code_Make C, make_code_Make D, model_code_Model E1, model_code_Model A4, model_code_Model B1, model_code_Model B2, model_code_Model A2, model_code_Model A3, model_code_Model B3, model_code_Model C2, model_code_Model A1, model_code_Model A5, model_code_Model A6, model_code_Model C1, model_code_Model D1, model_code_Model E2, vehicle_class_code_Truck-Tractor, vehicle_class_code_Truck, vehicle_class_code_Bus, vehicle_class_code_Transport, engine_type_code_Engine E, engine_type_code_Engine C, engine_type_code_Engine B, engine_type_code_Engine F, engine_type_code_Engine H, engine_type_code_Engine D, engine_type_code_Engine A, engine_type_code_Engine G, voltage, current, resistance, cycle, engine_age]\n", - "Index: []\n", - "\n", - "[0 rows x 44 columns]" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# check for missing values\n", "print(fleet.isnull().sum())\n", @@ -588,30 +241,9 @@ "fleet[fleet.loc[:, \"voltage\":\"resistance\"].values == 0]" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---\n", - " \n", - "## Feature Engineering \n", - "\n", - "\n", - "[contents](#2_Contents)\n", - "\n", - "For PrM, feature selection, generation and engineering is extremely important and very depended on domain expertise and understanding of the systems involved. For our solution, we will focus on the some simple features such as:\n", - "* lag features \n", - "* rolling average\n", - "* rolling standard deviation \n", - "* age of the engines \n", - "* categorical labels\n", - "\n", - "These features serve as a small example of the potential features that could be created. 
Other features to consider are changes in the sensor values within a window, change from the initial value or number over a defined threshold. For additional guidance on Feature Engineering, visit the [SageMaker Tabular Feature Engineering guide](). " - ] - }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -622,7 +254,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -642,135 +274,9 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
vehicle_idvoltage_rolling_mean_4current_rolling_mean_4resistance_rolling_mean_4voltage_rolling_std_4current_rolling_std_4resistance_rolling_std_4
level_1
0014.0340300.173326128.3127600.0542980.0042014.661643
1014.0340300.173326128.3127600.0542980.0042014.661643
2014.0340300.173326128.3127600.0542980.0042014.661643
3014.0340300.173326128.3127600.0542980.0042014.661643
4014.0119340.172462121.8480690.0285050.00339810.347376
\n", - "
" - ], - "text/plain": [ - " vehicle_id voltage_rolling_mean_4 current_rolling_mean_4 \\\n", - "level_1 \n", - "0 0 14.034030 0.173326 \n", - "1 0 14.034030 0.173326 \n", - "2 0 14.034030 0.173326 \n", - "3 0 14.034030 0.173326 \n", - "4 0 14.011934 0.172462 \n", - "\n", - " resistance_rolling_mean_4 voltage_rolling_std_4 \\\n", - "level_1 \n", - "0 128.312760 0.054298 \n", - "1 128.312760 0.054298 \n", - "2 128.312760 0.054298 \n", - "3 128.312760 0.054298 \n", - "4 121.848069 0.028505 \n", - "\n", - " current_rolling_std_4 resistance_rolling_std_4 \n", - "level_1 \n", - "0 0.004201 4.661643 \n", - "1 0.004201 4.661643 \n", - "2 0.004201 4.661643 \n", - "3 0.004201 4.661643 \n", - "4 0.003398 10.347376 " - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# create rolling stats for voltage, current and resistance group by vehicle_id\n", "stats = pd.DataFrame()\n", @@ -801,136 +307,9 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
targetvehicle_iddatetimemakemodelyearvehicle_classengine_typemake_code_Make Amake_code_Make B...engine_agevoltage_lag_1current_lag_1resistance_lag_1voltage_rolling_mean_4current_rolling_mean_4resistance_rolling_mean_4voltage_rolling_std_4current_rolling_std_4resistance_rolling_std_4
0002020-01-01 00:00:00Make AModel A12018TruckEngine A1.00.0...214.1034210.177269133.05960314.034030.173326128.312760.0542980.0042014.661643
1002020-01-01 02:00:00Make AModel A12018TruckEngine A1.00.0...214.1034210.177269133.05960314.034030.173326128.312760.0542980.0042014.661643
\n", - "

2 rows × 53 columns

\n", - "
" - ], - "text/plain": [ - " target vehicle_id datetime make model year \\\n", - "0 0 0 2020-01-01 00:00:00 Make A Model A1 2018 \n", - "1 0 0 2020-01-01 02:00:00 Make A Model A1 2018 \n", - "\n", - " vehicle_class engine_type make_code_Make A make_code_Make B ... \\\n", - "0 Truck Engine A 1.0 0.0 ... \n", - "1 Truck Engine A 1.0 0.0 ... \n", - "\n", - " engine_age voltage_lag_1 current_lag_1 resistance_lag_1 \\\n", - "0 2 14.103421 0.177269 133.059603 \n", - "1 2 14.103421 0.177269 133.059603 \n", - "\n", - " voltage_rolling_mean_4 current_rolling_mean_4 resistance_rolling_mean_4 \\\n", - "0 14.03403 0.173326 128.31276 \n", - "1 14.03403 0.173326 128.31276 \n", - "\n", - " voltage_rolling_std_4 current_rolling_std_4 resistance_rolling_std_4 \n", - "0 0.054298 0.004201 4.661643 \n", - "1 0.054298 0.004201 4.661643 \n", - "\n", - "[2 rows x 53 columns]" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "fleet_lagged = pd.concat([fleet, stats.drop(columns=[\"vehicle_id\"])], axis=1)\n", "fleet_lagged.head(2)" @@ -938,680 +317,9 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
countmeanstdmin25%50%75%max
target9000.00.200.400.000.000.000.001.00
vehicle_id9000.044.5025.980.0022.0044.5067.0089.00
year9000.02016.073.062006.002015.002017.002018.002020.00
make_code_Make A9000.00.400.490.000.000.001.001.00
make_code_Make B9000.00.240.430.000.000.000.001.00
make_code_Make E9000.00.200.400.000.000.000.001.00
make_code_Make C9000.00.110.310.000.000.000.001.00
make_code_Make D9000.00.040.210.000.000.000.001.00
model_code_Model E19000.00.180.380.000.000.000.001.00
model_code_Model A49000.00.130.340.000.000.000.001.00
model_code_Model B19000.00.090.280.000.000.000.001.00
model_code_Model B29000.00.090.280.000.000.000.001.00
model_code_Model A29000.00.070.250.000.000.000.001.00
model_code_Model A39000.00.070.250.000.000.000.001.00
model_code_Model B39000.00.070.250.000.000.000.001.00
model_code_Model C29000.00.070.250.000.000.000.001.00
model_code_Model A19000.00.040.210.000.000.000.001.00
model_code_Model A59000.00.040.210.000.000.000.001.00
model_code_Model A69000.00.040.210.000.000.000.001.00
model_code_Model C19000.00.040.210.000.000.000.001.00
model_code_Model D19000.00.040.210.000.000.000.001.00
model_code_Model E29000.00.020.150.000.000.000.001.00
vehicle_class_code_Truck-Tractor9000.00.670.470.000.001.001.001.00
vehicle_class_code_Truck9000.00.200.400.000.000.000.001.00
vehicle_class_code_Bus9000.00.090.280.000.000.000.001.00
vehicle_class_code_Transport9000.00.040.210.000.000.000.001.00
engine_type_code_Engine E9000.00.310.460.000.000.001.001.00
engine_type_code_Engine C9000.00.270.440.000.000.001.001.00
engine_type_code_Engine B9000.00.180.380.000.000.000.001.00
engine_type_code_Engine F9000.00.090.280.000.000.000.001.00
engine_type_code_Engine H9000.00.070.250.000.000.000.001.00
engine_type_code_Engine D9000.00.040.210.000.000.000.001.00
engine_type_code_Engine A9000.00.020.150.000.000.000.001.00
engine_type_code_Engine G9000.00.020.150.000.000.000.001.00
voltage9000.013.650.4011.5513.3713.7013.9315.94
current9000.00.170.060.010.130.160.190.39
resistance9000.087.0222.9234.3858.7994.69102.61138.36
cycle9000.050.5028.871.0025.7550.5075.25100.00
engine_age9000.03.933.060.002.003.005.0014.00
voltage_lag_19000.013.650.4111.5513.3713.7013.9315.94
current_lag_19000.00.170.060.010.130.160.190.39
resistance_lag_19000.087.0222.9534.3858.8494.69102.64138.36
voltage_rolling_mean_49000.013.650.4111.7713.3613.7013.9315.87
current_rolling_mean_49000.00.170.060.020.140.160.190.39
resistance_rolling_mean_49000.087.0322.9335.2258.7594.81102.56136.35
voltage_rolling_std_49000.00.040.040.000.010.030.060.28
current_rolling_std_49000.00.000.000.000.000.000.000.02
resistance_rolling_std_49000.01.020.730.000.520.891.3610.94
\n", - "
" - ], - "text/plain": [ - " count mean std min 25% \\\n", - "target 9000.0 0.20 0.40 0.00 0.00 \n", - "vehicle_id 9000.0 44.50 25.98 0.00 22.00 \n", - "year 9000.0 2016.07 3.06 2006.00 2015.00 \n", - "make_code_Make A 9000.0 0.40 0.49 0.00 0.00 \n", - "make_code_Make B 9000.0 0.24 0.43 0.00 0.00 \n", - "make_code_Make E 9000.0 0.20 0.40 0.00 0.00 \n", - "make_code_Make C 9000.0 0.11 0.31 0.00 0.00 \n", - "make_code_Make D 9000.0 0.04 0.21 0.00 0.00 \n", - "model_code_Model E1 9000.0 0.18 0.38 0.00 0.00 \n", - "model_code_Model A4 9000.0 0.13 0.34 0.00 0.00 \n", - "model_code_Model B1 9000.0 0.09 0.28 0.00 0.00 \n", - "model_code_Model B2 9000.0 0.09 0.28 0.00 0.00 \n", - "model_code_Model A2 9000.0 0.07 0.25 0.00 0.00 \n", - "model_code_Model A3 9000.0 0.07 0.25 0.00 0.00 \n", - "model_code_Model B3 9000.0 0.07 0.25 0.00 0.00 \n", - "model_code_Model C2 9000.0 0.07 0.25 0.00 0.00 \n", - "model_code_Model A1 9000.0 0.04 0.21 0.00 0.00 \n", - "model_code_Model A5 9000.0 0.04 0.21 0.00 0.00 \n", - "model_code_Model A6 9000.0 0.04 0.21 0.00 0.00 \n", - "model_code_Model C1 9000.0 0.04 0.21 0.00 0.00 \n", - "model_code_Model D1 9000.0 0.04 0.21 0.00 0.00 \n", - "model_code_Model E2 9000.0 0.02 0.15 0.00 0.00 \n", - "vehicle_class_code_Truck-Tractor 9000.0 0.67 0.47 0.00 0.00 \n", - "vehicle_class_code_Truck 9000.0 0.20 0.40 0.00 0.00 \n", - "vehicle_class_code_Bus 9000.0 0.09 0.28 0.00 0.00 \n", - "vehicle_class_code_Transport 9000.0 0.04 0.21 0.00 0.00 \n", - "engine_type_code_Engine E 9000.0 0.31 0.46 0.00 0.00 \n", - "engine_type_code_Engine C 9000.0 0.27 0.44 0.00 0.00 \n", - "engine_type_code_Engine B 9000.0 0.18 0.38 0.00 0.00 \n", - "engine_type_code_Engine F 9000.0 0.09 0.28 0.00 0.00 \n", - "engine_type_code_Engine H 9000.0 0.07 0.25 0.00 0.00 \n", - "engine_type_code_Engine D 9000.0 0.04 0.21 0.00 0.00 \n", - "engine_type_code_Engine A 9000.0 0.02 0.15 0.00 0.00 \n", - "engine_type_code_Engine G 9000.0 0.02 0.15 0.00 0.00 \n", - "voltage 9000.0 13.65 0.40 
11.55 13.37 \n", - "current 9000.0 0.17 0.06 0.01 0.13 \n", - "resistance 9000.0 87.02 22.92 34.38 58.79 \n", - "cycle 9000.0 50.50 28.87 1.00 25.75 \n", - "engine_age 9000.0 3.93 3.06 0.00 2.00 \n", - "voltage_lag_1 9000.0 13.65 0.41 11.55 13.37 \n", - "current_lag_1 9000.0 0.17 0.06 0.01 0.13 \n", - "resistance_lag_1 9000.0 87.02 22.95 34.38 58.84 \n", - "voltage_rolling_mean_4 9000.0 13.65 0.41 11.77 13.36 \n", - "current_rolling_mean_4 9000.0 0.17 0.06 0.02 0.14 \n", - "resistance_rolling_mean_4 9000.0 87.03 22.93 35.22 58.75 \n", - "voltage_rolling_std_4 9000.0 0.04 0.04 0.00 0.01 \n", - "current_rolling_std_4 9000.0 0.00 0.00 0.00 0.00 \n", - "resistance_rolling_std_4 9000.0 1.02 0.73 0.00 0.52 \n", - "\n", - " 50% 75% max \n", - "target 0.00 0.00 1.00 \n", - "vehicle_id 44.50 67.00 89.00 \n", - "year 2017.00 2018.00 2020.00 \n", - "make_code_Make A 0.00 1.00 1.00 \n", - "make_code_Make B 0.00 0.00 1.00 \n", - "make_code_Make E 0.00 0.00 1.00 \n", - "make_code_Make C 0.00 0.00 1.00 \n", - "make_code_Make D 0.00 0.00 1.00 \n", - "model_code_Model E1 0.00 0.00 1.00 \n", - "model_code_Model A4 0.00 0.00 1.00 \n", - "model_code_Model B1 0.00 0.00 1.00 \n", - "model_code_Model B2 0.00 0.00 1.00 \n", - "model_code_Model A2 0.00 0.00 1.00 \n", - "model_code_Model A3 0.00 0.00 1.00 \n", - "model_code_Model B3 0.00 0.00 1.00 \n", - "model_code_Model C2 0.00 0.00 1.00 \n", - "model_code_Model A1 0.00 0.00 1.00 \n", - "model_code_Model A5 0.00 0.00 1.00 \n", - "model_code_Model A6 0.00 0.00 1.00 \n", - "model_code_Model C1 0.00 0.00 1.00 \n", - "model_code_Model D1 0.00 0.00 1.00 \n", - "model_code_Model E2 0.00 0.00 1.00 \n", - "vehicle_class_code_Truck-Tractor 1.00 1.00 1.00 \n", - "vehicle_class_code_Truck 0.00 0.00 1.00 \n", - "vehicle_class_code_Bus 0.00 0.00 1.00 \n", - "vehicle_class_code_Transport 0.00 0.00 1.00 \n", - "engine_type_code_Engine E 0.00 1.00 1.00 \n", - "engine_type_code_Engine C 0.00 1.00 1.00 \n", - "engine_type_code_Engine B 0.00 0.00 1.00 \n", 
- "engine_type_code_Engine F 0.00 0.00 1.00 \n", - "engine_type_code_Engine H 0.00 0.00 1.00 \n", - "engine_type_code_Engine D 0.00 0.00 1.00 \n", - "engine_type_code_Engine A 0.00 0.00 1.00 \n", - "engine_type_code_Engine G 0.00 0.00 1.00 \n", - "voltage 13.70 13.93 15.94 \n", - "current 0.16 0.19 0.39 \n", - "resistance 94.69 102.61 138.36 \n", - "cycle 50.50 75.25 100.00 \n", - "engine_age 3.00 5.00 14.00 \n", - "voltage_lag_1 13.70 13.93 15.94 \n", - "current_lag_1 0.16 0.19 0.39 \n", - "resistance_lag_1 94.69 102.64 138.36 \n", - "voltage_rolling_mean_4 13.70 13.93 15.87 \n", - "current_rolling_mean_4 0.16 0.19 0.39 \n", - "resistance_rolling_mean_4 94.81 102.56 136.35 \n", - "voltage_rolling_std_4 0.03 0.06 0.28 \n", - "current_rolling_std_4 0.00 0.00 0.02 \n", - "resistance_rolling_std_4 0.89 1.36 10.94 " - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# let's look at the descriptive statistics that summarize the central tendency, dispersion and shape of a dataset’s distribution\n", "round(fleet_lagged.describe(), 2).T" @@ -1622,36 +330,14 @@ "metadata": {}, "source": [ "---\n", - " \n", - "## Visualization of the Data Distributions\n", - "\n", - "[contents](#2_Contents)\n" + "## Visualization of the Data Distributions" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/conda/lib/python3.7/site-packages/seaborn/distributions.py:288: UserWarning: Data must have variance to compute a kernel density estimate.\n", - " warnings.warn(msg, UserWarning)\n" - ] - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# plot a single engine's histograms\n", "# we will lood at vehicle_id 2 as it has 1+ failures\n", @@ -1672,19 +358,20 @@ "plot_engine_hists(fleet_lagged[fleet_lagged[\"vehicle_id\"] == 2].loc[:, \"voltage\":])" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You should get a diagram that looks like the diagram below.\n", + "\n", + "![](engine_histogram_output.png)" + ] + }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stored 'features_created_prm' (bool)\n" - ] - } - ], + "outputs": [], "source": [ "# remove features used for one-hot encoding the categorical features including make, model, engine_type and vehicle_class\n", "features = fleet_lagged.drop(columns=[\"make\", \"model\", \"year\", \"vehicle_class\", \"engine_type\"])\n", @@ -1695,7 +382,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1758,7 +445,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1773,20 +460,9 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Total Observations: 9000\n", - "Number of observations in the training data: 7200\n", - "Number of observations in the test data: 900\n", - "Number of observations in the validation data: 900\n" - ] - } - ], + "outputs": [], "source": [ "print(\"Total Observations: \", len(ordered))\n", "print(\"Number of observations in the training data:\", len(train))\n", @@ -1800,12 +476,12 @@ "source": [ "#### Converting data to the appropriate format for Estimator\n", "\n", - "Amazon SageMaker implementation of Linear Learner takes 
either csv format or recordIO-wrapped protobuf. We will start by scaling the features and saving the data files to csv format. Then, we will upload the data to S3. If you are using your own data, and it is too large to fit in memory, protobuf might be a better option than csv. Refer to the SageMaker's Developer's Guide for [more information on data formats for training](https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html)." + "Amazon SageMaker implementation of Linear Learner takes either csv format or recordIO-wrapped protobuf. We will start by scaling the features and saving the data files to csv format. Then, we will save the data to file. If you are using your own data, and it is too large to fit in memory, protobuf might be a better option than csv. For more information on data formats for training, please refer to [Common Data Formats for Training](https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html)." ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1815,86 +491,12 @@ "scaler = preprocessing.MinMaxScaler(feature_range=(0.0, 1.0))\n", "train = pd.DataFrame(scaler.fit_transform(train))\n", "test = pd.DataFrame(scaler.transform(test))\n", - "val = pd.DataFrame(scaler.transform(val))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Add in a helper function that uploads the converted data to S3. 
" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [], - "source": [ - "# helper function for converting data to csv(necessary for Linear Learner) and upload to S3\n", - "def upload_file_to_bucket(df, bucket, prefix, file_path):\n", - " file_dir, file_name = os.path.split(file_path)\n", - " df.to_csv(file_name, header=False, index=False)\n", - " boto3.resource(\"s3\").meta.client.upload_file(\n", - " Filename=file_path, Bucket=bucket, Key=(prefix + \"/\" + file_name)\n", - " )\n", - " print(f\"uploaded {prefix} data location: s3://{bucket}/{prefix}/{file_name}\")\n", - " path_to_data = f\"s3://{bucket}/{prefix}/{file_name}\"\n", - " return path_to_data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# convert and upload to S3\n", - "path_to_train_data_prm = upload_file_to_bucket(train, bucket, \"train\", \"train.csv\")\n", - "path_to_test_data_prm = upload_file_to_bucket(test, bucket, \"test\", \"test.csv\")\n", - "path_to_test_x_data_prm = upload_file_to_bucket(test.loc[:, 1:], bucket, \"test\", \"test_x.csv\")\n", - "path_to_valid_data_prm = upload_file_to_bucket(val, bucket, \"validation\", \"validation.csv\")\n", - "\n", - "# let's also setup an output S3 location for the model artifact that will be output as the result of training with the algorithm.\n", - "output_location = f\"s3://{bucket}/output\"\n", - "print(\"training artifacts will be uploaded to: {}\".format(output_location))\n", - "\n", - "%store path_to_train_data_prm\n", - "%store path_to_test_data_prm\n", - "%store path_to_test_x_data_prm\n", - "%store path_to_valid_data_prm" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stored 'data_channels' (dict)\n" - ] - } - ], - "source": [ - "from sagemaker.inputs import TrainingInput\n", - "\n", - "train_channel = 
TrainingInput(path_to_train_data_prm, content_type=\"text/csv\")\n", - "test_channel = TrainingInput(path_to_test_data_prm, content_type=\"text/csv\")\n", - "test_x_channel = TrainingInput(path_to_test_x_data_prm, content_type=\"text/csv\")\n", - "valid_channel = TrainingInput(path_to_valid_data_prm, content_type=\"text/csv\")\n", + "val = pd.DataFrame(scaler.transform(val))\n", "\n", - "data_channels = {\"train\": train_channel, \"validation\": valid_channel}\n", - "%store data_channels" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "At this point, the data has been cleaned, preprocessed and features have been created. We have also stored the data in S3, so you are able to pick the notebook up starting from the *Train* section below without running the above again. " + "train.to_csv(\"train.csv\", header=False, index=False)\n", + "test.to_csv(\"test.csv\", header=False, index=False)\n", + "test.loc[:, 1:].to_csv(\"test_x.csv\", header=False, index=False)\n", + "val.to_csv(\"validation.csv\", header=False, index=False)" ] }, { @@ -1908,13 +510,6 @@ "\n", "Once you have selected some models that you would like to try out, SageMaker Experiments can be a great tool to track and compare all of the models before selecting the best model to deploy. We will set up an experiment using SageMaker experiments to track all the model training iterations for the Linear Learner Estimator we will try. You can read more about [SageMaker Experiments](https://docs.aws.amazon.com/sagemaker/latest/dg/experiments.html) to learn about experiment features, tracking and comparing outputs. 
" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/use-cases/predictive_maintenance/3_train_tune_predict_predmaint.ipynb b/use-cases/predictive_maintenance/3_train_tune_predict_predmaint.ipynb index f76dc1e861..b0d8f6e533 100644 --- a/use-cases/predictive_maintenance/3_train_tune_predict_predmaint.ipynb +++ b/use-cases/predictive_maintenance/3_train_tune_predict_predmaint.ipynb @@ -4,43 +4,59 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Fleet Predictive Maintenance: Part 4. Training, Hyperparameter Tuning, and Prediction\n", + "# Fleet Predictive Maintenance: Part 3. Training, Hyperparameter Tuning, and Prediction\n", "\n", - "1. [Architecure](0_usecase_and_architecture_predmaint.ipynb#0_Architecture)\n", - "1. [Data Prep: Processing Job from Data Wrangler Output](./1_dataprep_dw_job_predmaint.ipynb)\n", - "1. [Data Prep: Featurization](./2_dataprep_predmaint.ipynb)\n", - "1. [Train, Tune and Predict using Batch Transform](./3_train_tune_predict_predmaint.ipynb)" + "*Using SageMaker Studio to Predict Fault Classification*" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### View stored variables from previous session\n", + "## Background\n", "\n", - "If you ran this notebook before, you may want to re-use the resources you aready created with AWS. Run the cell below to load any prevously created variables. You should see a print-out of the existing variables. If you don't see anything you may need to create them again or it may be your first time running this notebook." + "This notebook is part of a sequence of notebooks whose purpose is to demonstrate a Predictive Maintenance (PrM) solution for automobile fleet maintenance via Amazon SageMaker Studio so that business users have a quick path towards a PrM POC. In this notebook, we will be focusing on training, tuning, and deploying a model. It is the third notebook in a series of notebooks. 
You can choose to run this notebook by itself or in sequence with the other notebooks listed below. Please see the [README.md](README.md) for more information about this use case and the implementation of this sequence of notebooks. " ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. [Data Prep: Processing Job from SageMaker Data Wrangler Output](./1_dataprep_dw_job_predmaint.ipynb)\n", + "1. [Data Prep: Featurization](./2_dataprep_predmaint.ipynb)\n", + "1. [Train, Tune and Predict using Batch Transform](./3_train_tune_predict_predmaint.ipynb) (current notebook)\n" + ] + }, + { + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "%store -r\n", - "%store" + "## Important Notes: \n", + "\n", + "* Due to cost considerations, the goal of this example is to show you how to use some of SageMaker Studio's features, not necessarily to achieve the best result. \n", + "* We use the built-in classification algorithm in this example, and a Python 3 (Data Science) Kernel is required.\n", + "* The nature of predictive maintenance solutions requires a domain knowledge expert of the system or machinery. With this in mind, we will make assumptions here for certain elements of this solution with the acknowledgement that these assumptions should be informed by a domain expert and a main business stakeholder.\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Note : dw_output_path_prm should appear above as a stored (restored) variable, whose value was set when you ran notebook 1_datapred_predmaint.ipynb" + "---\n", + "## Setup\n", + "\n", + "Let's start by:\n", + "\n", + "* Installing and importing any dependencies\n", + "* Instantiating SageMaker session\n", + "* Specifying the S3 bucket and prefix that you want to use for your training and model data. 
This should be within the same region as SageMaker training\n", + "* Defining the IAM role used to give training access to your data\n", + " " ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -50,7 +66,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -85,6 +101,71 @@ "prefix_prm = \"predmaint\" # place to upload training files within the bucket" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Before training, we must first upload our data in S3. To see how the existing train, test, and validation datasets were generated, take a look at [Data Prep: Processing Job from SageMaker Data Wrangler Output](./1_dataprep_dw_job_predmaint.ipynb) (which is the first part of this notebook series) followed by [Data Prep: Featurization](./2_dataprep_predmaint.ipynb) (which is the second part of this notebook series). See the [Background](#Background) section at the beginning of the notebook for more information." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# helper function that uploads a local csv file (the format Linear Learner needs) to S3\n", + "def upload_file_to_bucket(bucket, prefix, file_path):\n", + " file_dir, file_name = os.path.split(file_path)\n", + " df = pd.read_csv(file_path)\n", + " boto3.resource(\"s3\").meta.client.upload_file(\n", + " Filename=file_path, Bucket=bucket, Key=(prefix + \"/\" + file_name)\n", + " )\n", + " print(f\"uploaded {prefix} data location: s3://{bucket}/{prefix}/{file_name}\")\n", + " path_to_data = f\"s3://{bucket}/{prefix}/{file_name}\"\n", + " return path_to_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# upload the csv files to S3\n", + "path_to_train_data_prm = upload_file_to_bucket(bucket, \"train\", \"train.csv\")\n", + "path_to_test_data_prm = upload_file_to_bucket(bucket, \"test\", \"test.csv\")\n", + "path_to_test_x_data_prm = upload_file_to_bucket(bucket, \"test\", \"test_x.csv\")\n", + "path_to_valid_data_prm = upload_file_to_bucket(bucket, \"validation\", \"validation.csv\")\n", + "\n", + "# let's also setup an output S3 location for the model artifact that will be output as the result of training with the algorithm.\n", + "output_location = f\"s3://{bucket}/output\"\n", + "print(\"training artifacts will be uploaded to: {}\".format(output_location))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.inputs import TrainingInput\n", + "\n", + "train_channel = TrainingInput(path_to_train_data_prm, content_type=\"text/csv\")\n", + "test_channel = TrainingInput(path_to_test_data_prm, content_type=\"text/csv\")\n", + "test_x_channel = TrainingInput(path_to_test_x_data_prm, content_type=\"text/csv\")\n", + "valid_channel = TrainingInput(path_to_valid_data_prm, content_type=\"text/csv\")\n", + "\n", + 
"data_channels = {\"train\": train_channel, \"validation\": valid_channel}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The data is stored in S3 and is ready for use in the estimators." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -99,7 +180,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -115,16 +196,16 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "if 'create_date' not in locals():\n", + "if \"create_date\" not in locals():\n", " create_date = strftime(\"%Y-%m-%d-%H-%M-%S\")\n", " %store create_date\n", "\n", " # location within S3 for outputs\n", - " exp_prefix = f'sagemaker-experiments/linear-learner-{create_date}'\n", + " exp_prefix = f\"sagemaker-experiments/linear-learner-{create_date}\"\n", " %store exp_prefix" ] }, @@ -143,7 +224,6 @@ "source": [ "# create the experiment\n", "experiment_name = f\"ll-failure-classification-{create_date}\"\n", - "%store experiment_name\n", "\n", "try:\n", " my_experiment = Experiment.load(experiment_name=experiment_name)\n", @@ -168,7 +248,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -180,7 +260,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now we can begin to specify our linear model from the Amazon SageMaker Linear Learner Estimator. For this binary classification problem, we have the option of selecting between logistic regression or hinge loss (Support Vector Machines). Here are additional resources to [learn more about Linear Learner](https://docs.aws.amazon.com/sagemaker/latest/dg/linear-learner.html#ll-input_output) and the [loss functions available](https://docs.aws.amazon.com/sagemaker/latest/dg/ll_hyperparameters.html). 
One piece to note is that Amazon SageMaker's Linear Learner actually fits many models in parallel, each with slightly different hyperparameters, and then returns the one with the best fit. This functionality is automatically enabled. There are a number of additional parameters available for the Linear Learner Estimator, so we will start be using the default features as well as:\n", + "Now we can begin to specify our linear model from the Amazon SageMaker Linear Learner Estimator. For this binary classification problem, we have the option of selecting between logistic regression or hinge loss (Support Vector Machines). Here are additional resources to learn more about the [Input/Output Interface for the Linear Learner Algorithm](https://docs.aws.amazon.com/sagemaker/latest/dg/linear-learner.html#ll-input_output) and the [Linear Learner Hyperparameters](https://docs.aws.amazon.com/sagemaker/latest/dg/ll_hyperparameters.html). One piece to note is that Amazon SageMaker's Linear Learner actually fits many models in parallel, each with slightly different hyperparameters, and then returns the one with the best fit. This functionality is automatically enabled. There are a number of additional parameters available for the Linear Learner Estimator, so we will start by using the default features as well as:\n", "\n", "- `loss` which controls how we penalize mistakes in our model estimates. For this case, we will start with logistic and move to using hinge loss if necessary for model improvement.\n", "- `predictor_type` is set to 'binary_classifier' since we are trying to predict whether a failure occurs or it doesn't.\n", @@ -370,7 +450,7 @@ "source": [ "### Let's try dealing with class imbalances to try to improve precision and recall\n", "\n", - "We will set the hyperparameter `positive_example_weight_mult` to *balanced* in order to use weighting by class to address the class imbalance issue. 
Since we have only 19% failures compared to non-failures, we can leverage this built-in hyperparameter to try to improve model performnce. Read about [linear learner hyperparameters here](https://docs.aws.amazon.com/sagemaker/latest/dg/ll_hyperparameters.html)." + "We will set the hyperparameter `positive_example_weight_mult` to *balanced* in order to use weighting by class to address the class imbalance issue. Since we have only 19% failures compared to non-failures, we can leverage this built-in hyperparameter to try to improve model performance. Here is more documentation about [Linear Learner Hyperparameters](https://docs.aws.amazon.com/sagemaker/latest/dg/ll_hyperparameters.html)." ] }, { @@ -428,169 +508,9 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
TrialComponentNameDisplayNamepositive_example_weight_multvalidation:recall - Avgvalidation:binary_classification_accuracy - Avgvalidation:roc_auc_score - Avgtrain:objective_loss - Avgvalidation:objective_loss:final - Avgvalidation:objective_loss - Avgvalidation:binary_f_beta - Avgvalidation:precision - AvgTrialsExperiments
2linear-learner-2021-04-07-15-32-16-116-aws-tra...ll-svm-training-jobNaN0.4897960.8477780.7268230.2154800.2304290.2443500.5835870.721805[linear-learner-svm-2021-04-07-15-16-22][ll-failure-classification-2021-04-07-15-16-22]
3linear-learner-2021-04-07-15-27-01-989-aws-tra...ll-lr-training-jobNaN0.5816330.8422220.7848950.4227240.4542410.4636270.6162160.655172[linear-learner-lr-training-job-2021-04-07-15-...[ll-failure-classification-2021-04-07-15-16-22]
0linear-learner-2021-04-07-15-42-43-638-aws-tra...ll-svm-bal-training-jobbalanced0.3520410.8300000.7774520.5312801.5522680.5743140.4742270.726316[linear-learner-svm-balanced-2021-04-07-15-16-22][ll-failure-classification-2021-04-07-15-16-22]
1linear-learner-2021-04-07-15-37-30-147-aws-tra...ll-svm-thresh-training-jobNaN0.6428570.8200000.7335920.2154800.2292300.2443500.6086960.577982[linear-learner-svm-thresh-2021-04-07-15-16-22][ll-failure-classification-2021-04-07-15-16-22]
\n", - "
" - ], - "text/plain": [ - " TrialComponentName \\\n", - "2 linear-learner-2021-04-07-15-32-16-116-aws-tra... \n", - "3 linear-learner-2021-04-07-15-27-01-989-aws-tra... \n", - "0 linear-learner-2021-04-07-15-42-43-638-aws-tra... \n", - "1 linear-learner-2021-04-07-15-37-30-147-aws-tra... \n", - "\n", - " DisplayName positive_example_weight_mult \\\n", - "2 ll-svm-training-job NaN \n", - "3 ll-lr-training-job NaN \n", - "0 ll-svm-bal-training-job balanced \n", - "1 ll-svm-thresh-training-job NaN \n", - "\n", - " validation:recall - Avg validation:binary_classification_accuracy - Avg \\\n", - "2 0.489796 0.847778 \n", - "3 0.581633 0.842222 \n", - "0 0.352041 0.830000 \n", - "1 0.642857 0.820000 \n", - "\n", - " validation:roc_auc_score - Avg train:objective_loss - Avg \\\n", - "2 0.726823 0.215480 \n", - "3 0.784895 0.422724 \n", - "0 0.777452 0.531280 \n", - "1 0.733592 0.215480 \n", - "\n", - " validation:objective_loss:final - Avg validation:objective_loss - Avg \\\n", - "2 0.230429 0.244350 \n", - "3 0.454241 0.463627 \n", - "0 1.552268 0.574314 \n", - "1 0.229230 0.244350 \n", - "\n", - " validation:binary_f_beta - Avg validation:precision - Avg \\\n", - "2 0.583587 0.721805 \n", - "3 0.616216 0.655172 \n", - "0 0.474227 0.726316 \n", - "1 0.608696 0.577982 \n", - "\n", - " Trials \\\n", - "2 [linear-learner-svm-2021-04-07-15-16-22] \n", - "3 [linear-learner-lr-training-job-2021-04-07-15-... 
\n", - "0 [linear-learner-svm-balanced-2021-04-07-15-16-22] \n", - "1 [linear-learner-svm-thresh-2021-04-07-15-16-22] \n", - "\n", - " Experiments \n", - "2 [ll-failure-classification-2021-04-07-15-16-22] \n", - "3 [ll-failure-classification-2021-04-07-15-16-22] \n", - "0 [ll-failure-classification-2021-04-07-15-16-22] \n", - "1 [ll-failure-classification-2021-04-07-15-16-22] " - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# first we can look at all the trials together to evaluate the performance\n", "trial_component_analytics = ExperimentAnalytics(experiment_name=my_experiment.experiment_name)\n", @@ -634,7 +554,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -670,18 +590,18 @@ "In this example, we are using SageMaker Python SDK to set up and manage the hyperparameter tuning job. We first configure the training jobs the hyperparameter tuning job will launch by initiating an estimator, which includes the following configuration:\n", "\n", "* hyperparameters that SageMaker Automatic Model Tuning will tune: `learning_rate` \n", - "* the maximum number of training jobs it will run to optimize the objective metric: 20\n", + "* the maximum number of training jobs it will run to optimize the objective metric: 5\n", "* the number of parallel training jobs that will run in the tuning job: 2\n", "* the objective metric that Automatic Model Tuning will use: validation:accuracy\n", "\n", "We will also demonstrates how to associate trial components created by a hyperparameter tuning job with an experiment management trial.\n", "\n", - "Read the following link more information on [tuning linear learner hyperparameters](https://docs.aws.amazon.com/sagemaker/latest/dg/linear-learner-tuning.html) and [automatic tuning with SageMaker](https://docs.aws.amazon.com/sagemaker/latest/dg/automatic-model-tuning-how-it-works.html)" 
+ "Read the following links for more information on how to [Tune a Linear Learner Model](https://docs.aws.amazon.com/sagemaker/latest/dg/linear-learner-tuning.html) and about [How Hyperparameter Tuning Works](https://docs.aws.amazon.com/sagemaker/latest/dg/automatic-model-tuning-how-it-works.html)" ] }, { "cell_type": "code", - "execution_count": 45, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -691,7 +611,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -732,13 +652,14 @@ " estimator=svm_tune, # previously-configured Estimator object\n", " objective_metric_name=\"validation:binary_classification_accuracy\",\n", " hyperparameter_ranges=hyperparameter_ranges,\n", - " max_jobs=20,\n", + " max_jobs=5,\n", " max_parallel_jobs=2,\n", " strategy=\"Random\",\n", + " base_tuning_job_name=prm_tuning_job_name,\n", " )\n", "\n", " # start hyperparameter tuning job\n", - " my_tuner.fit(inputs=data_channels, include_cls_metadata=False, job_name=prm_tuning_job_name)\n", + " my_tuner.fit(inputs=data_channels, include_cls_metadata=False)\n", " print(f\"Create tuning job {prm_tuning_job_name}: SUCCESSFUL\")\n", "except ClientError as e:\n", " if \"ResourceInUse\" in str(e):\n", @@ -748,20 +669,9 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'Completed'" - ] - }, - "execution_count": 48, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# check status\n", "boto3.client(\"sagemaker\").describe_hyper_parameter_tuning_job(\n", @@ -771,7 +681,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -782,19 +692,9 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", 
- "text": [ - "Found 10 tuning jobs.\n", - "Stored 'tune_trial_name' (str)\n", - "Associate all training jobs created by ll-svm-tuning-job with trial ll-svm-tuning-job-trial\n" - ] - } - ], + "outputs": [], "source": [ "# get the most recently created tuning jobs\n", "list_tuning_jobs_response = smclient.list_hyper_parameter_tuning_jobs(\n", @@ -852,17 +752,9 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Found 20 trial components.\n" - ] - } - ], + "outputs": [], "source": [ "import time\n", "from datetime import datetime, timezone\n", @@ -915,317 +807,9 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
learning_rateTrainingJobNameTrainingJobStatusFinalObjectiveValueTrainingStartTimeTrainingEndTimeTrainingElapsedTimeSeconds
00.094162ll-svm-tuning-job-020-de38493cCompleted0.8077782021-03-16 02:48:13+00:002021-03-16 02:49:25+00:0072.0
10.018556ll-svm-tuning-job-019-b5e9ee8dCompleted0.8177782021-03-16 02:48:17+00:002021-03-16 02:49:38+00:0081.0
20.108048ll-svm-tuning-job-018-e720402bCompleted0.8033332021-03-16 02:44:49+00:002021-03-16 02:46:02+00:0073.0
30.105569ll-svm-tuning-job-017-ee995315Completed0.7988892021-03-16 02:43:54+00:002021-03-16 02:45:16+00:0082.0
40.256796ll-svm-tuning-job-016-f023d0fbCompleted0.7922222021-03-16 02:40:53+00:002021-03-16 02:42:17+00:0084.0
50.368504ll-svm-tuning-job-015-e97dc476Completed0.8100002021-03-16 02:39:49+00:002021-03-16 02:41:15+00:0086.0
60.018072ll-svm-tuning-job-014-fcf45964Completed0.8222222021-03-16 02:36:51+00:002021-03-16 02:38:03+00:0072.0
70.234124ll-svm-tuning-job-013-a1f86f0fCompleted0.8377782021-03-16 02:35:57+00:002021-03-16 02:37:24+00:0087.0
80.027784ll-svm-tuning-job-012-e5277482Completed0.8355562021-03-16 02:33:05+00:002021-03-16 02:34:08+00:0063.0
90.187483ll-svm-tuning-job-011-cc73e5e8Completed0.8266672021-03-16 02:32:16+00:002021-03-16 02:33:26+00:0070.0
100.099079ll-svm-tuning-job-010-07005361Completed0.7977782021-03-16 02:29:00+00:002021-03-16 02:30:17+00:0077.0
110.017746ll-svm-tuning-job-009-e77521ffCompleted0.8166672021-03-16 02:28:39+00:002021-03-16 02:29:47+00:0068.0
120.020755ll-svm-tuning-job-008-6ed6082eCompleted0.8466672021-03-16 02:25:19+00:002021-03-16 02:26:12+00:0053.0
130.048608ll-svm-tuning-job-007-692a0a7dCompleted0.7977782021-03-16 02:24:51+00:002021-03-16 02:26:38+00:00107.0
140.027099ll-svm-tuning-job-006-99d391aaCompleted0.8166672021-03-16 02:20:51+00:002021-03-16 02:21:53+00:0062.0
150.282473ll-svm-tuning-job-005-06ecccfaCompleted0.7955562021-03-16 02:20:34+00:002021-03-16 02:22:21+00:00107.0
160.026969ll-svm-tuning-job-004-329ec538Completed0.8088892021-03-16 02:16:39+00:002021-03-16 02:17:53+00:0074.0
170.010212ll-svm-tuning-job-003-a889d04cCompleted0.8455562021-03-16 02:16:56+00:002021-03-16 02:17:46+00:0050.0
180.051641ll-svm-tuning-job-002-9f9f727bCompleted0.8244442021-03-16 02:12:43+00:002021-03-16 02:14:05+00:0082.0
190.022299ll-svm-tuning-job-001-1694f3c9Completed0.8277782021-03-16 02:13:00+00:002021-03-16 02:14:03+00:0063.0
\n", - "
" - ], - "text/plain": [ - " learning_rate TrainingJobName TrainingJobStatus \\\n", - "0 0.094162 ll-svm-tuning-job-020-de38493c Completed \n", - "1 0.018556 ll-svm-tuning-job-019-b5e9ee8d Completed \n", - "2 0.108048 ll-svm-tuning-job-018-e720402b Completed \n", - "3 0.105569 ll-svm-tuning-job-017-ee995315 Completed \n", - "4 0.256796 ll-svm-tuning-job-016-f023d0fb Completed \n", - "5 0.368504 ll-svm-tuning-job-015-e97dc476 Completed \n", - "6 0.018072 ll-svm-tuning-job-014-fcf45964 Completed \n", - "7 0.234124 ll-svm-tuning-job-013-a1f86f0f Completed \n", - "8 0.027784 ll-svm-tuning-job-012-e5277482 Completed \n", - "9 0.187483 ll-svm-tuning-job-011-cc73e5e8 Completed \n", - "10 0.099079 ll-svm-tuning-job-010-07005361 Completed \n", - "11 0.017746 ll-svm-tuning-job-009-e77521ff Completed \n", - "12 0.020755 ll-svm-tuning-job-008-6ed6082e Completed \n", - "13 0.048608 ll-svm-tuning-job-007-692a0a7d Completed \n", - "14 0.027099 ll-svm-tuning-job-006-99d391aa Completed \n", - "15 0.282473 ll-svm-tuning-job-005-06ecccfa Completed \n", - "16 0.026969 ll-svm-tuning-job-004-329ec538 Completed \n", - "17 0.010212 ll-svm-tuning-job-003-a889d04c Completed \n", - "18 0.051641 ll-svm-tuning-job-002-9f9f727b Completed \n", - "19 0.022299 ll-svm-tuning-job-001-1694f3c9 Completed \n", - "\n", - " FinalObjectiveValue TrainingStartTime TrainingEndTime \\\n", - "0 0.807778 2021-03-16 02:48:13+00:00 2021-03-16 02:49:25+00:00 \n", - "1 0.817778 2021-03-16 02:48:17+00:00 2021-03-16 02:49:38+00:00 \n", - "2 0.803333 2021-03-16 02:44:49+00:00 2021-03-16 02:46:02+00:00 \n", - "3 0.798889 2021-03-16 02:43:54+00:00 2021-03-16 02:45:16+00:00 \n", - "4 0.792222 2021-03-16 02:40:53+00:00 2021-03-16 02:42:17+00:00 \n", - "5 0.810000 2021-03-16 02:39:49+00:00 2021-03-16 02:41:15+00:00 \n", - "6 0.822222 2021-03-16 02:36:51+00:00 2021-03-16 02:38:03+00:00 \n", - "7 0.837778 2021-03-16 02:35:57+00:00 2021-03-16 02:37:24+00:00 \n", - "8 0.835556 2021-03-16 02:33:05+00:00 2021-03-16 
02:34:08+00:00 \n", - "9 0.826667 2021-03-16 02:32:16+00:00 2021-03-16 02:33:26+00:00 \n", - "10 0.797778 2021-03-16 02:29:00+00:00 2021-03-16 02:30:17+00:00 \n", - "11 0.816667 2021-03-16 02:28:39+00:00 2021-03-16 02:29:47+00:00 \n", - "12 0.846667 2021-03-16 02:25:19+00:00 2021-03-16 02:26:12+00:00 \n", - "13 0.797778 2021-03-16 02:24:51+00:00 2021-03-16 02:26:38+00:00 \n", - "14 0.816667 2021-03-16 02:20:51+00:00 2021-03-16 02:21:53+00:00 \n", - "15 0.795556 2021-03-16 02:20:34+00:00 2021-03-16 02:22:21+00:00 \n", - "16 0.808889 2021-03-16 02:16:39+00:00 2021-03-16 02:17:53+00:00 \n", - "17 0.845556 2021-03-16 02:16:56+00:00 2021-03-16 02:17:46+00:00 \n", - "18 0.824444 2021-03-16 02:12:43+00:00 2021-03-16 02:14:05+00:00 \n", - "19 0.827778 2021-03-16 02:13:00+00:00 2021-03-16 02:14:03+00:00 \n", - "\n", - " TrainingElapsedTimeSeconds \n", - "0 72.0 \n", - "1 81.0 \n", - "2 73.0 \n", - "3 82.0 \n", - "4 84.0 \n", - "5 86.0 \n", - "6 72.0 \n", - "7 87.0 \n", - "8 63.0 \n", - "9 70.0 \n", - "10 77.0 \n", - "11 68.0 \n", - "12 53.0 \n", - "13 107.0 \n", - "14 62.0 \n", - "15 107.0 \n", - "16 74.0 \n", - "17 50.0 \n", - "18 82.0 \n", - "19 63.0 " - ] - }, - "execution_count": 55, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# here is the output of all of the hyperparameter tuning trial runs\n", "tuning_analytics.dataframe()" @@ -1247,7 +831,7 @@ "- Don't need a persistent endpoint that applications (for example, web or mobile apps) can call to get inferences\n", "- Don't need the subsecond latency that SageMaker hosted endpoints provide\n", "\n", - "Read more about [Batch Transform](https://docs.aws.amazon.com/sagemaker/latest/dg/how-it-works-batch.html) here. " + "Here is additional information about how to [Use Batch Transform](https://docs.aws.amazon.com/sagemaker/latest/dg/batch-transform.html). 
" ] }, { @@ -1380,7 +964,7 @@ "outputs": [], "source": [ "# call evaluation function and inspect results\n", - "test = pd.read_csv(path_to_test_data_prm, header=None)\n", + "test = pd.read_csv(\"test.csv\", header=None)\n", "test_y = test[0]\n", "evaluate_model(\"test_x.csv.out\", test_y, \"PrM-Classification-SVM\", metrics=True)" ] @@ -1402,7 +986,8 @@ "source": [ "def delete_endpoint(predictor):\n", " try:\n", - " boto3.client(\"sagemaker\").delete_endpoint(EndpointName=predictor.endpoint)\n", + " predictor.delete_model()\n", + " predictor.delete_endpoint()\n", " print(\"Deleted {}\".format(predictor.endpoint))\n", " except:\n", " print(\"Already deleted: {}\".format(predictor.endpoint))" diff --git a/use-cases/predictive_maintenance/demo_helpers.py b/use-cases/predictive_maintenance/demo_helpers.py index 022482ba31..a5aee6dafc 100644 --- a/use-cases/predictive_maintenance/demo_helpers.py +++ b/use-cases/predictive_maintenance/demo_helpers.py @@ -55,16 +55,4 @@ def update_dw_s3uri(flow_file_name): with open(flow_file_name, "w") as f: json.dump(flow, f) - - -dw_container_dict = { - "us-east-2": "415577184552.dkr.ecr.us-east-2.amazonaws.com/sagemaker-data-wrangler-container:1.0.1" -} - - -def get_dw_container_for_region(region_in): - """ - Get the Data Wrangler container based on the given region - """ - container_uri = dw_container_dict[region_in] - return container_uri + \ No newline at end of file diff --git a/use-cases/predictive_maintenance/engine_histogram_output.png b/use-cases/predictive_maintenance/engine_histogram_output.png new file mode 100644 index 0000000000..2017d1d326 Binary files /dev/null and b/use-cases/predictive_maintenance/engine_histogram_output.png differ