Merge branch 'main' into main

aws · Oct 22, 2022 · ee1b7a6 · ee1b7a6
2 parents 05321b3 + b0083b1
commit ee1b7a6
Show file tree

Hide file tree

Showing 44 changed files with 4,980 additions and 5,701 deletions.
diff --git a/README.md b/README.md
@@ -77,6 +77,7 @@ These examples introduce SageMaker Autopilot. Autopilot automatically performs f
 - [Targeted Direct Marketing AutoML](autopilot/) shows how to use SageMaker Autopilot to automatically train a model.
 - [Housing Prices AutoML](sagemaker-autopilot/housing_prices) shows how to use SageMaker Autopilot for a linear regression problem (predict housing prices).
 - [Portfolio Churn Prediction with Amazon SageMaker Autopilot and Neo4j](autopilot/sagemaker_autopilot_neo4j_portfolio_churn.ipynb) shows how to use SageMaker Autopilot with graph embeddings to predict investment portfolio churn.
+- [Move Amazon SageMaker Autopilot ML models from experimentation to production using Amazon SageMaker Pipelines](autopilot/sagemaker-autopilot-pipelines) shows how to use SageMaker Autopilot in combination with SageMaker Pipelines for end-to-end AutoML training automation.
 
 ### Introduction to Amazon Algorithms
 
@@ -151,6 +152,15 @@ These examples provide and introduction to SageMaker Debugger which allows debug
 - [Reacting to CloudWatch Events from Rules to take an action based on status with TensorFlow](sagemaker-debugger/tensorflow_action_on_rule/)
 - [Using SageMaker Debugger with a custom PyTorch container](sagemaker-debugger/pytorch_custom_container/)
 
+### Amazon SageMaker Distributed Training
+
+These examples provide an introduction to SageMaker Distributed Training Libraries for data parallelism and model parallelism. The libraries are optimized for the SageMaker training environment, help adapt your distributed training jobs to SageMaker, and improve training speed and throughput.
+More examples for models such as BERT and YOLOv5 can be found in [distributed_training/](https://github.com/aws/amazon-sagemaker-examples/tree/main/training/distributed_training).
+
+- [Train GPT-2 with Sharded Data Parallel](https://github.com/aws/amazon-sagemaker-examples/tree/main/training/distributed_training/pytorch/model_parallel/gpt2/smp-train-gpt-simple-sharded-data-parallel.ipynb) shows how to train GPT-2 with near-linear scaling using the Sharded Data Parallelism technique in the SageMaker Model Parallelism Library.
+- [Train EleutherAI GPT-J with Model Parallel](https://github.com/aws/amazon-sagemaker-examples/blob/main/training/distributed_training/pytorch/model_parallel/gpt-j/11_train_gptj_smp_tensor_parallel_notebook.ipynb) shows how to train EleutherAI GPT-J with PyTorch and the Tensor Parallelism technique in the SageMaker Model Parallelism Library.
+- [Train MaskRCNN with Data Parallel](https://github.com/aws/amazon-sagemaker-examples/blob/main/training/distributed_training/pytorch/data_parallel/maskrcnn/pytorch_smdataparallel_maskrcnn_demo.ipynb) shows how to train MaskRCNN with PyTorch and the SageMaker Data Parallelism Library.
+
 ### Amazon SageMaker Clarify
 
 These examples provide an introduction to SageMaker Clarify which provides machine learning developers with greater visibility into their training data and models so they can identify and limit bias and explain predictions.

diff --git a/autopilot/sagemaker-autopilot-pipelines/autopilot_pipelines_demo_notebook.ipynb b/autopilot/sagemaker-autopilot-pipelines/autopilot_pipelines_demo_notebook.ipynb
diff --git a/autopilot/sagemaker-autopilot-pipelines/check_autopilot_job_status.py b/autopilot/sagemaker-autopilot-pipelines/check_autopilot_job_status.py
@@ -0,0 +1,37 @@
+import boto3
+import json
+import logging
+
+sagemaker_client = boto3.client("sagemaker")
+
+
+def lambda_handler(event, context):
+    try:
+        payload = json.loads(event["Records"][0]["body"])
+        callback_token = payload["token"]
+        autopilot_job = sagemaker_client.describe_auto_ml_job(
+            AutoMLJobName=payload["arguments"]["AutopilotJobName"]
+        )
+        autopilot_job_status = autopilot_job["AutoMLJobStatus"]
+        if autopilot_job_status == "Completed":
+            sagemaker_client.send_pipeline_execution_step_success(
+                CallbackToken=callback_token
+            )
+        elif autopilot_job_status in ["InProgress", "Stopping"]:
+            raise ValueError("Autopilot training not finished yet. Retrying later...")
+        else:
+            sagemaker_client.send_pipeline_execution_step_failure(
+                CallbackToken=callback_token,
+                FailureReason=autopilot_job.get(
+                    "FailureReason",
+                    f"Autopilot training job (status: {autopilot_job_status}) failed to finish.",
+                ),
+            )
+    except ValueError:
+        raise
+    except Exception as e:
+        logging.exception(e)
+        sagemaker_client.send_pipeline_execution_step_failure(
+            CallbackToken=callback_token,
+            FailureReason=str(e),
+        )
diff --git a/autopilot/sagemaker-autopilot-pipelines/evaluate_autopilot_model.py b/autopilot/sagemaker-autopilot-pipelines/evaluate_autopilot_model.py
@@ -0,0 +1,110 @@
+import argparse
+import boto3
+import json
+import os
+import pandas as pd
+import random
+import string
+import time
+from sklearn.metrics import f1_score, precision_score, recall_score
+from urllib.parse import urlparse
+
+RANDOM_SUFFIX = "".join(random.choices(string.ascii_lowercase, k=8))
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--autopilot-job-name", type=str)
+parser.add_argument("--aws-region", type=str)
+parser.add_argument("--x-test-s3-path", type=str)
+parser.add_argument("--y-test-file-name", type=str)
+parser.add_argument("--batch-transform-output-s3-path", type=str)
+parser.add_argument("--instance-type", type=str)
+parser.add_argument("--instance-count", type=int)
+parser.add_argument("--local-base-path", type=str)
+parser.add_argument("--sagemaker-execution-role-arn", type=str)
+args = parser.parse_args()
+
+boto_session = boto3.session.Session(region_name=args.aws_region)
+s3_client = boto_session.client("s3")
+sagemaker_client = boto_session.client("sagemaker")
+
+# Create model
+model_name = args.autopilot_job_name + RANDOM_SUFFIX
+response = sagemaker_client.create_model(
+    ModelName=model_name,
+    Containers=sagemaker_client.describe_auto_ml_job(
+        AutoMLJobName=args.autopilot_job_name
+    )["BestCandidate"]["InferenceContainers"],
+    ExecutionRoleArn=args.sagemaker_execution_role_arn,
+)
+
+# Create batch transform job
+batch_transform_job_name = args.autopilot_job_name + RANDOM_SUFFIX
+response = sagemaker_client.create_transform_job(
+    TransformJobName=batch_transform_job_name,
+    ModelName=model_name,
+    TransformInput={
+        "DataSource": {
+            "S3DataSource": {
+                "S3DataType": "S3Prefix",
+                "S3Uri": args.x_test_s3_path,
+            }
+        },
+        "ContentType": "text/csv",
+        "SplitType": "Line",
+    },
+    TransformOutput={
+        "S3OutputPath": args.batch_transform_output_s3_path,
+        "AssembleWith": "Line",
+    },
+    TransformResources={
+        "InstanceType": args.instance_type,
+        "InstanceCount": args.instance_count,
+    },
+)
+
+# Wait for the batch transform job to finish
+while (
+    sagemaker_client.describe_transform_job(TransformJobName=batch_transform_job_name)[
+        "TransformJobStatus"
+    ]
+    == "InProgress"
+):
+    time.sleep(10)
+
+# Download batch transform results
+x_test_file_name = args.x_test_s3_path.split("/")[-1]
+predictions_s3_path = os.path.join(
+    args.batch_transform_output_s3_path, x_test_file_name + ".out"
+)
+o = urlparse(predictions_s3_path)
+s3_client.download_file(
+    Bucket=o.netloc, Key=o.path.strip("/"), Filename="predictions.csv"
+)
+
+# Create best model evaluation report
+y_pred = pd.read_csv("predictions.csv", header=0).iloc[:, 0]
+y_true = pd.read_csv(
+    os.path.join(args.local_base_path, "data", args.y_test_file_name), header=1
+)
+evaluation_report = {
+    "multiclass_classification_metrics": {
+        "weighted_f1": {
+            "value": f1_score(y_pred, y_true, average="weighted"),
+            "standard_deviation": "NaN",
+        },
+        "weighted_precision": {
+            "value": precision_score(y_pred, y_true, average="weighted"),
+            "standard_deviation": "NaN",
+        },
+        "weighted_recall": {
+            "value": recall_score(y_pred, y_true, average="weighted"),
+            "standard_deviation": "NaN",
+        },
+    },
+}
+evaluation_report_path = os.path.join(
+    args.local_base_path, "evaluation_report", "evaluation_report.json"
+)
+os.makedirs(os.path.dirname(evaluation_report_path), exist_ok=True)
+with open(evaluation_report_path, "w") as f:
+    f.write(json.dumps(evaluation_report))
diff --git a/autopilot/sagemaker-autopilot-pipelines/register_autopilot_model.py b/autopilot/sagemaker-autopilot-pipelines/register_autopilot_model.py
@@ -0,0 +1,62 @@
+import boto3
+import os
+from botocore.exceptions import ClientError
+from urllib.parse import urlparse
+
+s3_client = boto3.client("s3")
+sagemaker_client = boto3.client("sagemaker")
+
+
+def get_explainability_report_json_s3_path(s3_path):
+    o = urlparse(s3_path)
+    bucket_name = o.netloc
+    s3_prefix = o.path.strip("/")
+    paginator = s3_client.get_paginator("list_objects_v2")
+    response = paginator.paginate(
+        Bucket=bucket_name, Prefix=s3_prefix, PaginationConfig={"PageSize": 1}
+    )
+    for page in response:
+        files = page.get("Contents")
+        for file in files:
+            if "analysis.json" in file["Key"]:
+                return os.path.join("s3://", bucket_name, file["Key"])
+
+
+def lambda_handler(event, context):
+    # Get the explainability results from the Autopilot job
+    autopilot_job = sagemaker_client.describe_auto_ml_job(
+        AutoMLJobName=event["AutopilotJobName"]
+    )
+    explainability_report_s3_path = autopilot_job["BestCandidate"][
+        "CandidateProperties"
+    ]["CandidateArtifactLocations"]["Explainability"]
+    autopilot_job["BestCandidate"]["InferenceContainers"][0].pop("Environment")
+    sagemaker_client.create_model_package(
+        ModelPackageName=event["ModelPackageName"],
+        InferenceSpecification={
+            "Containers": autopilot_job["BestCandidate"]["InferenceContainers"],
+            "SupportedContentTypes": ["text/csv"],
+            "SupportedResponseMIMETypes": ["text/csv"],
+            "SupportedTransformInstanceTypes": [event["InstanceType"]],
+            "SupportedRealtimeInferenceInstanceTypes": [event["InstanceType"]],
+        },
+        ModelApprovalStatus=event["ModelApprovalStatus"],
+        ModelMetrics={
+            "ModelQuality": {
+                "Statistics": {
+                    "ContentType": ".json",
+                    "S3Uri": os.path.join(
+                        event["EvaluationReportS3Path"], "evaluation_report.json"
+                    ),
+                },
+            },
+            "Explainability": {
+                "Report": {
+                    "ContentType": ".json",
+                    "S3Uri": get_explainability_report_json_s3_path(
+                        explainability_report_s3_path
+                    ),
+                }
+            },
+        },
+    )
diff --git a/autopilot/sagemaker-autopilot-pipelines/start_autopilot_job.py b/autopilot/sagemaker-autopilot-pipelines/start_autopilot_job.py
@@ -0,0 +1,51 @@
+import sys
+from pip._internal import main
+
+# Upgrading boto3 to the newest release to be able to use the latest SageMaker features
+# This runs at import time: pip installs boto3 into /tmp/ ("--target"),
+# ignoring any already-installed copy ("-I"), quietly ("-q") and without using
+# the pip cache or the pip self-version check.
+# NOTE(review): `pip._internal.main` is not a documented public pip API;
+# confirm it keeps working on the targeted runtime.
+main(
+    [
+        "install",
+        "-I",
+        "-q",
+        "boto3",
+        "--target",
+        "/tmp/",
+        "--no-cache-dir",
+        "--disable-pip-version-check",
+    ]
+)
+# Put /tmp/ first on the module search path so the import below picks up the
+# freshly installed boto3. The order of these statements matters.
+sys.path.insert(0, "/tmp/")
+import boto3
+
+# Module-level client shared by the handler below.
+sagemaker_client = boto3.client("sagemaker")
+
+
+def lambda_handler(event, context):
+    sagemaker_client.create_auto_ml_job(
+        AutoMLJobName=event["AutopilotJobName"],
+        InputDataConfig=[
+            {
+                "DataSource": {
+                    "S3DataSource": {
+                        "S3DataType": "S3Prefix",
+                        "S3Uri": event["TrainValDatasetS3Path"],
+                    }
+                },
+                "TargetAttributeName": event["TargetAttributeName"],
+            }
+        ],
+        OutputDataConfig={"S3OutputPath": event["TrainingOutputS3Path"]},
+        ProblemType=event["ProblemType"],
+        AutoMLJobObjective={"MetricName": event["AutopilotObjectiveMetricName"]},
+        AutoMLJobConfig={
+            "CompletionCriteria": {
+                "MaxCandidates": event["MaxCandidates"],
+                "MaxRuntimePerTrainingJobInSeconds": event[
+                    "MaxRuntimePerTrainingJobInSeconds"
+                ],
+                "MaxAutoMLJobRuntimeInSeconds": event["MaxAutoMLJobRuntimeInSeconds"],
+            },
+            "Mode": event["AutopilotMode"],
+        },
+        RoleArn=event["AutopilotExecutionRoleArn"],
+    )