Skip to content

Commit

Permalink
Merge branch 'main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
aaronmarkham authored Oct 22, 2022
2 parents 05321b3 + b0083b1 commit ee1b7a6
Show file tree
Hide file tree
Showing 44 changed files with 4,980 additions and 5,701 deletions.
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ These examples introduce SageMaker Autopilot. Autopilot automatically performs f
- [Targeted Direct Marketing AutoML](autopilot/) shows how to use SageMaker Autopilot to automatically train a model.
- [Housing Prices AutoML](sagemaker-autopilot/housing_prices) shows how to use SageMaker Autopilot for a linear regression problem (predict housing prices).
- [Portfolio Churn Prediction with Amazon SageMaker Autopilot and Neo4j](autopilot/sagemaker_autopilot_neo4j_portfolio_churn.ipynb) shows how to use SageMaker Autopilot with graph embeddings to predict investment portfolio churn.
- [Move Amazon SageMaker Autopilot ML models from experimentation to production using Amazon SageMaker Pipelines](autopilot/sagemaker-autopilot-pipelines) shows how to use SageMaker Autopilot in combination with SageMaker Pipelines for end-to-end AutoML training automation.

### Introduction to Amazon Algorithms

Expand Down Expand Up @@ -151,6 +152,15 @@ These examples provide an introduction to SageMaker Debugger which allows debug
- [Reacting to CloudWatch Events from Rules to take an action based on status with TensorFlow](sagemaker-debugger/tensorflow_action_on_rule/)
- [Using SageMaker Debugger with a custom PyTorch container](sagemaker-debugger/pytorch_custom_container/)

### Amazon SageMaker Distributed Training

These examples provide an introduction to SageMaker Distributed Training Libraries for data parallelism and model parallelism. The libraries are optimized for the SageMaker training environment, help adapt your distributed training jobs to SageMaker, and improve training speed and throughput.
More examples for models such as BERT and YOLOv5 can be found in [distributed_training/](https://github.com/aws/amazon-sagemaker-examples/tree/main/training/distributed_training).

- [Train GPT-2 with Sharded Data Parallel](https://github.com/aws/amazon-sagemaker-examples/tree/main/training/distributed_training/pytorch/model_parallel/gpt2/smp-train-gpt-simple-sharded-data-parallel.ipynb) shows how to train GPT-2 with near-linear scaling using the Sharded Data Parallelism technique in the SageMaker Model Parallelism Library.
- [Train EleutherAI GPT-J with Model Parallel](https://github.com/aws/amazon-sagemaker-examples/blob/main/training/distributed_training/pytorch/model_parallel/gpt-j/11_train_gptj_smp_tensor_parallel_notebook.ipynb) shows how to train EleutherAI GPT-J with PyTorch and the Tensor Parallelism technique in the SageMaker Model Parallelism Library.
- [Train MaskRCNN with Data Parallel](https://github.com/aws/amazon-sagemaker-examples/blob/main/training/distributed_training/pytorch/data_parallel/maskrcnn/pytorch_smdataparallel_maskrcnn_demo.ipynb) shows how to train MaskRCNN with PyTorch and the SageMaker Data Parallelism Library.

### Amazon SageMaker Clarify

These examples provide an introduction to SageMaker Clarify which provides machine learning developers with greater visibility into their training data and models so they can identify and limit bias and explain predictions.
Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import boto3
import json
import logging

# SageMaker client created once at import time and used by lambda_handler below.
sagemaker_client = boto3.client("sagemaker")


def lambda_handler(event, context):
    """Report the state of an Autopilot job to a SageMaker Pipelines callback.

    The JSON ``body`` of the first entry in ``event["Records"]`` is parsed; it
    must carry the callback ``token`` and ``arguments.AutopilotJobName``. Only
    that first record is processed. The job is looked up with
    ``describe_auto_ml_job`` and its ``AutoMLJobStatus`` decides the outcome:

    * ``Completed``: the callback step is reported as successful.
    * ``InProgress`` / ``Stopping``: ``ValueError`` is raised and no callback
      is sent (presumably so the event source redelivers the message and the
      check is retried later -- confirm against the trigger configuration).
    * anything else: the callback step is reported as failed, using the job's
      ``FailureReason`` when present.

    Args:
        event: Invocation payload with a ``Records`` list whose first item has
            a JSON string under ``body``.
        context: Lambda context object; unused.

    Raises:
        ValueError: The job has not finished yet, or the record body is not
            valid JSON (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        Exception: Any error raised before the callback token could be read is
            re-raised, because there is no callback to report it to.
    """
    # Bound up front so the generic handler below can tell whether a token was
    # ever obtained instead of failing with a NameError of its own.
    callback_token = None
    try:
        payload = json.loads(event["Records"][0]["body"])
        callback_token = payload["token"]
        autopilot_job = sagemaker_client.describe_auto_ml_job(
            AutoMLJobName=payload["arguments"]["AutopilotJobName"]
        )
        autopilot_job_status = autopilot_job["AutoMLJobStatus"]
        if autopilot_job_status == "Completed":
            sagemaker_client.send_pipeline_execution_step_success(
                CallbackToken=callback_token
            )
        elif autopilot_job_status in ["InProgress", "Stopping"]:
            raise ValueError("Autopilot training not finished yet. Retrying later...")
        else:
            sagemaker_client.send_pipeline_execution_step_failure(
                CallbackToken=callback_token,
                FailureReason=autopilot_job.get(
                    "FailureReason",
                    f"Autopilot training job (status: {autopilot_job_status}) failed to finish.",
                ),
            )
    except ValueError:
        # "Not finished yet" must reach the caller untouched, without
        # answering the callback.
        raise
    except Exception as e:
        logging.exception(e)
        if callback_token is None:
            # The event could not be parsed far enough to find the token, so
            # surface the original error rather than masking it.
            raise
        sagemaker_client.send_pipeline_execution_step_failure(
            CallbackToken=callback_token,
            FailureReason=str(e),
        )
110 changes: 110 additions & 0 deletions autopilot/sagemaker-autopilot-pipelines/evaluate_autopilot_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
import argparse
import boto3
import json
import os
import pandas as pd
import random
import string
import time
from sklearn.metrics import f1_score, precision_score, recall_score
from urllib.parse import urlparse

# Evaluation script: deploys the best Autopilot candidate as a SageMaker model,
# scores the test features with a batch transform job, compares the predictions
# with the true labels and writes an evaluation report as JSON.

# Random lowercase suffix appended to the Autopilot job name so that the model
# and the transform job created below get a new name on every run.
RANDOM_SUFFIX = "".join(random.choices(string.ascii_lowercase, k=8))

parser = argparse.ArgumentParser()
parser.add_argument("--autopilot-job-name", type=str)
parser.add_argument("--aws-region", type=str)
parser.add_argument("--x-test-s3-path", type=str)
parser.add_argument("--y-test-file-name", type=str)
parser.add_argument("--batch-transform-output-s3-path", type=str)
parser.add_argument("--instance-type", type=str)
parser.add_argument("--instance-count", type=int)
parser.add_argument("--local-base-path", type=str)
parser.add_argument("--sagemaker-execution-role-arn", type=str)
args = parser.parse_args()

boto_session = boto3.session.Session(region_name=args.aws_region)
s3_client = boto_session.client("s3")
sagemaker_client = boto_session.client("sagemaker")

# Create model from the inference containers of the job's best candidate
model_name = args.autopilot_job_name + RANDOM_SUFFIX
best_candidate_containers = sagemaker_client.describe_auto_ml_job(
    AutoMLJobName=args.autopilot_job_name
)["BestCandidate"]["InferenceContainers"]
sagemaker_client.create_model(
    ModelName=model_name,
    Containers=best_candidate_containers,
    ExecutionRoleArn=args.sagemaker_execution_role_arn,
)

# Create batch transform job
batch_transform_job_name = args.autopilot_job_name + RANDOM_SUFFIX
sagemaker_client.create_transform_job(
    TransformJobName=batch_transform_job_name,
    ModelName=model_name,
    TransformInput={
        "DataSource": {
            "S3DataSource": {
                "S3DataType": "S3Prefix",
                "S3Uri": args.x_test_s3_path,
            }
        },
        "ContentType": "text/csv",
        "SplitType": "Line",
    },
    TransformOutput={
        "S3OutputPath": args.batch_transform_output_s3_path,
        "AssembleWith": "Line",
    },
    TransformResources={
        "InstanceType": args.instance_type,
        "InstanceCount": args.instance_count,
    },
)

# Wait for the batch transform job to leave the "InProgress" state
while True:
    transform_job = sagemaker_client.describe_transform_job(
        TransformJobName=batch_transform_job_name
    )
    transform_job_status = transform_job["TransformJobStatus"]
    if transform_job_status != "InProgress":
        break
    time.sleep(10)

# Fail here with the real cause instead of letting the download below fail
# with a missing-object error when the job did not produce any output.
if transform_job_status != "Completed":
    raise RuntimeError(
        f"Batch transform job {batch_transform_job_name} ended with status "
        f"{transform_job_status}: "
        f"{transform_job.get('FailureReason', 'no failure reason reported')}"
    )

# Download batch transform results; the output object is looked up under the
# input file name with ".out" appended.
x_test_file_name = args.x_test_s3_path.split("/")[-1]
predictions_s3_path = (
    f"{args.batch_transform_output_s3_path.rstrip('/')}/{x_test_file_name}.out"
)
o = urlparse(predictions_s3_path)
s3_client.download_file(
    Bucket=o.netloc, Key=o.path.strip("/"), Filename="predictions.csv"
)

# Create best model evaluation report
# NOTE(review): header=0 consumes the first line of the predictions as a header
# and header=1 uses the second line of the label file as its header, so the
# rows above those lines are not scored -- confirm both files are laid out so
# that the remaining rows line up.
y_pred = pd.read_csv("predictions.csv", header=0).iloc[:, 0]
y_true = pd.read_csv(
    os.path.join(args.local_base_path, "data", args.y_test_file_name), header=1
)
# scikit-learn metrics take (y_true, y_pred) in that order; with the arguments
# swapped, weighted precision and recall trade places and the weights come
# from the predicted instead of the true class frequencies.
evaluation_report = {
    "multiclass_classification_metrics": {
        "weighted_f1": {
            "value": f1_score(y_true, y_pred, average="weighted"),
            "standard_deviation": "NaN",
        },
        "weighted_precision": {
            "value": precision_score(y_true, y_pred, average="weighted"),
            "standard_deviation": "NaN",
        },
        "weighted_recall": {
            "value": recall_score(y_true, y_pred, average="weighted"),
            "standard_deviation": "NaN",
        },
    },
}
evaluation_report_path = os.path.join(
    args.local_base_path, "evaluation_report", "evaluation_report.json"
)
os.makedirs(os.path.dirname(evaluation_report_path), exist_ok=True)
with open(evaluation_report_path, "w") as f:
    json.dump(evaluation_report, f)
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import boto3
import os
from botocore.exceptions import ClientError
from urllib.parse import urlparse

# AWS clients created once at import time and shared by the functions below.
s3_client = boto3.client("s3")
sagemaker_client = boto3.client("sagemaker")


def get_explainability_report_json_s3_path(s3_path):
    """Find the explainability ``analysis.json`` object below an S3 location.

    Args:
        s3_path: S3 URI of the form ``s3://<bucket>/<prefix>`` to search under.

    Returns:
        The URI ``s3://<bucket>/<key>`` of the first listed object whose key
        contains ``"analysis.json"``, or ``None`` when no such object exists
        (including when nothing is stored under the prefix).
    """
    parsed = urlparse(s3_path)
    bucket_name = parsed.netloc
    s3_prefix = parsed.path.strip("/")
    paginator = s3_client.get_paginator("list_objects_v2")
    # No explicit page size: a page size of 1 would cost one request per
    # listed object without changing which object is found first.
    for page in paginator.paginate(Bucket=bucket_name, Prefix=s3_prefix):
        # A page that lists no objects has no "Contents" entry at all.
        for s3_object in page.get("Contents", []):
            key = s3_object["Key"]
            if "analysis.json" in key:
                # Built by hand rather than with os.path.join, which is meant
                # for local file system paths.
                return f"s3://{bucket_name}/{key}"
    return None


def lambda_handler(event, context):
    """Register the best candidate of an Autopilot job as a model package.

    The job named by ``event["AutopilotJobName"]`` is described, and the
    inference containers of its best candidate are passed to
    ``create_model_package`` together with two metric sources: the evaluation
    report under ``event["EvaluationReportS3Path"]`` and the explainability
    report found below the candidate's explainability artifact location.

    Args:
        event: Mapping with the keys ``AutopilotJobName``, ``ModelPackageName``,
            ``InstanceType``, ``ModelApprovalStatus`` and
            ``EvaluationReportS3Path``.
        context: Lambda context object; unused.
    """
    # Get the explainability results from the Autopilot job
    autopilot_job = sagemaker_client.describe_auto_ml_job(
        AutoMLJobName=event["AutopilotJobName"]
    )
    best_candidate = autopilot_job["BestCandidate"]
    explainability_report_s3_path = best_candidate["CandidateProperties"][
        "CandidateArtifactLocations"
    ]["Explainability"]
    inference_containers = best_candidate["InferenceContainers"]
    # The first container is registered without its "Environment" entry; the
    # default keeps this from raising KeyError when the entry is absent.
    # NOTE(review): only the first container is stripped -- confirm that is
    # intended when the candidate has several inference containers.
    inference_containers[0].pop("Environment", None)

    evaluation_report_s3_uri = (
        f"{event['EvaluationReportS3Path'].rstrip('/')}/evaluation_report.json"
    )
    sagemaker_client.create_model_package(
        ModelPackageName=event["ModelPackageName"],
        InferenceSpecification={
            "Containers": inference_containers,
            "SupportedContentTypes": ["text/csv"],
            "SupportedResponseMIMETypes": ["text/csv"],
            "SupportedTransformInstanceTypes": [event["InstanceType"]],
            "SupportedRealtimeInferenceInstanceTypes": [event["InstanceType"]],
        },
        ModelApprovalStatus=event["ModelApprovalStatus"],
        # NOTE(review): ".json" is sent as the content type of both metric
        # sources -- confirm whether a MIME type is expected here instead.
        ModelMetrics={
            "ModelQuality": {
                "Statistics": {
                    "ContentType": ".json",
                    "S3Uri": evaluation_report_s3_uri,
                },
            },
            "Explainability": {
                "Report": {
                    "ContentType": ".json",
                    "S3Uri": get_explainability_report_json_s3_path(
                        explainability_report_s3_path
                    ),
                }
            },
        },
    )
51 changes: 51 additions & 0 deletions autopilot/sagemaker-autopilot-pipelines/start_autopilot_job.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import sys
from pip._internal import main

# Upgrading boto3 to the newest release to be able to use the latest SageMaker features
# pip runs in-process while this module is imported: "-I" ignores any copy that
# is already installed, "--target /tmp/" places the package in /tmp/ and "-q"
# reduces pip's output.
# NOTE(review): pip._internal is not a supported programmatic API (pip's
# documentation recommends running pip as a subprocess), and the value returned
# by main() is discarded, so a failed install is not detected here -- confirm
# this best-effort behaviour is intended.
main(
[
"install",
"-I",
"-q",
"boto3",
"--target",
"/tmp/",
"--no-cache-dir",
"--disable-pip-version-check",
]
)
# /tmp/ goes first on the module search path so that the import below resolves
# to the copy installed above.
sys.path.insert(0, "/tmp/")
# Deliberately imported after the sys.path change; do not move to the top.
import boto3

# SageMaker client created once at import time and used by lambda_handler below.
sagemaker_client = boto3.client("sagemaker")


def lambda_handler(event, context):
    """Start a SageMaker Autopilot job configured entirely from ``event``.

    Args:
        event: Mapping that supplies every job setting: ``AutopilotJobName``,
            ``TrainValDatasetS3Path``, ``TargetAttributeName``,
            ``TrainingOutputS3Path``, ``ProblemType``,
            ``AutopilotObjectiveMetricName``, ``MaxCandidates``,
            ``MaxRuntimePerTrainingJobInSeconds``,
            ``MaxAutoMLJobRuntimeInSeconds``, ``AutopilotMode`` and
            ``AutopilotExecutionRoleArn``.
        context: Lambda context object; unused.
    """
    # Each piece of the request is assembled separately, in the order the
    # service call lists them, and then sent in one go.
    job_name = event["AutopilotJobName"]

    training_channel = {
        "DataSource": {
            "S3DataSource": {
                "S3DataType": "S3Prefix",
                "S3Uri": event["TrainValDatasetS3Path"],
            }
        },
        "TargetAttributeName": event["TargetAttributeName"],
    }

    output_config = {"S3OutputPath": event["TrainingOutputS3Path"]}
    problem_type = event["ProblemType"]
    objective = {"MetricName": event["AutopilotObjectiveMetricName"]}

    completion_criteria = {
        "MaxCandidates": event["MaxCandidates"],
        "MaxRuntimePerTrainingJobInSeconds": event[
            "MaxRuntimePerTrainingJobInSeconds"
        ],
        "MaxAutoMLJobRuntimeInSeconds": event["MaxAutoMLJobRuntimeInSeconds"],
    }
    job_config = {
        "CompletionCriteria": completion_criteria,
        "Mode": event["AutopilotMode"],
    }

    request = {
        "AutoMLJobName": job_name,
        "InputDataConfig": [training_channel],
        "OutputDataConfig": output_config,
        "ProblemType": problem_type,
        "AutoMLJobObjective": objective,
        "AutoMLJobConfig": job_config,
        "RoleArn": event["AutopilotExecutionRoleArn"],
    }
    sagemaker_client.create_auto_ml_job(**request)
Loading

0 comments on commit ee1b7a6

Please sign in to comment.