forked from aws/amazon-sagemaker-examples
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmisc.py
162 lines (139 loc) · 5.26 KB
/
misc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
from __future__ import absolute_import
import base64
import contextlib
import json
import os
import shlex
import shutil
import subprocess
import sys
import tempfile
import time
import boto3
def wait_for_s3_object(
    s3_bucket,
    key,
    local_dir,
    local_prefix="",
    aws_account=None,
    aws_region=None,
    timeout=1200,
    limit=20,
    fetch_only=None,
    training_job_name=None,
):
    """
    Poll S3 until at least one object under a key prefix exists, then download.

    The bucket is listed every 5 seconds. Once one or more matching objects
    are found, at most ``limit`` of them (the last ones in listing order) are
    downloaded into ``local_dir/local_prefix`` under the final path component
    of their key. The target directory is created if it does not exist.

    Arguments:
        s3_bucket (string): s3 bucket name
        key (string): key prefix of the s3 object(s) to wait for
        local_dir (string): local directory path to save s3 object
        local_prefix (string): sub-path appended to the local directory
        aws_account (string): unused; accepted for backward compatibility
        aws_region (string): unused; accepted for backward compatibility
        timeout (int): how many seconds to wait for the object to appear
            before giving up (checked after each 5-second sleep)
        limit (int): maximum number of files to download
        fetch_only (lambda): predicate applied to each listed object; only
            objects for which it returns a truthy value are considered
        training_job_name (string): if given, the training job's status is
            queried on every unsuccessful poll and a failed job aborts the wait
    Returns:
        A list of all downloaded files, as local filenames
    Raises:
        FileNotFoundError: no matching object appeared within ``timeout``
        RuntimeError: the training job reported status "Failed" while waiting
    """
    session = boto3.Session()
    bucket = session.resource("s3").Bucket(s3_bucket)
    sagemaker = session.client("sagemaker")

    print("Waiting for s3://%s/%s..." % (s3_bucket, key), end="", flush=True)
    start_time = time.time()
    polls = 0
    while True:
        objects = list(bucket.objects.filter(Prefix=key))
        if fetch_only:
            objects = [obj for obj in objects if fetch_only(obj)]
        if objects:
            break

        print(".", end="", flush=True)
        time.sleep(5)
        polls += 1
        # Wrap the progress dots so the line does not grow without bound.
        if polls % 80 == 0:
            print("")
        if time.time() > start_time + timeout:
            raise FileNotFoundError(
                "S3 object s3://%s/%s never appeared after %d seconds" % (s3_bucket, key, timeout)
            )
        if training_job_name:
            training_job_status = sagemaker.describe_training_job(
                TrainingJobName=training_job_name
            )["TrainingJobStatus"]
            # A failed job will never produce the object; stop waiting early.
            if training_job_status == "Failed":
                raise RuntimeError(
                    "Training job {} failed while waiting for S3 object s3://{}/{}".format(
                        training_job_name, s3_bucket, key
                    )
                )

    print("\n", end="", flush=True)

    if len(objects) > limit:
        print("Only downloading %d of %d files" % (limit, len(objects)))
        # objects[-0:] is the whole list, so a non-positive limit needs
        # explicit handling to really download nothing.
        objects = objects[-limit:] if limit > 0 else []

    target_dir = os.path.join(local_dir, local_prefix)
    # download_file does not create missing directories; an empty path means
    # the current directory, which os.makedirs would reject.
    if objects and target_dir:
        os.makedirs(target_dir, exist_ok=True)

    fetched_files = []
    for obj in objects:
        print("Downloading %s" % obj.key)
        local_path = os.path.join(target_dir, obj.key.split("/")[-1])
        obj.Object().download_file(local_path)
        fetched_files.append(local_path)

    return fetched_files
def get_execution_role(role_name="sagemaker", aws_account=None, aws_region=None):
    """
    Return the ARN of a sagemaker execution role, creating the role if needed.

    The role's trust policy allows the SageMaker and RoboMaker services to
    assume it. The AmazonSageMakerFullAccess managed policy is attached to the
    role before its ARN is returned.

    Args:
        role_name (string): name of the role to look up or create
        aws_account (string): unused; accepted for backward compatibility
        aws_region (string): unused; accepted for backward compatibility
    Returns:
        The ARN of the role, as a string
    """
    session = boto3.Session()

    # json.dumps already returns a str, so no further conversion is needed.
    assume_role_policy_document = json.dumps(
        {
            "Version": "2012-10-17",
            "Statement": [
                {
                    "Effect": "Allow",
                    "Principal": {
                        "Service": ["sagemaker.amazonaws.com", "robomaker.amazonaws.com"]
                    },
                    "Action": "sts:AssumeRole",
                }
            ],
        }
    )

    client = session.client("iam")
    try:
        role = client.get_role(RoleName=role_name)["Role"]
    except client.exceptions.NoSuchEntityException:
        # Both get_role and create_role return the role description, so the
        # ARN can be read from either response without a further lookup.
        role = client.create_role(
            RoleName=role_name, AssumeRolePolicyDocument=assume_role_policy_document
        )["Role"]
        print("Created new sagemaker execution role: %s" % role_name)

    # NOTE(review): the policy is attached whether or not the role already
    # existed; the original indentation was lost, so confirm this placement.
    client.attach_role_policy(
        PolicyArn="arn:aws:iam::aws:policy/AmazonSageMakerFullAccess", RoleName=role_name
    )

    return role["Arn"]
def wait_for_training_job_to_complete(job_name):
    """
    Block until the given sagemaker training job has completed or stopped.

    Args:
        job_name (string): name of the training job to wait on
    """
    client = boto3.client("sagemaker")
    waiter = client.get_waiter("training_job_completed_or_stopped")
    waiter.wait(TrainingJobName=job_name)