-
Notifications
You must be signed in to change notification settings - Fork 3.1k
/
submit_groupwise_azureml_pytest.py
489 lines (430 loc) · 16.6 KB
/
submit_groupwise_azureml_pytest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
# Copyright (c) Recommenders contributors.
# Licensed under the MIT License.
"""
This python script sets up an environment on AzureML and submits a
script to it to run pytest. It is usually intended to be used as
part of a DevOps pipeline which runs testing on a github repo but
can also be used from command line.
Many parameters are set to default values and some are expected to be passed
in from either the DevOps pipeline or command line.
If calling from command line, there are some parameters you must pass in for
your job to run.
Args:
Required:
--clustername (str): the Azure cluster for this run. It can already exist
or it will be created.
--subid (str): the Azure subscription id
Optional but suggested, this info will be stored on Azure as
text information as part of the experiment:
--pr (str): the Github PR number
--reponame (str): the Github repository name
--branch (str): the branch being run
It is also possible to put any text string in these.
Example:
Usually, this script is run by a DevOps pipeline. It can also be
run from cmd line.
>>> python tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py --clustername 'cluster-d3-v2'
--subid '12345678-9012-3456-abcd-123456789012'
--pr '666'
--reponame 'Recommenders'
--branch 'staging'
"""
import argparse
import logging
import glob
from azureml.core.authentication import AzureCliAuthentication
from azureml.core import Workspace
from azureml.core import Experiment
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.script_run_config import ScriptRunConfig
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.workspace import WorkspaceException
def setup_workspace(
    workspace_name, subscription_id, resource_group, cli_auth, location
):
    """
    Get or create the AzureML workspace for this pytest run.

    An existing workspace with the given name is reused; when lookup
    raises WorkspaceException, a fresh workspace is provisioned instead.

    Args:
        workspace_name (str): centralized location on Azure holding all
                              the artifacts used by the AzureML service
        subscription_id (str): the Azure subscription id
        resource_group (str): Azure Resource Group that collects the
                              assets associated with this project, making
                              them easy to track or delete together
        cli_auth: Azure CLI authentication object
        location (str): Azure datacenter location used only when a new
                        workspace must be created

    Returns:
        ws: workspace reference
    """
    logger.debug("setup: workspace_name is {}".format(workspace_name))
    logger.debug("setup: resource_group is {}".format(resource_group))
    logger.debug("setup: subid is {}".format(subscription_id))
    logger.debug("setup: location is {}".format(location))
    # Identity arguments shared by both the lookup and the create call.
    workspace_args = dict(
        name=workspace_name,
        subscription_id=subscription_id,
        resource_group=resource_group,
        auth=cli_auth,
    )
    try:
        # Prefer an already-provisioned workspace.
        ws = Workspace.get(**workspace_args)
    except WorkspaceException:
        # Provisioning a new workspace might take a minute or two.
        logger.debug("Creating new workspace")
        ws = Workspace.create(
            location=location,
            show_output=False,
            **workspace_args,
        )
    return ws
def setup_persistent_compute_target(workspace, cluster_name, vm_size, max_nodes):
    """
    Get or create a persistent AzureML compute target.

    A persistent compute target runs noticeably faster than a regular
    compute target for subsequent runs, and AzureML manages turning the
    compute on/off as needed for each job, so the user does not have to.

    Args:
        workspace (str): centralized location on Azure to work with all
                         the artifacts used by the AzureML service
        cluster_name (str): the Azure cluster for this run; it can already
                            exist or it will be created
        vm_size (str): Azure VM size, like STANDARD_D3_V2
        max_nodes (int): number of VMs; max_nodes=4 autoscales up to 4 VMs

    Returns:
        cpu_cluster : cluster reference
    """
    logger.debug("setup: cluster_name {}".format(cluster_name))
    # https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-set-up-training-targets
    try:
        cluster = ComputeTarget(workspace=workspace, name=cluster_name)
    except ComputeTargetException:
        # No cluster with that name yet: provision a persistent one.
        logger.debug("setup: create cluster")
        provisioning = AmlCompute.provisioning_configuration(
            vm_size=vm_size,
            max_nodes=max_nodes,
            ssh_public_access_enabled=True,
            idle_time_before_scale_down=3600,  # 1 hour
        )
        cluster = ComputeTarget.create(workspace, cluster_name, provisioning)
        cluster.wait_for_completion(show_output=False)
    else:
        logger.debug("setup: Found existing cluster, use it.")
    return cluster
def create_run_config(
    cpu_cluster,
    docker_proc_type,
    add_gpu_dependencies,
    add_spark_dependencies,
    conda_pkg_jdk,
    conda_pkg_python,
    commit_sha,
):
    """
    AzureML requires the run environment to be setup prior to submission.
    This configures a docker persistent compute. Even though
    it is called Persistent compute, AzureML handles startup/shutdown
    of the compute environment.

    Args:
        cpu_cluster (str) : Names the cluster for the test
                            In the case of unit tests, any of
                            the following:
                            - Reco_cpu_test
                            - Reco_gpu_test
        docker_proc_type (str) : processor type, cpu or gpu
        add_gpu_dependencies (bool) : True if gpu packages should be
                                      added to the conda environment, else False
        add_spark_dependencies (bool) : True if PySpark packages should be
                                        added to the conda environment, else False
        conda_pkg_jdk (str) : conda package spec for the JDK, e.g. openjdk=8
                              (only installed when spark support is requested)
        conda_pkg_python (str) : conda package spec for Python, e.g. python=3.7
        commit_sha (str) : the commit that triggers the workflow; used to pip
                           install recommenders at exactly that revision

    Return:
        run_azuremlcompute : AzureML run config
    """
    run_azuremlcompute = RunConfiguration()
    run_azuremlcompute.target = cpu_cluster
    run_azuremlcompute.environment.docker.enabled = True
    run_azuremlcompute.environment.docker.base_image = docker_proc_type
    # False: AzureML builds the environment from the conda spec below.
    # True would mean the user manually configures the environment.
    run_azuremlcompute.environment.python.user_managed_dependencies = False
    conda_dep = CondaDependencies()
    conda_dep.add_conda_package(conda_pkg_python)
    conda_dep.add_pip_package(
        "pymanopt@https://github.com/pymanopt/pymanopt/archive/fb36a272cdeecb21992cfd9271eb82baafeb316d.zip"
    )
    # Assemble the "recommenders" pip extras. Spark support additionally
    # needs a JDK from conda-forge; GPU support only adds a pip extra.
    reco_extras = "dev"
    if add_spark_dependencies:
        conda_dep.add_channel("conda-forge")
        conda_dep.add_conda_package(conda_pkg_jdk)
        reco_extras = reco_extras + ",spark"
    if add_gpu_dependencies:
        reco_extras = reco_extras + ",gpu"
    conda_dep.add_pip_package(
        f"recommenders[{reco_extras}]@git+https://github.com/recommenders-team/recommenders.git@{commit_sha}"
    )
    run_azuremlcompute.environment.python.conda_dependencies = conda_dep
    return run_azuremlcompute
def create_experiment(workspace, experiment_name):
    """
    Obtain an AzureML experiment, the container of trials.

    The Experiment constructor returns the existing experiment with this
    name in the workspace, or creates a new one if none exists.

    Args:
        workspace (str) : name of AzureML workspace
        experiment_name (str) : AzureML experiment name

    Return:
        exp - AzureML experiment
    """
    logger.debug("create: experiment_name {}".format(experiment_name))
    return Experiment(workspace=workspace, name=experiment_name)
def submit_experiment_to_azureml(
    test, run_config, experiment, test_group, test_kind, warnings
):
    """
    Submitting the experiment to AzureML actually runs the script.

    Args:
        test (str): Pytest script, folder/test such as ./tests/ci/run_pytest.py
        run_config (obj): Environment configuration
        experiment (obj): Instance of an Experiment, a collection of
                          trials where each trial is a run.
        test_group (str): Name of the test group.
        test_kind (str): Name of the test kind, such as nightly or unit.
        warnings (bool): If True, pass --disable-warnings through to pytest.

    Return:
        obj: AzureML run or trial
    """
    arguments = ["--testgroup", test_group, "--testkind", test_kind]
    if warnings:
        arguments.append("--disable-warnings")
    script_run_config = ScriptRunConfig(
        source_directory=".",
        script=test,
        run_config=run_config,
        arguments=arguments,
    )
    run = experiment.submit(script_run_config)
    # waits only for configuration to complete
    run.wait_for_completion(show_output=True, wait_post_processing=True)
    # Test logs can also be found on Azure: in the portal, open the
    # workspace, locate the experiment name, then the individual run.
    # get_file_names() must be called; logging the bare attribute would
    # print the bound method instead of the list of run files.
    logger.debug("files {}".format(run.get_file_names()))
    return run
def create_arg_parser():
    """
    Build the argument parser and parse the command line.

    Many of the argument defaults are used as arg_parser makes it easy to
    use defaults. The user has many options they can select.

    Return:
        argparse.Namespace: the parsed command-line arguments
    """
    parser = argparse.ArgumentParser(description="Process some inputs")
    parser.add_argument(
        "--sha",
        action="store",
        help="the commit that triggers the workflow",
    )
    # script to run pytest
    parser.add_argument(
        "--test",
        action="store",
        default="./tests/ci/azureml_tests/run_groupwise_pytest.py",
        help="location of script to run pytest",
    )
    # max num nodes in Azure cluster
    # type=int so a CLI-supplied value is an int like the default,
    # as expected by AmlCompute.provisioning_configuration(max_nodes=...)
    parser.add_argument(
        "--maxnodes",
        action="store",
        type=int,
        default=4,
        help="specify the maximum number of nodes for the run",
    )
    # Test group
    parser.add_argument(
        "--testgroup", action="store", default="group_criteo", help="Test Group"
    )
    # Azure resource group
    parser.add_argument(
        "--rg", action="store", default="recommender", help="Azure Resource Group"
    )
    # AzureML workspace Name
    parser.add_argument(
        "--wsname", action="store", default="RecoWS", help="AzureML workspace name"
    )
    # AzureML clustername
    parser.add_argument(
        "--clustername",
        action="store",
        default="azuremlcompute",
        help="Set name of Azure cluster",
    )
    # Azure VM size
    parser.add_argument(
        "--vmsize",
        action="store",
        default="STANDARD_D3_V2",
        help="Set the size of the VM, e.g. STANDARD_D3_V2",
    )
    # cpu or gpu
    parser.add_argument(
        "--dockerproc",
        action="store",
        default="cpu",
        help="Base image used in docker container",
    )
    # Azure subscription id, when used in a pipeline, it is stored in keyvault
    parser.add_argument(
        "--subid", action="store", default="123456", help="Azure Subscription ID"
    )
    # AzureML experiment name
    parser.add_argument(
        "--expname",
        action="store",
        default="persistentAzureML",
        help="experiment name on Azure",
    )
    # Azure datacenter location
    parser.add_argument("--location", default="EastUS", help="Azure location")
    # github repo, stored in AzureML experiment for info purposes
    parser.add_argument(
        "--reponame",
        action="store",
        default="--reponame MyGithubRepo",
        help="GitHub repo being tested",
    )
    # github branch, stored in AzureML experiment for info purposes
    parser.add_argument(
        "--branch",
        action="store",
        default="--branch MyGithubBranch",
        help="Identify the branch the test is run on",
    )
    # github pull request, stored in AzureML experiment for info purposes
    parser.add_argument(
        "--pr",
        action="store",
        default="--pr PRTestRun",
        help="If a pr triggered the test, list it here",
    )
    # flag to indicate whether gpu dependencies should be included in conda env
    parser.add_argument(
        "--add_gpu_dependencies",
        action="store_true",
        help="include packages for GPU support",
    )
    # flag to indicate whether pyspark dependencies should be included in conda env
    parser.add_argument(
        "--add_spark_dependencies",
        action="store_true",
        help="include packages for PySpark support",
    )
    # path where test logs should be downloaded
    parser.add_argument(
        "--testlogs",
        action="store",
        default="test_logs.log",
        help="Test logs will be downloaded to this path",
    )
    # conda package name for jdk
    parser.add_argument(
        "--conda_pkg_jdk",
        action="store",
        default="openjdk=8",
        help="conda package name for jdk",
    )
    # conda package name for python
    parser.add_argument(
        "--conda_pkg_python",
        action="store",
        default="python=3.7",
        help="conda package for Python",
    )
    parser.add_argument(
        "--testkind",
        action="store",
        default="unit",
        help="Test kind - nightly or unit",
    )
    # Flag to indicate whether to turn off the warnings
    parser.add_argument(
        "--disable-warnings",
        action="store_true",
        help="Turn off warnings",
    )
    args = parser.parse_args()
    return args
if __name__ == "__main__":
logger = logging.getLogger("submit_groupwise_azureml_pytest.py")
args = create_arg_parser()
if args.dockerproc == "cpu":
from azureml.core.runconfig import DEFAULT_CPU_IMAGE
docker_proc_type = DEFAULT_CPU_IMAGE
else:
from azureml.core.runconfig import DEFAULT_GPU_IMAGE
docker_proc_type = DEFAULT_GPU_IMAGE
cli_auth = AzureCliAuthentication()
workspace = setup_workspace(
workspace_name=args.wsname,
subscription_id=args.subid,
resource_group=args.rg,
cli_auth=cli_auth,
location=args.location,
)
cpu_cluster = setup_persistent_compute_target(
workspace=workspace,
cluster_name=args.clustername,
vm_size=args.vmsize,
max_nodes=args.maxnodes,
)
run_config = create_run_config(
cpu_cluster=cpu_cluster,
docker_proc_type=docker_proc_type,
add_gpu_dependencies=args.add_gpu_dependencies,
add_spark_dependencies=args.add_spark_dependencies,
conda_pkg_jdk=args.conda_pkg_jdk,
conda_pkg_python=args.conda_pkg_python,
commit_sha=args.sha,
)
logger.info("exp: In Azure, look for experiment named {}".format(args.expname))
# create new or use existing experiment
experiment = Experiment(workspace=workspace, name=args.expname)
run = submit_experiment_to_azureml(
test=args.test,
run_config=run_config,
experiment=experiment,
test_group=args.testgroup,
test_kind=args.testkind,
warnings=args.disable_warnings,
)
# add helpful information to experiment on Azure
run.tag("Python", args.conda_pkg_python)
run.tag("RepoName", args.reponame)
run.tag("Branch", args.branch)
run.tag("PR", args.pr)
run.tag("script", args.test)
run.tag("testgroup", args.testgroup)
run.tag("testkind", args.testkind)
# download logs file from AzureML
run.download_file(name="test_logs", output_file_path=args.testlogs)
# save pytest exit code
metrics = run.get_metrics()
with open("pytest_exit_code.log", "w") as f:
f.write(str(metrics.get("pytest_exit_code")))