Skip to content

Commit

Permalink
Update arena component with git support (#1179)
Browse files Browse the repository at this point in the history
* update sample

* fix git sync

* download the docker image automatically

* download the docker image automatically

* download the docker image automatically

* fix typo

* use extend to replace append
  • Loading branch information
cheyang authored and k8s-ci-robot committed Apr 19, 2019
1 parent abfdd29 commit 2eddf0e
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 28 deletions.
37 changes: 26 additions & 11 deletions components/arena/docker/arena_launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ def generate_job_command(args):
tensorboard_image = args.tensorboard_image
tensorboard = str2bool(args.tensorboard)
log_dir = args.log_dir
sync_source = args.sync_source

commandArray = [
'arena', 'submit', 'tfjob',
Expand All @@ -163,22 +164,22 @@ def generate_job_command(args):
]

if gpus > 0:
commandArray.append("--gpus={0}".format(gpus))
commandArray.extend(['--gpus', str(gpus)])

if cpu > 0:
commandArray.append("--cpu={0}".format(cpu))
commandArray.extend(['--cpu', str(cpu)])

if memory >0:
commandArray.append("--memory={0}".format(memory))
commandArray.extend(['--memory', str(memory)])

if tensorboard_image != "tensorflow/tensorflow:1.12.0":
commandArray.append("--tensorboardImage={0}".format(tensorboard_image))
commandArray.extend(['--tensorboardImage', tensorboard_image])

if tensorboard:
commandArray.append("--tensorboard")

if os.path.isdir(args.log_dir):
commandArray.append("--logdir={0}".format(args.log_dir))
commandArray.append(['--logdir', args.log_dir])
else:
logging.info("skip log dir :{0}".format(args.log_dir))

Expand All @@ -190,6 +191,12 @@ def generate_job_command(args):
for e in env:
commandArray.append("--env={0}".format(e))

if len(sync_source) > 0:
if not sync_source.endswith(".git"):
raise ValueError("sync_source must be an http git url")
commandArray.extend(['--sync-mode','git'])
commandArray.extend(['--sync-source',sync_source])

return commandArray, "tfjob"

# Generate mpi job
Expand All @@ -208,6 +215,7 @@ def generate_mpjob_command(args):
tensorboard = str2bool(args.tensorboard)
rdma = str2bool(args.rdma)
log_dir = args.log_dir
sync_source = args.sync_source

commandArray = [
'arena', 'submit', 'mpijob',
Expand All @@ -216,17 +224,17 @@ def generate_mpjob_command(args):
'--image={0}'.format(image),
]

if gpus > 0:
commandArray.append("--gpus={0}".format(gpus))
if gpus > 0:
commandArray.extend(['--gpus', str(gpus)])

if cpu > 0:
commandArray.append("--cpu={0}".format(cpu))
commandArray.extend(['--cpu', str(cpu)])

if memory >0:
commandArray.append("--memory={0}".format(memory))
commandArray.extend(['--memory', str(memory)])

if tensorboard_image != "tensorflow/tensorflow:1.12.0":
commandArray.append("--tensorboardImage={0}".format(tensorboard_image))
commandArray.extend(['--tensorboardImage', tensorboard_image])

if tensorboard:
commandArray.append("--tensorboard")
Expand All @@ -235,7 +243,7 @@ def generate_mpjob_command(args):
commandArray.append("--rdma")

if os.path.isdir(args.log_dir):
commandArray.append("--logdir={0}".format(args.log_dir))
commandArray.append(['--logdir', args.log_dir])
else:
logging.info("skip log dir :{0}".format(args.log_dir))

Expand All @@ -247,6 +255,12 @@ def generate_mpjob_command(args):
for e in env:
commandArray.append("--env={0}".format(e))

if len(sync_source) > 0:
if not sync_source.endswith(".git"):
raise ValueError("sync_source must be an http git url")
commandArray.extend(['--sync-mode','git'])
commandArray.extend(['--sync-source',sync_source])

return commandArray, "mpijob"

def str2bool(v):
Expand Down Expand Up @@ -281,6 +295,7 @@ def main(argv=None):
parser.add_argument('--env', action='append', type=str, default=[])
parser.add_argument('--data', action='append', type=str, default=[])
parser.add_argument('--metric', action='append', type=str, default=[])
parser.add_argument('--sync-source', type=str, default='')

subparsers = parser.add_subparsers(help='arena sub-command help')

Expand Down
6 changes: 4 additions & 2 deletions components/arena/python/arena/_arena_mpi_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def mpi_job_op(name, image, command, workers=1, gpus=0, cpu=0, memory=0, env=[],
options.append('--tensorboard-image')
options.append(str(tensorboard_image))

return dsl.ContainerOp(
op = dsl.ContainerOp(
name=name,
image=arenaImage,
command=['python','arena_launcher.py'],
Expand All @@ -81,4 +81,6 @@ def mpi_job_op(name, image, command, workers=1, gpus=0, cpu=0, memory=0, env=[],
"mpijob",
"--", str(command)],
file_outputs={'train': '/output.txt'}
)
)
op.set_image_pull_policy('Always')
return op
4 changes: 3 additions & 1 deletion components/arena/python/arena/_arena_standalone_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def standalone_job_op(name, image, command, gpus=0, cpu=0, memory=0, env=[],
options.append('--tensorboard-image')
options.append(str(tensorboard_image))

return dsl.ContainerOp(
op = dsl.ContainerOp(
name=name,
image=arena_image,
command=['python','arena_launcher.py'],
Expand All @@ -81,3 +81,5 @@ def standalone_job_op(name, image, command, gpus=0, cpu=0, memory=0, env=[],
"--", str(command)],
file_outputs={'train': '/output.txt'}
)
op.set_image_pull_policy('Always')
return op
29 changes: 15 additions & 14 deletions samples/arena-samples/standalonejob/standalone_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
)
def sample_pipeline(learning_rate='0.01',
dropout='0.9',
model_version='1'):
model_version='1',
commit='f097575656f927d86d99dd64931042e1a9003cb2'):
"""A pipeline for end to end machine learning workflow."""
data=["user-susan:/training"]
gpus=1
Expand All @@ -27,29 +28,25 @@ def sample_pipeline(learning_rate='0.01',
curl -O https://code.aliyun.com/xiaozhou/tensorflow-sample-code/raw/master/data/t10k-labels-idx1-ubyte.gz && \
curl -O https://code.aliyun.com/xiaozhou/tensorflow-sample-code/raw/master/data/train-images-idx3-ubyte.gz && \
curl -O https://code.aliyun.com/xiaozhou/tensorflow-sample-code/raw/master/data/train-labels-idx1-ubyte.gz")
# 2. prepare source code
prepare_code = arena.standalone_job_op(
name="source-code",
image="alpine/git",
data=data,
command="mkdir -p /training/models/ && \
cd /training/models/ && \
if [ ! -d /training/models/tensorflow-sample-code ]; then git clone https://code.aliyun.com/xiaozhou/tensorflow-sample-code.git; else echo no need download;fi")

# 3. train the models
# 2. download source code and train the models
train = arena.standalone_job_op(
name="train",
image="tensorflow/tensorflow:1.11.0-gpu-py3",
sync_source="https://code.aliyun.com/xiaozhou/tensorflow-sample-code.git",
env=["GIT_SYNC_REV=%s" % (commit)],
gpus=gpus,
data=data,
command="echo %s;echo %s;python /training/models/tensorflow-sample-code/tfjob/docker/mnist/main.py --max_steps 500 --data_dir /training/dataset/mnist --log_dir /training/output/mnist --learning_rate %s --dropout %s" % (prepare_data.output, prepare_code.output, learning_rate, dropout),
command="echo %s;python code/tensorflow-sample-code/tfjob/docker/mnist/main.py --max_steps 500 --data_dir /training/dataset/mnist --log_dir /training/output/mnist --learning_rate %s --dropout %s" % (prepare_data.output, learning_rate, dropout),
metrics=["Train-accuracy:PERCENTAGE"])
# 4. export the model
# 3. export the model
export_model = arena.standalone_job_op(
name="export-model",
image="tensorflow/tensorflow:1.11.0-py3",
sync_source="https://code.aliyun.com/xiaozhou/tensorflow-sample-code.git",
env=["GIT_SYNC_REV=%s" % (commit)],
data=data,
command="echo %s;python /training/models/tensorflow-sample-code/tfjob/docker/mnist/export_model.py --model_version=%s --checkpoint_path=/training/output/mnist /training/output/models" % (train.output, model_version))
command="echo %s;python code/tensorflow-sample-code/tfjob/docker/mnist/export_model.py --model_version=%s --checkpoint_path=/training/output/mnist /training/output/models" % (train.output, model_version))

if __name__ == '__main__':
parser = argparse.ArgumentParser()
Expand All @@ -60,11 +57,14 @@ def sample_pipeline(learning_rate='0.01',
help='Keep probability for training dropout.')
parser.add_argument('--learning_rate', type=str, default="0.001",
help='Initial learning rate.')
parser.add_argument('--commit', type=str, default="f097575656f927d86d99dd64931042e1a9003cb2",
help='commit id.')
FLAGS, unparsed = parser.parse_known_args()

model_version = FLAGS.model_version
dropout = FLAGS.dropout
learning_rate = FLAGS.learning_rate
commit = FLAGS.commit

EXPERIMENT_NAME="mnist"
RUN_ID="run"
Expand All @@ -79,4 +79,5 @@ def sample_pipeline(learning_rate='0.01',
run = client.run_pipeline(experiment_id, RUN_ID, __file__ + '.tar.gz',
params={'learning_rate':learning_rate,
'dropout':dropout,
'model_version':model_version})
'model_version':model_version,
'commit':commit})

0 comments on commit 2eddf0e

Please sign in to comment.