Skip to content

Commit

Permalink
specify output file with a parameter
Browse files Browse the repository at this point in the history
undo tf-launcher
  • Loading branch information
hougangliu committed Mar 3, 2019
1 parent 79b698a commit 3eb4b1a
Show file tree
Hide file tree
Showing 27 changed files with 745 additions and 14 deletions.
6 changes: 1 addition & 5 deletions .cloudbuild.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -97,12 +97,8 @@ steps:
id: 'buildDeployer'
- name: 'gcr.io/cloud-builders/docker'
entrypoint: '/bin/bash'
args: ['-c', 'cd /workspace/components/kubeflow/tf-launcher && ./build_image.sh -p $PROJECT_ID -t $COMMIT_SHA']
args: ['-c', 'cd /workspace/components/kubeflow/launcher && ./build_image.sh -p $PROJECT_ID -t $COMMIT_SHA']
id: 'buildLauncher'
- name: 'gcr.io/cloud-builders/docker'
entrypoint: '/bin/bash'
args: ['-c', 'cd /workspace/components/kubeflow/katib-launcher && ./build_image.sh -p $PROJECT_ID -t $COMMIT_SHA']
id: 'buildKatibLauncher'
- name: 'gcr.io/cloud-builders/docker'
entrypoint: '/bin/bash'
args: ['-c', 'cd /workspace/components/kubeflow/dnntrainer && ./build_image.sh -p $PROJECT_ID -t $COMMIT_SHA -l ml-pipeline-kubeflow-tf-trainer-gpu -b 1.6.0-gpu']
Expand Down
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

def kubeflow_studyjob_launcher_op(name, namespace, optimizationtype, objectivevaluename, optimizationgoal, requestcount, metricsnames,
parameterconfigs, nasConfig, workertemplatepath, mcollectortemplatepath, suggestionspec,
studyjob_timeout_minutes, output_dir=None, step_name='StudyJob-Launcher'):
studyjob_timeout_minutes, output_file='/output.txt', step_name='StudyJob-Launcher'):
return dsl.ContainerOp(
name = step_name,
image = 'liuhougangxa/ml-pipeline-kubeflow-studyjob:latest',
Expand All @@ -33,7 +33,8 @@ def kubeflow_studyjob_launcher_op(name, namespace, optimizationtype, objectiveva
"--workertemplatepath", workertemplatepath,
"--mcollectortemplatepath", mcollectortemplatepath,
"--suggestionspec", suggestionspec,
"--outputfile", output_file,
'--studyjobtimeoutminutes', studyjob_timeout_minutes,
],
file_outputs = {'hyperparameter': '/output.txt'}
file_outputs = {'hyperparameter': output_file}
)
5 changes: 4 additions & 1 deletion components/kubeflow/katib-launcher/src/launch_study_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,9 @@ def main(argv=None):
parser.add_argument('--suggestionspec', type=yamlOrJsonStr,
default={},
help='StudyJob suggestion spec.')
parser.add_argument('--outputfile', type=str,
default='/output.txt',
help='The file which stores the best trial of the studyJob.')
parser.add_argument('--studyjobtimeoutminutes', type=int,
default=10,
help='Time in minutes to wait for the StudyJob to complete')
Expand Down Expand Up @@ -147,7 +150,7 @@ def main(argv=None):
if wait_response.get("status", {}).get("condition") == "Completed":
succ = True
trial = get_best_trial(wait_response["status"]["bestTrialId"])
with open('/output.txt', 'w') as f:
with open(args.outputfile, 'w') as f:
ps_dict = {}
for ps in trial.parameter_set:
ps_dict[ps.name] = ps.value
Expand Down
62 changes: 62 additions & 0 deletions components/kubeflow/launcher/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Copyright 2018 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Launcher image: Python tooling, the Google Cloud SDK (gcloud, gsutil,
# kubectl), ksonnet and the tf-operator sources, with /ml/launch_tf_job.py
# as the entrypoint.
FROM ubuntu:16.04

# Trainer image reference, supplied with --build-arg at build time and
# exported to the running container through the ENV instruction below.
ARG TRAINER_IMAGE_NAME

# Refresh the package index and install in the SAME layer. With a separate
# "apt-get update" layer Docker can reuse a stale cached index when the
# install list changes, which makes the install fail or pull old packages.
RUN apt-get update -y && \
    apt-get install --no-install-recommends -y -q \
        ca-certificates python-dev python-setuptools wget unzip git

RUN easy_install pip

RUN pip install pyyaml==3.12 six==1.11.0 requests==2.18.4 tensorflow==1.7.0 \
    kubernetes google-api-python-client retrying

# Install the Google Cloud SDK under /tools, add kubectl, and disable the
# component update check. The final touch creates an empty google.py under
# lib/third_party (the reason is not visible in this file).
RUN wget -nv https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.zip && \
    unzip -qq google-cloud-sdk.zip -d tools && \
    rm google-cloud-sdk.zip && \
    tools/google-cloud-sdk/install.sh --usage-reporting=false \
        --path-update=false --bash-completion=false \
        --disable-installation-options && \
    tools/google-cloud-sdk/bin/gcloud -q components update \
        gcloud core gsutil && \
    tools/google-cloud-sdk/bin/gcloud -q components install kubectl && \
    tools/google-cloud-sdk/bin/gcloud config set component_manager/disable_update_check true && \
    touch /tools/google-cloud-sdk/lib/third_party/google.py

# Install the ksonnet 0.9.0 binary into /tools/ks/bin and drop the archive.
RUN wget -nv https://github.com/ksonnet/ksonnet/releases/download/v0.9.0/ks_0.9.0_linux_amd64.tar.gz && \
    tar -xzf ks_0.9.0_linux_amd64.tar.gz && \
    mkdir -p /tools/ks/bin && \
    cp ./ks_0.9.0_linux_amd64/ks /tools/ks/bin && \
    rm ks_0.9.0_linux_amd64.tar.gz && \
    rm -r ks_0.9.0_linux_amd64

# Unpack the tf-operator v0.3.0 sources to /tf-operator (put on PYTHONPATH
# below). The archive is removed afterwards, like the other downloads above,
# so it does not stay in the image layer.
RUN wget https://github.com/kubeflow/tf-operator/archive/v0.3.0.zip && \
    unzip v0.3.0.zip && \
    rm v0.3.0.zip && \
    mv tf-operator-0.3.0 tf-operator

ENV PYTHONPATH $PYTHONPATH:/tf-operator

ENV PATH $PATH:/tools/google-cloud-sdk/bin:/tools/ks/bin

ENV TRAINER_IMAGE_NAME $TRAINER_IMAGE_NAME

# COPY instead of ADD: this is a plain local directory copy, and none of
# ADD's extra behaviour (URL fetch, archive auto-extraction) is wanted.
COPY build /ml

RUN mkdir /usr/licenses && \
    /ml/license.sh /ml/third_party_licenses.csv /usr/licenses

ENTRYPOINT ["python", "/ml/launch_tf_job.py"]
88 changes: 88 additions & 0 deletions components/kubeflow/launcher/build_image.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#!/bin/bash -e
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# Parse command-line options.
#   -h  print help and exit
#   -p  project name
#   -t  tag name
#   -i  image name; when provided, project name and tag name are not necessary
# The leading ":" in the optstring puts getopts in silent mode: a missing
# option argument is reported as ":" and an unknown option as "?", so both
# cases must be handled explicitly here.
while getopts ":hp:t:i:" opt; do
  case "${opt}" in
    h) echo "-p: project name"
       echo "-t: tag name"
       echo "-i: image name. If provided, project name and tag name are not necessary"
       exit
      ;;
    p) PROJECT_ID=${OPTARG}
      ;;
    t) TAG_NAME=${OPTARG}
      ;;
    i) LAUNCHER_IMAGE_NAME=${OPTARG}
      ;;
    # Previously unhandled: e.g. "-t" with no value was silently ignored.
    :) echo "Option -${OPTARG} requires an argument" >&2
       echo "Usage: cmd [-p] project [-t] tag [-i] image" >&2
       exit 1
      ;;
    # Invalid option: report on stderr and fail instead of exiting with 0.
    \? ) echo "Usage: cmd [-p] project [-t] tag [-i] image" >&2
       exit 1
      ;;
  esac
done

# Local names of the two images this script deals with.
LOCAL_LAUNCHER_IMAGE_NAME=ml-pipeline-kubeflow-tf
LOCAL_TRAINER_IMAGE_NAME=ml-pipeline-kubeflow-tf-trainer

# Fall back to the project configured in gcloud when -p was not given.
if [ -z "${PROJECT_ID}" ]; then
  PROJECT_ID=$(gcloud config config-helper --format "value(configuration.properties.core.project)")
fi

# Default tag: date, git description, and a short hash of the working-tree diff.
if [ -z "${TAG_NAME}" ]; then
  TAG_NAME=$(date +v%Y%m%d)-$(git describe --tags --always --dirty)-$(git diff | shasum -a256 | cut -c -6)
fi

# Stage the sources and the license tooling under ./build.
mkdir -p ./build
rsync -arvp ./src/ ./build/

cp ../../license.sh ./build
cp ../../third_party_licenses.csv ./build

# Build the trainer image
if [ -z "${LAUNCHER_IMAGE_NAME}" ]; then
  TRAINER_IMAGE_NAME=gcr.io/${PROJECT_ID}/${LOCAL_TRAINER_IMAGE_NAME}:${TAG_NAME}
else
  # Construct the trainer image name as "launcher_image_name"-trainer:"launcher_image_tag".
  # Parameter expansion replaces `expr index`: expr exits with status 1 when
  # its result is 0 (no colon found), which aborted the script under
  # `bash -e` before the untagged branch could run. Only a colon in the last
  # path segment is treated as the tag separator, so a registry port such as
  # "localhost:5000/image" is not mistaken for a tag.
  last_segment=${LAUNCHER_IMAGE_NAME##*/}
  if [[ "${last_segment}" == *:* ]]; then
    tag=${LAUNCHER_IMAGE_NAME##*:}
    TRAINER_IMAGE_NAME=${LAUNCHER_IMAGE_NAME%:*}-trainer:${tag}
  else
    TRAINER_IMAGE_NAME=${LAUNCHER_IMAGE_NAME}-trainer
  fi
fi

# Run the sibling dnntrainer build script from its own directory, then
# return to the launcher directory for the launcher build.
bash_dir=$(dirname "$0")
bash_dir_abs=$(realpath "${bash_dir}")
parent_dir=$(dirname "${bash_dir_abs}")
trainer_dir=${parent_dir}/dnntrainer
cd "${trainer_dir}"
if [ -z "${LAUNCHER_IMAGE_NAME}" ]; then
  ./build_image.sh -p "${PROJECT_ID}" -t "${TAG_NAME}"
else
  ./build_image.sh -i "${TRAINER_IMAGE_NAME}"
fi
cd -

# Build the launcher image, passing the trainer image name as a build arg,
# then tag and push it under either the gcr.io name or the explicit -i name.
docker build -t "${LOCAL_LAUNCHER_IMAGE_NAME}" . --build-arg TRAINER_IMAGE_NAME="${TRAINER_IMAGE_NAME}"
if [ -z "${LAUNCHER_IMAGE_NAME}" ]; then
  docker tag "${LOCAL_LAUNCHER_IMAGE_NAME}" "gcr.io/${PROJECT_ID}/${LOCAL_LAUNCHER_IMAGE_NAME}:${TAG_NAME}"
  docker push "gcr.io/${PROJECT_ID}/${LOCAL_LAUNCHER_IMAGE_NAME}:${TAG_NAME}"
else
  docker tag "${LOCAL_LAUNCHER_IMAGE_NAME}" "${LAUNCHER_IMAGE_NAME}"
  docker push "${LAUNCHER_IMAGE_NAME}"
fi

# Remove the staging directory.
rm -rf ./build
31 changes: 31 additions & 0 deletions components/kubeflow/launcher/kubeflow_tfjob_launcher_op.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from kfp import dsl

def kubeflow_tfjob_launcher_op(container_image, command, number_of_workers: int, number_of_parameter_servers: int, tfjob_timeout_minutes: int, output_dir=None, step_name='TFJob-launcher'):
    """Create the pipeline step that runs the TFJob launcher image.

    Args:
        container_image: Value passed to the launcher as ``--container-image``.
        command: List of arguments appended after the ``--`` separator.
        number_of_workers: Value passed as ``--workers``.
        number_of_parameter_servers: Value passed as ``--pss``.
        tfjob_timeout_minutes: Value passed as ``--tfjob-timeout-minutes``.
        output_dir: Value passed as ``--output-dir`` (``None`` by default).
        step_name: Name given to the resulting pipeline step.

    Returns:
        A ``dsl.ContainerOp`` whose ``train`` file output is read from
        ``/output.txt``.
    """
    # Flags consumed by the launcher itself; everything after '--' comes
    # from the caller-supplied command.
    launcher_flags = [
        '--workers', number_of_workers,
        '--pss', number_of_parameter_servers,
        '--tfjob-timeout-minutes', tfjob_timeout_minutes,
        '--container-image', container_image,
        '--output-dir', output_dir,
        '--ui-metadata-type', 'tensorboard',
        '--',
    ]
    launcher_image = 'gcr.io/ml-pipeline/ml-pipeline-kubeflow-tf:7775692adf28d6f79098e76e839986c9ee55dd61'
    step_outputs = {'train': '/output.txt'}

    return dsl.ContainerOp(
        name=step_name,
        image=launcher_image,
        arguments=launcher_flags + command,
        file_outputs=step_outputs,
    )
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@


import launcher
from launcher import train
import os
import shutil
import subprocess
Expand All @@ -35,7 +36,7 @@ def test_yaml_generation_basic(self):
pss = 1
args_list = []
args_list.append('--learning-rate=0.1')
generated_yaml = _generate_train_yaml(train_template_file, tfjob_ns, worker, pss, args_list)
generated_yaml = train._generate_train_yaml(train_template_file, tfjob_ns, worker, pss, args_list)
with open(os.path.join(test_data_dir, 'train_basic.yaml'), 'r') as f:
golden = yaml.load(f)
self.assertEqual(golden, generated_yaml)
Expand All @@ -50,7 +51,7 @@ def test_yaml_generation_advanced(self):
args_list = []
tfjob_ns = 'kubeflow'
args_list.append('--learning-rate=0.1')
generated_yaml = _generate_train_yaml(train_template_file, tfjob_ns, worker, pss, args_list)
generated_yaml = train._generate_train_yaml(train_template_file, tfjob_ns, worker, pss, args_list)
with open(os.path.join(test_data_dir, 'train_zero_worker.yaml'), 'r') as f:
golden = yaml.load(f)
self.assertEqual(golden, generated_yaml)
Expand Down
15 changes: 15 additions & 0 deletions components/kubeflow/src/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .kubeflow_tfjob_launcher_op import kubeflow_tfjob_launcher_op
Loading

0 comments on commit 3eb4b1a

Please sign in to comment.