specify output file with a parameter

undo tf-launcher
kubeflow · Mar 3, 2019 · 3eb4b1a · 3eb4b1a
1 parent 79b698a
commit 3eb4b1a
Show file tree

Hide file tree

Showing 27 changed files with 745 additions and 14 deletions.
diff --git a/.cloudbuild.yaml b/.cloudbuild.yaml
@@ -97,12 +97,8 @@ steps:
   id: 'buildDeployer'
 - name: 'gcr.io/cloud-builders/docker'
   entrypoint: '/bin/bash'
-  args: ['-c', 'cd /workspace/components/kubeflow/tf-launcher && ./build_image.sh -p $PROJECT_ID -t $COMMIT_SHA']
+  args: ['-c', 'cd /workspace/components/kubeflow/launcher && ./build_image.sh -p $PROJECT_ID -t $COMMIT_SHA']
   id: 'buildLauncher'
-- name: 'gcr.io/cloud-builders/docker'
-  entrypoint: '/bin/bash'
-  args: ['-c', 'cd /workspace/components/kubeflow/katib-launcher && ./build_image.sh -p $PROJECT_ID -t $COMMIT_SHA']
-  id: 'buildKatibLauncher'
 - name: 'gcr.io/cloud-builders/docker'
   entrypoint: '/bin/bash'
   args: ['-c', 'cd /workspace/components/kubeflow/dnntrainer && ./build_image.sh -p $PROJECT_ID -t $COMMIT_SHA -l ml-pipeline-kubeflow-tf-trainer-gpu -b 1.6.0-gpu']

diff --git a/components/kubeflow/tf-launcher/Dockerfile → components/kubeflow/Dockerfile b/components/kubeflow/tf-launcher/Dockerfile → components/kubeflow/Dockerfile
diff --git a/...nents/kubeflow/tf-launcher/build_image.sh → components/kubeflow/build_image.sh b/...nents/kubeflow/tf-launcher/build_image.sh → components/kubeflow/build_image.sh
diff --git a/components/kubeflow/katib-launcher/kubeflow_katib_launcher_op.py b/components/kubeflow/katib-launcher/kubeflow_katib_launcher_op.py
@@ -16,7 +16,7 @@
 
 def kubeflow_studyjob_launcher_op(name, namespace, optimizationtype, objectivevaluename, optimizationgoal, requestcount, metricsnames,
                                   parameterconfigs, nasConfig, workertemplatepath, mcollectortemplatepath, suggestionspec,
-                                  studyjob_timeout_minutes, output_dir=None, step_name='StudyJob-Launcher'):
+                                  studyjob_timeout_minutes, output_file='/output.txt', step_name='StudyJob-Launcher'):
     return dsl.ContainerOp(
         name = step_name,
         image = 'liuhougangxa/ml-pipeline-kubeflow-studyjob:latest',
@@ -33,7 +33,8 @@ def kubeflow_studyjob_launcher_op(name, namespace, optimizationtype, objectiveva
             "--workertemplatepath", workertemplatepath,
             "--mcollectortemplatepath", mcollectortemplatepath,
             "--suggestionspec", suggestionspec,
+            "--outputfile", output_file,
             '--studyjobtimeoutminutes', studyjob_timeout_minutes,
         ],
-        file_outputs = {'hyperparameter': '/output.txt'}
+        file_outputs = {'hyperparameter': output_file}
     )
diff --git a/components/kubeflow/katib-launcher/src/launch_study_job.py b/components/kubeflow/katib-launcher/src/launch_study_job.py
@@ -118,6 +118,9 @@ def main(argv=None):
   parser.add_argument('--suggestionspec', type=yamlOrJsonStr,
                       default={},
                       help='StudyJob suggestion spec.')
+  parser.add_argument('--outputfile', type=str,
+                      default='/output.txt',
+                      help='The file which stores the best trial of the studyJob.')
   parser.add_argument('--studyjobtimeoutminutes', type=int,
                       default=10,
                       help='Time in minutes to wait for the StudyJob to complete')
@@ -147,7 +150,7 @@ def main(argv=None):
   if wait_response.get("status", {}).get("condition") == "Completed":
     succ = True
     trial = get_best_trial(wait_response["status"]["bestTrialId"])
-    with open('/output.txt', 'w') as f:
+    with open(args.outputfile, 'w') as f:
       ps_dict = {}
       for ps in trial.parameter_set:
           ps_dict[ps.name] = ps.value

diff --git a/...tf-launcher/kubeflow_tfjob_launcher_op.py → ...ts/kubeflow/kubeflow_tfjob_launcher_op.py b/...tf-launcher/kubeflow_tfjob_launcher_op.py → ...ts/kubeflow/kubeflow_tfjob_launcher_op.py
diff --git a/components/kubeflow/launcher/Dockerfile b/components/kubeflow/launcher/Dockerfile
@@ -0,0 +1,62 @@
+# Copyright 2018 The Kubeflow Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+FROM ubuntu:16.04
+
+ARG TRAINER_IMAGE_NAME
+
+RUN apt-get update -y
+
+RUN apt-get install --no-install-recommends -y -q ca-certificates python-dev python-setuptools wget unzip git
+
+RUN easy_install pip
+
+RUN pip install pyyaml==3.12 six==1.11.0 requests==2.18.4 tensorflow==1.7.0 \
+      kubernetes google-api-python-client retrying
+
+RUN wget -nv https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.zip && \
+    unzip -qq google-cloud-sdk.zip -d tools && \
+    rm google-cloud-sdk.zip && \
+    tools/google-cloud-sdk/install.sh --usage-reporting=false \
+        --path-update=false --bash-completion=false \
+        --disable-installation-options && \
+    tools/google-cloud-sdk/bin/gcloud -q components update \
+        gcloud core gsutil && \
+    tools/google-cloud-sdk/bin/gcloud -q components install kubectl && \
+    tools/google-cloud-sdk/bin/gcloud config set component_manager/disable_update_check true && \
+    touch /tools/google-cloud-sdk/lib/third_party/google.py
+
+RUN wget -nv https://github.com/ksonnet/ksonnet/releases/download/v0.9.0/ks_0.9.0_linux_amd64.tar.gz && \
+    tar -xzf ks_0.9.0_linux_amd64.tar.gz && \
+    mkdir -p /tools/ks/bin && \
+    cp ./ks_0.9.0_linux_amd64/ks /tools/ks/bin && \
+    rm ks_0.9.0_linux_amd64.tar.gz && \
+    rm -r ks_0.9.0_linux_amd64
+
+RUN wget https://github.com/kubeflow/tf-operator/archive/v0.3.0.zip && \
+    unzip v0.3.0.zip && \
+    mv tf-operator-0.3.0 tf-operator
+
+ENV PYTHONPATH $PYTHONPATH:/tf-operator
+
+ENV PATH $PATH:/tools/google-cloud-sdk/bin:/tools/ks/bin
+
+ENV TRAINER_IMAGE_NAME $TRAINER_IMAGE_NAME
+
+ADD build /ml
+
+RUN mkdir /usr/licenses && \
+    /ml/license.sh /ml/third_party_licenses.csv /usr/licenses
+
+ENTRYPOINT ["python", "/ml/launch_tf_job.py"]
diff --git a/components/kubeflow/launcher/build_image.sh b/components/kubeflow/launcher/build_image.sh
@@ -0,0 +1,88 @@
+#!/bin/bash -e
+# Copyright 2018 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+while getopts ":hp:t:i:" opt; do
+  case "${opt}" in
+    h) echo "-p: project name"
+        echo "-t: tag name"
+        echo "-i: image name. If provided, project name and tag name are not necessary"
+        exit
+      ;;
+    p) PROJECT_ID=${OPTARG}
+      ;;
+    t) TAG_NAME=${OPTARG}
+      ;;
+    i) LAUNCHER_IMAGE_NAME=${OPTARG}
+      ;;
+    \? ) echo "Usage: cmd [-p] project [-t] tag [-i] image"
+      exit
+      ;;
+  esac
+done
+
+LOCAL_LAUNCHER_IMAGE_NAME=ml-pipeline-kubeflow-tf
+LOCAL_TRAINER_IMAGE_NAME=ml-pipeline-kubeflow-tf-trainer
+
+if [ -z "${PROJECT_ID}" ]; then
+  PROJECT_ID=$(gcloud config config-helper --format "value(configuration.properties.core.project)")
+fi
+
+if [ -z "${TAG_NAME}" ]; then
+  TAG_NAME=$(date +v%Y%m%d)-$(git describe --tags --always --dirty)-$(git diff | shasum -a256 | cut -c -6)
+fi
+
+mkdir -p ./build
+rsync -arvp ./src/ ./build/
+
+cp ../../license.sh ./build
+cp ../../third_party_licenses.csv ./build
+
+# Build the trainer image
+if [ -z "${LAUNCHER_IMAGE_NAME}" ]; then
+  TRAINER_IMAGE_NAME=gcr.io/${PROJECT_ID}/${LOCAL_TRAINER_IMAGE_NAME}:${TAG_NAME}
+else
+  # construct the trainer image name as "launcher_image_name"-trainer:"launcher_image_tag"
+  colon_index=`expr index "${LAUNCHER_IMAGE_NAME}" :`
+  if [ $colon_index == '0' ]; then
+    TRAINER_IMAGE_NAME=${LAUNCHER_IMAGE_NAME}-trainer
+  else
+    tag=${LAUNCHER_IMAGE_NAME:$colon_index}
+    TRAINER_IMAGE_NAME=${LAUNCHER_IMAGE_NAME:0:$colon_index-1}-trainer:${tag}
+  fi
+fi
+
+bash_dir=`dirname $0`
+bash_dir_abs=`realpath $bash_dir`
+parent_dir=`dirname ${bash_dir_abs}`
+trainer_dir=${parent_dir}/dnntrainer
+cd ${trainer_dir}
+if [ -z "${LAUNCHER_IMAGE_NAME}" ]; then
+  ./build_image.sh -p ${PROJECT_ID} -t ${TAG_NAME}
+else
+  ./build_image.sh -i ${TRAINER_IMAGE_NAME}
+fi
+cd -
+
+docker build -t ${LOCAL_LAUNCHER_IMAGE_NAME} . --build-arg TRAINER_IMAGE_NAME=${TRAINER_IMAGE_NAME}
+if [ -z "${LAUNCHER_IMAGE_NAME}" ]; then
+  docker tag ${LOCAL_LAUNCHER_IMAGE_NAME} gcr.io/${PROJECT_ID}/${LOCAL_LAUNCHER_IMAGE_NAME}:${TAG_NAME}
+  docker push gcr.io/${PROJECT_ID}/${LOCAL_LAUNCHER_IMAGE_NAME}:${TAG_NAME}
+else
+  docker tag ${LOCAL_LAUNCHER_IMAGE_NAME} "${LAUNCHER_IMAGE_NAME}"
+  docker push "${LAUNCHER_IMAGE_NAME}"
+fi
+
+rm -rf ./build
diff --git a/components/kubeflow/launcher/kubeflow_tfjob_launcher_op.py b/components/kubeflow/launcher/kubeflow_tfjob_launcher_op.py
@@ -0,0 +1,31 @@
+# Copyright 2018 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from kfp import dsl
+
+def kubeflow_tfjob_launcher_op(container_image, command, number_of_workers: int, number_of_parameter_servers: int, tfjob_timeout_minutes: int, output_dir=None, step_name='TFJob-launcher'):
+    return dsl.ContainerOp(
+        name = step_name,
+        image = 'gcr.io/ml-pipeline/ml-pipeline-kubeflow-tf:7775692adf28d6f79098e76e839986c9ee55dd61',
+        arguments = [
+            '--workers', number_of_workers,
+            '--pss', number_of_parameter_servers,
+            '--tfjob-timeout-minutes', tfjob_timeout_minutes,
+            '--container-image', container_image,
+            '--output-dir', output_dir,
+            '--ui-metadata-type', 'tensorboard',
+            '--',
+        ] + command,
+        file_outputs = {'train': '/output.txt'}
+    )
diff --git a/...ents/kubeflow/tf-launcher/src/__init__.py → components/kubeflow/launcher/src/__init__.py b/...ents/kubeflow/tf-launcher/src/__init__.py → components/kubeflow/launcher/src/__init__.py
diff --git a/...kubeflow/tf-launcher/src/launch_tf_job.py → ...ts/kubeflow/launcher/src/launch_tf_job.py b/...kubeflow/tf-launcher/src/launch_tf_job.py → ...ts/kubeflow/launcher/src/launch_tf_job.py
diff --git a/...eflow/tf-launcher/src/train.template.yaml → ...kubeflow/launcher/src/train.template.yaml b/...eflow/tf-launcher/src/train.template.yaml → ...kubeflow/launcher/src/train.template.yaml
diff --git a/...nts/kubeflow/tf-launcher/test/__init__.py → ...onents/kubeflow/launcher/test/__init__.py b/...nts/kubeflow/tf-launcher/test/__init__.py → ...onents/kubeflow/launcher/test/__init__.py
diff --git a/...ubeflow/tf-launcher/test/test_launcher.py → ...s/kubeflow/launcher/test/test_launcher.py b/...ubeflow/tf-launcher/test/test_launcher.py → ...s/kubeflow/launcher/test/test_launcher.py
@@ -14,6 +14,7 @@
 
 
 import launcher
+from launcher import train
 import os
 import shutil
 import subprocess
@@ -35,7 +36,7 @@ def test_yaml_generation_basic(self):
     pss = 1
     args_list = []
     args_list.append('--learning-rate=0.1')
-    generated_yaml = _generate_train_yaml(train_template_file, tfjob_ns, worker, pss, args_list)
+    generated_yaml = train._generate_train_yaml(train_template_file, tfjob_ns, worker, pss, args_list)
     with open(os.path.join(test_data_dir, 'train_basic.yaml'), 'r') as f:
       golden = yaml.load(f)
     self.assertEqual(golden, generated_yaml)
@@ -50,7 +51,7 @@ def test_yaml_generation_advanced(self):
     args_list = []
     tfjob_ns = 'kubeflow'
     args_list.append('--learning-rate=0.1')
-    generated_yaml = _generate_train_yaml(train_template_file, tfjob_ns, worker, pss, args_list)
+    generated_yaml = train._generate_train_yaml(train_template_file, tfjob_ns, worker, pss, args_list)
     with open(os.path.join(test_data_dir, 'train_zero_worker.yaml'), 'r') as f:
       golden = yaml.load(f)
     self.assertEqual(golden, generated_yaml)

diff --git a/...auncher/test/testdata/train.template.yaml → ...auncher/test/testdata/train.template.yaml b/...auncher/test/testdata/train.template.yaml → ...auncher/test/testdata/train.template.yaml
diff --git a/...f-launcher/test/testdata/train_basic.yaml → ...w/launcher/test/testdata/train_basic.yaml b/...f-launcher/test/testdata/train_basic.yaml → ...w/launcher/test/testdata/train_basic.yaml
diff --git a/...cher/test/testdata/train_zero_worker.yaml → ...cher/test/testdata/train_zero_worker.yaml b/...cher/test/testdata/train_zero_worker.yaml → ...cher/test/testdata/train_zero_worker.yaml
diff --git a/components/kubeflow/src/__init__.py b/components/kubeflow/src/__init__.py
@@ -0,0 +1,15 @@
+# Copyright 2018 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .kubeflow_tfjob_launcher_op import kubeflow_tfjob_launcher_op