mpi-benchmark: rootless MPI images, renamed namespace, smoke-test MPIJob

What this patch does:
  * dependencies.yaml: folds operator_is_prepared/has_mpijobs into a single
    has_mpi_operator target, renames the namespace to mpi-benchmarking and
    stops granting the privileged/anyuid SCCs to the default service account.
  * mpijob_template.yaml: terminates the file with a newline.
  * run_benchmark.sh: dumps the nodes, and fetches the launcher log last,
    failing explicitly when the launcher Pod cannot be found.
  * setup/00*.yaml: drops the hard-coded namespace; the base image now runs
    sshd on port 2222 from ${HOME}/custom_ssh, and build tools move to the
    OSU image.
  * setup/test_mpijob.yaml: new hello-world MPIJob using the base image.

Review fixes applied to the added lines (line counts are unchanged, so the
hunk headers still hold):
  * in_mpi_namespace: use the portable `=` instead of `==` in `test`.
  * run_benchmark.sh: finish the truncated comment, quote the label
    selector, print the error on stderr.
  * base image sshd_config: sftp-server path as shipped on UBI8.
  * OSU image cleanup: `rm -rf` received one quoted argument containing a
    space (so nothing was removed) and named a .tar.gz although the download
    is a .tgz.
  * test_mpijob.yaml: `metadata.label` -> `metadata.labels`; `ssh -n` so ssh
    does not swallow the remaining hostfile lines fed to `while read`.

NOTE(review): this patch was recovered from a whitespace-collapsed copy.
Indentation and the position of blank context lines were re-derived from the
YAML structure and the hunk line counts -- verify against the tree (or apply
with --ignore-whitespace) before merging. Blob ids on the index lines predate
the review fixes.

diff --git a/exec/mpi-benchmark/dependencies.yaml b/exec/mpi-benchmark/dependencies.yaml
index 8ce12f27..ea20bac9 100644
--- a/exec/mpi-benchmark/dependencies.yaml
+++ b/exec/mpi-benchmark/dependencies.yaml
@@ -1,19 +1,14 @@
+---
 name: cluster_is_prepared
 spec:
   requirements:
-  - operator_is_prepared
+  - has_mpi_operator
   - in_mpi_namespace
   - has_mpi_base_image
   - has_mpi_osu_image
 
 ---
-name: operator_is_prepared
-spec:
-  requirements:
-  - has_mpijobs
-  - has_volcano
----
-name: has_mpijobs
+name: has_mpi_operator
 spec:
   tests:
   - name: has_mpi_crd
@@ -50,15 +45,13 @@ spec:
   tests:
   - name: in_mpi_namespace
     type: shell
-    spec: 'test "$(oc project -q)" == mpi-benchmark'
+    spec: 'test "$(oc project -q)" = mpi-benchmarking'
   install:
   - name: goto_mpi_namespace
     type: shell
     spec: |
-      oc new-project mpi-benchmark 2>/dev/null
-      oc project -q mpi-benchmark
-      oc adm policy add-scc-to-user privileged -z default -n mpi-benchmark
-      oc adm policy add-scc-to-user anyuid -z default
+      oc new-project mpi-benchmarking 2>/dev/null
+      oc project -q mpi-benchmarking
 ---
 name: has_mpi_imagestream
 spec:
diff --git a/exec/mpi-benchmark/mpijob_template.yaml b/exec/mpi-benchmark/mpijob_template.yaml
index 7c2d31ee..55d40d66 100644
--- a/exec/mpi-benchmark/mpijob_template.yaml
+++ b/exec/mpi-benchmark/mpijob_template.yaml
@@ -72,4 +72,5 @@ spec:
             securityContext:
               privileged: true
           nodeSelector:
-            node.kubernetes.io/instance-type: {{ .Machine }}
\ No newline at end of file
+            node.kubernetes.io/instance-type: {{ .Machine }}
+
diff --git a/exec/mpi-benchmark/run_benchmark.sh b/exec/mpi-benchmark/run_benchmark.sh
index 6047bed6..3e54ff37 100755
--- a/exec/mpi-benchmark/run_benchmark.sh
+++ b/exec/mpi-benchmark/run_benchmark.sh
@@ -84,16 +84,24 @@ echo "Done, collecting artifacts in $ARTIFACT_DIR ..."
 
 oc get "$mpijob_name" -oyaml > "$ARTIFACT_DIR/mpijob.status.yaml"
 
-# if Pod logs are queried with the label selector, only the last lines are
-launcher_pod_name=$(oc get pod -ltraining.kubeflow.org/job-name=$name,training.kubeflow.org/job-role=launcher -oname)
-oc logs "$launcher_pod_name" > "$ARTIFACT_DIR/mpijob.launcher.log"
-
 oc get pods -ltraining.kubeflow.org/job-name=$name -oyaml > "$ARTIFACT_DIR/mpijob.pods.yaml"
 
 for pod in $(oc get pods -ltraining.kubeflow.org/job-name=$name,training.kubeflow.org/job-role=worker -oname); do
     oc logs $pod > "$ARTIFACT_DIR/mpijob.$(echo "$pod" | sed "s|pod/${name}-||").log"
 done
 
+oc get nodes -oyaml > "$ARTIFACT_DIR/nodes.yaml"
+
+# if Pod logs are queried with the label selector, only the last lines are returned, so resolve the Pod name first
+launcher_pod_name=$(oc get pod -l"training.kubeflow.org/job-name=$name,training.kubeflow.org/job-role=launcher" -oname)
+
+if [[ -z "$launcher_pod_name" ]]; then
+    echo "ERROR: the launcher Pod disappeared ..." >&2
+    exit 1
+fi
+
+oc logs "$launcher_pod_name" > "$ARTIFACT_DIR/mpijob.launcher.log"
+
 echo
 cat "$ARTIFACT_DIR/mpijob.launcher.log"
 
diff --git a/exec/mpi-benchmark/setup/001_imagestream.yaml b/exec/mpi-benchmark/setup/001_imagestream.yaml
index ed89e47d..df6ea82a 100644
--- a/exec/mpi-benchmark/setup/001_imagestream.yaml
+++ b/exec/mpi-benchmark/setup/001_imagestream.yaml
@@ -2,5 +2,4 @@
 apiVersion: image.openshift.io/v1
 kind: ImageStream
 metadata:
-  namespace: mpi-benchmark
   name: mpi-bench
diff --git a/exec/mpi-benchmark/setup/002_base_image.buildconfig.yaml b/exec/mpi-benchmark/setup/002_base_image.buildconfig.yaml
index 742d9040..6e185180 100644
--- a/exec/mpi-benchmark/setup/002_base_image.buildconfig.yaml
+++ b/exec/mpi-benchmark/setup/002_base_image.buildconfig.yaml
@@ -3,7 +3,6 @@ apiVersion: build.openshift.io/v1
 kind: BuildConfig
 metadata:
   name: mpi-base-image
-  namespace: mpi-benchmark
 spec:
   output:
     to:
@@ -13,20 +12,47 @@ spec:
     dockerfile: |2
       FROM registry.access.redhat.com/ubi8/ubi
 
+      ENV USER_NAME=mpi \
+          USER=mpi \
+          HOME=/home/mpi
+
+      WORKDIR ${HOME}
+
       RUN dnf -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm \
        && dnf -y install --quiet \
-           sudo pkg-config vim make gdb \
-           curl wget git gcc-c++ \
-           openssh-server openssh-clients \
-           gcc-c++ openmpi-devel openmpi \
+           vim openssh-server openssh-clients openmpi \
+       \
        && ln -s /usr/lib64/openmpi/bin/orted /usr/bin/orted \
-       && ssh-keygen -A \
-       && (echo "Host *"; echo " StrictHostKeyChecking no") >> /etc/ssh/ssh_config.d/no_StrictHostKeyChecking.conf \
-       && echo "StrictModes no" >> /etc/ssh/sshd_config \
+       \
+       && (echo "Host *"; echo " StrictHostKeyChecking no") >> /etc/ssh/ssh_config.d/StrictHostKeyChecking.conf \
+       && (echo "Host *"; echo " IdentityFile /home/mpi/.ssh/id_rsa") >> /etc/ssh/ssh_config.d/IdentityFile.conf \
+       && (echo "Host *"; echo " Port 2222") >> /etc/ssh/ssh_config.d/Port_2222.conf \
+       && (echo "Host *"; echo " UserKnownHostsFile /dev/null") >> /etc/ssh/ssh_config.d/UserKnownHostsFile.conf \
+       \
+       && mkdir -p ${HOME}/custom_ssh \
+       && ssh-keygen -f ${HOME}/custom_ssh/ssh_host_rsa_key -N '' -t rsa \
+       && ssh-keygen -f ${HOME}/custom_ssh/ssh_host_dsa_key -N '' -t dsa \
+       \
+       && echo -e > ${HOME}/custom_ssh/sshd_config \
+           "Port 2222 \n\
+           HostKey ${HOME}/custom_ssh/ssh_host_rsa_key \n\
+           HostKey ${HOME}/custom_ssh/ssh_host_dsa_key \n\
+           AuthorizedKeysFile .ssh/authorized_keys \n\
+           ChallengeResponseAuthentication no \n\
+           UsePAM no \n\
+           Subsystem sftp /usr/libexec/openssh/sftp-server \n\
+           PidFile ${HOME}/custom_ssh/sshd.pid \n\
+           StrictModes no \n\
+           " \
+       && rm /sbin/nologin && ln -s /usr/bin/bash /sbin/nologin \
+       \
        && touch /var/log/lastlog \
        && chgrp utmp /var/log/lastlog \
        && chmod 664 /var/log/lastlog
 
+      RUN chgrp -R 0 "${HOME}" \
+       && chmod -R g=u "${HOME}"
+
       ENV PATH="${PATH}:/usr/lib64/openmpi/bin/"
 
     type: Dockerfile
diff --git a/exec/mpi-benchmark/setup/003_osu-bench.buildconfig.yaml b/exec/mpi-benchmark/setup/003_osu-bench.buildconfig.yaml
index 5564edd2..c15a8122 100644
--- a/exec/mpi-benchmark/setup/003_osu-bench.buildconfig.yaml
+++ b/exec/mpi-benchmark/setup/003_osu-bench.buildconfig.yaml
@@ -2,7 +2,6 @@ apiVersion: build.openshift.io/v1
 kind: BuildConfig
 metadata:
   name: mpi-osu-image
-  namespace: mpi-benchmark
 spec:
   output:
     to:
@@ -11,8 +10,11 @@ spec:
   source:
     dockerfile: |2
       FROM mpi-bench:base
 
+      RUN dnf -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm \
+       && dnf -y install --quiet make wget gcc-c++ openmpi-devel
-      RUN cd /root \
+
+      RUN cd "${HOME}" \
        && OSU_VERSION=5.8 \
        && wget --quiet https://mvapich.cse.ohio-state.edu/download/mvapich/osu-micro-benchmarks-${OSU_VERSION}.tgz \
        && tar xvf osu-micro-benchmarks-${OSU_VERSION}.tgz \
@@ -20,8 +22,8 @@ spec:
        && ./configure --prefix=/opt/osu-micro-benchmarks CC=/usr/lib64/openmpi/bin/mpicc LIBS="-L/usr/lib64/openmpi/lib -lmpi -lpthread" \
        && make \
        && make install \
-       && cd /root \
-       && rm -rf /root/osu-micro-benchmarks-${OSU_VERSION} osu-micro-benchmarks-${OSU_VERSION}.tar.gz
+       && cd "${HOME}" \
+       && rm -rf "${HOME}/osu-micro-benchmarks-${OSU_VERSION}" "${HOME}/osu-micro-benchmarks-${OSU_VERSION}.tgz"
 
     type: Dockerfile
   strategy:
diff --git a/exec/mpi-benchmark/setup/test_mpijob.yaml b/exec/mpi-benchmark/setup/test_mpijob.yaml
new file mode 100644
index 00000000..5b80a541
--- /dev/null
+++ b/exec/mpi-benchmark/setup/test_mpijob.yaml
@@ -0,0 +1,73 @@
+apiVersion: kubeflow.org/v2beta1
+kind: MPIJob
+metadata:
+  labels:
+    profile: rootless
+  name: hello-world
+spec:
+  sshAuthMountPath: /home/mpi/.ssh
+  cleanPodPolicy: Running
+  slotsPerWorker: 1
+  mpiReplicaSpecs:
+    Launcher:
+      replicas: 1
+      template:
+        spec:
+          initContainers:
+          - name: wait-hostfilename
+            image: image-registry.openshift-image-registry.svc:5000/mpi-benchmarking/mpi-bench:base
+            command:
+            - bash
+            - -cx
+            - "[[ $(cat /etc/mpi/hostfile | wc -l) != 0 ]] && (date; echo 'Hostfile is ready'; cat /etc/mpi/hostfile) || (date; echo 'Hostfile not ready ...'; sleep 10; exit 1) && while read host; do while ! ssh -n $host echo $host ; do date; echo \"Pod $host is not up ...\"; sleep 10; done; date; echo \"Pod $host is ready\"; done <<< \"$(cat /etc/mpi/hostfile)\""
+            volumeMounts:
+            - mountPath: /etc/mpi
+              name: mpi-job-config
+            - mountPath: /home/mpi/.ssh
+              name: ssh-auth
+          containers:
+          - name: mpi-launcher
+            command:
+            - mpirun
+            - --allow-run-as-root
+            - -np
+            - "2"
+            - -bind-to
+            - none
+            - -map-by
+            - slot
+            - -mca
+            - pml
+            - ob1
+            - -mca
+            - btl
+            - ^openib
+            - bash
+            - -c
+            - echo Hello World from $(cat /proc/sys/kernel/hostname)
+            image: image-registry.openshift-image-registry.svc:5000/mpi-benchmarking/mpi-bench:base
+            imagePullPolicy: Always
+    Worker:
+      replicas: 2
+      template:
+        metadata:
+          labels:
+            app: mpi-app
+        spec:
+          topologySpreadConstraints:
+          - maxSkew: 1
+            topologyKey: kubernetes.io/hostname
+            whenUnsatisfiable: DoNotSchedule
+            labelSelector:
+              matchLabels:
+                app: mpi-app
+          containers:
+          - name: mpi-worker
+            image: image-registry.openshift-image-registry.svc:5000/mpi-benchmarking/mpi-bench:base
+            imagePullPolicy: Always
+            command:
+            - /usr/sbin/sshd
+            - -De
+            - -f
+            - /home/mpi/custom_ssh/sshd_config
+