-
Notifications
You must be signed in to change notification settings - Fork 40
/
Copy pathDockerfile-default-rocm
205 lines (163 loc) · 6.63 KB
/
Dockerfile-default-rocm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
# The base image is supplied by the caller (--build-arg BASE_IMAGE=<image>).
ARG BASE_IMAGE
FROM ${BASE_IMAGE}

# Remove the openmpi and ucx packages that come with the base image, replace
# /opt/ompi with a symlink to /container/ompi (our own build), create the
# sshd runtime directory, and drop the ROCm apt source list.
RUN apt remove -y openmpi ucx \
    && rm -rf /opt/ompi \
    && ln -s /container/ompi /opt \
    && mkdir -p /var/run/sshd \
    && rm /etc/apt/sources.list.d/rocm.list
# OS-level build and runtime dependencies (package list kept alphabetical so
# that diffs stay small). After installing, pending unattended upgrades are
# applied and the apt index is removed in the same layer.
# The SSH host private keys are deleted from the image at the end
# (presumably so that containers do not all share the same host keys — confirm).
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        autoconf \
        automake \
        autotools-dev \
        build-essential \
        ca-certificates \
        cmake \
        curl \
        daemontools \
        g++ \
        git \
        glibc-source \
        ibverbs-providers \
        krb5-user \
        libibverbs1 \
        libkrb5-dev \
        libnuma-dev \
        libnuma1 \
        libpmi2-0-dev \
        librdmacm1 \
        libssl-dev \
        libtool \
        make \
        nfs-common \
        openssh-client \
        openssh-server \
        pkg-config \
        unattended-upgrades \
        wget \
    && unattended-upgrade \
    && rm -rf /var/lib/apt/lists/* \
    && rm /etc/ssh/ssh_host_ecdsa_key \
          /etc/ssh/ssh_host_ed25519_key \
          /etc/ssh/ssh_host_rsa_key
# Upgrade pip itself before it is used for any further installs.
RUN pip install -U pip

# Helper scripts invoked by the RUN steps that follow.
COPY dockerfile_scripts /tmp/det_dockerfile_scripts

# Put the py_3.8 conda environment first on PATH, then snapshot the resulting
# PATH in the CONDA build arg. The PATH fallback later in this file
# (${PATH:-$CONDA:...}) depends on that snapshot.
ENV PATH="/opt/conda/envs/py_3.8/bin:$PATH"
ARG CONDA="$PATH"

# Python runtime switches: unbuffered stdio, faulthandler on, fixed hash seed.
ENV PYTHONUNBUFFERED=1 \
    PYTHONFAULTHANDLER=1 \
    PYTHONHASHSEED=0
# Install fixed version of FFI package for Ubuntu 20.04.
# This runs after the package installs above so that the fixed version is the
# one that ends up in the image.
RUN /tmp/det_dockerfile_scripts/install_package_fixes.sh

# Make sure rocm-libs is installed.
# `-y` plus DEBIAN_FRONTEND=noninteractive keep the step from stopping at a
# confirmation prompt (docker build has no terminal to answer it), and
# apt-get is used instead of apt because apt's command line is meant for
# interactive use and warns when scripted.
# NOTE(review): the ROCm source list and /var/lib/apt/lists are both removed
# earlier in this file, so this can only succeed if rocm-libs is already
# present in the base image or install_package_fixes.sh refreshes the apt
# index — confirm which one is intended.
RUN DEBIAN_FRONTEND=noninteractive apt-get install -y rocm-libs
# MPI build switches. The defaults select an MPI build that uses OFI.
ARG WITH_MPI=1
ARG WITH_OFI=1
ARG WITH_MPICH

# Install prefixes for the individual communication components.
ARG UCX_INSTALL_DIR=/container/ucx
ARG OMPI_INSTALL_DIR=/container/ompi
ARG MPICH_INSTALL_DIR=/container/mpich
ARG OFI_INSTALL_DIR=/container/ofi

# GPU flavour flags for the Open MPI build.
ARG OMPI_WITH_CUDA=0
ARG OMPI_WITH_ROCM=1

# Run the MPI build script only when WITH_MPI is exactly "1".
# NOTE(review): UBUNTU_VERSION is not declared as an ARG in the visible part of
# this file; presumably it is inherited from the base image's ENV — confirm.
RUN if [ "$WITH_MPI" = "1" ]; then \
        /tmp/det_dockerfile_scripts/ompi_rocm.sh \
            "$UBUNTU_VERSION" "$WITH_OFI" "$OMPI_WITH_ROCM" "$WITH_MPICH"; \
    fi
# Make sure OMPI/UCX show up in the right paths.
# The *_LIB_DIR / *_PATH_DIR values below are derived from the *_INSTALL_DIR
# build args declared earlier in this file.
ARG VERBS_LIB_DIR=/usr/lib/libibverbs
ARG UCX_LIB_DIR=${UCX_INSTALL_DIR}/lib:${UCX_INSTALL_DIR}/lib64
ARG UCX_PATH_DIR=${UCX_INSTALL_DIR}/bin
ARG OFI_LIB_DIR=${OFI_INSTALL_DIR}/lib:${OFI_INSTALL_DIR}/lib64
ARG OFI_PATH_DIR=${OFI_INSTALL_DIR}/bin
ARG OMPI_LIB_DIR=${OMPI_INSTALL_DIR}/lib
ARG OMPI_PATH_DIR=${OMPI_INSTALL_DIR}/bin
ARG MPICH_LIB_DIR=${MPICH_INSTALL_DIR}/lib
ARG MPICH_PATH_DIR=${MPICH_INSTALL_DIR}/bin
# Set up UCX_LIBS and OFI_LIBS.
# Both values end with ':' so that they can be prepended directly to
# LD_LIBRARY_PATH below. UCX is paired with the OMPI lib dir, OFI with the
# MPICH lib dir.
ENV UCX_LIBS="${VERBS_LIB_DIR}:${UCX_LIB_DIR}:${OMPI_LIB_DIR}:"
ENV OFI_LIBS="${VERBS_LIB_DIR}:${OFI_LIB_DIR}:${MPICH_LIB_DIR}:"
# If WITH_OFI is set, EXTRA_LIBS becomes the OFI libs, otherwise the empty string.
# NOTE(review): ${VAR:+word} tests for a set/non-empty value, not for "1", so
# WITH_OFI=0 presumably still selects the OFI branch — confirm this is intended.
ENV EXTRA_LIBS="${WITH_OFI:+${OFI_LIBS}}"
# If EXTRA_LIBS is empty at this point, fall back to the UCX libs; otherwise keep the OFI libs.
ENV EXTRA_LIBS="${EXTRA_LIBS:-${UCX_LIBS}}"
# Prepend EXTRA_LIBS to LD_LIBRARY_PATH, but only when WITH_MPI is set.
ENV LD_LIBRARY_PATH=${WITH_MPI:+$EXTRA_LIBS}$LD_LIBRARY_PATH
# USING OFI: when WITH_OFI is set, PATH becomes "<old PATH>:" followed by the
# OFI and MPICH bin dirs (the bin dirs are added only when WITH_MPI is set).
# When WITH_OFI is empty the whole expression expands to nothing, i.e. PATH is
# emptied here and rebuilt by the next instruction.
ENV PATH=${WITH_OFI:+$PATH:${WITH_MPI:+$OFI_PATH_DIR:$MPICH_PATH_DIR}}
# USING UCX: only has an effect when PATH was emptied by the instruction above.
# PATH is then rebuilt from CONDA (the PATH snapshot taken earlier in this
# file) followed by the UCX and OMPI bin dirs (again only when WITH_MPI is set).
ENV PATH=${PATH:-$CONDA:${WITH_MPI:+$UCX_PATH_DIR:$OMPI_PATH_DIR}}
# Enable running OMPI as root (both variables are set to 1 when WITH_MPI is
# set, and to the empty string otherwise).
ENV OMPI_ALLOW_RUN_AS_ROOT ${WITH_MPI:+1}
ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM ${WITH_MPI:+1}
# Python tooling. `determined` is installed and then immediately uninstalled:
# pip leaves the dependencies it pulled in, so only the package itself is
# removed. The JupyterLab announcements extension is disabled afterwards.
RUN pip install cloudpickle \
    && pip install determined \
    && pip uninstall -y determined \
    && pip install google-auth-oauthlib \
    && pip install -r /tmp/det_dockerfile_scripts/notebook-requirements.txt \
    && jupyter labextension disable "@jupyterlab/apputils-extension:announcements"

# Point Jupyter's config, data and runtime directories at /run/determined.
ENV JUPYTER_CONFIG_DIR=/run/determined/jupyter/config \
    JUPYTER_DATA_DIR=/run/determined/jupyter/data \
    JUPYTER_RUNTIME_DIR=/run/determined/jupyter/runtime

# Helper scripts, run in this order.
RUN /tmp/det_dockerfile_scripts/add_det_nobody_user.sh \
    && /tmp/det_dockerfile_scripts/install_libnss_determined.sh \
    && /tmp/det_dockerfile_scripts/install_google_cloud_sdk.sh
# Version constraints to keep in mind when choosing the pip specs below:
#   google-api-python-client -> google-api-core -> googleapis-common-protos -> protobuf
#   - Horovod cannot build with protobuf > 3.20.x
#   - the latest google-api-python-client requires protobuf >= 3.20.1
ARG TENSORFLOW_PIP

# Work around a flaky build: the sha256sum of
# tensorboard_plugin_wit-1.8.1-py3-none-any.whl sometimes does not match the
# one in pip's cache, so the cache is cleared first.
RUN rm -rf ~/.cache/pip

# Each optional pip spec is skipped when empty. The variable is expanded
# unquoted, so a single build arg may carry several pip arguments.
RUN if [ -n "$TENSORFLOW_PIP" ]; then \
        pip install $TENSORFLOW_PIP; \
    fi

ARG TORCH_TB_PROFILER_PIP
RUN if [ -n "$TORCH_TB_PROFILER_PIP" ]; then \
        pip install $TORCH_TB_PROFILER_PIP; \
    fi

ARG TF_PROFILER_PIP
RUN if [ -n "$TF_PROFILER_PIP" ]; then \
        python -m pip install $TF_PROFILER_PIP; \
    fi
# Horovod build configuration.
# Reset these because we set GPU_OPERATIONS later.
ENV HOROVOD_GPU_BROADCAST=
ENV HOROVOD_GPU_ALLREDUCE=

# Build-time knobs; each one is exported below so that the Horovod pip build
# (and the running container) can see it.
ARG HOROVOD_PIP
ARG HOROVOD_NCCL_HOME=/opt/rocm/rccl
ARG HOROVOD_WITH_TENSORFLOW=1
ARG HOROVOD_WITH_PYTORCH=1
ARG HOROVOD_WITHOUT_MXNET=1
ARG HOROVOD_GPU_OPERATIONS=NCCL
ARG HOROVOD_WITHOUT_MPI=0
ARG HOROVOD_WITH_MPI=1
ARG HOROVOD_GPU=ROCM

# Append the ROCm library directories. Entries are joined with ':'; the
# previous ';' separator is tolerated by the glibc dynamic loader but makes
# every consumer that splits on ':' (build scripts, tooling) see one bogus
# combined entry.
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib:/opt/rocm/hip/lib

# Export the build args as environment variables (key=value form; the
# space-separated ENV form is deprecated).
ENV HOROVOD_PIP="${HOROVOD_PIP}"
ENV HOROVOD_WITH_TENSORFLOW="${HOROVOD_WITH_TENSORFLOW}"
ENV HOROVOD_WITH_PYTORCH="${HOROVOD_WITH_PYTORCH}"
ENV HOROVOD_WITHOUT_MXNET="${HOROVOD_WITHOUT_MXNET}"
ENV HOROVOD_GPU_OPERATIONS="${HOROVOD_GPU_OPERATIONS}"
ENV HOROVOD_WITHOUT_MPI="${HOROVOD_WITHOUT_MPI}"
ENV HOROVOD_WITH_MPI="${HOROVOD_WITH_MPI}"
ENV HOROVOD_GPU="${HOROVOD_GPU}"
ENV HOROVOD_NCCL_HOME="${HOROVOD_NCCL_HOME}"

# NCCL (RCCL) library directory. When WITH_OFI is set, Horovod is told to
# link it as SHARED and the directory is prepended to LD_LIBRARY_PATH.
ENV NCCL_LIB_DIR=${HOROVOD_NCCL_HOME}/lib
ENV HOROVOD_NCCL_LINK=${WITH_OFI:+SHARED}
ENV LD_LIBRARY_PATH=${WITH_OFI:+$NCCL_LIB_DIR:}$LD_LIBRARY_PATH
# Install Horovod from the pip spec in HOROVOD_PIP.
# "0" is the explicit opt-out. An unset or empty value is skipped as well
# (with a message in the build log): it used to fall through to
# `pip install ""`, which fails with an unhelpful "Invalid requirement" error,
# whereas the other optional pip build args in this file skip when empty.
RUN if [ -n "$HOROVOD_PIP" ] && [ "$HOROVOD_PIP" != "0" ]; then \
        pip install "${HOROVOD_PIP}"; \
    else \
        echo "HOROVOD_PIP='${HOROVOD_PIP}': skipping Horovod install"; \
    fi
# Remove tb-nightly and tensorboardX, then install the ROCm-specific
# additional requirements.
RUN pip uninstall -y tb-nightly tensorboardX \
    && pip install -r /tmp/det_dockerfile_scripts/additional-requirements-rocm.txt

ENV HSA_FORCE_FINE_GRAIN_PCIE=1
# Settings for the AWS plugin build.
ARG AWS_PLUGIN_INSTALL_DIR=/container/aws
ARG WITH_AWS_TRACE
ARG INTERNAL_AWS_DS
ARG INTERNAL_AWS_PATH

# ROCm install root, exported so it is visible at build time and at run time.
ARG ROCM_DIR=/opt/rocm
ENV ROCM_DIR="${ROCM_DIR}"

# Build the plugin only when WITH_OFI is exactly "1".
RUN if [ "$WITH_OFI" = "1" ]; then \
        /tmp/det_dockerfile_scripts/build_aws_rocm.sh \
            "$WITH_OFI" "$WITH_AWS_TRACE" "$WITH_MPICH"; \
    fi

# When WITH_OFI is set, put the plugin install dir in front of
# LD_LIBRARY_PATH, then refresh the dynamic linker cache.
ENV LD_LIBRARY_PATH=${WITH_OFI:+$AWS_PLUGIN_INSTALL_DIR:}$LD_LIBRARY_PATH
RUN ldconfig

# NOTE(review): this prepends $OFI_INSTALL_DIR (the install prefix) rather than
# $OFI_PATH_DIR (its bin directory) to PATH — confirm that is intended.
ENV PATH=$OMPI_PATH_DIR:$OFI_INSTALL_DIR:$PATH
# Reset the entrypoint.
# The entrypoint script can scrape up the host's libfabric.so and then run the
# user command. This is intended to enable performant execution on non-IB
# systems that have a proprietary libfabric.
RUN mkdir -p /container/bin \
    && cp /tmp/det_dockerfile_scripts/scrape_libs.sh /container/bin

# Build-time switches that are also exported into the container environment.
ARG WITH_RCCL=1
ARG WITH_NFS_WORKAROUND=1
ENV WITH_RCCL=$WITH_RCCL \
    WITH_NFS_WORKAROUND=$WITH_NFS_WORKAROUND

ENTRYPOINT ["/container/bin/scrape_libs.sh"]
CMD ["/bin/bash"]

# Final cleanup, run as root: remove everything under /tmp (including the
# copied helper scripts).
USER root
RUN rm -r /tmp/*