# The base image is supplied at build time via --build-arg BASE_IMAGE=<image>.
ARG BASE_IMAGE
FROM ${BASE_IMAGE}

# Remove the openmpi and ucx packages (presumably shipped by the base image).
# apt-get is used rather than apt: apt's command line is not stable for scripts.
RUN apt-get remove -y openmpi ucx

# Drop any existing /opt/ompi and point it at our own build under /container/ompi.
# Both steps share one layer so the removal and the link always go together.
RUN rm -rf /opt/ompi \
    && ln -s /container/ompi /opt/ompi

# Runtime directory for sshd.
RUN mkdir -p /var/run/sshd

# Remove the ROCm apt source list before the package index is refreshed below.
RUN rm /etc/apt/sources.list.d/rocm.list
# Install the system packages (listed alphabetically), run unattended-upgrade,
# and clear the apt lists in the same layer that created them.
# The three SSH host keys are deleted afterwards, presumably so that no
# host key is baked into the image -- TODO confirm.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        autoconf \
        automake \
        autotools-dev \
        build-essential \
        ca-certificates \
        cmake \
        curl \
        daemontools \
        g++ \
        git \
        glibc-source \
        ibverbs-providers \
        krb5-user \
        libibverbs1 \
        libkrb5-dev \
        libnuma-dev \
        libnuma1 \
        libpmi2-0-dev \
        librdmacm1 \
        libssl-dev \
        libtool \
        make \
        nfs-common \
        openssh-client \
        openssh-server \
        pkg-config \
        unattended-upgrades \
        wget \
    && unattended-upgrade \
    && rm -rf /var/lib/apt/lists/* \
    && rm /etc/ssh/ssh_host_ecdsa_key \
    && rm /etc/ssh/ssh_host_ed25519_key \
    && rm /etc/ssh/ssh_host_rsa_key

# Upgrade pip itself. "pip install" must appear only once: repeating it makes
# pip treat the words "pip" and "install" as package names.
RUN pip install --upgrade pip

# Helper scripts used by the RUN steps below.
COPY dockerfile_scripts /tmp/det_dockerfile_scripts

# Put the py_3.8 conda environment first on PATH.
ENV PATH="/opt/conda/envs/py_3.8/bin:${PATH}"

# Snapshot of PATH at this point (conda env included). The "USING UCX" PATH
# assignment further down falls back to this value.
ARG CONDA="${PATH}"

ENV PYTHONUNBUFFERED=1 PYTHONFAULTHANDLER=1 PYTHONHASHSEED=0

# Install fixed version of FFI package for Ubuntu 20.04.
# This is done after the steps above to make sure we get the right version.
RUN /tmp/det_dockerfile_scripts/install_package_fixes.sh

# The apt lists were deleted after the main package install, so refresh them
# first; -y and DEBIAN_FRONTEND keep the install from stopping at a prompt.
# NOTE(review): rocm.list was removed earlier in this file -- confirm that a
# repository providing rocm-libs is configured at this point.
# NOTE(review): the apt lists are left in place because later build scripts
# may install packages -- confirm, then clean them up in this layer.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y rocm-libs

# MPI build switches. MPI and OFI are enabled by default; WITH_MPICH has no
# default and is therefore empty unless passed with --build-arg.
ARG WITH_MPI=1
ARG WITH_OFI=1
ARG WITH_MPICH

# Install prefixes for the communication stacks.
ARG UCX_INSTALL_DIR=/container/ucx
ARG OMPI_INSTALL_DIR=/container/ompi
ARG MPICH_INSTALL_DIR=/container/mpich
ARG OFI_INSTALL_DIR=/container/ofi

# GPU flavour switches: CUDA off, ROCm on by default.
ARG OMPI_WITH_CUDA=0
ARG OMPI_WITH_ROCM=1

# Run the MPI build script only when WITH_MPI is exactly "1".
# NOTE(review): UBUNTU_VERSION is not declared in the visible part of this
# file; presumably it comes from the base image environment -- confirm.
RUN if [ "$WITH_MPI" = "1" ]; then \
        /tmp/det_dockerfile_scripts/ompi_rocm.sh "$UBUNTU_VERSION" "$WITH_OFI" "$OMPI_WITH_ROCM" "$WITH_MPICH"; \
    fi

# Make sure OMPI/UCX show up in the right paths.
ARG VERBS_LIB_DIR=/usr/lib/libibverbs
ARG UCX_LIB_DIR=${UCX_INSTALL_DIR}/lib:${UCX_INSTALL_DIR}/lib64
ARG UCX_PATH_DIR=${UCX_INSTALL_DIR}/bin
ARG OFI_LIB_DIR=${OFI_INSTALL_DIR}/lib:${OFI_INSTALL_DIR}/lib64
ARG OFI_PATH_DIR=${OFI_INSTALL_DIR}/bin
ARG OMPI_LIB_DIR=${OMPI_INSTALL_DIR}/lib
ARG OMPI_PATH_DIR=${OMPI_INSTALL_DIR}/bin
ARG MPICH_LIB_DIR=${MPICH_INSTALL_DIR}/lib
ARG MPICH_PATH_DIR=${MPICH_INSTALL_DIR}/bin

# Candidate library search lists. Each ends with ":" so that it can be
# prepended directly to LD_LIBRARY_PATH below.
ENV UCX_LIBS="${VERBS_LIB_DIR}:${UCX_LIB_DIR}:${OMPI_LIB_DIR}:"
ENV OFI_LIBS="${VERBS_LIB_DIR}:${OFI_LIB_DIR}:${MPICH_LIB_DIR}:"

# The statements below depend on their order; do not rearrange them.
# NOTE(review): ${VAR:+word} selects "word" for ANY non-empty value, so
# WITH_OFI=0 or WITH_MPI=0 still counts as enabled here; only an empty value
# disables the branch.

# If WITH_OFI is non-empty, set EXTRA_LIBS to the OFI libs, else to the empty string.
ENV EXTRA_LIBS="${WITH_OFI:+${OFI_LIBS}}"

# If EXTRA_LIBS is empty, set it to the UCX libs, else leave it as the OFI libs.
ENV EXTRA_LIBS="${EXTRA_LIBS:-${UCX_LIBS}}"

# But only add them when WITH_MPI is non-empty.
ENV LD_LIBRARY_PATH=${WITH_MPI:+$EXTRA_LIBS}$LD_LIBRARY_PATH

# USING OFI: append the OFI and MPICH bin directories to PATH. When WITH_OFI
# is empty this deliberately leaves PATH empty so the next statement takes over.
ENV PATH=${WITH_OFI:+$PATH:${WITH_MPI:+$OFI_PATH_DIR:$MPICH_PATH_DIR}}

# USING UCX: if PATH was emptied above, rebuild it from the saved CONDA
# snapshot plus the UCX and OMPI bin directories.
ENV PATH=${PATH:-$CONDA:${WITH_MPI:+$UCX_PATH_DIR:$OMPI_PATH_DIR}}

# Enable running OMPI as root (both variables are set to 1 when WITH_MPI is non-empty).
ENV OMPI_ALLOW_RUN_AS_ROOT=${WITH_MPI:+1}
ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=${WITH_MPI:+1}

# Python packages installed into the active environment.
RUN pip install cloudpickle

# Install determined and immediately uninstall it again; whatever it pulled in
# as dependencies stays installed.
RUN pip install determined && pip uninstall -y determined

RUN pip install google-auth-oauthlib

# Notebook dependencies, from the requirements file copied with the scripts.
RUN pip install -r /tmp/det_dockerfile_scripts/notebook-requirements.txt

# Directory Jupyter reads its configuration from.
ENV JUPYTER_CONFIG_DIR=/run/determined/jupyter/config
# [excerpt gap] Lines 118-124 of this file are not shown here. Section context for what follows: RUN /tmp/det_dockerfile_scripts/install_google_cloud_sdk.sh
# Dependency chain to keep in mind:
#   google-api-python-client -> google-api-core -> googleapis-common-protos -> protobuf
# Horovod cannot build with protobuf > 3.20.x, while the latest
# google-api-python-client requires protobuf >= 3.20.1.
# NOTE(review): no protobuf pin follows this note in the visible file --
# confirm that the constraint is enforced elsewhere (e.g. a requirements file).

# Optional TensorFlow install: runs only when TENSORFLOW_PIP is non-empty.
# The value is left unquoted so it may carry several pip arguments.
ARG TENSORFLOW_PIP
RUN if [ "$TENSORFLOW_PIP" ]; then \
        pip install $TENSORFLOW_PIP; \
    fi
# [excerpt gap] Lines 131-144 of this file are not shown here. Section context for what follows: ARG HOROVOD_WITH_TENSORFLOW=1
# Horovod build switches: PyTorch on, MXNet off, NCCL GPU operations,
# MPI enabled, ROCm GPU backend.
ARG HOROVOD_WITH_PYTORCH=1
ARG HOROVOD_WITHOUT_MXNET=1
ARG HOROVOD_GPU_OPERATIONS=NCCL
ARG HOROVOD_WITHOUT_MPI=0
ARG HOROVOD_WITH_MPI=1
ARG HOROVOD_GPU=ROCM

# Append the ROCm library directories. Entries are joined with ":" like every
# other LD_LIBRARY_PATH assignment in this file.
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib:/opt/rocm/hip/lib

# Persist the Horovod build settings in the image environment.
# NOTE(review): HOROVOD_PIP, HOROVOD_WITH_TENSORFLOW and HOROVOD_NCCL_HOME are
# not declared in the visible part of this file; presumably they are ARGs
# declared above -- confirm.
ENV HOROVOD_PIP="${HOROVOD_PIP}"
ENV HOROVOD_WITH_TENSORFLOW="${HOROVOD_WITH_TENSORFLOW}"
ENV HOROVOD_WITH_PYTORCH="${HOROVOD_WITH_PYTORCH}"
ENV HOROVOD_WITHOUT_MXNET="${HOROVOD_WITHOUT_MXNET}"
ENV HOROVOD_GPU_OPERATIONS="${HOROVOD_GPU_OPERATIONS}"
ENV HOROVOD_WITHOUT_MPI="${HOROVOD_WITHOUT_MPI}"
ENV HOROVOD_WITH_MPI="${HOROVOD_WITH_MPI}"
ENV HOROVOD_GPU="${HOROVOD_GPU}"
ENV HOROVOD_NCCL_HOME="${HOROVOD_NCCL_HOME}"
ENV NCCL_LIB_DIR=${HOROVOD_NCCL_HOME}/lib

# When WITH_OFI is non-empty, link NCCL as a shared library and put its lib
# directory first on the library search path.
ENV HOROVOD_NCCL_LINK=${WITH_OFI:+SHARED}
ENV LD_LIBRARY_PATH=${WITH_OFI:+$NCCL_LIB_DIR:}$LD_LIBRARY_PATH

# Install Horovod unless HOROVOD_PIP is the literal string "0".
RUN if [ "$HOROVOD_PIP" != "0" ]; then pip install "${HOROVOD_PIP}" ; fi

# Remove tb-nightly and tensorboardX, then install the ROCm-specific
# additional requirements.
RUN pip uninstall -y tb-nightly tensorboardX
RUN pip install -r /tmp/det_dockerfile_scripts/additional-requirements-rocm.txt

ENV HSA_FORCE_FINE_GRAIN_PCIE=1

# Settings for the AWS plugin build.
ARG AWS_PLUGIN_INSTALL_DIR=/container/aws
ARG WITH_AWS_TRACE
ARG INTERNAL_AWS_DS
ARG INTERNAL_AWS_PATH
ARG ROCM_DIR=/opt/rocm
ENV ROCM_DIR=$ROCM_DIR

# Build the plugin only when WITH_OFI is exactly "1".
RUN if [ "$WITH_OFI" = "1" ]; then /tmp/det_dockerfile_scripts/build_aws_rocm.sh "$WITH_OFI" "$WITH_AWS_TRACE" "$WITH_MPICH"; fi

# When WITH_OFI is non-empty, put the plugin install directory first on the
# library search path, then refresh the dynamic linker cache.
ENV LD_LIBRARY_PATH=${WITH_OFI:+$AWS_PLUGIN_INSTALL_DIR:}$LD_LIBRARY_PATH
RUN ldconfig

# NOTE(review): this prepends OFI_INSTALL_DIR (the install prefix) rather than
# OFI_PATH_DIR (its bin directory) -- confirm which one is intended.
ENV PATH=$OMPI_PATH_DIR:$OFI_INSTALL_DIR:$PATH
# Set an entrypoint that can scrape up the host libfabric.so and then
# run the user command. This is intended to enable performant execution
# on non-IB systems that have a proprietary libfabric.

RUN mkdir -p /container/bin && cp /tmp/det_dockerfile_scripts/scrape_libs.sh /container/bin

# Export these switches into the runtime environment; presumably
# scrape_libs.sh reads them -- TODO confirm.
ARG WITH_RCCL=1
ENV WITH_RCCL=$WITH_RCCL
ARG WITH_NFS_WORKAROUND=1
ENV WITH_NFS_WORKAROUND=$WITH_NFS_WORKAROUND

ENTRYPOINT ["/container/bin/scrape_libs.sh"]
CMD ["/bin/bash"]
USER root

# Remove the build scripts and other temporary files. -f keeps the step from
# failing if /tmp is already empty. This hides the files in the final image but
# does not reclaim the space they occupy in earlier layers.
RUN rm -rf /tmp/*
0 commit comments