Skip to content

Commit 9fc2d0c

Browse files
committed
Pad-8: Upgrade to ROCM 5.6 and add multi-node support. (#230)
* Add docker_scripts; update Makefile and Dockerfile with support for multi-node execution. * Add docker_scripts. * Large change in scrape_libs.sh to fix issues when multiple libfabric.so libs are present; also fixed missing Python libs. * Created a clean branch with all the changes needed for ROCm 5.6 multi-node execution. * Bumped version; updated CircleCI config targets. * Removed a few extraneous comments.
1 parent 622d512 commit 9fc2d0c

File tree

8 files changed

+487
-64
lines changed

8 files changed

+487
-64
lines changed

.circleci/config.yml

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -195,10 +195,13 @@ workflows:
195195
- tf28-gpu
196196
- pt-gpu
197197
- pt2-gpu
198-
- pytorch10-tf27-rocm50
198+
- pytorch13-tf210-rocm56
199+
- pytorch20-tf210-rocm56
199200
exclude:
200201
- with-mpi: 1
201-
image-type: pytorch10-tf27-rocm50
202+
image-type:
203+
- pytorch13-tf210-rocm56
204+
- pytorch20-tf210-rocm56
202205
- build-and-publish-docker:
203206
name: build-and-publish-docker-<<matrix.image-type>>-<<matrix.with-mpi>>
204207
context: determined-production
@@ -257,11 +260,14 @@ workflows:
257260
- tf28-gpu
258261
- pt-gpu
259262
- pt2-gpu
260-
- pytorch10-tf27-rocm50
263+
- pytorch13-tf210-rocm56
264+
- pytorch20-tf210-rocm56
261265
exclude:
262266
- dev-mode: true
263267
with-mpi: 1
264-
image-type: pytorch10-tf27-rocm50
268+
image-type:
269+
- pytorch13-tf210-rocm56
270+
- pytorch20-tf210-rocm56
265271

266272
- build-and-publish-docker:
267273
name: build-and-publish-docker-<<matrix.image-type>>-<<matrix.with-mpi>>-dev

Dockerfile-default-rocm

Lines changed: 154 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,40 +1,117 @@
11
ARG BASE_IMAGE
22
FROM ${BASE_IMAGE}
33

4+
RUN apt remove -y openmpi ucx
5+
# Remove the existing /opt/ompi and replace it with a symlink to our build in /container/ompi.
6+
RUN rm -rf /opt/ompi
7+
RUN ln -s /container/ompi /opt
8+
49
RUN mkdir -p /var/run/sshd
510
RUN rm /etc/apt/sources.list.d/rocm.list
611
RUN apt-get update \
7-
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
8-
autoconf \
9-
automake \
10-
autotools-dev \
11-
build-essential \
12-
ca-certificates \
13-
curl \
14-
daemontools \
15-
libkrb5-dev \
16-
libssl-dev \
17-
libtool \
18-
git \
19-
krb5-user \
20-
cmake \
21-
g++-4.8 \
22-
make \
23-
openssh-client \
24-
openssh-server \
25-
pkg-config \
26-
wget \
27-
nfs-common \
28-
unattended-upgrades \
29-
&& unattended-upgrade \
30-
&& rm -rf /var/lib/apt/lists/* \
31-
&& rm /etc/ssh/ssh_host_ecdsa_key \
32-
&& rm /etc/ssh/ssh_host_ed25519_key \
33-
&& rm /etc/ssh/ssh_host_rsa_key
12+
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
13+
autoconf \
14+
automake \
15+
autotools-dev \
16+
build-essential \
17+
ca-certificates \
18+
curl \
19+
daemontools \
20+
glibc-source \
21+
ibverbs-providers \
22+
libibverbs1 \
23+
libkrb5-dev \
24+
librdmacm1 \
25+
libssl-dev \
26+
libtool \
27+
git \
28+
krb5-user \
29+
cmake \
30+
g++ \
31+
make \
32+
openssh-client \
33+
openssh-server \
34+
pkg-config \
35+
wget \
36+
nfs-common \
37+
libnuma1 \
38+
libnuma-dev \
39+
libpmi2-0-dev \
40+
unattended-upgrades \
41+
&& unattended-upgrade \
42+
&& rm -rf /var/lib/apt/lists/* \
43+
&& rm /etc/ssh/ssh_host_ecdsa_key \
44+
&& rm /etc/ssh/ssh_host_ed25519_key \
45+
&& rm /etc/ssh/ssh_host_rsa_key
46+
# Upgrade pip itself before any further Python packages are installed.
RUN pip install --upgrade pip
3447

3548
COPY dockerfile_scripts /tmp/det_dockerfile_scripts
3649

50+
ENV PATH="/opt/conda/envs/py_3.8/bin:${PATH}"
51+
52+
ARG CONDA="${PATH}"
53+
54+
ENV PYTHONUNBUFFERED=1 PYTHONFAULTHANDLER=1 PYTHONHASHSEED=0
55+
56+
# Install fixed version of FFI package for Ubuntu 20.04.
57+
# This is done after above stuff to make sure we get right version.
58+
RUN /tmp/det_dockerfile_scripts/install_package_fixes.sh
59+
60+
# Install the ROCm libraries. -y answers any confirmation prompt so the
# non-interactive image build cannot stop waiting for input.
RUN apt-get install -y rocm-libs
61+
62+
#USING OFI
63+
ARG WITH_MPI=1
64+
ARG WITH_OFI=1
65+
ARG WITH_MPICH
66+
ARG UCX_INSTALL_DIR=/container/ucx
67+
ARG OMPI_INSTALL_DIR=/container/ompi
68+
ARG MPICH_INSTALL_DIR=/container/mpich
69+
ARG OFI_INSTALL_DIR=/container/ofi
70+
ARG OMPI_WITH_CUDA=0
71+
ARG OMPI_WITH_ROCM=1
72+
RUN if [ "$WITH_MPI" = "1" ]; then /tmp/det_dockerfile_scripts/ompi_rocm.sh "$UBUNTU_VERSION" "$WITH_OFI" "$OMPI_WITH_ROCM" "$WITH_MPICH"; fi
73+
74+
# Make sure OMPI/UCX show up in the right paths
75+
ARG VERBS_LIB_DIR=/usr/lib/libibverbs
76+
ARG UCX_LIB_DIR=${UCX_INSTALL_DIR}/lib:${UCX_INSTALL_DIR}/lib64
77+
ARG UCX_PATH_DIR=${UCX_INSTALL_DIR}/bin
78+
ARG OFI_LIB_DIR=${OFI_INSTALL_DIR}/lib:${OFI_INSTALL_DIR}/lib64
79+
ARG OFI_PATH_DIR=${OFI_INSTALL_DIR}/bin
80+
ARG OMPI_LIB_DIR=${OMPI_INSTALL_DIR}/lib
81+
ARG OMPI_PATH_DIR=${OMPI_INSTALL_DIR}/bin
82+
ARG MPICH_LIB_DIR=${MPICH_INSTALL_DIR}/lib
83+
ARG MPICH_PATH_DIR=${MPICH_INSTALL_DIR}/bin
84+
85+
# Set up UCX_LIBS and OFI_LIBS
86+
ENV UCX_LIBS="${VERBS_LIB_DIR}:${UCX_LIB_DIR}:${OMPI_LIB_DIR}:"
87+
ENV OFI_LIBS="${VERBS_LIB_DIR}:${OFI_LIB_DIR}:${MPICH_LIB_DIR}:"
88+
89+
# If WITH_OFI is true, then set EXTRA_LIBS to OFI libs, else set to empty string
90+
ENV EXTRA_LIBS="${WITH_OFI:+${OFI_LIBS}}"
91+
92+
# If EXTRA_LIBS is empty, set to UCX libs, else leave as OFI libs
93+
ENV EXTRA_LIBS="${EXTRA_LIBS:-${UCX_LIBS}}"
94+
95+
# But, only add them if WITH_MPI
96+
ENV LD_LIBRARY_PATH=${WITH_MPI:+$EXTRA_LIBS}$LD_LIBRARY_PATH
97+
98+
#USING OFI
99+
ENV PATH=${WITH_OFI:+$PATH:${WITH_MPI:+$OFI_PATH_DIR:$MPICH_PATH_DIR}}
100+
101+
#USING UCX
102+
ENV PATH=${PATH:-$CONDA:${WITH_MPI:+$UCX_PATH_DIR:$OMPI_PATH_DIR}}
103+
104+
# Enable running OMPI as root
105+
ENV OMPI_ALLOW_RUN_AS_ROOT ${WITH_MPI:+1}
106+
ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM ${WITH_MPI:+1}
107+
108+
109+
110+
RUN pip install cloudpickle
37111
RUN pip install determined && pip uninstall -y determined
112+
RUN pip install google-auth-oauthlib
113+
114+
38115

39116
RUN pip install -r /tmp/det_dockerfile_scripts/notebook-requirements.txt
40117
ENV JUPYTER_CONFIG_DIR=/run/determined/jupyter/config
@@ -48,7 +125,6 @@ RUN /tmp/det_dockerfile_scripts/install_google_cloud_sdk.sh
48125
# google-api-python-client -> google-api-core -> googleapis-common-protos -> protobuf
49126
# Horovod cannot build with protobuf > 3.20.x
50127
# latest google-api-python-client requires protobuf >= 3.20.1
51-
RUN pip install protobuf==3.20.1
52128

53129
ARG TENSORFLOW_PIP
54130
RUN if [ "$TENSORFLOW_PIP" ]; then pip install $TENSORFLOW_PIP; fi
@@ -69,13 +145,58 @@ ARG HOROVOD_WITH_TENSORFLOW=1
69145
ARG HOROVOD_WITH_PYTORCH=1
70146
ARG HOROVOD_WITHOUT_MXNET=1
71147
ARG HOROVOD_GPU_OPERATIONS=NCCL
72-
ARG HOROVOD_WITHOUT_MPI=1
148+
ARG HOROVOD_WITHOUT_MPI=0
149+
ARG HOROVOD_WITH_MPI=1
73150
ARG HOROVOD_GPU=ROCM
74-
ARG HOROVOD_WITHOUT_MPI=1
75-
ENV LD_LIBRARY_PATH=/opt/rocm/lib:/opt/rocm/hip/lib
151+
# Append the ROCm library directories to the existing search path.
# ':' is the separator used by every other LD_LIBRARY_PATH assignment in this
# Dockerfile; anything that splits the value on ':' would miss a ';'-joined entry.
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib:/opt/rocm/hip/lib
152+
153+
ENV HOROVOD_PIP $HOROVOD_PIP
154+
ENV HOROVOD_WITH_TENSORFLOW $HOROVOD_WITH_TENSORFLOW
155+
ENV HOROVOD_WITH_PYTORCH $HOROVOD_WITH_PYTORCH
156+
ENV HOROVOD_WITHOUT_MXNET $HOROVOD_WITHOUT_MXNET
157+
ENV HOROVOD_GPU_OPERATIONS $HOROVOD_GPU_OPERATIONS
158+
ENV HOROVOD_WITHOUT_MPI $HOROVOD_WITHOUT_MPI
159+
ENV HOROVOD_WITH_MPI $HOROVOD_WITH_MPI
160+
ENV HOROVOD_GPU $HOROVOD_GPU
161+
ENV HOROVOD_NCCL_HOME $HOROVOD_NCCL_HOME
162+
ENV NCCL_LIB_DIR=${HOROVOD_NCCL_HOME}/lib
163+
ENV HOROVOD_NCCL_LINK=${WITH_OFI:+SHARED}
164+
ENV LD_LIBRARY_PATH=${WITH_OFI:+$NCCL_LIB_DIR:}$LD_LIBRARY_PATH
165+
76166
RUN if [ "$HOROVOD_PIP" != "0" ]; then pip install "${HOROVOD_PIP}" ; fi
77167

78-
RUN rm -r /tmp/*
168+
RUN pip uninstall -y tb-nightly tensorboardX
169+
RUN pip install -r /tmp/det_dockerfile_scripts/additional-requirements-rocm.txt
170+
171+
172+
ENV HSA_FORCE_FINE_GRAIN_PCIE=1
173+
174+
ARG AWS_PLUGIN_INSTALL_DIR=/container/aws
175+
ARG WITH_AWS_TRACE
176+
ARG INTERNAL_AWS_DS
177+
ARG INTERNAL_AWS_PATH
178+
ARG ROCM_DIR=/opt/rocm
179+
ENV ROCM_DIR $ROCM_DIR
180+
RUN if [ "$WITH_OFI" = "1" ]; then /tmp/det_dockerfile_scripts/build_aws_rocm.sh "$WITH_OFI" "$WITH_AWS_TRACE" "$WITH_MPICH"; fi
181+
ENV LD_LIBRARY_PATH=${WITH_OFI:+$AWS_PLUGIN_INSTALL_DIR:}$LD_LIBRARY_PATH
182+
RUN ldconfig
183+
184+
ENV PATH=$OMPI_PATH_DIR:$OFI_INSTALL_DIR:$PATH
79185
# Reset entrypoint.
80-
ENTRYPOINT []
186+
187+
# Set an entrypoint that can scrape up the host libfabric.so and then
188+
# run the user command. This is intended to enable performant execution
189+
# on non-IB systems that have a proprietary libfabric.
190+
191+
RUN mkdir -p /container/bin && cp /tmp/det_dockerfile_scripts/scrape_libs.sh /container/bin
192+
193+
ARG WITH_RCCL=1
194+
ENV WITH_RCCL=$WITH_RCCL
195+
ARG WITH_NFS_WORKAROUND=1
196+
ENV WITH_NFS_WORKAROUND=$WITH_NFS_WORKAROUND
197+
198+
ENTRYPOINT ["/container/bin/scrape_libs.sh"]
199+
CMD ["/bin/bash"]
81200
USER root
201+
202+
RUN rm -r /tmp/*

Makefile

Lines changed: 38 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ CUDA_111_PREFIX := $(REGISTRY_REPO):cuda-11.1-
1515
CUDA_112_PREFIX := $(REGISTRY_REPO):cuda-11.2-
1616
CUDA_113_PREFIX := $(REGISTRY_REPO):cuda-11.3-
1717
CUDA_118_PREFIX := $(REGISTRY_REPO):cuda-11.8-
18-
ROCM_50_PREFIX := $(REGISTRY_REPO):rocm-5.0-
18+
ROCM_56_PREFIX := $(REGISTRY_REPO):rocm-5.6-
1919

2020
CPU_SUFFIX := -cpu
2121
GPU_SUFFIX := -gpu
@@ -178,20 +178,38 @@ build-gpu-cuda-118-base:
178178
-t $(DOCKERHUB_REGISTRY)/$(GPU_CUDA_118_BASE_NAME)-$(VERSION) \
179179
.
180180

181-
export ROCM50_TORCH_TF_ENVIRONMENT_NAME := $(ROCM_50_PREFIX)pytorch-1.10-tf-2.7-rocm
182-
export TF_PROFILER_PIP := tensorboard-plugin-profile
183-
export TORCH_TB_PROFILER_PIP := torch-tb-profiler==0.4.1
181+
# Pick the MPI flavour suffix for the ROCm 5.6 / PyTorch 1.13 image name.
# NOTE(review): the fragment assigned here reads "pytorch-1.3", while the target
# is named pytorch13 and its BASE_IMAGE is pytorch_1.13.1 -- possibly meant
# "pytorch-1.13"; confirm before renaming, since it changes the published tag.
ifeq ($(WITH_MPICH),1)
182+
ROCM56_TORCH13_MPI :=pytorch-1.3-tf-2.10-rocm-mpich
183+
else
184+
ROCM56_TORCH13_MPI :=pytorch-1.3-tf-2.10-rocm-ompi
185+
endif
186+
export ROCM56_TORCH13_TF_ENVIRONMENT_NAME := $(ROCM_56_PREFIX)$(ROCM56_TORCH13_MPI)
187+
.PHONY: build-pytorch13-tf210-rocm56
188+
build-pytorch13-tf210-rocm56:
189+
docker build -f Dockerfile-default-rocm \
190+
--build-arg BASE_IMAGE="rocm/pytorch:rocm5.6_ubuntu20.04_py3.8_pytorch_1.13.1" \
191+
--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
192+
--build-arg HOROVOD_PIP="horovod==0.28.1" \
193+
--build-arg WITH_MPICH=$(WITH_MPICH) \
194+
-t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH13_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
195+
-t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH13_TF_ENVIRONMENT_NAME)-$(VERSION) \
196+
.
184197

185-
.PHONY: build-pytorch10-tf27-rocm50
186-
build-pytorch10-tf27-rocm50:
198+
ifeq ($(WITH_MPICH),1)
199+
ROCM56_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-mpich
200+
else
201+
ROCM56_TORCH_MPI :=pytorch-2.0-tf-2.10-rocm-ompi
202+
endif
203+
export ROCM56_TORCH_TF_ENVIRONMENT_NAME := $(ROCM_56_PREFIX)$(ROCM56_TORCH_MPI)
204+
.PHONY: build-pytorch20-tf210-rocm56
205+
build-pytorch20-tf210-rocm56:
187206
docker build -f Dockerfile-default-rocm \
188-
--build-arg BASE_IMAGE="amdih/pytorch:rocm5.0_ubuntu18.04_py3.7_pytorch_1.10.0" \
189-
--build-arg TORCH_TB_PROFILER_PIP="$(TORCH_TB_PROFILER_PIP)" \
190-
--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.7.1" \
191-
--build-arg TF_PROFILER_PIP="$(TF_PROFILER_PIP)" \
192-
--build-arg HOROVOD_PIP="horovod==0.25.0" \
193-
-t $(DOCKERHUB_REGISTRY)/$(ROCM50_TORCH_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
194-
-t $(DOCKERHUB_REGISTRY)/$(ROCM50_TORCH_TF_ENVIRONMENT_NAME)-$(VERSION) \
207+
--build-arg BASE_IMAGE="rocm/pytorch:rocm5.6_ubuntu20.04_py3.8_pytorch_2.0.1" \
208+
--build-arg TENSORFLOW_PIP="tensorflow-rocm==2.10.1.540" \
209+
--build-arg HOROVOD_PIP="horovod==0.28.1" \
210+
--build-arg WITH_MPICH=$(WITH_MPICH) \
211+
-t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH_TF_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
212+
-t $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH_TF_ENVIRONMENT_NAME)-$(VERSION) \
195213
.
196214

197215
DEEPSPEED_VERSION := 0.8.3
@@ -520,9 +538,13 @@ ifneq ($(NGC_PUBLISH),)
520538
scripts/publish-docker.sh tf28-gpu-$(WITH_MPI) $(NGC_REGISTRY)/$(GPU_TF28_ENVIRONMENT_NAME) $(SHORT_GIT_HASH) $(VERSION)
521539
endif
522540

523-
.PHONY: publish-pytorch10-tf27-rocm50
524-
publish-pytorch10-tf27-rocm50:
525-
scripts/publish-docker.sh pytorch10-tf27-rocm50-$(WITH_MPI) $(DOCKERHUB_REGISTRY)/$(ROCM50_TORCH_TF_ENVIRONMENT_NAME) $(SHORT_GIT_HASH) $(VERSION) $(ARTIFACTS_DIR)
541+
.PHONY: publish-pytorch13-tf210-rocm56
542+
publish-pytorch13-tf210-rocm56:
543+
scripts/publish-docker.sh pytorch13-tf210-rocm56-$(WITH_MPI) $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH13_TF_ENVIRONMENT_NAME) $(SHORT_GIT_HASH) $(VERSION) $(ARTIFACTS_DIR)
544+
545+
.PHONY: publish-pytorch20-tf210-rocm56
546+
publish-pytorch20-tf210-rocm56:
547+
scripts/publish-docker.sh pytorch20-tf210-rocm56-$(WITH_MPI) $(DOCKERHUB_REGISTRY)/$(ROCM56_TORCH_TF_ENVIRONMENT_NAME) $(SHORT_GIT_HASH) $(VERSION) $(ARTIFACTS_DIR)
526548

527549
.PHONY: publish-cloud-images
528550
publish-cloud-images:

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
0.26.4
1+
0.26.5
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
attrdict3
2+
pandas
3+
matplotlib
4+
tensorflow-datasets==1.3.2
5+
Keras-Preprocessing[image]
6+
# TODO(DET-4259) Remove this when we fix the circular dependency with the main repo.
7+
petname
8+
azure-storage-blob
9+
Pillow>=8.3.2,<=9.5.0
10+
analytics-python
11+
nvidia-ml-py
12+
protobuf<=3.20.3
13+
tensorboard==2.10.1
14+
pynvml
15+
tokenizers==0.13.0
16+
huggingface-hub==0.16.4

0 commit comments

Comments
 (0)