Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 34 additions & 9 deletions micro-benchmarks/nccl-tests/nccl-tests.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
openssh-server \
pkg-config \
python3-distutils \
libhwloc-dev \
vim
# Purge every installed package whose name matches cuda-compat-*.
# The pattern is quoted so that apt-get receives it verbatim; unquoted, the
# shell could expand it against files in the current working directory.
RUN apt-get purge -y 'cuda-compat-*'

Expand All @@ -53,10 +54,6 @@ RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config &&
echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config

# Set paths for both aarch64 and x86_64
ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/amazon/ofi-nccl/lib/aarch64-linux-gnu:/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu:/usr/local/lib:$LD_LIBRARY_PATH
ENV PATH=/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH

# Bootstrap pip with get-pip.py, then install awscli and pynvml.
# --fail makes curl exit non-zero on an HTTP error instead of saving the error
# page and handing it to python3; -sS keeps the log quiet but still prints
# errors; -L follows redirects.
# --no-cache-dir keeps pip's download cache out of the layer, and the bootstrap
# script is deleted in the same RUN so it is not stored in the image.
RUN curl -fsSL https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py \
    && python3 /tmp/get-pip.py --no-cache-dir \
    && pip3 install --no-cache-dir awscli pynvml \
    && rm -f /tmp/get-pip.py
Expand All @@ -68,10 +65,10 @@ RUN curl https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py \
## that the cuda-compat-xx-x package is the latest.
RUN git clone -b ${GDRCOPY_VERSION} https://github.com/NVIDIA/gdrcopy.git /tmp/gdrcopy \
&& cd /tmp/gdrcopy \
&& make prefix=/opt/gdrcopy install
&& make prefix=/opt/gdrcopy install \
&& echo "/opt/gdrcopy/lib" > /etc/ld.so.conf.d/000_gdrcopy.conf \
&& ldconfig

ENV LD_LIBRARY_PATH=/opt/gdrcopy/lib:$LD_LIBRARY_PATH
ENV LIBRARY_PATH=/opt/gdrcopy/lib:$LIBRARY_PATH
ENV CPATH=/opt/gdrcopy/include:$CPATH
ENV PATH=/opt/gdrcopy/bin:$PATH

Expand All @@ -82,14 +79,23 @@ RUN cd $HOME \
&& tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
&& cd aws-efa-installer \
&& ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
&& rm -rf $HOME/aws-efa-installer
&& rm -rf $HOME/aws-efa-installer \
&& echo "/opt/amazon/openmpi/lib" > /etc/ld.so.conf.d/000_efa_ompi.conf \
&& ldconfig

# Prepend the ofi-nccl library directories to the loader search path.
# Both the aarch64 and the x86_64 directory are listed so the same line works
# on either architecture.
ENV LD_LIBRARY_PATH="/opt/amazon/ofi-nccl/lib/aarch64-linux-gnu:/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}"

# Put the Open MPI and EFA binary directories ahead of the existing PATH entries.
ENV PATH="/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:${PATH}"

###################################################
## Install NCCL
RUN git clone -b ${NCCL_VERSION} https://github.com/NVIDIA/nccl.git /opt/nccl \
&& cd /opt/nccl \
&& make -j $(nproc) src.build CUDA_HOME=/usr/local/cuda \
NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_100,code=sm_100"
NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_100,code=sm_100" \
&& echo "/opt/nccl/build/lib" > /etc/ld.so.conf.d/000_nccl.conf \
&& ldconfig

###################################################
## Install NCCL-tests
Expand All @@ -102,6 +108,25 @@ RUN git clone -b ${NCCL_TESTS_VERSION} https://github.com/NVIDIA/nccl-tests.git
NCCL_HOME=/opt/nccl/build \
NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_100,code=sm_100"

###################################################
## Install AWS OFI NCCL
## Clones aws-ofi-nccl at ${AWS_OFI_NCCL_VERSION} into /opt/aws-ofi-nccl and
## builds it against libfabric (/opt/amazon/efa), NCCL (/opt/nccl/build),
## Open MPI (/opt/amazon/openmpi) and CUDA (/usr/local/cuda).
## The install prefix is /opt/aws-ofi-nccl/build, i.e. inside the source tree.
## Tests, LTTng, valgrind and -Werror are switched off via configure flags.
RUN git clone -b ${AWS_OFI_NCCL_VERSION} https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl \
    && cd /opt/aws-ofi-nccl \
    && ./autogen.sh \
    && ./configure \
        --with-libfabric=/opt/amazon/efa \
        --prefix=/opt/aws-ofi-nccl/build \
        --with-nccl=/opt/nccl/build \
        --with-mpi=/opt/amazon/openmpi \
        --enable-platform-aws \
        --with-cuda=/usr/local/cuda \
        --enable-cudart-dynamic \
        --disable-tests \
        --without-lttng \
        --without-valgrind \
        --disable-werror \
    && make -j $(nproc) \
    && make install

# Delete the apt package index files.
# NOTE(review): as a standalone RUN this only masks the files in a new layer;
# the layer that ran apt-get still carries them, so the image does not shrink.
# Consider moving this cleanup into the RUN that performs the apt-get install.
RUN rm -rf /var/lib/apt/lists/*

## Set Open MPI variables to exclude network interface and conduit.
Expand Down