Skip to content
This repository was archived by the owner on Nov 1, 2024. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions .devcontainer/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# NOTE(review): the base image is referenced through the mutable `latest` tag, so
# rebuilds are not reproducible; consider pinning a specific tag or digest.
FROM metaseqopt.azurecr.io/metaseq-opt-singularity:latest
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You will likely have to change the base image.
I think this image is public, but there is no guarantee that it will be preserved.
It also contains some code for a private cluster that you would not need and that adds extra complexity.


# Switch to aiscuser, the user we want to use in the devcontainer.
# Note: this is the non-root user configured in the metaseq-opt-singularity
# image.
USER aiscuser

# Python tooling, pinned to exact versions.
# --no-cache-dir keeps pip's download cache out of the image layers.
RUN pip install --no-cache-dir pre-commit==3.0.2
# amlt is resolved with an extra package index in addition to the default one.
RUN pip install --no-cache-dir amlt==9.12.1 --extra-index-url https://msrpypi.azurewebsites.net/stable/leloojoo

# Install zsh and oh-my-zsh.
# - apt-get is used instead of apt, whose command-line interface is not meant
#   for use in scripts.
# - The login shell is changed for the current build user, resolved with
#   `whoami`. Docker's USER instruction does not export $USER, so relying on it
#   can expand to nothing and make `sudo chsh` act on root instead.
# - oh-my-zsh is installed non-interactively (--unattended).
RUN sudo apt-get update && \
    sudo apt-get install -yq zsh && \
    sudo chsh -s /usr/bin/zsh "$(whoami)" && \
    sh -c "$(curl -fsSL https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh)" "" --unattended

# Install azcopy, a tool to copy to/from blob storage, into ~/.local/bin and
# set AZCOPY_CONCURRENCY_VALUE=AUTO for zsh sessions.
# For more info: https://learn.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-blobs-upload#upload-a-file
RUN AZCOPY_PKG=azcopy_linux_amd64_10.17.0 && \
    wget "https://azcopyvnext.azureedge.net/release20230123/${AZCOPY_PKG}.tar.gz" && \
    tar xvf "${AZCOPY_PKG}.tar.gz" && \
    mkdir -p ~/.local/bin && \
    mv "${AZCOPY_PKG}/azcopy" ~/.local/bin && \
    chmod +x ~/.local/bin/azcopy && \
    echo "export AZCOPY_CONCURRENCY_VALUE=AUTO" >> ~/.zshrc && \
    rm -rf azcopy_linux_amd64*

# Set up Rust-based tools.
# rustup installs the toolchain under the build user's home directory
# (~/.cargo). The PATH entry has to be an absolute path: ENV values are not
# tilde-expanded, so "~/.cargo/bin" would be stored literally and a POSIX
# /bin/sh does not expand a literal "~" while searching PATH, which can leave
# `cargo` unresolved in the RUN step below.
# NOTE(review): /home/aiscuser is assumed to be the home directory of aiscuser;
# confirm against the base image.
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | bash -s -- -y
ENV PATH="/home/aiscuser/.cargo/bin:${PATH}"

# --locked builds each tool with the dependency versions from its Cargo.lock.
RUN cargo install --locked \
    fd-find \
    ripgrep \
    http-server \
    watchexec-cli

# Add user tools.
# The package index is refreshed in the same layer as the install so a stale
# index cached in an earlier layer is never used, and the downloaded lists are
# removed afterwards to keep the layer small.
RUN sudo apt-get update && \
    sudo apt-get install -yq \
        jq \
        jp \
        tree \
        tldr && \
    sudo rm -rf /var/lib/apt/lists/*

# Initialize conda for zsh, the shell installed earlier in this file.
RUN conda init zsh
7 changes: 7 additions & 0 deletions .devcontainer/devcontainer.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Environment variables for the dev container. This file is handed to the
# container runtime through the `--env-file` entry of `runArgs` in
# devcontainer.json.
# Every value here is the placeholder `<removed>`; supply real values locally.
SUBSCRIPTION_NAME=<removed>
SUBSCRIPTION_ID=<removed>
RESOURCE_GROUP_NAME=<removed>
STORAGE_ACCOUNT_NAME=<removed>
STORAGE_CONTAINER_DATA=<removed>
STORAGE_CONTAINER_EXP=<removed>
STORAGE_CONTAINER_OUTPUT=<removed>
70 changes: 70 additions & 0 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
// For format details and config options, see https://aka.ms/devcontainer.json.
{
    "name": "Metaseq",
    // Build the image from the Dockerfile next to this file, using the parent
    // directory as the build context.
    "build": {
        "dockerfile": "Dockerfile",
        "context": "..",
        "args": {}
    },
    // Features to add to the dev container. More info: https://containers.dev/features.
    "features": {
        "ghcr.io/devcontainers/features/docker-in-docker:2": {},
        "ghcr.io/devcontainers/features/azure-cli:1": {},
        "ghcr.io/devcontainers/features/powershell:1": {}
    },
    // Runs once after the container has been created.
    "postCreateCommand": "bash ./.devcontainer/postCreateCommand.sh",
    "customizations": {
        "vscode": {
            "settings": {
                "autoDocstring.docstringFormat": "one-line-sphinx",
                "python.analysis.autoImportCompletions": true,
                "python.analysis.autoImportUserSymbols": true,
                "python.formatting.provider": "yapf",
                "python.linting.enabled": true,
                "python.linting.flake8Enabled": true,
                "python.defaultInterpreterPath": "/opt/conda/envs/ptca/bin/python",
                "isort.check": true,
                "dev.containers.copyGitConfig": true,
                "terminal.integrated.defaultProfile.linux": "zsh",
                "terminal.integrated.profiles.linux": {
                    "zsh": {
                        "path": "/usr/bin/zsh"
                    }
                }
            },
            "extensions": [
                "aaron-bond.better-comments",
                "eamodio.gitlens",
                "EditorConfig.EditorConfig",
                "foxundermoon.shell-format",
                "ms-azuretools.vscode-docker",
                "ms-python.python",
                "ms-python.vscode-pylance",
                "redhat.vscode-yaml",
                "stkb.rewrap",
                "yzhang.markdown-all-in-one",
                "njpwerner.autodocstring",
                "mhutchie.git-graph",
                "GitHub.copilot",
                "GitHub.copilot-labs",
                "lehoanganh298.json-lines-viewer"
            ]
        }
    },
    // Host directories under $HOME/workspace/mnt are bind-mounted into /mnt.
    "mounts": [
        "type=bind,source=${localEnv:HOME}/workspace/mnt/nlg-distill,target=/mnt/input_data_dir",
        "type=bind,source=${localEnv:HOME}/workspace/mnt/amulet-output,target=/mnt/output_dir"
    ],
    // Expose all GPUs, share the host IPC namespace, lift the locked-memory
    // limit and load environment variables from devcontainer.env.
    "runArgs": [
        "--gpus",
        "all",
        "--ipc",
        "host",
        "--ulimit",
        "memlock=-1",
        "--env-file",
        ".devcontainer/devcontainer.env"
    ],
    // This user must match the name of the user used in the singularity image.
    "remoteUser": "aiscuser"
}
15 changes: 15 additions & 0 deletions .devcontainer/postCreateCommand.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/usr/bin/env bash
# Post-create hook for the dev container: configures git, installs the
# pre-commit hooks, installs metaseq into the `ptca` conda environment and
# builds the HTML docs. All paths are relative, so it is expected to be run
# from the repository root.

# Abort on the first failing command (or use of an unset variable) so a broken
# setup is reported instead of being silently followed by the remaining steps.
set -euo pipefail

git config --global safe.directory '*'
git config --global core.editor "code --wait"
git config --global pager.branch false

# Install pre-commit hooks
pre-commit install

# Install metaseq and dependencies (including the dev and docs extras)
conda run -n ptca pip install --user -e ".[dev,docs]"
conda run -n ptca python setup.py develop --user

# Generate docs. `make -C` runs inside docs/ without changing this shell's
# working directory, so a missing docs/ directory fails cleanly instead of
# running `make html` in the wrong place.
make -C docs html
198 changes: 159 additions & 39 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,51 +1,171 @@
FROM nvidia/cuda:11.3.1-devel-ubuntu20.04
FROM singularitybase.azurecr.io/base/job/pytorch/ptca-1.13.1-cuda11.7:20230112T151134502

ARG DEBIAN_FRONTEND=noninteractive
##############################################################################
# Temporary Installation Directory
##############################################################################
# World-writable scratch directory that the build steps in this file clone and
# compile sources in.
ENV STAGE_DIR=/tmp/stage_dir
RUN install -d -m 0777 "${STAGE_DIR}"

RUN mkdir -p /build
WORKDIR /build
##############################################################################
# Installation/Basic Utilities
##############################################################################
# Packages are listed once each, in alphabetical order (the previous list named
# libaio-dev and net-tools twice). libnuma-dev is required by MLNX.
# The apt package lists are removed in the same layer that downloads them;
# deleting them in a later layer would not make the image any smaller.
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && \
    apt-get install -yq --no-install-recommends \
        autoconf \
        automake \
        autotools-dev \
        build-essential \
        ca-certificates \
        curl \
        default-jdk \
        dmidecode \
        dos2unix \
        emacs \
        fuse \
        g++ \
        gcc \
        git \
        git-lfs \
        graphviz \
        htop \
        iftop \
        iotop \
        iputils-ping \
        jq \
        less \
        libaio-dev \
        libcap2 \
        libfuse-dev \
        libnuma-dev \
        libtool \
        llvm-10-dev \
        lsb-release \
        lshw \
        net-tools \
        ninja-build \
        npm \
        openssh-client \
        openssh-server \
        pciutils \
        pdsh \
        perftest \
        psmisc \
        rsync \
        software-properties-common \
        sudo \
        tmux \
        unzip \
        util-linux \
        vim \
        wget && \
    rm -rf /var/lib/apt/lists/*

RUN apt-key del 7fa2af80 && \
apt-get -qq update && \
apt-get -qq install -y --no-install-recommends curl && \
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb && \
dpkg -i cuda-keyring_1.0-1_all.deb
# Symlink (cp -s) the system lsb_release.py module into the site-packages
# directory of the ptca conda environment.
RUN cp -s /usr/share/pyshared/lsb_release.py /opt/conda/envs/ptca/lib/python3.8/site-packages/lsb_release.py
# Clear apt's cache of downloaded package files.
RUN apt-get clean -y all

RUN apt-get -qq update \
&& apt-get -qq install -y --no-install-recommends \
git \
python3-pip python3-dev
# Remove apt intermediate files (the downloaded package lists)
RUN rm -rf /var/lib/apt/lists/*

# Install Pytorch
RUN pip3 install torch==1.10.1+cu113 torchvision==0.11.2+cu113 torchaudio==0.10.1+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
##############################################################################
# Mellanox OFED
##############################################################################
# Streams the MLNX_OFED tarball for Ubuntu 20.04 straight into tar, runs the
# bundled installer (user-space only, no firmware update) and deletes the
# unpacked tree in the same layer.
ENV MLNX_OFED_VERSION=5.1-2.5.8.0
RUN MLNX_PKG=MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64 && \
    cd ${STAGE_DIR} && \
    wget -q -O - http://content.mellanox.com/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/${MLNX_PKG}.tgz | tar xzf - && \
    cd ${MLNX_PKG} && \
    ./mlnxofedinstall --user-space-only --without-fw-update --force --all -q --skip-unsupported-devices-check && \
    rm -rf ${STAGE_DIR}/${MLNX_PKG}*

# Install APEX
RUN git clone https://github.com/NVIDIA/apex.git
WORKDIR /build/apex
##############################################################################
# Python (MLNX 5.1 requires python2 .......)
##############################################################################
# Re-point /usr/bin/python, /usr/bin/python3 and /usr/bin/pip at the ptca conda
# environment, then print the python and pip versions.
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHON_VERSION=3
RUN rm -f /usr/bin/python /usr/bin/python3 /usr/bin/pip && \
    for link in python3 python; do \
        ln -s /opt/conda/envs/ptca/bin/python3.8 /usr/bin/${link} || exit 1; \
    done && \
    ln -s /opt/conda/envs/ptca/bin/pip /usr/bin/pip && \
    python -V && pip -V

RUN git checkout 265b451de8ba9bfcb67edc7360f3d8772d0a8bea
RUN pip3 install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--deprecated_fused_adam" --global-option="--xentropy" --global-option="--fast_multihead_attn" ./
##############################################################################
# nv_peer_mem
##############################################################################
# NV_PEER_MEM_TAG is the git ref that gets cloned and the version string in the
# .deb file name; NV_PEER_MEM_VERSION is the version string in the source
# tarball and build directory names.
ENV NV_PEER_MEM_VERSION=1.1
ENV NV_PEER_MEM_TAG=1.1-0
# Clone Mellanox/nv_peer_memory at NV_PEER_MEM_TAG and run its build_module.sh
# (presumably this is what writes /tmp/nvidia-peer-memory_<version>.orig.tar.gz,
# which the next steps unpack - verify). Then install dkms, build the Debian
# package with dpkg-buildpackage (unsigned: -us -uc), install it with dpkg, and
# delete the apt lists, sources, tarball and build tree in the same layer.
RUN mkdir -p ${STAGE_DIR} && \
git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory && \
cd ${STAGE_DIR}/nv_peer_memory && \
./build_module.sh && \
cd /tmp && \
tar xzf /tmp/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz && \
cd nvidia-peer-memory-${NV_PEER_MEM_VERSION} && \
apt-get update && \
apt-get install -y dkms && \
dpkg-buildpackage -us -uc && \
dpkg -i /tmp/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb && \
rm -rf /var/lib/apt/lists/* ${STAGE_DIR}/nv_peer_memory /tmp/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz /tmp/nvidia-peer-memory-${NV_PEER_MEM_VERSION}

# Install Megatron-LM branch
WORKDIR /build
##############################################################################
# NCCL RDMA Sharp plugin
##############################################################################
# Builds the v2.0.x-ar ref of Mellanox/nccl-rdma-sharp-plugins against the CUDA
# toolkit in /usr/local/cuda and installs it under
# /usr/local/nccl-rdma-sharp-plugins.
#
# The former trailing `LD_LIBRARY_PATH=...` and `LD_PRELOAD=...` shell
# assignments were dropped: variables assigned inside a RUN step disappear when
# that step's shell exits, so they never affected the image. The library search
# path is configured by the `ENV LD_LIBRARY_PATH` instruction in this file.
# NOTE(review): nothing sets LD_PRELOAD for libnccl-net.so persistently; add an
# ENV instruction if preloading was actually intended.
#
# The apt package lists are removed in the same layer that downloads them.
RUN cd ${STAGE_DIR} && \
    mkdir -p /usr/local/nccl-rdma-sharp-plugins && \
    apt-get update && \
    apt-get install -y zlib1g-dev && \
    git clone https://github.com/Mellanox/nccl-rdma-sharp-plugins.git && \
    cd nccl-rdma-sharp-plugins && \
    git checkout v2.0.x-ar && \
    ./autogen.sh && \
    ./configure --prefix=/usr/local/nccl-rdma-sharp-plugins --with-cuda=/usr/local/cuda && \
    make && \
    make install && \
    rm -rf /var/lib/apt/lists/*

RUN git clone --branch fairseq_v3 https://github.com/ngoyal2707/Megatron-LM.git
WORKDIR /build/Megatron-LM
RUN pip3 install six regex
RUN pip3 install -e .
# Prepend the NCCL RDMA Sharp plugin's lib directory to the dynamic loader's
# search path for all later build steps and for containers run from the image.
ENV LD_LIBRARY_PATH=/usr/local/nccl-rdma-sharp-plugins/lib:${LD_LIBRARY_PATH}

# Install Fairscale
WORKDIR /build
##############################################################################
# Create a non-root user. see https://aka.ms/vscode-remote/containers/non-root-user
##############################################################################
# we do this here to ensure that our user packages below are installed with the
# proper permissions

RUN git clone --branch prefetch_fsdp_params_simple https://github.com/facebookresearch/fairscale.git
WORKDIR /build/fairscale
RUN git checkout fixing_memory_issues_with_keeping_overlap_may24
RUN pip3 install -e .
# Polkit rule that lets members of the `sudo` group perform any action without
# a password prompt.
# printf replaces `echo -e`: RUN executes under /bin/sh, where `echo -e` is not
# portable (dash writes the literal "-e" into the file, corrupting the first
# line). The `sudo` prefix was dropped because the output redirection is done
# by the calling shell, not by the command sudo would run.
RUN mkdir -p /etc/polkit-1/localauthority/50-local.d && \
    printf '%s\n' \
        '[No password prompt]' \
        'Identity=unix-group:sudo' \
        'Action=*' \
        'ResultActive=yes' \
        > /etc/polkit-1/localauthority/50-local.d/45-allow-no-password.pkla
# Make the ptca conda environment and /tmp readable, writable and executable by
# every user.
# NOTE(review): mode 777 is very permissive; narrow it if the runtime allows.
RUN chmod -R 777 /opt/conda/envs/ptca
RUN chmod -R 777 /tmp

# Install metaseq
WORKDIR /build
RUN git clone https://github.com/facebookresearch/metaseq.git
WORKDIR /build/metaseq
RUN pip3 install -e .
# turn on pre-commit hooks
RUN pre-commit install
# Name of the non-root account the remaining build steps run as.
ARG USERNAME=aiscuser

# Grant that account passwordless sudo through a sudoers drop-in file.
RUN echo "${USERNAME} ALL=(ALL) NOPASSWD:ALL" > "/etc/sudoers.d/${USERNAME}" && \
    chmod 0440 "/etc/sudoers.d/${USERNAME}"

ENV SHELL=/bin/bash
USER ${USERNAME}
WORKDIR /home/${USERNAME}
# Put the user's ~/.local/bin and conda's condabin directory first on PATH.
ENV PATH="/home/${USERNAME}/.local/bin:/opt/conda/condabin:${PATH}"
RUN conda init bash
# Delete the password of the current account.
RUN sudo passwd -d "$(whoami)"

##############################################################################
# User Packages
##############################################################################
# NOTE(review): presumably read by the CUDA extension builds below to select
# the target GPU architectures - confirm.
ARG TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6+PTX"

# NVIDIA apex at a fixed commit, installed with the listed extension options.
RUN git clone https://github.com/NVIDIA/apex.git ${STAGE_DIR}/apex && \
    cd ${STAGE_DIR}/apex && \
    git checkout 265b451de8ba9bfcb67edc7360f3d8772d0a8bea && \
    pip install -v --no-cache-dir \
        --global-option="--cpp_ext" \
        --global-option="--cuda_ext" \
        --global-option="--deprecated_fused_adam" \
        --global-option="--xentropy" \
        --global-option="--fast_multihead_attn" \
        ./

# Megatron-LM fork: clone the fairseq_v3 branch, then check out commit
# fa6c0860b62e4ed2ac13a513e7d950d72f576a44 and install it with its six/regex
# dependencies.
RUN git clone --branch fairseq_v3 https://github.com/ngoyal2707/Megatron-LM.git ${STAGE_DIR}/Megatron-LM && \
    cd ${STAGE_DIR}/Megatron-LM && \
    git checkout fa6c0860b62e4ed2ac13a513e7d950d72f576a44 && \
    pip install six regex && \
    pip install .

# fairscale, installed from the fixing_memory_issues_with_keeping_overlap_may24
# ref.
# NOTE(review): the previous comment here read
# "git checkout 91132c7e997c5affe97ce002e52cadd798220b06", but the command
# checks out the ref name above - confirm whether the build should be pinned
# to that commit instead.
RUN git clone https://github.com/facebookresearch/fairscale.git ${STAGE_DIR}/fairscale && \
    cd ${STAGE_DIR}/fairscale && \
    git checkout fixing_memory_issues_with_keeping_overlap_may24 && \
    pip install .

# Extra Python packages, pinned to exact versions.
# --no-cache-dir keeps pip's download cache out of the image layer, matching
# the apex install in this file.
RUN pip install --no-cache-dir \
    aim==3.16.2 \
    py-rouge==1.1 \
    rouge_score==0.1.2 \
    parlai==1.7.1 \
    evaluate==0.4.0

##############################################################################
# Switch back to root user so singularity can do runtime-setup
##############################################################################
USER root

# NLTK_DATA names the directory that the NLTK `punkt` package is downloaded
# into at build time; being an ENV, it stays set in containers run from the
# image. The shell expands ${NLTK_DATA} inside the double-quoted Python snippet
# before Python runs.
ENV NLTK_DATA="/usr/share/nltk_data"
RUN python -c "import nltk; nltk.download('punkt', download_dir='${NLTK_DATA}')"