diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile new file mode 100644 index 000000000..95e3b0a83 --- /dev/null +++ b/.devcontainer/Dockerfile @@ -0,0 +1,47 @@ +# NOTE(review): ':latest' is a moving tag; pin a released tag or digest so rebuilds are reproducible. +FROM metaseqopt.azurecr.io/metaseq-opt-singularity:latest + +# switch to aiscuser as the user we want to use in the devcontainer. +# Note: this is the non-root user configured in the metaseq-opt-singularity +# image. +USER aiscuser + +RUN pip install --no-cache-dir pre-commit==3.0.2 +RUN pip install --no-cache-dir amlt==9.12.1 --extra-index-url https://msrpypi.azurewebsites.net/stable/leloojoo + +# install zsh and oh-my-zsh +RUN sudo apt-get update && \ + sudo apt-get install -yq zsh && \ + # $USER is not set by the USER instruction, so resolve the account name explicitly; an empty value would make chsh change root's shell. + sudo chsh -s /usr/bin/zsh "$(id -un)" && \ + sh -c "$(curl -fsSL https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh)" "" --unattended + +# install azcopy, a tool to copy to/from blob storage +# for more info: https://learn.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-blobs-upload#upload-a-file +RUN wget https://azcopyvnext.azureedge.net/release20230123/azcopy_linux_amd64_10.17.0.tar.gz && \ + tar xvf azcopy_linux_amd64_10.17.0.tar.gz && \ + mkdir -p ~/.local/bin && \ + mv azcopy_linux_amd64_10.17.0/azcopy ~/.local/bin && \ + echo "export AZCOPY_CONCURRENCY_VALUE=AUTO" >> ~/.zshrc && \ + chmod +x ~/.local/bin/azcopy && \ + rm -rf azcopy_linux_amd64* + +# setup rust-based tools +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | bash -s -- -y +# ENV performs no tilde expansion, so spell out the home directory of aiscuser (assumed to be /home/aiscuser - confirm). +ENV PATH="/home/aiscuser/.cargo/bin:${PATH}" + +RUN cargo install --locked \ + fd-find \ + ripgrep \ + http-server \ + watchexec-cli + +# add user tools +RUN sudo apt-get update && sudo apt-get install -yq \ + jq \ + jp \ + tree \ + tldr + +RUN conda init zsh diff --git a/.devcontainer/devcontainer.env b/.devcontainer/devcontainer.env new file mode 100644 index 000000000..835e37ced --- /dev/null +++ b/.devcontainer/devcontainer.env @@ -0,0 +1,7 @@ +SUBSCRIPTION_NAME= +SUBSCRIPTION_ID= +RESOURCE_GROUP_NAME= +STORAGE_ACCOUNT_NAME= +STORAGE_CONTAINER_DATA= +STORAGE_CONTAINER_EXP= +STORAGE_CONTAINER_OUTPUT= diff --git 
a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 000000000..e1c785a1c --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,70 @@ +// For format details and config options, see https://aka.ms/devcontainer.json. +{ + "name": "Metaseq", + "build": { + "dockerfile": "Dockerfile", + "context": "..", + "args": {} + }, + // Features to add to the dev container. More info: https://containers.dev/features. + "features": { + "ghcr.io/devcontainers/features/docker-in-docker:2": {}, + "ghcr.io/devcontainers/features/azure-cli:1": {}, + "ghcr.io/devcontainers/features/powershell:1": {} + }, + "postCreateCommand": "bash ./.devcontainer/postCreateCommand.sh", + "customizations": { + "vscode": { + "settings": { + "autoDocstring.docstringFormat": "one-line-sphinx", + "python.analysis.autoImportCompletions": true, + "python.analysis.autoImportUserSymbols": true, + "python.formatting.provider": "yapf", + "python.linting.enabled": true, + "python.linting.flake8Enabled": true, + "python.defaultInterpreterPath": "/opt/conda/envs/ptca/bin/python", + "isort.check": true, + "dev.containers.copyGitConfig": true, + "terminal.integrated.defaultProfile.linux": "zsh", + "terminal.integrated.profiles.linux": { + "zsh": { + "path": "/usr/bin/zsh" + } + } + }, + "extensions": [ + "aaron-bond.better-comments", + "eamodio.gitlens", + "EditorConfig.EditorConfig", + "foxundermoon.shell-format", + "ms-azuretools.vscode-docker", + "ms-python.python", + "ms-python.vscode-pylance", + "redhat.vscode-yaml", + "stkb.rewrap", + "yzhang.markdown-all-in-one", + "njpwerner.autodocstring", + "mhutchie.git-graph", + "GitHub.copilot", + "GitHub.copilot-labs", + "lehoanganh298.json-lines-viewer" + ] + } + }, + "mounts": [ + "type=bind,source=${localEnv:HOME}/workspace/mnt/nlg-distill,target=/mnt/input_data_dir", + "type=bind,source=${localEnv:HOME}/workspace/mnt/amulet-output,target=/mnt/output_dir" + ], + "runArgs": [ + "--gpus", + "all", + 
"--ipc", + "host", + "--ulimit", + "memlock=-1", + "--env-file", + ".devcontainer/devcontainer.env" + ], + // This user must match the name of the user used in the singularity image. + "remoteUser": "aiscuser" +} diff --git a/.devcontainer/postCreateCommand.sh b/.devcontainer/postCreateCommand.sh new file mode 100755 index 000000000..c489f9036 --- /dev/null +++ b/.devcontainer/postCreateCommand.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +# Runs once after the dev container is created; invoked through "postCreateCommand" in devcontainer.json. +git config --global safe.directory '*' +git config --global core.editor "code --wait" +git config --global pager.branch false + +# install precommit hooks +pre-commit install + +# Install metaseq and dependencies +conda run -n ptca pip install --user -e ".[dev,docs]" +conda run -n ptca python setup.py develop --user + +# Generate the HTML docs in a subshell so a failed cd cannot run make in the wrong directory; failure is reported but not fatal +(cd docs && make html) || echo "warning: docs build failed" >&2 diff --git a/Dockerfile b/Dockerfile index 8eb3bfa2a..668948db0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,51 +1,171 @@ -FROM nvidia/cuda:11.3.1-devel-ubuntu20.04 +FROM singularitybase.azurecr.io/base/job/pytorch/ptca-1.13.1-cuda11.7:20230112T151134502 -ARG DEBIAN_FRONTEND=noninteractive +############################################################################## +# Temporary Installation Directory +############################################################################## +ENV STAGE_DIR=/tmp/stage_dir +RUN mkdir -p ${STAGE_DIR} && \ + chmod 777 ${STAGE_DIR} -RUN mkdir -p /build -WORKDIR /build +############################################################################## +# Installation/Basic Utilities +############################################################################## +ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update && \ + apt-get install -yq --no-install-recommends \ + software-properties-common build-essential autotools-dev \ + pdsh g++ gcc \ + curl wget vim tmux emacs less unzip \ + htop iftop iotop ca-certificates \ + rsync iputils-ping net-tools sudo \ + libfuse-dev fuse \ + git git-lfs \ + # libnuma-dev is required by MLNX + 
libnuma-dev \ + dos2unix psmisc graphviz llvm-10-dev ninja-build npm \ + libaio-dev \ + jq \ + lshw \ + dmidecode \ + util-linux \ + automake \ + autoconf \ + libtool \ + perftest \ + net-tools \ + openssh-client \ + openssh-server \ + pciutils \ + libaio-dev \ + libcap2 \ + default-jdk \ + lsb-release && \ + apt-get clean && \ + # Remove apt intermediate files in the same layer that created them, so they never reach the image + rm -rf /var/lib/apt/lists/* -RUN apt-key del 7fa2af80 && \ - apt-get -qq update && \ - apt-get -qq install -y --no-install-recommends curl && \ - curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb && \ - dpkg -i cuda-keyring_1.0-1_all.deb +RUN cp -s /usr/share/pyshared/lsb_release.py /opt/conda/envs/ptca/lib/python3.8/site-packages/lsb_release.py -RUN apt-get -qq update \ - && apt-get -qq install -y --no-install-recommends \ - git \ - python3-pip python3-dev -# Install Pytorch -RUN pip3 install torch==1.10.1+cu113 torchvision==0.11.2+cu113 torchaudio==0.10.1+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html +############################################################################## +# Mellanox OFED +############################################################################## +ENV MLNX_OFED_VERSION=5.1-2.5.8.0 +RUN cd ${STAGE_DIR} && \ + wget -q -O - http://content.mellanox.com/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64.tgz | tar xzf - && \ + cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64 && \ + ./mlnxofedinstall --user-space-only --without-fw-update --force --all -q --skip-unsupported-devices-check && \ + rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64* -# Install APEX -RUN git clone https://github.com/NVIDIA/apex.git -WORKDIR /build/apex +############################################################################## +# Python (MLNX 5.1 requires python2 .......) 
+############################################################################## +ENV DEBIAN_FRONTEND=noninteractive +ENV PYTHON_VERSION=3 +RUN rm -f /usr/bin/python /usr/bin/python3 /usr/bin/pip && \ + ln -s /opt/conda/envs/ptca/bin/python3.8 /usr/bin/python3 && \ + ln -s /opt/conda/envs/ptca/bin/python3.8 /usr/bin/python && \ + ln -s /opt/conda/envs/ptca/bin/pip /usr/bin/pip && \ + # Print python and pip versions; a broken symlink fails the build here + python -V && pip -V -RUN git checkout 265b451de8ba9bfcb67edc7360f3d8772d0a8bea -RUN pip3 install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--deprecated_fused_adam" --global-option="--xentropy" --global-option="--fast_multihead_attn" ./ +############################################################################## +# nv_peer_mem (cloned from Mellanox/nv_peer_memory at NV_PEER_MEM_TAG, built and installed as a .deb) +############################################################################## +ENV NV_PEER_MEM_VERSION=1.1 +ENV NV_PEER_MEM_TAG=1.1-0 +RUN mkdir -p ${STAGE_DIR} && \ + git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory && \ + cd ${STAGE_DIR}/nv_peer_memory && \ + ./build_module.sh && \ + cd /tmp && \ + tar xzf /tmp/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz && \ + cd nvidia-peer-memory-${NV_PEER_MEM_VERSION} && \ + apt-get update && \ + apt-get install -y dkms && \ + dpkg-buildpackage -us -uc && \ + dpkg -i /tmp/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb && \ + rm -rf /var/lib/apt/lists/* ${STAGE_DIR}/nv_peer_memory /tmp/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz /tmp/nvidia-peer-memory-${NV_PEER_MEM_VERSION} -# Install Megatron-LM branch -WORKDIR /build +############################################################################## +# NCCL RDMA Sharp plugin (target directory: /usr/local/nccl-rdma-sharp-plugins) +############################################################################## +RUN cd ${STAGE_DIR} && \ + mkdir -p /usr/local/nccl-rdma-sharp-plugins && \ + apt-get update && \ + apt-get install -y zlib1g-dev && \ + git clone 
https://github.com/Mellanox/nccl-rdma-sharp-plugins.git && \ + cd nccl-rdma-sharp-plugins && \ + git checkout v2.0.x-ar && \ + ./autogen.sh && \ + ./configure --prefix=/usr/local/nccl-rdma-sharp-plugins --with-cuda=/usr/local/cuda && \ + make && \ + make install && \ + # Variables assigned inside a RUN do not outlive it; LD_LIBRARY_PATH is exported by the ENV below. NOTE(review): LD_PRELOAD was never exported - confirm whether it is needed. + rm -rf /var/lib/apt/lists/* ${STAGE_DIR}/nccl-rdma-sharp-plugins -RUN git clone --branch fairseq_v3 https://github.com/ngoyal2707/Megatron-LM.git -WORKDIR /build/Megatron-LM -RUN pip3 install six regex -RUN pip3 install -e . +ENV LD_LIBRARY_PATH=/usr/local/nccl-rdma-sharp-plugins/lib:${LD_LIBRARY_PATH} -# Install Fairscale -WORKDIR /build +############################################################################## +# Set up the non-root user (no account is created in this file; $USERNAME presumably exists in the base image - confirm). See https://aka.ms/vscode-remote/containers/non-root-user +############################################################################## +# we do this here to ensure that our user packages below are installed with the +# proper permissions -RUN git clone --branch prefetch_fsdp_params_simple https://github.com/facebookresearch/fairscale.git -WORKDIR /build/fairscale -RUN git checkout fixing_memory_issues_with_keeping_overlap_may24 -RUN pip3 install -e . +RUN printf '[No password prompt]\nIdentity=unix-group:sudo\nAction=*\nResultActive=yes\n' \ + > /etc/polkit-1/localauthority/50-local.d/45-allow-no-password.pkla +RUN chmod -R 777 /opt/conda/envs/ptca +RUN chmod -R 777 /tmp -# Install metaseq -WORKDIR /build -RUN git clone https://github.com/facebookresearch/metaseq.git -WORKDIR /build/metaseq -RUN pip3 install -e . 
-# turn on pre-commit hooks -RUN pre-commit install +ARG USERNAME=aiscuser + +RUN echo $USERNAME ALL=\(ALL\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME \ + && chmod 0440 /etc/sudoers.d/$USERNAME + +ENV SHELL=/bin/bash +USER $USERNAME +WORKDIR /home/$USERNAME +ENV PATH="/home/${USERNAME}/.local/bin:/opt/conda/condabin:${PATH}" +RUN conda init bash +RUN sudo passwd -d "$(whoami)" + +############################################################################## +# User Packages +############################################################################## +ARG TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6+PTX" + +RUN cd ${STAGE_DIR} && \ + git clone https://github.com/NVIDIA/apex.git && \ + cd apex && \ + git checkout 265b451de8ba9bfcb67edc7360f3d8772d0a8bea && \ + pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--deprecated_fused_adam" --global-option="--xentropy" --global-option="--fast_multihead_attn" ./ + +# Megatron-LM: fairseq_v3 branch, pinned to a fixed commit so rebuilds are reproducible +RUN cd ${STAGE_DIR} && \ + git clone --branch fairseq_v3 https://github.com/ngoyal2707/Megatron-LM.git && \ + cd Megatron-LM && \ + git checkout fa6c0860b62e4ed2ac13a513e7d950d72f576a44 && \ + pip install --no-cache-dir six regex && \ + pip install --no-cache-dir . + +# NOTE(review): fairscale is built from a moving branch; 91132c7e997c5affe97ce002e52cadd798220b06 looks like the intended pin - confirm and check it out instead +RUN cd ${STAGE_DIR} && \ + git clone https://github.com/facebookresearch/fairscale.git && \ + cd fairscale && \ + git checkout fixing_memory_issues_with_keeping_overlap_may24 && \ + pip install --no-cache-dir . + +RUN pip install --no-cache-dir \ + aim==3.16.2 \ + py-rouge==1.1 \ + rouge_score==0.1.2 \ + parlai==1.7.1 \ + evaluate==0.4.0 + +############################################################################## +# Switch back to root user so singularity can do runtime-setup +############################################################################## +USER root + +ENV NLTK_DATA="/usr/share/nltk_data" +RUN python -c "import nltk; nltk.download('punkt', download_dir='${NLTK_DATA}')"