test a setup for running fuji 1B on slurm #1
name: ~test AXLearn functionality
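# Smoke test: run AXLearn's fuji-1B model on a SLURM cluster reached over SSH.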
on:
  workflow_call:
    inputs:
      AXLEARN_DOCKER_IMAGE:
        type: string
        description: Axlearn image from ghcr.io/nvidia
        default: ghcr.io/nvidia/jax:axlearn
        required: false

jobs:
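  # One node, one task: a single process drives all 8 GPUs on the node.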
  single-process-single-node:
    runs-on: jumpbox
    steps:
      - name: Check out the repository under ${GITHUB_WORKSPACE}
        uses: actions/checkout@v4
      - name: Setup SSH
        id: setup-ssh
        uses: ./.github/actions/setup-ssh
        with:
          ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
          ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
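      # Compute the job parameters once and export them through $GITHUB_OUTPUT
      # so later steps can read them as ${{ steps.meta.outputs.<VAR> }}.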
      - name: Labels and metadata
        id: meta
        shell: bash -x -e {0}
        run: |
IMAGE="$(echo ${{inputs.AXLEARN_IMAGE}} | sed 's/\//#/')" | ||
          TOTAL_TASKS=1
          MAX_GPUS_PER_NODE=8
          NODES=1
          GPUS_PER_NODE=8
          TEST_CASE_NAME=fuji-1B  # assumed value: the original references TEST_CASE_NAME without ever setting it
          JOB_NAME=axlearn-fuji-1B-${GITHUB_RUN_ID}-${TEST_CASE_NAME}
          LOG_FILE=/nfs/cluster/${JOB_NAME}.log
          MODEL_PATH=/nfs/cluster/${JOB_NAME}
          for var in IMAGE TEST_CASE_NAME TOTAL_TASKS NODES GPUS_PER_NODE JOB_NAME LOG_FILE MODEL_PATH; do
            echo "$var=${!var}" >> $GITHUB_OUTPUT
          done
      - name: Submit SLURM jobs over SSH
        id: submit
        shell: bash -O expand_aliases -x -e {0}
        run: |
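          # All cluster commands go through one SSH alias; ServerAliveInterval
          # keeps the connection from timing out while we wait on the job.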
          alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
          sshx "date && hostname && sinfo"
          sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
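          # Submit the batch script below via a heredoc; --parsable makes
          # sbatch print only the job ID, which is captured in $JOB.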
          JOB=$(sshx sbatch --parsable << EOF
          #!/bin/bash
          #SBATCH --job-name=${{ steps.meta.outputs.JOB_NAME }}
          #SBATCH --exclusive
          #SBATCH --nodes=${{ steps.meta.outputs.NODES }}
          #SBATCH --gpus-per-node=${{ steps.meta.outputs.GPUS_PER_NODE }}
          #SBATCH --time=00:30:00
          #SBATCH --output=${{ steps.meta.outputs.LOG_FILE }}
          # preload enroot container using one task per node
          time srun \
            --ntasks-per-node=1 \
            --container-name=runtime \
            --container-image=${{ steps.meta.outputs.IMAGE }} \
            true
          # run job with tasks on each node sharing one container
          time srun \
            --ntasks=${{ steps.meta.outputs.TOTAL_TASKS }} \
            --container-name=runtime \
            --container-mounts=${{ steps.meta.outputs.MODEL_PATH }}:/output \
            --container-entrypoint \
            test-fuji-1B.sh
          EOF
          )
          echo "SLURM_JOB_ID=${JOB}" >> $GITHUB_OUTPUT
          . .github/workflows/scripts/wait_for_slurm_job.sh
          wait_for_slurm_job ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} ${JOB}
          # Gather job info
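          # sacct prints one line per job step: take the first line for the
          # state, and the highest exit code across steps for the overall result.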
          SLURM_STATE=$(sshx sacct -j $JOB --format=State --parsable2 --noheader |& head -n 1)
          SLURM_EXITCODE=$(sshx sacct -j $JOB --format=exitcode --parsable2 --noheader | sort -r -u | head -1 | cut -f 1 -d":" | sed 's/ //g')
          echo "SLURM Job state is ${SLURM_STATE}"
          echo "SLURM Job exit code is ${SLURM_EXITCODE}"
          echo "SLURM_STATE=${SLURM_STATE}" >> "$GITHUB_OUTPUT"
          echo "SLURM_EXITCODE=${SLURM_EXITCODE}" >> "$GITHUB_OUTPUT"
      - name: Remove orphaned SLURM job if the CI job is canceled
        if: cancelled()
        shell: bash -x -e {0}
        run: |
          ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
            scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
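      # Persist the job outcome as JSON keyed by test case name, so it can be
      # consumed later (e.g. by a downstream reporting step).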
      - name: Write SLURM job status to file
        shell: bash -x -e {0}
        run: |
          mkdir -p output  # the original never creates this directory; without it the open() below fails
          python << EOF
          import json
          with open("output/${{ steps.meta.outputs.TEST_CASE_NAME }}-status.json", "w") as f:
              dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"}
              json.dump(dump, f)
          EOF