test a setup for running fuji 1B on slurm #1

.github/workflows/_test_fuji_1B.yaml:

name: ~test Axlearn Fuji 1B functionality
on:
  workflow_call:
    inputs:
      AXLEARN_DOCKER_IMAGE:
        type: string
        description: Axlearn image from ghcr.io/nvidia
        default: ghcr.io/nvidia/jax:axlearn
        required: false
jobs:
  single-process-single-node:
    runs-on: jumpbox
    steps:
      - name: Check out the repository under ${GITHUB_WORKSPACE}
        uses: actions/checkout@v4
      - name: Setup SSH
        id: setup-ssh
        uses: ./.github/actions/setup-ssh
        with:
          ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
          ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
      - name: Labels and metadata
        id: meta
        shell: bash -x -e {0}
        run: |
IMAGE="$(echo ${{inputs.AXLEARN_IMAGE}} | sed 's/\//#/')"
          TOTAL_TASKS=1
          MAX_GPUS_PER_NODE=8
          NODES=1
          GPUS_PER_NODE=8
          # TEST_CASE_NAME is referenced below but never set in this workflow;
          # default it here (assumed name) so JOB_NAME and the status file are well-formed.
          TEST_CASE_NAME=${TEST_CASE_NAME:-fuji-1B}
          JOB_NAME=axlearn-fuji-1B-${GITHUB_RUN_ID}-${TEST_CASE_NAME}
          LOG_FILE=/nfs/cluster/${JOB_NAME}.log
          MODEL_PATH=/nfs/cluster/${JOB_NAME}
          for var in IMAGE TEST_CASE_NAME TOTAL_TASKS NODES GPUS_PER_NODE JOB_NAME LOG_FILE MODEL_PATH; do
            echo "$var=${!var}" >> $GITHUB_OUTPUT
          done
      - name: Submit SLURM jobs over SSH
        id: submit
        shell: bash -O expand_aliases -x -e {0}
        run: |
          alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'

sshx "date && hostname && sinfo"
sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
JOB=$(sshx sbatch --parsable << EOF
#!/bin/bash
#SBATCH --job-name=${{ steps.meta.outputs.JOB_NAME }}
#SBATCH --exclusive
#SBATCH --nodes=${{ steps.meta.outputs.NODES }}
#SBATCH --gpus-per-node=${{ steps.meta.outputs.GPUS_PER_NODE }}
#SBATCH --time=00:30:00
#SBATCH --output=${{ steps.meta.outputs.LOG_FILE }}
# preload enroot container using one task per node
time srun \
--ntasks-per-node=1 \
--container-name=runtime \
--container-image=${{ steps.meta.outputs.IMAGE }} \
true
# run job with tasks on each node sharing one container
time srun \
--ntasks=${{ steps.meta.outputs.TOTAL_TASKS }} \
--container-name=runtime \
--container-mounts=${{ steps.meta.outputs.MODEL_PATH }}:/output \
--container-entrypoint \
test-fuji-1B.sh
EOF
)
echo "SLURM_JOB_ID=${JOB}" >> $GITHUB_OUTPUT
. .github/workflows/scripts/wait_for_slurm_job.sh
wait_for_slurm_job ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} ${JOB}
# Gather job info
SLURM_STATE=$(sshx sacct -j $JOB --format=State --parsable2 --noheader |& head -n 1)
SLURM_EXITCODE=$(sshx sacct -j $JOB --format=exitcode --parsable2 --noheader | sort -r -u | head -1 | cut -f 1 -d":" | sed 's/ //g')
echo "SLURM Job state is ${SLURM_STATE}"
echo "SLURM Job exit code is ${SLURM_EXITCODE}"
echo "SLURM_STATE=${SLURM_STATE}" >> "$GITHUB_OUTPUT"
echo "SLURM_EXITCODE=${SLURM_EXITCODE}" >> "$GITHUB_OUTPUT"
set -x
- name: Remove orphaned SLURM job if the CI job is canceled
if: cancelled()
shell: bash -x -e {0}
run: |
ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
      - name: Write SLURM job status to file
        shell: bash -x -e {0}
        run: |
          # ensure the output directory exists before writing the status file
          mkdir -p output
          python << EOF
          import json
          with open("output/${{ steps.meta.outputs.TEST_CASE_NAME }}-status.json", "w") as f:
              dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"}
              json.dump(dump, f)
          EOF
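
For context, .github/workflows/scripts/wait_for_slurm_job.sh is sourced above but is not part of this change. A minimal sketch of such a helper, assuming it simply polls sacct over SSH until the job leaves an active state (the actual repo script may differ):

#!/bin/bash
# Hypothetical sketch of wait_for_slurm_job.sh (not the actual repo script).
# Polls sacct over SSH until the SLURM job reaches a terminal state.
wait_for_slurm_job() {
    local login_host=$1
    local job_id=$2
    while true; do
        state=$(ssh "$login_host" sacct -j "$job_id" --format=State --parsable2 --noheader | head -n 1)
        case "$state" in
            ""|PENDING|RUNNING|REQUEUED|RESIZING|SUSPENDED)
                # job not finished yet (or accounting not visible yet); wait and poll again
                sleep 30
                ;;
            *)
                echo "SLURM job $job_id reached state: $state"
                break
                ;;
        esac
    done
}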