test a setup for running fuji 1B on slurm #1

.github/workflows/_test_fuji_1B.yaml:

name: ~test Axlearn Fuji 1B functionality
on:
  workflow_call:
    inputs:
      AXLEARN_DOCKER_IMAGE:
        type: string
        description: Axlearn image from ghcr.io/nvidia
        default: ghcr.io/nvidia/jax:axlearn
        required: false
jobs:
  single-process-single-node:
    runs-on: jumpbox
    steps:
      - name: Check out the repository under ${GITHUB_WORKSPACE}
        uses: actions/checkout@v4
      - name: Setup SSH
        id: setup-ssh
        uses: ./.github/actions/setup-ssh
        with:
          ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
          ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
      - name: Labels and metadata
        id: meta
        shell: bash -x -e {0}
        run: |
IMAGE="$(echo ${{inputs.AXLEARN_IMAGE}} | sed 's/\//#/')"
          TOTAL_TASKS=1
          MAX_GPUS_PER_NODE=8
          NODES=1
          GPUS_PER_NODE=8
          # TEST_CASE_NAME is referenced below but never set in this workflow;
          # default it here (assumed name) so JOB_NAME and the status file are well-formed.
          TEST_CASE_NAME=${TEST_CASE_NAME:-fuji-1B}
          JOB_NAME=axlearn-fuji-1B-${GITHUB_RUN_ID}-${TEST_CASE_NAME}
          LOG_FILE=/nfs/cluster/${JOB_NAME}.log
          MODEL_PATH=/nfs/cluster/${JOB_NAME}
          for var in IMAGE TEST_CASE_NAME TOTAL_TASKS NODES GPUS_PER_NODE JOB_NAME LOG_FILE MODEL_PATH; do
            echo "$var=${!var}" >> $GITHUB_OUTPUT
          done
      - name: Submit SLURM jobs over SSH
        id: submit
        shell: bash -O expand_aliases -x -e {0}
        run: |
          alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'

sshx "date && hostname && sinfo"
sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
JOB=$(sshx sbatch --parsable << EOF
#!/bin/bash
#SBATCH --job-name=${{ steps.meta.outputs.JOB_NAME }}
#SBATCH --exclusive
#SBATCH --nodes=${{ steps.meta.outputs.NODES }}
#SBATCH --gpus-per-node=${{ steps.meta.outputs.GPUS_PER_NODE }}
#SBATCH --time=00:30:00
#SBATCH --output=${{ steps.meta.outputs.LOG_FILE }}
# preload enroot container using one task per node
time srun \
--ntasks-per-node=1 \
--container-name=runtime \
--container-image=${{ steps.meta.outputs.IMAGE }} \
true
# run job with tasks on each node sharing one container
time srun \
--ntasks=${{ steps.meta.outputs.TOTAL_TASKS }} \
--container-name=runtime \
--container-mounts=${{ steps.meta.outputs.MODEL_PATH }}:/output \
--container-entrypoint \
test-fuji-1B.sh
EOF
)
echo "SLURM_JOB_ID=${JOB}" >> $GITHUB_OUTPUT
. .github/workflows/scripts/wait_for_slurm_job.sh
wait_for_slurm_job ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} ${JOB}
# Gather job info
SLURM_STATE=$(sshx sacct -j $JOB --format=State --parsable2 --noheader |& head -n 1)
SLURM_EXITCODE=$(sshx sacct -j $JOB --format=exitcode --parsable2 --noheader | sort -r -u | head -1 | cut -f 1 -d":" | sed 's/ //g')
echo "SLURM Job state is ${SLURM_STATE}"
echo "SLURM Job exit code is ${SLURM_EXITCODE}"
echo "SLURM_STATE=${SLURM_STATE}" >> "$GITHUB_OUTPUT"
echo "SLURM_EXITCODE=${SLURM_EXITCODE}" >> "$GITHUB_OUTPUT"
set -x
- name: Remove orphaned SLURM job if the CI job is canceled
if: cancelled()
shell: bash -x -e {0}
run: |
ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
      - name: Write SLURM job status to file
        shell: bash -x -e {0}
        run: |
          # ensure the output directory exists before writing the status file
          mkdir -p output
          python << EOF
          import json
          with open("output/${{ steps.meta.outputs.TEST_CASE_NAME }}-status.json", "w") as f:
              dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"}
              json.dump(dump, f)
          EOF
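
For context, .github/workflows/scripts/wait_for_slurm_job.sh is sourced above but is not part of this change. A minimal sketch of such a helper, assuming it simply polls sacct over SSH until the job leaves an active state (the actual repo script may differ):

#!/bin/bash
# Hypothetical sketch of wait_for_slurm_job.sh (not the actual repo script).
# Polls sacct over SSH until the SLURM job reaches a terminal state.
wait_for_slurm_job() {
    local login_host=$1
    local job_id=$2
    while true; do
        state=$(ssh "$login_host" sacct -j "$job_id" --format=State --parsable2 --noheader | head -n 1)
        case "$state" in
            ""|PENDING|RUNNING|REQUEUED|RESIZING|SUSPENDED)
                # job not finished yet (or accounting not visible yet); wait and poll again
                sleep 30
                ;;
            *)
                echo "SLURM job $job_id reached state: $state"
                break
                ;;
        esac
    done
}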