Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions pretrain/scripts/v3-980m-sakura/convert.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/bin/bash
#SBATCH --job-name=0088_convert
#SBATCH --partition=gpu-debug
#SBATCH --nodes=1
#SBATCH --gpus=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --mem=240G
#SBATCH --output=outputs/%x-%j.out
#SBATCH --error=outputs/%x-%j.err

# Convert a single Megatron-LM checkpoint directory (iter_NNNNNNN) into
# Hugging Face format, copying the tokenizer files alongside the weights.
#
# Usage: sbatch convert.sh MEGATRON_CHECKPOINT_DIR HF_CHECKPOINT_DIR

set -eu -o pipefail

if [[ $# -ne 2 ]]; then
  >&2 echo "Usage: $0 MEGATRON_CHECKPOINT_DIR HF_CHECKPOINT_DIR"
  exit 1
fi

MEGATRON_CHECKPOINT_DIR=${1%/}  # strip trailing slash so basename works
HF_CHECKPOINT_DIR=$2

ENV_DIR=/home/shared/experiments/0088_llmjp3-980m/environment

source "${ENV_DIR}/scripts/environment.sh"
source "${ENV_DIR}/venv/bin/activate"

TOKENIZER_MODEL_DIR=${ENV_DIR}/src/llm-jp-tokenizer/hf/ver3.0/llm-jp-tokenizer-100k.ver3.0b2

TARGET_ITER_DIR=$(basename "$MEGATRON_CHECKPOINT_DIR")  # iter_NNNNNNN

# Validate the directory name BEFORE the arithmetic below: with set -e, a
# non-numeric suffix would abort inside $(( 10#... )) with a cryptic error,
# so a check placed after the conversion would never run.
if [[ ! "$TARGET_ITER_DIR" =~ ^iter_[0-9]+$ ]]; then
  >&2 echo "Error: '$TARGET_ITER_DIR' is not a valid iter_NNNNNNN directory. Exiting."
  exit 1
fi
# Strip the prefix and force base-10 so leading zeros are not read as octal.
ITER=$((10#${TARGET_ITER_DIR#iter_}))  # NNNNNNN (no 0 padding)
echo "ITER=$ITER"

# Create a unique temporal working directory to avoid affecting the original
# directory and to allow multiple runs to execute simultaneously.
TMP_DIR=$(mktemp -d "${HOME}/ckpt_convert.XXXXXXXX")
# Remove the temp dir on ANY exit path, not only on success; otherwise a
# failed conversion leaks one directory per run into $HOME.
trap 'rm -rf -- "$TMP_DIR"' EXIT
>&2 echo "TMP_DIR=$TMP_DIR"
ln -s "$(readlink -f "$MEGATRON_CHECKPOINT_DIR")" "${TMP_DIR}/${TARGET_ITER_DIR}"
echo "$ITER" > "${TMP_DIR}/latest_checkpointed_iteration.txt"

echo "Converting $MEGATRON_CHECKPOINT_DIR"

python "${ENV_DIR}/src/Megatron-LM/tools/checkpoint/convert.py" \
  --model-type GPT \
  --loader mcore \
  --saver llama2_hf \
  --load-dir "$TMP_DIR" \
  --save-dir "$HF_CHECKPOINT_DIR" \
  --hf-tokenizer-path "$TOKENIZER_MODEL_DIR" \
  --save-dtype bfloat16 \
  --loader-transformer-impl "transformer_engine" \
  --megatron-path "${ENV_DIR}/src/Megatron-LM"

# HF checkpoints are expected to ship with the tokenizer files in the same dir.
cp "${TOKENIZER_MODEL_DIR}"/* "$HF_CHECKPOINT_DIR"

echo "Done"
34 changes: 34 additions & 0 deletions pretrain/scripts/v3-980m-sakura/run_convert.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/bin/bash

# Enqueue one Slurm conversion job (convert.sh) for every Megatron checkpoint
# that has not yet been converted to Hugging Face format.

set -u

src_root=/home/shared/experiments/0088_llmjp3-980m/checkpoints
dest_root=/home/shared/experiments/0088_llmjp3-980m/checkpoints_hf

# With nullglob, an empty checkpoints directory yields zero iterations instead
# of one bogus iteration over the literal glob pattern "iter_???????".
shopt -s nullglob

for src_ckpt_dir in "${src_root}"/iter_???????; do
  ckpt_rel=$(basename "${src_ckpt_dir}")
  dest_ckpt_dir=${dest_root}/${ckpt_rel}

  # The initial (iteration-0) checkpoint holds untrained weights; skip it.
  if [[ "${ckpt_rel}" == 'iter_0000000' ]]; then
    echo "Ignore: ${ckpt_rel}"
    continue
  fi

  # An existing destination means the conversion is done or already queued.
  if [[ -e "${dest_ckpt_dir}" ]]; then
    echo "Exists: ${ckpt_rel}"
    continue
  fi

  # Create the destination up front so a concurrent run of this script sees
  # it and skips the same checkpoint.
  mkdir -p "${dest_ckpt_dir}"

  if sbatch \
      scripts/pretrain/scripts/v3-980m-sakura/convert.sh \
      "${src_ckpt_dir}" \
      "${dest_ckpt_dir}"; then
    echo "Queued: ${ckpt_rel}"
  else
    echo "Error: ${ckpt_rel}"
    # Roll back the marker directory so a later run can retry this checkpoint.
    rmdir "${dest_ckpt_dir}"
  fi
done
41 changes: 41 additions & 0 deletions pretrain/scripts/v3-980m-sakura/sbatch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/bin/bash
#SBATCH --job-name=0088_train
#SBATCH --partition=gpu-small
#SBATCH --nodes=4
#SBATCH --gpus-per-node=8
#SBATCH --ntasks-per-node=8
#SBATCH --output=outputs/%x-%j.out
#SBATCH --error=outputs/%x-%j.err

# Launch distributed pretraining (train.sh) across all allocated nodes via
# mpirun. Submit with: sbatch sbatch.sh

set -eu -o pipefail

EXPERIMENT_DIR=/home/shared/experiments/0088_llmjp3-980m
ENV_DIR=${EXPERIMENT_DIR}/environment

source "${ENV_DIR}/scripts/environment.sh"
source "${ENV_DIR}/venv/bin/activate"

# First host of the allocation acts as the rendezvous node for the workers.
export MASTER_ADDR=$(scontrol show hostname "$SLURM_JOB_NODELIST" | head -n1)
# Derive a per-job port in [10000, 59999] so concurrent jobs on the same
# node do not collide.
export MASTER_PORT=$((10000 + (SLURM_JOBID % 50000)))

echo "MASTER_ADDR=${MASTER_ADDR}"

NUM_NODES=$SLURM_JOB_NUM_NODES
# SLURM_TASKS_PER_NODE is formatted like "8(x4)"; keep only the leading
# per-node task count.
NUM_GPUS_PER_NODE=$(echo "$SLURM_TASKS_PER_NODE" | cut -d '(' -f 1)
NUM_GPUS=$((NUM_NODES * NUM_GPUS_PER_NODE))

echo "NUM_NODES=$NUM_NODES"
echo "NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE"
echo "NUM_GPUS=$NUM_GPUS"

# One MPI rank per GPU; -x forwards the rendezvous/topology variables into
# the environment of every rank so train.sh can read them.
mpirun \
  -np "$NUM_GPUS" \
  --npernode "$NUM_GPUS_PER_NODE" \
  -bind-to none \
  -map-by slot \
  -x EXPERIMENT_DIR="$EXPERIMENT_DIR" \
  -x MASTER_ADDR="$MASTER_ADDR" \
  -x MASTER_PORT="$MASTER_PORT" \
  -x NUM_NODES="$NUM_NODES" \
  -x NUM_GPUS_PER_NODE="$NUM_GPUS_PER_NODE" \
  bash scripts/pretrain/scripts/v3-980m-sakura/train.sh
Loading