#!/bin/bash
#
# Launches Megatron-LM GPT pretraining (pretrain_gpt.py) for one experiment.
#
# Required environment:
#   EXPERIMENT_DIR  root directory of the experiment (set by sbatch)
# NUM_NODES and NUM_GPUS_PER_NODE are read further down; presumably they are
# provided by the sourced environment scripts — confirm.
#
# For details about the model, see:
# https://github.com/llm-jp/model-cards/pull/23

set -eu -o pipefail

# EXPERIMENT_DIR= # set by sbatch
# Fail early with a readable message instead of sourcing from "/environment".
: "${EXPERIMENT_DIR:?EXPERIMENT_DIR must be set (normally by sbatch)}"
ENV_DIR="${EXPERIMENT_DIR}/environment"
CACHE_DIR="${EXPERIMENT_DIR}/cache"

source "${ENV_DIR}/scripts/environment.sh"
source "${ENV_DIR}/scripts/mpi_variables.sh"
source "${ENV_DIR}/venv/bin/activate"

# open file limit
# bash's ulimit takes a single limit per resource flag; the former second
# operand (1048576) was not applied as a hard limit, so it is dropped here.
# Without -S/-H this sets both the soft and the hard limit to 65536.
ulimit -n 65536

# Logging / debugging knobs exported to the training processes.
export LOGLEVEL=INFO
export NCCL_DEBUG=WARN
# NOTE(review): NCCL_DEBUG_SUBSYS normally takes subsystem names
# (e.g. INIT, COLL, ALL); "WARN" looks like a log level — confirm intent.
export NCCL_DEBUG_SUBSYS=WARN
export PYTHONFAULTHANDLER=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export CUDA_LAUNCH_BLOCKING=0
export CUDNN_LOGDEST_DBG=stderr
export CUDNN_LOGERR_DBG=1

# Total number of GPUs in the job.
NUM_GPUS=$(( NUM_NODES * NUM_GPUS_PER_NODE ))

# model config
HIDDEN_SIZE=2048
FFN_HIDDEN_SIZE=7168
NUM_LAYERS=24
NUM_HEADS=16
SEQ_LENGTH=4096

# distributed settings
TENSOR_PARALLEL_SIZE=1
PIPELINE_PARALLEL_SIZE=1
CONTEXT_PARALLEL_SIZE=1
# GPUs left over for data parallelism once the model-parallel dimensions are
# accounted for. The context-parallel size is part of the divisor so the value
# stays correct if CONTEXT_PARALLEL_SIZE is ever raised above 1.
DATA_PARALLEL_SIZE=$(( NUM_GPUS / (TENSOR_PARALLEL_SIZE * PIPELINE_PARALLEL_SIZE * CONTEXT_PARALLEL_SIZE) ))

# training config
MICRO_BATCH_SIZE=8
GLOBAL_BATCH_SIZE=512

LR=3e-4
MIN_LR=3e-5
WEIGHT_DECAY=0.1
GRAD_CLIP=1

# total number of iterations
# 2072488058295 (number of tokens) / 4096 (seq len) / 512 (batch size) = 988239.316127 -> 988240
# 988240 + 988240 = 1976480
# NOTE(review): the expression below evaluates to (2000 + 988240) * 2 = 1980480,
# not the 1976480 stated above — confirm whether the warmup steps are meant to
# be counted on top of LR_DECAY_ITERS.
LR_WARMUP_STEPS=2000
LR_DECAY_ITERS=988240
# Two passes over the data; the closing parenthesis of the arithmetic
# expansion was missing, which made the script fail to parse.
TRAIN_STEPS=$(( (LR_WARMUP_STEPS + LR_DECAY_ITERS) * 2 ))

# tokenizer
TOKENIZER_MODEL="${ENV_DIR}/src/llm-jp-tokenizer/models/ver3.0/llm-jp-tokenizer-100k.ver3.0b1.model"

# checkpoints: loading and saving use the same directory
CHECKPOINT_ROOT="${EXPERIMENT_DIR}/checkpoints"
CHECKPOINT_LOAD_DIR="${CHECKPOINT_ROOT}"
CHECKPOINT_SAVE_DIR="${CHECKPOINT_ROOT}"

# Make sure the checkpoint directory exists before training starts.
mkdir -p -- "${CHECKPOINT_SAVE_DIR}"

# data config
DATASET_DIR=/home/shared/corpus/llm-jp-corpus/v3.0.0/training_resharded_tokenize_ver3.0
DATASET_V3_1_DIR=/home/shared/corpus/llm-jp-corpus/v3.1.0/tokenize/v3.0b1

# Whitespace-separated list of "<weight> <path prefix>" pairs, filled in below
# and handed to --data-path.
TRAIN_DATA_PATH=""

# code stack
# Each entry appends "<weight> <path prefix>" to TRAIN_DATA_PATH.
TRAIN_DATA_PATH+=" 14486363187 ${DATASET_DIR}/train/code/stack_0000.jsonl_text_document"
TRAIN_DATA_PATH+=" 12799385151 ${DATASET_DIR}/train/code/stack_0001.jsonl_text_document"
TRAIN_DATA_PATH+=" 17282923545 ${DATASET_DIR}/train/code/stack_0002.jsonl_text_document"
TRAIN_DATA_PATH+=" 8861329235 ${DATASET_DIR}/train/code/stack_0003.jsonl_text_document"
TRAIN_DATA_PATH+=" 6713413649 ${DATASET_DIR}/train/code/stack_0004.jsonl_text_document"
TRAIN_DATA_PATH+=" 8976432285 ${DATASET_DIR}/train/code/stack_0005.jsonl_text_document"
TRAIN_DATA_PATH+=" 17961273649 ${DATASET_DIR}/train/code/stack_0006.jsonl_text_document"
TRAIN_DATA_PATH+=" 12016948303 ${DATASET_DIR}/train/code/stack_0007.jsonl_text_document"
TRAIN_DATA_PATH+=" 14953094719 ${DATASET_DIR}/train/code/stack_0008.jsonl_text_document"

# ja cc 1
# Each entry appends "<weight> <path prefix>" to TRAIN_DATA_PATH.
TRAIN_DATA_PATH+=" 23783124862 ${DATASET_DIR}/train/ja/cc-1_0000.jsonl_text_document"
TRAIN_DATA_PATH+=" 36378129564 ${DATASET_DIR}/train/ja/cc-1_0001.jsonl_text_document"
TRAIN_DATA_PATH+=" 35477545812 ${DATASET_DIR}/train/ja/cc-1_0002.jsonl_text_document"
TRAIN_DATA_PATH+=" 35917231868 ${DATASET_DIR}/train/ja/cc-1_0003.jsonl_text_document"
TRAIN_DATA_PATH+=" 46203062776 ${DATASET_DIR}/train/ja/cc-1_0004.jsonl_text_document"
TRAIN_DATA_PATH+=" 40396278536 ${DATASET_DIR}/train/ja/cc-1_0005.jsonl_text_document"
TRAIN_DATA_PATH+=" 33444216206 ${DATASET_DIR}/train/ja/cc-1_0006.jsonl_text_document"
TRAIN_DATA_PATH+=" 32375495374 ${DATASET_DIR}/train/ja/cc-1_0007.jsonl_text_document"
TRAIN_DATA_PATH+=" 36068919622 ${DATASET_DIR}/train/ja/cc-1_0008.jsonl_text_document"
TRAIN_DATA_PATH+=" 26274952324 ${DATASET_DIR}/train/ja/cc-1_0009.jsonl_text_document"
TRAIN_DATA_PATH+=" 24024422756 ${DATASET_DIR}/train/ja/cc-1_0010.jsonl_text_document"
TRAIN_DATA_PATH+=" 34590145510 ${DATASET_DIR}/train/ja/cc-1_0011.jsonl_text_document"
TRAIN_DATA_PATH+=" 29567301906 ${DATASET_DIR}/train/ja/cc-1_0012.jsonl_text_document"
TRAIN_DATA_PATH+=" 26690562242 ${DATASET_DIR}/train/ja/cc-1_0013.jsonl_text_document"

# ja cc 2
TRAIN_DATA_PATH+=" 35813749376 ${DATASET_DIR}/train/ja/cc-2_0000.jsonl_text_document"
TRAIN_DATA_PATH+=" 40034668924 ${DATASET_DIR}/train/ja/cc-2_0001.jsonl_text_document"
TRAIN_DATA_PATH+=" 31191828858 ${DATASET_DIR}/train/ja/cc-2_0002.jsonl_text_document"
TRAIN_DATA_PATH+=" 25086109508 ${DATASET_DIR}/train/ja/cc-2_0003.jsonl_text_document"
TRAIN_DATA_PATH+=" 18979589830 ${DATASET_DIR}/train/ja/cc-2_0004.jsonl_text_document"

# ja cc 3
TRAIN_DATA_PATH+=" 40987803038 ${DATASET_DIR}/train/ja/cc-3_0000.jsonl_text_document"
TRAIN_DATA_PATH+=" 41333549162 ${DATASET_DIR}/train/ja/cc-3_0001.jsonl_text_document"
TRAIN_DATA_PATH+=" 29810274406 ${DATASET_DIR}/train/ja/cc-3_0002.jsonl_text_document"
TRAIN_DATA_PATH+=" 22787733940 ${DATASET_DIR}/train/ja/cc-3_0003.jsonl_text_document"
TRAIN_DATA_PATH+=" 15544493906 ${DATASET_DIR}/train/ja/cc-3_0004.jsonl_text_document"

# ja kaken
# Each entry appends "<weight> <path prefix>" to TRAIN_DATA_PATH.
TRAIN_DATA_PATH+=" 1826105478 ${DATASET_DIR}/train/ja/kaken_0000.jsonl_text_document"

# ja warp html
TRAIN_DATA_PATH+=" 1329440698 ${DATASET_DIR}/train/ja/warp-html-01-06_0000.jsonl_text_document"
TRAIN_DATA_PATH+=" 1397268214 ${DATASET_DIR}/train/ja/warp-html-07-12_0000.jsonl_text_document"

# ja warp pdf (these shards live under the v3.1.0 corpus root)
TRAIN_DATA_PATH+=" 30149711608 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e00_0000.jsonl_text_document"
TRAIN_DATA_PATH+=" 30023232706 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e00_0001.jsonl_text_document"

# ja warp pdf 0.2
TRAIN_DATA_PATH+=" 15396388677 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0000.jsonl_text_document"
TRAIN_DATA_PATH+=" 13225220331 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0001.jsonl_text_document"
TRAIN_DATA_PATH+=" 12433511477 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0002.jsonl_text_document"
TRAIN_DATA_PATH+=" 14722870558 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0003.jsonl_text_document"
TRAIN_DATA_PATH+=" 14818300138 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0004.jsonl_text_document"
TRAIN_DATA_PATH+=" 14827819309 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0005.jsonl_text_document"
TRAIN_DATA_PATH+=" 13394854115 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0006.jsonl_text_document"
TRAIN_DATA_PATH+=" 14369730518 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0007.jsonl_text_document"
TRAIN_DATA_PATH+=" 14027593174 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0008.jsonl_text_document"
TRAIN_DATA_PATH+=" 14719994730 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0009.jsonl_text_document"
TRAIN_DATA_PATH+=" 9865165774 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0010.jsonl_text_document"
TRAIN_DATA_PATH+=" 14525215128 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0011.jsonl_text_document"
TRAIN_DATA_PATH+=" 10835111330 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0012.jsonl_text_document"

# ja wiki
TRAIN_DATA_PATH+=" 2563804308 ${DATASET_DIR}/train/ja/wiki_0000.jsonl_text_document"

# en dolma books
# Each entry appends "<weight> <path prefix>" to TRAIN_DATA_PATH.
TRAIN_DATA_PATH+=" 5494262694 ${DATASET_DIR}/train/en/dolma-books_0000.jsonl_text_document"

# en dolma c4
TRAIN_DATA_PATH+=" 17052861266 ${DATASET_DIR}/train/en/dolma-c4_0000.jsonl_text_document"
TRAIN_DATA_PATH+=" 17051260422 ${DATASET_DIR}/train/en/dolma-c4_0001.jsonl_text_document"
TRAIN_DATA_PATH+=" 17056648148 ${DATASET_DIR}/train/en/dolma-c4_0002.jsonl_text_document"
TRAIN_DATA_PATH+=" 17057773049 ${DATASET_DIR}/train/en/dolma-c4_0003.jsonl_text_document"
TRAIN_DATA_PATH+=" 17047888113 ${DATASET_DIR}/train/en/dolma-c4_0004.jsonl_text_document"
TRAIN_DATA_PATH+=" 17046511755 ${DATASET_DIR}/train/en/dolma-c4_0005.jsonl_text_document"
TRAIN_DATA_PATH+=" 17058086815 ${DATASET_DIR}/train/en/dolma-c4_0006.jsonl_text_document"
TRAIN_DATA_PATH+=" 17049490900 ${DATASET_DIR}/train/en/dolma-c4_0007.jsonl_text_document"
TRAIN_DATA_PATH+=" 17051009552 ${DATASET_DIR}/train/en/dolma-c4_0008.jsonl_text_document"
TRAIN_DATA_PATH+=" 14932405246 ${DATASET_DIR}/train/en/dolma-c4_0009.jsonl_text_document"
TRAIN_DATA_PATH+=" 13142696712 ${DATASET_DIR}/train/en/dolma-c4_0010.jsonl_text_document"

# en dolma cc
# Each entry appends "<weight> <path prefix>" to TRAIN_DATA_PATH.
TRAIN_DATA_PATH+=" 15473522696 ${DATASET_DIR}/train/en/dolma-cc-head_0000.jsonl_text_document"
TRAIN_DATA_PATH+=" 15767913273 ${DATASET_DIR}/train/en/dolma-cc-head_0001.jsonl_text_document"
TRAIN_DATA_PATH+=" 16664785078 ${DATASET_DIR}/train/en/dolma-cc-head_0002.jsonl_text_document"
TRAIN_DATA_PATH+=" 16860035920 ${DATASET_DIR}/train/en/dolma-cc-head_0003.jsonl_text_document"
TRAIN_DATA_PATH+=" 17197613512 ${DATASET_DIR}/train/en/dolma-cc-head_0004.jsonl_text_document"
TRAIN_DATA_PATH+=" 16363353173 ${DATASET_DIR}/train/en/dolma-cc-head_0005.jsonl_text_document"
TRAIN_DATA_PATH+=" 15303692924 ${DATASET_DIR}/train/en/dolma-cc-head_0006.jsonl_text_document"
TRAIN_DATA_PATH+=" 15766283829 ${DATASET_DIR}/train/en/dolma-cc-head_0007.jsonl_text_document"
TRAIN_DATA_PATH+=" 13483997219 ${DATASET_DIR}/train/en/dolma-cc-head_0008.jsonl_text_document"
TRAIN_DATA_PATH+=" 12561851173 ${DATASET_DIR}/train/en/dolma-cc-head_0009.jsonl_text_document"
TRAIN_DATA_PATH+=" 14206017429 ${DATASET_DIR}/train/en/dolma-cc-head_0010.jsonl_text_document"
TRAIN_DATA_PATH+=" 18455249471 ${DATASET_DIR}/train/en/dolma-cc-head_0011.jsonl_text_document"
TRAIN_DATA_PATH+=" 18359243399 ${DATASET_DIR}/train/en/dolma-cc-head_0012.jsonl_text_document"
TRAIN_DATA_PATH+=" 16268609444 ${DATASET_DIR}/train/en/dolma-cc-head_0013.jsonl_text_document"
TRAIN_DATA_PATH+=" 15209913539 ${DATASET_DIR}/train/en/dolma-cc-head_0014.jsonl_text_document"
TRAIN_DATA_PATH+=" 15601099503 ${DATASET_DIR}/train/en/dolma-cc-head_0015.jsonl_text_document"
TRAIN_DATA_PATH+=" 16354139164 ${DATASET_DIR}/train/en/dolma-cc-head_0016.jsonl_text_document"
TRAIN_DATA_PATH+=" 19563123039 ${DATASET_DIR}/train/en/dolma-cc-head_0017.jsonl_text_document"
TRAIN_DATA_PATH+=" 17794386584 ${DATASET_DIR}/train/en/dolma-cc-head_0018.jsonl_text_document"
TRAIN_DATA_PATH+=" 17974377563 ${DATASET_DIR}/train/en/dolma-cc-head_0019.jsonl_text_document"
TRAIN_DATA_PATH+=" 19152181306 ${DATASET_DIR}/train/en/dolma-cc-head_0020.jsonl_text_document"
TRAIN_DATA_PATH+=" 16841018460 ${DATASET_DIR}/train/en/dolma-cc-head_0021.jsonl_text_document"
TRAIN_DATA_PATH+=" 15622566364 ${DATASET_DIR}/train/en/dolma-cc-head_0022.jsonl_text_document"
TRAIN_DATA_PATH+=" 14998264524 ${DATASET_DIR}/train/en/dolma-cc-head_0023.jsonl_text_document"
TRAIN_DATA_PATH+=" 19994706100 ${DATASET_DIR}/train/en/dolma-cc-head_0024.jsonl_text_document"
TRAIN_DATA_PATH+=" 19266785326 ${DATASET_DIR}/train/en/dolma-cc-head_0025.jsonl_text_document"
TRAIN_DATA_PATH+=" 17797970694 ${DATASET_DIR}/train/en/dolma-cc-head_0026.jsonl_text_document"
TRAIN_DATA_PATH+=" 18662607705 ${DATASET_DIR}/train/en/dolma-cc-head_0027.jsonl_text_document"
TRAIN_DATA_PATH+=" 18428148263 ${DATASET_DIR}/train/en/dolma-cc-head_0028.jsonl_text_document"
TRAIN_DATA_PATH+=" 19152709797 ${DATASET_DIR}/train/en/dolma-cc-head_0029.jsonl_text_document"
TRAIN_DATA_PATH+=" 19567672702 ${DATASET_DIR}/train/en/dolma-cc-head_0030.jsonl_text_document"
TRAIN_DATA_PATH+=" 15453203385 ${DATASET_DIR}/train/en/dolma-cc-head_0031.jsonl_text_document"
TRAIN_DATA_PATH+=" 16946844380 ${DATASET_DIR}/train/en/dolma-cc-head_0032.jsonl_text_document"
TRAIN_DATA_PATH+=" 16719501611 ${DATASET_DIR}/train/en/dolma-cc-head_0033.jsonl_text_document"
TRAIN_DATA_PATH+=" 16348054343 ${DATASET_DIR}/train/en/dolma-cc-head_0034.jsonl_text_document"
TRAIN_DATA_PATH+=" 18292316049 ${DATASET_DIR}/train/en/dolma-cc-head_0035.jsonl_text_document"

# en dolma science paper
# Each entry appends "<weight> <path prefix>" to TRAIN_DATA_PATH.
TRAIN_DATA_PATH+=" 8089227423 ${DATASET_DIR}/train/en/dolma-pes2o_0000.jsonl_text_document"
TRAIN_DATA_PATH+=" 20185217235 ${DATASET_DIR}/train/en/dolma-pes2o_0001.jsonl_text_document"
TRAIN_DATA_PATH+=" 18622836173 ${DATASET_DIR}/train/en/dolma-pes2o_0002.jsonl_text_document"
TRAIN_DATA_PATH+=" 15956491971 ${DATASET_DIR}/train/en/dolma-pes2o_0003.jsonl_text_document"

# en dolma reddit
TRAIN_DATA_PATH+=" 17412289508 ${DATASET_DIR}/train/en/dolma-reddit_0000.jsonl_text_document"
TRAIN_DATA_PATH+=" 17315996345 ${DATASET_DIR}/train/en/dolma-reddit_0001.jsonl_text_document"
TRAIN_DATA_PATH+=" 17095921975 ${DATASET_DIR}/train/en/dolma-reddit_0002.jsonl_text_document"
TRAIN_DATA_PATH+=" 15808400388 ${DATASET_DIR}/train/en/dolma-reddit_0003.jsonl_text_document"
TRAIN_DATA_PATH+=" 15425532535 ${DATASET_DIR}/train/en/dolma-reddit_0004.jsonl_text_document"

# en dolma wiki
TRAIN_DATA_PATH+=" 3896965449 ${DATASET_DIR}/train/en/dolma-wiki_0000.jsonl_text_document"

# en wiki
TRAIN_DATA_PATH+=" 4744259830 ${DATASET_DIR}/train/en/wiki_0000.jsonl_text_document"

# zh wiki
# Each entry appends "<weight> <path prefix>" to TRAIN_DATA_PATH.
TRAIN_DATA_PATH+=" 840277331 ${DATASET_DIR}/train/zh/wiki_0000.jsonl_text_document"

# ko wiki
TRAIN_DATA_PATH+=" 316296219 ${DATASET_DIR}/train/ko/wiki_0000.jsonl_text_document"

# job name
# Weights & Biases identifiers passed to the --wandb-* options of the launcher.
WANDB_ENTITY="llm-jp"
WANDB_PROJECT="nii-geniac-1.7B"
WANDB_JOB="llama-2-1.7b-exp2-cpt-2epoch"

# run
# Launch Megatron-LM's pretrain_gpt.py with the settings defined in this script.
export NVTE_FUSED_ATTN=0

# TRAIN_DATA_PATH is intentionally left unquoted: it holds a whitespace-separated
# list of "<weight> <path prefix>" pairs that must reach --data-path as separate
# arguments. The final option carries no trailing backslash so that nothing
# following the command can be spliced into its argument list.
# shellcheck disable=SC2086
python "${ENV_DIR}/src/Megatron-LM/pretrain_gpt.py" \
  --tensor-model-parallel-size "${TENSOR_PARALLEL_SIZE}" \
  --pipeline-model-parallel-size "${PIPELINE_PARALLEL_SIZE}" \
  --context-parallel-size "${CONTEXT_PARALLEL_SIZE}" \
  --sequence-parallel \
  --use-distributed-optimizer \
  --num-layers "${NUM_LAYERS}" \
  --hidden-size "${HIDDEN_SIZE}" \
  --ffn-hidden-size "${FFN_HIDDEN_SIZE}" \
  --num-attention-heads "${NUM_HEADS}" \
  --seq-length "${SEQ_LENGTH}" \
  --max-position-embeddings "${SEQ_LENGTH}" \
  --micro-batch-size "${MICRO_BATCH_SIZE}" \
  --global-batch-size "${GLOBAL_BATCH_SIZE}" \
  --train-iters "${TRAIN_STEPS}" \
  --tokenizer-type Llama2Tokenizer \
  --tokenizer-model "${TOKENIZER_MODEL}" \
  --load "${CHECKPOINT_LOAD_DIR}" \
  --save "${CHECKPOINT_SAVE_DIR}" \
  --data-path ${TRAIN_DATA_PATH} \
  --split 1,0,0 \
  --data-cache-path "${CACHE_DIR}" \
  --distributed-backend nccl \
  --init-method-std 0.02 \
  --lr "${LR}" \
  --min-lr "${MIN_LR}" \
  --override-opt_param-scheduler \
  --lr-decay-style cosine \
  --lr-decay-iters "${LR_DECAY_ITERS}" \
  --weight-decay "${WEIGHT_DECAY}" \
  --clip-grad "${GRAD_CLIP}" \
  --lr-warmup-iters "${LR_WARMUP_STEPS}" \
  --optimizer adam \
  --adam-beta1 0.9 \
  --adam-beta2 0.95 \
  --adam-eps 1e-8 \
  --log-interval 1 \
  --eval-interval "${TRAIN_STEPS}" \
  --eval-iters 0 \
  --bf16 \
  --untie-embeddings-and-output-weights \
  --position-embedding-type rope \
  --disable-bias-linear \
  --use-mcore-models \
  --normalization RMSNorm \
  --norm-epsilon 1e-5 \
  --no-masked-softmax-fusion \
  --attention-dropout 0.0 \
  --hidden-dropout 0.0 \
  --swiglu \
  --use-flash-attn \
  --recompute-activations \
  --recompute-granularity "selective" \
  --attention-softmax-in-fp32 \
  --transformer-impl "transformer_engine" \
  --use-mpi \
  --use-z-loss \
  --log-throughput \
  --wandb-entity "${WANDB_ENTITY}" \
  --wandb-project "${WANDB_PROJECT}" \
  --wandb-name "${WANDB_JOB}"