Skip to content

Commit ab953b1

Browse files
committed
add training scripts for v3-1.7b-exp2-cpt-2epoch
1 parent eb0702b commit ab953b1

File tree

2 files changed

+331
-0
lines changed

2 files changed

+331
-0
lines changed
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,41 @@
#!/bin/bash
#SBATCH --job-name=0030_train
#SBATCH --partition=gpu-small
#SBATCH --nodes=4
#SBATCH --gpus-per-node=8
#SBATCH --ntasks-per-node=8
#SBATCH --output=outputs/%x-%j.out
#SBATCH --error=outputs/%x-%j.err

# Slurm batch entry point for experiment 0030 (v3-1.7b-exp2-cpt-2epoch).
#
# Activates the experiment environment, derives the rendezvous address/port
# and the process layout from the Slurm allocation, then starts one train.sh
# process per task through mpirun.
#
# Reads:   SLURM_JOB_NODELIST, SLURM_JOBID, SLURM_JOB_NUM_NODES,
#          SLURM_TASKS_PER_NODE
# Exports: MASTER_ADDR, MASTER_PORT
# Passes to every rank (mpirun -x): EXPERIMENT_DIR, MASTER_ADDR, MASTER_PORT,
#          NUM_NODES, NUM_GPUS_PER_NODE

set -eu -o pipefail

EXPERIMENT_DIR=/home/shared/experiments/0030_v3-1.7b-exp2-cpt-2epoch
ENV_DIR=${EXPERIMENT_DIR}/environment

source "${ENV_DIR}/scripts/environment.sh"
source "${ENV_DIR}/venv/bin/activate"

# Assign first and export afterwards: `export VAR=$(cmd)` returns the status of
# `export`, so a failing scontrol would otherwise go unnoticed under `set -e`.
MASTER_ADDR=$(scontrol show hostname "${SLURM_JOB_NODELIST}" | head -n1)
# Port is derived from the job id and always lands in 10000..59999.
MASTER_PORT=$((10000 + (${SLURM_JOBID} % 50000)))
export MASTER_ADDR MASTER_PORT

echo "MASTER_ADDR=${MASTER_ADDR}"

NUM_NODES=${SLURM_JOB_NUM_NODES}
# Keep only the text before the first "(" of SLURM_TASKS_PER_NODE
# (e.g. "8(x4)" -> "8"); a value without "(" is used unchanged.
NUM_GPUS_PER_NODE=${SLURM_TASKS_PER_NODE%%"("*}
NUM_GPUS=$((NUM_NODES * NUM_GPUS_PER_NODE))

echo "NUM_NODES=${NUM_NODES}"
echo "NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE}"
echo "NUM_GPUS=${NUM_GPUS}"

# The train.sh path is relative, so it is resolved against the job's working
# directory.
mpirun \
  -np "${NUM_GPUS}" \
  --npernode "${NUM_GPUS_PER_NODE}" \
  -bind-to none \
  -map-by slot \
  -x EXPERIMENT_DIR="${EXPERIMENT_DIR}" \
  -x MASTER_ADDR="${MASTER_ADDR}" \
  -x MASTER_PORT="${MASTER_PORT}" \
  -x NUM_NODES="${NUM_NODES}" \
  -x NUM_GPUS_PER_NODE="${NUM_GPUS_PER_NODE}" \
  bash scripts/pretrain/scripts/v3-1.7b-exp2-cpt-2epoch-sakura/train.sh
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,290 @@
#!/bin/bash

# Training launcher executed once per rank.
#
# For details about the model, see:
# https://github.com/llm-jp/model-cards/pull/23
#
# Variables that must already be present in the environment:
#   EXPERIMENT_DIR     - experiment root (set by sbatch)
#   NUM_NODES          - number of nodes taking part in the run
#   NUM_GPUS_PER_NODE  - number of processes/GPUs per node

set -eu -o pipefail

# Stop immediately, with a readable message, when the launcher did not
# provide one of the required variables (unset or empty).
: "${EXPERIMENT_DIR:?EXPERIMENT_DIR must be set by the sbatch launcher}"
: "${NUM_NODES:?NUM_NODES must be set by the sbatch launcher}"
: "${NUM_GPUS_PER_NODE:?NUM_GPUS_PER_NODE must be set by the sbatch launcher}"

ENV_DIR=${EXPERIMENT_DIR}/environment
CACHE_DIR=${EXPERIMENT_DIR}/cache

source "${ENV_DIR}/scripts/environment.sh"
source "${ENV_DIR}/scripts/mpi_variables.sh"
source "${ENV_DIR}/venv/bin/activate"

# open file limit
# NOTE(review): two operands are passed here, while bash documents `ulimit`
# as taking a single limit. Confirm whether separate soft/hard limits
# (-S / -H) were intended.
ulimit -n 65536 1048576

# Logging / diagnostics switches for the Python, NCCL, CUDA and cuDNN layers.
export LOGLEVEL=INFO
export NCCL_DEBUG=WARN
# NOTE(review): NCCL_DEBUG_SUBSYS normally takes subsystem names; confirm that
# WARN is a meaningful value for it.
export NCCL_DEBUG_SUBSYS=WARN
export PYTHONFAULTHANDLER=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export CUDA_LAUNCH_BLOCKING=0
export CUDNN_LOGDEST_DBG=stderr
export CUDNN_LOGERR_DBG=1

# Total number of processes across the whole job.
NUM_GPUS=$((NUM_NODES * NUM_GPUS_PER_NODE))

# --- model architecture -------------------------------------------------
# Values forwarded unchanged to the matching pretrain_gpt.py options.
NUM_LAYERS=24
NUM_HEADS=16
HIDDEN_SIZE=2048
FFN_HIDDEN_SIZE=7168
SEQ_LENGTH=4096

# --- parallelism layout -------------------------------------------------
TENSOR_PARALLEL_SIZE=1
PIPELINE_PARALLEL_SIZE=1
CONTEXT_PARALLEL_SIZE=1
# Processes remaining per model replica once tensor and pipeline parallel
# groups are carved out of NUM_GPUS (integer division).
DATA_PARALLEL_SIZE=$((${NUM_GPUS} / ${TENSOR_PARALLEL_SIZE} / ${PIPELINE_PARALLEL_SIZE}))

# training config
MICRO_BATCH_SIZE=8
GLOBAL_BATCH_SIZE=512

LR=3e-4
MIN_LR=3e-5
WEIGHT_DECAY=0.1
GRAD_CLIP=1

# total number of iterations
# 2072488058295 (number of tokens) / 4096 (seq len) / 512 (batch size) = 988239.316127 -> 988240
# 988240 + 988240 = 1976480
#
# NOTE(review): the expression below evaluates to (2000 + 988240) * 2 = 1980480,
# i.e. 4000 iterations more than the 1976480 derived above. Confirm whether
# LR_DECAY_ITERS or the comment is the intended source of truth.
LR_WARMUP_STEPS=2000
LR_DECAY_ITERS=988240
# The arithmetic expansion must be closed with "))"; the previous form was one
# parenthesis short, which made the whole script fail to parse.
TRAIN_STEPS=$(( (LR_WARMUP_STEPS + LR_DECAY_ITERS) * 2 ))

# tokenizer
TOKENIZER_MODEL=${ENV_DIR}/src/llm-jp-tokenizer/models/ver3.0/llm-jp-tokenizer-100k.ver3.0b1.model

# checkpoints: the same directory is used for loading and for saving
CHECKPOINT_ROOT=${EXPERIMENT_DIR}/checkpoints
CHECKPOINT_LOAD_DIR=${CHECKPOINT_ROOT}
CHECKPOINT_SAVE_DIR=${CHECKPOINT_ROOT}

# Quoted so that a path containing whitespace stays a single argument.
mkdir -p -- "${CHECKPOINT_SAVE_DIR}"

# data config
#
# TRAIN_DATA_PATH is one whitespace-separated string of alternating
# "<number> <dataset path>" entries. It is later expanded unquoted so that
# every number and every path becomes its own argument of --data-path.
DATASET_DIR=/home/shared/corpus/llm-jp-corpus/v3.0.0/training_resharded_tokenize_ver3.0
DATASET_V3_1_DIR=/home/shared/corpus/llm-jp-corpus/v3.1.0/tokenize/v3.0b1

TRAIN_DATA_PATH=""

# code stack
TRAIN_DATA_PATH+=" 14486363187 ${DATASET_DIR}/train/code/stack_0000.jsonl_text_document"
TRAIN_DATA_PATH+=" 12799385151 ${DATASET_DIR}/train/code/stack_0001.jsonl_text_document"
TRAIN_DATA_PATH+=" 17282923545 ${DATASET_DIR}/train/code/stack_0002.jsonl_text_document"
TRAIN_DATA_PATH+=" 8861329235 ${DATASET_DIR}/train/code/stack_0003.jsonl_text_document"
TRAIN_DATA_PATH+=" 6713413649 ${DATASET_DIR}/train/code/stack_0004.jsonl_text_document"
TRAIN_DATA_PATH+=" 8976432285 ${DATASET_DIR}/train/code/stack_0005.jsonl_text_document"
TRAIN_DATA_PATH+=" 17961273649 ${DATASET_DIR}/train/code/stack_0006.jsonl_text_document"
TRAIN_DATA_PATH+=" 12016948303 ${DATASET_DIR}/train/code/stack_0007.jsonl_text_document"
TRAIN_DATA_PATH+=" 14953094719 ${DATASET_DIR}/train/code/stack_0008.jsonl_text_document"

# ja cc 1
TRAIN_DATA_PATH+=" 23783124862 ${DATASET_DIR}/train/ja/cc-1_0000.jsonl_text_document"
TRAIN_DATA_PATH+=" 36378129564 ${DATASET_DIR}/train/ja/cc-1_0001.jsonl_text_document"
TRAIN_DATA_PATH+=" 35477545812 ${DATASET_DIR}/train/ja/cc-1_0002.jsonl_text_document"
TRAIN_DATA_PATH+=" 35917231868 ${DATASET_DIR}/train/ja/cc-1_0003.jsonl_text_document"
TRAIN_DATA_PATH+=" 46203062776 ${DATASET_DIR}/train/ja/cc-1_0004.jsonl_text_document"
TRAIN_DATA_PATH+=" 40396278536 ${DATASET_DIR}/train/ja/cc-1_0005.jsonl_text_document"
TRAIN_DATA_PATH+=" 33444216206 ${DATASET_DIR}/train/ja/cc-1_0006.jsonl_text_document"
TRAIN_DATA_PATH+=" 32375495374 ${DATASET_DIR}/train/ja/cc-1_0007.jsonl_text_document"
TRAIN_DATA_PATH+=" 36068919622 ${DATASET_DIR}/train/ja/cc-1_0008.jsonl_text_document"
TRAIN_DATA_PATH+=" 26274952324 ${DATASET_DIR}/train/ja/cc-1_0009.jsonl_text_document"
TRAIN_DATA_PATH+=" 24024422756 ${DATASET_DIR}/train/ja/cc-1_0010.jsonl_text_document"
TRAIN_DATA_PATH+=" 34590145510 ${DATASET_DIR}/train/ja/cc-1_0011.jsonl_text_document"
TRAIN_DATA_PATH+=" 29567301906 ${DATASET_DIR}/train/ja/cc-1_0012.jsonl_text_document"
TRAIN_DATA_PATH+=" 26690562242 ${DATASET_DIR}/train/ja/cc-1_0013.jsonl_text_document"

# ja cc 2
TRAIN_DATA_PATH+=" 35813749376 ${DATASET_DIR}/train/ja/cc-2_0000.jsonl_text_document"
TRAIN_DATA_PATH+=" 40034668924 ${DATASET_DIR}/train/ja/cc-2_0001.jsonl_text_document"
TRAIN_DATA_PATH+=" 31191828858 ${DATASET_DIR}/train/ja/cc-2_0002.jsonl_text_document"
TRAIN_DATA_PATH+=" 25086109508 ${DATASET_DIR}/train/ja/cc-2_0003.jsonl_text_document"
TRAIN_DATA_PATH+=" 18979589830 ${DATASET_DIR}/train/ja/cc-2_0004.jsonl_text_document"

# ja cc 3
TRAIN_DATA_PATH+=" 40987803038 ${DATASET_DIR}/train/ja/cc-3_0000.jsonl_text_document"
TRAIN_DATA_PATH+=" 41333549162 ${DATASET_DIR}/train/ja/cc-3_0001.jsonl_text_document"
TRAIN_DATA_PATH+=" 29810274406 ${DATASET_DIR}/train/ja/cc-3_0002.jsonl_text_document"
TRAIN_DATA_PATH+=" 22787733940 ${DATASET_DIR}/train/ja/cc-3_0003.jsonl_text_document"
TRAIN_DATA_PATH+=" 15544493906 ${DATASET_DIR}/train/ja/cc-3_0004.jsonl_text_document"

# ja kaken
TRAIN_DATA_PATH+=" 1826105478 ${DATASET_DIR}/train/ja/kaken_0000.jsonl_text_document"

# ja warp html
TRAIN_DATA_PATH+=" 1329440698 ${DATASET_DIR}/train/ja/warp-html-01-06_0000.jsonl_text_document"
TRAIN_DATA_PATH+=" 1397268214 ${DATASET_DIR}/train/ja/warp-html-07-12_0000.jsonl_text_document"

# ja warp pdf
TRAIN_DATA_PATH+=" 30149711608 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e00_0000.jsonl_text_document"
TRAIN_DATA_PATH+=" 30023232706 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e00_0001.jsonl_text_document"

# ja warp pdf 0.2
TRAIN_DATA_PATH+=" 15396388677 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0000.jsonl_text_document"
TRAIN_DATA_PATH+=" 13225220331 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0001.jsonl_text_document"
TRAIN_DATA_PATH+=" 12433511477 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0002.jsonl_text_document"
TRAIN_DATA_PATH+=" 14722870558 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0003.jsonl_text_document"
TRAIN_DATA_PATH+=" 14818300138 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0004.jsonl_text_document"
TRAIN_DATA_PATH+=" 14827819309 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0005.jsonl_text_document"
TRAIN_DATA_PATH+=" 13394854115 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0006.jsonl_text_document"
TRAIN_DATA_PATH+=" 14369730518 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0007.jsonl_text_document"
TRAIN_DATA_PATH+=" 14027593174 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0008.jsonl_text_document"
TRAIN_DATA_PATH+=" 14719994730 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0009.jsonl_text_document"
TRAIN_DATA_PATH+=" 9865165774 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0010.jsonl_text_document"
TRAIN_DATA_PATH+=" 14525215128 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0011.jsonl_text_document"
TRAIN_DATA_PATH+=" 10835111330 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0012.jsonl_text_document"

# ja wiki
TRAIN_DATA_PATH+=" 2563804308 ${DATASET_DIR}/train/ja/wiki_0000.jsonl_text_document"

# en dolma books
TRAIN_DATA_PATH+=" 5494262694 ${DATASET_DIR}/train/en/dolma-books_0000.jsonl_text_document"

# en dolma c4
TRAIN_DATA_PATH+=" 17052861266 ${DATASET_DIR}/train/en/dolma-c4_0000.jsonl_text_document"
TRAIN_DATA_PATH+=" 17051260422 ${DATASET_DIR}/train/en/dolma-c4_0001.jsonl_text_document"
TRAIN_DATA_PATH+=" 17056648148 ${DATASET_DIR}/train/en/dolma-c4_0002.jsonl_text_document"
TRAIN_DATA_PATH+=" 17057773049 ${DATASET_DIR}/train/en/dolma-c4_0003.jsonl_text_document"
TRAIN_DATA_PATH+=" 17047888113 ${DATASET_DIR}/train/en/dolma-c4_0004.jsonl_text_document"
TRAIN_DATA_PATH+=" 17046511755 ${DATASET_DIR}/train/en/dolma-c4_0005.jsonl_text_document"
TRAIN_DATA_PATH+=" 17058086815 ${DATASET_DIR}/train/en/dolma-c4_0006.jsonl_text_document"
TRAIN_DATA_PATH+=" 17049490900 ${DATASET_DIR}/train/en/dolma-c4_0007.jsonl_text_document"
TRAIN_DATA_PATH+=" 17051009552 ${DATASET_DIR}/train/en/dolma-c4_0008.jsonl_text_document"
TRAIN_DATA_PATH+=" 14932405246 ${DATASET_DIR}/train/en/dolma-c4_0009.jsonl_text_document"
TRAIN_DATA_PATH+=" 13142696712 ${DATASET_DIR}/train/en/dolma-c4_0010.jsonl_text_document"

# en dolma cc
TRAIN_DATA_PATH+=" 15473522696 ${DATASET_DIR}/train/en/dolma-cc-head_0000.jsonl_text_document"
TRAIN_DATA_PATH+=" 15767913273 ${DATASET_DIR}/train/en/dolma-cc-head_0001.jsonl_text_document"
TRAIN_DATA_PATH+=" 16664785078 ${DATASET_DIR}/train/en/dolma-cc-head_0002.jsonl_text_document"
TRAIN_DATA_PATH+=" 16860035920 ${DATASET_DIR}/train/en/dolma-cc-head_0003.jsonl_text_document"
TRAIN_DATA_PATH+=" 17197613512 ${DATASET_DIR}/train/en/dolma-cc-head_0004.jsonl_text_document"
TRAIN_DATA_PATH+=" 16363353173 ${DATASET_DIR}/train/en/dolma-cc-head_0005.jsonl_text_document"
TRAIN_DATA_PATH+=" 15303692924 ${DATASET_DIR}/train/en/dolma-cc-head_0006.jsonl_text_document"
TRAIN_DATA_PATH+=" 15766283829 ${DATASET_DIR}/train/en/dolma-cc-head_0007.jsonl_text_document"
TRAIN_DATA_PATH+=" 13483997219 ${DATASET_DIR}/train/en/dolma-cc-head_0008.jsonl_text_document"
TRAIN_DATA_PATH+=" 12561851173 ${DATASET_DIR}/train/en/dolma-cc-head_0009.jsonl_text_document"
TRAIN_DATA_PATH+=" 14206017429 ${DATASET_DIR}/train/en/dolma-cc-head_0010.jsonl_text_document"
TRAIN_DATA_PATH+=" 18455249471 ${DATASET_DIR}/train/en/dolma-cc-head_0011.jsonl_text_document"
TRAIN_DATA_PATH+=" 18359243399 ${DATASET_DIR}/train/en/dolma-cc-head_0012.jsonl_text_document"
TRAIN_DATA_PATH+=" 16268609444 ${DATASET_DIR}/train/en/dolma-cc-head_0013.jsonl_text_document"
TRAIN_DATA_PATH+=" 15209913539 ${DATASET_DIR}/train/en/dolma-cc-head_0014.jsonl_text_document"
TRAIN_DATA_PATH+=" 15601099503 ${DATASET_DIR}/train/en/dolma-cc-head_0015.jsonl_text_document"
TRAIN_DATA_PATH+=" 16354139164 ${DATASET_DIR}/train/en/dolma-cc-head_0016.jsonl_text_document"
TRAIN_DATA_PATH+=" 19563123039 ${DATASET_DIR}/train/en/dolma-cc-head_0017.jsonl_text_document"
TRAIN_DATA_PATH+=" 17794386584 ${DATASET_DIR}/train/en/dolma-cc-head_0018.jsonl_text_document"
TRAIN_DATA_PATH+=" 17974377563 ${DATASET_DIR}/train/en/dolma-cc-head_0019.jsonl_text_document"
TRAIN_DATA_PATH+=" 19152181306 ${DATASET_DIR}/train/en/dolma-cc-head_0020.jsonl_text_document"
TRAIN_DATA_PATH+=" 16841018460 ${DATASET_DIR}/train/en/dolma-cc-head_0021.jsonl_text_document"
TRAIN_DATA_PATH+=" 15622566364 ${DATASET_DIR}/train/en/dolma-cc-head_0022.jsonl_text_document"
TRAIN_DATA_PATH+=" 14998264524 ${DATASET_DIR}/train/en/dolma-cc-head_0023.jsonl_text_document"
TRAIN_DATA_PATH+=" 19994706100 ${DATASET_DIR}/train/en/dolma-cc-head_0024.jsonl_text_document"
TRAIN_DATA_PATH+=" 19266785326 ${DATASET_DIR}/train/en/dolma-cc-head_0025.jsonl_text_document"
TRAIN_DATA_PATH+=" 17797970694 ${DATASET_DIR}/train/en/dolma-cc-head_0026.jsonl_text_document"
TRAIN_DATA_PATH+=" 18662607705 ${DATASET_DIR}/train/en/dolma-cc-head_0027.jsonl_text_document"
TRAIN_DATA_PATH+=" 18428148263 ${DATASET_DIR}/train/en/dolma-cc-head_0028.jsonl_text_document"
TRAIN_DATA_PATH+=" 19152709797 ${DATASET_DIR}/train/en/dolma-cc-head_0029.jsonl_text_document"
TRAIN_DATA_PATH+=" 19567672702 ${DATASET_DIR}/train/en/dolma-cc-head_0030.jsonl_text_document"
TRAIN_DATA_PATH+=" 15453203385 ${DATASET_DIR}/train/en/dolma-cc-head_0031.jsonl_text_document"
TRAIN_DATA_PATH+=" 16946844380 ${DATASET_DIR}/train/en/dolma-cc-head_0032.jsonl_text_document"
TRAIN_DATA_PATH+=" 16719501611 ${DATASET_DIR}/train/en/dolma-cc-head_0033.jsonl_text_document"
TRAIN_DATA_PATH+=" 16348054343 ${DATASET_DIR}/train/en/dolma-cc-head_0034.jsonl_text_document"
TRAIN_DATA_PATH+=" 18292316049 ${DATASET_DIR}/train/en/dolma-cc-head_0035.jsonl_text_document"

# en dolma science paper
TRAIN_DATA_PATH+=" 8089227423 ${DATASET_DIR}/train/en/dolma-pes2o_0000.jsonl_text_document"
TRAIN_DATA_PATH+=" 20185217235 ${DATASET_DIR}/train/en/dolma-pes2o_0001.jsonl_text_document"
TRAIN_DATA_PATH+=" 18622836173 ${DATASET_DIR}/train/en/dolma-pes2o_0002.jsonl_text_document"
TRAIN_DATA_PATH+=" 15956491971 ${DATASET_DIR}/train/en/dolma-pes2o_0003.jsonl_text_document"

# en dolma reddit
TRAIN_DATA_PATH+=" 17412289508 ${DATASET_DIR}/train/en/dolma-reddit_0000.jsonl_text_document"
TRAIN_DATA_PATH+=" 17315996345 ${DATASET_DIR}/train/en/dolma-reddit_0001.jsonl_text_document"
TRAIN_DATA_PATH+=" 17095921975 ${DATASET_DIR}/train/en/dolma-reddit_0002.jsonl_text_document"
TRAIN_DATA_PATH+=" 15808400388 ${DATASET_DIR}/train/en/dolma-reddit_0003.jsonl_text_document"
TRAIN_DATA_PATH+=" 15425532535 ${DATASET_DIR}/train/en/dolma-reddit_0004.jsonl_text_document"

# en dolma wiki
TRAIN_DATA_PATH+=" 3896965449 ${DATASET_DIR}/train/en/dolma-wiki_0000.jsonl_text_document"

# en wiki
TRAIN_DATA_PATH+=" 4744259830 ${DATASET_DIR}/train/en/wiki_0000.jsonl_text_document"

# zh wiki
TRAIN_DATA_PATH+=" 840277331 ${DATASET_DIR}/train/zh/wiki_0000.jsonl_text_document"

# ko wiki
TRAIN_DATA_PATH+=" 316296219 ${DATASET_DIR}/train/ko/wiki_0000.jsonl_text_document"

# job name
WANDB_ENTITY="llm-jp"
WANDB_PROJECT="nii-geniac-1.7B"
WANDB_JOB="llama-2-1.7b-exp2-cpt-2epoch"

# run
export NVTE_FUSED_ATTN=0

# The options are collected in an array instead of one long backslash-continued
# command: every scalar value can be quoted, comments can sit next to the
# option they explain, and there is no trailing "\" that would silently absorb
# whatever line follows the command.
megatron_args=(
  --tensor-model-parallel-size "${TENSOR_PARALLEL_SIZE}"
  --pipeline-model-parallel-size "${PIPELINE_PARALLEL_SIZE}"
  --context-parallel-size "${CONTEXT_PARALLEL_SIZE}"
  --sequence-parallel
  --use-distributed-optimizer
  --num-layers "${NUM_LAYERS}"
  --hidden-size "${HIDDEN_SIZE}"
  --ffn-hidden-size "${FFN_HIDDEN_SIZE}"
  --num-attention-heads "${NUM_HEADS}"
  --seq-length "${SEQ_LENGTH}"
  --max-position-embeddings "${SEQ_LENGTH}"
  --micro-batch-size "${MICRO_BATCH_SIZE}"
  --global-batch-size "${GLOBAL_BATCH_SIZE}"
  --train-iters "${TRAIN_STEPS}"
  --tokenizer-type Llama2Tokenizer
  --tokenizer-model "${TOKENIZER_MODEL}"
  --load "${CHECKPOINT_LOAD_DIR}"
  --save "${CHECKPOINT_SAVE_DIR}"
  # TRAIN_DATA_PATH is a whitespace-separated list of alternating numbers and
  # dataset paths; it is left unquoted on purpose so that each item becomes a
  # separate argument.
  # shellcheck disable=SC2086
  --data-path ${TRAIN_DATA_PATH}
  --split 1,0,0
  --data-cache-path "${CACHE_DIR}"
  --distributed-backend nccl
  --init-method-std 0.02
  --lr "${LR}"
  --min-lr "${MIN_LR}"
  --override-opt_param-scheduler
  --lr-decay-style cosine
  --lr-decay-iters "${LR_DECAY_ITERS}"
  --weight-decay "${WEIGHT_DECAY}"
  --clip-grad "${GRAD_CLIP}"
  --lr-warmup-iters "${LR_WARMUP_STEPS}"
  --optimizer adam
  --adam-beta1 0.9
  --adam-beta2 0.95
  --adam-eps 1e-8
  --log-interval 1
  --eval-interval "${TRAIN_STEPS}"
  --eval-iters 0
  --bf16
  --untie-embeddings-and-output-weights
  --position-embedding-type rope
  --disable-bias-linear
  --use-mcore-models
  --normalization RMSNorm
  --norm-epsilon 1e-5
  --no-masked-softmax-fusion
  --attention-dropout 0.0
  --hidden-dropout 0.0
  --swiglu
  --use-flash-attn
  --recompute-activations
  --recompute-granularity "selective"
  --attention-softmax-in-fp32
  --transformer-impl "transformer_engine"
  --use-mpi
  --use-z-loss
  --log-throughput
  --wandb-entity "${WANDB_ENTITY}"
  --wandb-project "${WANDB_PROJECT}"
  --wandb-name "${WANDB_JOB}"
)

python "${ENV_DIR}/src/Megatron-LM/pretrain_gpt.py" "${megatron_args[@]}"

0 commit comments

Comments
 (0)