Description
Describe the bug
在进行多机 LoRA 微调时出错:
failed (exitcode: -11) local_rank: 5 (pid: 11514) of binary: /home/jovyan/data-ws-enr/zconda/envs/swift_ft/bin/python
Traceback (most recent call last):
File "/home/jovyan/data-ws-enr/zconda/envs/swift_ft/bin/torchrun", line 8, in <module>
sys.exit(main())
File "/home/jovyan/data-ws-enr/zconda/envs/swift_ft/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 348, in wrapper
return f(*args, **kwargs)
File "/home/jovyan/data-ws-enr/zconda/envs/swift_ft/lib/python3.10/site-packages/torch/distributed/run.py", line 901, in main
run(args)
File "/home/jovyan/data-ws-enr/zconda/envs/swift_ft/lib/python3.10/site-packages/torch/distributed/run.py", line 892, in run
elastic_launch(
File "/home/jovyan/data-ws-enr/zconda/envs/swift_ft/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 133, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/jovyan/data-ws-enr/zconda/envs/swift_ft/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
这是我的脚本:
#!/bin/bash
# Launcher for multi-node LoRA fine-tuning with `swift sft`.
#
# Default parameter settings — every value below can be overridden by the
# matching command-line flag parsed further down in this script.
# NOTE: the original line here ("默认参数设置") had no leading '#', so the
# shell tried to execute it as a command and failed.
VISIBLE_DEVICES="0,1,2,3,4,5,6,7"   # GPUs exposed on this node
NNODES=4                            # total number of machines in the job
NODE_RANK=0                         # this machine's rank (0 = master node)
MASTER_ADDR="10.178.141.248"        # rendezvous address of the rank-0 node
MASTER_PORT=29500                   # rendezvous port
NPROC_PER_NODE=8                    # processes (GPUs) launched per node
SFT_TYPE="lora"                     # default for --sft_type (was parsed below but never defaulted)
MODEL_TYPE="qwen2-7b"
MODEL_PATH="/home/jovyan/kys-workspace-zzzc/models/Qwen2-7B"
DATASET="/home/jovyan/dataws1/fine-wenshu/data/patent_gpt4o/train_data_1000.json"
MAX_LENGTH=32768
NUM_TRAIN_EPOCHS=1
BATCH_SIZE=1
LEARNING_RATE=1e-4
EVAL_STEPS=100
LOGGING_STEPS=10
SEQUENCE_PARALLEL_SIZE=4            # must divide the world size (NNODES * NPROC_PER_NODE)
DEEPSPEED="default-zero3"
DDP_BACKEND="nccl"
OUTPUT_DIR="/home/jovyan/dataws1/fine-wenshu/model/qwen2_7b_patent_model"
GRADIENT_CHECKPOINTING=true
USE_FLASH_ATTN=true
LAZY_TOKENIZE=true
CHECK_MODEL_IS_LATEST=false
SAVE_ON_EACH_NODE=false
DISABLE_TQDM=true
# Parse command-line arguments; each recognized flag overrides a default above.
# (The original header line "解析命令行参数" had no leading '#' and broke the script.)
while [[ $# -gt 0 ]]; do
  key="$1"
  case "$key" in
    --visible_devices) VISIBLE_DEVICES="$2"; shift; shift ;;
    --nnodes) NNODES="$2"; shift; shift ;;
    --node_rank) NODE_RANK="$2"; shift; shift ;;
    --master_addr) MASTER_ADDR="$2"; shift; shift ;;
    --master_port) MASTER_PORT="$2"; shift; shift ;;
    --nproc_per_node) NPROC_PER_NODE="$2"; shift; shift ;;
    --sft_type) SFT_TYPE="$2"; shift; shift ;;
    --model_type) MODEL_TYPE="$2"; shift; shift ;;
    --model_path) MODEL_PATH="$2"; shift; shift ;;
    --dataset) DATASET="$2"; shift; shift ;;
    --max_length) MAX_LENGTH="$2"; shift; shift ;;
    --num_train_epochs) NUM_TRAIN_EPOCHS="$2"; shift; shift ;;
    --batch_size) BATCH_SIZE="$2"; shift; shift ;;
    --learning_rate) LEARNING_RATE="$2"; shift; shift ;;
    --eval_steps) EVAL_STEPS="$2"; shift; shift ;;
    --logging_steps) LOGGING_STEPS="$2"; shift; shift ;;
    --sequence_parallel_size) SEQUENCE_PARALLEL_SIZE="$2"; shift; shift ;;
    --deepspeed) DEEPSPEED="$2"; shift; shift ;;
    --ddp_backend) DDP_BACKEND="$2"; shift; shift ;;
    --output_dir) OUTPUT_DIR="$2"; shift; shift ;;
    --gradient_checkpointing) GRADIENT_CHECKPOINTING="$2"; shift; shift ;;
    --use_flash_attn) USE_FLASH_ATTN="$2"; shift; shift ;;
    --lazy_tokenize) LAZY_TOKENIZE="$2"; shift; shift ;;
    --check_model_is_latest) CHECK_MODEL_IS_LATEST="$2"; shift; shift ;;
    --save_on_each_node) SAVE_ON_EACH_NODE="$2"; shift; shift ;;
    --disable_tqdm) DISABLE_TQDM="$2"; shift; shift ;;
    *) echo "未知参数 $1" >&2; exit 1 ;;   # unknown flag: report to stderr and abort
  esac
done
# Cleanup handler, installed via trap further down. Frees the rendezvous port
# and removes the output directory — but ONLY on a failing exit. The original
# version ran `rm -rf $OUTPUT_DIR` unconditionally from the EXIT trap, which
# deleted the trained checkpoints even after a fully successful run, and
# `kill -9 $(lsof …)` errored out when no process held the port.
cleanup() {
  local status=$?   # exit status at the moment the trap fired
  echo "捕获到异常退出,执行清理操作..."
  # Kill leftover processes bound to the master port; try TERM before KILL.
  local pids
  pids=$(lsof -t -i :"${MASTER_PORT:-29500}" 2>/dev/null || true)
  if [ -n "$pids" ]; then
    # shellcheck disable=SC2086 — word-splitting of the pid list is intended
    kill $pids 2>/dev/null || kill -9 $pids 2>/dev/null || true
  fi
  # Remove (possibly partial) output only on failure; ":?" aborts if
  # OUTPUT_DIR is somehow empty, preventing an accidental `rm -rf /`.
  if [ "$status" -ne 0 ] && [ -n "${OUTPUT_DIR:-}" ]; then
    rm -rf -- "${OUTPUT_DIR:?}"
  fi
}
# Install the cleanup trap for error and exit signals.
trap cleanup ERR EXIT

# Launch the training run.
#
# Two bugs fixed versus the original:
#   1. The distributed settings (NNODES, NODE_RANK, MASTER_ADDR, …) were plain
#      assignments on their own lines — never exported — so the `swift` child
#      process never saw them and every node silently ran as a standalone
#      single-node job. They must prefix the command (or be exported).
#   2. Each `--flag value` line lacked a trailing backslash, so the shell
#      executed every option line as a separate (invalid) command.
#   3. `--gradient_checkpointing` was passed twice (hardcoded `true` and
#      `$GRADIENT_CHECKPOINTING`); the duplicate is removed.
#
# NOTE(review): several hardcoded values below intentionally preserve the
# original behavior but disagree with the configurable defaults parsed above —
# confirm which is intended: model_type qwen2-7b-instruct (vs $MODEL_TYPE),
# num_train_epochs 2 (vs $NUM_TRAIN_EPOCHS=1), use_flash_attn false
# (vs $USE_FLASH_ATTN=true). $MODEL_PATH, $SFT_TYPE and $LAZY_TOKENIZE are
# parsed but likewise unused here.
CUDA_VISIBLE_DEVICES=$VISIBLE_DEVICES \
NNODES=$NNODES \
NODE_RANK=$NODE_RANK \
MASTER_ADDR=$MASTER_ADDR \
MASTER_PORT=$MASTER_PORT \
NPROC_PER_NODE=$NPROC_PER_NODE \
swift sft \
    --model_type qwen2-7b-instruct \
    --model_id_or_path /home/jovyan/kys-workspace-zzzc/models/Qwen2-7B-Instruct \
    --model_revision master \
    --sft_type lora \
    --tuner_backend peft \
    --template_type AUTO \
    --dtype AUTO \
    --output_dir "$OUTPUT_DIR" \
    --dataset "$DATASET" \
    --val_dataset /home/jovyan/dataws1/fine-wenshu/data/patent_gpt4o/dev_data_1000.json \
    --use_loss_scale true \
    --num_train_epochs 2 \
    --max_length "$MAX_LENGTH" \
    --truncation_strategy delete \
    --check_dataset_strategy warning \
    --lora_rank 5 \
    --lora_alpha 32 \
    --lora_dropout_p 0.05 \
    --lora_target_modules ALL \
    --batch_size "$BATCH_SIZE" \
    --eval_batch_size 1 \
    --weight_decay 0.1 \
    --learning_rate "$LEARNING_RATE" \
    --gradient_accumulation_steps 16 \
    --max_grad_norm 0.5 \
    --warmup_ratio 0.03 \
    --eval_steps "$EVAL_STEPS" \
    --save_steps 100 \
    --save_total_limit 2 \
    --logging_steps "$LOGGING_STEPS" \
    --use_flash_attn false \
    --self_cognition_sample 0 \
    --deepspeed "$DEEPSPEED" \
    --sequence_parallel_size "$SEQUENCE_PARALLEL_SIZE" \
    --ddp_backend "$DDP_BACKEND" \
    --gradient_checkpointing "$GRADIENT_CHECKPOINTING" \
    --check_model_is_latest "$CHECK_MODEL_IS_LATEST" \
    --save_on_each_node "$SAVE_ON_EACH_NODE" \
    --disable_tqdm "$DISABLE_TQDM"
可以帮忙看看解决一下吗