Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support 405B training with single/multi node #3454

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@
"\n",
"# @markdown **Only select and fill one of the following sections.**\n",
"# fmt: off\n",
"LOAD_MODEL_FROM = \"Hugging Face\" # @param [\"Hugging Face\", \"Google Cloud\"] {isTemplate:true}\n",
"LOAD_MODEL_FROM = \"Google Cloud\" # @param [\"Hugging Face\", \"Google Cloud\"] {isTemplate:true}\n",
"# fmt: on\n",
"\n",
"# @markdown ---\n",
Expand Down Expand Up @@ -319,22 +319,22 @@
"\n",
"# @markdown **Note**:\n",
"# @markdown 1. We recommend setting `finetuning_precision_mode` to `4bit` because it enables using fewer hardware resources for finetuning.\n",
"# @markdown 1. We recommend using NVIDIA_L4 for 8B models and NVIDIA_A100_80GB for 70B models.\n",
"# @markdown 1. We recommend using NVIDIA_A100_80GB for 8B and 70B models, and NVIDIA_H100_80GB for 405B model.\n",
"# @markdown 1. If `max_steps>0`, it will precedence over `epochs`. One can set a small `max_steps` value to quickly check the pipeline.\n",
"# @markdown 1. With the default setting, training takes between 1.5 ~ 2 hours.\n",
"\n",
"# The Llama 3.1 base model.\n",
"MODEL_ID = \"meta-llama/Meta-Llama-3.1-8B-Instruct\" # @param [\"meta-llama/Meta-Llama-3.1-8B\", \"meta-llama/Meta-Llama-3.1-8B-Instruct\", \"meta-llama/Meta-Llama-3.1-70B\", \"meta-llama/Meta-Llama-3.1-70B-Instruct\"] {isTemplate:true}\n",
"MODEL_ID = \"meta-llama/Meta-Llama-3.1-8B-Instruct\" # @param [\"meta-llama/Meta-Llama-3.1-8B\", \"meta-llama/Meta-Llama-3.1-8B-Instruct\", \"meta-llama/Meta-Llama-3.1-70B\", \"meta-llama/Meta-Llama-3.1-70B-Instruct\", \"meta-llama/Meta-Llama-3.1-405B\", \"meta-llama/Meta-Llama-3.1-405B-Instruct\"] {isTemplate:true}\n",
"if LOAD_MODEL_FROM == \"Google Cloud\":\n",
" base_model_id = os.path.join(MODEL_BUCKET, MODEL_ID.split(\"/\")[-1])\n",
"else:\n",
" base_model_id = MODEL_ID\n",
"\n",
"# The pre-built training docker image.\n",
"TRAIN_DOCKER_URI = \"us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-peft-train:20240724_0936_RC00\"\n",
"TRAIN_DOCKER_URI = \"us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-peft-train:20240822_0936_RC00\"\n",
"\n",
"# The accelerator to use.\n",
"accelerator_type = \"NVIDIA_A100_80GB\" # @param [\"NVIDIA_A100_80GB\"]\n",
"accelerator_type = \"NVIDIA_A100_80GB\" # @param [\"NVIDIA_A100_80GB\", \"NVIDIA_H100_80GB\"]\n",
"\n",
"# Batch size for finetuning.\n",
"per_device_train_batch_size = 1 # @param{type:\"integer\"}\n",
Expand All @@ -355,28 +355,61 @@
"lora_dropout = 0.05 # @param{type:\"number\"}\n",
"enable_gradient_checkpointing = True\n",
"attn_implementation = \"flash_attention_2\"\n",
"optimizer = \"paged_adamw_32bit\"\n",
"optimizer = \"adamw_torch\"\n",
"warmup_ratio = \"0.01\"\n",
"report_to = \"tensorboard\"\n",
"save_steps = 10\n",
"logging_steps = save_steps\n",
"\n",
"# Worker pool spec.\n",
"if accelerator_type == \"NVIDIA_A100_80GB\":\n",
" accelerator_count = 4\n",
" machine_type = \"a2-ultragpu-4g\"\n",
"machine_type = None\n",
"if \"405b\" in MODEL_ID.lower():\n",
" if accelerator_type == \"NVIDIA_H100_80GB\":\n",
" accelerator_count = 8\n",
" machine_type = \"a3-highgpu-8g\"\n",
" boot_disk_size_gb = 2000\n",
" merge_model_precision_mode = \"float8\"\n",
" else:\n",
" raise ValueError(\n",
" f\"Recommended machine settings not found for: {accelerator_type}. To use another accelerator, edit this code block to pass in an appropriate `machine_type`, `accelerator_type`, and `accelerator_count` to the deploy_model_vllm function by clicking `Show Code` and then modifying the code.\"\n",
" )\n",
"else:\n",
" raise ValueError(\n",
" f\"Recommended machine settings not found for: {accelerator_type}. To use another accelerator, edit this code block to pass in an appropriate `machine_type`, `accelerator_type`, and `accelerator_count` to the deploy_model_vllm function by clicking `Show Code` and then modifying the code.\"\n",
" )\n",
" if accelerator_type == \"NVIDIA_A100_80GB\":\n",
" accelerator_count = 4\n",
" machine_type = \"a2-ultragpu-4g\"\n",
" boot_disk_size_gb = 500\n",
" merge_model_precision_mode = \"float16\"\n",
" else:\n",
" raise ValueError(\n",
" f\"Recommended machine settings not found for: {accelerator_type}. To use another accelerator, edit this code block to pass in an appropriate `machine_type`, `accelerator_type`, and `accelerator_count` to the deploy_model_vllm function by clicking `Show Code` and then modifying the code.\"\n",
" )\n",
"\n",
"# The number of nodes to use for this worker pool in distributed training.\n",
"replica_count = 1 # @param{type:\"integer\"}\n",
"\n",
"# Set config file.\n",
"if \"405b\" in MODEL_ID.lower():\n",
" if replica_count > 4:\n",
" raise ValueError(\n",
" f\"Recommended config settings not found for replica_count: {replica_count}.\"\n",
" )\n",
" elif replica_count == 1:\n",
" config_file = \"vertex_vision_model_garden_peft/llama_fsdp_8gpu.yaml\"\n",
" else:\n",
" config_file = (\n",
" \"vertex_vision_model_garden_peft/\"\n",
" f\"llama_hsdp_{replica_count * accelerator_count}gpu.yaml\"\n",
" )\n",
"else:\n",
" raise ValueError(f\"Unsupported model ID or GCS path: {MODEL_ID}.\")\n",
"\n",
"replica_count = 1\n",
"\n",
"common_util.check_quota(\n",
" project_id=PROJECT_ID,\n",
" region=REGION,\n",
" accelerator_type=accelerator_type,\n",
" accelerator_count=accelerator_count,\n",
" accelerator_count=accelerator_count * replica_count,\n",
" is_for_training=True,\n",
")\n",
"\n",
Expand All @@ -400,7 +433,7 @@
"]\n",
"\n",
"train_job_args = [\n",
" \"--config_file=vertex_vision_model_garden_peft/deepspeed_zero2_4gpu.yaml\",\n",
" f\"--config_file={config_file}\",\n",
" \"--task=instruct-lora\",\n",
" \"--completion_only=True\",\n",
" f\"--pretrained_model_id={base_model_id}\",\n",
Expand All @@ -409,6 +442,7 @@
" f\"--instruct_column_in_dataset={instruct_column_in_dataset}\",\n",
" f\"--output_dir={lora_output_dir}\",\n",
" f\"--merge_base_and_lora_output_dir={merged_model_output_dir}\",\n",
" f\"--merge_model_precision_mode={merge_model_precision_mode}\",\n",
" f\"--per_device_train_batch_size={per_device_train_batch_size}\",\n",
" f\"--gradient_accumulation_steps={gradient_accumulation_steps}\",\n",
" f\"--lora_rank={lora_rank}\",\n",
Expand Down Expand Up @@ -451,7 +485,7 @@
" machine_type=machine_type,\n",
" accelerator_type=accelerator_type,\n",
" accelerator_count=accelerator_count,\n",
" boot_disk_size_gb=500,\n",
" boot_disk_size_gb=boot_disk_size_gb,\n",
" service_account=SERVICE_ACCOUNT,\n",
" tensorboard=tensorboard.resource_name,\n",
" base_output_dir=base_output_dir,\n",
Expand Down Expand Up @@ -485,10 +519,16 @@
" machine_type = \"g2-standard-12\"\n",
" accelerator_type = \"NVIDIA_L4\"\n",
" accelerator_count = 1\n",
"else:\n",
"elif \"70b\" in MODEL_ID.lower():\n",
" machine_type = \"g2-standard-96\"\n",
" accelerator_type = \"NVIDIA_L4\"\n",
" accelerator_count = 8\n",
"elif \"405b\" in MODEL_ID.lower():\n",
" machine_type = \"a3-highgpu-8g\"\n",
" accelerator_type = \"NVIDIA_H100_80GB\"\n",
" accelerator_count = 8\n",
"else:\n",
" raise ValueError(f\"Unsupported model ID or GCS path: {MODEL_ID}.\")\n",
"\n",
"common_util.check_quota(\n",
" project_id=PROJECT_ID,\n",
Expand Down