
Commit d18452e

Changes for 32B (#1164)
* new 32b script
* new 32b script
* beaker eval freq not upstreaming
* new var
* longer timeout on capturing cuda
* longer timeout on capturing cuda
* update params
* reduce more
* no optim
* working script
* zpg inc
* newer changes
* higher zpg
* changes
* fix
* zpg as arg
* debug
* update
* update
* del tmp script
1 parent f799155 commit d18452e


3 files changed: +84 −50 lines changed


open_instruct/grpo_fast.py

Lines changed: 17 additions & 3 deletions
@@ -351,6 +351,12 @@ class Args:
     """vLLM top p for nucleus sampling"""
     deepspeed_stage: int = 0
     """the deepspeed stage"""
+    deepspeed_zpg: int = 8
+    """the deepspeed zpg value. Higher values are more memory efficient but slower. Set to 1 to disable zpg, which uses less memory but is significantly slower. Ideally is set to the number of GPUs per node (usually 8, default)."""
+    deepspeed_offload_param: bool = False
+    """whether to offload parameters to CPU (reduces GPU memory usage)"""
+    deepspeed_offload_optimizer: bool = False
+    """whether to offload optimizer states to CPU (reduces GPU memory usage)"""
     gather_whole_model: bool = True
     """whether to gather the whole model to boardcast (not doable for 70B but can be faster for 8B)"""
     enable_queue_dashboard: bool = True
@@ -766,7 +772,13 @@ def load(self, path: str, map_location=None):
 
         deepspeed.init_distributed(timeout=timedelta(minutes=args.backend_timeout))
 
-        ds_config = get_train_ds_config(offload=False, adam_offload=False, stage=args.deepspeed_stage, bf16=True)
+        ds_config = get_train_ds_config(
+            offload=args.deepspeed_offload_param,
+            adam_offload=args.deepspeed_offload_optimizer,
+            stage=args.deepspeed_stage,
+            bf16=True,
+            zpg=args.deepspeed_zpg,
+        )
         ds_config["train_micro_batch_size_per_gpu"] = args.per_device_train_batch_size
         ds_config["gradient_accumulation_steps"] = 1
         # @vwxyzjn: MAGIC: it's actually needed to initialize this `dschf`, so
@@ -861,7 +873,7 @@ def load(self, path: str, map_location=None):
 
         # reference model
         ds_config = get_eval_ds_config(
-            offload=False,
+            offload=args.deepspeed_offload_param,
             # inference model only has stage 3 (sharding) or stage 0 (no sharding)
             # stage 2 is optimizer sharding which doesn't apply to inference
             stage=args.deepspeed_stage if args.deepspeed_stage == 3 else 0,
@@ -965,7 +977,7 @@ def setup_model_update_group(self, vllm_engines):
                 group_name="openrlhf",
                 timeout=timedelta(minutes=self.args.backend_timeout),
             )
-        ray_get_with_progress(refs, desc="Initializing vLLM process groups", timeout=60)
+        ray_get_with_progress(refs, desc="Initializing vLLM process groups", timeout=600)
         torch.distributed.barrier()
 
     def broadcast_to_vllm(self):
@@ -1276,6 +1288,8 @@ def train(
                     args.masked_mean_denominator,
                 )
                 loss = loss / accumulation_steps
+                # Clear CUDA cache before backward pass to free memory for reduce_scatter operations
+                torch.cuda.empty_cache()
                 self.model.backward(loss)
                 if (local_step + 1) % accumulation_steps == 0:
                     self.model.step()
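
For reference, here is a rough sketch of the kind of DeepSpeed ZeRO config these new arguments feed into. The actual key assembly happens in the repo's get_train_ds_config helper, which is not part of this diff; the function below is an illustration only, using the public DeepSpeed config schema rather than the repo's real output.

# Illustrative sketch only: not the repo's get_train_ds_config implementation.
# Key names follow the public DeepSpeed config schema (zero_optimization,
# offload_param, offload_optimizer, zero_hpz_partition_size, bf16).
def sketch_train_ds_config(offload_param: bool, offload_optimizer: bool, stage: int, zpg: int) -> dict:
    config = {
        "zero_optimization": {
            "stage": stage,
            # ZeRO++ hierarchical partitioning ("zpg"): the secondary parameter
            # copy is sharded across this many ranks, typically one node's GPUs.
            "zero_hpz_partition_size": zpg,
        },
        "bf16": {"enabled": True},
    }
    if offload_param:
        config["zero_optimization"]["offload_param"] = {"device": "cpu", "pin_memory": True}
    if offload_optimizer:
        config["zero_optimization"]["offload_optimizer"] = {"device": "cpu", "pin_memory": True}
    return config

Under this reading, --deepspeed_zpg 8 keeps the secondary partition within a single 8-GPU node, while --deepspeed_offload_param and --deepspeed_offload_optimizer trade GPU memory for CPU transfer overhead.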

open_instruct/vllm_utils.py

Lines changed: 3 additions & 1 deletion
@@ -881,6 +881,8 @@ def create_vllm_engines(
             )
         )
 
-    ray_get_with_progress([engine.ready.remote() for engine in vllm_engines], "Initializing vLLM engines", timeout=300)
+    ray_get_with_progress(
+        [engine.ready.remote() for engine in vllm_engines], "Initializing vLLM engines", timeout=1200
+    )
 
     return vllm_engines
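
ray_get_with_progress is defined elsewhere in open_instruct and is not shown in this diff; the longer timeouts (60 -> 600 seconds for the vLLM process groups, 300 -> 1200 seconds for engine startup) fit a 32B model, where engine and NCCL group initialization is much slower. A minimal sketch of the behavior the timeout argument implies, assuming a tqdm-style progress bar over Ray object refs (the repo's actual helper may differ):

import time
import ray
from tqdm import tqdm

# Hypothetical sketch: wait for all refs with a progress bar and an overall timeout.
def ray_get_with_progress(refs, desc, timeout):
    results = {}
    pending = list(refs)
    deadline = time.monotonic() + timeout
    with tqdm(total=len(refs), desc=desc) as pbar:
        while pending:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                raise TimeoutError(f"{desc} did not finish within {timeout}s")
            # Wait for the next ref to finish (or until the deadline passes).
            done, pending = ray.wait(pending, num_returns=1, timeout=remaining)
            for ref in done:
                results[ref] = ray.get(ref)
                pbar.update(1)
    return [results[ref] for ref in refs]
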
Lines changed: 64 additions & 46 deletions
@@ -1,68 +1,86 @@
 #!/bin/bash
-# Note: This was originally a script that Saurabh came up to run some experiments.
-# Finbarr has been using it a lot for testing, so we thought we'd check it in.
-num_prompts=25376
-exp_name=rlvr_ace_fn_and_og_ocr_stdio_from_base_with_perf_penalty
-BEAKER_IMAGE="${1:-${BEAKER_USER}/open-instruct-integration-test}"
-uv run python mason.py \
-    --cluster ai2/augusta \
-    --image "$BEAKER_IMAGE" \
-    --pure_docker_mode \
-    --workspace ai2/open-instruct-dev \
-    --gs_model_name "stego32" \
-    --priority urgent \
-    --preemptible \
-    --num_nodes 4 \
-    --description "Large (multi-node) test script." \
-    --timeout 3600 \
-    --max_retries 0 \
-    --env VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
-    --budget ai2/oe-adapt \
-    --gpus 8 -- source configs/beaker_configs/ray_node_setup.sh \&\& source configs/beaker_configs/code_api_setup.sh \&\&python open_instruct/grpo_fast.py \
+
+
+export exp_name=test_olmo3_32b_rl_run_${RANDOM}
+export data_mix="hamishivi/math_rlvr_mixture_dpo 1.0 hamishivi/code_rlvr_mixture_dpo 1.0 hamishivi/IF_multi_constraints_upto5_filtered_dpo_0625_filter 30186 allenai/rlvr_general_mix-keyword-filtered 21387"
+export beaker_image=hamishivi/open_instruct_rl32_test10
+export model_path=/weka/oe-adapt-default/hamishi/model_checkpoints/olmo3-merge-32b-1e-4-5e-5/olmo3-merge-32b-1e-4-5e-5/
+
+
+python mason.py \
+    --budget ai2/oe-adapt \
+    --cluster ai2/augusta \
+    --image ${beaker_image} \
+    --pure_docker_mode \
+    --workspace ai2/olmo-instruct \
+    --priority urgent \
+    --gs_model_name "sft_olmo3_32b_rl_run_testing" \
+    --preemptible \
+    --num_nodes 18 \
+    --gpus 8 \
+    --max_retries 0 \
+    --env VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
+    --env LD_LIBRARY_PATH=/var/lib/tcpxo/lib64 \
+    --env NCCL_LIB_DIR=/var/lib/tcpxo/lib64 \
+    --env HOSTED_VLLM_API_BASE=http://ceres-cs-aus-447.reviz.ai2.in:8001/v1 \
+    -- source configs/beaker_configs/ray_node_setup.sh \&\& source configs/beaker_configs/code_api_setup.sh \&\& python open_instruct/grpo_fast.py \
     --exp_name ${exp_name} \
     --beta 0.0 \
-    --num_samples_per_prompt_rollout 16 \
+    --num_samples_per_prompt_rollout 8 \
     --num_unique_prompts_rollout 64 \
     --num_mini_batches 1 \
     --num_epochs 1 \
-    --learning_rate 5e-7 \
+    --learning_rate 1e-6 \
     --per_device_train_batch_size 1 \
+    --output_dir /output \
     --kl_estimator kl3 \
-    --dataset_mixer_list saurabh5/rlvr_acecoder_filtered ${num_prompts} saurabh5/open-code-reasoning-rlvr-stdio ${num_prompts} \
+    --dataset_mixer_list ${data_mix} \
     --dataset_mixer_list_splits train \
-    --dataset_mixer_eval_list saurabh5/rlvr_acecoder_filtered 8 saurabh5/open-code-reasoning-rlvr-stdio 8 \
+    --dataset_mixer_eval_list hamishivi/omega-combined 8 allenai/IF_multi_constraints_upto5 8 saurabh5/rlvr_acecoder_filtered 8 hamishivi/tulu_3_rewritten_400k_string_f1_only_v2_nocode_all_filtered_qwen2_5_openthoughts2 4 hamishivi/virtuoussy_multi_subject_rlvr 4 \
     --dataset_mixer_eval_list_splits train \
     --max_prompt_token_length 2048 \
-    --response_length 4096 \
-    --pack_length 20480 \
-    --model_name_or_path "/weka/oe-adapt-default/finbarrt/stego32/step358000-hf" \
-    --tokenizer_name_or_path "allenai/OLMo-2-1124-7B" \
-    --chat_template_name tulu_thinker \
-    --inflight_updates True \
-    --stop_strings "</answer>" \
+    --response_length 32768 \
+    --pack_length 35840 \
+    --model_name_or_path ${model_path} \
+    --chat_template_name olmo_thinker \
     --non_stop_penalty False \
+    --mask_truncated_completions False \
     --temperature 1.0 \
-    --verbose False \
     --ground_truths_key ground_truth \
     --sft_messages_key messages \
-    --total_episodes 10240 \
-    --gather_whole_model False \
+    --total_episodes 10000000 \
     --deepspeed_stage 3 \
-    --num_learners_per_node 8 8 8 \
-    --vllm_num_engines 2 \
-    --vllm_tensor_parallel_size 4 \
+    --num_learners_per_node 8 8 8 8 8 8 8 8 8 8 8 8 \
+    --vllm_num_engines 6 \
+    --gather_whole_model False \
+    --vllm_tensor_parallel_size 8 \
     --lr_scheduler_type constant \
     --apply_verifiable_reward true \
-    --code_api_url \$CODE_API_URL/test_program \
     --seed 1 \
-    --local_eval_every 1 \
-    --add_bos \
-    --gradient_checkpointing \
+    --local_eval_every 50 \
+    --save_freq 25 \
+    --eval_priority urgent \
     --try_launch_beaker_eval_jobs_on_weka True \
+    --gradient_checkpointing \
     --with_tracking \
-    --update_progress_every 1 \
-    --vllm_enable_prefix_caching \
+    --llm_judge_model hosted_vllm/Qwen/Qwen3-32B \
+    --llm_judge_timeout 600 \
+    --llm_judge_max_tokens 2048 \
+    --llm_judge_max_context_length 32768 \
+    --clip_higher 0.272 \
+    --allow_world_padding False \
+    --use_fp8_kv_cache False \
+    --code_api_url https://p9f1719l7f.execute-api.us-west-2.amazonaws.com/prod/test_program \
+    --code_pass_rate_reward_threshold 0.99 \
     --oe_eval_max_length 32768 \
-    --oe_eval_tasks "codex_humanevalplus:0-shot-chat-v1::tulu-thinker,mbppplus:0-shot-chat::tulu-thinker,livecodebench_codegeneration::tulu-thinker" \
-    --dataset_skip_cache True \
-    --push_to_hub False
+    --checkpoint_state_freq 100 \
+    --backend_timeout 1200 \
+    --inflight_updates true \
+    --async_steps 8 \
+    --active_sampling \
+    --advantage_normalization_type centered \
+    --truncated_importance_sampling_ratio_cap 2.0 \
+    --oe_eval_beaker_image oe-eval-beaker/oe_eval_olmo2_retrofit_auto \
+    --oe_eval_tasks mmlu:cot::hamish_zs_reasoning_deepseek,bbh:cot::hamish_zs_reasoning_deepseek_v2,gpqa:0shot_cot::qwen3-instruct,zebralogic::hamish_zs_reasoning_deepseek,agi_eval_english:0shot_cot::hamish_zs_reasoning_deepseek,omega_500:0-shot-chat_deepseek,aime:zs_cot_r1::pass_at_32_2024_deepseek,aime:zs_cot_r1::pass_at_32_2025_deepseek,codex_humanevalplus:0-shot-chat::tulu-thinker_deepseek,mbppplus:0-shot-chat::tulu-thinker_deepseek,livecodebench_codegeneration::tulu-thinker_deepseek,alpaca_eval_v3::hamish_zs_reasoning_deepseek,ifeval::hamish_zs_reasoning_deepseek \
+    --vllm_enforce_eager \
+    --deepspeed_zpg 32
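
Assuming the usual reading of these flags (learner actors and vLLM engines share the single 18-node, 8-GPU-per-node allocation requested through mason.py), the GPU budget of the new script adds up exactly; a quick sanity check in Python:

# Quick arithmetic check of the GPU layout implied by the flags above (illustrative).
num_nodes, gpus_per_node = 18, 8      # --num_nodes 18, --gpus 8
learner_gpus = sum([8] * 12)          # --num_learners_per_node 8 (x12) -> 96 learner GPUs
vllm_gpus = 6 * 8                     # --vllm_num_engines 6 * --vllm_tensor_parallel_size 8 -> 48 GPUs
assert learner_gpus + vllm_gpus == num_nodes * gpus_per_node   # 96 + 48 == 144

Note also that --deepspeed_zpg 32 is larger than the per-node default of 8 described in the new Args docstring, which that docstring characterizes as trading speed for lower memory use on this 32B run.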
