[skyrl-train] Fix loss reduction by moving normalization to the advantage computation #925
Open
justinvyu wants to merge 32 commits into NovaSky-AI:main from justinvyu:fix_loss_reduction2
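The PR title points at a real pitfall: a `token_mean` loss reduction applied per microbatch weights tokens in short microbatches more heavily than tokens in long ones, whereas folding the per-token normalization into the advantages and using a plain sum reduction weights every token equally. A minimal sketch of the difference (illustrative only, not the repository's implementation; the "losses" are hypothetical per-token values):

```python
def per_microbatch_token_mean(microbatches):
    # Mean within each microbatch, then mean across microbatches:
    # tokens in shorter microbatches receive more weight.
    means = [sum(mb) / len(mb) for mb in microbatches]
    return sum(means) / len(means)

def global_token_mean(microbatches):
    # Normalize once over all tokens, so every token is weighted equally.
    # Equivalent to pre-scaling each token's advantage by 1/total_tokens
    # and then using a plain sum reduction for the loss.
    total = sum(sum(mb) for mb in microbatches)
    n_tokens = sum(len(mb) for mb in microbatches)
    return total / n_tokens

mbs = [[1.0, 1.0, 1.0, 1.0], [3.0, 3.0]]  # uneven microbatch sizes
print(per_microbatch_token_mean(mbs))  # 2.0: the short microbatch dominates
print(global_token_mean(mbs))          # 10/6 ~= 1.667: each token equal
```

The two reductions only agree when every microbatch has the same token count, which is exactly what variable-length RL rollouts do not guarantee.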
Commits (32)
- b7b494c: normalize the advantages instead (justinvyu)
- da24f6b: sum reduction for the loss (justinvyu)
- cbed7ff: add a few options (justinvyu)
- fc7b775: always sum on the loss calculation side (justinvyu)
- c411586: fix some bugs (justinvyu)
- 1bf83d8: revert the critic worker changes (justinvyu)
- dc838e1: more revert (justinvyu)
- 84e34e2: Merge branch 'main' of https://github.com/erictang000/SkyRL into fix_… (erictang000)
- be22e03: Merge branch 'main' of https://github.com/erictang000/SkyRL into fix_… (erictang000)
- 6259174: fix conflict with main (erictang000)
- 4b8e556: x (erictang000)
- 4547ae1: fix normalization (erictang000)
- 635a9c1: change reduce_loss to just sums everywhere (erictang000)
- cd7506b: fix tests (erictang000)
- 0133819: Merge branch 'main' of https://github.com/erictang000/SkyRL into fix_… (erictang000)
- ddeaae4: scale by num_microbatches in megatron loss (erictang000)
- ef24d32: x (erictang000)
- 89145e5: add dp size scaling to megatron and move fsdp loss scaling to loss ra… (erictang000)
- 133d9a9: Merge branch 'main' of https://github.com/erictang000/SkyRL into fix_… (erictang000)
- 49babe4: debugging megatron fsdp loss diff (erictang000)
- a3fa670: Merge branch 'main' of https://github.com/erictang000/SkyRL into fix_… (erictang000)
- c0dd9ff: Merge branch 'fix_loss_reduction2' of https://github.com/justinvyu/Sk… (erictang000)
- eb241de: start trying to make metrics invariant to dp size for megatron (erictang000)
- 772141a: x (erictang000)
- f66736a: metrics now (mostly) matching for megatron vs fsdp (erictang000)
- 0df4467: Merge branch 'fix_loss_reduction2' of https://github.com/justinvyu/Sk… (erictang000)
- 05054d1: loss scale in fsdp worker correctly (erictang000)
- 155d4af: x (erictang000)
- c90cc56: Merge branch 'fix_loss_reduction2' of https://github.com/justinvyu/Sk… (erictang000)
- 1d31c66: cleaning (erictang000)
- 4e05912: fixing tests (erictang000)
- a563148: x (erictang000)
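Commit messages such as "scale by num_microbatches in megatron loss" and "add dp size scaling to megatron" suggest the goal is making the effective loss invariant to data-parallel sharding. A hedged sketch of the arithmetic, assuming the framework averages gradients across DP ranks (as DDP/FSDP typically do); the function names are illustrative, not SkyRL's API:

```python
def naive_loss(shards):
    # Sum-reduced loss per rank; the framework then averages gradients
    # across DP ranks, so the effective loss is a mean of per-rank sums.
    # The result changes with dp_size even for the same global batch.
    return sum(sum(s) for s in shards) / len(shards)

def dp_invariant_loss(shards):
    # Rescale each rank's sum by dp_size before the gradient average,
    # so the effective loss equals the global sum regardless of how
    # the batch is sharded across ranks.
    dp_size = len(shards)
    per_rank = [sum(s) * dp_size for s in shards]
    return sum(per_rank) / dp_size  # what the gradient average computes

tokens = [1.0, 2.0, 3.0, 4.0]
assert dp_invariant_loss([tokens]) == dp_invariant_loss([tokens[:2], tokens[2:]]) == 10.0
assert naive_loss([tokens]) != naive_loss([tokens[:2], tokens[2:]])  # 10.0 vs 5.0
```

The same reasoning applies to microbatch gradient accumulation, which is presumably why the Megatron path also scales by `num_microbatches`.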
File changed: skyrl-train/examples/megatron/run_megatron_dapo_qwen3_1.7b.sh (121 additions, 0 deletions)
```bash
set -x

# Colocated DAPO training+generation for Qwen3-1.7B-Base on DAPO training data with Megatron.
# bash examples/algorithms/dapo/prepare_dapo_data.sh
# bash examples/megatron/run_megatron_dapo_qwen3_1.7b.sh

MODEL_NAME="Qwen/Qwen3-1.7B-Base"
DATA_DIR="$HOME/data/dapo"
TRAIN_FILE="$DATA_DIR/dapo-math-17k-cleaned.parquet"
TEST_FILE="$DATA_DIR/aime-2024-cleaned.parquet"
NUM_NODES=1
NUM_GPUS_PER_NODE=8
NUM_INFERENCE_ENGINES=8
INFERENCE_ENGINE_TENSOR_PARALLEL_SIZE=1
LOGGER="wandb"  # change to "console" to print to stdout

CLIP_RATIO_LOW=0.2
CLIP_RATIO_HIGH=0.28
# use token mean loss reduction
LOSS_REDUCTION="token_mean"
# applies overlong filtering (but not soft overlong punishment)
APPLY_OVERLONG_FILTERING=true
# apply soft overlong punishment with custom trainer impl in main_dapo.py
OVERLONG_BUFFER_LEN=$((1024 * 4))
OVERLONG_BUFFER_PENALTY_FACTOR=1.0

# other DAPO parameters
USE_KL_LOSS=false
TEMPERATURE=1.0
TOP_P=1.0
EVAL_TOP_P=0.7
CLIP_RATIO_C=10.0
MAX_PROMPT_LENGTH=$((1024 * 2))
MAX_RESPONSE_LENGTH=$((1024 * 8))

# repro run parameters
TRAIN_BATCH_SIZE=512
MINI_BATCH_SIZE=32
N_SAMPLES_PER_PROMPT=16
EVAL_N_SAMPLES_PER_PROMPT=32
ENFORCE_EAGER=true  # cuda graphs can cause some instability
LR=1e-6

# megatron config
MEGATRON_TP=4
MEGATRON_PP=2
MEGATRON_CP=1
MEGATRON_EP=1
MEGATRON_ETP=null

# TIS parameters
TIS_IMP_RATIO_CAP=2.0
TIS_TYPE=token

uv run --isolated --extra mcore -m examples.algorithms.dapo.main_dapo \
  data.train_data="['$TRAIN_FILE']" \
  data.val_data="['$TEST_FILE']" \
  trainer.algorithm.advantage_estimator="grpo" \
  trainer.algorithm.policy_loss_type="dual_clip" \
  +trainer.algorithm.overlong_buffer.len=$OVERLONG_BUFFER_LEN \
  +trainer.algorithm.overlong_buffer.penalty_factor=$OVERLONG_BUFFER_PENALTY_FACTOR \
  trainer.algorithm.loss_reduction=$LOSS_REDUCTION \
  generator.enforce_eager=$ENFORCE_EAGER \
  generator.apply_overlong_filtering=$APPLY_OVERLONG_FILTERING \
  generator.sampling_params.temperature=$TEMPERATURE \
  generator.sampling_params.top_p=$TOP_P \
  generator.eval_sampling_params.top_p=$EVAL_TOP_P \
  generator.eval_sampling_params.temperature=$TEMPERATURE \
  trainer.algorithm.use_kl_loss=$USE_KL_LOSS \
  trainer.algorithm.clip_ratio_c=$CLIP_RATIO_C \
  trainer.policy.model.path="$MODEL_NAME" \
  trainer.placement.colocate_all=true \
  trainer.strategy=megatron \
  trainer.placement.policy_num_nodes=$NUM_NODES \
  trainer.placement.policy_num_gpus_per_node=$NUM_GPUS_PER_NODE \
  generator.num_inference_engines=$NUM_INFERENCE_ENGINES \
  generator.inference_engine_tensor_parallel_size=$INFERENCE_ENGINE_TENSOR_PARALLEL_SIZE \
  trainer.policy.megatron_config.tensor_model_parallel_size=$MEGATRON_TP \
  trainer.policy.megatron_config.pipeline_model_parallel_size=$MEGATRON_PP \
  trainer.policy.megatron_config.context_parallel_size=$MEGATRON_CP \
  trainer.policy.megatron_config.expert_model_parallel_size=$MEGATRON_EP \
  trainer.policy.megatron_config.expert_tensor_parallel_size=$MEGATRON_ETP \
  trainer.algorithm.off_policy_correction.tis_ratio_type=$TIS_TYPE \
  trainer.algorithm.off_policy_correction.token_tis_ratio_clip_high=$TIS_IMP_RATIO_CAP \
  trainer.epochs=20 \
  trainer.algorithm.eps_clip_low=$CLIP_RATIO_LOW \
  trainer.algorithm.eps_clip_high=$CLIP_RATIO_HIGH \
  trainer.eval_batch_size=1024 \
  trainer.eval_before_train=true \
  trainer.eval_interval=5 \
  trainer.update_epochs_per_batch=1 \
  trainer.train_batch_size=$TRAIN_BATCH_SIZE \
  trainer.policy_mini_batch_size=$MINI_BATCH_SIZE \
  trainer.micro_forward_batch_size_per_gpu=8 \
  trainer.micro_train_batch_size_per_gpu=8 \
  trainer.ckpt_interval=10 \
  trainer.max_prompt_length=$MAX_PROMPT_LENGTH \
  generator.sampling_params.max_generate_length=$MAX_RESPONSE_LENGTH \
  trainer.policy.optimizer_config.lr=$LR \
  trainer.policy.optimizer_config.num_warmup_steps=160 \
  trainer.policy.optimizer_config.weight_decay=0.1 \
  trainer.policy.optimizer_config.max_grad_norm=1.0 \
  generator.backend=vllm \
  generator.run_engines_locally=true \
  generator.weight_sync_backend=nccl \
  generator.async_engine=false \
  generator.batched=true \
  environment.env_class=aime \
  generator.n_samples_per_prompt=$N_SAMPLES_PER_PROMPT \
  generator.eval_n_samples_per_prompt=$EVAL_N_SAMPLES_PER_PROMPT \
  generator.gpu_memory_utilization=0.8 \
  trainer.logger="$LOGGER" \
  trainer.project_name="dapo_aime" \
  trainer.run_name="dapo_qwen3_1.7b_base_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}_loss_sum_dp1" \
  trainer.export_path="$HOME/exports/dapo_qwen3_1.7b_base_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}_loss_sum_dp1" \
  trainer.hf_save_interval=300 \
  trainer.resume_mode=latest \
  trainer.max_ckpts_to_keep=3 \
  trainer.ckpt_path="$HOME/ckpts/dapo_qwen3_1.7b_base_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}_loss_sum_dp1" \
  "$@"
```
🟡 Example script references undefined shell variables after commenting out their definitions
In the `run_megatron.sh` example script, the torch profiler config variables (`ENABLE_TORCH_PROFILER`, `RANKS_TO_PROFILE`, `SAVE_PATH`) are commented out on lines 20-23, but their references on lines 42-44 remain in the `uv run` command. In bash (without `set -u`), these expand to empty strings, so the training script receives empty/invalid values for the profiler config options.

Detailed explanation: lines 20-23 comment out the variable definitions, but lines 42-44 still reference them.

Impact: the script would pass empty values for these config keys, which could cause config parsing errors or unexpected behavior at runtime.
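The failure mode described here is ordinary bash behavior: without `set -u`, an unset variable silently expands to an empty string, so the override arrives as `key=` with no value. A minimal reproduction (the variable and config-key names are illustrative):

```shell
#!/usr/bin/env bash
# ENABLE_TORCH_PROFILER is never defined (imagine its definition was
# commented out), yet it is still referenced below.
ARG="trainer.profiler.enable=${ENABLE_TORCH_PROFILER}"
echo "$ARG"   # prints "trainer.profiler.enable=" with an empty value

# With `set -u` (nounset), the same reference would abort the script
# immediately with "ENABLE_TORCH_PROFILER: unbound variable" instead
# of silently passing an empty override downstream:
set -u
# echo "${ENABLE_TORCH_PROFILER}"   # would now fail fast
```

Adding `set -u` near the top of example scripts is a cheap way to turn this class of bug into an immediate, loud failure.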