
Commit 279e9ba

Integrate verl GRPO trainer into train script (#1652)

1 parent b6d6a68 commit 279e9ba
File tree

5 files changed: +186 -82 lines changed

pyproject.toml

Lines changed: 0 additions & 1 deletion

@@ -117,7 +117,6 @@ gpu = [
     "nvidia-ml-py>=12.560.30,<12.561",
     "bitsandbytes>=0.45.0,<0.46", # Used for QLora, and PagedAdam implementation
     "verl>=0.3.0,<0.4", # Used for the VERL_GRPO trainer.
-    "ray[default]", # Used for the VERL_GRPO trainer.
     "vllm>=0.7.3,<0.8.0", # For VLLMInferenceEngine
 ]

src/oumi/builders/training.py

Lines changed: 9 additions & 1 deletion

@@ -22,7 +22,7 @@
 from oumi.core.configs import TrainerType, TrainingParams
 from oumi.core.distributed import is_world_process_zero
 from oumi.core.processors.base_processor import BaseProcessor
-from oumi.core.trainers import BaseTrainer, HuggingFaceTrainer
+from oumi.core.trainers import BaseTrainer, HuggingFaceTrainer, VerlGrpoTrainer
 from oumi.core.trainers import Trainer as OumiTrainer
 from oumi.utils.logging import logger
 
@@ -94,6 +94,12 @@ def _init_oumi_trainer(*args, **kwargs) -> BaseTrainer:
 
         return _init_oumi_trainer
 
+    def _create_verl_grpo_builder_fn() -> Callable[..., BaseTrainer]:
+        def _init_verl_grpo_trainer(*args, **kwargs) -> BaseTrainer:
+            return VerlGrpoTrainer(*args, **kwargs)
+
+        return _init_verl_grpo_trainer
+
     if trainer_type == TrainerType.TRL_SFT:
         return _create_hf_builder_fn(trl.SFTTrainer)
     elif trainer_type == TrainerType.TRL_DPO:
@@ -108,5 +114,7 @@ def _init_oumi_trainer(*args, **kwargs) -> BaseTrainer:
             "Prefer to use HF trainer when possible."
         )
         return _create_oumi_builder_fn()
+    elif trainer_type == TrainerType.VERL_GRPO:
+        return _create_verl_grpo_builder_fn()
 
     raise NotImplementedError(f"Trainer type {trainer_type} not supported.")
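The VERL_GRPO branch follows the same builder-function pattern as the existing trainers: the dispatcher returns a callable, and that callable simply forwards its arguments to the trainer class. Below is a minimal, self-contained sketch of the pattern; the names FakeTrainer and create_builder_fn are illustrative stand-ins, not part of the oumi API.

from typing import Callable


class FakeTrainer:
    """Stand-in for a BaseTrainer subclass such as VerlGrpoTrainer."""

    def __init__(self, **kwargs):
        self.kwargs = kwargs

    def train(self) -> None:
        print(f"Training with {self.kwargs}")


def create_builder_fn(trainer_cls) -> Callable[..., FakeTrainer]:
    # Mirrors _create_verl_grpo_builder_fn: capture the trainer class now,
    # defer construction until the caller supplies the constructor arguments.
    def _init_trainer(*args, **kwargs) -> FakeTrainer:
        return trainer_cls(*args, **kwargs)

    return _init_trainer


builder = create_builder_fn(FakeTrainer)
trainer = builder(learning_rate=1e-6)
trainer.train()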

src/oumi/core/configs/params/training_params.py

Lines changed: 28 additions & 4 deletions

@@ -68,6 +68,15 @@ class TrainerType(Enum):
     designed to provide additional flexibility and features.
     """
 
+    VERL_GRPO = "verl_grpo"
+    """Group Relative Policy Optimization trainer from `verl` library.
+
+    This trainer implements the Group Relative Policy Optimization algorithm
+    introduced in the paper https://arxiv.org/pdf/2402.03300
+    for fine-tuning language models.
+    Optionally, supports user-defined reward functions.
+    """
+
 
 class SchedulerType(str, Enum):
     """Enum representing the supported learning rate schedulers.
@@ -153,7 +162,9 @@ class TrainingParams(BaseParams):
     - HF: HuggingFace's Trainer
     - TRL_SFT: TRL's SFT Trainer
     - TRL_DPO: TRL's DPO Trainer
+    - TRL_GRPO: TRL's GRPO Trainer
     - OUMI: Custom generic trainer implementation
+    - VERL_GRPO: verl's GRPO Trainer
     """
 
     enable_gradient_checkpointing: bool = False
@@ -312,8 +323,14 @@ class TrainingParams(BaseParams):
     """The names of the reward function in the Oumi registry to use for reinforcement
     learning.
 
-    Only supported with the TRL_GRPO trainer currently. Refer to
-    https://huggingface.co/docs/trl/main/en/grpo_trainer
+    Only supported with the TRL_GRPO and VERL_GRPO trainers. Currently,
+    VERL_GRPO only supports specifying a single reward function.
+
+    For TRL_GRPO, refer to https://huggingface.co/docs/trl/main/en/grpo_trainer
+    for documentation about the function signature.
+
+    For VERL_GRPO, refer to
+    https://verl.readthedocs.io/en/latest/preparation/reward_function.html
     for documentation about the function signature.
     """
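For illustration, a VERL_GRPO-style reward function might look like the sketch below. The (data_source, solution_str, ground_truth, extra_info) signature is taken from the verl reward-function documentation linked above, not from this diff, so treat it as an assumption; the function name and scoring rule are hypothetical. Its name would then be listed in reward_functions (a single entry for VERL_GRPO).

def compute_score(data_source, solution_str, ground_truth, extra_info=None) -> float:
    # Hypothetical reward: 1.0 for an exact match against the ground truth,
    # 0.0 otherwise. A real reward would parse and verify the model's output.
    return 1.0 if solution_str.strip() == str(ground_truth).strip() else 0.0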

@@ -798,14 +815,21 @@ def __post_init__(self):
 
         if (
             self.trainer_type != TrainerType.TRL_GRPO
+            and self.trainer_type != TrainerType.VERL_GRPO
             and self.reward_functions is not None
         ):
             function_names = [name for name in self.reward_functions if name]
             if len(function_names) > 0:
                 raise ValueError(
-                    "reward_functions may only be defined for the TRL_GRPO trainer. "
-                    f"Actual: {self.trainer_type}"
+                    "reward_functions may only be defined for the TRL_GRPO or VERL_GRPO"
+                    f"trainers. Actual: {self.trainer_type}"
                 )
+        if self.trainer_type == TrainerType.VERL_GRPO:
+            if len(function_names) > 1:
+                raise ValueError(
+                    "VERL_GRPO only supports a single reward function. "
+                    f"Actual: {function_names}"
+                )
 
         # TODO: #1540 - Remove when TRL bug is fixed.
         if (
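To make the new __post_init__ checks concrete, here is a hypothetical usage sketch (not part of the commit). It assumes the remaining TrainingParams fields have usable defaults and that construction alone triggers the validation.

import pytest

from oumi.core.configs import TrainerType, TrainingParams

# A single reward function name is accepted for the VERL_GRPO trainer.
params = TrainingParams(
    trainer_type=TrainerType.VERL_GRPO,
    reward_functions=["my_reward"],
)

# More than one reward function should be rejected by the new check.
with pytest.raises(ValueError, match="single reward function"):
    TrainingParams(
        trainer_type=TrainerType.VERL_GRPO,
        reward_functions=["reward_a", "reward_b"],
    )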

src/oumi/core/configs/training_config.py

Lines changed: 9 additions & 0 deletions

@@ -203,3 +203,12 @@ def __post_init__(self):
                 dataset_params.dataset_kwargs["processor_kwargs"] = {
                     **self.model.processor_kwargs
                 }
+
+        # Verl will error without a validation dataset.
+        if (
+            self.training.trainer_type == TrainerType.VERL_GRPO
+            and not self.data.validation.datasets
+        ):
+            raise ValueError(
+                "At least one validation dataset is required for VERL_GRPO training."
+            )
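For reference, a Python sketch of a config that satisfies this check might look as follows. Only data.validation.datasets and training.trainer_type appear in this diff; the DataParams, DatasetSplitParams, and DatasetParams names and fields are assumptions about the oumi config classes, and other required settings (e.g., the model section) are omitted, so treat this as illustrative only.

from oumi.core.configs import (
    DataParams,          # assumed class name
    DatasetParams,       # assumed class name
    DatasetSplitParams,  # assumed class name
    TrainerType,
    TrainingConfig,
    TrainingParams,
)

# Hypothetical VERL_GRPO config with the required validation split populated.
config = TrainingConfig(
    data=DataParams(
        train=DatasetSplitParams(datasets=[DatasetParams(dataset_name="my_train_set")]),
        validation=DatasetSplitParams(datasets=[DatasetParams(dataset_name="my_eval_set")]),
    ),
    training=TrainingParams(trainer_type=TrainerType.VERL_GRPO),
)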
