Commit c8e670b

Configurable reward functions (#2698)
1 parent 541c730 commit c8e670b

10 files changed: +314 -220 lines changed
recipes/configs/dev/qwen3B_async_grpo.yaml (+11)

@@ -97,6 +97,17 @@ training:
   epsilon: 0.2
   seed: null
 
+reward_functions:
+  - _component_: torchtune.dev.rl.rewards.FormattedMathCorrectnessReward
+    answer_tag: answer
+    positive_reward: 10.0
+    negative_reward: 0.0
+  - _component_: torchtune.dev.rl.rewards.ThinkingAnswerFormattingReward
+    think_tag: think
+    answer_tag: answer
+    positive_reward: 1.0
+    negative_reward: 0.0
+
 # All logging args
 metric_logger:
   _component_: torchtune.training.metric_logging.WandBLogger
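
Each entry under reward_functions follows torchtune's _component_ convention, so a recipe can build its reward stack directly from config instead of hard-coding reward logic. Below is a minimal sketch of how a recipe might instantiate the configured rewards with torchtune.config.instantiate; the setup_rewards helper name is an illustration for this note, not part of the commit.

from omegaconf import DictConfig, OmegaConf
from torchtune import config

def setup_rewards(cfg: DictConfig) -> list:
    # Instantiate each configured reward via its _component_ path;
    # fall back to an empty list when reward_functions is omitted.
    return [config.instantiate(rf_cfg) for rf_cfg in cfg.get("reward_functions", [])]

# Example: build one reward from a config fragment like the YAML above.
cfg = OmegaConf.create(
    {
        "reward_functions": [
            {
                "_component_": "torchtune.dev.rl.rewards.ThinkingAnswerFormattingReward",
                "think_tag": "think",
                "answer_tag": "answer",
                "positive_reward": 1.0,
                "negative_reward": 0.0,
            }
        ]
    }
)
rewards = setup_rewards(cfg)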

recipes/configs/dev/qwen3B_sync_grpo.yaml (+10)

@@ -107,6 +107,16 @@ compile: False # pytorch compile, set to true for better perf/memory
 # Reduced precision
 dtype: bf16
 
+reward_functions:
+  - _component_: torchtune.dev.rl.rewards.FormattedMathCorrectnessReward
+    answer_tag: answer
+    positive_reward: 10.0
+    negative_reward: 0.0
+  - _component_: torchtune.dev.rl.rewards.ThinkingAnswerFormattingReward
+    think_tag: think
+    answer_tag: answer
+    positive_reward: 1.0
+    negative_reward: 0.0
 
 # Logging
 metric_logger:
New file (+5)

@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
New file (+47)

@@ -0,0 +1,47 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import pytest
+import torch
+from torchtune.dev.rl.rewards import RewardOutput
+
+
+class TestRewardOutput:
+    @pytest.fixture
+    def sample_reward_output(self):
+        return RewardOutput(
+            reward_base_name="test_reward",
+            total_reward=torch.tensor([1.0, 2.0, 3.0]),
+            successes=torch.tensor([1.0, 0.0, 1.0]),
+            rewards={
+                "sub_reward_1": torch.tensor([0.5, 1.5, 2.5]),
+                "sub_reward_2": torch.tensor([10.0, 20.0, 30.0]),
+            },
+        )
+
+    def test_log(self, sample_reward_output):
+        log_dict = sample_reward_output.log(prefix="train")
+        expected_log = {
+            "train/test_reward/sub_reward_1": 1.5,
+            "train/test_reward/sub_reward_2": 20.0,
+            "train/test_reward": 2.0,
+            "train/test_reward/successes": 2.0 / 3.0,
+        }
+        assert log_dict.keys() == expected_log.keys()
+        for key in expected_log:
+            assert log_dict[key] == pytest.approx(expected_log[key])
+
+    def test_log_no_prefix(self, sample_reward_output):
+        log_dict = sample_reward_output.log()
+        expected_log = {
+            "test_reward/sub_reward_1": 1.5,
+            "test_reward/sub_reward_2": 20.0,
+            "test_reward": 2.0,
+            "test_reward/successes": 2.0 / 3.0,
+        }
+        assert log_dict.keys() == expected_log.keys()
+        for key in expected_log:
+            assert log_dict[key] == pytest.approx(expected_log[key])
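
The test above pins down the logging contract of RewardOutput: each sub-reward's batch mean is reported under prefix/base_name/key, the mean total reward under prefix/base_name, and the success rate under prefix/base_name/successes. The following is a rough re-creation of a container that satisfies that contract, for illustration only; the real RewardOutput in torchtune.dev.rl.rewards may be implemented differently.

from dataclasses import dataclass, field
from typing import Dict, Optional

import torch

@dataclass
class RewardOutputSketch:
    # Illustrative stand-in for torchtune.dev.rl.rewards.RewardOutput.
    reward_base_name: str
    total_reward: torch.Tensor                 # shape [batch]
    successes: torch.Tensor                    # shape [batch], 1.0 on success
    rewards: Dict[str, torch.Tensor] = field(default_factory=dict)

    def log(self, prefix: Optional[str] = None) -> Dict[str, float]:
        # Report batch means, optionally namespaced under a prefix.
        base = f"{prefix}/{self.reward_base_name}" if prefix else self.reward_base_name
        out = {f"{base}/{name}": t.mean().item() for name, t in self.rewards.items()}
        out[base] = self.total_reward.mean().item()
        out[f"{base}/successes"] = self.successes.mean().item()
        return out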

tests/torchtune/dev/rl/workers/test_postprocessing.py (+5 -8)

@@ -8,6 +8,9 @@
 import time
 
 import pytest
+import torch
+from omegaconf import OmegaConf
+from tests.test_utils import gen_log_file_name, gpu_test, rl_test, skip_if_lt_python_310
 
 _has_ray = importlib.util.find_spec("ray") is not None
 
@@ -25,10 +28,6 @@ def remote(*args, **kwargs):
         return lambda cls: cls
 
 
-import torch
-from omegaconf import OmegaConf
-from tests.test_utils import gen_log_file_name, gpu_test, rl_test, skip_if_lt_python_310
-
 grpo_samples = 4
 max_generated_tokens = 32
 
@@ -130,12 +129,10 @@ def test_run(self, cfg, log_file):
                 ).to(dtype=torch.bool),
                 seq_lens=torch.randint(0, 100, (grpo_samples,)),
                 answers=NonTensorData(["42"] * grpo_samples),
+                sequence_ids=None,
                 policy_version=None,
-                rewards=None,
                 advantages=None,
-                successes=None,
-                reward_metadata=None,
-                sequence_ids=None,
+                reward_outputs=None,
             )
         )
         replay_buffer = []

torchtune/dev/rl/datatypes/trajectory.py (+3 -4)

@@ -4,10 +4,11 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from typing import Dict, List
+from typing import List
 
 import torch
 from tensordict import TensorClass
+from torchtune.dev.rl.rewards import RewardOutput
 
 
 class Trajectory(TensorClass["nocast"]):
@@ -19,8 +20,6 @@ class Trajectory(TensorClass["nocast"]):
     seq_lens: torch.Tensor
     answers: torch.Tensor
     policy_version: int
-    rewards: torch.Tensor
     advantages: torch.Tensor
-    successes: torch.Tensor
-    reward_metadata: Dict[str, List[str]]
+    reward_outputs: List[RewardOutput]
     sequence_ids: List[str]