@@ -68,6 +68,7 @@ def train_preference_comparisons(
     fragment_length: int,
     transition_oversampling: float,
     initial_comparison_frac: float,
+    initial_epoch_multiplier: float,
     exploration_frac: float,
     trajectory_path: Optional[str],
     trajectory_generator_kwargs: Mapping[str, Any],
@@ -106,6 +107,9 @@ def train_preference_comparisons(
             sampled before the rest of training begins (using the randomly initialized
             agent). This can be used to pretrain the reward model before the agent
             is trained on the learned reward.
+        initial_epoch_multiplier: before agent training begins, train the reward
+            model for this many more epochs than usual (on fragments sampled from a
+            random agent).
         exploration_frac: fraction of trajectory samples that will be created using
             partially random actions, rather than the current policy. Might be helpful
             if the learned policy explores too little and gets stuck with a wrong
@@ -258,6 +262,7 @@ def train_preference_comparisons(
         fragment_length=fragment_length,
         transition_oversampling=transition_oversampling,
         initial_comparison_frac=initial_comparison_frac,
+        initial_epoch_multiplier=initial_epoch_multiplier,
         custom_logger=custom_logger,
         allow_variable_horizon=allow_variable_horizon,
         query_schedule=query_schedule,
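
For context, the sketch below illustrates the schedule this parameter controls: the first batch of comparisons (sized by `initial_comparison_frac`) comes from the randomly initialized agent, and the reward model is trained on it for `initial_epoch_multiplier` times the usual number of epochs before agent training starts. This is a minimal, self-contained illustration, not imitation's implementation; the function `reward_training_schedule` and the `base_epochs` / `num_iterations` arguments are hypothetical names, and only `initial_comparison_frac` and `initial_epoch_multiplier` correspond to the parameters in the diff.

```python
from typing import List, Tuple


def reward_training_schedule(
    total_comparisons: int,
    initial_comparison_frac: float,
    initial_epoch_multiplier: float,
    base_epochs: int,
    num_iterations: int,
) -> List[Tuple[int, int]]:
    """Return (num_comparisons, reward_epochs) for each reward-training round.

    Round 0 uses comparisons gathered from the randomly initialized agent and
    trains the reward model for base_epochs * initial_epoch_multiplier epochs;
    every later round falls back to base_epochs.
    """
    n_initial = int(total_comparisons * initial_comparison_frac)
    schedule = [(n_initial, int(base_epochs * initial_epoch_multiplier))]
    per_iteration = (total_comparisons - n_initial) // num_iterations
    schedule.extend([(per_iteration, base_epochs)] * num_iterations)
    return schedule


# Example: 1000 comparisons, 10% gathered up front, 3x extra pretraining epochs.
for round_idx, (n_pairs, epochs) in enumerate(
    reward_training_schedule(1000, 0.1, 3.0, 4, 5)
):
    print(f"round {round_idx}: {n_pairs} comparisons, {epochs} reward epochs")
```

Setting the multiplier above 1.0 front-loads reward-model fitting on the random-agent comparisons, so the agent starts training on a reward signal that has already been shaped by the initial preferences.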