diff --git a/setup.py b/setup.py
index 1c2c85af6..1a76e49fb 100644
--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,7 @@
 IS_NOT_WINDOWS = os.name != "nt"
-PARALLEL_REQUIRE = ["ray[debug,tune]~=2.0.0"]
+PARALLEL_REQUIRE = ["ray[debug,tune]~=2.9.0"]
 ATARI_REQUIRE = [
     "seals[atari]~=0.2.1",
 ]
diff --git a/src/imitation/algorithms/preference_comparisons.py b/src/imitation/algorithms/preference_comparisons.py
index 14a8fad5b..1b0a2b01b 100644
--- a/src/imitation/algorithms/preference_comparisons.py
+++ b/src/imitation/algorithms/preference_comparisons.py
@@ -1678,6 +1678,8 @@ def train(
         unnormalized_probs = vec_schedule(np.linspace(0, 1, self.num_iterations))
         probs = unnormalized_probs / np.sum(unnormalized_probs)
         shares = util.oric(probs * total_comparisons)
+        shares[shares <= 0] = 1  # ensure we at least request one comparison per iteration
+
         schedule = [initial_comparisons] + shares.tolist()
         print(f"Query schedule: {schedule}")
diff --git a/src/imitation/scripts/config/train_preference_comparisons.py b/src/imitation/scripts/config/train_preference_comparisons.py
index 4d8531732..b053d3f38 100644
--- a/src/imitation/scripts/config/train_preference_comparisons.py
+++ b/src/imitation/scripts/config/train_preference_comparisons.py
@@ -42,6 +42,8 @@ def train_defaults():
     transition_oversampling = 1
     # fraction of total_comparisons that will be sampled right at the beginning
     initial_comparison_frac = 0.1
+    # factor by which to oversample the number of epochs in the first iteration
+    initial_epoch_multiplier = 200.0
     # fraction of sampled trajectories that will include some random actions
     exploration_frac = 0.0
     preference_model_kwargs = {}
@@ -77,7 +79,7 @@ def cartpole():
 @train_preference_comparisons_ex.named_config
 def seals_ant():
-    environment = dict(gym_id="seals/Ant-v0")
+    environment = dict(gym_id="seals/Ant-v1")
     rl = dict(
         batch_size=2048,
         rl_kwargs=dict(
@@ -104,7 +106,7 @@ def half_cheetah():
 @train_preference_comparisons_ex.named_config
 def seals_half_cheetah():
-    environment = dict(gym_id="seals/HalfCheetah-v0")
+    environment = dict(gym_id="seals/HalfCheetah-v1")
     rl = dict(
         batch_size=512,
         rl_kwargs=dict(
@@ -125,7 +127,7 @@ def seals_half_cheetah():
 @train_preference_comparisons_ex.named_config
 def seals_hopper():
-    environment = dict(gym_id="seals/Hopper-v0")
+    environment = dict(gym_id="seals/Hopper-v1")
     policy = dict(
         policy_cls="MlpPolicy",
         policy_kwargs=dict(
@@ -151,7 +153,7 @@ def seals_hopper():
 @train_preference_comparisons_ex.named_config
 def seals_swimmer():
-    environment = dict(gym_id="seals/Swimmer-v0")
+    environment = dict(gym_id="seals/Swimmer-v1")
     policy = dict(
         policy_cls="MlpPolicy",
         policy_kwargs=dict(
@@ -178,7 +180,7 @@ def seals_swimmer():
 @train_preference_comparisons_ex.named_config
 def seals_walker():
-    environment = dict(gym_id="seals/Walker2d-v0")
+    environment = dict(gym_id="seals/Walker2d-v1")
     policy = dict(
         policy_cls="MlpPolicy",
         policy_kwargs=dict(
@@ -206,7 +208,7 @@ def seals_walker():
 @train_preference_comparisons_ex.named_config
 def seals_humanoid():
     locals().update(**MUJOCO_SHARED_LOCALS)
-    environment = dict(gym_id="seals/Humanoid-v0")
+    environment = dict(gym_id="seals/Humanoid-v1")
     total_timesteps = int(4e6)
diff --git a/src/imitation/scripts/config/tuning.py b/src/imitation/scripts/config/tuning.py
index 73313770a..daed7c1a0 100644
--- a/src/imitation/scripts/config/tuning.py
+++ b/src/imitation/scripts/config/tuning.py
@@ -188,38 +188,42 @@ def pc():
     parallel_run_config = dict(
         sacred_ex_name="train_preference_comparisons",
         run_name="pc_tuning",
-        base_named_configs=["logging.wandb_logging"],
+        base_named_configs=[],
         base_config_updates={
             "environment": {"num_vec": 1},
-            "demonstrations": {"source": "huggingface"},
             "total_timesteps": 2e7,
-            "total_comparisons": 5000,
-            "query_schedule": "hyperbolic",
-            "gatherer_kwargs": {"sample": True},
+            "total_comparisons": 1000,
+            "active_selection": True,
         },
         search_space={
-            "named_configs": [
-                ["reward.normalize_output_disable"],
-            ],
+            "named_configs": ["reward.reward_ensemble"],
             "config_updates": {
-                "train": {
-                    "policy_kwargs": {
-                        "activation_fn": tune.choice(
-                            [
-                                nn.ReLU,
-                            ],
-                        ),
-                    },
+                "active_selection_oversampling": tune.randint(1, 11),
+                "comparison_queue_size": tune.randint(1, 1001),  # upper bound determined by total_comparisons=1000
+                "exploration_frac": tune.uniform(0.0, 0.5),
+                "fragment_length": tune.randint(1, 1001),  # trajectories are 1000 steps long
+                "gatherer_kwargs": {
+                    "temperature": tune.uniform(0.0, 2.0),
+                    "discount_factor": tune.uniform(0.95, 1.0),
+                    "sample": tune.choice([True, False]),
                 },
-                "num_iterations": tune.choice([25, 50]),
-                "initial_comparison_frac": tune.choice([0.1, 0.25]),
+                "initial_comparison_frac": tune.uniform(0.01, 1.0),
+                "num_iterations": tune.randint(1, 51),
+                "preference_model_kwargs": {
+                    "noise_prob": tune.uniform(0.0, 0.1),
+                    "discount_factor": tune.uniform(0.95, 1.0),
+                },
+                "query_schedule": tune.choice(["hyperbolic", "constant", "inverse_quadratic"]),
+                "trajectory_generator_kwargs": {
+                    "switch_prob": tune.uniform(0.1, 1),
+                    "random_prob": tune.uniform(0.1, 0.9),
+                },
+                "transition_oversampling": tune.uniform(0.9, 2.0),
                 "reward_trainer_kwargs": {
-                    "epochs": tune.choice([1, 3, 6]),
+                    "epochs": tune.randint(1, 11),
                 },
                 "rl": {
-                    "batch_size": tune.choice([512, 2048, 8192]),
                     "rl_kwargs": {
-                        "learning_rate": tune.loguniform(1e-5, 1e-2),
                         "ent_coef": tune.loguniform(1e-7, 1e-3),
                     },
                 },
diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py
index d5e5e2378..76a068224 100644
--- a/src/imitation/scripts/parallel.py
+++ b/src/imitation/scripts/parallel.py
@@ -188,13 +188,12 @@ def _ray_tune_sacred_wrapper(
         `ex.run`) and `reporter`. The function returns the run result.
     """
-    def inner(config: Mapping[str, Any], reporter) -> Mapping[str, Any]:
+    def inner(config: Mapping[str, Any]) -> Mapping[str, Any]:
         """Trainable function with the correct signature for `ray.tune`.
         Args:
             config: Keyword arguments for `ex.run()`, where `ex` is the
                 `sacred.Experiment` instance associated with `sacred_ex_name`.
-            reporter: Callback to report progress to Ray.
         Returns:
             Result from `ray.Run` object.
diff --git a/src/imitation/scripts/train_preference_comparisons.py b/src/imitation/scripts/train_preference_comparisons.py
index 71363daee..428c98381 100644
--- a/src/imitation/scripts/train_preference_comparisons.py
+++ b/src/imitation/scripts/train_preference_comparisons.py
@@ -68,6 +68,7 @@ def train_preference_comparisons(
     fragment_length: int,
     transition_oversampling: float,
     initial_comparison_frac: float,
+    initial_epoch_multiplier: float,
     exploration_frac: float,
     trajectory_path: Optional[str],
     trajectory_generator_kwargs: Mapping[str, Any],
@@ -106,6 +107,9 @@ def train_preference_comparisons(
             sampled before the rest of training begins (using the randomly initialized
             agent). This can be used to pretrain the reward model before the agent is
             trained on the learned reward.
+        initial_epoch_multiplier: before agent training begins, train the reward
+            model for this many more epochs than usual (on fragments sampled from a
+            random agent).
         exploration_frac: fraction of trajectory samples that will be created using
             partially random actions, rather than the current policy. Might be helpful
             if the learned policy explores too little and gets stuck with a wrong
@@ -258,6 +262,7 @@ def train_preference_comparisons(
         fragment_length=fragment_length,
         transition_oversampling=transition_oversampling,
         initial_comparison_frac=initial_comparison_frac,
+        initial_epoch_multiplier=initial_epoch_multiplier,
         custom_logger=custom_logger,
         allow_variable_horizon=allow_variable_horizon,
         query_schedule=query_schedule,
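
Note on the `shares[shares <= 0] = 1` clamp in the `preference_comparisons.py` hunk: when `initial_comparison_frac` is large, or `num_iterations` is high relative to `total_comparisons`, the rounded per-iteration shares can come out as zero, so some iterations would request no comparisons at all. The sketch below reproduces the share computation with illustrative numbers and with `np.round` standing in for `util.oric` (the library's sum-preserving integer rounding); it is a hand-rolled approximation, not the exact code path.

import numpy as np

# Illustrative settings: most comparisons are gathered up front, leaving only a
# small remainder to spread across many iterations.
total_comparisons = 1000
initial_comparison_frac = 0.98
num_iterations = 50

initial_comparisons = int(initial_comparison_frac * total_comparisons)
remaining = total_comparisons - initial_comparisons  # only 20 comparisons left

# Hyperbolic-style schedule weights (1 / (1 + t)), normalized to probabilities.
t = np.linspace(0, 1, num_iterations)
probs = (1.0 / (1.0 + t)) / np.sum(1.0 / (1.0 + t))

# np.round stands in for util.oric here.
shares = np.round(probs * remaining).astype(int)
print("zero-share iterations before clamp:", int((shares == 0).sum()))

shares[shares <= 0] = 1  # the clamp from the patch: every iteration requests >= 1 comparison
assert (shares > 0).all()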
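Since `initial_epoch_multiplier` is now plumbed through the Sacred config, it can be overridden per run. A hypothetical smoke-test invocation, with the named config and override values chosen purely for illustration:

from imitation.scripts.train_preference_comparisons import (
    train_preference_comparisons_ex,
)

# Override the new knob (default 200.0 above) alongside a tiny budget so the
# run finishes quickly; these are illustrative values, not recommended settings.
run = train_preference_comparisons_ex.run(
    named_configs=["cartpole"],
    config_updates={
        "initial_epoch_multiplier": 100.0,
        "total_timesteps": 10_000,
        "total_comparisons": 50,
    },
)
print(run.result)

The same override can also be passed on the Sacred command line, e.g. `with cartpole initial_epoch_multiplier=100`.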
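The `parallel.py` change follows the Ray 2.x function-trainable convention that comes with the `ray~=2.9.0` bump: trainables receive only `config`, and metrics go through Ray's reporting call rather than a `reporter` positional argument. A minimal sketch of that shape, assuming Ray 2.9 and a placeholder metric:

from ray import train, tune

def inner(config):
    # New-style trainable: only `config` is passed in; results are reported
    # via ray.train.report instead of a `reporter` callback.
    train.report({"mean_reward": float(config["seed"])})  # placeholder metric

tuner = tune.Tuner(inner, param_space={"seed": tune.randint(0, 10)})
results = tuner.fit()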