From 8d5900a380ea49f6871c900254e467fc7a0f5278 Mon Sep 17 00:00:00 2001 From: Dan Pandori Date: Wed, 9 Nov 2022 16:26:10 -0800 Subject: [PATCH 01/55] Welfords alg and test --- src/imitation/util/util.py | 41 ++++++++++++++++++++++ tests/util/test_util.py | 69 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 110 insertions(+) diff --git a/src/imitation/util/util.py b/src/imitation/util/util.py index bbb7b2c37..317b17bbb 100644 --- a/src/imitation/util/util.py +++ b/src/imitation/util/util.py @@ -359,3 +359,44 @@ def get_first_iter_element(iterable: Iterable[T]) -> Tuple[T, Iterable[T]]: return_iterable = iterable return first_element, return_iterable + + +class RunningMeanAndVar: + """Stores a running mean and variance using Wellford's algorithm.""" + + def __init__( + self, + shape: Tuple[int, ...] = (), + device: Optional[str] = None, + ): + """Initialize blank mean, variance, count.""" + self.mean = th.zeros(shape, device=device) + self.M2 = th.zeros(shape, device=device) + self.count = 0 + + def update(self, x: th.Tensor): + with th.no_grad(): + batch_mean = th.mean(x, dim=0) + batch_var = th.var(x, dim=0, unbiased=False) + batch_count = x.shape[0] + batch_M2 = batch_var * batch_count + if self.count == 0: + self.count = batch_count + self.mean = batch_mean + self.M2 = batch_M2 + return + + delta = batch_mean - self.mean + total_count = self.count + batch_count + self.mean += delta * batch_count / total_count + + self.M2 += ( + batch_M2 + delta * delta * (self.count * batch_count) / total_count + ) + + self.count = total_count + + @property + def var(self): + """Returns the unbiased estimate of the variance.""" + return self.M2 / (self.count - 1) diff --git a/tests/util/test_util.py b/tests/util/test_util.py index ce663d8e0..b58e32b7c 100644 --- a/tests/util/test_util.py +++ b/tests/util/test_util.py @@ -118,3 +118,72 @@ def test_tensor_iter_norm(): assert np.allclose(norm_1, 14.0) with pytest.raises(ValueError): util.tensor_iter_norm(tensor_list, ord=0.0) + + +def test_RunningMeanAndVarSimple(): + running_stats = util.RunningMeanAndVar(shape=(3, 4)) + first_half = th.ones(size=(10, 3, 4), dtype=th.double) + + running_stats.update(first_half) + np.testing.assert_allclose( + running_stats.mean, + first_half.mean(dim=0), + atol=1e-5, + rtol=1e-4, + ) + np.testing.assert_allclose( + running_stats.var, + first_half.var(dim=0), + atol=1e-5, + rtol=1e-4, + ) + + second_half = 2 * th.ones(size=(10, 3, 4), dtype=th.double) + data = th.cat([first_half, second_half]) + running_stats.update(second_half) + np.testing.assert_allclose( + running_stats.mean, + data.mean(dim=0), + atol=1e-5, + rtol=1e-4, + ) + np.testing.assert_allclose( + running_stats.var, + data.var(dim=0), + atol=1e-5, + rtol=1e-4, + ) + + +def test_RunningMeanAndVar(): + running_stats = util.RunningMeanAndVar(shape=(3, 4)) + data = th.normal(mean=10 * th.ones(size=(20, 3, 4), dtype=th.double)) + + first_half = data[:10] + running_stats.update(first_half) + np.testing.assert_allclose( + running_stats.mean, + first_half.mean(dim=0), + atol=1e-5, + rtol=1e-4, + ) + np.testing.assert_allclose( + running_stats.var, + first_half.var(dim=0), + atol=1e-5, + rtol=1e-4, + ) + + running_stats.update(data[10:]) + np.testing.assert_allclose( + running_stats.mean, + data.mean(dim=0), + atol=1e-5, + rtol=1e-4, + ) + np.testing.assert_allclose( + running_stats.var, + data.var(dim=0), + atol=1e-5, + rtol=1e-4, + ) From 4aac074b73d89b5e96e169d7a27aea031c5a6a05 Mon Sep 17 00:00:00 2001 From: Dan Pandori Date: Thu, 10 Nov 2022 08:59:27 -0800 
Subject: [PATCH 02/55] Next func --- src/imitation/util/util.py | 50 ++++++++++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/src/imitation/util/util.py b/src/imitation/util/util.py index 317b17bbb..af09e2fe3 100644 --- a/src/imitation/util/util.py +++ b/src/imitation/util/util.py @@ -368,13 +368,13 @@ def __init__( self, shape: Tuple[int, ...] = (), device: Optional[str] = None, - ): + ) -> None: """Initialize blank mean, variance, count.""" self.mean = th.zeros(shape, device=device) self.M2 = th.zeros(shape, device=device) self.count = 0 - def update(self, x: th.Tensor): + def update(self, x: th.Tensor) -> None: with th.no_grad(): batch_mean = th.mean(x, dim=0) batch_var = th.var(x, dim=0, unbiased=False) @@ -390,13 +390,49 @@ def update(self, x: th.Tensor): total_count = self.count + batch_count self.mean += delta * batch_count / total_count - self.M2 += ( - batch_M2 + delta * delta * (self.count * batch_count) / total_count - ) + self.M2 += batch_M2 + delta * delta * self.count * batch_count / total_count self.count = total_count @property - def var(self): - """Returns the unbiased estimate of the variance.""" + def var(self) -> th.Tensor: + """Returns the unbiased estimate of the variances.""" return self.M2 / (self.count - 1) + + +def compute_state_entropy( + obs: th.Tensor, + all_obs: th.Tensor, + k: int, + batch_size: int = 500, +) -> th.Tensor: + """Compute the state entropy given by KNN distance. + + Args: + obs: The tensor of states to compute entropy for. + all_obs: The tensor of all states in our experience, + generally from a replay buffer. + k: the number of neighbors to consider + batch_size: when computing distances, how many to consider at once. + + Returns: + A tensor containing the state entropy for `obs`. + """ + with th.no_grad(): + distances = [] + for i in range(len(all_obs) // batch_size + 1): + start = i * batch_size + end = min((i + 1) * batch_size, obs.shape[1]) + # TODO what is going on w/ these shapes? 
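+            # Shape note (assuming obs is (B, D) and all_obs is (N, D)):
+            # obs[:, None, :] - all_obs[None, start:end, :] broadcasts to
+            # (B, end - start, D), so the norm over dim=-1 yields one distance
+            # per (query, candidate) pair with shape (B, end - start).
+            # The `end` bound above likely wants len(all_obs) rather than
+            # obs.shape[1], since the loop batches over all_obs.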
+ # TODO use a non-deprecated norm function + distance = th.norm( + obs[:, None, :] - all_obs[None, start:end, :], + dim=-1, + p=2, + ) + distances.append(distance) + + distances_tensor = th.cat(distances, dim=1) + knn_dists = th.kthvalue(distances_tensor, k=k + 1, dim=1).values + state_entropy = knn_dists + return state_entropy.unsqueeze(1) From 383fce06fb9aef2283131a2589039767aa0b8c2c Mon Sep 17 00:00:00 2001 From: Dan Pandori Date: Thu, 10 Nov 2022 10:29:50 -0800 Subject: [PATCH 03/55] Test update --- tests/util/test_util.py | 35 ----------------------------------- 1 file changed, 35 deletions(-) diff --git a/tests/util/test_util.py b/tests/util/test_util.py index b58e32b7c..1f2862cce 100644 --- a/tests/util/test_util.py +++ b/tests/util/test_util.py @@ -120,41 +120,6 @@ def test_tensor_iter_norm(): util.tensor_iter_norm(tensor_list, ord=0.0) -def test_RunningMeanAndVarSimple(): - running_stats = util.RunningMeanAndVar(shape=(3, 4)) - first_half = th.ones(size=(10, 3, 4), dtype=th.double) - - running_stats.update(first_half) - np.testing.assert_allclose( - running_stats.mean, - first_half.mean(dim=0), - atol=1e-5, - rtol=1e-4, - ) - np.testing.assert_allclose( - running_stats.var, - first_half.var(dim=0), - atol=1e-5, - rtol=1e-4, - ) - - second_half = 2 * th.ones(size=(10, 3, 4), dtype=th.double) - data = th.cat([first_half, second_half]) - running_stats.update(second_half) - np.testing.assert_allclose( - running_stats.mean, - data.mean(dim=0), - atol=1e-5, - rtol=1e-4, - ) - np.testing.assert_allclose( - running_stats.var, - data.var(dim=0), - atol=1e-5, - rtol=1e-4, - ) - - def test_RunningMeanAndVar(): running_stats = util.RunningMeanAndVar(shape=(3, 4)) data = th.normal(mean=10 * th.ones(size=(20, 3, 4), dtype=th.double)) From 055fa67ab7b76ba3c2d14629d5eb5d0132ae0f4c Mon Sep 17 00:00:00 2001 From: Dan Pandori Date: Thu, 10 Nov 2022 17:11:06 -0800 Subject: [PATCH 04/55] compute_state_entropy and test --- src/imitation/util/util.py | 35 ++++++++++++++--------------------- tests/util/test_util.py | 26 ++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 21 deletions(-) diff --git a/src/imitation/util/util.py b/src/imitation/util/util.py index af09e2fe3..b5f58a0cb 100644 --- a/src/imitation/util/util.py +++ b/src/imitation/util/util.py @@ -404,35 +404,28 @@ def compute_state_entropy( obs: th.Tensor, all_obs: th.Tensor, k: int, - batch_size: int = 500, ) -> th.Tensor: """Compute the state entropy given by KNN distance. Args: - obs: The tensor of states to compute entropy for. - all_obs: The tensor of all states in our experience, - generally from a replay buffer. + obs: A single observation. + all_obs: The tensor of all states to compare to. k: the number of neighbors to consider - batch_size: when computing distances, how many to consider at once. Returns: A tensor containing the state entropy for `obs`. """ + assert obs.shape == all_obs.shape[1:] with th.no_grad(): - distances = [] - for i in range(len(all_obs) // batch_size + 1): - start = i * batch_size - end = min((i + 1) * batch_size, obs.shape[1]) - # TODO what is going on w/ these shapes? 
- # TODO use a non-deprecated norm function - distance = th.norm( - obs[:, None, :] - all_obs[None, start:end, :], - dim=-1, - p=2, - ) - distances.append(distance) - - distances_tensor = th.cat(distances, dim=1) - knn_dists = th.kthvalue(distances_tensor, k=k + 1, dim=1).values + non_batch_dimensions = tuple(range(1, len(obs.shape) + 1)) + distances_tensor = th.linalg.vector_norm( + obs[None] - all_obs, + dim=non_batch_dimensions, + ord=2, + ) + + # Note that we take the k+1'th value because the closest neighbor to + # a point is itself, which we want to skip. + knn_dists = th.kthvalue(distances_tensor, k=k + 1, dim=0).values state_entropy = knn_dists - return state_entropy.unsqueeze(1) + return state_entropy.unsqueeze(0) diff --git a/tests/util/test_util.py b/tests/util/test_util.py index 1f2862cce..54c89fe80 100644 --- a/tests/util/test_util.py +++ b/tests/util/test_util.py @@ -152,3 +152,29 @@ def test_RunningMeanAndVar(): atol=1e-5, rtol=1e-4, ) + + +def test_compute_state_entropy_1d(): + all_obs = th.arange(10, dtype=th.float).unsqueeze(1) + obs = all_obs[5] + assert util.compute_state_entropy(obs, all_obs, k=1) == 1 + assert util.compute_state_entropy(obs, all_obs, k=2) == 1 + assert util.compute_state_entropy(obs, all_obs, k=3) == 2 + assert util.compute_state_entropy(obs, all_obs, k=4) == 2 + assert util.compute_state_entropy(obs, all_obs, k=5) == 3 + + +def test_compute_state_entropy_2d(): + all_obs_x = th.arange(10, dtype=th.float) + all_obs_y = th.arange(0, 100, step=10, dtype=th.float) + all_obs = th.stack((all_obs_x, all_obs_y), dim=1) + + obs = all_obs[5] + np.testing.assert_allclose( + util.compute_state_entropy(obs, all_obs, k=1), + np.sqrt(10**2 + 1**2), + ) + np.testing.assert_allclose( + util.compute_state_entropy(obs, all_obs, k=3), + np.sqrt(20**2 + 2**2), + ) From 5c278f40d511d4ccea45e8505d17efa0527da082 Mon Sep 17 00:00:00 2001 From: Dan Pandori Date: Thu, 10 Nov 2022 17:32:36 -0800 Subject: [PATCH 05/55] Sketch of the entropy reward replay buffer --- .../policies/replay_buffer_wrapper.py | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/src/imitation/policies/replay_buffer_wrapper.py b/src/imitation/policies/replay_buffer_wrapper.py index 6d0d70449..167261109 100644 --- a/src/imitation/policies/replay_buffer_wrapper.py +++ b/src/imitation/policies/replay_buffer_wrapper.py @@ -101,3 +101,54 @@ def _get_samples(self): "_get_samples() is intentionally not implemented." "This method should not be called.", ) + + +class ReplayBufferEntropyRewardWrapper(ReplayBuffer): + """Relabel the rewards from a ReplayBuffer, initially using entropy as reward.""" + + def __init__( + self, + buffer_size: int, + observation_space: spaces.Space, + action_space: spaces.Space, + *, + replay_buffer_class: Type[ReplayBuffer], + reward_fn: RewardFn, + entropy_as_reward_samples: int, + **kwargs, + ): + """Builds ReplayBufferRewardWrapper. + + Args: + buffer_size: Max number of elements in the buffer + observation_space: Observation space + action_space: Action space + replay_buffer_class: Class of the replay buffer. + reward_fn: Reward function for reward relabeling. + entropy_as_reward_samples: Number of samples to use entropy as the reward, + before switching to using the reward_fn for relabeling. + **kwargs: keyword arguments for ReplayBuffer. 
+ """ + super().__init__( + buffer_size, + observation_space, + action_space, + replay_buffer_class, + reward_fn, + **kwargs, + ) + # TODO should we limit by number of batches (as this does) + # or number of observations returned? + self.samples = 0 + self.entropy_as_reward_samples = entropy_as_reward_samples + + def sample(self, *args, **kwargs): + self.samples += 1 + samples = super().sample(*args, **kwargs) + if self.samples > self.entropy_as_reward_samples: + return samples + + # TODO make the state entropy function accept batches + # TODO compute state entropy for each reward + # TODO replace the reward with the entropies + # TODO note that we really ought to reset the reward network when we are done w/ entropy, and we have no business training it before then From 49dc26f6fb1069a0066b907fd565e5da4713ee99 Mon Sep 17 00:00:00 2001 From: Dan Pandori Date: Fri, 11 Nov 2022 11:20:52 -0800 Subject: [PATCH 06/55] Batchify state entropy func --- src/imitation/util/util.py | 12 ++++++------ tests/util/test_util.py | 14 +++++++------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/imitation/util/util.py b/src/imitation/util/util.py index b5f58a0cb..4e0bd90e0 100644 --- a/src/imitation/util/util.py +++ b/src/imitation/util/util.py @@ -408,24 +408,24 @@ def compute_state_entropy( """Compute the state entropy given by KNN distance. Args: - obs: A single observation. + obs: A batch of observations. all_obs: The tensor of all states to compare to. k: the number of neighbors to consider Returns: A tensor containing the state entropy for `obs`. """ - assert obs.shape == all_obs.shape[1:] + assert obs.shape[1:] == all_obs.shape[1:] with th.no_grad(): - non_batch_dimensions = tuple(range(1, len(obs.shape) + 1)) + non_batch_dimensions = tuple(range(2, len(obs.shape) + 1)) distances_tensor = th.linalg.vector_norm( - obs[None] - all_obs, + obs[:, None] - all_obs[None, :], dim=non_batch_dimensions, ord=2, ) # Note that we take the k+1'th value because the closest neighbor to # a point is itself, which we want to skip. 
- knn_dists = th.kthvalue(distances_tensor, k=k + 1, dim=0).values + knn_dists = th.kthvalue(distances_tensor, k=k + 1, dim=1).values state_entropy = knn_dists - return state_entropy.unsqueeze(0) + return state_entropy.unsqueeze(1) diff --git a/tests/util/test_util.py b/tests/util/test_util.py index 54c89fe80..6f90dd832 100644 --- a/tests/util/test_util.py +++ b/tests/util/test_util.py @@ -156,12 +156,12 @@ def test_RunningMeanAndVar(): def test_compute_state_entropy_1d(): all_obs = th.arange(10, dtype=th.float).unsqueeze(1) - obs = all_obs[5] - assert util.compute_state_entropy(obs, all_obs, k=1) == 1 - assert util.compute_state_entropy(obs, all_obs, k=2) == 1 - assert util.compute_state_entropy(obs, all_obs, k=3) == 2 - assert util.compute_state_entropy(obs, all_obs, k=4) == 2 - assert util.compute_state_entropy(obs, all_obs, k=5) == 3 + obs = all_obs[4:6] + np.testing.assert_allclose(util.compute_state_entropy(obs, all_obs, k=1), 1) + np.testing.assert_allclose(util.compute_state_entropy(obs, all_obs, k=2), 1) + np.testing.assert_allclose(util.compute_state_entropy(obs, all_obs, k=3), 2) + np.testing.assert_allclose(util.compute_state_entropy(obs, all_obs, k=4), 2) + np.testing.assert_allclose(util.compute_state_entropy(obs, all_obs, k=5), 3) def test_compute_state_entropy_2d(): @@ -169,7 +169,7 @@ def test_compute_state_entropy_2d(): all_obs_y = th.arange(0, 100, step=10, dtype=th.float) all_obs = th.stack((all_obs_x, all_obs_y), dim=1) - obs = all_obs[5] + obs = all_obs[4:6] np.testing.assert_allclose( util.compute_state_entropy(obs, all_obs, k=1), np.sqrt(10**2 + 1**2), From 394ad56a94e4c308c9fb762828807d9b14eee122 Mon Sep 17 00:00:00 2001 From: Dan Pandori Date: Fri, 11 Nov 2022 11:35:35 -0800 Subject: [PATCH 07/55] Final sketch of replay entropy buffer. --- .../policies/replay_buffer_wrapper.py | 28 ++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/src/imitation/policies/replay_buffer_wrapper.py b/src/imitation/policies/replay_buffer_wrapper.py index 167261109..c7eebfac8 100644 --- a/src/imitation/policies/replay_buffer_wrapper.py +++ b/src/imitation/policies/replay_buffer_wrapper.py @@ -115,6 +115,7 @@ def __init__( replay_buffer_class: Type[ReplayBuffer], reward_fn: RewardFn, entropy_as_reward_samples: int, + k: int = 5, **kwargs, ): """Builds ReplayBufferRewardWrapper. @@ -127,6 +128,7 @@ def __init__( reward_fn: Reward function for reward relabeling. entropy_as_reward_samples: Number of samples to use entropy as the reward, before switching to using the reward_fn for relabeling. + k: Use the k'th nearest neighbor's distance when computing state entropy. **kwargs: keyword arguments for ReplayBuffer. """ super().__init__( @@ -141,14 +143,32 @@ def __init__( # or number of observations returned? 
self.samples = 0 self.entropy_as_reward_samples = entropy_as_reward_samples + self.k = k def sample(self, *args, **kwargs): self.samples += 1 samples = super().sample(*args, **kwargs) if self.samples > self.entropy_as_reward_samples: return samples + # TODO we really ought to reset the reward network when we are done w/ entropy, + # and we have no business training it before then + + if self.full: + all_obs = self.observations + else: + all_obs = self.observations[: self.pos] + entropies = util.compute_state_entropy(samples.observations, all_obs, self.k) + entropies_th = ( + util.safe_to_tensor(entropies) + .reshape(samples.rewards.shape) + .to(samples.rewards.device) + ) + # TODO normalize entropies w/ RunningMeanAndVar - # TODO make the state entropy function accept batches - # TODO compute state entropy for each reward - # TODO replace the reward with the entropies - # TODO note that we really ought to reset the reward network when we are done w/ entropy, and we have no business training it before then + return ReplayBufferSamples( + samples.observations, + samples.actions, + samples.next_observations, + samples.dones, + entropies_th, + ) From 21da5328836bfd04452e175d07f1ad3ef2e75869 Mon Sep 17 00:00:00 2001 From: Dan Pandori Date: Fri, 11 Nov 2022 12:42:39 -0800 Subject: [PATCH 08/55] First test --- .../policies/replay_buffer_wrapper.py | 20 +++- src/imitation/util/util.py | 6 + tests/policies/test_replay_buffer_wrapper.py | 104 +++++++++++++++++- 3 files changed, 123 insertions(+), 7 deletions(-) diff --git a/src/imitation/policies/replay_buffer_wrapper.py b/src/imitation/policies/replay_buffer_wrapper.py index c7eebfac8..3e6a2ac8c 100644 --- a/src/imitation/policies/replay_buffer_wrapper.py +++ b/src/imitation/policies/replay_buffer_wrapper.py @@ -103,7 +103,7 @@ def _get_samples(self): ) -class ReplayBufferEntropyRewardWrapper(ReplayBuffer): +class ReplayBufferEntropyRewardWrapper(ReplayBufferRewardWrapper): """Relabel the rewards from a ReplayBuffer, initially using entropy as reward.""" def __init__( @@ -135,8 +135,8 @@ def __init__( buffer_size, observation_space, action_space, - replay_buffer_class, - reward_fn, + replay_buffer_class=replay_buffer_class, + reward_fn=reward_fn, **kwargs, ) # TODO should we limit by number of batches (as this does) @@ -144,26 +144,34 @@ def __init__( self.samples = 0 self.entropy_as_reward_samples = entropy_as_reward_samples self.k = k + # TODO support n_envs > 1 + self.entropy_stats = util.RunningMeanAndVar(shape=(1,)) def sample(self, *args, **kwargs): self.samples += 1 samples = super().sample(*args, **kwargs) if self.samples > self.entropy_as_reward_samples: return samples - # TODO we really ought to reset the reward network when we are done w/ entropy, - # and we have no business training it before then + # TODO we really ought to reset the reward network once we are done w/ + # the entropy based pre-training. We also have no reason to train + # or even use the reward network before then. 
if self.full: all_obs = self.observations else: all_obs = self.observations[: self.pos] entropies = util.compute_state_entropy(samples.observations, all_obs, self.k) + + # Normalize to have mean of 0 and standard deviation of 1 + self.entropy_stats.update(entropies) + entropies -= self.entropy_stats.mean + entropies /= self.entropy_stats.std + entropies_th = ( util.safe_to_tensor(entropies) .reshape(samples.rewards.shape) .to(samples.rewards.device) ) - # TODO normalize entropies w/ RunningMeanAndVar return ReplayBufferSamples( samples.observations, diff --git a/src/imitation/util/util.py b/src/imitation/util/util.py index 4e0bd90e0..33b2179c7 100644 --- a/src/imitation/util/util.py +++ b/src/imitation/util/util.py @@ -375,6 +375,7 @@ def __init__( self.count = 0 def update(self, x: th.Tensor) -> None: + """Update the mean and variance with a batch `x`.""" with th.no_grad(): batch_mean = th.mean(x, dim=0) batch_var = th.var(x, dim=0, unbiased=False) @@ -399,6 +400,11 @@ def var(self) -> th.Tensor: """Returns the unbiased estimate of the variances.""" return self.M2 / (self.count - 1) + @property + def std(self) -> th.Tensor: + """Returns the unbiased estimate of the standard deviations.""" + return np.sqrt(self.var) + def compute_state_entropy( obs: th.Tensor, diff --git a/tests/policies/test_replay_buffer_wrapper.py b/tests/policies/test_replay_buffer_wrapper.py index 40fc6eac5..bc6fb436e 100644 --- a/tests/policies/test_replay_buffer_wrapper.py +++ b/tests/policies/test_replay_buffer_wrapper.py @@ -11,7 +11,10 @@ from stable_baselines3.common.policies import BasePolicy from stable_baselines3.common.save_util import load_from_pkl -from imitation.policies.replay_buffer_wrapper import ReplayBufferRewardWrapper +from imitation.policies.replay_buffer_wrapper import ( + ReplayBufferEntropyRewardWrapper, + ReplayBufferRewardWrapper, +) from imitation.util import util @@ -112,3 +115,102 @@ def test_wrapper_class(tmpdir, rng): # raise error for _get_samples() with pytest.raises(NotImplementedError, match=r".*_get_samples.*"): replay_buffer_wrapper._get_samples() + + +# Combine this with the above test via parameterization over the buffer class +def test_entropy_wrapper_class_no_op(tmpdir, rng): + buffer_size = 15 + total_timesteps = 20 + + venv = util.make_vec_env("Pendulum-v1", n_envs=1, rng=rng) + rl_algo = sb3.SAC( + policy=sb3.sac.policies.SACPolicy, + policy_kwargs=dict(), + env=venv, + seed=42, + replay_buffer_class=ReplayBufferEntropyRewardWrapper, + replay_buffer_kwargs=dict( + replay_buffer_class=buffers.ReplayBuffer, + reward_fn=zero_reward_fn, + entropy_as_reward_samples=0, + ), + buffer_size=buffer_size, + ) + + rl_algo.learn(total_timesteps=total_timesteps) + + buffer_path = osp.join(tmpdir, "buffer.pkl") + rl_algo.save_replay_buffer(buffer_path) + replay_buffer_wrapper = load_from_pkl(buffer_path) + replay_buffer = replay_buffer_wrapper.replay_buffer + + # replay_buffer_wrapper.sample(...) 
should return zero-reward transitions + assert buffer_size == replay_buffer_wrapper.size() == replay_buffer.size() + assert (replay_buffer_wrapper.sample(total_timesteps).rewards == 0.0).all() + assert (replay_buffer.sample(total_timesteps).rewards != 0.0).all() # seed=42 + + # replay_buffer_wrapper.pos, replay_buffer_wrapper.full + assert replay_buffer_wrapper.pos == total_timesteps - buffer_size + assert replay_buffer_wrapper.full + + # reset() + replay_buffer_wrapper.reset() + assert 0 == replay_buffer_wrapper.size() == replay_buffer.size() + assert replay_buffer_wrapper.pos == 0 + assert not replay_buffer_wrapper.full + + # to_torch() + tensor = replay_buffer_wrapper.to_torch(np.ones(42)) + assert type(tensor) is th.Tensor + + +# Combine this with the above test via parameterization over the buffer class +def test_entropy_wrapper_class(tmpdir, rng): + buffer_size = 15 + total_timesteps = 20 + + # TODO make entropy reward wrapper + # TODO learn w/ entropy for X timesteps on dummy environment where + # next observation is action, as is reward + # TODO expect that our behavior is approximately uniformly distributed + + venv = util.make_vec_env("Pendulum-v1", n_envs=1, rng=rng) + rl_algo = sb3.SAC( + policy=sb3.sac.policies.SACPolicy, + policy_kwargs=dict(), + env=venv, + seed=42, + replay_buffer_class=ReplayBufferEntropyRewardWrapper, + replay_buffer_kwargs=dict( + replay_buffer_class=buffers.ReplayBuffer, + reward_fn=zero_reward_fn, + entropy_as_reward_samples=0, + ), + buffer_size=buffer_size, + ) + + rl_algo.learn(total_timesteps=total_timesteps) + + buffer_path = osp.join(tmpdir, "buffer.pkl") + rl_algo.save_replay_buffer(buffer_path) + replay_buffer_wrapper = load_from_pkl(buffer_path) + replay_buffer = replay_buffer_wrapper.replay_buffer + + # replay_buffer_wrapper.sample(...) 
should return zero-reward transitions + assert buffer_size == replay_buffer_wrapper.size() == replay_buffer.size() + assert (replay_buffer_wrapper.sample(total_timesteps).rewards == 0.0).all() + assert (replay_buffer.sample(total_timesteps).rewards != 0.0).all() # seed=42 + + # replay_buffer_wrapper.pos, replay_buffer_wrapper.full + assert replay_buffer_wrapper.pos == total_timesteps - buffer_size + assert replay_buffer_wrapper.full + + # reset() + replay_buffer_wrapper.reset() + assert 0 == replay_buffer_wrapper.size() == replay_buffer.size() + assert replay_buffer_wrapper.pos == 0 + assert not replay_buffer_wrapper.full + + # to_torch() + tensor = replay_buffer_wrapper.to_torch(np.ones(42)) + assert type(tensor) is th.Tensor From 15dad9999004b595e20229f71f751e568ab56c6a Mon Sep 17 00:00:00 2001 From: Dan Pandori Date: Fri, 11 Nov 2022 14:49:15 -0800 Subject: [PATCH 09/55] Test cleanup --- tests/policies/test_replay_buffer_wrapper.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/policies/test_replay_buffer_wrapper.py b/tests/policies/test_replay_buffer_wrapper.py index bc6fb436e..c8b9e24a0 100644 --- a/tests/policies/test_replay_buffer_wrapper.py +++ b/tests/policies/test_replay_buffer_wrapper.py @@ -164,12 +164,11 @@ def test_entropy_wrapper_class_no_op(tmpdir, rng): assert type(tensor) is th.Tensor -# Combine this with the above test via parameterization over the buffer class def test_entropy_wrapper_class(tmpdir, rng): buffer_size = 15 + entropy_samples = 10 total_timesteps = 20 - # TODO make entropy reward wrapper # TODO learn w/ entropy for X timesteps on dummy environment where # next observation is action, as is reward # TODO expect that our behavior is approximately uniformly distributed @@ -184,7 +183,7 @@ def test_entropy_wrapper_class(tmpdir, rng): replay_buffer_kwargs=dict( replay_buffer_class=buffers.ReplayBuffer, reward_fn=zero_reward_fn, - entropy_as_reward_samples=0, + entropy_as_reward_samples=entropy_samples, ), buffer_size=buffer_size, ) From 0c280797a117e32bc8fce3a250a5e070ddcb05a2 Mon Sep 17 00:00:00 2001 From: Dan Pandori Date: Fri, 11 Nov 2022 15:02:53 -0800 Subject: [PATCH 10/55] Update --- tests/policies/test_replay_buffer_wrapper.py | 24 ++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/tests/policies/test_replay_buffer_wrapper.py b/tests/policies/test_replay_buffer_wrapper.py index c8b9e24a0..3cf506c8d 100644 --- a/tests/policies/test_replay_buffer_wrapper.py +++ b/tests/policies/test_replay_buffer_wrapper.py @@ -3,10 +3,12 @@ import os.path as osp from typing import Type +import gym import numpy as np import pytest import stable_baselines3 as sb3 import torch as th +from gym import spaces from stable_baselines3.common import buffers, off_policy_algorithm, policies from stable_baselines3.common.policies import BasePolicy from stable_baselines3.common.save_util import load_from_pkl @@ -164,13 +166,31 @@ def test_entropy_wrapper_class_no_op(tmpdir, rng): assert type(tensor) is th.Tensor +class ActionIsObsEnv(gym.Env): + """Simple environment where the obs is the action.""" + + def __init__(self): + """Initialize environment.""" + super().__init__() + self.action_space = spaces.Discrete(50) + self.observation_space = spaces.Discrete(50) + + def step(self, action): + obs = action + reward = 0 + done = False + info = {} + return obs, reward, done, info + + def reset(self): + return self.action_space.sample() + + def test_entropy_wrapper_class(tmpdir, rng): buffer_size = 15 entropy_samples = 10 
total_timesteps = 20 - # TODO learn w/ entropy for X timesteps on dummy environment where - # next observation is action, as is reward # TODO expect that our behavior is approximately uniformly distributed venv = util.make_vec_env("Pendulum-v1", n_envs=1, rng=rng) From 5ab9d28a1fcebf7847c2972d99e207fde10384f9 Mon Sep 17 00:00:00 2001 From: Dan Pandori Date: Fri, 11 Nov 2022 16:34:22 -0800 Subject: [PATCH 11/55] Commit for diff --- .../policies/replay_buffer_wrapper.py | 28 +++++---- tests/policies/test_replay_buffer_wrapper.py | 60 ++++++++----------- 2 files changed, 44 insertions(+), 44 deletions(-) diff --git a/src/imitation/policies/replay_buffer_wrapper.py b/src/imitation/policies/replay_buffer_wrapper.py index 3e6a2ac8c..77fde0eec 100644 --- a/src/imitation/policies/replay_buffer_wrapper.py +++ b/src/imitation/policies/replay_buffer_wrapper.py @@ -131,6 +131,13 @@ def __init__( k: Use the k'th nearest neighbor's distance when computing state entropy. **kwargs: keyword arguments for ReplayBuffer. """ + # TODO should we limit by number of batches (as this does) + # or number of observations returned? + self.sample_count = 0 + self.entropy_as_reward_samples = entropy_as_reward_samples + self.k = k + # TODO support n_envs > 1 + self.entropy_stats = util.RunningMeanAndVar(shape=(1,)) super().__init__( buffer_size, observation_space, @@ -139,18 +146,14 @@ def __init__( reward_fn=reward_fn, **kwargs, ) - # TODO should we limit by number of batches (as this does) - # or number of observations returned? - self.samples = 0 - self.entropy_as_reward_samples = entropy_as_reward_samples - self.k = k - # TODO support n_envs > 1 - self.entropy_stats = util.RunningMeanAndVar(shape=(1,)) + # TODO this seems to never actually get called? def sample(self, *args, **kwargs): - self.samples += 1 + self.sample_count += 1 samples = super().sample(*args, **kwargs) - if self.samples > self.entropy_as_reward_samples: + print(self.sample_count) + print(self.entropy_as_reward_samples) + if self.sample_count > 500: return samples # TODO we really ought to reset the reward network once we are done w/ # the entropy based pre-training. 
We also have no reason to train @@ -160,7 +163,12 @@ def sample(self, *args, **kwargs): all_obs = self.observations else: all_obs = self.observations[: self.pos] - entropies = util.compute_state_entropy(samples.observations, all_obs, self.k) + entropies = util.compute_state_entropy( + # TODO support multiple environments + samples.observations.unsqueeze(1), + all_obs, + self.k, + ) # Normalize to have mean of 0 and standard deviation of 1 self.entropy_stats.update(entropies) diff --git a/tests/policies/test_replay_buffer_wrapper.py b/tests/policies/test_replay_buffer_wrapper.py index 3cf506c8d..3753078b1 100644 --- a/tests/policies/test_replay_buffer_wrapper.py +++ b/tests/policies/test_replay_buffer_wrapper.py @@ -12,6 +12,7 @@ from stable_baselines3.common import buffers, off_policy_algorithm, policies from stable_baselines3.common.policies import BasePolicy from stable_baselines3.common.save_util import load_from_pkl +from stable_baselines3.common.vec_env import DummyVecEnv from imitation.policies.replay_buffer_wrapper import ( ReplayBufferEntropyRewardWrapper, @@ -123,6 +124,7 @@ def test_wrapper_class(tmpdir, rng): def test_entropy_wrapper_class_no_op(tmpdir, rng): buffer_size = 15 total_timesteps = 20 + entropy_samples = 0 venv = util.make_vec_env("Pendulum-v1", n_envs=1, rng=rng) rl_algo = sb3.SAC( @@ -134,7 +136,7 @@ def test_entropy_wrapper_class_no_op(tmpdir, rng): replay_buffer_kwargs=dict( replay_buffer_class=buffers.ReplayBuffer, reward_fn=zero_reward_fn, - entropy_as_reward_samples=0, + entropy_as_reward_samples=entropy_samples, ), buffer_size=buffer_size, ) @@ -172,8 +174,8 @@ class ActionIsObsEnv(gym.Env): def __init__(self): """Initialize environment.""" super().__init__() - self.action_space = spaces.Discrete(50) - self.observation_space = spaces.Discrete(50) + self.action_space = spaces.Box(np.array([0]), np.array([1])) + self.observation_space = spaces.Box(np.array([0]), np.array([1])) def step(self, action): obs = action @@ -183,17 +185,15 @@ def step(self, action): return obs, reward, done, info def reset(self): - return self.action_space.sample() + return np.array([0]) def test_entropy_wrapper_class(tmpdir, rng): - buffer_size = 15 - entropy_samples = 10 - total_timesteps = 20 - - # TODO expect that our behavior is approximately uniformly distributed + buffer_size = 20 + entropy_samples = 40 + k = 4 - venv = util.make_vec_env("Pendulum-v1", n_envs=1, rng=rng) + venv = DummyVecEnv([ActionIsObsEnv]) rl_algo = sb3.SAC( policy=sb3.sac.policies.SACPolicy, policy_kwargs=dict(), @@ -204,32 +204,24 @@ def test_entropy_wrapper_class(tmpdir, rng): replay_buffer_class=buffers.ReplayBuffer, reward_fn=zero_reward_fn, entropy_as_reward_samples=entropy_samples, + k=k, ), buffer_size=buffer_size, ) - rl_algo.learn(total_timesteps=total_timesteps) - - buffer_path = osp.join(tmpdir, "buffer.pkl") - rl_algo.save_replay_buffer(buffer_path) - replay_buffer_wrapper = load_from_pkl(buffer_path) - replay_buffer = replay_buffer_wrapper.replay_buffer - - # replay_buffer_wrapper.sample(...) 
should return zero-reward transitions - assert buffer_size == replay_buffer_wrapper.size() == replay_buffer.size() - assert (replay_buffer_wrapper.sample(total_timesteps).rewards == 0.0).all() - assert (replay_buffer.sample(total_timesteps).rewards != 0.0).all() # seed=42 - - # replay_buffer_wrapper.pos, replay_buffer_wrapper.full - assert replay_buffer_wrapper.pos == total_timesteps - buffer_size - assert replay_buffer_wrapper.full - - # reset() - replay_buffer_wrapper.reset() - assert 0 == replay_buffer_wrapper.size() == replay_buffer.size() - assert replay_buffer_wrapper.pos == 0 - assert not replay_buffer_wrapper.full + rl_algo.learn(total_timesteps=buffer_size) + initial_entropy = util.compute_state_entropy( + th.Tensor(rl_algo.replay_buffer.observations), + th.Tensor(rl_algo.replay_buffer.observations), + k=k, + ) - # to_torch() - tensor = replay_buffer_wrapper.to_torch(np.ones(42)) - assert type(tensor) is th.Tensor + rl_algo.learn(total_timesteps=entropy_samples - buffer_size) + # Expect that the entropy of our replay buffer is now higher, + # since we trained with that as the reward. + trained_entropy = util.compute_state_entropy( + th.Tensor(rl_algo.replay_buffer.observations), + th.Tensor(rl_algo.replay_buffer.observations), + k=k, + ) + assert trained_entropy.mean() > initial_entropy.mean() From 9410c31f166365e0db849eef2c4d846eb4ee5e2f Mon Sep 17 00:00:00 2001 From: Dan Pandori Date: Fri, 11 Nov 2022 16:41:35 -0800 Subject: [PATCH 12/55] Push final-ish state --- src/imitation/policies/replay_buffer_wrapper.py | 17 ++++++++--------- tests/policies/test_replay_buffer_wrapper.py | 10 +++++----- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/src/imitation/policies/replay_buffer_wrapper.py b/src/imitation/policies/replay_buffer_wrapper.py index 77fde0eec..9da9ac5ea 100644 --- a/src/imitation/policies/replay_buffer_wrapper.py +++ b/src/imitation/policies/replay_buffer_wrapper.py @@ -133,11 +133,6 @@ def __init__( """ # TODO should we limit by number of batches (as this does) # or number of observations returned? - self.sample_count = 0 - self.entropy_as_reward_samples = entropy_as_reward_samples - self.k = k - # TODO support n_envs > 1 - self.entropy_stats = util.RunningMeanAndVar(shape=(1,)) super().__init__( buffer_size, observation_space, @@ -146,14 +141,18 @@ def __init__( reward_fn=reward_fn, **kwargs, ) + self.sample_count = 0 + self.k = k + # TODO support n_envs > 1 + self.entropy_stats = util.RunningMeanAndVar(shape=(1,)) + self.entropy_as_reward_samples = entropy_as_reward_samples - # TODO this seems to never actually get called? def sample(self, *args, **kwargs): self.sample_count += 1 samples = super().sample(*args, **kwargs) - print(self.sample_count) - print(self.entropy_as_reward_samples) - if self.sample_count > 500: + # For some reason self.entropy_as_reward_samples seems to get cleared, + # and I have no idea why. + if self.sample_count > self.entropy_as_reward_samples: return samples # TODO we really ought to reset the reward network once we are done w/ # the entropy based pre-training. 
We also have no reason to train diff --git a/tests/policies/test_replay_buffer_wrapper.py b/tests/policies/test_replay_buffer_wrapper.py index 3753078b1..5d06139aa 100644 --- a/tests/policies/test_replay_buffer_wrapper.py +++ b/tests/policies/test_replay_buffer_wrapper.py @@ -190,7 +190,7 @@ def reset(self): def test_entropy_wrapper_class(tmpdir, rng): buffer_size = 20 - entropy_samples = 40 + entropy_samples = 500 k = 4 venv = DummyVecEnv([ActionIsObsEnv]) @@ -211,8 +211,8 @@ def test_entropy_wrapper_class(tmpdir, rng): rl_algo.learn(total_timesteps=buffer_size) initial_entropy = util.compute_state_entropy( - th.Tensor(rl_algo.replay_buffer.observations), - th.Tensor(rl_algo.replay_buffer.observations), + th.Tensor(rl_algo.replay_buffer.replay_buffer.observations), + th.Tensor(rl_algo.replay_buffer.replay_buffer.observations), k=k, ) @@ -220,8 +220,8 @@ def test_entropy_wrapper_class(tmpdir, rng): # Expect that the entropy of our replay buffer is now higher, # since we trained with that as the reward. trained_entropy = util.compute_state_entropy( - th.Tensor(rl_algo.replay_buffer.observations), - th.Tensor(rl_algo.replay_buffer.observations), + th.Tensor(rl_algo.replay_buffer.replay_buffer.observations), + th.Tensor(rl_algo.replay_buffer.replay_buffer.observations), k=k, ) assert trained_entropy.mean() > initial_entropy.mean() From fdcdf0d898a3a01afc8a0a2e2110115a4825ba1e Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Tue, 29 Nov 2022 16:03:23 +0100 Subject: [PATCH 13/55] #625 refactor RunningMeanAndVar --- .../policies/replay_buffer_wrapper.py | 2 +- src/imitation/util/util.py | 33 ++++++++----------- tests/util/test_util.py | 4 +-- 3 files changed, 17 insertions(+), 22 deletions(-) diff --git a/src/imitation/policies/replay_buffer_wrapper.py b/src/imitation/policies/replay_buffer_wrapper.py index 9da9ac5ea..680026d1d 100644 --- a/src/imitation/policies/replay_buffer_wrapper.py +++ b/src/imitation/policies/replay_buffer_wrapper.py @@ -171,7 +171,7 @@ def sample(self, *args, **kwargs): # Normalize to have mean of 0 and standard deviation of 1 self.entropy_stats.update(entropies) - entropies -= self.entropy_stats.mean + entropies -= self.entropy_stats.running_mean entropies /= self.entropy_stats.std entropies_th = ( diff --git a/src/imitation/util/util.py b/src/imitation/util/util.py index 33b2179c7..3a7ead70e 100644 --- a/src/imitation/util/util.py +++ b/src/imitation/util/util.py @@ -370,30 +370,25 @@ def __init__( device: Optional[str] = None, ) -> None: """Initialize blank mean, variance, count.""" - self.mean = th.zeros(shape, device=device) + self.running_mean = th.zeros(shape, device=device) self.M2 = th.zeros(shape, device=device) self.count = 0 - def update(self, x: th.Tensor) -> None: + def update(self, batch: th.Tensor) -> None: """Update the mean and variance with a batch `x`.""" with th.no_grad(): - batch_mean = th.mean(x, dim=0) - batch_var = th.var(x, dim=0, unbiased=False) - batch_count = x.shape[0] - batch_M2 = batch_var * batch_count - if self.count == 0: - self.count = batch_count - self.mean = batch_mean - self.M2 = batch_M2 - return - - delta = batch_mean - self.mean - total_count = self.count + batch_count - self.mean += delta * batch_count / total_count - - self.M2 += batch_M2 + delta * delta * self.count * batch_count / total_count - - self.count = total_count + batch_mean = th.mean(batch, dim=0) + batch_var = th.var(batch, dim=0, unbiased=False) + batch_count = batch.shape[0] + + delta = batch_mean - self.running_mean + tot_count = self.count + batch_count + 
self.running_mean += delta * batch_count / tot_count + + self.M2 += batch_var * batch_count + self.M2 += th.square(delta) * self.count * batch_count / tot_count + + self.count += batch_count @property def var(self) -> th.Tensor: diff --git a/tests/util/test_util.py b/tests/util/test_util.py index 6f90dd832..6ce2efcc2 100644 --- a/tests/util/test_util.py +++ b/tests/util/test_util.py @@ -127,7 +127,7 @@ def test_RunningMeanAndVar(): first_half = data[:10] running_stats.update(first_half) np.testing.assert_allclose( - running_stats.mean, + running_stats.running_mean, first_half.mean(dim=0), atol=1e-5, rtol=1e-4, @@ -141,7 +141,7 @@ def test_RunningMeanAndVar(): running_stats.update(data[10:]) np.testing.assert_allclose( - running_stats.mean, + running_stats.running_mean, data.mean(dim=0), atol=1e-5, rtol=1e-4, From 0cd12557fef89008763442433cd478044bd3b9b7 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Tue, 29 Nov 2022 17:35:12 +0100 Subject: [PATCH 14/55] #625 use RunningNorm instead of RunningMeanAndVar --- .../policies/replay_buffer_wrapper.py | 9 ++--- src/imitation/util/networks.py | 10 ++--- src/imitation/util/util.py | 40 ------------------- tests/util/test_util.py | 34 ---------------- 4 files changed, 9 insertions(+), 84 deletions(-) diff --git a/src/imitation/policies/replay_buffer_wrapper.py b/src/imitation/policies/replay_buffer_wrapper.py index 680026d1d..539f2e512 100644 --- a/src/imitation/policies/replay_buffer_wrapper.py +++ b/src/imitation/policies/replay_buffer_wrapper.py @@ -9,6 +9,7 @@ from imitation.rewards.reward_function import RewardFn from imitation.util import util +from imitation.util.networks import RunningNorm def _samples_to_reward_fn_input( @@ -144,7 +145,7 @@ def __init__( self.sample_count = 0 self.k = k # TODO support n_envs > 1 - self.entropy_stats = util.RunningMeanAndVar(shape=(1,)) + self.entropy_stats = RunningNorm(1) self.entropy_as_reward_samples = entropy_as_reward_samples def sample(self, *args, **kwargs): @@ -169,10 +170,8 @@ def sample(self, *args, **kwargs): self.k, ) - # Normalize to have mean of 0 and standard deviation of 1 - self.entropy_stats.update(entropies) - entropies -= self.entropy_stats.running_mean - entropies /= self.entropy_stats.std + # Normalize to have mean of 0 and standard deviation of 1 according to running stats + entropies = self.entropy_stats.forward(entropies) entropies_th = ( util.safe_to_tensor(entropies) diff --git a/src/imitation/util/networks.py b/src/imitation/util/networks.py index c27aea2cd..048273656 100644 --- a/src/imitation/util/networks.py +++ b/src/imitation/util/networks.py @@ -126,12 +126,12 @@ def update_stats(self, batch: th.Tensor) -> None: tot_count = self.count + batch_count self.running_mean += delta * batch_count / tot_count - self.running_var *= self.count - self.running_var += batch_var * batch_count - self.running_var += th.square(delta) * self.count * batch_count / tot_count - self.running_var /= tot_count + m_a = self.running_var * self.count + m_b = batch_var * batch_count + M2 = m_a + m_b + th.square(delta) * self.count * batch_count / tot_count + self.running_var = M2 / tot_count - self.count += batch_count + self.count = tot_count class EMANorm(BaseNorm): diff --git a/src/imitation/util/util.py b/src/imitation/util/util.py index 3a7ead70e..df8eb6a6a 100644 --- a/src/imitation/util/util.py +++ b/src/imitation/util/util.py @@ -361,46 +361,6 @@ def get_first_iter_element(iterable: Iterable[T]) -> Tuple[T, Iterable[T]]: return first_element, return_iterable -class RunningMeanAndVar: - 
"""Stores a running mean and variance using Wellford's algorithm.""" - - def __init__( - self, - shape: Tuple[int, ...] = (), - device: Optional[str] = None, - ) -> None: - """Initialize blank mean, variance, count.""" - self.running_mean = th.zeros(shape, device=device) - self.M2 = th.zeros(shape, device=device) - self.count = 0 - - def update(self, batch: th.Tensor) -> None: - """Update the mean and variance with a batch `x`.""" - with th.no_grad(): - batch_mean = th.mean(batch, dim=0) - batch_var = th.var(batch, dim=0, unbiased=False) - batch_count = batch.shape[0] - - delta = batch_mean - self.running_mean - tot_count = self.count + batch_count - self.running_mean += delta * batch_count / tot_count - - self.M2 += batch_var * batch_count - self.M2 += th.square(delta) * self.count * batch_count / tot_count - - self.count += batch_count - - @property - def var(self) -> th.Tensor: - """Returns the unbiased estimate of the variances.""" - return self.M2 / (self.count - 1) - - @property - def std(self) -> th.Tensor: - """Returns the unbiased estimate of the standard deviations.""" - return np.sqrt(self.var) - - def compute_state_entropy( obs: th.Tensor, all_obs: th.Tensor, diff --git a/tests/util/test_util.py b/tests/util/test_util.py index 6ce2efcc2..28678dc8b 100644 --- a/tests/util/test_util.py +++ b/tests/util/test_util.py @@ -120,40 +120,6 @@ def test_tensor_iter_norm(): util.tensor_iter_norm(tensor_list, ord=0.0) -def test_RunningMeanAndVar(): - running_stats = util.RunningMeanAndVar(shape=(3, 4)) - data = th.normal(mean=10 * th.ones(size=(20, 3, 4), dtype=th.double)) - - first_half = data[:10] - running_stats.update(first_half) - np.testing.assert_allclose( - running_stats.running_mean, - first_half.mean(dim=0), - atol=1e-5, - rtol=1e-4, - ) - np.testing.assert_allclose( - running_stats.var, - first_half.var(dim=0), - atol=1e-5, - rtol=1e-4, - ) - - running_stats.update(data[10:]) - np.testing.assert_allclose( - running_stats.running_mean, - data.mean(dim=0), - atol=1e-5, - rtol=1e-4, - ) - np.testing.assert_allclose( - running_stats.var, - data.var(dim=0), - atol=1e-5, - rtol=1e-4, - ) - - def test_compute_state_entropy_1d(): all_obs = th.arange(10, dtype=th.float).unsqueeze(1) obs = all_obs[4:6] From d88ba4441557f1ab7d198f6b1c40c170831ea772 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Tue, 29 Nov 2022 22:52:58 +0100 Subject: [PATCH 15/55] #625 make copy of train_preference_comparisons.py for pebble --- .../train_preference_comparisons_pebble.py | 128 ++++++++ .../train_preference_comparisons_pebble.py | 292 ++++++++++++++++++ 2 files changed, 420 insertions(+) create mode 100644 src/imitation/scripts/config/train_preference_comparisons_pebble.py create mode 100644 src/imitation/scripts/train_preference_comparisons_pebble.py diff --git a/src/imitation/scripts/config/train_preference_comparisons_pebble.py b/src/imitation/scripts/config/train_preference_comparisons_pebble.py new file mode 100644 index 000000000..d6887e066 --- /dev/null +++ b/src/imitation/scripts/config/train_preference_comparisons_pebble.py @@ -0,0 +1,128 @@ +"""Configuration for imitation.scripts.train_preference_comparisons_pebble.""" + +import sacred + +from imitation.algorithms import preference_comparisons +from imitation.scripts.common import common, reward, rl, train + +train_preference_comparisons_pebble_ex = sacred.Experiment( + "train_preference_comparisons_pebble", + ingredients=[ + common.common_ingredient, + reward.reward_ingredient, + rl.rl_ingredient, + train.train_ingredient, + ], +) + + 
+MUJOCO_SHARED_LOCALS = dict(rl=dict(rl_kwargs=dict(ent_coef=0.1))) +ANT_SHARED_LOCALS = dict( + total_timesteps=int(3e7), + rl=dict(batch_size=16384), +) + + +@train_preference_comparisons_pebble_ex.config +def train_defaults(): + fragment_length = 100 # timesteps per fragment used for comparisons + total_timesteps = int(1e6) # total number of environment timesteps + total_comparisons = 5000 # total number of comparisons to elicit + num_iterations = 5 # Arbitrary, should be tuned for the task + comparison_queue_size = None + # factor by which to oversample transitions before creating fragments + transition_oversampling = 1 + # fraction of total_comparisons that will be sampled right at the beginning + initial_comparison_frac = 0.1 + # fraction of sampled trajectories that will include some random actions + exploration_frac = 0.0 + preference_model_kwargs = {} + reward_trainer_kwargs = { + "epochs": 3, + } + save_preferences = False # save preference dataset at the end? + agent_path = None # path to a (partially) trained agent to load at the beginning + # type of PreferenceGatherer to use + gatherer_cls = preference_comparisons.SyntheticGatherer + # arguments passed on to the PreferenceGatherer specified by gatherer_cls + gatherer_kwargs = {} + active_selection = False + active_selection_oversampling = 2 + uncertainty_on = "logit" + fragmenter_kwargs = { + "warning_threshold": 0, + } + # path to a pickled sequence of trajectories used instead of training an agent + trajectory_path = None + trajectory_generator_kwargs = {} # kwargs to pass to trajectory generator + allow_variable_horizon = False + + checkpoint_interval = 0 # Num epochs between saving (<0 disables, =0 final only) + query_schedule = "hyperbolic" + + +@train_preference_comparisons_pebble_ex.named_config +def cartpole(): + common = dict(env_name="CartPole-v1") + allow_variable_horizon = True + + +@train_preference_comparisons_pebble_ex.named_config +def seals_ant(): + locals().update(**MUJOCO_SHARED_LOCALS) + locals().update(**ANT_SHARED_LOCALS) + common = dict(env_name="seals/Ant-v0") + + +@train_preference_comparisons_pebble_ex.named_config +def half_cheetah(): + locals().update(**MUJOCO_SHARED_LOCALS) + common = dict(env_name="HalfCheetah-v2") + rl = dict(batch_size=16384, rl_kwargs=dict(batch_size=1024)) + + +@train_preference_comparisons_pebble_ex.named_config +def seals_hopper(): + locals().update(**MUJOCO_SHARED_LOCALS) + common = dict(env_name="seals/Hopper-v0") + + +@train_preference_comparisons_pebble_ex.named_config +def seals_humanoid(): + locals().update(**MUJOCO_SHARED_LOCALS) + common = dict(env_name="seals/Humanoid-v0") + total_timesteps = int(4e6) + + +@train_preference_comparisons_pebble_ex.named_config +def seals_cartpole(): + common = dict(env_name="seals/CartPole-v0") + + +@train_preference_comparisons_pebble_ex.named_config +def pendulum(): + common = dict(env_name="Pendulum-v1") + + +@train_preference_comparisons_pebble_ex.named_config +def mountain_car(): + common = dict(env_name="MountainCar-v0") + allow_variable_horizon = True + + +@train_preference_comparisons_pebble_ex.named_config +def seals_mountain_car(): + common = dict(env_name="seals/MountainCar-v0") + + +@train_preference_comparisons_pebble_ex.named_config +def fast(): + # Minimize the amount of computation. Useful for test cases. 
+ total_timesteps = 50 + total_comparisons = 5 + initial_comparison_frac = 0.2 + num_iterations = 1 + fragment_length = 2 + reward_trainer_kwargs = { + "epochs": 1, + } diff --git a/src/imitation/scripts/train_preference_comparisons_pebble.py b/src/imitation/scripts/train_preference_comparisons_pebble.py new file mode 100644 index 000000000..f34eefb9d --- /dev/null +++ b/src/imitation/scripts/train_preference_comparisons_pebble.py @@ -0,0 +1,292 @@ +"""Train a reward model using preference comparisons. + +Can be used as a CLI script, or the `train_preference_comparisons` function +can be called directly. +""" + +import functools +import pathlib +from typing import Any, Mapping, Optional, Type, Union + +import torch as th +from sacred.observers import FileStorageObserver +from stable_baselines3.common import type_aliases + +from imitation.algorithms import preference_comparisons +from imitation.data import types +from imitation.policies import serialize +from imitation.scripts.common import common, reward +from imitation.scripts.common import rl as rl_common +from imitation.scripts.common import train +from imitation.scripts.config.train_preference_comparisons_pebble import ( + train_preference_comparisons_pebble_ex, +) + + +def save_model( + agent_trainer: preference_comparisons.AgentTrainer, + save_path: pathlib.Path, +): + """Save the model as `model.zip`.""" + serialize.save_stable_model( + output_dir=save_path / "policy", + model=agent_trainer.algorithm, + ) + + +def save_checkpoint( + trainer: preference_comparisons.PreferenceComparisons, + save_path: pathlib.Path, + allow_save_policy: Optional[bool], +): + """Save reward model and optionally policy.""" + save_path.mkdir(parents=True, exist_ok=True) + th.save(trainer.model, save_path / "reward_net.pt") + if allow_save_policy: + # Note: We should only save the model as model.zip if `trajectory_generator` + # contains one. Currently we are slightly over-conservative, by requiring + # that an AgentTrainer be used if we're saving the policy. + assert isinstance( + trainer.trajectory_generator, + preference_comparisons.AgentTrainer, + ) + save_model(trainer.trajectory_generator, save_path) + else: + trainer.logger.warn( + "trainer.trajectory_generator doesn't contain a policy to save.", + ) + + +@train_preference_comparisons_pebble_ex.main +def train_preference_comparisons( + total_timesteps: int, + total_comparisons: int, + num_iterations: int, + comparison_queue_size: Optional[int], + fragment_length: int, + transition_oversampling: float, + initial_comparison_frac: float, + exploration_frac: float, + trajectory_path: Optional[str], + trajectory_generator_kwargs: Mapping[str, Any], + save_preferences: bool, + agent_path: Optional[str], + preference_model_kwargs: Mapping[str, Any], + reward_trainer_kwargs: Mapping[str, Any], + gatherer_cls: Type[preference_comparisons.PreferenceGatherer], + gatherer_kwargs: Mapping[str, Any], + active_selection: bool, + active_selection_oversampling: int, + uncertainty_on: str, + fragmenter_kwargs: Mapping[str, Any], + allow_variable_horizon: bool, + checkpoint_interval: int, + query_schedule: Union[str, type_aliases.Schedule], +) -> Mapping[str, Any]: + """Train a reward model using preference comparisons. + + Args: + total_timesteps: number of environment interaction steps + total_comparisons: number of preferences to gather in total + num_iterations: number of times to train the agent against the reward model + and then train the reward model against newly gathered preferences. 
+ comparison_queue_size: the maximum number of comparisons to keep in the + queue for training the reward model. If None, the queue will grow + without bound as new comparisons are added. + fragment_length: number of timesteps per fragment that is used to elicit + preferences + transition_oversampling: factor by which to oversample transitions before + creating fragments. Since fragments are sampled with replacement, + this is usually chosen > 1 to avoid having the same transition + in too many fragments. + initial_comparison_frac: fraction of total_comparisons that will be + sampled before the rest of training begins (using the randomly initialized + agent). This can be used to pretrain the reward model before the agent + is trained on the learned reward. + exploration_frac: fraction of trajectory samples that will be created using + partially random actions, rather than the current policy. Might be helpful + if the learned policy explores too little and gets stuck with a wrong + reward. + trajectory_path: either None, in which case an agent will be trained + and used to sample trajectories on the fly, or a path to a pickled + sequence of TrajectoryWithRew to be trained on. + trajectory_generator_kwargs: kwargs to pass to the trajectory generator. + save_preferences: if True, store the final dataset of preferences to disk. + agent_path: if given, initialize the agent using this stored policy + rather than randomly. + preference_model_kwargs: passed to PreferenceModel + reward_trainer_kwargs: passed to BasicRewardTrainer or EnsembleRewardTrainer + gatherer_cls: type of PreferenceGatherer to use (defaults to SyntheticGatherer) + gatherer_kwargs: passed to the PreferenceGatherer specified by gatherer_cls + active_selection: use active selection fragmenter instead of random fragmenter + active_selection_oversampling: factor by which to oversample random fragments + from the base fragmenter of active selection. + this is usually chosen > 1 to allow the active selection algorithm to pick + fragment pairs with highest uncertainty. = 1 implies no active selection. + uncertainty_on: passed to ActiveSelectionFragmenter + fragmenter_kwargs: passed to RandomFragmenter + allow_variable_horizon: If False (default), algorithm will raise an + exception if it detects trajectories of different length during + training. If True, overrides this safety check. WARNING: variable + horizon episodes leak information about the reward via termination + condition, and can seriously confound evaluation. Read + https://imitation.readthedocs.io/en/latest/guide/variable_horizon.html + before overriding this. + checkpoint_interval: Save the reward model and policy models (if + trajectory_generator contains a policy) every `checkpoint_interval` + iterations and after training is complete. If 0, then only save weights + after training is complete. If <0, then don't save weights at all. + query_schedule: one of ("constant", "hyperbolic", "inverse_quadratic"). + A function indicating how the total number of preference queries should + be allocated to each iteration. "hyperbolic" and "inverse_quadratic" + apportion fewer queries to later iterations when the policy is assumed + to be better and more stable. + + Returns: + Rollout statistics from trained policy. + + Raises: + ValueError: Inconsistency between config and deserialized policy normalization. 
+ """ + custom_logger, log_dir = common.setup_logging() + rng = common.make_rng() + + with common.make_venv() as venv: + reward_net = reward.make_reward_net(venv) + relabel_reward_fn = functools.partial( + reward_net.predict_processed, + update_stats=False, + ) + if agent_path is None: + agent = rl_common.make_rl_algo(venv, relabel_reward_fn=relabel_reward_fn) + else: + agent = rl_common.load_rl_algo_from_path( + agent_path=agent_path, + venv=venv, + relabel_reward_fn=relabel_reward_fn, + ) + + if trajectory_path is None: + # Setting the logger here is not necessary (PreferenceComparisons takes care + # of it automatically) but it avoids creating unnecessary loggers. + agent_trainer = preference_comparisons.AgentTrainer( + algorithm=agent, + reward_fn=reward_net, + venv=venv, + exploration_frac=exploration_frac, + rng=rng, + custom_logger=custom_logger, + **trajectory_generator_kwargs, + ) + # Stable Baselines will automatically occupy GPU 0 if it is available. + # Let's use the same device as the SB3 agent for the reward model. + reward_net = reward_net.to(agent_trainer.algorithm.device) + trajectory_generator: preference_comparisons.TrajectoryGenerator = ( + agent_trainer + ) + else: + if exploration_frac > 0: + raise ValueError( + "exploration_frac can't be set when a trajectory dataset is used", + ) + trajectory_generator = preference_comparisons.TrajectoryDataset( + trajectories=types.load_with_rewards(trajectory_path), + rng=rng, + custom_logger=custom_logger, + **trajectory_generator_kwargs, + ) + + fragmenter: preference_comparisons.Fragmenter = ( + preference_comparisons.RandomFragmenter( + **fragmenter_kwargs, + rng=rng, + custom_logger=custom_logger, + ) + ) + preference_model = preference_comparisons.PreferenceModel( + **preference_model_kwargs, + model=reward_net, + ) + if active_selection: + fragmenter = preference_comparisons.ActiveSelectionFragmenter( + preference_model=preference_model, + base_fragmenter=fragmenter, + fragment_sample_factor=active_selection_oversampling, + uncertainty_on=uncertainty_on, + custom_logger=custom_logger, + ) + gatherer = gatherer_cls( + **gatherer_kwargs, + rng=rng, + custom_logger=custom_logger, + ) + + loss = preference_comparisons.CrossEntropyRewardLoss() + + reward_trainer = preference_comparisons._make_reward_trainer( + preference_model, + loss, + rng, + reward_trainer_kwargs, + ) + + main_trainer = preference_comparisons.PreferenceComparisons( + trajectory_generator, + reward_net, + num_iterations=num_iterations, + fragmenter=fragmenter, + preference_gatherer=gatherer, + reward_trainer=reward_trainer, + comparison_queue_size=comparison_queue_size, + fragment_length=fragment_length, + transition_oversampling=transition_oversampling, + initial_comparison_frac=initial_comparison_frac, + custom_logger=custom_logger, + allow_variable_horizon=allow_variable_horizon, + query_schedule=query_schedule, + ) + + def save_callback(iteration_num): + if checkpoint_interval > 0 and iteration_num % checkpoint_interval == 0: + save_checkpoint( + trainer=main_trainer, + save_path=log_dir / "checkpoints" / f"{iteration_num:04d}", + allow_save_policy=bool(trajectory_path is None), + ) + + results = main_trainer.train( + total_timesteps, + total_comparisons, + callback=save_callback, + ) + + # Storing and evaluating policy only useful if we generated trajectory data + if bool(trajectory_path is None): + results = dict(results) + results["rollout"] = train.eval_policy(agent, venv) + + if save_preferences: + main_trainer.dataset.save(log_dir / 
"preferences.pkl") + + # Save final artifacts. + if checkpoint_interval >= 0: + save_checkpoint( + trainer=main_trainer, + save_path=log_dir / "checkpoints" / "final", + allow_save_policy=bool(trajectory_path is None), + ) + + return results + + +def main_console(): + observer_path = ( + pathlib.Path.cwd() / "output" / "sacred" / "train_preference_comparisons_pebble" + ) + observer = FileStorageObserver(observer_path) + train_preference_comparisons_pebble_ex.observers.append(observer) + train_preference_comparisons_pebble_ex.run_commandline() + + +if __name__ == "__main__": # pragma: no cover + main_console() From 2d836deebebf79731a0ecfa13acd1a154730f302 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Wed, 30 Nov 2022 00:40:00 +0100 Subject: [PATCH 16/55] #625 use an OffPolicy for pebble --- .../train_preference_comparisons_pebble.py | 34 ++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/src/imitation/scripts/config/train_preference_comparisons_pebble.py b/src/imitation/scripts/config/train_preference_comparisons_pebble.py index d6887e066..3dde185b5 100644 --- a/src/imitation/scripts/config/train_preference_comparisons_pebble.py +++ b/src/imitation/scripts/config/train_preference_comparisons_pebble.py @@ -1,8 +1,12 @@ """Configuration for imitation.scripts.train_preference_comparisons_pebble.""" +import warnings + import sacred +import stable_baselines3 as sb3 from imitation.algorithms import preference_comparisons +from imitation.policies import base from imitation.scripts.common import common, reward, rl, train train_preference_comparisons_pebble_ex = sacred.Experiment( @@ -15,7 +19,6 @@ ], ) - MUJOCO_SHARED_LOCALS = dict(rl=dict(rl_kwargs=dict(ent_coef=0.1))) ANT_SHARED_LOCALS = dict( total_timesteps=int(3e7), @@ -23,6 +26,35 @@ ) +@rl.rl_ingredient.config +def rl_sac(): + # For recommended SAC hyperparams in each environment, see: + # https://github.com/DLR-RM/rl-baselines3-zoo/blob/master/hyperparams/sac.yml + rl_cls = sb3.SAC + warnings.warn( + "SAC currently only supports continuous action spaces. 
" + "Consider adding a discrete version as mentioned here: " + "https://github.com/DLR-RM/stable-baselines3/issues/505", + category=RuntimeWarning, + ) + # Default HPs are as follows: + batch_size = 256 # batch size for RL algorithm + rl_kwargs = dict(batch_size=None) # make sure to set batch size to None + locals() # quieten flake8 + + +@train.train_ingredient.config +def train_sac(): + policy_cls = base.SAC1024Policy # noqa: F841 + locals() # quieten flake8 + + +@common.common_ingredient.config +def mountain_car(): + env_name = "MountainCarContinuous-v0" + locals() # quieten flake8 + + @train_preference_comparisons_pebble_ex.config def train_defaults(): fragment_length = 100 # timesteps per fragment used for comparisons From ec5f67e986e105816ff6f4f77145dd45e7bb5be8 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Wed, 30 Nov 2022 14:23:02 +0100 Subject: [PATCH 17/55] #625 fix assumptions about shapes in ReplayBufferEntropyRewardWrapper --- .../policies/replay_buffer_wrapper.py | 24 ++++++++----------- .../train_preference_comparisons_pebble.py | 2 +- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/src/imitation/policies/replay_buffer_wrapper.py b/src/imitation/policies/replay_buffer_wrapper.py index 539f2e512..5a55b80bf 100644 --- a/src/imitation/policies/replay_buffer_wrapper.py +++ b/src/imitation/policies/replay_buffer_wrapper.py @@ -163,26 +163,22 @@ def sample(self, *args, **kwargs): all_obs = self.observations else: all_obs = self.observations[: self.pos] + # super().sample() flattens the venv dimension, let's do it too + all_obs = all_obs.reshape((-1, *self.obs_shape)) entropies = util.compute_state_entropy( - # TODO support multiple environments - samples.observations.unsqueeze(1), - all_obs, + samples.observations, + all_obs.reshape((-1, *self.obs_shape)), self.k, ) # Normalize to have mean of 0 and standard deviation of 1 according to running stats entropies = self.entropy_stats.forward(entropies) - - entropies_th = ( - util.safe_to_tensor(entropies) - .reshape(samples.rewards.shape) - .to(samples.rewards.device) - ) + assert entropies.shape == samples.rewards.shape return ReplayBufferSamples( - samples.observations, - samples.actions, - samples.next_observations, - samples.dones, - entropies_th, + observations=samples.observations, + actions=samples.actions, + next_observations=samples.next_observations, + dones=samples.dones, + rewards=entropies, ) diff --git a/src/imitation/scripts/config/train_preference_comparisons_pebble.py b/src/imitation/scripts/config/train_preference_comparisons_pebble.py index 3dde185b5..e65f38e37 100644 --- a/src/imitation/scripts/config/train_preference_comparisons_pebble.py +++ b/src/imitation/scripts/config/train_preference_comparisons_pebble.py @@ -50,7 +50,7 @@ def train_sac(): @common.common_ingredient.config -def mountain_car(): +def common_mountain_car_continuous(): env_name = "MountainCarContinuous-v0" locals() # quieten flake8 From da228bd8d37b093d79a7bf1dede0ebc92bf89daa Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 00:00:01 +0100 Subject: [PATCH 18/55] #625 entropy reward as a function --- .../algorithms/pebble/entropy_reward.py | 44 ++++++++++++ .../policies/replay_buffer_wrapper.py | 32 ++++++++- src/imitation/util/networks.py | 3 + src/imitation/util/util.py | 19 +++-- .../algorithms/pebble/test_entropy_reward.py | 70 +++++++++++++++++++ tests/policies/test_replay_buffer_wrapper.py | 39 +++++++++++ tests/util/test_util.py | 12 ++++ 7 files changed, 211 insertions(+), 8 deletions(-) create mode 
100644 src/imitation/algorithms/pebble/entropy_reward.py create mode 100644 tests/algorithms/pebble/test_entropy_reward.py diff --git a/src/imitation/algorithms/pebble/entropy_reward.py b/src/imitation/algorithms/pebble/entropy_reward.py new file mode 100644 index 000000000..724fbf314 --- /dev/null +++ b/src/imitation/algorithms/pebble/entropy_reward.py @@ -0,0 +1,44 @@ +import numpy as np +import torch as th +from gym.vector.utils import spaces +from stable_baselines3.common.preprocessing import get_obs_shape + +from imitation.policies.replay_buffer_wrapper import ReplayBufferView +from imitation.rewards.reward_function import RewardFn +from imitation.util import util +from imitation.util.networks import RunningNorm + + +class StateEntropyReward(RewardFn): + def __init__(self, nearest_neighbor_k: int, observation_space: spaces.Space): + self.nearest_neighbor_k = nearest_neighbor_k + # TODO support n_envs > 1 + self.entropy_stats = RunningNorm(1) + self.obs_shape = get_obs_shape(observation_space) + self.replay_buffer_view = ReplayBufferView( + np.empty(0, dtype=observation_space.dtype), lambda: slice(0) + ) + + def set_buffer_view(self, replay_buffer_view: ReplayBufferView): + self.replay_buffer_view = replay_buffer_view + + def __call__( + self, + state: np.ndarray, + action: np.ndarray, + next_state: np.ndarray, + done: np.ndarray, + ) -> np.ndarray: + # TODO: should this work with torch instead of numpy internally? + # (The RewardFn protocol requires numpy) + + all_observations = self.replay_buffer_view.observations + # ReplayBuffer sampling flattens the venv dimension, let's adapt to that + all_observations = all_observations.reshape((-1, *self.obs_shape)) + entropies = util.compute_state_entropy( + state, + all_observations, + self.nearest_neighbor_k, + ) + normalized_entropies = self.entropy_stats.forward(th.as_tensor(entropies)) + return normalized_entropies.numpy() diff --git a/src/imitation/policies/replay_buffer_wrapper.py b/src/imitation/policies/replay_buffer_wrapper.py index 5a55b80bf..477fb97b2 100644 --- a/src/imitation/policies/replay_buffer_wrapper.py +++ b/src/imitation/policies/replay_buffer_wrapper.py @@ -24,6 +24,29 @@ def _samples_to_reward_fn_input( ) +class ReplayBufferView: + """A read-only view over a valid records in a ReplayBuffer. 
+ + Args: + observations_buffer: Array buffer holding observations + buffer_slice_provider: Function returning slice of buffer + with valid observations + """ + + def __init__( + self, + observations_buffer: np.ndarray, + buffer_slice_provider: Callable[[], slice], + ): + self._observations_buffer = observations_buffer.view() + self._observations_buffer.flags.writeable = False + self._buffer_slice_provider = buffer_slice_provider + + @property + def observations(self): + return self._observations_buffer[self._buffer_slice_provider()] + + class ReplayBufferRewardWrapper(ReplayBuffer): """Relabel the rewards in transitions sampled from a ReplayBuffer.""" @@ -79,6 +102,13 @@ def full(self) -> bool: def full(self, full: bool): self.replay_buffer.full = full + @property + def buffer_view(self) -> ReplayBufferView: + def valid_buffer_slice(): + return slice(None) if self.full else slice(self.pos) + + return ReplayBufferView(self.replay_buffer.observations, valid_buffer_slice) + def sample(self, *args, **kwargs): samples = self.replay_buffer.sample(*args, **kwargs) rewards = self.reward_fn(**_samples_to_reward_fn_input(samples)) @@ -167,7 +197,7 @@ def sample(self, *args, **kwargs): all_obs = all_obs.reshape((-1, *self.obs_shape)) entropies = util.compute_state_entropy( samples.observations, - all_obs.reshape((-1, *self.obs_shape)), + all_obs, self.k, ) diff --git a/src/imitation/util/networks.py b/src/imitation/util/networks.py index 048273656..e9564ca44 100644 --- a/src/imitation/util/networks.py +++ b/src/imitation/util/networks.py @@ -86,6 +86,9 @@ def forward(self, x: th.Tensor) -> th.Tensor: with th.no_grad(): self.update_stats(x) + return self.normalize(x) + + def normalize(self, x: th.Tensor) -> th.Tensor: # Note: this is different from the behavior in stable-baselines, see # https://github.com/HumanCompatibleAI/imitation/issues/442 return (x - self.running_mean) / th.sqrt(self.running_var + self.eps) diff --git a/src/imitation/util/util.py b/src/imitation/util/util.py index df8eb6a6a..d88f775cd 100644 --- a/src/imitation/util/util.py +++ b/src/imitation/util/util.py @@ -362,10 +362,10 @@ def get_first_iter_element(iterable: Iterable[T]) -> Tuple[T, Iterable[T]]: def compute_state_entropy( - obs: th.Tensor, - all_obs: th.Tensor, + obs: np.ndarray, + all_obs: np.ndarray, k: int, -) -> th.Tensor: +) -> np.ndarray: """Compute the state entropy given by KNN distance. Args: @@ -379,14 +379,19 @@ def compute_state_entropy( assert obs.shape[1:] == all_obs.shape[1:] with th.no_grad(): non_batch_dimensions = tuple(range(2, len(obs.shape) + 1)) - distances_tensor = th.linalg.vector_norm( + distances_tensor = np.linalg.norm( obs[:, None] - all_obs[None, :], - dim=non_batch_dimensions, + axis=non_batch_dimensions, ord=2, ) # Note that we take the k+1'th value because the closest neighbor to # a point is itself, which we want to skip. 
- knn_dists = th.kthvalue(distances_tensor, k=k + 1, dim=1).values + knn_dists = kth_value(distances_tensor, k+1) state_entropy = knn_dists - return state_entropy.unsqueeze(1) + return np.expand_dims(state_entropy, axis=1) + + +def kth_value(x: np.ndarray, k: int): + assert k > 0 + return np.partition(x, k - 1, axis=-1)[..., k - 1] diff --git a/tests/algorithms/pebble/test_entropy_reward.py b/tests/algorithms/pebble/test_entropy_reward.py new file mode 100644 index 000000000..777a9b9d6 --- /dev/null +++ b/tests/algorithms/pebble/test_entropy_reward.py @@ -0,0 +1,70 @@ +from unittest.mock import patch + +import numpy as np +import torch as th +from gym.spaces import Discrete +from stable_baselines3.common.preprocessing import get_obs_shape + +from imitation.algorithms.pebble.entropy_reward import StateEntropyReward +from imitation.policies.replay_buffer_wrapper import ReplayBufferView +from imitation.util import util + +SPACE = Discrete(4) +PLACEHOLDER = np.empty(get_obs_shape(SPACE)) + +BUFFER_SIZE = 20 +K = 4 +BATCH_SIZE = 8 +VENVS = 2 + + +def test_state_entropy_reward_returns_entropy(rng): + obs_shape = get_obs_shape(SPACE) + all_observations = rng.random((BUFFER_SIZE, VENVS, *obs_shape)) + + reward_fn = StateEntropyReward(K, SPACE) + reward_fn.set_buffer_view(ReplayBufferView(all_observations, lambda: slice(None))) + + # Act + observations = rng.random((BATCH_SIZE, *obs_shape)) + reward = reward_fn(observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) + + # Assert + expected = util.compute_state_entropy( + observations, all_observations.reshape(-1, *obs_shape), K + ) + expected_normalized = reward_fn.entropy_stats.normalize(th.as_tensor(expected)).numpy() + np.testing.assert_allclose(reward, expected_normalized) + + +def test_state_entropy_reward_returns_normalized_values(): + with patch("imitation.util.util.compute_state_entropy") as m: + # mock entropy computation so that we can test only stats collection in this test + m.side_effect = lambda obs, all_obs, k: obs + + reward_fn = StateEntropyReward(K, SPACE) + all_observations = np.empty((BUFFER_SIZE, VENVS, *get_obs_shape(SPACE))) + reward_fn.set_buffer_view( + ReplayBufferView(all_observations, lambda: slice(None)) + ) + + dim = 8 + shift = 3 + scale = 2 + + # Act + for _ in range(1000): + state = th.randn(dim) * scale + shift + reward_fn(state, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) + + normalized_reward = reward_fn( + np.zeros(dim), PLACEHOLDER, PLACEHOLDER, PLACEHOLDER + ) + + # Assert + np.testing.assert_allclose( + normalized_reward, + np.repeat(-shift / scale, dim), + rtol=0.05, + atol=0.05, + ) diff --git a/tests/policies/test_replay_buffer_wrapper.py b/tests/policies/test_replay_buffer_wrapper.py index 5d06139aa..668208b58 100644 --- a/tests/policies/test_replay_buffer_wrapper.py +++ b/tests/policies/test_replay_buffer_wrapper.py @@ -2,6 +2,7 @@ import os.path as osp from typing import Type +from unittest.mock import Mock import gym import numpy as np @@ -10,7 +11,9 @@ import torch as th from gym import spaces from stable_baselines3.common import buffers, off_policy_algorithm, policies +from stable_baselines3.common.buffers import ReplayBuffer from stable_baselines3.common.policies import BasePolicy +from stable_baselines3.common.preprocessing import get_obs_shape, get_action_dim from stable_baselines3.common.save_util import load_from_pkl from stable_baselines3.common.vec_env import DummyVecEnv @@ -225,3 +228,39 @@ def test_entropy_wrapper_class(tmpdir, rng): k=k, ) assert trained_entropy.mean() > initial_entropy.mean() + + 
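A side note on the numpy port of compute_state_entropy above: kth_value relies on np.partition, and the state "entropy" used as the intrinsic reward is simply the distance to the k-th nearest neighbour among the buffered observations, taking the (k+1)-th smallest distance because a buffered point's nearest neighbour is itself. A rough standalone sketch of that idea for flat (vector) observations — the helper name and shapes here are illustrative, not taken from the patch:

import numpy as np

def knn_state_entropy(obs: np.ndarray, all_obs: np.ndarray, k: int) -> np.ndarray:
    # Pairwise L2 distances between the query states and every buffered observation.
    dists = np.linalg.norm(obs[:, None, :] - all_obs[None, :, :], axis=-1)
    # Take the (k+1)-th smallest distance per query state: the smallest one is the
    # zero self-distance whenever the query state is itself in the buffer.
    return np.partition(dists, k, axis=1)[:, k]

rng = np.random.default_rng(0)
buffer = rng.normal(size=(128, 3))
entropies = knn_state_entropy(buffer[:8], buffer, k=5)
assert entropies.shape == (8,)

In the patches themselves these distances are then normalized by a RunningNorm instance before being used as rewards.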
+def test_replay_buffer_view_provides_buffered_observations(): + space = spaces.Box(np.array([0]), np.array([5])) + n_envs = 2 + buffer_size = 10 + action = np.empty((n_envs, get_action_dim(space))) + + obs_shape = get_obs_shape(space) + wrapper = ReplayBufferRewardWrapper( + buffer_size, + space, + space, + replay_buffer_class=ReplayBuffer, + reward_fn=Mock(), + n_envs=n_envs, + handle_timeout_termination=False, + ) + view = wrapper.buffer_view + + # initially empty + assert len(view.observations) == 0 + + # after adding observation + obs1 = np.random.random((n_envs, *obs_shape)) + wrapper.add(obs1, obs1, action, np.empty(n_envs), np.empty(n_envs), []) + np.testing.assert_allclose(view.observations, np.array([obs1])) + + # after filling buffer + observations = np.random.random((buffer_size // n_envs, n_envs, *obs_shape)) + for obs in observations: + wrapper.add(obs, obs, action, np.empty(n_envs), np.empty(n_envs), []) + + # ReplayBuffer internally uses a circular buffer + expected = np.roll(observations, 1, axis=0) + np.testing.assert_allclose(view.observations, expected) diff --git a/tests/util/test_util.py b/tests/util/test_util.py index 28678dc8b..be2487aee 100644 --- a/tests/util/test_util.py +++ b/tests/util/test_util.py @@ -11,6 +11,7 @@ from imitation.util import sacred as sacred_util from imitation.util import util +from imitation.util.util import kth_value def test_endless_iter(): @@ -144,3 +145,14 @@ def test_compute_state_entropy_2d(): util.compute_state_entropy(obs, all_obs, k=3), np.sqrt(20**2 + 2**2), ) + + +def test_kth_value(): + arr1 = np.arange(0, 10, 1) + np.random.shuffle(arr1) + arr2 = np.arange(0, 100, 10) + np.random.shuffle(arr2) + arr = np.stack([arr1, arr2]) + + result = kth_value(arr, 3) + np.testing.assert_array_equal(result, np.array([2, 20])) From 1ec645ae7b2bc54fef31e8dd40e951e005e80f4c Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 01:20:00 +0100 Subject: [PATCH 19/55] #625 make entropy reward serializable with pickle --- .../algorithms/pebble/entropy_reward.py | 16 +++++++++-- .../policies/replay_buffer_wrapper.py | 1 + .../algorithms/pebble/test_entropy_reward.py | 28 +++++++++++++++++-- 3 files changed, 41 insertions(+), 4 deletions(-) diff --git a/src/imitation/algorithms/pebble/entropy_reward.py b/src/imitation/algorithms/pebble/entropy_reward.py index 724fbf314..a1fff0e46 100644 --- a/src/imitation/algorithms/pebble/entropy_reward.py +++ b/src/imitation/algorithms/pebble/entropy_reward.py @@ -14,13 +14,14 @@ def __init__(self, nearest_neighbor_k: int, observation_space: spaces.Space): self.nearest_neighbor_k = nearest_neighbor_k # TODO support n_envs > 1 self.entropy_stats = RunningNorm(1) + self.observation_space = observation_space self.obs_shape = get_obs_shape(observation_space) self.replay_buffer_view = ReplayBufferView( np.empty(0, dtype=observation_space.dtype), lambda: slice(0) ) - def set_buffer_view(self, replay_buffer_view: ReplayBufferView): - self.replay_buffer_view = replay_buffer_view + def set_replay_buffer(self, replay_buffer: ReplayBufferView): + self.replay_buffer_view = replay_buffer def __call__( self, @@ -42,3 +43,14 @@ def __call__( ) normalized_entropies = self.entropy_stats.forward(th.as_tensor(entropies)) return normalized_entropies.numpy() + + def __getstate__(self): + state = self.__dict__.copy() + del state["replay_buffer_view"] + return state + + def __setstate__(self, state): + self.__dict__.update(state) + self.replay_buffer_view = ReplayBufferView( + np.empty(0, self.observation_space.dtype), 
lambda: slice(0) + ) diff --git a/src/imitation/policies/replay_buffer_wrapper.py b/src/imitation/policies/replay_buffer_wrapper.py index 477fb97b2..a7d548165 100644 --- a/src/imitation/policies/replay_buffer_wrapper.py +++ b/src/imitation/policies/replay_buffer_wrapper.py @@ -10,6 +10,7 @@ from imitation.rewards.reward_function import RewardFn from imitation.util import util from imitation.util.networks import RunningNorm +from typing import Callable def _samples_to_reward_fn_input( diff --git a/tests/algorithms/pebble/test_entropy_reward.py b/tests/algorithms/pebble/test_entropy_reward.py index 777a9b9d6..5571c304f 100644 --- a/tests/algorithms/pebble/test_entropy_reward.py +++ b/tests/algorithms/pebble/test_entropy_reward.py @@ -1,3 +1,4 @@ +import pickle from unittest.mock import patch import numpy as np @@ -33,7 +34,9 @@ def test_state_entropy_reward_returns_entropy(rng): expected = util.compute_state_entropy( observations, all_observations.reshape(-1, *obs_shape), K ) - expected_normalized = reward_fn.entropy_stats.normalize(th.as_tensor(expected)).numpy() + expected_normalized = reward_fn.entropy_stats.normalize( + th.as_tensor(expected) + ).numpy() np.testing.assert_allclose(reward, expected_normalized) @@ -44,7 +47,7 @@ def test_state_entropy_reward_returns_normalized_values(): reward_fn = StateEntropyReward(K, SPACE) all_observations = np.empty((BUFFER_SIZE, VENVS, *get_obs_shape(SPACE))) - reward_fn.set_buffer_view( + reward_fn.set_replay_buffer( ReplayBufferView(all_observations, lambda: slice(None)) ) @@ -68,3 +71,24 @@ def test_state_entropy_reward_returns_normalized_values(): rtol=0.05, atol=0.05, ) + + +def test_state_entropy_reward_can_pickle(): + all_observations = np.empty((BUFFER_SIZE, VENVS, *get_obs_shape(SPACE))) + replay_buffer = ReplayBufferView(all_observations, lambda: slice(None)) + + obs1 = np.random.rand(VENVS, *get_obs_shape(SPACE)) + reward_fn = StateEntropyReward(K, SPACE) + reward_fn.set_replay_buffer(replay_buffer) + reward_fn(obs1, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) + + # Act + pickled = pickle.dumps(reward_fn) + reward_fn_deserialized = pickle.loads(pickled) + reward_fn_deserialized.set_replay_buffer(replay_buffer) + + # Assert + obs2 = np.random.rand(VENVS, *get_obs_shape(SPACE)) + expected_result = reward_fn(obs2, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) + actual_result = reward_fn_deserialized(obs2, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) + np.testing.assert_allclose(actual_result, expected_result) From 4e16c424c6a8ffda4e046b3427b096d0a6de1783 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 14:24:23 +0100 Subject: [PATCH 20/55] #625 revert change of compute_state_entropy() from tensors to numpy --- src/imitation/util/util.py | 20 ++++++++------------ tests/util/test_util.py | 11 ----------- 2 files changed, 8 insertions(+), 23 deletions(-) diff --git a/src/imitation/util/util.py b/src/imitation/util/util.py index d88f775cd..9e5815e0c 100644 --- a/src/imitation/util/util.py +++ b/src/imitation/util/util.py @@ -362,10 +362,10 @@ def get_first_iter_element(iterable: Iterable[T]) -> Tuple[T, Iterable[T]]: def compute_state_entropy( - obs: np.ndarray, - all_obs: np.ndarray, + obs: th.Tensor, + all_obs: th.Tensor, k: int, -) -> np.ndarray: +) -> th.Tensor: """Compute the state entropy given by KNN distance. 
Args: @@ -379,19 +379,15 @@ def compute_state_entropy( assert obs.shape[1:] == all_obs.shape[1:] with th.no_grad(): non_batch_dimensions = tuple(range(2, len(obs.shape) + 1)) - distances_tensor = np.linalg.norm( + distances_tensor = th.linalg.vector_norm( obs[:, None] - all_obs[None, :], - axis=non_batch_dimensions, + dim=non_batch_dimensions, ord=2, ) # Note that we take the k+1'th value because the closest neighbor to # a point is itself, which we want to skip. - knn_dists = kth_value(distances_tensor, k+1) + assert distances_tensor.shape[-1] > k + knn_dists = th.kthvalue(distances_tensor, k=k + 1, dim=1).values state_entropy = knn_dists - return np.expand_dims(state_entropy, axis=1) - - -def kth_value(x: np.ndarray, k: int): - assert k > 0 - return np.partition(x, k - 1, axis=-1)[..., k - 1] + return state_entropy.unsqueeze(1) diff --git a/tests/util/test_util.py b/tests/util/test_util.py index be2487aee..745529d2d 100644 --- a/tests/util/test_util.py +++ b/tests/util/test_util.py @@ -11,7 +11,6 @@ from imitation.util import sacred as sacred_util from imitation.util import util -from imitation.util.util import kth_value def test_endless_iter(): @@ -146,13 +145,3 @@ def test_compute_state_entropy_2d(): np.sqrt(20**2 + 2**2), ) - -def test_kth_value(): - arr1 = np.arange(0, 10, 1) - np.random.shuffle(arr1) - arr2 = np.arange(0, 100, 10) - np.random.shuffle(arr2) - arr = np.stack([arr1, arr2]) - - result = kth_value(arr, 3) - np.testing.assert_array_equal(result, np.array([2, 20])) From acb51be5c3e0922cc992e1f7f98002f95084c814 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 14:41:06 +0100 Subject: [PATCH 21/55] #625 extract _preference_feedback_schedule() --- .../algorithms/preference_comparisons.py | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/imitation/algorithms/preference_comparisons.py b/src/imitation/algorithms/preference_comparisons.py index 413cd979a..2b4a6d972 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -1670,16 +1670,9 @@ def train( A dictionary with final metrics such as loss and accuracy of the reward model. """ - initial_comparisons = int(total_comparisons * self.initial_comparison_frac) - total_comparisons -= initial_comparisons - # Compute the number of comparisons to request at each iteration in advance. 
- vec_schedule = np.vectorize(self.query_schedule) - unnormalized_probs = vec_schedule(np.linspace(0, 1, self.num_iterations)) - probs = unnormalized_probs / np.sum(unnormalized_probs) - shares = util.oric(probs * total_comparisons) - schedule = [initial_comparisons] + shares.tolist() - print(f"Query schedule: {schedule}") + preference_query_schedule = self._preference_gather_schedule(total_comparisons) + print(f"Query schedule: {preference_query_schedule}") timesteps_per_iteration, extra_timesteps = divmod( total_timesteps, @@ -1688,7 +1681,7 @@ def train( reward_loss = None reward_accuracy = None - for i, num_pairs in enumerate(schedule): + for i, num_pairs in enumerate(preference_query_schedule): ########################## # Gather new preferences # ########################## @@ -1751,3 +1744,13 @@ def train( self._iteration += 1 return {"reward_loss": reward_loss, "reward_accuracy": reward_accuracy} + + def _preference_gather_schedule(self, total_comparisons): + initial_comparisons = int(total_comparisons * self.initial_comparison_frac) + total_comparisons -= initial_comparisons + vec_schedule = np.vectorize(self.query_schedule) + unnormalized_probs = vec_schedule(np.linspace(0, 1, self.num_iterations)) + probs = unnormalized_probs / np.sum(unnormalized_probs) + shares = util.oric(probs * total_comparisons) + schedule = [initial_comparisons] + shares.tolist() + return schedule From 8143ba394e4909c6e5674767bf4594f4d061cb8c Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 15:18:28 +0100 Subject: [PATCH 22/55] #625 introduce parameter for pretraining steps --- .../algorithms/preference_comparisons.py | 24 +++++++++++++++---- .../train_preference_comparisons_pebble.py | 3 +++ 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/src/imitation/algorithms/preference_comparisons.py b/src/imitation/algorithms/preference_comparisons.py index 2b4a6d972..ad2b8b6dc 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -1495,6 +1495,7 @@ def __init__( transition_oversampling: float = 1, initial_comparison_frac: float = 0.1, initial_epoch_multiplier: float = 200.0, + initial_agent_pretrain_frac: float = 0.01, custom_logger: Optional[imit_logger.HierarchicalLogger] = None, allow_variable_horizon: bool = False, rng: Optional[np.random.Generator] = None, @@ -1544,6 +1545,9 @@ def __init__( initial_epoch_multiplier: before agent training begins, train the reward model for this many more epochs than usual (on fragments sampled from a random agent). + initial_agent_pretrain_frac: fraction of total_timesteps for which the + agent will be trained without preference gathering (and reward model + training) custom_logger: Where to log to; if None (default), creates a new logger. 
allow_variable_horizon: If False (default), algorithm will raise an exception if it detects trajectories of different length during @@ -1642,6 +1646,7 @@ def __init__( self.fragment_length = fragment_length self.initial_comparison_frac = initial_comparison_frac self.initial_epoch_multiplier = initial_epoch_multiplier + self.initial_agent_pretrain_frac = initial_agent_pretrain_frac self.num_iterations = num_iterations self.transition_oversampling = transition_oversampling if callable(query_schedule): @@ -1674,10 +1679,11 @@ def train( preference_query_schedule = self._preference_gather_schedule(total_comparisons) print(f"Query schedule: {preference_query_schedule}") - timesteps_per_iteration, extra_timesteps = divmod( - total_timesteps, - self.num_iterations, - ) + ( + agent_pretrain_timesteps, + timesteps_per_iteration, + extra_timesteps, + ) = self._compute_timesteps(total_timesteps) reward_loss = None reward_accuracy = None @@ -1754,3 +1760,13 @@ def _preference_gather_schedule(self, total_comparisons): shares = util.oric(probs * total_comparisons) schedule = [initial_comparisons] + shares.tolist() return schedule + + def _compute_timesteps(self, total_timesteps: int) -> Tuple[int, int, int]: + agent_pretrain_timesteps = int( + total_timesteps * self.initial_agent_pretrain_frac + ) + timesteps_per_iteration, extra_timesteps = divmod( + total_timesteps - agent_pretrain_timesteps, + self.num_iterations, + ) + return agent_pretrain_timesteps, timesteps_per_iteration, extra_timesteps diff --git a/src/imitation/scripts/config/train_preference_comparisons_pebble.py b/src/imitation/scripts/config/train_preference_comparisons_pebble.py index e65f38e37..a497542e7 100644 --- a/src/imitation/scripts/config/train_preference_comparisons_pebble.py +++ b/src/imitation/scripts/config/train_preference_comparisons_pebble.py @@ -68,6 +68,8 @@ def train_defaults(): initial_comparison_frac = 0.1 # fraction of sampled trajectories that will include some random actions exploration_frac = 0.0 + # fraction of total_timesteps for training before preference gathering + initial_agent_pretrain_frac = 0.05 preference_model_kwargs = {} reward_trainer_kwargs = { "epochs": 3, @@ -153,6 +155,7 @@ def fast(): total_timesteps = 50 total_comparisons = 5 initial_comparison_frac = 0.2 + initial_agent_pretrain_frac = 0.2 num_iterations = 1 fragment_length = 2 reward_trainer_kwargs = { From 184e191c4194e2f1c3105609e76cf6d43ef840b0 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 15:57:29 +0100 Subject: [PATCH 23/55] #625 add initialized callback to ReplayBufferRewardWrapper --- src/imitation/policies/replay_buffer_wrapper.py | 8 +++++++- tests/policies/test_replay_buffer_wrapper.py | 15 +++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/imitation/policies/replay_buffer_wrapper.py b/src/imitation/policies/replay_buffer_wrapper.py index a7d548165..a6f13b832 100644 --- a/src/imitation/policies/replay_buffer_wrapper.py +++ b/src/imitation/policies/replay_buffer_wrapper.py @@ -1,5 +1,6 @@ """Wrapper for reward labeling for transitions sampled from a replay buffer.""" +from typing import Callable from typing import Mapping, Type import numpy as np @@ -10,7 +11,6 @@ from imitation.rewards.reward_function import RewardFn from imitation.util import util from imitation.util.networks import RunningNorm -from typing import Callable def _samples_to_reward_fn_input( @@ -59,6 +59,7 @@ def __init__( *, replay_buffer_class: Type[ReplayBuffer], reward_fn: RewardFn, + on_initialized_callback: 
Callable[["ReplayBufferRewardWrapper"], None] = None, **kwargs, ): """Builds ReplayBufferRewardWrapper. @@ -69,6 +70,9 @@ def __init__( action_space: Action space replay_buffer_class: Class of the replay buffer. reward_fn: Reward function for reward relabeling. + on_initialized_callback: Callback called with reference to this object after + this instance is fully initialized. This provides a hook to access the + buffer after it is created from inside a Stable Baselines algorithm. **kwargs: keyword arguments for ReplayBuffer. """ # Note(yawen-d): we directly inherit ReplayBuffer and leave out the case of @@ -86,6 +90,8 @@ def __init__( self.reward_fn = reward_fn _base_kwargs = {k: v for k, v in kwargs.items() if k in ["device", "n_envs"]} super().__init__(buffer_size, observation_space, action_space, **_base_kwargs) + if on_initialized_callback is not None: + on_initialized_callback(self) @property def pos(self) -> int: diff --git a/tests/policies/test_replay_buffer_wrapper.py b/tests/policies/test_replay_buffer_wrapper.py index 668208b58..38597dbc0 100644 --- a/tests/policies/test_replay_buffer_wrapper.py +++ b/tests/policies/test_replay_buffer_wrapper.py @@ -264,3 +264,18 @@ def test_replay_buffer_view_provides_buffered_observations(): # ReplayBuffer internally uses a circular buffer expected = np.roll(observations, 1, axis=0) np.testing.assert_allclose(view.observations, expected) + + +def test_replay_buffer_reward_wrapper_calls_initialization_callback_with_itself(): + callback = Mock() + buffer = ReplayBufferRewardWrapper( + 10, + spaces.Discrete(2), + spaces.Discrete(2), + replay_buffer_class=ReplayBuffer, + reward_fn=Mock(), + n_envs=2, + handle_timeout_termination=False, + on_initialized_callback=callback, + ) + assert callback.call_args.args[0] is buffer From 52d914ab865519995e8bef550b2479c4817a43e9 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 16:26:16 +0100 Subject: [PATCH 24/55] #625 fix entropy_reward.py --- src/imitation/algorithms/pebble/entropy_reward.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/imitation/algorithms/pebble/entropy_reward.py b/src/imitation/algorithms/pebble/entropy_reward.py index a1fff0e46..01c2f9a9f 100644 --- a/src/imitation/algorithms/pebble/entropy_reward.py +++ b/src/imitation/algorithms/pebble/entropy_reward.py @@ -35,13 +35,14 @@ def __call__( all_observations = self.replay_buffer_view.observations # ReplayBuffer sampling flattens the venv dimension, let's adapt to that - all_observations = all_observations.reshape((-1, *self.obs_shape)) + all_observations = all_observations.reshape((-1, *state.shape[1:])) # TODO #625: fix self.obs_shape + # TODO #625: deal with the conversion back and forth between np and torch entropies = util.compute_state_entropy( - state, - all_observations, + th.tensor(state), + th.tensor(all_observations), self.nearest_neighbor_k, ) - normalized_entropies = self.entropy_stats.forward(th.as_tensor(entropies)) + normalized_entropies = self.entropy_stats.forward(entropies) return normalized_entropies.numpy() def __getstate__(self): From 1f01a7a0b4228bab782830148d130ff2c947a9d4 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 16:28:59 +0100 Subject: [PATCH 25/55] #625 remove ReplayBufferEntropyRewardWrapper --- .../algorithms/pebble/entropy_reward.py | 4 +- .../policies/replay_buffer_wrapper.py | 84 +--------------- src/imitation/scripts/common/rl.py | 9 +- tests/policies/test_replay_buffer_wrapper.py | 95 +------------------ tests/util/test_util.py | 1 - 
5 files changed, 11 insertions(+), 182 deletions(-) diff --git a/src/imitation/algorithms/pebble/entropy_reward.py b/src/imitation/algorithms/pebble/entropy_reward.py index 01c2f9a9f..812d1aa56 100644 --- a/src/imitation/algorithms/pebble/entropy_reward.py +++ b/src/imitation/algorithms/pebble/entropy_reward.py @@ -35,7 +35,9 @@ def __call__( all_observations = self.replay_buffer_view.observations # ReplayBuffer sampling flattens the venv dimension, let's adapt to that - all_observations = all_observations.reshape((-1, *state.shape[1:])) # TODO #625: fix self.obs_shape + all_observations = all_observations.reshape( + (-1, *state.shape[1:]) # TODO #625: fix self.obs_shape + ) # TODO #625: deal with the conversion back and forth between np and torch entropies = util.compute_state_entropy( th.tensor(state), diff --git a/src/imitation/policies/replay_buffer_wrapper.py b/src/imitation/policies/replay_buffer_wrapper.py index a6f13b832..897957296 100644 --- a/src/imitation/policies/replay_buffer_wrapper.py +++ b/src/imitation/policies/replay_buffer_wrapper.py @@ -1,7 +1,6 @@ """Wrapper for reward labeling for transitions sampled from a replay buffer.""" -from typing import Callable -from typing import Mapping, Type +from typing import Callable, Mapping, Type import numpy as np from gym import spaces @@ -10,7 +9,6 @@ from imitation.rewards.reward_function import RewardFn from imitation.util import util -from imitation.util.networks import RunningNorm def _samples_to_reward_fn_input( @@ -139,83 +137,3 @@ def _get_samples(self): "_get_samples() is intentionally not implemented." "This method should not be called.", ) - - -class ReplayBufferEntropyRewardWrapper(ReplayBufferRewardWrapper): - """Relabel the rewards from a ReplayBuffer, initially using entropy as reward.""" - - def __init__( - self, - buffer_size: int, - observation_space: spaces.Space, - action_space: spaces.Space, - *, - replay_buffer_class: Type[ReplayBuffer], - reward_fn: RewardFn, - entropy_as_reward_samples: int, - k: int = 5, - **kwargs, - ): - """Builds ReplayBufferRewardWrapper. - - Args: - buffer_size: Max number of elements in the buffer - observation_space: Observation space - action_space: Action space - replay_buffer_class: Class of the replay buffer. - reward_fn: Reward function for reward relabeling. - entropy_as_reward_samples: Number of samples to use entropy as the reward, - before switching to using the reward_fn for relabeling. - k: Use the k'th nearest neighbor's distance when computing state entropy. - **kwargs: keyword arguments for ReplayBuffer. - """ - # TODO should we limit by number of batches (as this does) - # or number of observations returned? - super().__init__( - buffer_size, - observation_space, - action_space, - replay_buffer_class=replay_buffer_class, - reward_fn=reward_fn, - **kwargs, - ) - self.sample_count = 0 - self.k = k - # TODO support n_envs > 1 - self.entropy_stats = RunningNorm(1) - self.entropy_as_reward_samples = entropy_as_reward_samples - - def sample(self, *args, **kwargs): - self.sample_count += 1 - samples = super().sample(*args, **kwargs) - # For some reason self.entropy_as_reward_samples seems to get cleared, - # and I have no idea why. - if self.sample_count > self.entropy_as_reward_samples: - return samples - # TODO we really ought to reset the reward network once we are done w/ - # the entropy based pre-training. We also have no reason to train - # or even use the reward network before then. 
- - if self.full: - all_obs = self.observations - else: - all_obs = self.observations[: self.pos] - # super().sample() flattens the venv dimension, let's do it too - all_obs = all_obs.reshape((-1, *self.obs_shape)) - entropies = util.compute_state_entropy( - samples.observations, - all_obs, - self.k, - ) - - # Normalize to have mean of 0 and standard deviation of 1 according to running stats - entropies = self.entropy_stats.forward(entropies) - assert entropies.shape == samples.rewards.shape - - return ReplayBufferSamples( - observations=samples.observations, - actions=samples.actions, - next_observations=samples.next_observations, - dones=samples.dones, - rewards=entropies, - ) diff --git a/src/imitation/scripts/common/rl.py b/src/imitation/scripts/common/rl.py index 2bd3759a2..e879bbaf8 100644 --- a/src/imitation/scripts/common/rl.py +++ b/src/imitation/scripts/common/rl.py @@ -86,10 +86,11 @@ def _maybe_add_relabel_buffer( """Use ReplayBufferRewardWrapper in rl_kwargs if relabel_reward_fn is not None.""" rl_kwargs = dict(rl_kwargs) if relabel_reward_fn: - _buffer_kwargs = dict(reward_fn=relabel_reward_fn) - _buffer_kwargs["replay_buffer_class"] = rl_kwargs.get( - "replay_buffer_class", - buffers.ReplayBuffer, + _buffer_kwargs = dict( + reward_fn=relabel_reward_fn, + replay_buffer_class=rl_kwargs.get( + "replay_buffer_class", buffers.ReplayBuffer + ), ) rl_kwargs["replay_buffer_class"] = ReplayBufferRewardWrapper diff --git a/tests/policies/test_replay_buffer_wrapper.py b/tests/policies/test_replay_buffer_wrapper.py index 38597dbc0..248018a75 100644 --- a/tests/policies/test_replay_buffer_wrapper.py +++ b/tests/policies/test_replay_buffer_wrapper.py @@ -13,14 +13,10 @@ from stable_baselines3.common import buffers, off_policy_algorithm, policies from stable_baselines3.common.buffers import ReplayBuffer from stable_baselines3.common.policies import BasePolicy -from stable_baselines3.common.preprocessing import get_obs_shape, get_action_dim +from stable_baselines3.common.preprocessing import get_action_dim, get_obs_shape from stable_baselines3.common.save_util import load_from_pkl -from stable_baselines3.common.vec_env import DummyVecEnv -from imitation.policies.replay_buffer_wrapper import ( - ReplayBufferEntropyRewardWrapper, - ReplayBufferRewardWrapper, -) +from imitation.policies.replay_buffer_wrapper import ReplayBufferRewardWrapper from imitation.util import util @@ -123,54 +119,6 @@ def test_wrapper_class(tmpdir, rng): replay_buffer_wrapper._get_samples() -# Combine this with the above test via parameterization over the buffer class -def test_entropy_wrapper_class_no_op(tmpdir, rng): - buffer_size = 15 - total_timesteps = 20 - entropy_samples = 0 - - venv = util.make_vec_env("Pendulum-v1", n_envs=1, rng=rng) - rl_algo = sb3.SAC( - policy=sb3.sac.policies.SACPolicy, - policy_kwargs=dict(), - env=venv, - seed=42, - replay_buffer_class=ReplayBufferEntropyRewardWrapper, - replay_buffer_kwargs=dict( - replay_buffer_class=buffers.ReplayBuffer, - reward_fn=zero_reward_fn, - entropy_as_reward_samples=entropy_samples, - ), - buffer_size=buffer_size, - ) - - rl_algo.learn(total_timesteps=total_timesteps) - - buffer_path = osp.join(tmpdir, "buffer.pkl") - rl_algo.save_replay_buffer(buffer_path) - replay_buffer_wrapper = load_from_pkl(buffer_path) - replay_buffer = replay_buffer_wrapper.replay_buffer - - # replay_buffer_wrapper.sample(...) 
should return zero-reward transitions - assert buffer_size == replay_buffer_wrapper.size() == replay_buffer.size() - assert (replay_buffer_wrapper.sample(total_timesteps).rewards == 0.0).all() - assert (replay_buffer.sample(total_timesteps).rewards != 0.0).all() # seed=42 - - # replay_buffer_wrapper.pos, replay_buffer_wrapper.full - assert replay_buffer_wrapper.pos == total_timesteps - buffer_size - assert replay_buffer_wrapper.full - - # reset() - replay_buffer_wrapper.reset() - assert 0 == replay_buffer_wrapper.size() == replay_buffer.size() - assert replay_buffer_wrapper.pos == 0 - assert not replay_buffer_wrapper.full - - # to_torch() - tensor = replay_buffer_wrapper.to_torch(np.ones(42)) - assert type(tensor) is th.Tensor - - class ActionIsObsEnv(gym.Env): """Simple environment where the obs is the action.""" @@ -191,45 +139,6 @@ def reset(self): return np.array([0]) -def test_entropy_wrapper_class(tmpdir, rng): - buffer_size = 20 - entropy_samples = 500 - k = 4 - - venv = DummyVecEnv([ActionIsObsEnv]) - rl_algo = sb3.SAC( - policy=sb3.sac.policies.SACPolicy, - policy_kwargs=dict(), - env=venv, - seed=42, - replay_buffer_class=ReplayBufferEntropyRewardWrapper, - replay_buffer_kwargs=dict( - replay_buffer_class=buffers.ReplayBuffer, - reward_fn=zero_reward_fn, - entropy_as_reward_samples=entropy_samples, - k=k, - ), - buffer_size=buffer_size, - ) - - rl_algo.learn(total_timesteps=buffer_size) - initial_entropy = util.compute_state_entropy( - th.Tensor(rl_algo.replay_buffer.replay_buffer.observations), - th.Tensor(rl_algo.replay_buffer.replay_buffer.observations), - k=k, - ) - - rl_algo.learn(total_timesteps=entropy_samples - buffer_size) - # Expect that the entropy of our replay buffer is now higher, - # since we trained with that as the reward. 
- trained_entropy = util.compute_state_entropy( - th.Tensor(rl_algo.replay_buffer.replay_buffer.observations), - th.Tensor(rl_algo.replay_buffer.replay_buffer.observations), - k=k, - ) - assert trained_entropy.mean() > initial_entropy.mean() - - def test_replay_buffer_view_provides_buffered_observations(): space = spaces.Box(np.array([0]), np.array([5])) n_envs = 2 diff --git a/tests/util/test_util.py b/tests/util/test_util.py index 745529d2d..28678dc8b 100644 --- a/tests/util/test_util.py +++ b/tests/util/test_util.py @@ -144,4 +144,3 @@ def test_compute_state_entropy_2d(): util.compute_state_entropy(obs, all_obs, k=3), np.sqrt(20**2 + 2**2), ) - From 1fbc590f999c62b8435a73e1c20588ddc27cf6ca Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 16:50:15 +0100 Subject: [PATCH 26/55] #625 introduce ReplayBufferAwareRewardFn --- .../algorithms/pebble/entropy_reward.py | 17 +++++++++++++---- src/imitation/policies/replay_buffer_wrapper.py | 13 ++++++------- src/imitation/rewards/reward_function.py | 6 ++++++ tests/algorithms/pebble/test_entropy_reward.py | 8 +++++--- tests/policies/test_replay_buffer_wrapper.py | 10 +++++----- 5 files changed, 35 insertions(+), 19 deletions(-) diff --git a/src/imitation/algorithms/pebble/entropy_reward.py b/src/imitation/algorithms/pebble/entropy_reward.py index 812d1aa56..f26af2479 100644 --- a/src/imitation/algorithms/pebble/entropy_reward.py +++ b/src/imitation/algorithms/pebble/entropy_reward.py @@ -1,15 +1,20 @@ +from typing import Tuple + import numpy as np import torch as th from gym.vector.utils import spaces from stable_baselines3.common.preprocessing import get_obs_shape -from imitation.policies.replay_buffer_wrapper import ReplayBufferView -from imitation.rewards.reward_function import RewardFn +from imitation.policies.replay_buffer_wrapper import ( + ReplayBufferView, + ReplayBufferRewardWrapper, +) +from imitation.rewards.reward_function import ReplayBufferAwareRewardFn from imitation.util import util from imitation.util.networks import RunningNorm -class StateEntropyReward(RewardFn): +class StateEntropyReward(ReplayBufferAwareRewardFn): def __init__(self, nearest_neighbor_k: int, observation_space: spaces.Space): self.nearest_neighbor_k = nearest_neighbor_k # TODO support n_envs > 1 @@ -20,8 +25,12 @@ def __init__(self, nearest_neighbor_k: int, observation_space: spaces.Space): np.empty(0, dtype=observation_space.dtype), lambda: slice(0) ) - def set_replay_buffer(self, replay_buffer: ReplayBufferView): + def on_replay_buffer_initialized(self, replay_buffer: ReplayBufferRewardWrapper): + self.set_replay_buffer(replay_buffer.buffer_view, replay_buffer.obs_shape) + + def set_replay_buffer(self, replay_buffer: ReplayBufferView, obs_shape:Tuple): self.replay_buffer_view = replay_buffer + self.obs_shape = obs_shape def __call__( self, diff --git a/src/imitation/policies/replay_buffer_wrapper.py b/src/imitation/policies/replay_buffer_wrapper.py index 897957296..297a6b008 100644 --- a/src/imitation/policies/replay_buffer_wrapper.py +++ b/src/imitation/policies/replay_buffer_wrapper.py @@ -7,7 +7,7 @@ from stable_baselines3.common.buffers import ReplayBuffer from stable_baselines3.common.type_aliases import ReplayBufferSamples -from imitation.rewards.reward_function import RewardFn +from imitation.rewards.reward_function import RewardFn, ReplayBufferAwareRewardFn from imitation.util import util @@ -37,13 +37,13 @@ def __init__( observations_buffer: np.ndarray, buffer_slice_provider: Callable[[], slice], ): - self._observations_buffer = 
observations_buffer.view() - self._observations_buffer.flags.writeable = False + self._observations_buffer_view = observations_buffer.view() + self._observations_buffer_view.flags.writeable = False self._buffer_slice_provider = buffer_slice_provider @property def observations(self): - return self._observations_buffer[self._buffer_slice_provider()] + return self._observations_buffer_view[self._buffer_slice_provider()] class ReplayBufferRewardWrapper(ReplayBuffer): @@ -57,7 +57,6 @@ def __init__( *, replay_buffer_class: Type[ReplayBuffer], reward_fn: RewardFn, - on_initialized_callback: Callable[["ReplayBufferRewardWrapper"], None] = None, **kwargs, ): """Builds ReplayBufferRewardWrapper. @@ -88,8 +87,8 @@ def __init__( self.reward_fn = reward_fn _base_kwargs = {k: v for k, v in kwargs.items() if k in ["device", "n_envs"]} super().__init__(buffer_size, observation_space, action_space, **_base_kwargs) - if on_initialized_callback is not None: - on_initialized_callback(self) + if isinstance(reward_fn, ReplayBufferAwareRewardFn): + reward_fn.on_replay_buffer_initialized(self) @property def pos(self) -> int: diff --git a/src/imitation/rewards/reward_function.py b/src/imitation/rewards/reward_function.py index 93761752d..e9d7bed30 100644 --- a/src/imitation/rewards/reward_function.py +++ b/src/imitation/rewards/reward_function.py @@ -32,3 +32,9 @@ def __call__( Returns: Computed rewards of shape `(batch_size,`). """ # noqa: DAR202 + + +class ReplayBufferAwareRewardFn(RewardFn, abc.ABC): + @abc.abstractmethod + def on_replay_buffer_initialized(self, replay_buffer: "ReplayBufferRewardWrapper"): + pass diff --git a/tests/algorithms/pebble/test_entropy_reward.py b/tests/algorithms/pebble/test_entropy_reward.py index 5571c304f..16314a1e1 100644 --- a/tests/algorithms/pebble/test_entropy_reward.py +++ b/tests/algorithms/pebble/test_entropy_reward.py @@ -23,8 +23,9 @@ def test_state_entropy_reward_returns_entropy(rng): obs_shape = get_obs_shape(SPACE) all_observations = rng.random((BUFFER_SIZE, VENVS, *obs_shape)) + reward_fn = StateEntropyReward(K, SPACE) - reward_fn.set_buffer_view(ReplayBufferView(all_observations, lambda: slice(None))) + reward_fn.set_replay_buffer(ReplayBufferView(all_observations, lambda: slice(None)), obs_shape) # Act observations = rng.random((BATCH_SIZE, *obs_shape)) @@ -48,7 +49,8 @@ def test_state_entropy_reward_returns_normalized_values(): reward_fn = StateEntropyReward(K, SPACE) all_observations = np.empty((BUFFER_SIZE, VENVS, *get_obs_shape(SPACE))) reward_fn.set_replay_buffer( - ReplayBufferView(all_observations, lambda: slice(None)) + ReplayBufferView(all_observations, lambda: slice(None)), + get_obs_shape(SPACE) ) dim = 8 @@ -79,7 +81,7 @@ def test_state_entropy_reward_can_pickle(): obs1 = np.random.rand(VENVS, *get_obs_shape(SPACE)) reward_fn = StateEntropyReward(K, SPACE) - reward_fn.set_replay_buffer(replay_buffer) + reward_fn.set_replay_buffer(replay_buffer, get_obs_shape(SPACE)) reward_fn(obs1, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) # Act diff --git a/tests/policies/test_replay_buffer_wrapper.py b/tests/policies/test_replay_buffer_wrapper.py index 248018a75..02bb72ce2 100644 --- a/tests/policies/test_replay_buffer_wrapper.py +++ b/tests/policies/test_replay_buffer_wrapper.py @@ -17,6 +17,7 @@ from stable_baselines3.common.save_util import load_from_pkl from imitation.policies.replay_buffer_wrapper import ReplayBufferRewardWrapper +from imitation.rewards.reward_function import ReplayBufferAwareRewardFn from imitation.util import util @@ -175,16 +176,15 @@ def 
test_replay_buffer_view_provides_buffered_observations(): np.testing.assert_allclose(view.observations, expected) -def test_replay_buffer_reward_wrapper_calls_initialization_callback_with_itself(): - callback = Mock() +def test_replay_buffer_reward_wrapper_calls_reward_initialization_callback(): + reward_fn = Mock(spec=ReplayBufferAwareRewardFn) buffer = ReplayBufferRewardWrapper( 10, spaces.Discrete(2), spaces.Discrete(2), replay_buffer_class=ReplayBuffer, - reward_fn=Mock(), + reward_fn=reward_fn, n_envs=2, handle_timeout_termination=False, - on_initialized_callback=callback, ) - assert callback.call_args.args[0] is buffer + assert reward_fn.on_replay_buffer_initialized.call_args.args[0] is buffer From e19dd85e9cecada420a5e55664ceb8df23908b1e Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 16:59:45 +0100 Subject: [PATCH 27/55] #625 rename PebbleStateEntropyReward --- src/imitation/algorithms/pebble/entropy_reward.py | 3 ++- tests/algorithms/pebble/test_entropy_reward.py | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/imitation/algorithms/pebble/entropy_reward.py b/src/imitation/algorithms/pebble/entropy_reward.py index f26af2479..ab5d424b8 100644 --- a/src/imitation/algorithms/pebble/entropy_reward.py +++ b/src/imitation/algorithms/pebble/entropy_reward.py @@ -14,7 +14,8 @@ from imitation.util.networks import RunningNorm -class StateEntropyReward(ReplayBufferAwareRewardFn): +class PebbleStateEntropyReward(ReplayBufferAwareRewardFn): + # TODO #625: get rid of the observation_space parameter def __init__(self, nearest_neighbor_k: int, observation_space: spaces.Space): self.nearest_neighbor_k = nearest_neighbor_k # TODO support n_envs > 1 diff --git a/tests/algorithms/pebble/test_entropy_reward.py b/tests/algorithms/pebble/test_entropy_reward.py index 16314a1e1..9ba4dd9cd 100644 --- a/tests/algorithms/pebble/test_entropy_reward.py +++ b/tests/algorithms/pebble/test_entropy_reward.py @@ -6,7 +6,7 @@ from gym.spaces import Discrete from stable_baselines3.common.preprocessing import get_obs_shape -from imitation.algorithms.pebble.entropy_reward import StateEntropyReward +from imitation.algorithms.pebble.entropy_reward import PebbleStateEntropyReward from imitation.policies.replay_buffer_wrapper import ReplayBufferView from imitation.util import util @@ -24,7 +24,7 @@ def test_state_entropy_reward_returns_entropy(rng): all_observations = rng.random((BUFFER_SIZE, VENVS, *obs_shape)) - reward_fn = StateEntropyReward(K, SPACE) + reward_fn = PebbleStateEntropyReward(K, SPACE) reward_fn.set_replay_buffer(ReplayBufferView(all_observations, lambda: slice(None)), obs_shape) # Act @@ -46,7 +46,7 @@ def test_state_entropy_reward_returns_normalized_values(): # mock entropy computation so that we can test only stats collection in this test m.side_effect = lambda obs, all_obs, k: obs - reward_fn = StateEntropyReward(K, SPACE) + reward_fn = PebbleStateEntropyReward(K, SPACE) all_observations = np.empty((BUFFER_SIZE, VENVS, *get_obs_shape(SPACE))) reward_fn.set_replay_buffer( ReplayBufferView(all_observations, lambda: slice(None)), @@ -80,7 +80,7 @@ def test_state_entropy_reward_can_pickle(): replay_buffer = ReplayBufferView(all_observations, lambda: slice(None)) obs1 = np.random.rand(VENVS, *get_obs_shape(SPACE)) - reward_fn = StateEntropyReward(K, SPACE) + reward_fn = PebbleStateEntropyReward(K, SPACE) reward_fn.set_replay_buffer(replay_buffer, get_obs_shape(SPACE)) reward_fn(obs1, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) From 
da77f5c0373b42b8a9e46f58621bbf6183f134e2 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 17:14:39 +0100 Subject: [PATCH 28/55] #625 PebbleStateEntropyReward can switch from unsupervised pretraining --- .../algorithms/pebble/entropy_reward.py | 26 ++++++++-- .../algorithms/pebble/test_entropy_reward.py | 51 +++++++++++++++---- 2 files changed, 63 insertions(+), 14 deletions(-) diff --git a/src/imitation/algorithms/pebble/entropy_reward.py b/src/imitation/algorithms/pebble/entropy_reward.py index ab5d424b8..81d43daa8 100644 --- a/src/imitation/algorithms/pebble/entropy_reward.py +++ b/src/imitation/algorithms/pebble/entropy_reward.py @@ -9,14 +9,21 @@ ReplayBufferView, ReplayBufferRewardWrapper, ) -from imitation.rewards.reward_function import ReplayBufferAwareRewardFn +from imitation.rewards.reward_function import ReplayBufferAwareRewardFn, RewardFn from imitation.util import util from imitation.util.networks import RunningNorm class PebbleStateEntropyReward(ReplayBufferAwareRewardFn): # TODO #625: get rid of the observation_space parameter - def __init__(self, nearest_neighbor_k: int, observation_space: spaces.Space): + # TODO #625: parametrize nearest_neighbor_k + def __init__( + self, + trained_reward_fn: RewardFn, + observation_space: spaces.Space, + nearest_neighbor_k: int = 5, + ): + self.trained_reward_fn = trained_reward_fn self.nearest_neighbor_k = nearest_neighbor_k # TODO support n_envs > 1 self.entropy_stats = RunningNorm(1) @@ -25,14 +32,20 @@ def __init__(self, nearest_neighbor_k: int, observation_space: spaces.Space): self.replay_buffer_view = ReplayBufferView( np.empty(0, dtype=observation_space.dtype), lambda: slice(0) ) + # This indicates that the training is in the "Unsupervised exploration" + # phase of the Pebble algorithm, where entropy is used as reward + self.unsupervised_exploration_active = True def on_replay_buffer_initialized(self, replay_buffer: ReplayBufferRewardWrapper): self.set_replay_buffer(replay_buffer.buffer_view, replay_buffer.obs_shape) - def set_replay_buffer(self, replay_buffer: ReplayBufferView, obs_shape:Tuple): + def set_replay_buffer(self, replay_buffer: ReplayBufferView, obs_shape: Tuple): self.replay_buffer_view = replay_buffer self.obs_shape = obs_shape + def on_unsupervised_exploration_finished(self): + self.unsupervised_exploration_active = False + def __call__( self, state: np.ndarray, @@ -40,9 +53,14 @@ def __call__( next_state: np.ndarray, done: np.ndarray, ) -> np.ndarray: + if self.unsupervised_exploration_active: + return self._entropy_reward(state) + else: + return self.trained_reward_fn(state, action, next_state, done) + + def _entropy_reward(self, state): # TODO: should this work with torch instead of numpy internally? 
# (The RewardFn protocol requires numpy) - all_observations = self.replay_buffer_view.observations # ReplayBuffer sampling flattens the venv dimension, let's adapt to that all_observations = all_observations.reshape( diff --git a/tests/algorithms/pebble/test_entropy_reward.py b/tests/algorithms/pebble/test_entropy_reward.py index 9ba4dd9cd..42496a79c 100644 --- a/tests/algorithms/pebble/test_entropy_reward.py +++ b/tests/algorithms/pebble/test_entropy_reward.py @@ -1,5 +1,5 @@ import pickle -from unittest.mock import patch +from unittest.mock import patch, Mock import numpy as np import torch as th @@ -19,13 +19,14 @@ VENVS = 2 -def test_state_entropy_reward_returns_entropy(rng): +def test_pebble_entropy_reward_returns_entropy(rng): obs_shape = get_obs_shape(SPACE) all_observations = rng.random((BUFFER_SIZE, VENVS, *obs_shape)) - - reward_fn = PebbleStateEntropyReward(K, SPACE) - reward_fn.set_replay_buffer(ReplayBufferView(all_observations, lambda: slice(None)), obs_shape) + reward_fn = PebbleStateEntropyReward(Mock(), SPACE, K) + reward_fn.set_replay_buffer( + ReplayBufferView(all_observations, lambda: slice(None)), obs_shape + ) # Act observations = rng.random((BATCH_SIZE, *obs_shape)) @@ -41,16 +42,16 @@ def test_state_entropy_reward_returns_entropy(rng): np.testing.assert_allclose(reward, expected_normalized) -def test_state_entropy_reward_returns_normalized_values(): +def test_pebble_entropy_reward_returns_normalized_values(): with patch("imitation.util.util.compute_state_entropy") as m: # mock entropy computation so that we can test only stats collection in this test m.side_effect = lambda obs, all_obs, k: obs - reward_fn = PebbleStateEntropyReward(K, SPACE) + reward_fn = PebbleStateEntropyReward(Mock(), SPACE, K) all_observations = np.empty((BUFFER_SIZE, VENVS, *get_obs_shape(SPACE))) reward_fn.set_replay_buffer( ReplayBufferView(all_observations, lambda: slice(None)), - get_obs_shape(SPACE) + get_obs_shape(SPACE), ) dim = 8 @@ -75,12 +76,12 @@ def test_state_entropy_reward_returns_normalized_values(): ) -def test_state_entropy_reward_can_pickle(): +def test_pebble_entropy_reward_can_pickle(): all_observations = np.empty((BUFFER_SIZE, VENVS, *get_obs_shape(SPACE))) replay_buffer = ReplayBufferView(all_observations, lambda: slice(None)) obs1 = np.random.rand(VENVS, *get_obs_shape(SPACE)) - reward_fn = PebbleStateEntropyReward(K, SPACE) + reward_fn = PebbleStateEntropyReward(reward_fn_stub, SPACE, K) reward_fn.set_replay_buffer(replay_buffer, get_obs_shape(SPACE)) reward_fn(obs1, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) @@ -94,3 +95,33 @@ def test_state_entropy_reward_can_pickle(): expected_result = reward_fn(obs2, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) actual_result = reward_fn_deserialized(obs2, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) np.testing.assert_allclose(actual_result, expected_result) + + +def test_pebble_entropy_reward_function_switches_to_inner(): + obs_shape = get_obs_shape(SPACE) + + expected_reward = np.ones(1) + reward_fn_mock = Mock() + reward_fn_mock.return_value = expected_reward + reward_fn = PebbleStateEntropyReward(reward_fn_mock, SPACE) + + # Act + reward_fn.on_unsupervised_exploration_finished() + observations = np.ones((BATCH_SIZE, *obs_shape)) + reward = reward_fn(observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) + + # Assert + assert reward == expected_reward + reward_fn_mock.assert_called_once_with( + observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER + ) + + +def reward_fn_stub( + self, + state: np.ndarray, + action: np.ndarray, + next_state: np.ndarray, 
+ done: np.ndarray, +) -> np.ndarray: + return state From a11e7756f2a6b4d839386c1bea10187173e96340 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 19:25:22 +0100 Subject: [PATCH 29/55] #625 add optional pretraining to PreferenceComparisons --- .../algorithms/preference_comparisons.py | 24 ++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/src/imitation/algorithms/preference_comparisons.py b/src/imitation/algorithms/preference_comparisons.py index ad2b8b6dc..e29433188 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -75,6 +75,19 @@ def sample(self, steps: int) -> Sequence[TrajectoryWithRew]: be the environment rewards, not ones from a reward model). """ # noqa: DAR202 + def unsupervised_pretrain(self, steps: int, **kwargs: Any) -> None: + """Pre-train an agent if the trajectory generator uses one that + needs pre-training. + + By default, this method does nothing and doesn't need + to be overridden in subclasses that don't require pre-training. + + Args: + steps: number of environment steps to train for. + **kwargs: additional keyword arguments to pass on to + the training procedure. + """ + def train(self, steps: int, **kwargs: Any) -> None: """Train an agent if the trajectory generator uses one. @@ -1495,7 +1508,7 @@ def __init__( transition_oversampling: float = 1, initial_comparison_frac: float = 0.1, initial_epoch_multiplier: float = 200.0, - initial_agent_pretrain_frac: float = 0.01, + initial_agent_pretrain_frac: float = 0.05, custom_logger: Optional[imit_logger.HierarchicalLogger] = None, allow_variable_horizon: bool = False, rng: Optional[np.random.Generator] = None, @@ -1687,6 +1700,15 @@ def train( reward_loss = None reward_accuracy = None + ################################################### + # Pre-training agent before gathering preferences # + ################################################### + with self.logger.accumulate_means("agent"): + self.logger.log( + f"Pre-training agent for {agent_pretrain_timesteps} timesteps" + ) + self.trajectory_generator.unsupervised_pretrain(agent_pretrain_timesteps) + for i, num_pairs in enumerate(preference_query_schedule): ########################## # Gather new preferences # From 7b12162aba277da580b344966044adea7ab6a989 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 20:30:09 +0100 Subject: [PATCH 30/55] #625 PebbleStateEntropyReward supports the initial phase before replay buffer is filled --- .../algorithms/pebble/entropy_reward.py | 79 ++++++++++----- .../algorithms/pebble/test_entropy_reward.py | 97 +++++++++++-------- 2 files changed, 109 insertions(+), 67 deletions(-) diff --git a/src/imitation/algorithms/pebble/entropy_reward.py b/src/imitation/algorithms/pebble/entropy_reward.py index 81d43daa8..04322f808 100644 --- a/src/imitation/algorithms/pebble/entropy_reward.py +++ b/src/imitation/algorithms/pebble/entropy_reward.py @@ -1,9 +1,8 @@ +from enum import Enum, auto from typing import Tuple import numpy as np import torch as th -from gym.vector.utils import spaces -from stable_baselines3.common.preprocessing import get_obs_shape from imitation.policies.replay_buffer_wrapper import ( ReplayBufferView, @@ -14,27 +13,53 @@ from imitation.util.networks import RunningNorm +class PebbleRewardPhase(Enum): + """States representing different behaviors for PebbleStateEntropyReward""" + + # Collecting samples so that we have something for entropy calculation + LEARNING_START = auto() + # Entropy 
based reward + UNSUPERVISED_EXPLORATION = auto() + # Learned reward + POLICY_AND_REWARD_LEARNING = auto() + + class PebbleStateEntropyReward(ReplayBufferAwareRewardFn): - # TODO #625: get rid of the observation_space parameter + """ + Reward function for implementation of the PEBBLE learning algorithm + (https://arxiv.org/pdf/2106.05091.pdf). + + The rewards returned by this function go through the three phases + defined in PebbleRewardPhase. To transition between these phases, + unsupervised_exploration_start() and unsupervised_exploration_finish() + need to be called. + + The second phase (UNSUPERVISED_EXPLORATION) also requires that a buffer + with observations to compare against is supplied with set_replay_buffer() + or on_replay_buffer_initialized(). + + Args: + learned_reward_fn: The learned reward function used after unsupervised + exploration is finished + nearest_neighbor_k: Parameter for entropy computation (see + compute_state_entropy()) + """ + # TODO #625: parametrize nearest_neighbor_k def __init__( self, - trained_reward_fn: RewardFn, - observation_space: spaces.Space, + learned_reward_fn: RewardFn, nearest_neighbor_k: int = 5, ): - self.trained_reward_fn = trained_reward_fn + self.trained_reward_fn = learned_reward_fn self.nearest_neighbor_k = nearest_neighbor_k # TODO support n_envs > 1 self.entropy_stats = RunningNorm(1) - self.observation_space = observation_space - self.obs_shape = get_obs_shape(observation_space) - self.replay_buffer_view = ReplayBufferView( - np.empty(0, dtype=observation_space.dtype), lambda: slice(0) - ) - # This indicates that the training is in the "Unsupervised exploration" - # phase of the Pebble algorithm, where entropy is used as reward - self.unsupervised_exploration_active = True + self.state = PebbleRewardPhase.LEARNING_START + + # These two need to be set with set_replay_buffer(): + self.replay_buffer_view = None + self.obs_shape = None def on_replay_buffer_initialized(self, replay_buffer: ReplayBufferRewardWrapper): self.set_replay_buffer(replay_buffer.buffer_view, replay_buffer.obs_shape) @@ -43,8 +68,13 @@ def set_replay_buffer(self, replay_buffer: ReplayBufferView, obs_shape: Tuple): self.replay_buffer_view = replay_buffer self.obs_shape = obs_shape - def on_unsupervised_exploration_finished(self): - self.unsupervised_exploration_active = False + def unsupervised_exploration_start(self): + assert self.state == PebbleRewardPhase.LEARNING_START + self.state = PebbleRewardPhase.UNSUPERVISED_EXPLORATION + + def unsupervised_exploration_finish(self): + assert self.state == PebbleRewardPhase.UNSUPERVISED_EXPLORATION + self.state = PebbleRewardPhase.POLICY_AND_REWARD_LEARNING def __call__( self, @@ -53,19 +83,20 @@ def __call__( next_state: np.ndarray, done: np.ndarray, ) -> np.ndarray: - if self.unsupervised_exploration_active: + if self.state == PebbleRewardPhase.UNSUPERVISED_EXPLORATION: return self._entropy_reward(state) else: return self.trained_reward_fn(state, action, next_state, done) def _entropy_reward(self, state): - # TODO: should this work with torch instead of numpy internally? 
- # (The RewardFn protocol requires numpy) + if self.replay_buffer_view is None: + raise ValueError( + "Replay buffer must be supplied before entropy reward can be used" + ) + all_observations = self.replay_buffer_view.observations # ReplayBuffer sampling flattens the venv dimension, let's adapt to that - all_observations = all_observations.reshape( - (-1, *state.shape[1:]) # TODO #625: fix self.obs_shape - ) + all_observations = all_observations.reshape((-1, *self.obs_shape)) # TODO #625: deal with the conversion back and forth between np and torch entropies = util.compute_state_entropy( th.tensor(state), @@ -82,6 +113,4 @@ def __getstate__(self): def __setstate__(self, state): self.__dict__.update(state) - self.replay_buffer_view = ReplayBufferView( - np.empty(0, self.observation_space.dtype), lambda: slice(0) - ) + self.replay_buffer_view = None diff --git a/tests/algorithms/pebble/test_entropy_reward.py b/tests/algorithms/pebble/test_entropy_reward.py index 42496a79c..3abd66752 100644 --- a/tests/algorithms/pebble/test_entropy_reward.py +++ b/tests/algorithms/pebble/test_entropy_reward.py @@ -11,7 +11,8 @@ from imitation.util import util SPACE = Discrete(4) -PLACEHOLDER = np.empty(get_obs_shape(SPACE)) +OBS_SHAPE = get_obs_shape(SPACE) +PLACEHOLDER = np.empty(OBS_SHAPE) BUFFER_SIZE = 20 K = 4 @@ -19,22 +20,59 @@ VENVS = 2 -def test_pebble_entropy_reward_returns_entropy(rng): - obs_shape = get_obs_shape(SPACE) - all_observations = rng.random((BUFFER_SIZE, VENVS, *obs_shape)) +def test_pebble_entropy_reward_function_returns_learned_reward_initially(): + expected_reward = np.ones(1) + learned_reward_mock = Mock() + learned_reward_mock.return_value = expected_reward + reward_fn = PebbleStateEntropyReward(learned_reward_mock, SPACE) + + # Act + observations = np.ones((BATCH_SIZE, *OBS_SHAPE)) + reward = reward_fn(observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) + + # Assert + assert reward == expected_reward + learned_reward_mock.assert_called_once_with( + observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER + ) + + +def test_pebble_entropy_reward_function_returns_learned_reward_after_pre_training(): + expected_reward = np.ones(1) + learned_reward_mock = Mock() + learned_reward_mock.return_value = expected_reward + reward_fn = PebbleStateEntropyReward(learned_reward_mock, SPACE) + # move all the way to the last state + reward_fn.unsupervised_exploration_start() + reward_fn.unsupervised_exploration_finish() + + # Act + observations = np.ones((BATCH_SIZE, *OBS_SHAPE)) + reward = reward_fn(observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) + + # Assert + assert reward == expected_reward + learned_reward_mock.assert_called_once_with( + observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER + ) + + +def test_pebble_entropy_reward_returns_entropy_for_pretraining(rng): + all_observations = rng.random((BUFFER_SIZE, VENVS, *(OBS_SHAPE))) reward_fn = PebbleStateEntropyReward(Mock(), SPACE, K) reward_fn.set_replay_buffer( - ReplayBufferView(all_observations, lambda: slice(None)), obs_shape + ReplayBufferView(all_observations, lambda: slice(None)), OBS_SHAPE ) + reward_fn.unsupervised_exploration_start() # Act - observations = rng.random((BATCH_SIZE, *obs_shape)) + observations = th.rand((BATCH_SIZE, *(OBS_SHAPE))) reward = reward_fn(observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) # Assert expected = util.compute_state_entropy( - observations, all_observations.reshape(-1, *obs_shape), K + observations, all_observations.reshape(-1, *(OBS_SHAPE)), K ) expected_normalized = 
reward_fn.entropy_stats.normalize( th.as_tensor(expected) @@ -42,17 +80,18 @@ def test_pebble_entropy_reward_returns_entropy(rng): np.testing.assert_allclose(reward, expected_normalized) -def test_pebble_entropy_reward_returns_normalized_values(): +def test_pebble_entropy_reward_returns_normalized_values_for_pretraining(): with patch("imitation.util.util.compute_state_entropy") as m: # mock entropy computation so that we can test only stats collection in this test m.side_effect = lambda obs, all_obs, k: obs reward_fn = PebbleStateEntropyReward(Mock(), SPACE, K) - all_observations = np.empty((BUFFER_SIZE, VENVS, *get_obs_shape(SPACE))) + all_observations = np.empty((BUFFER_SIZE, VENVS, *OBS_SHAPE)) reward_fn.set_replay_buffer( ReplayBufferView(all_observations, lambda: slice(None)), - get_obs_shape(SPACE), + OBS_SHAPE, ) + reward_fn.unsupervised_exploration_start() dim = 8 shift = 3 @@ -77,51 +116,25 @@ def test_pebble_entropy_reward_returns_normalized_values(): def test_pebble_entropy_reward_can_pickle(): - all_observations = np.empty((BUFFER_SIZE, VENVS, *get_obs_shape(SPACE))) + all_observations = np.empty((BUFFER_SIZE, VENVS, *OBS_SHAPE)) replay_buffer = ReplayBufferView(all_observations, lambda: slice(None)) - obs1 = np.random.rand(VENVS, *get_obs_shape(SPACE)) + obs1 = np.random.rand(VENVS, *OBS_SHAPE) reward_fn = PebbleStateEntropyReward(reward_fn_stub, SPACE, K) - reward_fn.set_replay_buffer(replay_buffer, get_obs_shape(SPACE)) + reward_fn.set_replay_buffer(replay_buffer, OBS_SHAPE) reward_fn(obs1, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) # Act pickled = pickle.dumps(reward_fn) reward_fn_deserialized = pickle.loads(pickled) - reward_fn_deserialized.set_replay_buffer(replay_buffer) + reward_fn_deserialized.set_replay_buffer(replay_buffer, OBS_SHAPE) # Assert - obs2 = np.random.rand(VENVS, *get_obs_shape(SPACE)) + obs2 = np.random.rand(VENVS, *OBS_SHAPE) expected_result = reward_fn(obs2, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) actual_result = reward_fn_deserialized(obs2, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) np.testing.assert_allclose(actual_result, expected_result) -def test_pebble_entropy_reward_function_switches_to_inner(): - obs_shape = get_obs_shape(SPACE) - - expected_reward = np.ones(1) - reward_fn_mock = Mock() - reward_fn_mock.return_value = expected_reward - reward_fn = PebbleStateEntropyReward(reward_fn_mock, SPACE) - - # Act - reward_fn.on_unsupervised_exploration_finished() - observations = np.ones((BATCH_SIZE, *obs_shape)) - reward = reward_fn(observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) - - # Assert - assert reward == expected_reward - reward_fn_mock.assert_called_once_with( - observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER - ) - - -def reward_fn_stub( - self, - state: np.ndarray, - action: np.ndarray, - next_state: np.ndarray, - done: np.ndarray, -) -> np.ndarray: +def reward_fn_stub(state, action, next_state, done): return state From e354e16c51bcc36be74247ece6bf1ce503f38883 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 20:54:10 +0100 Subject: [PATCH 31/55] #625 entropy_reward can automatically detect if enough observations are present --- .../algorithms/pebble/entropy_reward.py | 62 +++++++++--------- .../algorithms/pebble/test_entropy_reward.py | 64 +++++++------------ 2 files changed, 53 insertions(+), 73 deletions(-) diff --git a/src/imitation/algorithms/pebble/entropy_reward.py b/src/imitation/algorithms/pebble/entropy_reward.py index 04322f808..3d9d76b00 100644 --- a/src/imitation/algorithms/pebble/entropy_reward.py +++ 
b/src/imitation/algorithms/pebble/entropy_reward.py @@ -16,12 +16,8 @@ class PebbleRewardPhase(Enum): """States representing different behaviors for PebbleStateEntropyReward""" - # Collecting samples so that we have something for entropy calculation - LEARNING_START = auto() - # Entropy based reward - UNSUPERVISED_EXPLORATION = auto() - # Learned reward - POLICY_AND_REWARD_LEARNING = auto() + UNSUPERVISED_EXPLORATION = auto() # Entropy based reward + POLICY_AND_REWARD_LEARNING = auto() # Learned reward class PebbleStateEntropyReward(ReplayBufferAwareRewardFn): @@ -29,14 +25,19 @@ class PebbleStateEntropyReward(ReplayBufferAwareRewardFn): Reward function for implementation of the PEBBLE learning algorithm (https://arxiv.org/pdf/2106.05091.pdf). - The rewards returned by this function go through the three phases - defined in PebbleRewardPhase. To transition between these phases, - unsupervised_exploration_start() and unsupervised_exploration_finish() - need to be called. + The rewards returned by this function go through the three phases: + 1. Before enough samples are collected for entropy calculation, the + underlying function is returned. This shouldn't matter because + OffPolicyAlgorithms have an initialization period for `learning_starts` + timesteps. + 2. During the unsupervised exploration phase, entropy based reward is returned + 3. After unsupervised exploration phase is finished, the underlying learned + reward is returned. - The second phase (UNSUPERVISED_EXPLORATION) also requires that a buffer - with observations to compare against is supplied with set_replay_buffer() - or on_replay_buffer_initialized(). + The second phase requires that a buffer with observations to compare against is + supplied with set_replay_buffer() or on_replay_buffer_initialized(). + To transition to the last phase, unsupervised_exploration_finish() needs + to be called. 
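[A minimal usage sketch of the phase transitions described in the docstring above. The buffer contents, shapes, and the stand-in learned reward below are illustrative only, not part of this patch.]

import numpy as np
from imitation.algorithms.pebble.entropy_reward import PebbleStateEntropyReward
from imitation.policies.replay_buffer_wrapper import ReplayBufferView

def learned_reward(state, action, next_state, done):
    # Stand-in for the trained reward model (assumption for illustration).
    return np.zeros(len(state))

reward_fn = PebbleStateEntropyReward(learned_reward, nearest_neighbor_k=5)

# Unsupervised exploration: entropy reward, once a replay buffer is supplied.
buffer = np.random.rand(100, 1, 3)  # (buffer_size, n_envs, *obs_shape), made up
reward_fn.set_replay_buffer(ReplayBufferView(buffer, lambda: slice(None)), (3,))
obs = np.random.rand(8, 3)
dummy = np.zeros(8)
entropy_rewards = reward_fn(obs, dummy, dummy, dummy)

# Policy and reward learning: calls fall through to the learned reward.
reward_fn.unsupervised_exploration_finish()
learned_rewards = reward_fn(obs, dummy, dummy, dummy)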
Args: learned_reward_fn: The learned reward function used after unsupervised @@ -51,11 +52,10 @@ def __init__( learned_reward_fn: RewardFn, nearest_neighbor_k: int = 5, ): - self.trained_reward_fn = learned_reward_fn + self.learned_reward_fn = learned_reward_fn self.nearest_neighbor_k = nearest_neighbor_k - # TODO support n_envs > 1 self.entropy_stats = RunningNorm(1) - self.state = PebbleRewardPhase.LEARNING_START + self.state = PebbleRewardPhase.UNSUPERVISED_EXPLORATION # These two need to be set with set_replay_buffer(): self.replay_buffer_view = None @@ -68,10 +68,6 @@ def set_replay_buffer(self, replay_buffer: ReplayBufferView, obs_shape: Tuple): self.replay_buffer_view = replay_buffer self.obs_shape = obs_shape - def unsupervised_exploration_start(self): - assert self.state == PebbleRewardPhase.LEARNING_START - self.state = PebbleRewardPhase.UNSUPERVISED_EXPLORATION - def unsupervised_exploration_finish(self): assert self.state == PebbleRewardPhase.UNSUPERVISED_EXPLORATION self.state = PebbleRewardPhase.POLICY_AND_REWARD_LEARNING @@ -84,26 +80,30 @@ def __call__( done: np.ndarray, ) -> np.ndarray: if self.state == PebbleRewardPhase.UNSUPERVISED_EXPLORATION: - return self._entropy_reward(state) + return self._entropy_reward(state, action, next_state, done) else: - return self.trained_reward_fn(state, action, next_state, done) + return self.learned_reward_fn(state, action, next_state, done) - def _entropy_reward(self, state): + def _entropy_reward(self, state, action, next_state, done): if self.replay_buffer_view is None: raise ValueError( "Replay buffer must be supplied before entropy reward can be used" ) - all_observations = self.replay_buffer_view.observations # ReplayBuffer sampling flattens the venv dimension, let's adapt to that all_observations = all_observations.reshape((-1, *self.obs_shape)) - # TODO #625: deal with the conversion back and forth between np and torch - entropies = util.compute_state_entropy( - th.tensor(state), - th.tensor(all_observations), - self.nearest_neighbor_k, - ) - normalized_entropies = self.entropy_stats.forward(entropies) + + if all_observations.shape[0] < self.nearest_neighbor_k: + # not enough observations to compare to, fall back to the learned function + return self.learned_reward_fn(state, action, next_state, done) + else: + # TODO #625: deal with the conversion back and forth between np and torch + entropies = util.compute_state_entropy( + th.tensor(state), + th.tensor(all_observations), + self.nearest_neighbor_k, + ) + normalized_entropies = self.entropy_stats.forward(entropies) return normalized_entropies.numpy() def __getstate__(self): diff --git a/tests/algorithms/pebble/test_entropy_reward.py b/tests/algorithms/pebble/test_entropy_reward.py index 3abd66752..c4f127b09 100644 --- a/tests/algorithms/pebble/test_entropy_reward.py +++ b/tests/algorithms/pebble/test_entropy_reward.py @@ -20,51 +20,13 @@ VENVS = 2 -def test_pebble_entropy_reward_function_returns_learned_reward_initially(): - expected_reward = np.ones(1) - learned_reward_mock = Mock() - learned_reward_mock.return_value = expected_reward - reward_fn = PebbleStateEntropyReward(learned_reward_mock, SPACE) - - # Act - observations = np.ones((BATCH_SIZE, *OBS_SHAPE)) - reward = reward_fn(observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) - - # Assert - assert reward == expected_reward - learned_reward_mock.assert_called_once_with( - observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER - ) - - -def test_pebble_entropy_reward_function_returns_learned_reward_after_pre_training(): - 
expected_reward = np.ones(1) - learned_reward_mock = Mock() - learned_reward_mock.return_value = expected_reward - reward_fn = PebbleStateEntropyReward(learned_reward_mock, SPACE) - # move all the way to the last state - reward_fn.unsupervised_exploration_start() - reward_fn.unsupervised_exploration_finish() - - # Act - observations = np.ones((BATCH_SIZE, *OBS_SHAPE)) - reward = reward_fn(observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) - - # Assert - assert reward == expected_reward - learned_reward_mock.assert_called_once_with( - observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER - ) - - def test_pebble_entropy_reward_returns_entropy_for_pretraining(rng): all_observations = rng.random((BUFFER_SIZE, VENVS, *(OBS_SHAPE))) - reward_fn = PebbleStateEntropyReward(Mock(), SPACE, K) + reward_fn = PebbleStateEntropyReward(Mock(), K) reward_fn.set_replay_buffer( ReplayBufferView(all_observations, lambda: slice(None)), OBS_SHAPE ) - reward_fn.unsupervised_exploration_start() # Act observations = th.rand((BATCH_SIZE, *(OBS_SHAPE))) @@ -85,13 +47,12 @@ def test_pebble_entropy_reward_returns_normalized_values_for_pretraining(): # mock entropy computation so that we can test only stats collection in this test m.side_effect = lambda obs, all_obs, k: obs - reward_fn = PebbleStateEntropyReward(Mock(), SPACE, K) + reward_fn = PebbleStateEntropyReward(Mock(), K) all_observations = np.empty((BUFFER_SIZE, VENVS, *OBS_SHAPE)) reward_fn.set_replay_buffer( ReplayBufferView(all_observations, lambda: slice(None)), OBS_SHAPE, ) - reward_fn.unsupervised_exploration_start() dim = 8 shift = 3 @@ -115,12 +76,31 @@ def test_pebble_entropy_reward_returns_normalized_values_for_pretraining(): ) +def test_pebble_entropy_reward_function_returns_learned_reward_after_pre_training(): + expected_reward = np.ones(1) + learned_reward_mock = Mock() + learned_reward_mock.return_value = expected_reward + reward_fn = PebbleStateEntropyReward(learned_reward_mock) + # move all the way to the last state + reward_fn.unsupervised_exploration_finish() + + # Act + observations = np.ones((BATCH_SIZE, *OBS_SHAPE)) + reward = reward_fn(observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) + + # Assert + assert reward == expected_reward + learned_reward_mock.assert_called_once_with( + observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER + ) + + def test_pebble_entropy_reward_can_pickle(): all_observations = np.empty((BUFFER_SIZE, VENVS, *OBS_SHAPE)) replay_buffer = ReplayBufferView(all_observations, lambda: slice(None)) obs1 = np.random.rand(VENVS, *OBS_SHAPE) - reward_fn = PebbleStateEntropyReward(reward_fn_stub, SPACE, K) + reward_fn = PebbleStateEntropyReward(reward_fn_stub, K) reward_fn.set_replay_buffer(replay_buffer, OBS_SHAPE) reward_fn(obs1, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) From b8ccf2f8f2140909c47ac636167be432800c7c5e Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 21:12:18 +0100 Subject: [PATCH 32/55] #625 fix entropy shape --- src/imitation/algorithms/pebble/entropy_reward.py | 5 +++-- src/imitation/util/util.py | 3 +-- tests/algorithms/pebble/test_entropy_reward.py | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/imitation/algorithms/pebble/entropy_reward.py b/src/imitation/algorithms/pebble/entropy_reward.py index 3d9d76b00..e0d94c171 100644 --- a/src/imitation/algorithms/pebble/entropy_reward.py +++ b/src/imitation/algorithms/pebble/entropy_reward.py @@ -94,7 +94,8 @@ def _entropy_reward(self, state, action, next_state, done): all_observations = all_observations.reshape((-1, 
*self.obs_shape)) if all_observations.shape[0] < self.nearest_neighbor_k: - # not enough observations to compare to, fall back to the learned function + # not enough observations to compare to, fall back to the learned function; + # (falling back to a constant may also be ok) return self.learned_reward_fn(state, action, next_state, done) else: # TODO #625: deal with the conversion back and forth between np and torch @@ -104,7 +105,7 @@ def _entropy_reward(self, state, action, next_state, done): self.nearest_neighbor_k, ) normalized_entropies = self.entropy_stats.forward(entropies) - return normalized_entropies.numpy() + return normalized_entropies.numpy() def __getstate__(self): state = self.__dict__.copy() diff --git a/src/imitation/util/util.py b/src/imitation/util/util.py index 9e5815e0c..9bf1c1a40 100644 --- a/src/imitation/util/util.py +++ b/src/imitation/util/util.py @@ -389,5 +389,4 @@ def compute_state_entropy( # a point is itself, which we want to skip. assert distances_tensor.shape[-1] > k knn_dists = th.kthvalue(distances_tensor, k=k + 1, dim=1).values - state_entropy = knn_dists - return state_entropy.unsqueeze(1) + return knn_dists diff --git a/tests/algorithms/pebble/test_entropy_reward.py b/tests/algorithms/pebble/test_entropy_reward.py index c4f127b09..918222382 100644 --- a/tests/algorithms/pebble/test_entropy_reward.py +++ b/tests/algorithms/pebble/test_entropy_reward.py @@ -21,7 +21,7 @@ def test_pebble_entropy_reward_returns_entropy_for_pretraining(rng): - all_observations = rng.random((BUFFER_SIZE, VENVS, *(OBS_SHAPE))) + all_observations = rng.random((BUFFER_SIZE, VENVS, *OBS_SHAPE)) reward_fn = PebbleStateEntropyReward(Mock(), K) reward_fn.set_replay_buffer( @@ -29,12 +29,12 @@ def test_pebble_entropy_reward_returns_entropy_for_pretraining(rng): ) # Act - observations = th.rand((BATCH_SIZE, *(OBS_SHAPE))) + observations = th.rand((BATCH_SIZE, *OBS_SHAPE)) reward = reward_fn(observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) # Assert expected = util.compute_state_entropy( - observations, all_observations.reshape(-1, *(OBS_SHAPE)), K + observations, all_observations.reshape(-1, *OBS_SHAPE), K ) expected_normalized = reward_fn.entropy_stats.normalize( th.as_tensor(expected) From c5f1dba1bd1da18d9ed35410dd698bf1a8c9a167 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 21:15:03 +0100 Subject: [PATCH 33/55] #625 rename unsupervised_agent_pretrain_frac parameter --- .../algorithms/preference_comparisons.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/imitation/algorithms/preference_comparisons.py b/src/imitation/algorithms/preference_comparisons.py index e29433188..96af17cfd 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -1508,7 +1508,7 @@ def __init__( transition_oversampling: float = 1, initial_comparison_frac: float = 0.1, initial_epoch_multiplier: float = 200.0, - initial_agent_pretrain_frac: float = 0.05, + unsupervised_agent_pretrain_frac: float = 0.05, custom_logger: Optional[imit_logger.HierarchicalLogger] = None, allow_variable_horizon: bool = False, rng: Optional[np.random.Generator] = None, @@ -1558,7 +1558,7 @@ def __init__( initial_epoch_multiplier: before agent training begins, train the reward model for this many more epochs than usual (on fragments sampled from a random agent). 
- initial_agent_pretrain_frac: fraction of total_timesteps for which the + unsupervised_agent_pretrain_frac: fraction of total_timesteps for which the agent will be trained without preference gathering (and reward model training) custom_logger: Where to log to; if None (default), creates a new logger. @@ -1659,7 +1659,7 @@ def __init__( self.fragment_length = fragment_length self.initial_comparison_frac = initial_comparison_frac self.initial_epoch_multiplier = initial_epoch_multiplier - self.initial_agent_pretrain_frac = initial_agent_pretrain_frac + self.unsupervised_agent_pretrain_frac = unsupervised_agent_pretrain_frac self.num_iterations = num_iterations self.transition_oversampling = transition_oversampling if callable(query_schedule): @@ -1693,7 +1693,7 @@ def train( print(f"Query schedule: {preference_query_schedule}") ( - agent_pretrain_timesteps, + unsupervised_pretrain_timesteps, timesteps_per_iteration, extra_timesteps, ) = self._compute_timesteps(total_timesteps) @@ -1705,9 +1705,9 @@ def train( ################################################### with self.logger.accumulate_means("agent"): self.logger.log( - f"Pre-training agent for {agent_pretrain_timesteps} timesteps" + f"Pre-training agent for {unsupervised_pretrain_timesteps} timesteps" ) - self.trajectory_generator.unsupervised_pretrain(agent_pretrain_timesteps) + self.trajectory_generator.unsupervised_pretrain(unsupervised_pretrain_timesteps) for i, num_pairs in enumerate(preference_query_schedule): ########################## @@ -1784,11 +1784,11 @@ def _preference_gather_schedule(self, total_comparisons): return schedule def _compute_timesteps(self, total_timesteps: int) -> Tuple[int, int, int]: - agent_pretrain_timesteps = int( - total_timesteps * self.initial_agent_pretrain_frac + unsupervised_pretrain_timesteps = int( + total_timesteps * self.unsupervised_agent_pretrain_frac ) timesteps_per_iteration, extra_timesteps = divmod( - total_timesteps - agent_pretrain_timesteps, + total_timesteps - unsupervised_pretrain_timesteps, self.num_iterations, ) - return agent_pretrain_timesteps, timesteps_per_iteration, extra_timesteps + return unsupervised_pretrain_timesteps, timesteps_per_iteration, extra_timesteps From 0ba89593602d3438c0d0bf07a89e351b1cdcbe6c Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 21:32:35 +0100 Subject: [PATCH 34/55] #625 specialized PebbleAgentTrainer to distinguish from old preference comparison trainer --- .../algorithms/preference_comparisons.py | 26 ++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/src/imitation/algorithms/preference_comparisons.py b/src/imitation/algorithms/preference_comparisons.py index 96af17cfd..7d7466b26 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -33,6 +33,7 @@ from tqdm.auto import tqdm from imitation.algorithms import base +from imitation.algorithms.pebble.entropy_reward import PebbleStateEntropyReward from imitation.data import rollout, types, wrappers from imitation.data.types import ( AnyPath, @@ -329,6 +330,27 @@ def logger(self, value: imit_logger.HierarchicalLogger) -> None: self.algorithm.set_logger(self.logger) +class PebbleAgentTrainer(AgentTrainer): + """ + Specialization of AgentTrainer for PEBBLE training. + Includes unsupervised pretraining with an entropy based reward function. 
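[Aside: the pretraining budget handed to unsupervised_pretrain() comes from the _compute_timesteps() split shown above. A worked example with illustrative numbers, assuming the renamed unsupervised_agent_pretrain_frac default of 0.05 and num_iterations=5.]

total_timesteps, pretrain_frac, num_iterations = 1_000_000, 0.05, 5
unsupervised_pretrain_timesteps = int(total_timesteps * pretrain_frac)  # 50_000
timesteps_per_iteration, extra_timesteps = divmod(
    total_timesteps - unsupervised_pretrain_timesteps, num_iterations
)
assert (timesteps_per_iteration, extra_timesteps) == (190_000, 0)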
+ """ + + reward_fn: PebbleStateEntropyReward + + def __init__( + self, + *, + reward_fn: PebbleStateEntropyReward, + **kwargs, + ) -> None: + super().__init__(reward_fn=reward_fn, **kwargs) + + def unsupervised_pretrain(self, steps: int, **kwargs: Any) -> None: + self.train(steps, **kwargs) + self.reward_fn.unsupervised_exploration_finish() + + def _get_trajectories( trajectories: Sequence[TrajectoryWithRew], steps: int, @@ -1707,7 +1729,9 @@ def train( self.logger.log( f"Pre-training agent for {unsupervised_pretrain_timesteps} timesteps" ) - self.trajectory_generator.unsupervised_pretrain(unsupervised_pretrain_timesteps) + self.trajectory_generator.unsupervised_pretrain( + unsupervised_pretrain_timesteps + ) for i, num_pairs in enumerate(preference_query_schedule): ########################## From c55fee727dd100f1af0efaef983ecdb05a13428d Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 22:14:38 +0100 Subject: [PATCH 35/55] #625 merge pebble to train_preference_comparisons.py and configure only through sacred --- .../algorithms/preference_comparisons.py | 4 ++++ .../config/train_preference_comparisons.py | 24 ++++++++++++++++++- .../scripts/train_preference_comparisons.py | 8 +++++++ 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/src/imitation/algorithms/preference_comparisons.py b/src/imitation/algorithms/preference_comparisons.py index 7d7466b26..fe5dc472e 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -344,6 +344,10 @@ def __init__( reward_fn: PebbleStateEntropyReward, **kwargs, ) -> None: + if not isinstance(reward_fn, PebbleStateEntropyReward): + raise ValueError( + f"{self.__class__.__name__} expects {PebbleStateEntropyReward.__name__} reward function" + ) super().__init__(reward_fn=reward_fn, **kwargs) def unsupervised_pretrain(self, steps: int, **kwargs: Any) -> None: diff --git a/src/imitation/scripts/config/train_preference_comparisons.py b/src/imitation/scripts/config/train_preference_comparisons.py index ba4e9483c..227142814 100644 --- a/src/imitation/scripts/config/train_preference_comparisons.py +++ b/src/imitation/scripts/config/train_preference_comparisons.py @@ -1,8 +1,10 @@ """Configuration for imitation.scripts.train_preference_comparisons.""" import sacred +import stable_baselines3 as sb3 from imitation.algorithms import preference_comparisons +from imitation.policies import base from imitation.scripts.common import common, reward, rl, train train_preference_comparisons_ex = sacred.Experiment( @@ -15,7 +17,6 @@ ], ) - MUJOCO_SHARED_LOCALS = dict(rl=dict(rl_kwargs=dict(ent_coef=0.1))) ANT_SHARED_LOCALS = dict( total_timesteps=int(3e7), @@ -61,6 +62,26 @@ def train_defaults(): query_schedule = "hyperbolic" +@train_preference_comparisons_ex.named_config +def pebble(): + # fraction of total_timesteps for training before preference gathering + unsupervised_agent_pretrain_frac = 0.05 + pebble_nearest_neighbor_k = 5 + + rl = { + "rl_cls": sb3.SAC, + "batch_size": 256, # batch size for RL algorithm + "rl_kwargs": {"batch_size": None}, # make sure to set batch size to None + } + train = { + "policy_cls": base.SAC1024Policy, # noqa: F841 + } + common = {"env_name": "MountainCarContinuous-v0"} + allow_variable_horizon = True + + locals() # quieten flake8 + + @train_preference_comparisons_ex.named_config def cartpole(): common = dict(env_name="CartPole-v1") @@ -121,6 +142,7 @@ def fast(): total_timesteps = 50 total_comparisons = 5 initial_comparison_frac = 0.2 + 
unsupervised_agent_pretrain_frac = 0.2 num_iterations = 1 fragment_length = 2 reward_trainer_kwargs = { diff --git a/src/imitation/scripts/train_preference_comparisons.py b/src/imitation/scripts/train_preference_comparisons.py index 331a4797a..cfa87a960 100644 --- a/src/imitation/scripts/train_preference_comparisons.py +++ b/src/imitation/scripts/train_preference_comparisons.py @@ -82,6 +82,8 @@ def train_preference_comparisons( allow_variable_horizon: bool, checkpoint_interval: int, query_schedule: Union[str, type_aliases.Schedule], + unsupervised_agent_pretrain_frac: Optional[float], + pebble_nearest_neighbor_k: Optional[int], ) -> Mapping[str, Any]: """Train a reward model using preference comparisons. @@ -141,6 +143,11 @@ def train_preference_comparisons( be allocated to each iteration. "hyperbolic" and "inverse_quadratic" apportion fewer queries to later iterations when the policy is assumed to be better and more stable. + unsupervised_agent_pretrain_frac: fraction of total_timesteps for which the + agent will be trained without preference gathering (and reward model + training) + pebble_nearest_neighbor_k: Parameter for state entropy computation (for PEBBLE + training only) Returns: Rollout statistics from trained policy. @@ -244,6 +251,7 @@ def train_preference_comparisons( custom_logger=custom_logger, allow_variable_horizon=allow_variable_horizon, query_schedule=query_schedule, + unsupervised_agent_pretrain_frac=unsupervised_agent_pretrain_frac, ) def save_callback(iteration_num): From 1f9642a362a1e1d3d075e7944bd8952ab915bf12 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 22:56:56 +0100 Subject: [PATCH 36/55] #625 plug in pebble according to parameters --- .../config/train_preference_comparisons.py | 3 + .../train_preference_comparisons_pebble.py | 163 ---------- .../scripts/train_preference_comparisons.py | 83 +++-- .../train_preference_comparisons_pebble.py | 292 ------------------ 4 files changed, 68 insertions(+), 473 deletions(-) delete mode 100644 src/imitation/scripts/config/train_preference_comparisons_pebble.py delete mode 100644 src/imitation/scripts/train_preference_comparisons_pebble.py diff --git a/src/imitation/scripts/config/train_preference_comparisons.py b/src/imitation/scripts/config/train_preference_comparisons.py index 227142814..ca0e786ff 100644 --- a/src/imitation/scripts/config/train_preference_comparisons.py +++ b/src/imitation/scripts/config/train_preference_comparisons.py @@ -60,11 +60,14 @@ def train_defaults(): checkpoint_interval = 0 # Num epochs between saving (<0 disables, =0 final only) query_schedule = "hyperbolic" + # Whether to use the PEBBLE algorithm (https://arxiv.org/pdf/2106.05091.pdf) + pebble_enabled = False @train_preference_comparisons_ex.named_config def pebble(): # fraction of total_timesteps for training before preference gathering + pebble_enabled = True unsupervised_agent_pretrain_frac = 0.05 pebble_nearest_neighbor_k = 5 diff --git a/src/imitation/scripts/config/train_preference_comparisons_pebble.py b/src/imitation/scripts/config/train_preference_comparisons_pebble.py deleted file mode 100644 index a497542e7..000000000 --- a/src/imitation/scripts/config/train_preference_comparisons_pebble.py +++ /dev/null @@ -1,163 +0,0 @@ -"""Configuration for imitation.scripts.train_preference_comparisons_pebble.""" - -import warnings - -import sacred -import stable_baselines3 as sb3 - -from imitation.algorithms import preference_comparisons -from imitation.policies import base -from imitation.scripts.common import common, 
reward, rl, train - -train_preference_comparisons_pebble_ex = sacred.Experiment( - "train_preference_comparisons_pebble", - ingredients=[ - common.common_ingredient, - reward.reward_ingredient, - rl.rl_ingredient, - train.train_ingredient, - ], -) - -MUJOCO_SHARED_LOCALS = dict(rl=dict(rl_kwargs=dict(ent_coef=0.1))) -ANT_SHARED_LOCALS = dict( - total_timesteps=int(3e7), - rl=dict(batch_size=16384), -) - - -@rl.rl_ingredient.config -def rl_sac(): - # For recommended SAC hyperparams in each environment, see: - # https://github.com/DLR-RM/rl-baselines3-zoo/blob/master/hyperparams/sac.yml - rl_cls = sb3.SAC - warnings.warn( - "SAC currently only supports continuous action spaces. " - "Consider adding a discrete version as mentioned here: " - "https://github.com/DLR-RM/stable-baselines3/issues/505", - category=RuntimeWarning, - ) - # Default HPs are as follows: - batch_size = 256 # batch size for RL algorithm - rl_kwargs = dict(batch_size=None) # make sure to set batch size to None - locals() # quieten flake8 - - -@train.train_ingredient.config -def train_sac(): - policy_cls = base.SAC1024Policy # noqa: F841 - locals() # quieten flake8 - - -@common.common_ingredient.config -def common_mountain_car_continuous(): - env_name = "MountainCarContinuous-v0" - locals() # quieten flake8 - - -@train_preference_comparisons_pebble_ex.config -def train_defaults(): - fragment_length = 100 # timesteps per fragment used for comparisons - total_timesteps = int(1e6) # total number of environment timesteps - total_comparisons = 5000 # total number of comparisons to elicit - num_iterations = 5 # Arbitrary, should be tuned for the task - comparison_queue_size = None - # factor by which to oversample transitions before creating fragments - transition_oversampling = 1 - # fraction of total_comparisons that will be sampled right at the beginning - initial_comparison_frac = 0.1 - # fraction of sampled trajectories that will include some random actions - exploration_frac = 0.0 - # fraction of total_timesteps for training before preference gathering - initial_agent_pretrain_frac = 0.05 - preference_model_kwargs = {} - reward_trainer_kwargs = { - "epochs": 3, - } - save_preferences = False # save preference dataset at the end? 
- agent_path = None # path to a (partially) trained agent to load at the beginning - # type of PreferenceGatherer to use - gatherer_cls = preference_comparisons.SyntheticGatherer - # arguments passed on to the PreferenceGatherer specified by gatherer_cls - gatherer_kwargs = {} - active_selection = False - active_selection_oversampling = 2 - uncertainty_on = "logit" - fragmenter_kwargs = { - "warning_threshold": 0, - } - # path to a pickled sequence of trajectories used instead of training an agent - trajectory_path = None - trajectory_generator_kwargs = {} # kwargs to pass to trajectory generator - allow_variable_horizon = False - - checkpoint_interval = 0 # Num epochs between saving (<0 disables, =0 final only) - query_schedule = "hyperbolic" - - -@train_preference_comparisons_pebble_ex.named_config -def cartpole(): - common = dict(env_name="CartPole-v1") - allow_variable_horizon = True - - -@train_preference_comparisons_pebble_ex.named_config -def seals_ant(): - locals().update(**MUJOCO_SHARED_LOCALS) - locals().update(**ANT_SHARED_LOCALS) - common = dict(env_name="seals/Ant-v0") - - -@train_preference_comparisons_pebble_ex.named_config -def half_cheetah(): - locals().update(**MUJOCO_SHARED_LOCALS) - common = dict(env_name="HalfCheetah-v2") - rl = dict(batch_size=16384, rl_kwargs=dict(batch_size=1024)) - - -@train_preference_comparisons_pebble_ex.named_config -def seals_hopper(): - locals().update(**MUJOCO_SHARED_LOCALS) - common = dict(env_name="seals/Hopper-v0") - - -@train_preference_comparisons_pebble_ex.named_config -def seals_humanoid(): - locals().update(**MUJOCO_SHARED_LOCALS) - common = dict(env_name="seals/Humanoid-v0") - total_timesteps = int(4e6) - - -@train_preference_comparisons_pebble_ex.named_config -def seals_cartpole(): - common = dict(env_name="seals/CartPole-v0") - - -@train_preference_comparisons_pebble_ex.named_config -def pendulum(): - common = dict(env_name="Pendulum-v1") - - -@train_preference_comparisons_pebble_ex.named_config -def mountain_car(): - common = dict(env_name="MountainCar-v0") - allow_variable_horizon = True - - -@train_preference_comparisons_pebble_ex.named_config -def seals_mountain_car(): - common = dict(env_name="seals/MountainCar-v0") - - -@train_preference_comparisons_pebble_ex.named_config -def fast(): - # Minimize the amount of computation. Useful for test cases. - total_timesteps = 50 - total_comparisons = 5 - initial_comparison_frac = 0.2 - initial_agent_pretrain_frac = 0.2 - num_iterations = 1 - fragment_length = 2 - reward_trainer_kwargs = { - "epochs": 1, - } diff --git a/src/imitation/scripts/train_preference_comparisons.py b/src/imitation/scripts/train_preference_comparisons.py index cfa87a960..c848a6d09 100644 --- a/src/imitation/scripts/train_preference_comparisons.py +++ b/src/imitation/scripts/train_preference_comparisons.py @@ -3,24 +3,27 @@ Can be used as a CLI script, or the `train_preference_comparisons` function can be called directly. 
""" - import functools import pathlib from typing import Any, Mapping, Optional, Type, Union +import numpy as np import torch as th from sacred.observers import FileStorageObserver -from stable_baselines3.common import type_aliases +from stable_baselines3.common import type_aliases, base_class, vec_env from imitation.algorithms import preference_comparisons +from imitation.algorithms.pebble.entropy_reward import PebbleStateEntropyReward from imitation.data import types from imitation.policies import serialize +from imitation.rewards import reward_nets, reward_function from imitation.scripts.common import common, reward from imitation.scripts.common import rl as rl_common from imitation.scripts.common import train from imitation.scripts.config.train_preference_comparisons import ( train_preference_comparisons_ex, ) +from imitation.util import logger as imit_logger def save_model( @@ -57,6 +60,59 @@ def save_checkpoint( ) +@train_preference_comparisons_ex.capture +def make_reward_function( + reward_net: reward_nets.RewardNet, + *, + pebble_enabled: bool = False, + pebble_nearest_neighbor_k: Optional[int] = None, +): + relabel_reward_fn = functools.partial( + reward_net.predict_processed, + update_stats=False, + ) + if pebble_enabled: + relabel_reward_fn = PebbleStateEntropyReward( + relabel_reward_fn, pebble_nearest_neighbor_k + ) + return relabel_reward_fn + + +@train_preference_comparisons_ex.capture +def make_agent_trajectory_generator( + venv: vec_env.VecEnv, + agent: base_class.BaseAlgorithm, + reward_net: reward_nets.RewardNet, + relabel_reward_fn: reward_function.RewardFn, + rng: np.random.Generator, + custom_logger: Optional[imit_logger.HierarchicalLogger], + *, + exploration_frac: float, + pebble_enabled: bool, + trajectory_generator_kwargs: Mapping[str, Any], +) -> preference_comparisons.AgentTrainer: + if pebble_enabled: + return preference_comparisons.PebbleAgentTrainer( + algorithm=agent, + reward_fn=relabel_reward_fn, + venv=venv, + exploration_frac=exploration_frac, + rng=rng, + custom_logger=custom_logger, + **trajectory_generator_kwargs, + ) + else: + return preference_comparisons.AgentTrainer( + algorithm=agent, + reward_fn=reward_net, + venv=venv, + exploration_frac=exploration_frac, + rng=rng, + custom_logger=custom_logger, + **trajectory_generator_kwargs, + ) + + @train_preference_comparisons_ex.main def train_preference_comparisons( total_timesteps: int, @@ -83,7 +139,6 @@ def train_preference_comparisons( checkpoint_interval: int, query_schedule: Union[str, type_aliases.Schedule], unsupervised_agent_pretrain_frac: Optional[float], - pebble_nearest_neighbor_k: Optional[int], ) -> Mapping[str, Any]: """Train a reward model using preference comparisons. @@ -146,8 +201,6 @@ def train_preference_comparisons( unsupervised_agent_pretrain_frac: fraction of total_timesteps for which the agent will be trained without preference gathering (and reward model training) - pebble_nearest_neighbor_k: Parameter for state entropy computation (for PEBBLE - training only) Returns: Rollout statistics from trained policy. 
@@ -160,10 +213,8 @@ def train_preference_comparisons( with common.make_venv() as venv: reward_net = reward.make_reward_net(venv) - relabel_reward_fn = functools.partial( - reward_net.predict_processed, - update_stats=False, - ) + relabel_reward_fn = make_reward_function(reward_net) + if agent_path is None: agent = rl_common.make_rl_algo(venv, relabel_reward_fn=relabel_reward_fn) else: @@ -176,21 +227,17 @@ def train_preference_comparisons( if trajectory_path is None: # Setting the logger here is not necessary (PreferenceComparisons takes care # of it automatically) but it avoids creating unnecessary loggers. - agent_trainer = preference_comparisons.AgentTrainer( - algorithm=agent, - reward_fn=reward_net, + trajectory_generator = make_agent_trajectory_generator( venv=venv, - exploration_frac=exploration_frac, + agent=agent, + reward_net=reward_net, + relabel_reward_fn=relabel_reward_fn, rng=rng, custom_logger=custom_logger, - **trajectory_generator_kwargs, ) # Stable Baselines will automatically occupy GPU 0 if it is available. # Let's use the same device as the SB3 agent for the reward model. - reward_net = reward_net.to(agent_trainer.algorithm.device) - trajectory_generator: preference_comparisons.TrajectoryGenerator = ( - agent_trainer - ) + reward_net = reward_net.to(trajectory_generator.algorithm.device) else: if exploration_frac > 0: raise ValueError( diff --git a/src/imitation/scripts/train_preference_comparisons_pebble.py b/src/imitation/scripts/train_preference_comparisons_pebble.py deleted file mode 100644 index f34eefb9d..000000000 --- a/src/imitation/scripts/train_preference_comparisons_pebble.py +++ /dev/null @@ -1,292 +0,0 @@ -"""Train a reward model using preference comparisons. - -Can be used as a CLI script, or the `train_preference_comparisons` function -can be called directly. -""" - -import functools -import pathlib -from typing import Any, Mapping, Optional, Type, Union - -import torch as th -from sacred.observers import FileStorageObserver -from stable_baselines3.common import type_aliases - -from imitation.algorithms import preference_comparisons -from imitation.data import types -from imitation.policies import serialize -from imitation.scripts.common import common, reward -from imitation.scripts.common import rl as rl_common -from imitation.scripts.common import train -from imitation.scripts.config.train_preference_comparisons_pebble import ( - train_preference_comparisons_pebble_ex, -) - - -def save_model( - agent_trainer: preference_comparisons.AgentTrainer, - save_path: pathlib.Path, -): - """Save the model as `model.zip`.""" - serialize.save_stable_model( - output_dir=save_path / "policy", - model=agent_trainer.algorithm, - ) - - -def save_checkpoint( - trainer: preference_comparisons.PreferenceComparisons, - save_path: pathlib.Path, - allow_save_policy: Optional[bool], -): - """Save reward model and optionally policy.""" - save_path.mkdir(parents=True, exist_ok=True) - th.save(trainer.model, save_path / "reward_net.pt") - if allow_save_policy: - # Note: We should only save the model as model.zip if `trajectory_generator` - # contains one. Currently we are slightly over-conservative, by requiring - # that an AgentTrainer be used if we're saving the policy. 
- assert isinstance( - trainer.trajectory_generator, - preference_comparisons.AgentTrainer, - ) - save_model(trainer.trajectory_generator, save_path) - else: - trainer.logger.warn( - "trainer.trajectory_generator doesn't contain a policy to save.", - ) - - -@train_preference_comparisons_pebble_ex.main -def train_preference_comparisons( - total_timesteps: int, - total_comparisons: int, - num_iterations: int, - comparison_queue_size: Optional[int], - fragment_length: int, - transition_oversampling: float, - initial_comparison_frac: float, - exploration_frac: float, - trajectory_path: Optional[str], - trajectory_generator_kwargs: Mapping[str, Any], - save_preferences: bool, - agent_path: Optional[str], - preference_model_kwargs: Mapping[str, Any], - reward_trainer_kwargs: Mapping[str, Any], - gatherer_cls: Type[preference_comparisons.PreferenceGatherer], - gatherer_kwargs: Mapping[str, Any], - active_selection: bool, - active_selection_oversampling: int, - uncertainty_on: str, - fragmenter_kwargs: Mapping[str, Any], - allow_variable_horizon: bool, - checkpoint_interval: int, - query_schedule: Union[str, type_aliases.Schedule], -) -> Mapping[str, Any]: - """Train a reward model using preference comparisons. - - Args: - total_timesteps: number of environment interaction steps - total_comparisons: number of preferences to gather in total - num_iterations: number of times to train the agent against the reward model - and then train the reward model against newly gathered preferences. - comparison_queue_size: the maximum number of comparisons to keep in the - queue for training the reward model. If None, the queue will grow - without bound as new comparisons are added. - fragment_length: number of timesteps per fragment that is used to elicit - preferences - transition_oversampling: factor by which to oversample transitions before - creating fragments. Since fragments are sampled with replacement, - this is usually chosen > 1 to avoid having the same transition - in too many fragments. - initial_comparison_frac: fraction of total_comparisons that will be - sampled before the rest of training begins (using the randomly initialized - agent). This can be used to pretrain the reward model before the agent - is trained on the learned reward. - exploration_frac: fraction of trajectory samples that will be created using - partially random actions, rather than the current policy. Might be helpful - if the learned policy explores too little and gets stuck with a wrong - reward. - trajectory_path: either None, in which case an agent will be trained - and used to sample trajectories on the fly, or a path to a pickled - sequence of TrajectoryWithRew to be trained on. - trajectory_generator_kwargs: kwargs to pass to the trajectory generator. - save_preferences: if True, store the final dataset of preferences to disk. - agent_path: if given, initialize the agent using this stored policy - rather than randomly. - preference_model_kwargs: passed to PreferenceModel - reward_trainer_kwargs: passed to BasicRewardTrainer or EnsembleRewardTrainer - gatherer_cls: type of PreferenceGatherer to use (defaults to SyntheticGatherer) - gatherer_kwargs: passed to the PreferenceGatherer specified by gatherer_cls - active_selection: use active selection fragmenter instead of random fragmenter - active_selection_oversampling: factor by which to oversample random fragments - from the base fragmenter of active selection. 
- this is usually chosen > 1 to allow the active selection algorithm to pick - fragment pairs with highest uncertainty. = 1 implies no active selection. - uncertainty_on: passed to ActiveSelectionFragmenter - fragmenter_kwargs: passed to RandomFragmenter - allow_variable_horizon: If False (default), algorithm will raise an - exception if it detects trajectories of different length during - training. If True, overrides this safety check. WARNING: variable - horizon episodes leak information about the reward via termination - condition, and can seriously confound evaluation. Read - https://imitation.readthedocs.io/en/latest/guide/variable_horizon.html - before overriding this. - checkpoint_interval: Save the reward model and policy models (if - trajectory_generator contains a policy) every `checkpoint_interval` - iterations and after training is complete. If 0, then only save weights - after training is complete. If <0, then don't save weights at all. - query_schedule: one of ("constant", "hyperbolic", "inverse_quadratic"). - A function indicating how the total number of preference queries should - be allocated to each iteration. "hyperbolic" and "inverse_quadratic" - apportion fewer queries to later iterations when the policy is assumed - to be better and more stable. - - Returns: - Rollout statistics from trained policy. - - Raises: - ValueError: Inconsistency between config and deserialized policy normalization. - """ - custom_logger, log_dir = common.setup_logging() - rng = common.make_rng() - - with common.make_venv() as venv: - reward_net = reward.make_reward_net(venv) - relabel_reward_fn = functools.partial( - reward_net.predict_processed, - update_stats=False, - ) - if agent_path is None: - agent = rl_common.make_rl_algo(venv, relabel_reward_fn=relabel_reward_fn) - else: - agent = rl_common.load_rl_algo_from_path( - agent_path=agent_path, - venv=venv, - relabel_reward_fn=relabel_reward_fn, - ) - - if trajectory_path is None: - # Setting the logger here is not necessary (PreferenceComparisons takes care - # of it automatically) but it avoids creating unnecessary loggers. - agent_trainer = preference_comparisons.AgentTrainer( - algorithm=agent, - reward_fn=reward_net, - venv=venv, - exploration_frac=exploration_frac, - rng=rng, - custom_logger=custom_logger, - **trajectory_generator_kwargs, - ) - # Stable Baselines will automatically occupy GPU 0 if it is available. - # Let's use the same device as the SB3 agent for the reward model. 
- reward_net = reward_net.to(agent_trainer.algorithm.device) - trajectory_generator: preference_comparisons.TrajectoryGenerator = ( - agent_trainer - ) - else: - if exploration_frac > 0: - raise ValueError( - "exploration_frac can't be set when a trajectory dataset is used", - ) - trajectory_generator = preference_comparisons.TrajectoryDataset( - trajectories=types.load_with_rewards(trajectory_path), - rng=rng, - custom_logger=custom_logger, - **trajectory_generator_kwargs, - ) - - fragmenter: preference_comparisons.Fragmenter = ( - preference_comparisons.RandomFragmenter( - **fragmenter_kwargs, - rng=rng, - custom_logger=custom_logger, - ) - ) - preference_model = preference_comparisons.PreferenceModel( - **preference_model_kwargs, - model=reward_net, - ) - if active_selection: - fragmenter = preference_comparisons.ActiveSelectionFragmenter( - preference_model=preference_model, - base_fragmenter=fragmenter, - fragment_sample_factor=active_selection_oversampling, - uncertainty_on=uncertainty_on, - custom_logger=custom_logger, - ) - gatherer = gatherer_cls( - **gatherer_kwargs, - rng=rng, - custom_logger=custom_logger, - ) - - loss = preference_comparisons.CrossEntropyRewardLoss() - - reward_trainer = preference_comparisons._make_reward_trainer( - preference_model, - loss, - rng, - reward_trainer_kwargs, - ) - - main_trainer = preference_comparisons.PreferenceComparisons( - trajectory_generator, - reward_net, - num_iterations=num_iterations, - fragmenter=fragmenter, - preference_gatherer=gatherer, - reward_trainer=reward_trainer, - comparison_queue_size=comparison_queue_size, - fragment_length=fragment_length, - transition_oversampling=transition_oversampling, - initial_comparison_frac=initial_comparison_frac, - custom_logger=custom_logger, - allow_variable_horizon=allow_variable_horizon, - query_schedule=query_schedule, - ) - - def save_callback(iteration_num): - if checkpoint_interval > 0 and iteration_num % checkpoint_interval == 0: - save_checkpoint( - trainer=main_trainer, - save_path=log_dir / "checkpoints" / f"{iteration_num:04d}", - allow_save_policy=bool(trajectory_path is None), - ) - - results = main_trainer.train( - total_timesteps, - total_comparisons, - callback=save_callback, - ) - - # Storing and evaluating policy only useful if we generated trajectory data - if bool(trajectory_path is None): - results = dict(results) - results["rollout"] = train.eval_policy(agent, venv) - - if save_preferences: - main_trainer.dataset.save(log_dir / "preferences.pkl") - - # Save final artifacts. 
- if checkpoint_interval >= 0: - save_checkpoint( - trainer=main_trainer, - save_path=log_dir / "checkpoints" / "final", - allow_save_policy=bool(trajectory_path is None), - ) - - return results - - -def main_console(): - observer_path = ( - pathlib.Path.cwd() / "output" / "sacred" / "train_preference_comparisons_pebble" - ) - observer = FileStorageObserver(observer_path) - train_preference_comparisons_pebble_ex.observers.append(observer) - train_preference_comparisons_pebble_ex.run_commandline() - - -if __name__ == "__main__": # pragma: no cover - main_console() From 6f05b1d5d5f9b1c63ca0d2f996b759a72992fc00 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 23:04:26 +0100 Subject: [PATCH 37/55] #625 fix pre-commit errors --- .../algorithms/pebble/entropy_reward.py | 41 +++++++++++-------- .../algorithms/preference_comparisons.py | 26 ++++++++---- .../policies/replay_buffer_wrapper.py | 20 ++++----- src/imitation/rewards/reward_function.py | 7 +++- src/imitation/scripts/common/rl.py | 3 +- .../config/train_preference_comparisons.py | 2 + .../scripts/train_preference_comparisons.py | 12 +++--- .../algorithms/pebble/test_entropy_reward.py | 29 +++++++++---- 8 files changed, 88 insertions(+), 52 deletions(-) diff --git a/src/imitation/algorithms/pebble/entropy_reward.py b/src/imitation/algorithms/pebble/entropy_reward.py index e0d94c171..7570d369f 100644 --- a/src/imitation/algorithms/pebble/entropy_reward.py +++ b/src/imitation/algorithms/pebble/entropy_reward.py @@ -1,12 +1,14 @@ +"""Reward function for the PEBBLE training algorithm.""" + from enum import Enum, auto -from typing import Tuple +from typing import Dict, Optional, Tuple, Union import numpy as np import torch as th from imitation.policies.replay_buffer_wrapper import ( - ReplayBufferView, ReplayBufferRewardWrapper, + ReplayBufferView, ) from imitation.rewards.reward_function import ReplayBufferAwareRewardFn, RewardFn from imitation.util import util @@ -14,16 +16,16 @@ class PebbleRewardPhase(Enum): - """States representing different behaviors for PebbleStateEntropyReward""" + """States representing different behaviors for PebbleStateEntropyReward.""" UNSUPERVISED_EXPLORATION = auto() # Entropy based reward POLICY_AND_REWARD_LEARNING = auto() # Learned reward class PebbleStateEntropyReward(ReplayBufferAwareRewardFn): - """ - Reward function for implementation of the PEBBLE learning algorithm - (https://arxiv.org/pdf/2106.05091.pdf). + """Reward function for implementation of the PEBBLE learning algorithm. + + See https://arxiv.org/pdf/2106.05091.pdf . The rewards returned by this function go through the three phases: 1. Before enough samples are collected for entropy calculation, the @@ -38,33 +40,38 @@ class PebbleStateEntropyReward(ReplayBufferAwareRewardFn): supplied with set_replay_buffer() or on_replay_buffer_initialized(). To transition to the last phase, unsupervised_exploration_finish() needs to be called. - - Args: - learned_reward_fn: The learned reward function used after unsupervised - exploration is finished - nearest_neighbor_k: Parameter for entropy computation (see - compute_state_entropy()) """ - # TODO #625: parametrize nearest_neighbor_k def __init__( self, learned_reward_fn: RewardFn, nearest_neighbor_k: int = 5, ): + """Builds this class. 
+ + Args: + learned_reward_fn: The learned reward function used after unsupervised + exploration is finished + nearest_neighbor_k: Parameter for entropy computation (see + compute_state_entropy()) + """ self.learned_reward_fn = learned_reward_fn self.nearest_neighbor_k = nearest_neighbor_k self.entropy_stats = RunningNorm(1) self.state = PebbleRewardPhase.UNSUPERVISED_EXPLORATION # These two need to be set with set_replay_buffer(): - self.replay_buffer_view = None - self.obs_shape = None + self.replay_buffer_view: Optional[ReplayBufferView] = None + self.obs_shape: Union[Tuple[int, ...], Dict[str, Tuple[int, ...]], None] = None def on_replay_buffer_initialized(self, replay_buffer: ReplayBufferRewardWrapper): self.set_replay_buffer(replay_buffer.buffer_view, replay_buffer.obs_shape) - def set_replay_buffer(self, replay_buffer: ReplayBufferView, obs_shape: Tuple): + def set_replay_buffer( + self, + replay_buffer: ReplayBufferView, + obs_shape: Union[Tuple[int, ...], Dict[str, Tuple[int, ...]]], + ): self.replay_buffer_view = replay_buffer self.obs_shape = obs_shape @@ -87,7 +94,7 @@ def __call__( def _entropy_reward(self, state, action, next_state, done): if self.replay_buffer_view is None: raise ValueError( - "Replay buffer must be supplied before entropy reward can be used" + "Replay buffer must be supplied before entropy reward can be used", ) all_observations = self.replay_buffer_view.observations # ReplayBuffer sampling flattens the venv dimension, let's adapt to that diff --git a/src/imitation/algorithms/preference_comparisons.py b/src/imitation/algorithms/preference_comparisons.py index fe5dc472e..fade985b4 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -77,8 +77,7 @@ def sample(self, steps: int) -> Sequence[TrajectoryWithRew]: """ # noqa: DAR202 def unsupervised_pretrain(self, steps: int, **kwargs: Any) -> None: - """Pre-train an agent if the trajectory generator uses one that - needs pre-training. + """Pre-train an agent before collecting comparisons. By default, this method does nothing and doesn't need to be overridden in subclasses that don't require pre-training. @@ -331,8 +330,8 @@ def logger(self, value: imit_logger.HierarchicalLogger) -> None: class PebbleAgentTrainer(AgentTrainer): - """ - Specialization of AgentTrainer for PEBBLE training. + """Specialization of AgentTrainer for PEBBLE training. + Includes unsupervised pretraining with an entropy based reward function. """ @@ -344,9 +343,20 @@ def __init__( reward_fn: PebbleStateEntropyReward, **kwargs, ) -> None: + """Builds PebbleAgentTrainer. + + Args: + reward_fn: Pebble reward function + **kwargs: additional keyword arguments to pass on to + the parent class + + Raises: + ValueError: Unexpected type of reward_fn given. 
+ """ if not isinstance(reward_fn, PebbleStateEntropyReward): raise ValueError( - f"{self.__class__.__name__} expects {PebbleStateEntropyReward.__name__} reward function" + f"{self.__class__.__name__} expects " + f"{PebbleStateEntropyReward.__name__} reward function", ) super().__init__(reward_fn=reward_fn, **kwargs) @@ -1731,10 +1741,10 @@ def train( ################################################### with self.logger.accumulate_means("agent"): self.logger.log( - f"Pre-training agent for {unsupervised_pretrain_timesteps} timesteps" + f"Pre-training agent for {unsupervised_pretrain_timesteps} timesteps", ) self.trajectory_generator.unsupervised_pretrain( - unsupervised_pretrain_timesteps + unsupervised_pretrain_timesteps, ) for i, num_pairs in enumerate(preference_query_schedule): @@ -1813,7 +1823,7 @@ def _preference_gather_schedule(self, total_comparisons): def _compute_timesteps(self, total_timesteps: int) -> Tuple[int, int, int]: unsupervised_pretrain_timesteps = int( - total_timesteps * self.unsupervised_agent_pretrain_frac + total_timesteps * self.unsupervised_agent_pretrain_frac, ) timesteps_per_iteration, extra_timesteps = divmod( total_timesteps - unsupervised_pretrain_timesteps, diff --git a/src/imitation/policies/replay_buffer_wrapper.py b/src/imitation/policies/replay_buffer_wrapper.py index 297a6b008..414b421f5 100644 --- a/src/imitation/policies/replay_buffer_wrapper.py +++ b/src/imitation/policies/replay_buffer_wrapper.py @@ -7,7 +7,7 @@ from stable_baselines3.common.buffers import ReplayBuffer from stable_baselines3.common.type_aliases import ReplayBufferSamples -from imitation.rewards.reward_function import RewardFn, ReplayBufferAwareRewardFn +from imitation.rewards.reward_function import ReplayBufferAwareRewardFn, RewardFn from imitation.util import util @@ -24,19 +24,20 @@ def _samples_to_reward_fn_input( class ReplayBufferView: - """A read-only view over a valid records in a ReplayBuffer. - - Args: - observations_buffer: Array buffer holding observations - buffer_slice_provider: Function returning slice of buffer - with valid observations - """ + """A read-only view over a valid records in a ReplayBuffer.""" def __init__( self, observations_buffer: np.ndarray, buffer_slice_provider: Callable[[], slice], ): + """Builds ReplayBufferView. + + Args: + observations_buffer: Array buffer holding observations + buffer_slice_provider: Function returning slice of buffer + with valid observations + """ self._observations_buffer_view = observations_buffer.view() self._observations_buffer_view.flags.writeable = False self._buffer_slice_provider = buffer_slice_provider @@ -67,9 +68,6 @@ def __init__( action_space: Action space replay_buffer_class: Class of the replay buffer. reward_fn: Reward function for reward relabeling. - on_initialized_callback: Callback called with reference to this object after - this instance is fully initialized. This provides a hook to access the - buffer after it is created from inside a Stable Baselines algorithm. **kwargs: keyword arguments for ReplayBuffer. 
""" # Note(yawen-d): we directly inherit ReplayBuffer and leave out the case of diff --git a/src/imitation/rewards/reward_function.py b/src/imitation/rewards/reward_function.py index e9d7bed30..00b1da958 100644 --- a/src/imitation/rewards/reward_function.py +++ b/src/imitation/rewards/reward_function.py @@ -35,6 +35,11 @@ def __call__( class ReplayBufferAwareRewardFn(RewardFn, abc.ABC): + """Abstract class for a reward function that needs access to a replay buffer.""" + @abc.abstractmethod - def on_replay_buffer_initialized(self, replay_buffer: "ReplayBufferRewardWrapper"): + def on_replay_buffer_initialized( + self, + replay_buffer: "ReplayBufferRewardWrapper", # type: ignore[name-defined] + ): pass diff --git a/src/imitation/scripts/common/rl.py b/src/imitation/scripts/common/rl.py index e879bbaf8..d71e35211 100644 --- a/src/imitation/scripts/common/rl.py +++ b/src/imitation/scripts/common/rl.py @@ -89,7 +89,8 @@ def _maybe_add_relabel_buffer( _buffer_kwargs = dict( reward_fn=relabel_reward_fn, replay_buffer_class=rl_kwargs.get( - "replay_buffer_class", buffers.ReplayBuffer + "replay_buffer_class", + buffers.ReplayBuffer, ), ) rl_kwargs["replay_buffer_class"] = ReplayBufferRewardWrapper diff --git a/src/imitation/scripts/config/train_preference_comparisons.py b/src/imitation/scripts/config/train_preference_comparisons.py index ca0e786ff..9876ee952 100644 --- a/src/imitation/scripts/config/train_preference_comparisons.py +++ b/src/imitation/scripts/config/train_preference_comparisons.py @@ -60,8 +60,10 @@ def train_defaults(): checkpoint_interval = 0 # Num epochs between saving (<0 disables, =0 final only) query_schedule = "hyperbolic" + # Whether to use the PEBBLE algorithm (https://arxiv.org/pdf/2106.05091.pdf) pebble_enabled = False + unsupervised_agent_pretrain_frac = 0.0 @train_preference_comparisons_ex.named_config diff --git a/src/imitation/scripts/train_preference_comparisons.py b/src/imitation/scripts/train_preference_comparisons.py index c848a6d09..659b47a74 100644 --- a/src/imitation/scripts/train_preference_comparisons.py +++ b/src/imitation/scripts/train_preference_comparisons.py @@ -10,13 +10,13 @@ import numpy as np import torch as th from sacred.observers import FileStorageObserver -from stable_baselines3.common import type_aliases, base_class, vec_env +from stable_baselines3.common import base_class, type_aliases, vec_env from imitation.algorithms import preference_comparisons from imitation.algorithms.pebble.entropy_reward import PebbleStateEntropyReward from imitation.data import types from imitation.policies import serialize -from imitation.rewards import reward_nets, reward_function +from imitation.rewards import reward_function, reward_nets from imitation.scripts.common import common, reward from imitation.scripts.common import rl as rl_common from imitation.scripts.common import train @@ -65,7 +65,7 @@ def make_reward_function( reward_net: reward_nets.RewardNet, *, pebble_enabled: bool = False, - pebble_nearest_neighbor_k: Optional[int] = None, + pebble_nearest_neighbor_k: int = 5, ): relabel_reward_fn = functools.partial( reward_net.predict_processed, @@ -73,7 +73,8 @@ def make_reward_function( ) if pebble_enabled: relabel_reward_fn = PebbleStateEntropyReward( - relabel_reward_fn, pebble_nearest_neighbor_k + relabel_reward_fn, # type: ignore[assignment] + pebble_nearest_neighbor_k, ) return relabel_reward_fn @@ -92,6 +93,7 @@ def make_agent_trajectory_generator( trajectory_generator_kwargs: Mapping[str, Any], ) -> preference_comparisons.AgentTrainer: if 
pebble_enabled: + assert isinstance(relabel_reward_fn, PebbleStateEntropyReward) return preference_comparisons.PebbleAgentTrainer( algorithm=agent, reward_fn=relabel_reward_fn, @@ -138,7 +140,7 @@ def train_preference_comparisons( allow_variable_horizon: bool, checkpoint_interval: int, query_schedule: Union[str, type_aliases.Schedule], - unsupervised_agent_pretrain_frac: Optional[float], + unsupervised_agent_pretrain_frac: float, ) -> Mapping[str, Any]: """Train a reward model using preference comparisons. diff --git a/tests/algorithms/pebble/test_entropy_reward.py b/tests/algorithms/pebble/test_entropy_reward.py index 918222382..84b59107a 100644 --- a/tests/algorithms/pebble/test_entropy_reward.py +++ b/tests/algorithms/pebble/test_entropy_reward.py @@ -1,17 +1,18 @@ +"""Tests for `imitation.algorithms.entropy_reward`.""" + import pickle -from unittest.mock import patch, Mock +from unittest.mock import Mock, patch import numpy as np import torch as th from gym.spaces import Discrete -from stable_baselines3.common.preprocessing import get_obs_shape from imitation.algorithms.pebble.entropy_reward import PebbleStateEntropyReward from imitation.policies.replay_buffer_wrapper import ReplayBufferView from imitation.util import util SPACE = Discrete(4) -OBS_SHAPE = get_obs_shape(SPACE) +OBS_SHAPE = (1,) PLACEHOLDER = np.empty(OBS_SHAPE) BUFFER_SIZE = 20 @@ -25,7 +26,8 @@ def test_pebble_entropy_reward_returns_entropy_for_pretraining(rng): reward_fn = PebbleStateEntropyReward(Mock(), K) reward_fn.set_replay_buffer( - ReplayBufferView(all_observations, lambda: slice(None)), OBS_SHAPE + ReplayBufferView(all_observations, lambda: slice(None)), + OBS_SHAPE, ) # Act @@ -34,17 +36,20 @@ def test_pebble_entropy_reward_returns_entropy_for_pretraining(rng): # Assert expected = util.compute_state_entropy( - observations, all_observations.reshape(-1, *OBS_SHAPE), K + observations, + all_observations.reshape(-1, *OBS_SHAPE), + K, ) expected_normalized = reward_fn.entropy_stats.normalize( - th.as_tensor(expected) + th.as_tensor(expected), ).numpy() np.testing.assert_allclose(reward, expected_normalized) def test_pebble_entropy_reward_returns_normalized_values_for_pretraining(): with patch("imitation.util.util.compute_state_entropy") as m: - # mock entropy computation so that we can test only stats collection in this test + # mock entropy computation so that we can test + # only stats collection in this test m.side_effect = lambda obs, all_obs, k: obs reward_fn = PebbleStateEntropyReward(Mock(), K) @@ -64,7 +69,10 @@ def test_pebble_entropy_reward_returns_normalized_values_for_pretraining(): reward_fn(state, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) normalized_reward = reward_fn( - np.zeros(dim), PLACEHOLDER, PLACEHOLDER, PLACEHOLDER + np.zeros(dim), + PLACEHOLDER, + PLACEHOLDER, + PLACEHOLDER, ) # Assert @@ -91,7 +99,10 @@ def test_pebble_entropy_reward_function_returns_learned_reward_after_pre_trainin # Assert assert reward == expected_reward learned_reward_mock.assert_called_once_with( - observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER + observations, + PLACEHOLDER, + PLACEHOLDER, + PLACEHOLDER, ) From c787877389a87bf7b6c092963062a49fad400a8a Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 23:59:56 +0100 Subject: [PATCH 38/55] #625 add test for pebble agent trainer --- .../algorithms/preference_comparisons.py | 5 ++-- src/imitation/rewards/reward_function.py | 2 +- .../algorithms/test_preference_comparisons.py | 24 ++++++++++++++++++- 3 files changed, 27 insertions(+), 4 deletions(-) 
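The new test in this commit parametrizes over fixture names and resolves them at run time with pytest's request.getfixturevalue, since fixtures cannot be handed to parametrize by value. A minimal, self-contained sketch of that pattern follows; the fixture names here are hypothetical stand-ins, not the ones used in this patch.

import pytest

@pytest.fixture
def plain_trainer():
    # Stand-in for a fixture such as a regular agent trainer.
    return {"kind": "plain"}

@pytest.fixture
def pebble_trainer():
    # Stand-in for a fixture such as a PEBBLE agent trainer.
    return {"kind": "pebble"}

@pytest.mark.parametrize("trainer_fixture", ["plain_trainer", "pebble_trainer"])
def test_trainer_kind_is_set(request, trainer_fixture):
    # Fixture names are passed as strings and looked up via the built-in
    # `request` fixture, which instantiates the named fixture on demand.
    trainer = request.getfixturevalue(trainer_fixture)
    assert trainer["kind"] in {"plain", "pebble"}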
diff --git a/src/imitation/algorithms/preference_comparisons.py b/src/imitation/algorithms/preference_comparisons.py index fade985b4..91c7e55f1 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -45,6 +45,7 @@ from imitation.policies import exploration_wrapper from imitation.regularization import regularizers from imitation.rewards import reward_function, reward_nets, reward_wrapper +from imitation.rewards.reward_function import RewardFn from imitation.util import logger as imit_logger from imitation.util import networks, util @@ -178,7 +179,7 @@ def __init__( reward_fn.action_space, ) reward_fn = reward_fn.predict_processed - self.reward_fn = reward_fn + self.reward_fn: RewardFn = reward_fn self.exploration_frac = exploration_frac self.rng = rng @@ -362,7 +363,7 @@ def __init__( def unsupervised_pretrain(self, steps: int, **kwargs: Any) -> None: self.train(steps, **kwargs) - self.reward_fn.unsupervised_exploration_finish() + self.reward_fn.unsupervised_exploration_finish() # type: ignore[attribute-error] def _get_trajectories( diff --git a/src/imitation/rewards/reward_function.py b/src/imitation/rewards/reward_function.py index 00b1da958..3e85a4fa5 100644 --- a/src/imitation/rewards/reward_function.py +++ b/src/imitation/rewards/reward_function.py @@ -40,6 +40,6 @@ class ReplayBufferAwareRewardFn(RewardFn, abc.ABC): @abc.abstractmethod def on_replay_buffer_initialized( self, - replay_buffer: "ReplayBufferRewardWrapper", # type: ignore[name-defined] + replay_buffer: "ReplayBufferRewardWrapper", # type: ignore[name-defined] # noqa ): pass diff --git a/tests/algorithms/test_preference_comparisons.py b/tests/algorithms/test_preference_comparisons.py index 12727c1c9..3dedc4482 100644 --- a/tests/algorithms/test_preference_comparisons.py +++ b/tests/algorithms/test_preference_comparisons.py @@ -17,8 +17,10 @@ import imitation.testing.reward_nets as testing_reward_nets from imitation.algorithms import preference_comparisons +from imitation.algorithms.pebble.entropy_reward import PebbleStateEntropyReward from imitation.data import types from imitation.data.types import TrajectoryWithRew +from imitation.policies.replay_buffer_wrapper import ReplayBufferView from imitation.regularization import regularizers, updaters from imitation.rewards import reward_nets from imitation.util import networks, util @@ -72,6 +74,23 @@ def agent_trainer(agent, reward_net, venv, rng): return preference_comparisons.AgentTrainer(agent, reward_net, venv, rng) +@pytest.fixture +def replay_buffer(rng): + return ReplayBufferView(rng.random((10, 8, 4)), lambda: slice(None)) + + +@pytest.fixture +def pebble_agent_trainer(agent, reward_net, venv, rng, replay_buffer): + reward_fn = PebbleStateEntropyReward(reward_net.predict_processed) + reward_fn.set_replay_buffer(replay_buffer, (4,)) + return preference_comparisons.PebbleAgentTrainer( + algorithm=agent, + reward_fn=reward_fn, + venv=venv, + rng=rng, + ) + + def assert_info_arrs_equal(arr1, arr2): # pragma: no cover def check_possibly_nested_dicts_equal(dict1, dict2): for key, val1 in dict1.items(): @@ -293,14 +312,17 @@ def build_preference_comparsions(gatherer, reward_trainer, fragmenter, rng): "schedule", ["constant", "hyperbolic", "inverse_quadratic", lambda t: 1 / (1 + t**3)], ) +@pytest.mark.parametrize("agent_fixture", ["agent_trainer", "pebble_agent_trainer"]) def test_trainer_no_crash( - agent_trainer, + request, + agent_fixture, reward_net, random_fragmenter, custom_logger, schedule, rng, ): + 
agent_trainer = request.getfixturevalue(agent_fixture) main_trainer = preference_comparisons.PreferenceComparisons( agent_trainer, reward_net, From b9c5614e3c61ca61b1a9a6882b70ce298eb7fb52 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Fri, 2 Dec 2022 00:15:56 +0100 Subject: [PATCH 39/55] #625 fix more pre-commit errors --- src/imitation/algorithms/pebble/__init__.py | 1 + src/imitation/algorithms/pebble/entropy_reward.py | 2 +- src/imitation/algorithms/preference_comparisons.py | 3 ++- src/imitation/policies/base.py | 2 +- src/imitation/rewards/reward_function.py | 6 +++++- .../scripts/config/train_preference_comparisons.py | 2 +- 6 files changed, 11 insertions(+), 5 deletions(-) create mode 100644 src/imitation/algorithms/pebble/__init__.py diff --git a/src/imitation/algorithms/pebble/__init__.py b/src/imitation/algorithms/pebble/__init__.py new file mode 100644 index 000000000..dca061476 --- /dev/null +++ b/src/imitation/algorithms/pebble/__init__.py @@ -0,0 +1 @@ +"""PEBBLE specific algorithms.""" diff --git a/src/imitation/algorithms/pebble/entropy_reward.py b/src/imitation/algorithms/pebble/entropy_reward.py index 7570d369f..08cf800c8 100644 --- a/src/imitation/algorithms/pebble/entropy_reward.py +++ b/src/imitation/algorithms/pebble/entropy_reward.py @@ -25,7 +25,7 @@ class PebbleRewardPhase(Enum): class PebbleStateEntropyReward(ReplayBufferAwareRewardFn): """Reward function for implementation of the PEBBLE learning algorithm. - See https://arxiv.org/pdf/2106.05091.pdf . + See https://arxiv.org/abs/2106.05091 . The rewards returned by this function go through the three phases: 1. Before enough samples are collected for entropy calculation, the diff --git a/src/imitation/algorithms/preference_comparisons.py b/src/imitation/algorithms/preference_comparisons.py index 91c7e55f1..3374ff136 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -363,7 +363,8 @@ def __init__( def unsupervised_pretrain(self, steps: int, **kwargs: Any) -> None: self.train(steps, **kwargs) - self.reward_fn.unsupervised_exploration_finish() # type: ignore[attribute-error] + fn = self.reward_fn + fn.unsupervised_exploration_finish() # type: ignore[attribute-error] def _get_trajectories( diff --git a/src/imitation/policies/base.py b/src/imitation/policies/base.py index 3101cf2c7..9d455ff15 100644 --- a/src/imitation/policies/base.py +++ b/src/imitation/policies/base.py @@ -76,7 +76,7 @@ class SAC1024Policy(sac_policies.SACPolicy): """Actor and value networks with two hidden layers of 1024 units respectively. This matches the implementation of SAC policies in the PEBBLE paper. See: - https://arxiv.org/pdf/2106.05091.pdf + https://arxiv.org/abs/2106.05091 https://github.com/denisyarats/pytorch_sac/blob/master/config/agent/sac.yaml Note: This differs from stable_baselines3 SACPolicy by having 1024 hidden units diff --git a/src/imitation/rewards/reward_function.py b/src/imitation/rewards/reward_function.py index 3e85a4fa5..69f2f5932 100644 --- a/src/imitation/rewards/reward_function.py +++ b/src/imitation/rewards/reward_function.py @@ -5,6 +5,8 @@ import numpy as np +import imitation.policies.replay_buffer_wrapper + class RewardFn(Protocol): """Abstract class for reward function. 
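RewardFn above is declared as a typing.Protocol, so reward functions satisfy it structurally: any callable with the matching signature type-checks against it without inheriting from it. A small sketch of that idea, using hypothetical names rather than the library's own:

from typing import Protocol

import numpy as np

class ArrayRewardFn(Protocol):
    def __call__(
        self,
        state: np.ndarray,
        action: np.ndarray,
        next_state: np.ndarray,
        done: np.ndarray,
    ) -> np.ndarray:
        ...

def constant_reward(state, action, next_state, done):
    # Matches the protocol purely by signature; no inheritance needed.
    return np.ones(len(state))

fn: ArrayRewardFn = constant_reward
print(fn(np.zeros((2, 3)), np.zeros(2), np.zeros((2, 3)), np.zeros(2)))  # [1. 1.]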
@@ -40,6 +42,8 @@ class ReplayBufferAwareRewardFn(RewardFn, abc.ABC): @abc.abstractmethod def on_replay_buffer_initialized( self, - replay_buffer: "ReplayBufferRewardWrapper", # type: ignore[name-defined] # noqa + replay_buffer: ( + "imitation.policies.replay_buffer_wrapper.ReplayBufferRewardWrapper" + ), ): pass diff --git a/src/imitation/scripts/config/train_preference_comparisons.py b/src/imitation/scripts/config/train_preference_comparisons.py index 9876ee952..0c4ed6411 100644 --- a/src/imitation/scripts/config/train_preference_comparisons.py +++ b/src/imitation/scripts/config/train_preference_comparisons.py @@ -61,7 +61,7 @@ def train_defaults(): checkpoint_interval = 0 # Num epochs between saving (<0 disables, =0 final only) query_schedule = "hyperbolic" - # Whether to use the PEBBLE algorithm (https://arxiv.org/pdf/2106.05091.pdf) + # Whether to use the PEBBLE algorithm (https://arxiv.org/abs/2106.05091) pebble_enabled = False unsupervised_agent_pretrain_frac = 0.0 From 40e73873f193f593f9d360c627217f331ef9859b Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Fri, 2 Dec 2022 01:09:54 +0100 Subject: [PATCH 40/55] #625 fix even more pre-commit errors --- src/imitation/algorithms/pebble/entropy_reward.py | 11 ++++++----- .../algorithms/preference_comparisons.py | 3 +-- src/imitation/policies/replay_buffer_wrapper.py | 15 +++++++++++++-- src/imitation/rewards/reward_function.py | 15 --------------- tests/policies/test_replay_buffer_wrapper.py | 6 ++++-- 5 files changed, 24 insertions(+), 26 deletions(-) diff --git a/src/imitation/algorithms/pebble/entropy_reward.py b/src/imitation/algorithms/pebble/entropy_reward.py index 08cf800c8..8cce6b084 100644 --- a/src/imitation/algorithms/pebble/entropy_reward.py +++ b/src/imitation/algorithms/pebble/entropy_reward.py @@ -7,10 +7,11 @@ import torch as th from imitation.policies.replay_buffer_wrapper import ( + ReplayBufferAwareRewardFn, ReplayBufferRewardWrapper, ReplayBufferView, ) -from imitation.rewards.reward_function import ReplayBufferAwareRewardFn, RewardFn +from imitation.rewards.reward_function import RewardFn from imitation.util import util from imitation.util.networks import RunningNorm @@ -29,12 +30,12 @@ class PebbleStateEntropyReward(ReplayBufferAwareRewardFn): The rewards returned by this function go through the three phases: 1. Before enough samples are collected for entropy calculation, the - underlying function is returned. This shouldn't matter because - OffPolicyAlgorithms have an initialization period for `learning_starts` - timesteps. + underlying function is returned. This shouldn't matter because + OffPolicyAlgorithms have an initialization period for `learning_starts` + timesteps. 2. During the unsupervised exploration phase, entropy based reward is returned 3. After unsupervised exploration phase is finished, the underlying learned - reward is returned. + reward is returned. The second phase requires that a buffer with observations to compare against is supplied with set_replay_buffer() or on_replay_buffer_initialized(). 
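The docstring above describes a reward that switches behaviour between an unsupervised-exploration phase and a later reward-learning phase. Stripped of the entropy computation and buffer handling, the control flow reduces to the sketch below; class and function names are hypothetical and only illustrate the phase-switching pattern.

import enum

import numpy as np

class Phase(enum.Enum):
    EXPLORATION = enum.auto()
    REWARD_LEARNING = enum.auto()

class PhaseSwitchingReward:
    def __init__(self, exploration_fn, learned_fn):
        self.exploration_fn = exploration_fn
        self.learned_fn = learned_fn
        self.phase = Phase.EXPLORATION

    def finish_exploration(self):
        # Called once when unsupervised exploration ends.
        self.phase = Phase.REWARD_LEARNING

    def __call__(self, state, action, next_state, done):
        fn = (
            self.exploration_fn
            if self.phase is Phase.EXPLORATION
            else self.learned_fn
        )
        return fn(state, action, next_state, done)

reward = PhaseSwitchingReward(
    exploration_fn=lambda s, a, ns, d: np.ones(len(s)),  # exploration bonus stand-in
    learned_fn=lambda s, a, ns, d: np.zeros(len(s)),     # learned reward stand-in
)
obs = np.zeros((4, 3))
assert reward(obs, None, None, None).sum() == 4.0
reward.finish_exploration()
assert reward(obs, None, None, None).sum() == 0.0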
diff --git a/src/imitation/algorithms/preference_comparisons.py b/src/imitation/algorithms/preference_comparisons.py index 3374ff136..77d68eff0 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -348,8 +348,7 @@ def __init__( Args: reward_fn: Pebble reward function - **kwargs: additional keyword arguments to pass on to - the parent class + **kwargs: additional keyword arguments to pass on to the parent class Raises: ValueError: Unexpected type of reward_fn given. diff --git a/src/imitation/policies/replay_buffer_wrapper.py b/src/imitation/policies/replay_buffer_wrapper.py index 414b421f5..b7a67a1c1 100644 --- a/src/imitation/policies/replay_buffer_wrapper.py +++ b/src/imitation/policies/replay_buffer_wrapper.py @@ -1,5 +1,5 @@ """Wrapper for reward labeling for transitions sampled from a replay buffer.""" - +import abc from typing import Callable, Mapping, Type import numpy as np @@ -7,7 +7,7 @@ from stable_baselines3.common.buffers import ReplayBuffer from stable_baselines3.common.type_aliases import ReplayBufferSamples -from imitation.rewards.reward_function import ReplayBufferAwareRewardFn, RewardFn +from imitation.rewards.reward_function import RewardFn from imitation.util import util @@ -134,3 +134,14 @@ def _get_samples(self): "_get_samples() is intentionally not implemented." "This method should not be called.", ) + + +class ReplayBufferAwareRewardFn(RewardFn, abc.ABC): + """Abstract class for a reward function that needs access to a replay buffer.""" + + @abc.abstractmethod + def on_replay_buffer_initialized( + self, + replay_buffer: ReplayBufferRewardWrapper, + ): + pass diff --git a/src/imitation/rewards/reward_function.py b/src/imitation/rewards/reward_function.py index 69f2f5932..93761752d 100644 --- a/src/imitation/rewards/reward_function.py +++ b/src/imitation/rewards/reward_function.py @@ -5,8 +5,6 @@ import numpy as np -import imitation.policies.replay_buffer_wrapper - class RewardFn(Protocol): """Abstract class for reward function. @@ -34,16 +32,3 @@ def __call__( Returns: Computed rewards of shape `(batch_size,`). 
""" # noqa: DAR202 - - -class ReplayBufferAwareRewardFn(RewardFn, abc.ABC): - """Abstract class for a reward function that needs access to a replay buffer.""" - - @abc.abstractmethod - def on_replay_buffer_initialized( - self, - replay_buffer: ( - "imitation.policies.replay_buffer_wrapper.ReplayBufferRewardWrapper" - ), - ): - pass diff --git a/tests/policies/test_replay_buffer_wrapper.py b/tests/policies/test_replay_buffer_wrapper.py index 02bb72ce2..7c26dd2d4 100644 --- a/tests/policies/test_replay_buffer_wrapper.py +++ b/tests/policies/test_replay_buffer_wrapper.py @@ -16,8 +16,10 @@ from stable_baselines3.common.preprocessing import get_action_dim, get_obs_shape from stable_baselines3.common.save_util import load_from_pkl -from imitation.policies.replay_buffer_wrapper import ReplayBufferRewardWrapper -from imitation.rewards.reward_function import ReplayBufferAwareRewardFn +from imitation.policies.replay_buffer_wrapper import ( + ReplayBufferAwareRewardFn, + ReplayBufferRewardWrapper, +) from imitation.util import util From aad2e7cb324164af15ddcff79af7c90e8075a6b2 Mon Sep 17 00:00:00 2001 From: Mifeet Date: Fri, 2 Dec 2022 12:39:29 +0100 Subject: [PATCH 41/55] code review - Update src/imitation/policies/replay_buffer_wrapper.py Co-authored-by: Adam Gleave --- src/imitation/policies/replay_buffer_wrapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/imitation/policies/replay_buffer_wrapper.py b/src/imitation/policies/replay_buffer_wrapper.py index b7a67a1c1..255e01f3b 100644 --- a/src/imitation/policies/replay_buffer_wrapper.py +++ b/src/imitation/policies/replay_buffer_wrapper.py @@ -24,7 +24,7 @@ def _samples_to_reward_fn_input( class ReplayBufferView: - """A read-only view over a valid records in a ReplayBuffer.""" + """A read-only view over valid records in a ReplayBuffer.""" def __init__( self, From e0aea610862c43bdcfd39cdad5d0cf93c4bc6172 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Fri, 2 Dec 2022 23:04:56 +0100 Subject: [PATCH 42/55] #625 code review --- src/imitation/algorithms/pebble/entropy_reward.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/imitation/algorithms/pebble/entropy_reward.py b/src/imitation/algorithms/pebble/entropy_reward.py index 8cce6b084..ba844c682 100644 --- a/src/imitation/algorithms/pebble/entropy_reward.py +++ b/src/imitation/algorithms/pebble/entropy_reward.py @@ -1,6 +1,6 @@ """Reward function for the PEBBLE training algorithm.""" -from enum import Enum, auto +import enum from typing import Dict, Optional, Tuple, Union import numpy as np @@ -16,11 +16,11 @@ from imitation.util.networks import RunningNorm -class PebbleRewardPhase(Enum): +class PebbleRewardPhase(enum.Enum): """States representing different behaviors for PebbleStateEntropyReward.""" - UNSUPERVISED_EXPLORATION = auto() # Entropy based reward - POLICY_AND_REWARD_LEARNING = auto() # Learned reward + UNSUPERVISED_EXPLORATION = enum.auto() # Entropy based reward + POLICY_AND_REWARD_LEARNING = enum.auto() # Learned reward class PebbleStateEntropyReward(ReplayBufferAwareRewardFn): From f0a3359f15cadac9bb89c950641e46e5daca9df7 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Fri, 2 Dec 2022 23:56:10 +0100 Subject: [PATCH 43/55] #625 code review: do not allocate timesteps for pretraining if there is no pretraining --- .../algorithms/preference_comparisons.py | 32 ++++++++++++++++--- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/src/imitation/algorithms/preference_comparisons.py 
b/src/imitation/algorithms/preference_comparisons.py index 77d68eff0..ba44e338c 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -77,17 +77,36 @@ def sample(self, steps: int) -> Sequence[TrajectoryWithRew]: be the environment rewards, not ones from a reward model). """ # noqa: DAR202 + @property + def has_pretraining(self) -> bool: + """Indicates whether this generator has a pre-training phase. + + The value can be used, e.g., when allocating time-steps for pre-training. + + By default, True is returned if the unsupervised_pretrain() method is not + overriden, bud subclasses may choose to override this behavior. + """ + orig_impl = TrajectoryGenerator.unsupervised_pretrain + return type(self).unsupervised_pretrain != orig_impl + def unsupervised_pretrain(self, steps: int, **kwargs: Any) -> None: """Pre-train an agent before collecting comparisons. - By default, this method does nothing and doesn't need - to be overridden in subclasses that don't require pre-training. + By default, this method asserts that pre-training has zero steps allocated. + Override this behavior in subclasses that implement pre-training. Args: steps: number of environment steps to train for. **kwargs: additional keyword arguments to pass on to the training procedure. """ + if steps > 0: + self._logger.warn( + f"{steps} timesteps allocated for unsupervised pre-training:" + " Trajectory generators without pre-training implementation should" + " not consume any timesteps (otherwise the total number of" + " timesteps executed may be misleading)" + ) def train(self, steps: int, **kwargs: Any) -> None: """Train an agent if the trajectory generator uses one. @@ -1823,9 +1842,12 @@ def _preference_gather_schedule(self, total_comparisons): return schedule def _compute_timesteps(self, total_timesteps: int) -> Tuple[int, int, int]: - unsupervised_pretrain_timesteps = int( - total_timesteps * self.unsupervised_agent_pretrain_frac, - ) + if self.trajectory_generator.has_pretraining: + unsupervised_pretrain_timesteps = int( + total_timesteps * self.unsupervised_agent_pretrain_frac, + ) + else: + unsupervised_pretrain_timesteps = 0 timesteps_per_iteration, extra_timesteps = divmod( total_timesteps - unsupervised_pretrain_timesteps, self.num_iterations, From 8cb244979e3ffd251a81790f2dc425eac5ffb565 Mon Sep 17 00:00:00 2001 From: Mifeet Date: Sat, 3 Dec 2022 00:01:47 +0100 Subject: [PATCH 44/55] Update src/imitation/algorithms/preference_comparisons.py Co-authored-by: Adam Gleave --- src/imitation/algorithms/preference_comparisons.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/imitation/algorithms/preference_comparisons.py b/src/imitation/algorithms/preference_comparisons.py index ba44e338c..03f1bc25c 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -1746,7 +1746,7 @@ def train( """ # Compute the number of comparisons to request at each iteration in advance. 
preference_query_schedule = self._preference_gather_schedule(total_comparisons) - print(f"Query schedule: {preference_query_schedule}") + self.logger.log(f"Query schedule: {preference_query_schedule}") ( unsupervised_pretrain_timesteps, From 378baa86eb54c4cded4cbd99e60c1dd928efa2dd Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Fri, 2 Dec 2022 23:59:38 +0100 Subject: [PATCH 45/55] #625 code review: remove ignore --- src/imitation/algorithms/preference_comparisons.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/imitation/algorithms/preference_comparisons.py b/src/imitation/algorithms/preference_comparisons.py index 03f1bc25c..cc3164182 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -382,7 +382,7 @@ def __init__( def unsupervised_pretrain(self, steps: int, **kwargs: Any) -> None: self.train(steps, **kwargs) fn = self.reward_fn - fn.unsupervised_exploration_finish() # type: ignore[attribute-error] + fn.unsupervised_exploration_finish() def _get_trajectories( From d7ad4145f48c7995e24f8226e22da9260bba8744 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Sat, 3 Dec 2022 00:03:20 +0100 Subject: [PATCH 46/55] #625 code review - skip pretrainining if zero timesteps --- .../algorithms/preference_comparisons.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/imitation/algorithms/preference_comparisons.py b/src/imitation/algorithms/preference_comparisons.py index cc3164182..c3a77e579 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -1749,7 +1749,7 @@ def train( self.logger.log(f"Query schedule: {preference_query_schedule}") ( - unsupervised_pretrain_timesteps, + unsup_pretrain_timesteps, timesteps_per_iteration, extra_timesteps, ) = self._compute_timesteps(total_timesteps) @@ -1759,13 +1759,14 @@ def train( ################################################### # Pre-training agent before gathering preferences # ################################################### - with self.logger.accumulate_means("agent"): - self.logger.log( - f"Pre-training agent for {unsupervised_pretrain_timesteps} timesteps", - ) - self.trajectory_generator.unsupervised_pretrain( - unsupervised_pretrain_timesteps, - ) + if unsup_pretrain_timesteps: + with self.logger.accumulate_means("agent"): + self.logger.log( + f"Pre-training agent for {unsup_pretrain_timesteps} timesteps", + ) + self.trajectory_generator.unsupervised_pretrain( + unsup_pretrain_timesteps, + ) for i, num_pairs in enumerate(preference_query_schedule): ########################## From 412550de84f1b75c154ea1573aa0acaa7d7a5748 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Sat, 3 Dec 2022 00:23:29 +0100 Subject: [PATCH 47/55] #625 code review: separate pebble and environment configuration --- .../scripts/config/train_preference_comparisons.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/imitation/scripts/config/train_preference_comparisons.py b/src/imitation/scripts/config/train_preference_comparisons.py index 0c4ed6411..f01a7d6c0 100644 --- a/src/imitation/scripts/config/train_preference_comparisons.py +++ b/src/imitation/scripts/config/train_preference_comparisons.py @@ -81,8 +81,6 @@ def pebble(): train = { "policy_cls": base.SAC1024Policy, # noqa: F841 } - common = {"env_name": "MountainCarContinuous-v0"} - allow_variable_horizon = True locals() # quieten flake8 @@ -141,6 +139,13 @@ def 
seals_mountain_car(): common = dict(env_name="seals/MountainCar-v0") +@train_preference_comparisons_ex.named_config +def mountain_car_continuous(): + common = {"env_name": "MountainCarContinuous-v0"} + allow_variable_horizon = True + locals() # quieten flake8 + + @train_preference_comparisons_ex.named_config def fast(): # Minimize the amount of computation. Useful for test cases. From 7c3470e584c5bf02903620f5c119fca0bd5afa8c Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Sat, 3 Dec 2022 00:25:29 +0100 Subject: [PATCH 48/55] #625 fix even even more pre-commit errors --- src/imitation/algorithms/preference_comparisons.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/imitation/algorithms/preference_comparisons.py b/src/imitation/algorithms/preference_comparisons.py index c3a77e579..ec1816143 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -85,6 +85,9 @@ def has_pretraining(self) -> bool: By default, True is returned if the unsupervised_pretrain() method is not overriden, bud subclasses may choose to override this behavior. + + Returns: + True if this generator has a pre-training phase, False otherwise """ orig_impl = TrajectoryGenerator.unsupervised_pretrain return type(self).unsupervised_pretrain != orig_impl @@ -105,7 +108,7 @@ def unsupervised_pretrain(self, steps: int, **kwargs: Any) -> None: f"{steps} timesteps allocated for unsupervised pre-training:" " Trajectory generators without pre-training implementation should" " not consume any timesteps (otherwise the total number of" - " timesteps executed may be misleading)" + " timesteps executed may be misleading)", ) def train(self, steps: int, **kwargs: Any) -> None: From 73b1e36ff968885d59ecbb804521ecfe90ad5fc1 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Sat, 3 Dec 2022 00:34:07 +0100 Subject: [PATCH 49/55] #625 fix even even more pre-commit errors --- .../algorithms/preference_comparisons.py | 3 ++- src/imitation/util/util.py | 23 +++++++++++-------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/src/imitation/algorithms/preference_comparisons.py b/src/imitation/algorithms/preference_comparisons.py index ec1816143..411dc8c65 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -84,7 +84,7 @@ def has_pretraining(self) -> bool: The value can be used, e.g., when allocating time-steps for pre-training. By default, True is returned if the unsupervised_pretrain() method is not - overriden, bud subclasses may choose to override this behavior. + overridden, bud subclasses may choose to override this behavior. Returns: True if this generator has a pre-training phase, False otherwise @@ -385,6 +385,7 @@ def __init__( def unsupervised_pretrain(self, steps: int, **kwargs: Any) -> None: self.train(steps, **kwargs) fn = self.reward_fn + assert isinstance(fn, PebbleStateEntropyReward) fn.unsupervised_exploration_finish() diff --git a/src/imitation/util/util.py b/src/imitation/util/util.py index 9bf1c1a40..cef2e6f38 100644 --- a/src/imitation/util/util.py +++ b/src/imitation/util/util.py @@ -377,16 +377,19 @@ def compute_state_entropy( A tensor containing the state entropy for `obs`. 
""" assert obs.shape[1:] == all_obs.shape[1:] + batch_size = 500 with th.no_grad(): non_batch_dimensions = tuple(range(2, len(obs.shape) + 1)) - distances_tensor = th.linalg.vector_norm( - obs[:, None] - all_obs[None, :], - dim=non_batch_dimensions, - ord=2, - ) - - # Note that we take the k+1'th value because the closest neighbor to - # a point is itself, which we want to skip. - assert distances_tensor.shape[-1] > k - knn_dists = th.kthvalue(distances_tensor, k=k + 1, dim=1).values + dists: List[th.Tensor] = [] + for idx in range(len(all_obs) // batch_size + 1): + start = idx * batch_size + end = (idx + 1) * batch_size + distances_tensor = th.linalg.vector_norm( + obs[:, None] - all_obs[None, start:end], + dim=non_batch_dimensions, + ord=2, + ) + dists.append(distances_tensor) + all_dists = th.cat(dists, dim=1) + knn_dists = th.kthvalue(all_dists, k=k + 1, dim=1).values return knn_dists From 6daa4732c63081a5f90ca91606bb16bff5e9c87e Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 8 Dec 2022 00:24:19 +0100 Subject: [PATCH 50/55] #641 code review: remove set_replay_buffer --- .../algorithms/pebble/entropy_reward.py | 13 ++++----- .../algorithms/pebble/test_entropy_reward.py | 29 ++++++++++++++----- .../algorithms/test_preference_comparisons.py | 6 +++- 3 files changed, 31 insertions(+), 17 deletions(-) diff --git a/src/imitation/algorithms/pebble/entropy_reward.py b/src/imitation/algorithms/pebble/entropy_reward.py index ba844c682..9e7958fa6 100644 --- a/src/imitation/algorithms/pebble/entropy_reward.py +++ b/src/imitation/algorithms/pebble/entropy_reward.py @@ -58,6 +58,7 @@ def __init__( """ self.learned_reward_fn = learned_reward_fn self.nearest_neighbor_k = nearest_neighbor_k + self.entropy_stats = RunningNorm(1) self.state = PebbleRewardPhase.UNSUPERVISED_EXPLORATION @@ -66,15 +67,9 @@ def __init__( self.obs_shape: Union[Tuple[int, ...], Dict[str, Tuple[int, ...]], None] = None def on_replay_buffer_initialized(self, replay_buffer: ReplayBufferRewardWrapper): - self.set_replay_buffer(replay_buffer.buffer_view, replay_buffer.obs_shape) + self.replay_buffer_view = replay_buffer.buffer_view + self.obs_shape = replay_buffer.obs_shape - def set_replay_buffer( - self, - replay_buffer: ReplayBufferView, - obs_shape: Union[Tuple[int, ...], Dict[str, Tuple[int, ...]]], - ): - self.replay_buffer_view = replay_buffer - self.obs_shape = obs_shape def unsupervised_exploration_finish(self): assert self.state == PebbleRewardPhase.UNSUPERVISED_EXPLORATION @@ -112,7 +107,9 @@ def _entropy_reward(self, state, action, next_state, done): th.tensor(all_observations), self.nearest_neighbor_k, ) + normalized_entropies = self.entropy_stats.forward(entropies) + return normalized_entropies.numpy() def __getstate__(self): diff --git a/tests/algorithms/pebble/test_entropy_reward.py b/tests/algorithms/pebble/test_entropy_reward.py index 84b59107a..bc020e86c 100644 --- a/tests/algorithms/pebble/test_entropy_reward.py +++ b/tests/algorithms/pebble/test_entropy_reward.py @@ -25,9 +25,11 @@ def test_pebble_entropy_reward_returns_entropy_for_pretraining(rng): all_observations = rng.random((BUFFER_SIZE, VENVS, *OBS_SHAPE)) reward_fn = PebbleStateEntropyReward(Mock(), K) - reward_fn.set_replay_buffer( - ReplayBufferView(all_observations, lambda: slice(None)), - OBS_SHAPE, + reward_fn.on_replay_buffer_initialized( + replay_buffer_mock( + ReplayBufferView(all_observations, lambda: slice(None)), + OBS_SHAPE, + ) ) # Act @@ -54,9 +56,11 @@ def test_pebble_entropy_reward_returns_normalized_values_for_pretraining(): 
reward_fn = PebbleStateEntropyReward(Mock(), K) all_observations = np.empty((BUFFER_SIZE, VENVS, *OBS_SHAPE)) - reward_fn.set_replay_buffer( - ReplayBufferView(all_observations, lambda: slice(None)), - OBS_SHAPE, + reward_fn.on_replay_buffer_initialized( + replay_buffer_mock( + ReplayBufferView(all_observations, lambda: slice(None)), + OBS_SHAPE, + ) ) dim = 8 @@ -112,13 +116,15 @@ def test_pebble_entropy_reward_can_pickle(): obs1 = np.random.rand(VENVS, *OBS_SHAPE) reward_fn = PebbleStateEntropyReward(reward_fn_stub, K) - reward_fn.set_replay_buffer(replay_buffer, OBS_SHAPE) + reward_fn.on_replay_buffer_initialized(replay_buffer_mock(replay_buffer, OBS_SHAPE)) reward_fn(obs1, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) # Act pickled = pickle.dumps(reward_fn) reward_fn_deserialized = pickle.loads(pickled) - reward_fn_deserialized.set_replay_buffer(replay_buffer, OBS_SHAPE) + reward_fn_deserialized.on_replay_buffer_initialized( + replay_buffer_mock(replay_buffer, OBS_SHAPE) + ) # Assert obs2 = np.random.rand(VENVS, *OBS_SHAPE) @@ -129,3 +135,10 @@ def test_pebble_entropy_reward_can_pickle(): def reward_fn_stub(state, action, next_state, done): return state + + +def replay_buffer_mock(buffer_view: ReplayBufferView, obs_shape: tuple) -> Mock: + replay_buffer_mock = Mock() + replay_buffer_mock.buffer_view = buffer_view + replay_buffer_mock.obs_shape = obs_shape + return replay_buffer_mock diff --git a/tests/algorithms/test_preference_comparisons.py b/tests/algorithms/test_preference_comparisons.py index 3dedc4482..fb63e71d0 100644 --- a/tests/algorithms/test_preference_comparisons.py +++ b/tests/algorithms/test_preference_comparisons.py @@ -3,6 +3,7 @@ import math import re from typing import Any, Sequence +from unittest.mock import Mock import gym import numpy as np @@ -81,8 +82,11 @@ def replay_buffer(rng): @pytest.fixture def pebble_agent_trainer(agent, reward_net, venv, rng, replay_buffer): + replay_buffer_mock = Mock() + replay_buffer_mock.buffer_view = replay_buffer + replay_buffer_mock.obs_shape = (4,) reward_fn = PebbleStateEntropyReward(reward_net.predict_processed) - reward_fn.set_replay_buffer(replay_buffer, (4,)) + reward_fn.on_replay_buffer_initialized(replay_buffer_mock) return preference_comparisons.PebbleAgentTrainer( algorithm=agent, reward_fn=reward_fn, From c80fb80109647e193d7afde2c0ef3bdcabf9dcfc Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 8 Dec 2022 00:28:17 +0100 Subject: [PATCH 51/55] #641 code review: fix comment --- src/imitation/algorithms/preference_comparisons.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/imitation/algorithms/preference_comparisons.py b/src/imitation/algorithms/preference_comparisons.py index 411dc8c65..72f5da5cf 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -95,8 +95,9 @@ def has_pretraining(self) -> bool: def unsupervised_pretrain(self, steps: int, **kwargs: Any) -> None: """Pre-train an agent before collecting comparisons. - By default, this method asserts that pre-training has zero steps allocated. Override this behavior in subclasses that implement pre-training. + If not overriden, this method raises ValueError when non-zero steps are + allocated for pre-training. Args: steps: number of environment steps to train for. @@ -104,7 +105,7 @@ def unsupervised_pretrain(self, steps: int, **kwargs: Any) -> None: the training procedure. 
""" if steps > 0: - self._logger.warn( + raise ValueError( f"{steps} timesteps allocated for unsupervised pre-training:" " Trajectory generators without pre-training implementation should" " not consume any timesteps (otherwise the total number of" From 50577b046752193db92182b6ed40f517e0d53eb3 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Sat, 10 Dec 2022 01:02:32 +0100 Subject: [PATCH 52/55] #641 code review: replace RunningNorm with NormalizedRewardNet --- .../algorithms/pebble/entropy_reward.py | 144 ++++++++++++++---- .../config/train_preference_comparisons.py | 1 + src/imitation/util/util.py | 5 +- .../algorithms/pebble/test_entropy_reward.py | 49 +++--- 4 files changed, 139 insertions(+), 60 deletions(-) diff --git a/src/imitation/algorithms/pebble/entropy_reward.py b/src/imitation/algorithms/pebble/entropy_reward.py index 9e7958fa6..f1bb373ba 100644 --- a/src/imitation/algorithms/pebble/entropy_reward.py +++ b/src/imitation/algorithms/pebble/entropy_reward.py @@ -1,8 +1,9 @@ """Reward function for the PEBBLE training algorithm.""" import enum -from typing import Dict, Optional, Tuple, Union +from typing import Optional, Tuple +import gym import numpy as np import torch as th @@ -12,6 +13,7 @@ ReplayBufferView, ) from imitation.rewards.reward_function import RewardFn +from imitation.rewards.reward_nets import NormalizedRewardNet, RewardNet from imitation.util import util from imitation.util.networks import RunningNorm @@ -23,6 +25,92 @@ class PebbleRewardPhase(enum.Enum): POLICY_AND_REWARD_LEARNING = enum.auto() # Learned reward +class InsufficientObservations(RuntimeError): + pass + + +class EntropyRewardNet(RewardNet): + def __init__( + self, + nearest_neighbor_k: int, + replay_buffer_view: ReplayBufferView, + observation_space: gym.Space, + action_space: gym.Space, + normalize_images: bool = True, + ): + """Initialize the RewardNet. + + Args: + observation_space: the observation space of the environment + action_space: the action space of the environment + normalize_images: whether to automatically normalize + image observations to [0, 1] (from 0 to 255). Defaults to True. + """ + super().__init__(observation_space, action_space, normalize_images) + self.nearest_neighbor_k = nearest_neighbor_k + self._replay_buffer_view = replay_buffer_view + + def set_replay_buffer(self, replay_buffer: ReplayBufferRewardWrapper): + """This method needs to be called after unpickling. 
+ + See also __getstate__() / __setstate__() + """ + assert self.observation_space == replay_buffer.observation_space + assert self.action_space == replay_buffer.action_space + self._replay_buffer_view = replay_buffer.buffer_view + + def forward( + self, + state: th.Tensor, + action: th.Tensor, + next_state: th.Tensor, + done: th.Tensor, + ) -> th.Tensor: + assert ( + self._replay_buffer_view is not None + ), "Missing replay buffer (possibly after unpickle)" + + all_observations = self._replay_buffer_view.observations + # ReplayBuffer sampling flattens the venv dimension, let's adapt to that + all_observations = all_observations.reshape( + (-1,) + self.observation_space.shape + ) + + if all_observations.shape[0] < self.nearest_neighbor_k: + raise InsufficientObservations( + "Insufficient observations for entropy calculation" + ) + + return util.compute_state_entropy( + state, all_observations, self.nearest_neighbor_k + ) + + def preprocess( + self, + state: np.ndarray, + action: np.ndarray, + next_state: np.ndarray, + done: np.ndarray, + ) -> Tuple[th.Tensor, th.Tensor, th.Tensor, th.Tensor]: + """Override default preprocessing to avoid the default one-hot encoding. + + We also know forward() only works with state, so no need to convert + other tensors. + """ + state_th = util.safe_to_tensor(state).to(self.device) + action_th = next_state_th = done_th = th.empty(0) + return state_th, action_th, next_state_th, done_th + + def __getstate__(self): + state = self.__dict__.copy() + del state["_replay_buffer_view"] + return state + + def __setstate__(self, state): + self.__dict__.update(state) + self._replay_buffer_view = None + + class PebbleStateEntropyReward(ReplayBufferAwareRewardFn): """Reward function for implementation of the PEBBLE learning algorithm. 
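The new EntropyRewardNet drops its replay-buffer view in __getstate__ and restores it as None in __setstate__, so the network can be pickled while the process-local buffer view has to be re-attached after loading. The same pattern in isolation, with a hypothetical class:

import pickle

class HasTransientView:
    def __init__(self, view=None):
        self.view = view  # transient resource, e.g. a live buffer view
        self.k = 5        # ordinary, picklable configuration

    def __getstate__(self):
        state = self.__dict__.copy()
        del state["view"]  # never serialized
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)
        self.view = None  # caller must re-attach a fresh view after unpickling

restored = pickle.loads(pickle.dumps(HasTransientView(view=object())))
assert restored.view is None and restored.k == 5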
@@ -59,17 +147,27 @@ def __init__( self.learned_reward_fn = learned_reward_fn self.nearest_neighbor_k = nearest_neighbor_k - self.entropy_stats = RunningNorm(1) self.state = PebbleRewardPhase.UNSUPERVISED_EXPLORATION # These two need to be set with set_replay_buffer(): - self.replay_buffer_view: Optional[ReplayBufferView] = None - self.obs_shape: Union[Tuple[int, ...], Dict[str, Tuple[int, ...]], None] = None + self._entropy_reward_net: Optional[EntropyRewardNet] = None + self._normalized_entropy_reward_net: Optional[RewardNet] = None def on_replay_buffer_initialized(self, replay_buffer: ReplayBufferRewardWrapper): - self.replay_buffer_view = replay_buffer.buffer_view - self.obs_shape = replay_buffer.obs_shape - + if self._normalized_entropy_reward_net is None: + self._entropy_reward_net = EntropyRewardNet( + nearest_neighbor_k=self.nearest_neighbor_k, + replay_buffer_view=replay_buffer.buffer_view, + observation_space=replay_buffer.observation_space, + action_space=replay_buffer.action_space, + normalize_images=False, + ) + self._normalized_entropy_reward_net = NormalizedRewardNet( + self._entropy_reward_net, RunningNorm + ) + else: + assert self._entropy_reward_net is not None + self._entropy_reward_net.set_replay_buffer(replay_buffer) def unsupervised_exploration_finish(self): assert self.state == PebbleRewardPhase.UNSUPERVISED_EXPLORATION @@ -88,35 +186,15 @@ def __call__( return self.learned_reward_fn(state, action, next_state, done) def _entropy_reward(self, state, action, next_state, done): - if self.replay_buffer_view is None: + if self._normalized_entropy_reward_net is None: raise ValueError( "Replay buffer must be supplied before entropy reward can be used", ) - all_observations = self.replay_buffer_view.observations - # ReplayBuffer sampling flattens the venv dimension, let's adapt to that - all_observations = all_observations.reshape((-1, *self.obs_shape)) - - if all_observations.shape[0] < self.nearest_neighbor_k: + try: + return self._normalized_entropy_reward_net.predict_processed( + state, action, next_state, done, update_stats=True + ) + except InsufficientObservations: # not enough observations to compare to, fall back to the learned function; # (falling back to a constant may also be ok) return self.learned_reward_fn(state, action, next_state, done) - else: - # TODO #625: deal with the conversion back and forth between np and torch - entropies = util.compute_state_entropy( - th.tensor(state), - th.tensor(all_observations), - self.nearest_neighbor_k, - ) - - normalized_entropies = self.entropy_stats.forward(entropies) - - return normalized_entropies.numpy() - - def __getstate__(self): - state = self.__dict__.copy() - del state["replay_buffer_view"] - return state - - def __setstate__(self, state): - self.__dict__.update(state) - self.replay_buffer_view = None diff --git a/src/imitation/scripts/config/train_preference_comparisons.py b/src/imitation/scripts/config/train_preference_comparisons.py index f01a7d6c0..3a66349c5 100644 --- a/src/imitation/scripts/config/train_preference_comparisons.py +++ b/src/imitation/scripts/config/train_preference_comparisons.py @@ -158,3 +158,4 @@ def fast(): reward_trainer_kwargs = { "epochs": 1, } + locals() # quieten flake8 diff --git a/src/imitation/util/util.py b/src/imitation/util/util.py index cef2e6f38..c56e81f4c 100644 --- a/src/imitation/util/util.py +++ b/src/imitation/util/util.py @@ -384,12 +384,15 @@ def compute_state_entropy( for idx in range(len(all_obs) // batch_size + 1): start = idx * batch_size end = (idx + 1) * 
batch_size + all_obs_batch = all_obs[start:end] distances_tensor = th.linalg.vector_norm( - obs[:, None] - all_obs[None, start:end], + obs[:, None] - all_obs_batch[None, :], dim=non_batch_dimensions, ord=2, ) + assert distances_tensor.shape == (obs.shape[0], all_obs_batch.shape[0]) dists.append(distances_tensor) all_dists = th.cat(dists, dim=1) knn_dists = th.kthvalue(all_dists, k=k + 1, dim=1).values return knn_dists + diff --git a/tests/algorithms/pebble/test_entropy_reward.py b/tests/algorithms/pebble/test_entropy_reward.py index bc020e86c..e318eced2 100644 --- a/tests/algorithms/pebble/test_entropy_reward.py +++ b/tests/algorithms/pebble/test_entropy_reward.py @@ -5,15 +5,14 @@ import numpy as np import torch as th -from gym.spaces import Discrete - +from gym.spaces import Discrete, Box +from gym.spaces.space import Space from imitation.algorithms.pebble.entropy_reward import PebbleStateEntropyReward from imitation.policies.replay_buffer_wrapper import ReplayBufferView from imitation.util import util -SPACE = Discrete(4) -OBS_SHAPE = (1,) -PLACEHOLDER = np.empty(OBS_SHAPE) +SPACE = Box(-1, 1, shape=(1,)) +PLACEHOLDER = np.empty(SPACE.shape) BUFFER_SIZE = 20 K = 4 @@ -22,30 +21,27 @@ def test_pebble_entropy_reward_returns_entropy_for_pretraining(rng): - all_observations = rng.random((BUFFER_SIZE, VENVS, *OBS_SHAPE)) + all_observations = rng.random((BUFFER_SIZE, VENVS) + SPACE.shape) reward_fn = PebbleStateEntropyReward(Mock(), K) reward_fn.on_replay_buffer_initialized( replay_buffer_mock( ReplayBufferView(all_observations, lambda: slice(None)), - OBS_SHAPE, + SPACE, ) ) # Act - observations = th.rand((BATCH_SIZE, *OBS_SHAPE)) + observations = th.rand((BATCH_SIZE, *SPACE.shape)) reward = reward_fn(observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) # Assert expected = util.compute_state_entropy( observations, - all_observations.reshape(-1, *OBS_SHAPE), + all_observations.reshape(-1, *SPACE.shape), K, ) - expected_normalized = reward_fn.entropy_stats.normalize( - th.as_tensor(expected), - ).numpy() - np.testing.assert_allclose(reward, expected_normalized) + np.testing.assert_allclose(reward, expected, rtol=0.005, atol=0.005) def test_pebble_entropy_reward_returns_normalized_values_for_pretraining(): @@ -55,11 +51,11 @@ def test_pebble_entropy_reward_returns_normalized_values_for_pretraining(): m.side_effect = lambda obs, all_obs, k: obs reward_fn = PebbleStateEntropyReward(Mock(), K) - all_observations = np.empty((BUFFER_SIZE, VENVS, *OBS_SHAPE)) + all_observations = np.empty((BUFFER_SIZE, VENVS, *SPACE.shape)) reward_fn.on_replay_buffer_initialized( replay_buffer_mock( ReplayBufferView(all_observations, lambda: slice(None)), - OBS_SHAPE, + SPACE, ) ) @@ -97,7 +93,7 @@ def test_pebble_entropy_reward_function_returns_learned_reward_after_pre_trainin reward_fn.unsupervised_exploration_finish() # Act - observations = np.ones((BATCH_SIZE, *OBS_SHAPE)) + observations = np.ones((BATCH_SIZE, *SPACE.shape)) reward = reward_fn(observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) # Assert @@ -111,23 +107,23 @@ def test_pebble_entropy_reward_function_returns_learned_reward_after_pre_trainin def test_pebble_entropy_reward_can_pickle(): - all_observations = np.empty((BUFFER_SIZE, VENVS, *OBS_SHAPE)) + all_observations = np.empty((BUFFER_SIZE, VENVS, *SPACE.shape)) replay_buffer = ReplayBufferView(all_observations, lambda: slice(None)) - obs1 = np.random.rand(VENVS, *OBS_SHAPE) + obs1 = np.random.rand(VENVS, *SPACE.shape) reward_fn = PebbleStateEntropyReward(reward_fn_stub, K) - 
reward_fn.on_replay_buffer_initialized(replay_buffer_mock(replay_buffer, OBS_SHAPE)) + reward_fn.on_replay_buffer_initialized(replay_buffer_mock(replay_buffer, SPACE)) reward_fn(obs1, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) # Act pickled = pickle.dumps(reward_fn) reward_fn_deserialized = pickle.loads(pickled) reward_fn_deserialized.on_replay_buffer_initialized( - replay_buffer_mock(replay_buffer, OBS_SHAPE) + replay_buffer_mock(replay_buffer, SPACE) ) # Assert - obs2 = np.random.rand(VENVS, *OBS_SHAPE) + obs2 = np.random.rand(VENVS, *SPACE.shape) expected_result = reward_fn(obs2, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) actual_result = reward_fn_deserialized(obs2, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) np.testing.assert_allclose(actual_result, expected_result) @@ -137,8 +133,9 @@ def reward_fn_stub(state, action, next_state, done): return state -def replay_buffer_mock(buffer_view: ReplayBufferView, obs_shape: tuple) -> Mock: - replay_buffer_mock = Mock() - replay_buffer_mock.buffer_view = buffer_view - replay_buffer_mock.obs_shape = obs_shape - return replay_buffer_mock +def replay_buffer_mock(buffer_view: ReplayBufferView, obs_space: Space) -> Mock: + mock = Mock() + mock.buffer_view = buffer_view + mock.observation_space = obs_space + mock.action_space = SPACE + return mock From 531b3532cfd6633cd57023a7ecb30b468395d97e Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Sat, 10 Dec 2022 01:56:27 +0100 Subject: [PATCH 53/55] #641 code review: refactor PebbleStateEntropyReward so that inner RewardNets can be injected from the outside --- .../algorithms/pebble/entropy_reward.py | 89 ++++----- .../scripts/train_preference_comparisons.py | 48 ++++- src/imitation/util/util.py | 1 - .../algorithms/pebble/test_entropy_reward.py | 178 ++++++++++-------- .../algorithms/test_preference_comparisons.py | 4 +- .../test_train_preference_comparisons.py | 64 +++++++ 6 files changed, 241 insertions(+), 143 deletions(-) create mode 100644 tests/scripts/test_train_preference_comparisons.py diff --git a/src/imitation/algorithms/pebble/entropy_reward.py b/src/imitation/algorithms/pebble/entropy_reward.py index f1bb373ba..074281e90 100644 --- a/src/imitation/algorithms/pebble/entropy_reward.py +++ b/src/imitation/algorithms/pebble/entropy_reward.py @@ -13,47 +13,46 @@ ReplayBufferView, ) from imitation.rewards.reward_function import RewardFn -from imitation.rewards.reward_nets import NormalizedRewardNet, RewardNet +from imitation.rewards.reward_nets import RewardNet from imitation.util import util -from imitation.util.networks import RunningNorm - - -class PebbleRewardPhase(enum.Enum): - """States representing different behaviors for PebbleStateEntropyReward.""" - - UNSUPERVISED_EXPLORATION = enum.auto() # Entropy based reward - POLICY_AND_REWARD_LEARNING = enum.auto() # Learned reward class InsufficientObservations(RuntimeError): pass -class EntropyRewardNet(RewardNet): +class EntropyRewardNet(RewardNet, ReplayBufferAwareRewardFn): def __init__( self, nearest_neighbor_k: int, - replay_buffer_view: ReplayBufferView, observation_space: gym.Space, action_space: gym.Space, normalize_images: bool = True, + replay_buffer_view: Optional[ReplayBufferView] = None, ): """Initialize the RewardNet. Args: + nearest_neighbor_k: Parameter for entropy computation (see + compute_state_entropy()) observation_space: the observation space of the environment action_space: the action space of the environment normalize_images: whether to automatically normalize image observations to [0, 1] (from 0 to 255). Defaults to True. 
+ replay_buffer_view: Replay buffer view with observations to compare + against when computing entropy. If None is given, the buffer needs to + be set with on_replay_buffer_initialized() before EntropyRewardNet can + be used """ super().__init__(observation_space, action_space, normalize_images) self.nearest_neighbor_k = nearest_neighbor_k self._replay_buffer_view = replay_buffer_view - def set_replay_buffer(self, replay_buffer: ReplayBufferRewardWrapper): - """This method needs to be called after unpickling. + def on_replay_buffer_initialized(self, replay_buffer: ReplayBufferRewardWrapper): + """Sets replay buffer. - See also __getstate__() / __setstate__() + This method needs to be called, e.g., after unpickling. + See also __getstate__() / __setstate__(). """ assert self.observation_space == replay_buffer.observation_space assert self.action_space == replay_buffer.action_space @@ -111,6 +110,13 @@ def __setstate__(self, state): self._replay_buffer_view = None +class PebbleRewardPhase(enum.Enum): + """States representing different behaviors for PebbleStateEntropyReward.""" + + UNSUPERVISED_EXPLORATION = enum.auto() # Entropy based reward + POLICY_AND_REWARD_LEARNING = enum.auto() # Learned reward + + class PebbleStateEntropyReward(ReplayBufferAwareRewardFn): """Reward function for implementation of the PEBBLE learning algorithm. @@ -126,48 +132,30 @@ class PebbleStateEntropyReward(ReplayBufferAwareRewardFn): reward is returned. The second phase requires that a buffer with observations to compare against is - supplied with set_replay_buffer() or on_replay_buffer_initialized(). - To transition to the last phase, unsupervised_exploration_finish() needs - to be called. + supplied with on_replay_buffer_initialized(). To transition to the last phase, + unsupervised_exploration_finish() needs to be called. """ def __init__( self, + entropy_reward_fn: RewardFn, learned_reward_fn: RewardFn, - nearest_neighbor_k: int = 5, ): """Builds this class. 
Args: + entropy_reward_fn: The entropy-based reward function used during + unsupervised exploration learned_reward_fn: The learned reward function used after unsupervised exploration is finished - nearest_neighbor_k: Parameter for entropy computation (see - compute_state_entropy()) """ + self.entropy_reward_fn = entropy_reward_fn self.learned_reward_fn = learned_reward_fn - self.nearest_neighbor_k = nearest_neighbor_k - self.state = PebbleRewardPhase.UNSUPERVISED_EXPLORATION - # These two need to be set with set_replay_buffer(): - self._entropy_reward_net: Optional[EntropyRewardNet] = None - self._normalized_entropy_reward_net: Optional[RewardNet] = None - def on_replay_buffer_initialized(self, replay_buffer: ReplayBufferRewardWrapper): - if self._normalized_entropy_reward_net is None: - self._entropy_reward_net = EntropyRewardNet( - nearest_neighbor_k=self.nearest_neighbor_k, - replay_buffer_view=replay_buffer.buffer_view, - observation_space=replay_buffer.observation_space, - action_space=replay_buffer.action_space, - normalize_images=False, - ) - self._normalized_entropy_reward_net = NormalizedRewardNet( - self._entropy_reward_net, RunningNorm - ) - else: - assert self._entropy_reward_net is not None - self._entropy_reward_net.set_replay_buffer(replay_buffer) + if isinstance(self.entropy_reward_fn, ReplayBufferAwareRewardFn): + self.entropy_reward_fn.on_replay_buffer_initialized(replay_buffer) def unsupervised_exploration_finish(self): assert self.state == PebbleRewardPhase.UNSUPERVISED_EXPLORATION @@ -181,20 +169,11 @@ def __call__( done: np.ndarray, ) -> np.ndarray: if self.state == PebbleRewardPhase.UNSUPERVISED_EXPLORATION: - return self._entropy_reward(state, action, next_state, done) + try: + return self.entropy_reward_fn(state, action, next_state, done) + except InsufficientObservations: + # not enough observations to compare to, fall back to the learned function; + # (falling back to a constant may also be ok) + return self.learned_reward_fn(state, action, next_state, done) else: return self.learned_reward_fn(state, action, next_state, done) - - def _entropy_reward(self, state, action, next_state, done): - if self._normalized_entropy_reward_net is None: - raise ValueError( - "Replay buffer must be supplied before entropy reward can be used", - ) - try: - return self._normalized_entropy_reward_net.predict_processed( - state, action, next_state, done, update_stats=True - ) - except InsufficientObservations: - # not enough observations to compare to, fall back to the learned function; - # (falling back to a constant may also be ok) - return self.learned_reward_fn(state, action, next_state, done) diff --git a/src/imitation/scripts/train_preference_comparisons.py b/src/imitation/scripts/train_preference_comparisons.py index 659b47a74..524734713 100644 --- a/src/imitation/scripts/train_preference_comparisons.py +++ b/src/imitation/scripts/train_preference_comparisons.py @@ -13,10 +13,18 @@ from stable_baselines3.common import base_class, type_aliases, vec_env from imitation.algorithms import preference_comparisons -from imitation.algorithms.pebble.entropy_reward import PebbleStateEntropyReward +from imitation.algorithms.pebble.entropy_reward import ( + EntropyRewardNet, + PebbleStateEntropyReward, +) from imitation.data import types from imitation.policies import serialize +from imitation.policies.replay_buffer_wrapper import ( + ReplayBufferAwareRewardFn, + ReplayBufferRewardWrapper, +) from imitation.rewards import reward_function, reward_nets +from imitation.rewards.reward_nets 
import NormalizedRewardNet from imitation.scripts.common import common, reward from imitation.scripts.common import rl as rl_common from imitation.scripts.common import train @@ -24,6 +32,7 @@ train_preference_comparisons_ex, ) from imitation.util import logger as imit_logger +from imitation.util.networks import RunningNorm def save_model( @@ -71,14 +80,47 @@ def make_reward_function( reward_net.predict_processed, update_stats=False, ) + observation_space = reward_net.observation_space + action_space = reward_net.action_space if pebble_enabled: - relabel_reward_fn = PebbleStateEntropyReward( - relabel_reward_fn, # type: ignore[assignment] + relabel_reward_fn = create_pebble_reward_fn( + relabel_reward_fn, pebble_nearest_neighbor_k, + action_space, + observation_space, ) return relabel_reward_fn +def create_pebble_reward_fn( + relabel_reward_fn, pebble_nearest_neighbor_k, action_space, observation_space +): + entropy_reward_net = EntropyRewardNet( + nearest_neighbor_k=pebble_nearest_neighbor_k, + observation_space=observation_space, + action_space=action_space, + normalize_images=False, + ) + normalized_entropy_reward_net = NormalizedRewardNet(entropy_reward_net, RunningNorm) + + class EntropyRewardFn(ReplayBufferAwareRewardFn): + """Adapter for entropy reward adding on_replay_buffer_initialized() hook.""" + + def __call__(self, *args, **kwargs) -> np.ndarray: + kwargs["update_stats"] = True + return normalized_entropy_reward_net.predict_processed(*args, **kwargs) + + def on_replay_buffer_initialized( + self, replay_buffer: ReplayBufferRewardWrapper + ): + entropy_reward_net.on_replay_buffer_initialized(replay_buffer) + + return PebbleStateEntropyReward( + EntropyRewardFn(), + relabel_reward_fn, # type: ignore[assignment] + ) + + @train_preference_comparisons_ex.capture def make_agent_trajectory_generator( venv: vec_env.VecEnv, diff --git a/src/imitation/util/util.py b/src/imitation/util/util.py index c56e81f4c..cf38cee5a 100644 --- a/src/imitation/util/util.py +++ b/src/imitation/util/util.py @@ -395,4 +395,3 @@ def compute_state_entropy( all_dists = th.cat(dists, dim=1) knn_dists = th.kthvalue(all_dists, k=k + 1, dim=1).values return knn_dists - diff --git a/tests/algorithms/pebble/test_entropy_reward.py b/tests/algorithms/pebble/test_entropy_reward.py index e318eced2..833a9ba94 100644 --- a/tests/algorithms/pebble/test_entropy_reward.py +++ b/tests/algorithms/pebble/test_entropy_reward.py @@ -1,14 +1,22 @@ """Tests for `imitation.algorithms.entropy_reward`.""" - import pickle -from unittest.mock import Mock, patch +from unittest.mock import Mock import numpy as np +import pytest import torch as th -from gym.spaces import Discrete, Box +from gym.spaces import Box from gym.spaces.space import Space -from imitation.algorithms.pebble.entropy_reward import PebbleStateEntropyReward -from imitation.policies.replay_buffer_wrapper import ReplayBufferView + +from imitation.algorithms.pebble.entropy_reward import ( + EntropyRewardNet, + InsufficientObservations, + PebbleStateEntropyReward, +) +from imitation.policies.replay_buffer_wrapper import ( + ReplayBufferAwareRewardFn, + ReplayBufferView, +) from imitation.util import util SPACE = Box(-1, 1, shape=(1,)) @@ -20,112 +28,115 @@ VENVS = 2 -def test_pebble_entropy_reward_returns_entropy_for_pretraining(rng): - all_observations = rng.random((BUFFER_SIZE, VENVS) + SPACE.shape) +def test_pebble_entropy_reward_returns_entropy_for_pretraining(): + expected_result = th.rand(BATCH_SIZE) + observations = th.rand((BATCH_SIZE,) + SPACE.shape) + entropy_fn 
= Mock() + entropy_fn.return_value = expected_result + learned_fn = Mock() - reward_fn = PebbleStateEntropyReward(Mock(), K) - reward_fn.on_replay_buffer_initialized( - replay_buffer_mock( - ReplayBufferView(all_observations, lambda: slice(None)), - SPACE, - ) + reward_fn = PebbleStateEntropyReward(entropy_fn, learned_fn) + reward = reward_fn(observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) + + np.testing.assert_allclose(reward, expected_result) + entropy_fn.assert_called_once_with( + observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER ) - # Act - observations = th.rand((BATCH_SIZE, *SPACE.shape)) + +def test_pebble_entropy_reward_returns_learned_rew_on_insufficient_observations(rng): + expected_result = th.rand(BATCH_SIZE) + observations = th.rand((BATCH_SIZE,) + SPACE.shape) + entropy_fn = Mock() + entropy_fn.side_effect = InsufficientObservations("test error") + learned_fn = Mock() + learned_fn.return_value = expected_result + + reward_fn = PebbleStateEntropyReward(entropy_fn, learned_fn) reward = reward_fn(observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) - # Assert - expected = util.compute_state_entropy( - observations, - all_observations.reshape(-1, *SPACE.shape), - K, + np.testing.assert_allclose(reward, expected_result) + learned_fn.assert_called_once_with( + observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER ) - np.testing.assert_allclose(reward, expected, rtol=0.005, atol=0.005) -def test_pebble_entropy_reward_returns_normalized_values_for_pretraining(): - with patch("imitation.util.util.compute_state_entropy") as m: - # mock entropy computation so that we can test - # only stats collection in this test - m.side_effect = lambda obs, all_obs, k: obs +def test_pebble_entropy_reward_function_returns_learned_reward_after_pre_training(): + expected_result = th.rand(BATCH_SIZE) + observations = th.rand((BATCH_SIZE,) + SPACE.shape) + entropy_fn = Mock() + learned_fn = Mock() + learned_fn.return_value = expected_result - reward_fn = PebbleStateEntropyReward(Mock(), K) - all_observations = np.empty((BUFFER_SIZE, VENVS, *SPACE.shape)) - reward_fn.on_replay_buffer_initialized( - replay_buffer_mock( - ReplayBufferView(all_observations, lambda: slice(None)), - SPACE, - ) - ) + reward_fn = PebbleStateEntropyReward(entropy_fn, learned_fn) + reward_fn.unsupervised_exploration_finish() + reward = reward_fn(observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) + + np.testing.assert_allclose(reward, expected_result) + learned_fn.assert_called_once_with( + observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER + ) - dim = 8 - shift = 3 - scale = 2 - # Act - for _ in range(1000): - state = th.randn(dim) * scale + shift - reward_fn(state, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) +def test_pebble_entropy_reward_propagates_on_replay_buffer_initialized(): + replay_buffer = replay_buffer_mock(np.empty((BUFFER_SIZE, VENVS) + SPACE.shape)) + entropy_fn = Mock(spec=ReplayBufferAwareRewardFn) + learned_fn = Mock() - normalized_reward = reward_fn( - np.zeros(dim), - PLACEHOLDER, - PLACEHOLDER, - PLACEHOLDER, - ) + reward_fn = PebbleStateEntropyReward(entropy_fn, learned_fn) + reward_fn.on_replay_buffer_initialized(replay_buffer) - # Assert - np.testing.assert_allclose( - normalized_reward, - np.repeat(-shift / scale, dim), - rtol=0.05, - atol=0.05, - ) + entropy_fn.on_replay_buffer_initialized.assert_called_once_with(replay_buffer) -def test_pebble_entropy_reward_function_returns_learned_reward_after_pre_training(): - expected_reward = np.ones(1) - learned_reward_mock = Mock() - 
learned_reward_mock.return_value = expected_reward - reward_fn = PebbleStateEntropyReward(learned_reward_mock) - # move all the way to the last state - reward_fn.unsupervised_exploration_finish() +def test_entropy_reward_net_returns_entropy_for_pretraining(rng): + observations = th.rand((BATCH_SIZE, *SPACE.shape)) + all_observations = rng.random((BUFFER_SIZE, VENVS) + SPACE.shape) + reward_net = EntropyRewardNet(K, SPACE, SPACE) + reward_net.on_replay_buffer_initialized(replay_buffer_mock(all_observations)) # Act - observations = np.ones((BATCH_SIZE, *SPACE.shape)) - reward = reward_fn(observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) + reward = reward_net.predict_processed( + observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER + ) # Assert - assert reward == expected_reward - learned_reward_mock.assert_called_once_with( + expected = util.compute_state_entropy( observations, - PLACEHOLDER, - PLACEHOLDER, - PLACEHOLDER, + all_observations.reshape(-1, *SPACE.shape), + K, ) + np.testing.assert_allclose(reward, expected, rtol=0.005, atol=0.005) -def test_pebble_entropy_reward_can_pickle(): - all_observations = np.empty((BUFFER_SIZE, VENVS, *SPACE.shape)) - replay_buffer = ReplayBufferView(all_observations, lambda: slice(None)) +def test_entropy_reward_net_raises_on_insufficient_observations(rng): + observations = th.rand((BATCH_SIZE, *SPACE.shape)) + all_observations = rng.random((K - 1, 1) + SPACE.shape) + reward_net = EntropyRewardNet(K, SPACE, SPACE) + reward_net.on_replay_buffer_initialized(replay_buffer_mock(all_observations)) + + # Act + with pytest.raises(InsufficientObservations): + reward_net.predict_processed( + observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER + ) - obs1 = np.random.rand(VENVS, *SPACE.shape) - reward_fn = PebbleStateEntropyReward(reward_fn_stub, K) - reward_fn.on_replay_buffer_initialized(replay_buffer_mock(replay_buffer, SPACE)) - reward_fn(obs1, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) + +def test_entropy_reward_net_can_pickle(rng): + all_observations = np.empty((BUFFER_SIZE, VENVS, *SPACE.shape)) + replay_buffer = replay_buffer_mock(all_observations) + reward_net = EntropyRewardNet(K, SPACE, SPACE) + reward_net.on_replay_buffer_initialized(replay_buffer) # Act - pickled = pickle.dumps(reward_fn) + pickled = pickle.dumps(reward_net) reward_fn_deserialized = pickle.loads(pickled) - reward_fn_deserialized.on_replay_buffer_initialized( - replay_buffer_mock(replay_buffer, SPACE) - ) + reward_fn_deserialized.on_replay_buffer_initialized(replay_buffer) # Assert - obs2 = np.random.rand(VENVS, *SPACE.shape) - expected_result = reward_fn(obs2, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) - actual_result = reward_fn_deserialized(obs2, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) + obs = th.rand(VENVS, *SPACE.shape) + expected_result = reward_net(obs, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) + actual_result = reward_fn_deserialized(obs, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) np.testing.assert_allclose(actual_result, expected_result) @@ -133,7 +144,8 @@ def reward_fn_stub(state, action, next_state, done): return state -def replay_buffer_mock(buffer_view: ReplayBufferView, obs_space: Space) -> Mock: +def replay_buffer_mock(all_observations: np.ndarray, obs_space: Space = SPACE) -> Mock: + buffer_view = ReplayBufferView(all_observations, lambda: slice(None)) mock = Mock() mock.buffer_view = buffer_view mock.observation_space = obs_space diff --git a/tests/algorithms/test_preference_comparisons.py b/tests/algorithms/test_preference_comparisons.py index fb63e71d0..f31fdceb8 100644 --- 
a/tests/algorithms/test_preference_comparisons.py +++ b/tests/algorithms/test_preference_comparisons.py @@ -85,7 +85,9 @@ def pebble_agent_trainer(agent, reward_net, venv, rng, replay_buffer): replay_buffer_mock = Mock() replay_buffer_mock.buffer_view = replay_buffer replay_buffer_mock.obs_shape = (4,) - reward_fn = PebbleStateEntropyReward(reward_net.predict_processed) + reward_fn = PebbleStateEntropyReward( + reward_net.predict_processed, venv.observation_space, venv.action_space + ) reward_fn.on_replay_buffer_initialized(replay_buffer_mock) return preference_comparisons.PebbleAgentTrainer( algorithm=agent, diff --git a/tests/scripts/test_train_preference_comparisons.py b/tests/scripts/test_train_preference_comparisons.py new file mode 100644 index 000000000..d05ebd27a --- /dev/null +++ b/tests/scripts/test_train_preference_comparisons.py @@ -0,0 +1,64 @@ +from unittest.mock import Mock, patch + +import numpy as np +import torch as th +from gym import Space +from gym.spaces import Box + +from imitation.policies.replay_buffer_wrapper import ReplayBufferView +from imitation.scripts.train_preference_comparisons import create_pebble_reward_fn + +K = 4 +SPACE = Box(-1, 1, shape=(1,)) +BUFFER_SIZE = 20 +VENVS = 2 +PLACEHOLDER = np.empty(SPACE.shape) + + +def test_creates_normalized_entropy_pebble_reward(): + with patch("imitation.util.util.compute_state_entropy") as m: + # mock entropy computation so that we can test + # only stats collection in this test + m.side_effect = lambda obs, all_obs, k: obs + + reward_fn = create_pebble_reward_fn(reward_fn_stub, K, SPACE, SPACE) + + all_observations = np.empty((BUFFER_SIZE, VENVS, *SPACE.shape)) + reward_fn.on_replay_buffer_initialized(replay_buffer_mock(all_observations)) + + dim = 8 + shift = 3 + scale = 2 + + # Act + for _ in range(1000): + state = th.randn(dim) * scale + shift + reward_fn(state, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) + + normalized_reward = reward_fn( + np.zeros(dim), + PLACEHOLDER, + PLACEHOLDER, + PLACEHOLDER, + ) + + # Assert + np.testing.assert_allclose( + normalized_reward, + np.repeat(-shift / scale, dim), + rtol=0.05, + atol=0.05, + ) + + +def reward_fn_stub(state, action, next_state, done): + return state + + +def replay_buffer_mock(all_observations: np.ndarray, obs_space: Space = SPACE) -> Mock: + buffer_view = ReplayBufferView(all_observations, lambda: slice(None)) + mock = Mock() + mock.buffer_view = buffer_view + mock.observation_space = obs_space + mock.action_space = SPACE + return mock From 74ba96b17d75c785ad5a8af6725f645dbd17df9e Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Sat, 10 Dec 2022 12:45:24 +0100 Subject: [PATCH 54/55] #641 fix static analysis and tests --- .../algorithms/pebble/entropy_reward.py | 32 +++++++++++++++---- .../algorithms/preference_comparisons.py | 6 +++- .../scripts/train_preference_comparisons.py | 22 +++++++------ .../algorithms/pebble/test_entropy_reward.py | 25 ++++++++++++--- .../algorithms/test_preference_comparisons.py | 12 ++++--- .../test_train_preference_comparisons.py | 2 ++ 6 files changed, 74 insertions(+), 25 deletions(-) diff --git a/src/imitation/algorithms/pebble/entropy_reward.py b/src/imitation/algorithms/pebble/entropy_reward.py index 074281e90..eba53405b 100644 --- a/src/imitation/algorithms/pebble/entropy_reward.py +++ b/src/imitation/algorithms/pebble/entropy_reward.py @@ -1,7 +1,7 @@ """Reward function for the PEBBLE training algorithm.""" import enum -from typing import Optional, Tuple +from typing import Any, Callable, Optional, Tuple import gym import 
numpy as np @@ -18,10 +18,16 @@ class InsufficientObservations(RuntimeError): + """Error signifying not enough observations for entropy calculation.""" + pass class EntropyRewardNet(RewardNet, ReplayBufferAwareRewardFn): + """RewardNet wrapping entropy reward function.""" + + __call__: Callable[..., Any] # Needed to appease pytype + def __init__( self, nearest_neighbor_k: int, @@ -53,6 +59,9 @@ def on_replay_buffer_initialized(self, replay_buffer: ReplayBufferRewardWrapper) This method needs to be called, e.g., after unpickling. See also __getstate__() / __setstate__(). + + Args: + replay_buffer: replay buffer with history of observations """ assert self.observation_space == replay_buffer.observation_space assert self.action_space == replay_buffer.action_space @@ -72,16 +81,18 @@ def forward( all_observations = self._replay_buffer_view.observations # ReplayBuffer sampling flattens the venv dimension, let's adapt to that all_observations = all_observations.reshape( - (-1,) + self.observation_space.shape + (-1,) + self.observation_space.shape, ) if all_observations.shape[0] < self.nearest_neighbor_k: raise InsufficientObservations( - "Insufficient observations for entropy calculation" + "Insufficient observations for entropy calculation", ) return util.compute_state_entropy( - state, all_observations, self.nearest_neighbor_k + state, + all_observations, + self.nearest_neighbor_k, ) def preprocess( @@ -95,6 +106,15 @@ def preprocess( We also know forward() only works with state, so no need to convert other tensors. + + Args: + state: The observation input. + action: The action input. + next_state: The observation input. + done: Whether the episode has terminated. + + Returns: + Observations preprocessed by converting them to Tensor. """ state_th = util.safe_to_tensor(state).to(self.device) action_th = next_state_th = done_th = th.empty(0) @@ -172,8 +192,8 @@ def __call__( try: return self.entropy_reward_fn(state, action, next_state, done) except InsufficientObservations: - # not enough observations to compare to, fall back to the learned function; - # (falling back to a constant may also be ok) + # not enough observations to compare to, fall back to the learned + # function; (falling back to a constant may also be ok) return self.learned_reward_fn(state, action, next_state, done) else: return self.learned_reward_fn(state, action, next_state, done) diff --git a/src/imitation/algorithms/preference_comparisons.py b/src/imitation/algorithms/preference_comparisons.py index 72f5da5cf..fccd7958d 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -96,13 +96,17 @@ def unsupervised_pretrain(self, steps: int, **kwargs: Any) -> None: """Pre-train an agent before collecting comparisons. Override this behavior in subclasses that implement pre-training. - If not overriden, this method raises ValueError when non-zero steps are + If not overridden, this method raises ValueError when non-zero steps are allocated for pre-training. Args: steps: number of environment steps to train for. **kwargs: additional keyword arguments to pass on to the training procedure. + + Raises: + ValueError: Unsupervised pre-training not implemented but non-zero + steps are allocated for pre-training. 
""" if steps > 0: raise ValueError( diff --git a/src/imitation/scripts/train_preference_comparisons.py b/src/imitation/scripts/train_preference_comparisons.py index 524734713..5e07b094c 100644 --- a/src/imitation/scripts/train_preference_comparisons.py +++ b/src/imitation/scripts/train_preference_comparisons.py @@ -7,6 +7,7 @@ import pathlib from typing import Any, Mapping, Optional, Type, Union +import gym import numpy as np import torch as th from sacred.observers import FileStorageObserver @@ -24,6 +25,7 @@ ReplayBufferRewardWrapper, ) from imitation.rewards import reward_function, reward_nets +from imitation.rewards.reward_function import RewardFn from imitation.rewards.reward_nets import NormalizedRewardNet from imitation.scripts.common import common, reward from imitation.scripts.common import rl as rl_common @@ -80,21 +82,22 @@ def make_reward_function( reward_net.predict_processed, update_stats=False, ) - observation_space = reward_net.observation_space - action_space = reward_net.action_space if pebble_enabled: relabel_reward_fn = create_pebble_reward_fn( - relabel_reward_fn, + relabel_reward_fn, # type: ignore[assignment] pebble_nearest_neighbor_k, - action_space, - observation_space, + reward_net.action_space, + reward_net.observation_space, ) return relabel_reward_fn def create_pebble_reward_fn( - relabel_reward_fn, pebble_nearest_neighbor_k, action_space, observation_space -): + relabel_reward_fn: RewardFn, + pebble_nearest_neighbor_k: int, + action_space: gym.Space, + observation_space: gym.Space, +) -> PebbleStateEntropyReward: entropy_reward_net = EntropyRewardNet( nearest_neighbor_k=pebble_nearest_neighbor_k, observation_space=observation_space, @@ -111,13 +114,14 @@ def __call__(self, *args, **kwargs) -> np.ndarray: return normalized_entropy_reward_net.predict_processed(*args, **kwargs) def on_replay_buffer_initialized( - self, replay_buffer: ReplayBufferRewardWrapper + self, + replay_buffer: ReplayBufferRewardWrapper, ): entropy_reward_net.on_replay_buffer_initialized(replay_buffer) return PebbleStateEntropyReward( EntropyRewardFn(), - relabel_reward_fn, # type: ignore[assignment] + relabel_reward_fn, ) diff --git a/tests/algorithms/pebble/test_entropy_reward.py b/tests/algorithms/pebble/test_entropy_reward.py index 833a9ba94..b598ac75e 100644 --- a/tests/algorithms/pebble/test_entropy_reward.py +++ b/tests/algorithms/pebble/test_entropy_reward.py @@ -40,7 +40,10 @@ def test_pebble_entropy_reward_returns_entropy_for_pretraining(): np.testing.assert_allclose(reward, expected_result) entropy_fn.assert_called_once_with( - observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER + observations, + PLACEHOLDER, + PLACEHOLDER, + PLACEHOLDER, ) @@ -57,7 +60,10 @@ def test_pebble_entropy_reward_returns_learned_rew_on_insufficient_observations( np.testing.assert_allclose(reward, expected_result) learned_fn.assert_called_once_with( - observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER + observations, + PLACEHOLDER, + PLACEHOLDER, + PLACEHOLDER, ) @@ -74,7 +80,10 @@ def test_pebble_entropy_reward_function_returns_learned_reward_after_pre_trainin np.testing.assert_allclose(reward, expected_result) learned_fn.assert_called_once_with( - observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER + observations, + PLACEHOLDER, + PLACEHOLDER, + PLACEHOLDER, ) @@ -97,7 +106,10 @@ def test_entropy_reward_net_returns_entropy_for_pretraining(rng): # Act reward = reward_net.predict_processed( - observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER + observations, + PLACEHOLDER, + PLACEHOLDER, + PLACEHOLDER, 
) # Assert @@ -118,7 +130,10 @@ def test_entropy_reward_net_raises_on_insufficient_observations(rng): # Act with pytest.raises(InsufficientObservations): reward_net.predict_processed( - observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER + observations, + PLACEHOLDER, + PLACEHOLDER, + PLACEHOLDER, ) diff --git a/tests/algorithms/test_preference_comparisons.py b/tests/algorithms/test_preference_comparisons.py index f31fdceb8..d863cc4b0 100644 --- a/tests/algorithms/test_preference_comparisons.py +++ b/tests/algorithms/test_preference_comparisons.py @@ -18,12 +18,12 @@ import imitation.testing.reward_nets as testing_reward_nets from imitation.algorithms import preference_comparisons -from imitation.algorithms.pebble.entropy_reward import PebbleStateEntropyReward from imitation.data import types from imitation.data.types import TrajectoryWithRew from imitation.policies.replay_buffer_wrapper import ReplayBufferView from imitation.regularization import regularizers, updaters from imitation.rewards import reward_nets +from imitation.scripts.train_preference_comparisons import create_pebble_reward_fn from imitation.util import networks, util UNCERTAINTY_ON = ["logit", "probability", "label"] @@ -84,9 +84,13 @@ def replay_buffer(rng): def pebble_agent_trainer(agent, reward_net, venv, rng, replay_buffer): replay_buffer_mock = Mock() replay_buffer_mock.buffer_view = replay_buffer - replay_buffer_mock.obs_shape = (4,) - reward_fn = PebbleStateEntropyReward( - reward_net.predict_processed, venv.observation_space, venv.action_space + replay_buffer_mock.observation_space = venv.observation_space + replay_buffer_mock.action_space = venv.action_space + reward_fn = create_pebble_reward_fn( + reward_net.predict_processed, + 5, + venv.action_space, + venv.observation_space, ) reward_fn.on_replay_buffer_initialized(replay_buffer_mock) return preference_comparisons.PebbleAgentTrainer( diff --git a/tests/scripts/test_train_preference_comparisons.py b/tests/scripts/test_train_preference_comparisons.py index d05ebd27a..c4390dd6b 100644 --- a/tests/scripts/test_train_preference_comparisons.py +++ b/tests/scripts/test_train_preference_comparisons.py @@ -1,3 +1,5 @@ +"""Tests train_preferences_comparisons helper methods.""" + from unittest.mock import Mock, patch import numpy as np From b344cbdf13f57425cc2d4487129fc0efb9484ab3 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Mon, 12 Dec 2022 22:16:20 +0100 Subject: [PATCH 55/55] #641 increase coverage --- .../policies/replay_buffer_wrapper.py | 11 +++++-- .../algorithms/pebble/test_entropy_reward.py | 4 --- .../algorithms/test_preference_comparisons.py | 30 +++++++++++++++++++ tests/policies/test_replay_buffer_wrapper.py | 21 ------------- tests/scripts/test_scripts.py | 14 +++++++++ .../test_train_preference_comparisons.py | 3 ++ 6 files changed, 56 insertions(+), 27 deletions(-) diff --git a/src/imitation/policies/replay_buffer_wrapper.py b/src/imitation/policies/replay_buffer_wrapper.py index 255e01f3b..a309917c2 100644 --- a/src/imitation/policies/replay_buffer_wrapper.py +++ b/src/imitation/policies/replay_buffer_wrapper.py @@ -143,5 +143,12 @@ class ReplayBufferAwareRewardFn(RewardFn, abc.ABC): def on_replay_buffer_initialized( self, replay_buffer: ReplayBufferRewardWrapper, - ): - pass + ) -> None: + """Hook method to be called when ReplayBuffer is initialized. + + Needed to propagate the ReplayBuffer to a reward function because the buffer + is created indirectly in ReplayBufferRewardWrapper. 
+ + Args: + replay_buffer: the created ReplayBuffer + """ # noqa: DAR202 diff --git a/tests/algorithms/pebble/test_entropy_reward.py b/tests/algorithms/pebble/test_entropy_reward.py index b598ac75e..461a7dd5a 100644 --- a/tests/algorithms/pebble/test_entropy_reward.py +++ b/tests/algorithms/pebble/test_entropy_reward.py @@ -155,10 +155,6 @@ def test_entropy_reward_net_can_pickle(rng): np.testing.assert_allclose(actual_result, expected_result) -def reward_fn_stub(state, action, next_state, done): - return state - - def replay_buffer_mock(all_observations: np.ndarray, obs_space: Space = SPACE) -> Mock: buffer_view = ReplayBufferView(all_observations, lambda: slice(None)) mock = Mock() diff --git a/tests/algorithms/test_preference_comparisons.py b/tests/algorithms/test_preference_comparisons.py index d863cc4b0..c66dcc157 100644 --- a/tests/algorithms/test_preference_comparisons.py +++ b/tests/algorithms/test_preference_comparisons.py @@ -18,11 +18,16 @@ import imitation.testing.reward_nets as testing_reward_nets from imitation.algorithms import preference_comparisons +from imitation.algorithms.preference_comparisons import ( + PebbleAgentTrainer, + TrajectoryGenerator, +) from imitation.data import types from imitation.data.types import TrajectoryWithRew from imitation.policies.replay_buffer_wrapper import ReplayBufferView from imitation.regularization import regularizers, updaters from imitation.rewards import reward_nets +from imitation.rewards.reward_function import RewardFn from imitation.scripts.train_preference_comparisons import create_pebble_reward_fn from imitation.util import networks, util @@ -1120,3 +1125,28 @@ def test_that_trainer_improves( ) assert np.mean(trained_agent_rewards) > np.mean(novice_agent_rewards) + + +def test_trajectory_generator_raises_on_pretrain_if_not_implemented(): + class TrajectoryGeneratorTestImpl(TrajectoryGenerator): + def sample(self, steps: int) -> Sequence[TrajectoryWithRew]: + return [] + + generator = TrajectoryGeneratorTestImpl() + assert generator.has_pretraining is False + with pytest.raises(ValueError, match="should not consume any timesteps"): + generator.unsupervised_pretrain(1) + + generator.sample(1) # just to make coverage happy + + +def test_pebble_agent_trainer_expects_pebble_reward(agent, venv, rng): + reward_fn: RewardFn = lambda state, action, next, done: state + + with pytest.raises(ValueError, match="PebbleStateEntropyReward"): + PebbleAgentTrainer( + algorithm=agent, + reward_fn=reward_fn, # type: ignore[call-arg] + venv=venv, + rng=rng, + ) diff --git a/tests/policies/test_replay_buffer_wrapper.py b/tests/policies/test_replay_buffer_wrapper.py index 7c26dd2d4..7b92e64ba 100644 --- a/tests/policies/test_replay_buffer_wrapper.py +++ b/tests/policies/test_replay_buffer_wrapper.py @@ -4,7 +4,6 @@ from typing import Type from unittest.mock import Mock -import gym import numpy as np import pytest import stable_baselines3 as sb3 @@ -122,26 +121,6 @@ def test_wrapper_class(tmpdir, rng): replay_buffer_wrapper._get_samples() -class ActionIsObsEnv(gym.Env): - """Simple environment where the obs is the action.""" - - def __init__(self): - """Initialize environment.""" - super().__init__() - self.action_space = spaces.Box(np.array([0]), np.array([1])) - self.observation_space = spaces.Box(np.array([0]), np.array([1])) - - def step(self, action): - obs = action - reward = 0 - done = False - info = {} - return obs, reward, done, info - - def reset(self): - return np.array([0]) - - def test_replay_buffer_view_provides_buffered_observations(): 
space = spaces.Box(np.array([0]), np.array([5])) n_envs = 2 diff --git a/tests/scripts/test_scripts.py b/tests/scripts/test_scripts.py index 226b6b3c2..1f8a0d23d 100644 --- a/tests/scripts/test_scripts.py +++ b/tests/scripts/test_scripts.py @@ -254,6 +254,20 @@ def test_train_preference_comparisons_reward_named_config(tmpdir, named_configs) assert isinstance(run.result, dict) +def test_train_preference_comparisons_pebble_config(tmpdir): + config_updates = dict(common=dict(log_root=tmpdir)) + run = train_preference_comparisons.train_preference_comparisons_ex.run( + # make sure rl.sac named_config is called after rl.fast to overwrite + # rl_kwargs.batch_size to None + named_configs=ALGO_FAST_CONFIGS["preference_comparison"] + + ["pebble", "mountain_car_continuous"], + config_updates=config_updates, + ) + assert run.config["rl"]["rl_cls"] is stable_baselines3.SAC + assert run.status == "COMPLETED" + assert isinstance(run.result, dict) + + def test_train_dagger_main(tmpdir): with pytest.warns(None) as record: run = train_imitation.train_imitation_ex.run( diff --git a/tests/scripts/test_train_preference_comparisons.py b/tests/scripts/test_train_preference_comparisons.py index c4390dd6b..cf794fecf 100644 --- a/tests/scripts/test_train_preference_comparisons.py +++ b/tests/scripts/test_train_preference_comparisons.py @@ -52,6 +52,9 @@ def test_creates_normalized_entropy_pebble_reward(): atol=0.05, ) + # Just to make coverage happy: + reward_fn_stub(state, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) + def reward_fn_stub(state, action, next_state, done): return state
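The control flow that the last three patches converge on is easiest to see outside the library: while unsupervised exploration is active, the reward is the k-NN state entropy computed against the replay buffer, falling back to the learned reward whenever the buffer holds too few observations, and switching to the learned reward for good once unsupervised_exploration_finish() is called. The sketch below mirrors that logic with plain torch/numpy under that reading; TwoPhaseReward, knn_entropy_stub, and the `<= k` guard are illustrative stand-ins, not imitation APIs.

import numpy as np
import torch as th


class InsufficientObservations(RuntimeError):
    """Raised when the buffer holds too few observations for a k-NN estimate."""


def knn_entropy_stub(state: np.ndarray, all_obs: np.ndarray, k: int = 5) -> np.ndarray:
    """k-NN proxy for state entropy: distance to the (k+1)-th nearest stored obs.

    Using k + 1 skips the zero self-distance when `state` already sits in the
    buffer, matching the `k + 1` passed to th.kthvalue in compute_state_entropy.
    """
    state_th = th.as_tensor(np.asarray(state), dtype=th.float32)
    buffer_th = th.as_tensor(np.asarray(all_obs), dtype=th.float32)
    if buffer_th.shape[0] <= k:  # simplified guard; the patch checks `< k`
        raise InsufficientObservations("not enough stored observations")
    dists = th.cdist(state_th, buffer_th)  # (batch, buffer) pairwise L2 distances
    return th.kthvalue(dists, k + 1, dim=1).values.numpy()


class TwoPhaseReward:
    """Entropy reward during exploration, learned reward afterwards."""

    def __init__(self, entropy_fn, learned_fn):
        self.entropy_fn = entropy_fn
        self.learned_fn = learned_fn
        self.exploring = True

    def finish_exploration(self) -> None:
        self.exploring = False

    def __call__(self, state: np.ndarray, all_obs: np.ndarray) -> np.ndarray:
        if self.exploring:
            try:
                return self.entropy_fn(state, all_obs)
            except InsufficientObservations:
                # Same fallback as PebbleStateEntropyReward.__call__: use the
                # learned reward until the buffer is large enough.
                return self.learned_fn(state)
        return self.learned_fn(state)


if __name__ == "__main__":
    reward = TwoPhaseReward(
        lambda s, buf: knn_entropy_stub(s, buf, k=3),
        lambda s: np.zeros(len(s)),  # stand-in for the learned reward
    )
    batch = np.random.rand(5, 4)
    print(reward(batch, np.random.rand(2, 4)))   # tiny buffer -> learned fallback
    print(reward(batch, np.random.rand(50, 4)))  # full buffer -> entropy reward
    reward.finish_exploration()
    print(reward(batch, np.random.rand(50, 4)))  # after finish -> learned reward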