From 8d5900a380ea49f6871c900254e467fc7a0f5278 Mon Sep 17 00:00:00 2001 From: Dan Pandori Date: Wed, 9 Nov 2022 16:26:10 -0800 Subject: [PATCH 01/55] Welfords alg and test --- src/imitation/util/util.py | 41 ++++++++++++++++++++++ tests/util/test_util.py | 69 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 110 insertions(+) diff --git a/src/imitation/util/util.py b/src/imitation/util/util.py index bbb7b2c37..317b17bbb 100644 --- a/src/imitation/util/util.py +++ b/src/imitation/util/util.py @@ -359,3 +359,44 @@ def get_first_iter_element(iterable: Iterable[T]) -> Tuple[T, Iterable[T]]: return_iterable = iterable return first_element, return_iterable + + +class RunningMeanAndVar: + """Stores a running mean and variance using Wellford's algorithm.""" + + def __init__( + self, + shape: Tuple[int, ...] = (), + device: Optional[str] = None, + ): + """Initialize blank mean, variance, count.""" + self.mean = th.zeros(shape, device=device) + self.M2 = th.zeros(shape, device=device) + self.count = 0 + + def update(self, x: th.Tensor): + with th.no_grad(): + batch_mean = th.mean(x, dim=0) + batch_var = th.var(x, dim=0, unbiased=False) + batch_count = x.shape[0] + batch_M2 = batch_var * batch_count + if self.count == 0: + self.count = batch_count + self.mean = batch_mean + self.M2 = batch_M2 + return + + delta = batch_mean - self.mean + total_count = self.count + batch_count + self.mean += delta * batch_count / total_count + + self.M2 += ( + batch_M2 + delta * delta * (self.count * batch_count) / total_count + ) + + self.count = total_count + + @property + def var(self): + """Returns the unbiased estimate of the variance.""" + return self.M2 / (self.count - 1) diff --git a/tests/util/test_util.py b/tests/util/test_util.py index ce663d8e0..b58e32b7c 100644 --- a/tests/util/test_util.py +++ b/tests/util/test_util.py @@ -118,3 +118,72 @@ def test_tensor_iter_norm(): assert np.allclose(norm_1, 14.0) with pytest.raises(ValueError): util.tensor_iter_norm(tensor_list, ord=0.0) + + +def test_RunningMeanAndVarSimple(): + running_stats = util.RunningMeanAndVar(shape=(3, 4)) + first_half = th.ones(size=(10, 3, 4), dtype=th.double) + + running_stats.update(first_half) + np.testing.assert_allclose( + running_stats.mean, + first_half.mean(dim=0), + atol=1e-5, + rtol=1e-4, + ) + np.testing.assert_allclose( + running_stats.var, + first_half.var(dim=0), + atol=1e-5, + rtol=1e-4, + ) + + second_half = 2 * th.ones(size=(10, 3, 4), dtype=th.double) + data = th.cat([first_half, second_half]) + running_stats.update(second_half) + np.testing.assert_allclose( + running_stats.mean, + data.mean(dim=0), + atol=1e-5, + rtol=1e-4, + ) + np.testing.assert_allclose( + running_stats.var, + data.var(dim=0), + atol=1e-5, + rtol=1e-4, + ) + + +def test_RunningMeanAndVar(): + running_stats = util.RunningMeanAndVar(shape=(3, 4)) + data = th.normal(mean=10 * th.ones(size=(20, 3, 4), dtype=th.double)) + + first_half = data[:10] + running_stats.update(first_half) + np.testing.assert_allclose( + running_stats.mean, + first_half.mean(dim=0), + atol=1e-5, + rtol=1e-4, + ) + np.testing.assert_allclose( + running_stats.var, + first_half.var(dim=0), + atol=1e-5, + rtol=1e-4, + ) + + running_stats.update(data[10:]) + np.testing.assert_allclose( + running_stats.mean, + data.mean(dim=0), + atol=1e-5, + rtol=1e-4, + ) + np.testing.assert_allclose( + running_stats.var, + data.var(dim=0), + atol=1e-5, + rtol=1e-4, + ) From 4aac074b73d89b5e96e169d7a27aea031c5a6a05 Mon Sep 17 00:00:00 2001 From: Dan Pandori Date: Thu, 10 Nov 2022 08:59:27 -0800 
Subject: [PATCH 02/55] Next func --- src/imitation/util/util.py | 50 ++++++++++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/src/imitation/util/util.py b/src/imitation/util/util.py index 317b17bbb..af09e2fe3 100644 --- a/src/imitation/util/util.py +++ b/src/imitation/util/util.py @@ -368,13 +368,13 @@ def __init__( self, shape: Tuple[int, ...] = (), device: Optional[str] = None, - ): + ) -> None: """Initialize blank mean, variance, count.""" self.mean = th.zeros(shape, device=device) self.M2 = th.zeros(shape, device=device) self.count = 0 - def update(self, x: th.Tensor): + def update(self, x: th.Tensor) -> None: with th.no_grad(): batch_mean = th.mean(x, dim=0) batch_var = th.var(x, dim=0, unbiased=False) @@ -390,13 +390,49 @@ def update(self, x: th.Tensor): total_count = self.count + batch_count self.mean += delta * batch_count / total_count - self.M2 += ( - batch_M2 + delta * delta * (self.count * batch_count) / total_count - ) + self.M2 += batch_M2 + delta * delta * self.count * batch_count / total_count self.count = total_count @property - def var(self): - """Returns the unbiased estimate of the variance.""" + def var(self) -> th.Tensor: + """Returns the unbiased estimate of the variances.""" return self.M2 / (self.count - 1) + + +def compute_state_entropy( + obs: th.Tensor, + all_obs: th.Tensor, + k: int, + batch_size: int = 500, +) -> th.Tensor: + """Compute the state entropy given by KNN distance. + + Args: + obs: The tensor of states to compute entropy for. + all_obs: The tensor of all states in our experience, + generally from a replay buffer. + k: the number of neighbors to consider + batch_size: when computing distances, how many to consider at once. + + Returns: + A tensor containing the state entropy for `obs`. + """ + with th.no_grad(): + distances = [] + for i in range(len(all_obs) // batch_size + 1): + start = i * batch_size + end = min((i + 1) * batch_size, obs.shape[1]) + # TODO what is going on w/ these shapes? 
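+            # Shape note (assuming obs is (B, D) and all_obs is (N, D)):
+            # obs[:, None, :] - all_obs[None, start:end, :] broadcasts to
+            # (B, end - start, D), so the norm over dim=-1 yields one distance
+            # per (query, candidate) pair with shape (B, end - start).
+            # The `end` bound above likely wants len(all_obs) rather than
+            # obs.shape[1], since the loop batches over all_obs.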
+ # TODO use a non-deprecated norm function + distance = th.norm( + obs[:, None, :] - all_obs[None, start:end, :], + dim=-1, + p=2, + ) + distances.append(distance) + + distances_tensor = th.cat(distances, dim=1) + knn_dists = th.kthvalue(distances_tensor, k=k + 1, dim=1).values + state_entropy = knn_dists + return state_entropy.unsqueeze(1) From 383fce06fb9aef2283131a2589039767aa0b8c2c Mon Sep 17 00:00:00 2001 From: Dan Pandori Date: Thu, 10 Nov 2022 10:29:50 -0800 Subject: [PATCH 03/55] Test update --- tests/util/test_util.py | 35 ----------------------------------- 1 file changed, 35 deletions(-) diff --git a/tests/util/test_util.py b/tests/util/test_util.py index b58e32b7c..1f2862cce 100644 --- a/tests/util/test_util.py +++ b/tests/util/test_util.py @@ -120,41 +120,6 @@ def test_tensor_iter_norm(): util.tensor_iter_norm(tensor_list, ord=0.0) -def test_RunningMeanAndVarSimple(): - running_stats = util.RunningMeanAndVar(shape=(3, 4)) - first_half = th.ones(size=(10, 3, 4), dtype=th.double) - - running_stats.update(first_half) - np.testing.assert_allclose( - running_stats.mean, - first_half.mean(dim=0), - atol=1e-5, - rtol=1e-4, - ) - np.testing.assert_allclose( - running_stats.var, - first_half.var(dim=0), - atol=1e-5, - rtol=1e-4, - ) - - second_half = 2 * th.ones(size=(10, 3, 4), dtype=th.double) - data = th.cat([first_half, second_half]) - running_stats.update(second_half) - np.testing.assert_allclose( - running_stats.mean, - data.mean(dim=0), - atol=1e-5, - rtol=1e-4, - ) - np.testing.assert_allclose( - running_stats.var, - data.var(dim=0), - atol=1e-5, - rtol=1e-4, - ) - - def test_RunningMeanAndVar(): running_stats = util.RunningMeanAndVar(shape=(3, 4)) data = th.normal(mean=10 * th.ones(size=(20, 3, 4), dtype=th.double)) From 055fa67ab7b76ba3c2d14629d5eb5d0132ae0f4c Mon Sep 17 00:00:00 2001 From: Dan Pandori Date: Thu, 10 Nov 2022 17:11:06 -0800 Subject: [PATCH 04/55] compute_state_entropy and test --- src/imitation/util/util.py | 35 ++++++++++++++--------------------- tests/util/test_util.py | 26 ++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 21 deletions(-) diff --git a/src/imitation/util/util.py b/src/imitation/util/util.py index af09e2fe3..b5f58a0cb 100644 --- a/src/imitation/util/util.py +++ b/src/imitation/util/util.py @@ -404,35 +404,28 @@ def compute_state_entropy( obs: th.Tensor, all_obs: th.Tensor, k: int, - batch_size: int = 500, ) -> th.Tensor: """Compute the state entropy given by KNN distance. Args: - obs: The tensor of states to compute entropy for. - all_obs: The tensor of all states in our experience, - generally from a replay buffer. + obs: A single observation. + all_obs: The tensor of all states to compare to. k: the number of neighbors to consider - batch_size: when computing distances, how many to consider at once. Returns: A tensor containing the state entropy for `obs`. """ + assert obs.shape == all_obs.shape[1:] with th.no_grad(): - distances = [] - for i in range(len(all_obs) // batch_size + 1): - start = i * batch_size - end = min((i + 1) * batch_size, obs.shape[1]) - # TODO what is going on w/ these shapes? 
- # TODO use a non-deprecated norm function - distance = th.norm( - obs[:, None, :] - all_obs[None, start:end, :], - dim=-1, - p=2, - ) - distances.append(distance) - - distances_tensor = th.cat(distances, dim=1) - knn_dists = th.kthvalue(distances_tensor, k=k + 1, dim=1).values + non_batch_dimensions = tuple(range(1, len(obs.shape) + 1)) + distances_tensor = th.linalg.vector_norm( + obs[None] - all_obs, + dim=non_batch_dimensions, + ord=2, + ) + + # Note that we take the k+1'th value because the closest neighbor to + # a point is itself, which we want to skip. + knn_dists = th.kthvalue(distances_tensor, k=k + 1, dim=0).values state_entropy = knn_dists - return state_entropy.unsqueeze(1) + return state_entropy.unsqueeze(0) diff --git a/tests/util/test_util.py b/tests/util/test_util.py index 1f2862cce..54c89fe80 100644 --- a/tests/util/test_util.py +++ b/tests/util/test_util.py @@ -152,3 +152,29 @@ def test_RunningMeanAndVar(): atol=1e-5, rtol=1e-4, ) + + +def test_compute_state_entropy_1d(): + all_obs = th.arange(10, dtype=th.float).unsqueeze(1) + obs = all_obs[5] + assert util.compute_state_entropy(obs, all_obs, k=1) == 1 + assert util.compute_state_entropy(obs, all_obs, k=2) == 1 + assert util.compute_state_entropy(obs, all_obs, k=3) == 2 + assert util.compute_state_entropy(obs, all_obs, k=4) == 2 + assert util.compute_state_entropy(obs, all_obs, k=5) == 3 + + +def test_compute_state_entropy_2d(): + all_obs_x = th.arange(10, dtype=th.float) + all_obs_y = th.arange(0, 100, step=10, dtype=th.float) + all_obs = th.stack((all_obs_x, all_obs_y), dim=1) + + obs = all_obs[5] + np.testing.assert_allclose( + util.compute_state_entropy(obs, all_obs, k=1), + np.sqrt(10**2 + 1**2), + ) + np.testing.assert_allclose( + util.compute_state_entropy(obs, all_obs, k=3), + np.sqrt(20**2 + 2**2), + ) From 5c278f40d511d4ccea45e8505d17efa0527da082 Mon Sep 17 00:00:00 2001 From: Dan Pandori Date: Thu, 10 Nov 2022 17:32:36 -0800 Subject: [PATCH 05/55] Sketch of the entropy reward replay buffer --- .../policies/replay_buffer_wrapper.py | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/src/imitation/policies/replay_buffer_wrapper.py b/src/imitation/policies/replay_buffer_wrapper.py index 6d0d70449..167261109 100644 --- a/src/imitation/policies/replay_buffer_wrapper.py +++ b/src/imitation/policies/replay_buffer_wrapper.py @@ -101,3 +101,54 @@ def _get_samples(self): "_get_samples() is intentionally not implemented." "This method should not be called.", ) + + +class ReplayBufferEntropyRewardWrapper(ReplayBuffer): + """Relabel the rewards from a ReplayBuffer, initially using entropy as reward.""" + + def __init__( + self, + buffer_size: int, + observation_space: spaces.Space, + action_space: spaces.Space, + *, + replay_buffer_class: Type[ReplayBuffer], + reward_fn: RewardFn, + entropy_as_reward_samples: int, + **kwargs, + ): + """Builds ReplayBufferRewardWrapper. + + Args: + buffer_size: Max number of elements in the buffer + observation_space: Observation space + action_space: Action space + replay_buffer_class: Class of the replay buffer. + reward_fn: Reward function for reward relabeling. + entropy_as_reward_samples: Number of samples to use entropy as the reward, + before switching to using the reward_fn for relabeling. + **kwargs: keyword arguments for ReplayBuffer. 
+ """ + super().__init__( + buffer_size, + observation_space, + action_space, + replay_buffer_class, + reward_fn, + **kwargs, + ) + # TODO should we limit by number of batches (as this does) + # or number of observations returned? + self.samples = 0 + self.entropy_as_reward_samples = entropy_as_reward_samples + + def sample(self, *args, **kwargs): + self.samples += 1 + samples = super().sample(*args, **kwargs) + if self.samples > self.entropy_as_reward_samples: + return samples + + # TODO make the state entropy function accept batches + # TODO compute state entropy for each reward + # TODO replace the reward with the entropies + # TODO note that we really ought to reset the reward network when we are done w/ entropy, and we have no business training it before then From 49dc26f6fb1069a0066b907fd565e5da4713ee99 Mon Sep 17 00:00:00 2001 From: Dan Pandori Date: Fri, 11 Nov 2022 11:20:52 -0800 Subject: [PATCH 06/55] Batchify state entropy func --- src/imitation/util/util.py | 12 ++++++------ tests/util/test_util.py | 14 +++++++------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/imitation/util/util.py b/src/imitation/util/util.py index b5f58a0cb..4e0bd90e0 100644 --- a/src/imitation/util/util.py +++ b/src/imitation/util/util.py @@ -408,24 +408,24 @@ def compute_state_entropy( """Compute the state entropy given by KNN distance. Args: - obs: A single observation. + obs: A batch of observations. all_obs: The tensor of all states to compare to. k: the number of neighbors to consider Returns: A tensor containing the state entropy for `obs`. """ - assert obs.shape == all_obs.shape[1:] + assert obs.shape[1:] == all_obs.shape[1:] with th.no_grad(): - non_batch_dimensions = tuple(range(1, len(obs.shape) + 1)) + non_batch_dimensions = tuple(range(2, len(obs.shape) + 1)) distances_tensor = th.linalg.vector_norm( - obs[None] - all_obs, + obs[:, None] - all_obs[None, :], dim=non_batch_dimensions, ord=2, ) # Note that we take the k+1'th value because the closest neighbor to # a point is itself, which we want to skip. 
- knn_dists = th.kthvalue(distances_tensor, k=k + 1, dim=0).values + knn_dists = th.kthvalue(distances_tensor, k=k + 1, dim=1).values state_entropy = knn_dists - return state_entropy.unsqueeze(0) + return state_entropy.unsqueeze(1) diff --git a/tests/util/test_util.py b/tests/util/test_util.py index 54c89fe80..6f90dd832 100644 --- a/tests/util/test_util.py +++ b/tests/util/test_util.py @@ -156,12 +156,12 @@ def test_RunningMeanAndVar(): def test_compute_state_entropy_1d(): all_obs = th.arange(10, dtype=th.float).unsqueeze(1) - obs = all_obs[5] - assert util.compute_state_entropy(obs, all_obs, k=1) == 1 - assert util.compute_state_entropy(obs, all_obs, k=2) == 1 - assert util.compute_state_entropy(obs, all_obs, k=3) == 2 - assert util.compute_state_entropy(obs, all_obs, k=4) == 2 - assert util.compute_state_entropy(obs, all_obs, k=5) == 3 + obs = all_obs[4:6] + np.testing.assert_allclose(util.compute_state_entropy(obs, all_obs, k=1), 1) + np.testing.assert_allclose(util.compute_state_entropy(obs, all_obs, k=2), 1) + np.testing.assert_allclose(util.compute_state_entropy(obs, all_obs, k=3), 2) + np.testing.assert_allclose(util.compute_state_entropy(obs, all_obs, k=4), 2) + np.testing.assert_allclose(util.compute_state_entropy(obs, all_obs, k=5), 3) def test_compute_state_entropy_2d(): @@ -169,7 +169,7 @@ def test_compute_state_entropy_2d(): all_obs_y = th.arange(0, 100, step=10, dtype=th.float) all_obs = th.stack((all_obs_x, all_obs_y), dim=1) - obs = all_obs[5] + obs = all_obs[4:6] np.testing.assert_allclose( util.compute_state_entropy(obs, all_obs, k=1), np.sqrt(10**2 + 1**2), From 394ad56a94e4c308c9fb762828807d9b14eee122 Mon Sep 17 00:00:00 2001 From: Dan Pandori Date: Fri, 11 Nov 2022 11:35:35 -0800 Subject: [PATCH 07/55] Final sketch of replay entropy buffer. --- .../policies/replay_buffer_wrapper.py | 28 ++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/src/imitation/policies/replay_buffer_wrapper.py b/src/imitation/policies/replay_buffer_wrapper.py index 167261109..c7eebfac8 100644 --- a/src/imitation/policies/replay_buffer_wrapper.py +++ b/src/imitation/policies/replay_buffer_wrapper.py @@ -115,6 +115,7 @@ def __init__( replay_buffer_class: Type[ReplayBuffer], reward_fn: RewardFn, entropy_as_reward_samples: int, + k: int = 5, **kwargs, ): """Builds ReplayBufferRewardWrapper. @@ -127,6 +128,7 @@ def __init__( reward_fn: Reward function for reward relabeling. entropy_as_reward_samples: Number of samples to use entropy as the reward, before switching to using the reward_fn for relabeling. + k: Use the k'th nearest neighbor's distance when computing state entropy. **kwargs: keyword arguments for ReplayBuffer. """ super().__init__( @@ -141,14 +143,32 @@ def __init__( # or number of observations returned? 
self.samples = 0 self.entropy_as_reward_samples = entropy_as_reward_samples + self.k = k def sample(self, *args, **kwargs): self.samples += 1 samples = super().sample(*args, **kwargs) if self.samples > self.entropy_as_reward_samples: return samples + # TODO we really ought to reset the reward network when we are done w/ entropy, + # and we have no business training it before then + + if self.full: + all_obs = self.observations + else: + all_obs = self.observations[: self.pos] + entropies = util.compute_state_entropy(samples.observations, all_obs, self.k) + entropies_th = ( + util.safe_to_tensor(entropies) + .reshape(samples.rewards.shape) + .to(samples.rewards.device) + ) + # TODO normalize entropies w/ RunningMeanAndVar - # TODO make the state entropy function accept batches - # TODO compute state entropy for each reward - # TODO replace the reward with the entropies - # TODO note that we really ought to reset the reward network when we are done w/ entropy, and we have no business training it before then + return ReplayBufferSamples( + samples.observations, + samples.actions, + samples.next_observations, + samples.dones, + entropies_th, + ) From 21da5328836bfd04452e175d07f1ad3ef2e75869 Mon Sep 17 00:00:00 2001 From: Dan Pandori Date: Fri, 11 Nov 2022 12:42:39 -0800 Subject: [PATCH 08/55] First test --- .../policies/replay_buffer_wrapper.py | 20 +++- src/imitation/util/util.py | 6 + tests/policies/test_replay_buffer_wrapper.py | 104 +++++++++++++++++- 3 files changed, 123 insertions(+), 7 deletions(-) diff --git a/src/imitation/policies/replay_buffer_wrapper.py b/src/imitation/policies/replay_buffer_wrapper.py index c7eebfac8..3e6a2ac8c 100644 --- a/src/imitation/policies/replay_buffer_wrapper.py +++ b/src/imitation/policies/replay_buffer_wrapper.py @@ -103,7 +103,7 @@ def _get_samples(self): ) -class ReplayBufferEntropyRewardWrapper(ReplayBuffer): +class ReplayBufferEntropyRewardWrapper(ReplayBufferRewardWrapper): """Relabel the rewards from a ReplayBuffer, initially using entropy as reward.""" def __init__( @@ -135,8 +135,8 @@ def __init__( buffer_size, observation_space, action_space, - replay_buffer_class, - reward_fn, + replay_buffer_class=replay_buffer_class, + reward_fn=reward_fn, **kwargs, ) # TODO should we limit by number of batches (as this does) @@ -144,26 +144,34 @@ def __init__( self.samples = 0 self.entropy_as_reward_samples = entropy_as_reward_samples self.k = k + # TODO support n_envs > 1 + self.entropy_stats = util.RunningMeanAndVar(shape=(1,)) def sample(self, *args, **kwargs): self.samples += 1 samples = super().sample(*args, **kwargs) if self.samples > self.entropy_as_reward_samples: return samples - # TODO we really ought to reset the reward network when we are done w/ entropy, - # and we have no business training it before then + # TODO we really ought to reset the reward network once we are done w/ + # the entropy based pre-training. We also have no reason to train + # or even use the reward network before then. 
if self.full: all_obs = self.observations else: all_obs = self.observations[: self.pos] entropies = util.compute_state_entropy(samples.observations, all_obs, self.k) + + # Normalize to have mean of 0 and standard deviation of 1 + self.entropy_stats.update(entropies) + entropies -= self.entropy_stats.mean + entropies /= self.entropy_stats.std + entropies_th = ( util.safe_to_tensor(entropies) .reshape(samples.rewards.shape) .to(samples.rewards.device) ) - # TODO normalize entropies w/ RunningMeanAndVar return ReplayBufferSamples( samples.observations, diff --git a/src/imitation/util/util.py b/src/imitation/util/util.py index 4e0bd90e0..33b2179c7 100644 --- a/src/imitation/util/util.py +++ b/src/imitation/util/util.py @@ -375,6 +375,7 @@ def __init__( self.count = 0 def update(self, x: th.Tensor) -> None: + """Update the mean and variance with a batch `x`.""" with th.no_grad(): batch_mean = th.mean(x, dim=0) batch_var = th.var(x, dim=0, unbiased=False) @@ -399,6 +400,11 @@ def var(self) -> th.Tensor: """Returns the unbiased estimate of the variances.""" return self.M2 / (self.count - 1) + @property + def std(self) -> th.Tensor: + """Returns the unbiased estimate of the standard deviations.""" + return np.sqrt(self.var) + def compute_state_entropy( obs: th.Tensor, diff --git a/tests/policies/test_replay_buffer_wrapper.py b/tests/policies/test_replay_buffer_wrapper.py index 40fc6eac5..bc6fb436e 100644 --- a/tests/policies/test_replay_buffer_wrapper.py +++ b/tests/policies/test_replay_buffer_wrapper.py @@ -11,7 +11,10 @@ from stable_baselines3.common.policies import BasePolicy from stable_baselines3.common.save_util import load_from_pkl -from imitation.policies.replay_buffer_wrapper import ReplayBufferRewardWrapper +from imitation.policies.replay_buffer_wrapper import ( + ReplayBufferEntropyRewardWrapper, + ReplayBufferRewardWrapper, +) from imitation.util import util @@ -112,3 +115,102 @@ def test_wrapper_class(tmpdir, rng): # raise error for _get_samples() with pytest.raises(NotImplementedError, match=r".*_get_samples.*"): replay_buffer_wrapper._get_samples() + + +# Combine this with the above test via parameterization over the buffer class +def test_entropy_wrapper_class_no_op(tmpdir, rng): + buffer_size = 15 + total_timesteps = 20 + + venv = util.make_vec_env("Pendulum-v1", n_envs=1, rng=rng) + rl_algo = sb3.SAC( + policy=sb3.sac.policies.SACPolicy, + policy_kwargs=dict(), + env=venv, + seed=42, + replay_buffer_class=ReplayBufferEntropyRewardWrapper, + replay_buffer_kwargs=dict( + replay_buffer_class=buffers.ReplayBuffer, + reward_fn=zero_reward_fn, + entropy_as_reward_samples=0, + ), + buffer_size=buffer_size, + ) + + rl_algo.learn(total_timesteps=total_timesteps) + + buffer_path = osp.join(tmpdir, "buffer.pkl") + rl_algo.save_replay_buffer(buffer_path) + replay_buffer_wrapper = load_from_pkl(buffer_path) + replay_buffer = replay_buffer_wrapper.replay_buffer + + # replay_buffer_wrapper.sample(...) 
should return zero-reward transitions + assert buffer_size == replay_buffer_wrapper.size() == replay_buffer.size() + assert (replay_buffer_wrapper.sample(total_timesteps).rewards == 0.0).all() + assert (replay_buffer.sample(total_timesteps).rewards != 0.0).all() # seed=42 + + # replay_buffer_wrapper.pos, replay_buffer_wrapper.full + assert replay_buffer_wrapper.pos == total_timesteps - buffer_size + assert replay_buffer_wrapper.full + + # reset() + replay_buffer_wrapper.reset() + assert 0 == replay_buffer_wrapper.size() == replay_buffer.size() + assert replay_buffer_wrapper.pos == 0 + assert not replay_buffer_wrapper.full + + # to_torch() + tensor = replay_buffer_wrapper.to_torch(np.ones(42)) + assert type(tensor) is th.Tensor + + +# Combine this with the above test via parameterization over the buffer class +def test_entropy_wrapper_class(tmpdir, rng): + buffer_size = 15 + total_timesteps = 20 + + # TODO make entropy reward wrapper + # TODO learn w/ entropy for X timesteps on dummy environment where + # next observation is action, as is reward + # TODO expect that our behavior is approximately uniformly distributed + + venv = util.make_vec_env("Pendulum-v1", n_envs=1, rng=rng) + rl_algo = sb3.SAC( + policy=sb3.sac.policies.SACPolicy, + policy_kwargs=dict(), + env=venv, + seed=42, + replay_buffer_class=ReplayBufferEntropyRewardWrapper, + replay_buffer_kwargs=dict( + replay_buffer_class=buffers.ReplayBuffer, + reward_fn=zero_reward_fn, + entropy_as_reward_samples=0, + ), + buffer_size=buffer_size, + ) + + rl_algo.learn(total_timesteps=total_timesteps) + + buffer_path = osp.join(tmpdir, "buffer.pkl") + rl_algo.save_replay_buffer(buffer_path) + replay_buffer_wrapper = load_from_pkl(buffer_path) + replay_buffer = replay_buffer_wrapper.replay_buffer + + # replay_buffer_wrapper.sample(...) 
should return zero-reward transitions + assert buffer_size == replay_buffer_wrapper.size() == replay_buffer.size() + assert (replay_buffer_wrapper.sample(total_timesteps).rewards == 0.0).all() + assert (replay_buffer.sample(total_timesteps).rewards != 0.0).all() # seed=42 + + # replay_buffer_wrapper.pos, replay_buffer_wrapper.full + assert replay_buffer_wrapper.pos == total_timesteps - buffer_size + assert replay_buffer_wrapper.full + + # reset() + replay_buffer_wrapper.reset() + assert 0 == replay_buffer_wrapper.size() == replay_buffer.size() + assert replay_buffer_wrapper.pos == 0 + assert not replay_buffer_wrapper.full + + # to_torch() + tensor = replay_buffer_wrapper.to_torch(np.ones(42)) + assert type(tensor) is th.Tensor From 15dad9999004b595e20229f71f751e568ab56c6a Mon Sep 17 00:00:00 2001 From: Dan Pandori Date: Fri, 11 Nov 2022 14:49:15 -0800 Subject: [PATCH 09/55] Test cleanup --- tests/policies/test_replay_buffer_wrapper.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/policies/test_replay_buffer_wrapper.py b/tests/policies/test_replay_buffer_wrapper.py index bc6fb436e..c8b9e24a0 100644 --- a/tests/policies/test_replay_buffer_wrapper.py +++ b/tests/policies/test_replay_buffer_wrapper.py @@ -164,12 +164,11 @@ def test_entropy_wrapper_class_no_op(tmpdir, rng): assert type(tensor) is th.Tensor -# Combine this with the above test via parameterization over the buffer class def test_entropy_wrapper_class(tmpdir, rng): buffer_size = 15 + entropy_samples = 10 total_timesteps = 20 - # TODO make entropy reward wrapper # TODO learn w/ entropy for X timesteps on dummy environment where # next observation is action, as is reward # TODO expect that our behavior is approximately uniformly distributed @@ -184,7 +183,7 @@ def test_entropy_wrapper_class(tmpdir, rng): replay_buffer_kwargs=dict( replay_buffer_class=buffers.ReplayBuffer, reward_fn=zero_reward_fn, - entropy_as_reward_samples=0, + entropy_as_reward_samples=entropy_samples, ), buffer_size=buffer_size, ) From 0c280797a117e32bc8fce3a250a5e070ddcb05a2 Mon Sep 17 00:00:00 2001 From: Dan Pandori Date: Fri, 11 Nov 2022 15:02:53 -0800 Subject: [PATCH 10/55] Update --- tests/policies/test_replay_buffer_wrapper.py | 24 ++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/tests/policies/test_replay_buffer_wrapper.py b/tests/policies/test_replay_buffer_wrapper.py index c8b9e24a0..3cf506c8d 100644 --- a/tests/policies/test_replay_buffer_wrapper.py +++ b/tests/policies/test_replay_buffer_wrapper.py @@ -3,10 +3,12 @@ import os.path as osp from typing import Type +import gym import numpy as np import pytest import stable_baselines3 as sb3 import torch as th +from gym import spaces from stable_baselines3.common import buffers, off_policy_algorithm, policies from stable_baselines3.common.policies import BasePolicy from stable_baselines3.common.save_util import load_from_pkl @@ -164,13 +166,31 @@ def test_entropy_wrapper_class_no_op(tmpdir, rng): assert type(tensor) is th.Tensor +class ActionIsObsEnv(gym.Env): + """Simple environment where the obs is the action.""" + + def __init__(self): + """Initialize environment.""" + super().__init__() + self.action_space = spaces.Discrete(50) + self.observation_space = spaces.Discrete(50) + + def step(self, action): + obs = action + reward = 0 + done = False + info = {} + return obs, reward, done, info + + def reset(self): + return self.action_space.sample() + + def test_entropy_wrapper_class(tmpdir, rng): buffer_size = 15 entropy_samples = 10 
total_timesteps = 20 - # TODO learn w/ entropy for X timesteps on dummy environment where - # next observation is action, as is reward # TODO expect that our behavior is approximately uniformly distributed venv = util.make_vec_env("Pendulum-v1", n_envs=1, rng=rng) From 5ab9d28a1fcebf7847c2972d99e207fde10384f9 Mon Sep 17 00:00:00 2001 From: Dan Pandori Date: Fri, 11 Nov 2022 16:34:22 -0800 Subject: [PATCH 11/55] Commit for diff --- .../policies/replay_buffer_wrapper.py | 28 +++++---- tests/policies/test_replay_buffer_wrapper.py | 60 ++++++++----------- 2 files changed, 44 insertions(+), 44 deletions(-) diff --git a/src/imitation/policies/replay_buffer_wrapper.py b/src/imitation/policies/replay_buffer_wrapper.py index 3e6a2ac8c..77fde0eec 100644 --- a/src/imitation/policies/replay_buffer_wrapper.py +++ b/src/imitation/policies/replay_buffer_wrapper.py @@ -131,6 +131,13 @@ def __init__( k: Use the k'th nearest neighbor's distance when computing state entropy. **kwargs: keyword arguments for ReplayBuffer. """ + # TODO should we limit by number of batches (as this does) + # or number of observations returned? + self.sample_count = 0 + self.entropy_as_reward_samples = entropy_as_reward_samples + self.k = k + # TODO support n_envs > 1 + self.entropy_stats = util.RunningMeanAndVar(shape=(1,)) super().__init__( buffer_size, observation_space, @@ -139,18 +146,14 @@ def __init__( reward_fn=reward_fn, **kwargs, ) - # TODO should we limit by number of batches (as this does) - # or number of observations returned? - self.samples = 0 - self.entropy_as_reward_samples = entropy_as_reward_samples - self.k = k - # TODO support n_envs > 1 - self.entropy_stats = util.RunningMeanAndVar(shape=(1,)) + # TODO this seems to never actually get called? def sample(self, *args, **kwargs): - self.samples += 1 + self.sample_count += 1 samples = super().sample(*args, **kwargs) - if self.samples > self.entropy_as_reward_samples: + print(self.sample_count) + print(self.entropy_as_reward_samples) + if self.sample_count > 500: return samples # TODO we really ought to reset the reward network once we are done w/ # the entropy based pre-training. 
We also have no reason to train @@ -160,7 +163,12 @@ def sample(self, *args, **kwargs): all_obs = self.observations else: all_obs = self.observations[: self.pos] - entropies = util.compute_state_entropy(samples.observations, all_obs, self.k) + entropies = util.compute_state_entropy( + # TODO support multiple environments + samples.observations.unsqueeze(1), + all_obs, + self.k, + ) # Normalize to have mean of 0 and standard deviation of 1 self.entropy_stats.update(entropies) diff --git a/tests/policies/test_replay_buffer_wrapper.py b/tests/policies/test_replay_buffer_wrapper.py index 3cf506c8d..3753078b1 100644 --- a/tests/policies/test_replay_buffer_wrapper.py +++ b/tests/policies/test_replay_buffer_wrapper.py @@ -12,6 +12,7 @@ from stable_baselines3.common import buffers, off_policy_algorithm, policies from stable_baselines3.common.policies import BasePolicy from stable_baselines3.common.save_util import load_from_pkl +from stable_baselines3.common.vec_env import DummyVecEnv from imitation.policies.replay_buffer_wrapper import ( ReplayBufferEntropyRewardWrapper, @@ -123,6 +124,7 @@ def test_wrapper_class(tmpdir, rng): def test_entropy_wrapper_class_no_op(tmpdir, rng): buffer_size = 15 total_timesteps = 20 + entropy_samples = 0 venv = util.make_vec_env("Pendulum-v1", n_envs=1, rng=rng) rl_algo = sb3.SAC( @@ -134,7 +136,7 @@ def test_entropy_wrapper_class_no_op(tmpdir, rng): replay_buffer_kwargs=dict( replay_buffer_class=buffers.ReplayBuffer, reward_fn=zero_reward_fn, - entropy_as_reward_samples=0, + entropy_as_reward_samples=entropy_samples, ), buffer_size=buffer_size, ) @@ -172,8 +174,8 @@ class ActionIsObsEnv(gym.Env): def __init__(self): """Initialize environment.""" super().__init__() - self.action_space = spaces.Discrete(50) - self.observation_space = spaces.Discrete(50) + self.action_space = spaces.Box(np.array([0]), np.array([1])) + self.observation_space = spaces.Box(np.array([0]), np.array([1])) def step(self, action): obs = action @@ -183,17 +185,15 @@ def step(self, action): return obs, reward, done, info def reset(self): - return self.action_space.sample() + return np.array([0]) def test_entropy_wrapper_class(tmpdir, rng): - buffer_size = 15 - entropy_samples = 10 - total_timesteps = 20 - - # TODO expect that our behavior is approximately uniformly distributed + buffer_size = 20 + entropy_samples = 40 + k = 4 - venv = util.make_vec_env("Pendulum-v1", n_envs=1, rng=rng) + venv = DummyVecEnv([ActionIsObsEnv]) rl_algo = sb3.SAC( policy=sb3.sac.policies.SACPolicy, policy_kwargs=dict(), @@ -204,32 +204,24 @@ def test_entropy_wrapper_class(tmpdir, rng): replay_buffer_class=buffers.ReplayBuffer, reward_fn=zero_reward_fn, entropy_as_reward_samples=entropy_samples, + k=k, ), buffer_size=buffer_size, ) - rl_algo.learn(total_timesteps=total_timesteps) - - buffer_path = osp.join(tmpdir, "buffer.pkl") - rl_algo.save_replay_buffer(buffer_path) - replay_buffer_wrapper = load_from_pkl(buffer_path) - replay_buffer = replay_buffer_wrapper.replay_buffer - - # replay_buffer_wrapper.sample(...) 
should return zero-reward transitions - assert buffer_size == replay_buffer_wrapper.size() == replay_buffer.size() - assert (replay_buffer_wrapper.sample(total_timesteps).rewards == 0.0).all() - assert (replay_buffer.sample(total_timesteps).rewards != 0.0).all() # seed=42 - - # replay_buffer_wrapper.pos, replay_buffer_wrapper.full - assert replay_buffer_wrapper.pos == total_timesteps - buffer_size - assert replay_buffer_wrapper.full - - # reset() - replay_buffer_wrapper.reset() - assert 0 == replay_buffer_wrapper.size() == replay_buffer.size() - assert replay_buffer_wrapper.pos == 0 - assert not replay_buffer_wrapper.full + rl_algo.learn(total_timesteps=buffer_size) + initial_entropy = util.compute_state_entropy( + th.Tensor(rl_algo.replay_buffer.observations), + th.Tensor(rl_algo.replay_buffer.observations), + k=k, + ) - # to_torch() - tensor = replay_buffer_wrapper.to_torch(np.ones(42)) - assert type(tensor) is th.Tensor + rl_algo.learn(total_timesteps=entropy_samples - buffer_size) + # Expect that the entropy of our replay buffer is now higher, + # since we trained with that as the reward. + trained_entropy = util.compute_state_entropy( + th.Tensor(rl_algo.replay_buffer.observations), + th.Tensor(rl_algo.replay_buffer.observations), + k=k, + ) + assert trained_entropy.mean() > initial_entropy.mean() From 9410c31f166365e0db849eef2c4d846eb4ee5e2f Mon Sep 17 00:00:00 2001 From: Dan Pandori Date: Fri, 11 Nov 2022 16:41:35 -0800 Subject: [PATCH 12/55] Push final-ish state --- src/imitation/policies/replay_buffer_wrapper.py | 17 ++++++++--------- tests/policies/test_replay_buffer_wrapper.py | 10 +++++----- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/src/imitation/policies/replay_buffer_wrapper.py b/src/imitation/policies/replay_buffer_wrapper.py index 77fde0eec..9da9ac5ea 100644 --- a/src/imitation/policies/replay_buffer_wrapper.py +++ b/src/imitation/policies/replay_buffer_wrapper.py @@ -133,11 +133,6 @@ def __init__( """ # TODO should we limit by number of batches (as this does) # or number of observations returned? - self.sample_count = 0 - self.entropy_as_reward_samples = entropy_as_reward_samples - self.k = k - # TODO support n_envs > 1 - self.entropy_stats = util.RunningMeanAndVar(shape=(1,)) super().__init__( buffer_size, observation_space, @@ -146,14 +141,18 @@ def __init__( reward_fn=reward_fn, **kwargs, ) + self.sample_count = 0 + self.k = k + # TODO support n_envs > 1 + self.entropy_stats = util.RunningMeanAndVar(shape=(1,)) + self.entropy_as_reward_samples = entropy_as_reward_samples - # TODO this seems to never actually get called? def sample(self, *args, **kwargs): self.sample_count += 1 samples = super().sample(*args, **kwargs) - print(self.sample_count) - print(self.entropy_as_reward_samples) - if self.sample_count > 500: + # For some reason self.entropy_as_reward_samples seems to get cleared, + # and I have no idea why. + if self.sample_count > self.entropy_as_reward_samples: return samples # TODO we really ought to reset the reward network once we are done w/ # the entropy based pre-training. 
We also have no reason to train diff --git a/tests/policies/test_replay_buffer_wrapper.py b/tests/policies/test_replay_buffer_wrapper.py index 3753078b1..5d06139aa 100644 --- a/tests/policies/test_replay_buffer_wrapper.py +++ b/tests/policies/test_replay_buffer_wrapper.py @@ -190,7 +190,7 @@ def reset(self): def test_entropy_wrapper_class(tmpdir, rng): buffer_size = 20 - entropy_samples = 40 + entropy_samples = 500 k = 4 venv = DummyVecEnv([ActionIsObsEnv]) @@ -211,8 +211,8 @@ def test_entropy_wrapper_class(tmpdir, rng): rl_algo.learn(total_timesteps=buffer_size) initial_entropy = util.compute_state_entropy( - th.Tensor(rl_algo.replay_buffer.observations), - th.Tensor(rl_algo.replay_buffer.observations), + th.Tensor(rl_algo.replay_buffer.replay_buffer.observations), + th.Tensor(rl_algo.replay_buffer.replay_buffer.observations), k=k, ) @@ -220,8 +220,8 @@ def test_entropy_wrapper_class(tmpdir, rng): # Expect that the entropy of our replay buffer is now higher, # since we trained with that as the reward. trained_entropy = util.compute_state_entropy( - th.Tensor(rl_algo.replay_buffer.observations), - th.Tensor(rl_algo.replay_buffer.observations), + th.Tensor(rl_algo.replay_buffer.replay_buffer.observations), + th.Tensor(rl_algo.replay_buffer.replay_buffer.observations), k=k, ) assert trained_entropy.mean() > initial_entropy.mean() From fdcdf0d898a3a01afc8a0a2e2110115a4825ba1e Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Tue, 29 Nov 2022 16:03:23 +0100 Subject: [PATCH 13/55] #625 refactor RunningMeanAndVar --- .../policies/replay_buffer_wrapper.py | 2 +- src/imitation/util/util.py | 33 ++++++++----------- tests/util/test_util.py | 4 +-- 3 files changed, 17 insertions(+), 22 deletions(-) diff --git a/src/imitation/policies/replay_buffer_wrapper.py b/src/imitation/policies/replay_buffer_wrapper.py index 9da9ac5ea..680026d1d 100644 --- a/src/imitation/policies/replay_buffer_wrapper.py +++ b/src/imitation/policies/replay_buffer_wrapper.py @@ -171,7 +171,7 @@ def sample(self, *args, **kwargs): # Normalize to have mean of 0 and standard deviation of 1 self.entropy_stats.update(entropies) - entropies -= self.entropy_stats.mean + entropies -= self.entropy_stats.running_mean entropies /= self.entropy_stats.std entropies_th = ( diff --git a/src/imitation/util/util.py b/src/imitation/util/util.py index 33b2179c7..3a7ead70e 100644 --- a/src/imitation/util/util.py +++ b/src/imitation/util/util.py @@ -370,30 +370,25 @@ def __init__( device: Optional[str] = None, ) -> None: """Initialize blank mean, variance, count.""" - self.mean = th.zeros(shape, device=device) + self.running_mean = th.zeros(shape, device=device) self.M2 = th.zeros(shape, device=device) self.count = 0 - def update(self, x: th.Tensor) -> None: + def update(self, batch: th.Tensor) -> None: """Update the mean and variance with a batch `x`.""" with th.no_grad(): - batch_mean = th.mean(x, dim=0) - batch_var = th.var(x, dim=0, unbiased=False) - batch_count = x.shape[0] - batch_M2 = batch_var * batch_count - if self.count == 0: - self.count = batch_count - self.mean = batch_mean - self.M2 = batch_M2 - return - - delta = batch_mean - self.mean - total_count = self.count + batch_count - self.mean += delta * batch_count / total_count - - self.M2 += batch_M2 + delta * delta * self.count * batch_count / total_count - - self.count = total_count + batch_mean = th.mean(batch, dim=0) + batch_var = th.var(batch, dim=0, unbiased=False) + batch_count = batch.shape[0] + + delta = batch_mean - self.running_mean + tot_count = self.count + batch_count + 
self.running_mean += delta * batch_count / tot_count + + self.M2 += batch_var * batch_count + self.M2 += th.square(delta) * self.count * batch_count / tot_count + + self.count += batch_count @property def var(self) -> th.Tensor: diff --git a/tests/util/test_util.py b/tests/util/test_util.py index 6f90dd832..6ce2efcc2 100644 --- a/tests/util/test_util.py +++ b/tests/util/test_util.py @@ -127,7 +127,7 @@ def test_RunningMeanAndVar(): first_half = data[:10] running_stats.update(first_half) np.testing.assert_allclose( - running_stats.mean, + running_stats.running_mean, first_half.mean(dim=0), atol=1e-5, rtol=1e-4, @@ -141,7 +141,7 @@ def test_RunningMeanAndVar(): running_stats.update(data[10:]) np.testing.assert_allclose( - running_stats.mean, + running_stats.running_mean, data.mean(dim=0), atol=1e-5, rtol=1e-4, From 0cd12557fef89008763442433cd478044bd3b9b7 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Tue, 29 Nov 2022 17:35:12 +0100 Subject: [PATCH 14/55] #625 use RunningNorm instead of RunningMeanAndVar --- .../policies/replay_buffer_wrapper.py | 9 ++--- src/imitation/util/networks.py | 10 ++--- src/imitation/util/util.py | 40 ------------------- tests/util/test_util.py | 34 ---------------- 4 files changed, 9 insertions(+), 84 deletions(-) diff --git a/src/imitation/policies/replay_buffer_wrapper.py b/src/imitation/policies/replay_buffer_wrapper.py index 680026d1d..539f2e512 100644 --- a/src/imitation/policies/replay_buffer_wrapper.py +++ b/src/imitation/policies/replay_buffer_wrapper.py @@ -9,6 +9,7 @@ from imitation.rewards.reward_function import RewardFn from imitation.util import util +from imitation.util.networks import RunningNorm def _samples_to_reward_fn_input( @@ -144,7 +145,7 @@ def __init__( self.sample_count = 0 self.k = k # TODO support n_envs > 1 - self.entropy_stats = util.RunningMeanAndVar(shape=(1,)) + self.entropy_stats = RunningNorm(1) self.entropy_as_reward_samples = entropy_as_reward_samples def sample(self, *args, **kwargs): @@ -169,10 +170,8 @@ def sample(self, *args, **kwargs): self.k, ) - # Normalize to have mean of 0 and standard deviation of 1 - self.entropy_stats.update(entropies) - entropies -= self.entropy_stats.running_mean - entropies /= self.entropy_stats.std + # Normalize to have mean of 0 and standard deviation of 1 according to running stats + entropies = self.entropy_stats.forward(entropies) entropies_th = ( util.safe_to_tensor(entropies) diff --git a/src/imitation/util/networks.py b/src/imitation/util/networks.py index c27aea2cd..048273656 100644 --- a/src/imitation/util/networks.py +++ b/src/imitation/util/networks.py @@ -126,12 +126,12 @@ def update_stats(self, batch: th.Tensor) -> None: tot_count = self.count + batch_count self.running_mean += delta * batch_count / tot_count - self.running_var *= self.count - self.running_var += batch_var * batch_count - self.running_var += th.square(delta) * self.count * batch_count / tot_count - self.running_var /= tot_count + m_a = self.running_var * self.count + m_b = batch_var * batch_count + M2 = m_a + m_b + th.square(delta) * self.count * batch_count / tot_count + self.running_var = M2 / tot_count - self.count += batch_count + self.count = tot_count class EMANorm(BaseNorm): diff --git a/src/imitation/util/util.py b/src/imitation/util/util.py index 3a7ead70e..df8eb6a6a 100644 --- a/src/imitation/util/util.py +++ b/src/imitation/util/util.py @@ -361,46 +361,6 @@ def get_first_iter_element(iterable: Iterable[T]) -> Tuple[T, Iterable[T]]: return first_element, return_iterable -class RunningMeanAndVar: - 
"""Stores a running mean and variance using Wellford's algorithm.""" - - def __init__( - self, - shape: Tuple[int, ...] = (), - device: Optional[str] = None, - ) -> None: - """Initialize blank mean, variance, count.""" - self.running_mean = th.zeros(shape, device=device) - self.M2 = th.zeros(shape, device=device) - self.count = 0 - - def update(self, batch: th.Tensor) -> None: - """Update the mean and variance with a batch `x`.""" - with th.no_grad(): - batch_mean = th.mean(batch, dim=0) - batch_var = th.var(batch, dim=0, unbiased=False) - batch_count = batch.shape[0] - - delta = batch_mean - self.running_mean - tot_count = self.count + batch_count - self.running_mean += delta * batch_count / tot_count - - self.M2 += batch_var * batch_count - self.M2 += th.square(delta) * self.count * batch_count / tot_count - - self.count += batch_count - - @property - def var(self) -> th.Tensor: - """Returns the unbiased estimate of the variances.""" - return self.M2 / (self.count - 1) - - @property - def std(self) -> th.Tensor: - """Returns the unbiased estimate of the standard deviations.""" - return np.sqrt(self.var) - - def compute_state_entropy( obs: th.Tensor, all_obs: th.Tensor, diff --git a/tests/util/test_util.py b/tests/util/test_util.py index 6ce2efcc2..28678dc8b 100644 --- a/tests/util/test_util.py +++ b/tests/util/test_util.py @@ -120,40 +120,6 @@ def test_tensor_iter_norm(): util.tensor_iter_norm(tensor_list, ord=0.0) -def test_RunningMeanAndVar(): - running_stats = util.RunningMeanAndVar(shape=(3, 4)) - data = th.normal(mean=10 * th.ones(size=(20, 3, 4), dtype=th.double)) - - first_half = data[:10] - running_stats.update(first_half) - np.testing.assert_allclose( - running_stats.running_mean, - first_half.mean(dim=0), - atol=1e-5, - rtol=1e-4, - ) - np.testing.assert_allclose( - running_stats.var, - first_half.var(dim=0), - atol=1e-5, - rtol=1e-4, - ) - - running_stats.update(data[10:]) - np.testing.assert_allclose( - running_stats.running_mean, - data.mean(dim=0), - atol=1e-5, - rtol=1e-4, - ) - np.testing.assert_allclose( - running_stats.var, - data.var(dim=0), - atol=1e-5, - rtol=1e-4, - ) - - def test_compute_state_entropy_1d(): all_obs = th.arange(10, dtype=th.float).unsqueeze(1) obs = all_obs[4:6] From d88ba4441557f1ab7d198f6b1c40c170831ea772 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Tue, 29 Nov 2022 22:52:58 +0100 Subject: [PATCH 15/55] #625 make copy of train_preference_comparisons.py for pebble --- .../train_preference_comparisons_pebble.py | 128 ++++++++ .../train_preference_comparisons_pebble.py | 292 ++++++++++++++++++ 2 files changed, 420 insertions(+) create mode 100644 src/imitation/scripts/config/train_preference_comparisons_pebble.py create mode 100644 src/imitation/scripts/train_preference_comparisons_pebble.py diff --git a/src/imitation/scripts/config/train_preference_comparisons_pebble.py b/src/imitation/scripts/config/train_preference_comparisons_pebble.py new file mode 100644 index 000000000..d6887e066 --- /dev/null +++ b/src/imitation/scripts/config/train_preference_comparisons_pebble.py @@ -0,0 +1,128 @@ +"""Configuration for imitation.scripts.train_preference_comparisons_pebble.""" + +import sacred + +from imitation.algorithms import preference_comparisons +from imitation.scripts.common import common, reward, rl, train + +train_preference_comparisons_pebble_ex = sacred.Experiment( + "train_preference_comparisons_pebble", + ingredients=[ + common.common_ingredient, + reward.reward_ingredient, + rl.rl_ingredient, + train.train_ingredient, + ], +) + + 
+MUJOCO_SHARED_LOCALS = dict(rl=dict(rl_kwargs=dict(ent_coef=0.1))) +ANT_SHARED_LOCALS = dict( + total_timesteps=int(3e7), + rl=dict(batch_size=16384), +) + + +@train_preference_comparisons_pebble_ex.config +def train_defaults(): + fragment_length = 100 # timesteps per fragment used for comparisons + total_timesteps = int(1e6) # total number of environment timesteps + total_comparisons = 5000 # total number of comparisons to elicit + num_iterations = 5 # Arbitrary, should be tuned for the task + comparison_queue_size = None + # factor by which to oversample transitions before creating fragments + transition_oversampling = 1 + # fraction of total_comparisons that will be sampled right at the beginning + initial_comparison_frac = 0.1 + # fraction of sampled trajectories that will include some random actions + exploration_frac = 0.0 + preference_model_kwargs = {} + reward_trainer_kwargs = { + "epochs": 3, + } + save_preferences = False # save preference dataset at the end? + agent_path = None # path to a (partially) trained agent to load at the beginning + # type of PreferenceGatherer to use + gatherer_cls = preference_comparisons.SyntheticGatherer + # arguments passed on to the PreferenceGatherer specified by gatherer_cls + gatherer_kwargs = {} + active_selection = False + active_selection_oversampling = 2 + uncertainty_on = "logit" + fragmenter_kwargs = { + "warning_threshold": 0, + } + # path to a pickled sequence of trajectories used instead of training an agent + trajectory_path = None + trajectory_generator_kwargs = {} # kwargs to pass to trajectory generator + allow_variable_horizon = False + + checkpoint_interval = 0 # Num epochs between saving (<0 disables, =0 final only) + query_schedule = "hyperbolic" + + +@train_preference_comparisons_pebble_ex.named_config +def cartpole(): + common = dict(env_name="CartPole-v1") + allow_variable_horizon = True + + +@train_preference_comparisons_pebble_ex.named_config +def seals_ant(): + locals().update(**MUJOCO_SHARED_LOCALS) + locals().update(**ANT_SHARED_LOCALS) + common = dict(env_name="seals/Ant-v0") + + +@train_preference_comparisons_pebble_ex.named_config +def half_cheetah(): + locals().update(**MUJOCO_SHARED_LOCALS) + common = dict(env_name="HalfCheetah-v2") + rl = dict(batch_size=16384, rl_kwargs=dict(batch_size=1024)) + + +@train_preference_comparisons_pebble_ex.named_config +def seals_hopper(): + locals().update(**MUJOCO_SHARED_LOCALS) + common = dict(env_name="seals/Hopper-v0") + + +@train_preference_comparisons_pebble_ex.named_config +def seals_humanoid(): + locals().update(**MUJOCO_SHARED_LOCALS) + common = dict(env_name="seals/Humanoid-v0") + total_timesteps = int(4e6) + + +@train_preference_comparisons_pebble_ex.named_config +def seals_cartpole(): + common = dict(env_name="seals/CartPole-v0") + + +@train_preference_comparisons_pebble_ex.named_config +def pendulum(): + common = dict(env_name="Pendulum-v1") + + +@train_preference_comparisons_pebble_ex.named_config +def mountain_car(): + common = dict(env_name="MountainCar-v0") + allow_variable_horizon = True + + +@train_preference_comparisons_pebble_ex.named_config +def seals_mountain_car(): + common = dict(env_name="seals/MountainCar-v0") + + +@train_preference_comparisons_pebble_ex.named_config +def fast(): + # Minimize the amount of computation. Useful for test cases. 
+ total_timesteps = 50 + total_comparisons = 5 + initial_comparison_frac = 0.2 + num_iterations = 1 + fragment_length = 2 + reward_trainer_kwargs = { + "epochs": 1, + } diff --git a/src/imitation/scripts/train_preference_comparisons_pebble.py b/src/imitation/scripts/train_preference_comparisons_pebble.py new file mode 100644 index 000000000..f34eefb9d --- /dev/null +++ b/src/imitation/scripts/train_preference_comparisons_pebble.py @@ -0,0 +1,292 @@ +"""Train a reward model using preference comparisons. + +Can be used as a CLI script, or the `train_preference_comparisons` function +can be called directly. +""" + +import functools +import pathlib +from typing import Any, Mapping, Optional, Type, Union + +import torch as th +from sacred.observers import FileStorageObserver +from stable_baselines3.common import type_aliases + +from imitation.algorithms import preference_comparisons +from imitation.data import types +from imitation.policies import serialize +from imitation.scripts.common import common, reward +from imitation.scripts.common import rl as rl_common +from imitation.scripts.common import train +from imitation.scripts.config.train_preference_comparisons_pebble import ( + train_preference_comparisons_pebble_ex, +) + + +def save_model( + agent_trainer: preference_comparisons.AgentTrainer, + save_path: pathlib.Path, +): + """Save the model as `model.zip`.""" + serialize.save_stable_model( + output_dir=save_path / "policy", + model=agent_trainer.algorithm, + ) + + +def save_checkpoint( + trainer: preference_comparisons.PreferenceComparisons, + save_path: pathlib.Path, + allow_save_policy: Optional[bool], +): + """Save reward model and optionally policy.""" + save_path.mkdir(parents=True, exist_ok=True) + th.save(trainer.model, save_path / "reward_net.pt") + if allow_save_policy: + # Note: We should only save the model as model.zip if `trajectory_generator` + # contains one. Currently we are slightly over-conservative, by requiring + # that an AgentTrainer be used if we're saving the policy. + assert isinstance( + trainer.trajectory_generator, + preference_comparisons.AgentTrainer, + ) + save_model(trainer.trajectory_generator, save_path) + else: + trainer.logger.warn( + "trainer.trajectory_generator doesn't contain a policy to save.", + ) + + +@train_preference_comparisons_pebble_ex.main +def train_preference_comparisons( + total_timesteps: int, + total_comparisons: int, + num_iterations: int, + comparison_queue_size: Optional[int], + fragment_length: int, + transition_oversampling: float, + initial_comparison_frac: float, + exploration_frac: float, + trajectory_path: Optional[str], + trajectory_generator_kwargs: Mapping[str, Any], + save_preferences: bool, + agent_path: Optional[str], + preference_model_kwargs: Mapping[str, Any], + reward_trainer_kwargs: Mapping[str, Any], + gatherer_cls: Type[preference_comparisons.PreferenceGatherer], + gatherer_kwargs: Mapping[str, Any], + active_selection: bool, + active_selection_oversampling: int, + uncertainty_on: str, + fragmenter_kwargs: Mapping[str, Any], + allow_variable_horizon: bool, + checkpoint_interval: int, + query_schedule: Union[str, type_aliases.Schedule], +) -> Mapping[str, Any]: + """Train a reward model using preference comparisons. + + Args: + total_timesteps: number of environment interaction steps + total_comparisons: number of preferences to gather in total + num_iterations: number of times to train the agent against the reward model + and then train the reward model against newly gathered preferences. 
+ comparison_queue_size: the maximum number of comparisons to keep in the + queue for training the reward model. If None, the queue will grow + without bound as new comparisons are added. + fragment_length: number of timesteps per fragment that is used to elicit + preferences + transition_oversampling: factor by which to oversample transitions before + creating fragments. Since fragments are sampled with replacement, + this is usually chosen > 1 to avoid having the same transition + in too many fragments. + initial_comparison_frac: fraction of total_comparisons that will be + sampled before the rest of training begins (using the randomly initialized + agent). This can be used to pretrain the reward model before the agent + is trained on the learned reward. + exploration_frac: fraction of trajectory samples that will be created using + partially random actions, rather than the current policy. Might be helpful + if the learned policy explores too little and gets stuck with a wrong + reward. + trajectory_path: either None, in which case an agent will be trained + and used to sample trajectories on the fly, or a path to a pickled + sequence of TrajectoryWithRew to be trained on. + trajectory_generator_kwargs: kwargs to pass to the trajectory generator. + save_preferences: if True, store the final dataset of preferences to disk. + agent_path: if given, initialize the agent using this stored policy + rather than randomly. + preference_model_kwargs: passed to PreferenceModel + reward_trainer_kwargs: passed to BasicRewardTrainer or EnsembleRewardTrainer + gatherer_cls: type of PreferenceGatherer to use (defaults to SyntheticGatherer) + gatherer_kwargs: passed to the PreferenceGatherer specified by gatherer_cls + active_selection: use active selection fragmenter instead of random fragmenter + active_selection_oversampling: factor by which to oversample random fragments + from the base fragmenter of active selection. + this is usually chosen > 1 to allow the active selection algorithm to pick + fragment pairs with highest uncertainty. = 1 implies no active selection. + uncertainty_on: passed to ActiveSelectionFragmenter + fragmenter_kwargs: passed to RandomFragmenter + allow_variable_horizon: If False (default), algorithm will raise an + exception if it detects trajectories of different length during + training. If True, overrides this safety check. WARNING: variable + horizon episodes leak information about the reward via termination + condition, and can seriously confound evaluation. Read + https://imitation.readthedocs.io/en/latest/guide/variable_horizon.html + before overriding this. + checkpoint_interval: Save the reward model and policy models (if + trajectory_generator contains a policy) every `checkpoint_interval` + iterations and after training is complete. If 0, then only save weights + after training is complete. If <0, then don't save weights at all. + query_schedule: one of ("constant", "hyperbolic", "inverse_quadratic"). + A function indicating how the total number of preference queries should + be allocated to each iteration. "hyperbolic" and "inverse_quadratic" + apportion fewer queries to later iterations when the policy is assumed + to be better and more stable. + + Returns: + Rollout statistics from trained policy. + + Raises: + ValueError: Inconsistency between config and deserialized policy normalization. 
+ """ + custom_logger, log_dir = common.setup_logging() + rng = common.make_rng() + + with common.make_venv() as venv: + reward_net = reward.make_reward_net(venv) + relabel_reward_fn = functools.partial( + reward_net.predict_processed, + update_stats=False, + ) + if agent_path is None: + agent = rl_common.make_rl_algo(venv, relabel_reward_fn=relabel_reward_fn) + else: + agent = rl_common.load_rl_algo_from_path( + agent_path=agent_path, + venv=venv, + relabel_reward_fn=relabel_reward_fn, + ) + + if trajectory_path is None: + # Setting the logger here is not necessary (PreferenceComparisons takes care + # of it automatically) but it avoids creating unnecessary loggers. + agent_trainer = preference_comparisons.AgentTrainer( + algorithm=agent, + reward_fn=reward_net, + venv=venv, + exploration_frac=exploration_frac, + rng=rng, + custom_logger=custom_logger, + **trajectory_generator_kwargs, + ) + # Stable Baselines will automatically occupy GPU 0 if it is available. + # Let's use the same device as the SB3 agent for the reward model. + reward_net = reward_net.to(agent_trainer.algorithm.device) + trajectory_generator: preference_comparisons.TrajectoryGenerator = ( + agent_trainer + ) + else: + if exploration_frac > 0: + raise ValueError( + "exploration_frac can't be set when a trajectory dataset is used", + ) + trajectory_generator = preference_comparisons.TrajectoryDataset( + trajectories=types.load_with_rewards(trajectory_path), + rng=rng, + custom_logger=custom_logger, + **trajectory_generator_kwargs, + ) + + fragmenter: preference_comparisons.Fragmenter = ( + preference_comparisons.RandomFragmenter( + **fragmenter_kwargs, + rng=rng, + custom_logger=custom_logger, + ) + ) + preference_model = preference_comparisons.PreferenceModel( + **preference_model_kwargs, + model=reward_net, + ) + if active_selection: + fragmenter = preference_comparisons.ActiveSelectionFragmenter( + preference_model=preference_model, + base_fragmenter=fragmenter, + fragment_sample_factor=active_selection_oversampling, + uncertainty_on=uncertainty_on, + custom_logger=custom_logger, + ) + gatherer = gatherer_cls( + **gatherer_kwargs, + rng=rng, + custom_logger=custom_logger, + ) + + loss = preference_comparisons.CrossEntropyRewardLoss() + + reward_trainer = preference_comparisons._make_reward_trainer( + preference_model, + loss, + rng, + reward_trainer_kwargs, + ) + + main_trainer = preference_comparisons.PreferenceComparisons( + trajectory_generator, + reward_net, + num_iterations=num_iterations, + fragmenter=fragmenter, + preference_gatherer=gatherer, + reward_trainer=reward_trainer, + comparison_queue_size=comparison_queue_size, + fragment_length=fragment_length, + transition_oversampling=transition_oversampling, + initial_comparison_frac=initial_comparison_frac, + custom_logger=custom_logger, + allow_variable_horizon=allow_variable_horizon, + query_schedule=query_schedule, + ) + + def save_callback(iteration_num): + if checkpoint_interval > 0 and iteration_num % checkpoint_interval == 0: + save_checkpoint( + trainer=main_trainer, + save_path=log_dir / "checkpoints" / f"{iteration_num:04d}", + allow_save_policy=bool(trajectory_path is None), + ) + + results = main_trainer.train( + total_timesteps, + total_comparisons, + callback=save_callback, + ) + + # Storing and evaluating policy only useful if we generated trajectory data + if bool(trajectory_path is None): + results = dict(results) + results["rollout"] = train.eval_policy(agent, venv) + + if save_preferences: + main_trainer.dataset.save(log_dir / 
"preferences.pkl") + + # Save final artifacts. + if checkpoint_interval >= 0: + save_checkpoint( + trainer=main_trainer, + save_path=log_dir / "checkpoints" / "final", + allow_save_policy=bool(trajectory_path is None), + ) + + return results + + +def main_console(): + observer_path = ( + pathlib.Path.cwd() / "output" / "sacred" / "train_preference_comparisons_pebble" + ) + observer = FileStorageObserver(observer_path) + train_preference_comparisons_pebble_ex.observers.append(observer) + train_preference_comparisons_pebble_ex.run_commandline() + + +if __name__ == "__main__": # pragma: no cover + main_console() From 2d836deebebf79731a0ecfa13acd1a154730f302 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Wed, 30 Nov 2022 00:40:00 +0100 Subject: [PATCH 16/55] #625 use an OffPolicy for pebble --- .../train_preference_comparisons_pebble.py | 34 ++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/src/imitation/scripts/config/train_preference_comparisons_pebble.py b/src/imitation/scripts/config/train_preference_comparisons_pebble.py index d6887e066..3dde185b5 100644 --- a/src/imitation/scripts/config/train_preference_comparisons_pebble.py +++ b/src/imitation/scripts/config/train_preference_comparisons_pebble.py @@ -1,8 +1,12 @@ """Configuration for imitation.scripts.train_preference_comparisons_pebble.""" +import warnings + import sacred +import stable_baselines3 as sb3 from imitation.algorithms import preference_comparisons +from imitation.policies import base from imitation.scripts.common import common, reward, rl, train train_preference_comparisons_pebble_ex = sacred.Experiment( @@ -15,7 +19,6 @@ ], ) - MUJOCO_SHARED_LOCALS = dict(rl=dict(rl_kwargs=dict(ent_coef=0.1))) ANT_SHARED_LOCALS = dict( total_timesteps=int(3e7), @@ -23,6 +26,35 @@ ) +@rl.rl_ingredient.config +def rl_sac(): + # For recommended SAC hyperparams in each environment, see: + # https://github.com/DLR-RM/rl-baselines3-zoo/blob/master/hyperparams/sac.yml + rl_cls = sb3.SAC + warnings.warn( + "SAC currently only supports continuous action spaces. 
" + "Consider adding a discrete version as mentioned here: " + "https://github.com/DLR-RM/stable-baselines3/issues/505", + category=RuntimeWarning, + ) + # Default HPs are as follows: + batch_size = 256 # batch size for RL algorithm + rl_kwargs = dict(batch_size=None) # make sure to set batch size to None + locals() # quieten flake8 + + +@train.train_ingredient.config +def train_sac(): + policy_cls = base.SAC1024Policy # noqa: F841 + locals() # quieten flake8 + + +@common.common_ingredient.config +def mountain_car(): + env_name = "MountainCarContinuous-v0" + locals() # quieten flake8 + + @train_preference_comparisons_pebble_ex.config def train_defaults(): fragment_length = 100 # timesteps per fragment used for comparisons From ec5f67e986e105816ff6f4f77145dd45e7bb5be8 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Wed, 30 Nov 2022 14:23:02 +0100 Subject: [PATCH 17/55] #625 fix assumptions about shapes in ReplayBufferEntropyRewardWrapper --- .../policies/replay_buffer_wrapper.py | 24 ++++++++----------- .../train_preference_comparisons_pebble.py | 2 +- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/src/imitation/policies/replay_buffer_wrapper.py b/src/imitation/policies/replay_buffer_wrapper.py index 539f2e512..5a55b80bf 100644 --- a/src/imitation/policies/replay_buffer_wrapper.py +++ b/src/imitation/policies/replay_buffer_wrapper.py @@ -163,26 +163,22 @@ def sample(self, *args, **kwargs): all_obs = self.observations else: all_obs = self.observations[: self.pos] + # super().sample() flattens the venv dimension, let's do it too + all_obs = all_obs.reshape((-1, *self.obs_shape)) entropies = util.compute_state_entropy( - # TODO support multiple environments - samples.observations.unsqueeze(1), - all_obs, + samples.observations, + all_obs.reshape((-1, *self.obs_shape)), self.k, ) # Normalize to have mean of 0 and standard deviation of 1 according to running stats entropies = self.entropy_stats.forward(entropies) - - entropies_th = ( - util.safe_to_tensor(entropies) - .reshape(samples.rewards.shape) - .to(samples.rewards.device) - ) + assert entropies.shape == samples.rewards.shape return ReplayBufferSamples( - samples.observations, - samples.actions, - samples.next_observations, - samples.dones, - entropies_th, + observations=samples.observations, + actions=samples.actions, + next_observations=samples.next_observations, + dones=samples.dones, + rewards=entropies, ) diff --git a/src/imitation/scripts/config/train_preference_comparisons_pebble.py b/src/imitation/scripts/config/train_preference_comparisons_pebble.py index 3dde185b5..e65f38e37 100644 --- a/src/imitation/scripts/config/train_preference_comparisons_pebble.py +++ b/src/imitation/scripts/config/train_preference_comparisons_pebble.py @@ -50,7 +50,7 @@ def train_sac(): @common.common_ingredient.config -def mountain_car(): +def common_mountain_car_continuous(): env_name = "MountainCarContinuous-v0" locals() # quieten flake8 From da228bd8d37b093d79a7bf1dede0ebc92bf89daa Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 00:00:01 +0100 Subject: [PATCH 18/55] #625 entropy reward as a function --- .../algorithms/pebble/entropy_reward.py | 44 ++++++++++++ .../policies/replay_buffer_wrapper.py | 32 ++++++++- src/imitation/util/networks.py | 3 + src/imitation/util/util.py | 19 +++-- .../algorithms/pebble/test_entropy_reward.py | 70 +++++++++++++++++++ tests/policies/test_replay_buffer_wrapper.py | 39 +++++++++++ tests/util/test_util.py | 12 ++++ 7 files changed, 211 insertions(+), 8 deletions(-) create mode 
100644 src/imitation/algorithms/pebble/entropy_reward.py create mode 100644 tests/algorithms/pebble/test_entropy_reward.py diff --git a/src/imitation/algorithms/pebble/entropy_reward.py b/src/imitation/algorithms/pebble/entropy_reward.py new file mode 100644 index 000000000..724fbf314 --- /dev/null +++ b/src/imitation/algorithms/pebble/entropy_reward.py @@ -0,0 +1,44 @@ +import numpy as np +import torch as th +from gym.vector.utils import spaces +from stable_baselines3.common.preprocessing import get_obs_shape + +from imitation.policies.replay_buffer_wrapper import ReplayBufferView +from imitation.rewards.reward_function import RewardFn +from imitation.util import util +from imitation.util.networks import RunningNorm + + +class StateEntropyReward(RewardFn): + def __init__(self, nearest_neighbor_k: int, observation_space: spaces.Space): + self.nearest_neighbor_k = nearest_neighbor_k + # TODO support n_envs > 1 + self.entropy_stats = RunningNorm(1) + self.obs_shape = get_obs_shape(observation_space) + self.replay_buffer_view = ReplayBufferView( + np.empty(0, dtype=observation_space.dtype), lambda: slice(0) + ) + + def set_buffer_view(self, replay_buffer_view: ReplayBufferView): + self.replay_buffer_view = replay_buffer_view + + def __call__( + self, + state: np.ndarray, + action: np.ndarray, + next_state: np.ndarray, + done: np.ndarray, + ) -> np.ndarray: + # TODO: should this work with torch instead of numpy internally? + # (The RewardFn protocol requires numpy) + + all_observations = self.replay_buffer_view.observations + # ReplayBuffer sampling flattens the venv dimension, let's adapt to that + all_observations = all_observations.reshape((-1, *self.obs_shape)) + entropies = util.compute_state_entropy( + state, + all_observations, + self.nearest_neighbor_k, + ) + normalized_entropies = self.entropy_stats.forward(th.as_tensor(entropies)) + return normalized_entropies.numpy() diff --git a/src/imitation/policies/replay_buffer_wrapper.py b/src/imitation/policies/replay_buffer_wrapper.py index 5a55b80bf..477fb97b2 100644 --- a/src/imitation/policies/replay_buffer_wrapper.py +++ b/src/imitation/policies/replay_buffer_wrapper.py @@ -24,6 +24,29 @@ def _samples_to_reward_fn_input( ) +class ReplayBufferView: + """A read-only view over a valid records in a ReplayBuffer. 
+ + Args: + observations_buffer: Array buffer holding observations + buffer_slice_provider: Function returning slice of buffer + with valid observations + """ + + def __init__( + self, + observations_buffer: np.ndarray, + buffer_slice_provider: Callable[[], slice], + ): + self._observations_buffer = observations_buffer.view() + self._observations_buffer.flags.writeable = False + self._buffer_slice_provider = buffer_slice_provider + + @property + def observations(self): + return self._observations_buffer[self._buffer_slice_provider()] + + class ReplayBufferRewardWrapper(ReplayBuffer): """Relabel the rewards in transitions sampled from a ReplayBuffer.""" @@ -79,6 +102,13 @@ def full(self) -> bool: def full(self, full: bool): self.replay_buffer.full = full + @property + def buffer_view(self) -> ReplayBufferView: + def valid_buffer_slice(): + return slice(None) if self.full else slice(self.pos) + + return ReplayBufferView(self.replay_buffer.observations, valid_buffer_slice) + def sample(self, *args, **kwargs): samples = self.replay_buffer.sample(*args, **kwargs) rewards = self.reward_fn(**_samples_to_reward_fn_input(samples)) @@ -167,7 +197,7 @@ def sample(self, *args, **kwargs): all_obs = all_obs.reshape((-1, *self.obs_shape)) entropies = util.compute_state_entropy( samples.observations, - all_obs.reshape((-1, *self.obs_shape)), + all_obs, self.k, ) diff --git a/src/imitation/util/networks.py b/src/imitation/util/networks.py index 048273656..e9564ca44 100644 --- a/src/imitation/util/networks.py +++ b/src/imitation/util/networks.py @@ -86,6 +86,9 @@ def forward(self, x: th.Tensor) -> th.Tensor: with th.no_grad(): self.update_stats(x) + return self.normalize(x) + + def normalize(self, x: th.Tensor) -> th.Tensor: # Note: this is different from the behavior in stable-baselines, see # https://github.com/HumanCompatibleAI/imitation/issues/442 return (x - self.running_mean) / th.sqrt(self.running_var + self.eps) diff --git a/src/imitation/util/util.py b/src/imitation/util/util.py index df8eb6a6a..d88f775cd 100644 --- a/src/imitation/util/util.py +++ b/src/imitation/util/util.py @@ -362,10 +362,10 @@ def get_first_iter_element(iterable: Iterable[T]) -> Tuple[T, Iterable[T]]: def compute_state_entropy( - obs: th.Tensor, - all_obs: th.Tensor, + obs: np.ndarray, + all_obs: np.ndarray, k: int, -) -> th.Tensor: +) -> np.ndarray: """Compute the state entropy given by KNN distance. Args: @@ -379,14 +379,19 @@ def compute_state_entropy( assert obs.shape[1:] == all_obs.shape[1:] with th.no_grad(): non_batch_dimensions = tuple(range(2, len(obs.shape) + 1)) - distances_tensor = th.linalg.vector_norm( + distances_tensor = np.linalg.norm( obs[:, None] - all_obs[None, :], - dim=non_batch_dimensions, + axis=non_batch_dimensions, ord=2, ) # Note that we take the k+1'th value because the closest neighbor to # a point is itself, which we want to skip. 
- knn_dists = th.kthvalue(distances_tensor, k=k + 1, dim=1).values + knn_dists = kth_value(distances_tensor, k+1) state_entropy = knn_dists - return state_entropy.unsqueeze(1) + return np.expand_dims(state_entropy, axis=1) + + +def kth_value(x: np.ndarray, k: int): + assert k > 0 + return np.partition(x, k - 1, axis=-1)[..., k - 1] diff --git a/tests/algorithms/pebble/test_entropy_reward.py b/tests/algorithms/pebble/test_entropy_reward.py new file mode 100644 index 000000000..777a9b9d6 --- /dev/null +++ b/tests/algorithms/pebble/test_entropy_reward.py @@ -0,0 +1,70 @@ +from unittest.mock import patch + +import numpy as np +import torch as th +from gym.spaces import Discrete +from stable_baselines3.common.preprocessing import get_obs_shape + +from imitation.algorithms.pebble.entropy_reward import StateEntropyReward +from imitation.policies.replay_buffer_wrapper import ReplayBufferView +from imitation.util import util + +SPACE = Discrete(4) +PLACEHOLDER = np.empty(get_obs_shape(SPACE)) + +BUFFER_SIZE = 20 +K = 4 +BATCH_SIZE = 8 +VENVS = 2 + + +def test_state_entropy_reward_returns_entropy(rng): + obs_shape = get_obs_shape(SPACE) + all_observations = rng.random((BUFFER_SIZE, VENVS, *obs_shape)) + + reward_fn = StateEntropyReward(K, SPACE) + reward_fn.set_buffer_view(ReplayBufferView(all_observations, lambda: slice(None))) + + # Act + observations = rng.random((BATCH_SIZE, *obs_shape)) + reward = reward_fn(observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) + + # Assert + expected = util.compute_state_entropy( + observations, all_observations.reshape(-1, *obs_shape), K + ) + expected_normalized = reward_fn.entropy_stats.normalize(th.as_tensor(expected)).numpy() + np.testing.assert_allclose(reward, expected_normalized) + + +def test_state_entropy_reward_returns_normalized_values(): + with patch("imitation.util.util.compute_state_entropy") as m: + # mock entropy computation so that we can test only stats collection in this test + m.side_effect = lambda obs, all_obs, k: obs + + reward_fn = StateEntropyReward(K, SPACE) + all_observations = np.empty((BUFFER_SIZE, VENVS, *get_obs_shape(SPACE))) + reward_fn.set_buffer_view( + ReplayBufferView(all_observations, lambda: slice(None)) + ) + + dim = 8 + shift = 3 + scale = 2 + + # Act + for _ in range(1000): + state = th.randn(dim) * scale + shift + reward_fn(state, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) + + normalized_reward = reward_fn( + np.zeros(dim), PLACEHOLDER, PLACEHOLDER, PLACEHOLDER + ) + + # Assert + np.testing.assert_allclose( + normalized_reward, + np.repeat(-shift / scale, dim), + rtol=0.05, + atol=0.05, + ) diff --git a/tests/policies/test_replay_buffer_wrapper.py b/tests/policies/test_replay_buffer_wrapper.py index 5d06139aa..668208b58 100644 --- a/tests/policies/test_replay_buffer_wrapper.py +++ b/tests/policies/test_replay_buffer_wrapper.py @@ -2,6 +2,7 @@ import os.path as osp from typing import Type +from unittest.mock import Mock import gym import numpy as np @@ -10,7 +11,9 @@ import torch as th from gym import spaces from stable_baselines3.common import buffers, off_policy_algorithm, policies +from stable_baselines3.common.buffers import ReplayBuffer from stable_baselines3.common.policies import BasePolicy +from stable_baselines3.common.preprocessing import get_obs_shape, get_action_dim from stable_baselines3.common.save_util import load_from_pkl from stable_baselines3.common.vec_env import DummyVecEnv @@ -225,3 +228,39 @@ def test_entropy_wrapper_class(tmpdir, rng): k=k, ) assert trained_entropy.mean() > initial_entropy.mean() + + 
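A side note on the numpy port of compute_state_entropy above: kth_value relies on np.partition, and the state "entropy" used as the intrinsic reward is simply the distance to the k-th nearest neighbour among the buffered observations, taking the (k+1)-th smallest distance because a buffered point's nearest neighbour is itself. A rough standalone sketch of that idea for flat (vector) observations — the helper name and shapes here are illustrative, not taken from the patch:

import numpy as np

def knn_state_entropy(obs: np.ndarray, all_obs: np.ndarray, k: int) -> np.ndarray:
    # Pairwise L2 distances between the query states and every buffered observation.
    dists = np.linalg.norm(obs[:, None, :] - all_obs[None, :, :], axis=-1)
    # Take the (k+1)-th smallest distance per query state: the smallest one is the
    # zero self-distance whenever the query state is itself in the buffer.
    return np.partition(dists, k, axis=1)[:, k]

rng = np.random.default_rng(0)
buffer = rng.normal(size=(128, 3))
entropies = knn_state_entropy(buffer[:8], buffer, k=5)
assert entropies.shape == (8,)

In the patches themselves these distances are then normalized by a RunningNorm instance before being used as rewards.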
+def test_replay_buffer_view_provides_buffered_observations(): + space = spaces.Box(np.array([0]), np.array([5])) + n_envs = 2 + buffer_size = 10 + action = np.empty((n_envs, get_action_dim(space))) + + obs_shape = get_obs_shape(space) + wrapper = ReplayBufferRewardWrapper( + buffer_size, + space, + space, + replay_buffer_class=ReplayBuffer, + reward_fn=Mock(), + n_envs=n_envs, + handle_timeout_termination=False, + ) + view = wrapper.buffer_view + + # initially empty + assert len(view.observations) == 0 + + # after adding observation + obs1 = np.random.random((n_envs, *obs_shape)) + wrapper.add(obs1, obs1, action, np.empty(n_envs), np.empty(n_envs), []) + np.testing.assert_allclose(view.observations, np.array([obs1])) + + # after filling buffer + observations = np.random.random((buffer_size // n_envs, n_envs, *obs_shape)) + for obs in observations: + wrapper.add(obs, obs, action, np.empty(n_envs), np.empty(n_envs), []) + + # ReplayBuffer internally uses a circular buffer + expected = np.roll(observations, 1, axis=0) + np.testing.assert_allclose(view.observations, expected) diff --git a/tests/util/test_util.py b/tests/util/test_util.py index 28678dc8b..be2487aee 100644 --- a/tests/util/test_util.py +++ b/tests/util/test_util.py @@ -11,6 +11,7 @@ from imitation.util import sacred as sacred_util from imitation.util import util +from imitation.util.util import kth_value def test_endless_iter(): @@ -144,3 +145,14 @@ def test_compute_state_entropy_2d(): util.compute_state_entropy(obs, all_obs, k=3), np.sqrt(20**2 + 2**2), ) + + +def test_kth_value(): + arr1 = np.arange(0, 10, 1) + np.random.shuffle(arr1) + arr2 = np.arange(0, 100, 10) + np.random.shuffle(arr2) + arr = np.stack([arr1, arr2]) + + result = kth_value(arr, 3) + np.testing.assert_array_equal(result, np.array([2, 20])) From 1ec645ae7b2bc54fef31e8dd40e951e005e80f4c Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 01:20:00 +0100 Subject: [PATCH 19/55] #625 make entropy reward serializable with pickle --- .../algorithms/pebble/entropy_reward.py | 16 +++++++++-- .../policies/replay_buffer_wrapper.py | 1 + .../algorithms/pebble/test_entropy_reward.py | 28 +++++++++++++++++-- 3 files changed, 41 insertions(+), 4 deletions(-) diff --git a/src/imitation/algorithms/pebble/entropy_reward.py b/src/imitation/algorithms/pebble/entropy_reward.py index 724fbf314..a1fff0e46 100644 --- a/src/imitation/algorithms/pebble/entropy_reward.py +++ b/src/imitation/algorithms/pebble/entropy_reward.py @@ -14,13 +14,14 @@ def __init__(self, nearest_neighbor_k: int, observation_space: spaces.Space): self.nearest_neighbor_k = nearest_neighbor_k # TODO support n_envs > 1 self.entropy_stats = RunningNorm(1) + self.observation_space = observation_space self.obs_shape = get_obs_shape(observation_space) self.replay_buffer_view = ReplayBufferView( np.empty(0, dtype=observation_space.dtype), lambda: slice(0) ) - def set_buffer_view(self, replay_buffer_view: ReplayBufferView): - self.replay_buffer_view = replay_buffer_view + def set_replay_buffer(self, replay_buffer: ReplayBufferView): + self.replay_buffer_view = replay_buffer def __call__( self, @@ -42,3 +43,14 @@ def __call__( ) normalized_entropies = self.entropy_stats.forward(th.as_tensor(entropies)) return normalized_entropies.numpy() + + def __getstate__(self): + state = self.__dict__.copy() + del state["replay_buffer_view"] + return state + + def __setstate__(self, state): + self.__dict__.update(state) + self.replay_buffer_view = ReplayBufferView( + np.empty(0, self.observation_space.dtype), 
lambda: slice(0) + ) diff --git a/src/imitation/policies/replay_buffer_wrapper.py b/src/imitation/policies/replay_buffer_wrapper.py index 477fb97b2..a7d548165 100644 --- a/src/imitation/policies/replay_buffer_wrapper.py +++ b/src/imitation/policies/replay_buffer_wrapper.py @@ -10,6 +10,7 @@ from imitation.rewards.reward_function import RewardFn from imitation.util import util from imitation.util.networks import RunningNorm +from typing import Callable def _samples_to_reward_fn_input( diff --git a/tests/algorithms/pebble/test_entropy_reward.py b/tests/algorithms/pebble/test_entropy_reward.py index 777a9b9d6..5571c304f 100644 --- a/tests/algorithms/pebble/test_entropy_reward.py +++ b/tests/algorithms/pebble/test_entropy_reward.py @@ -1,3 +1,4 @@ +import pickle from unittest.mock import patch import numpy as np @@ -33,7 +34,9 @@ def test_state_entropy_reward_returns_entropy(rng): expected = util.compute_state_entropy( observations, all_observations.reshape(-1, *obs_shape), K ) - expected_normalized = reward_fn.entropy_stats.normalize(th.as_tensor(expected)).numpy() + expected_normalized = reward_fn.entropy_stats.normalize( + th.as_tensor(expected) + ).numpy() np.testing.assert_allclose(reward, expected_normalized) @@ -44,7 +47,7 @@ def test_state_entropy_reward_returns_normalized_values(): reward_fn = StateEntropyReward(K, SPACE) all_observations = np.empty((BUFFER_SIZE, VENVS, *get_obs_shape(SPACE))) - reward_fn.set_buffer_view( + reward_fn.set_replay_buffer( ReplayBufferView(all_observations, lambda: slice(None)) ) @@ -68,3 +71,24 @@ def test_state_entropy_reward_returns_normalized_values(): rtol=0.05, atol=0.05, ) + + +def test_state_entropy_reward_can_pickle(): + all_observations = np.empty((BUFFER_SIZE, VENVS, *get_obs_shape(SPACE))) + replay_buffer = ReplayBufferView(all_observations, lambda: slice(None)) + + obs1 = np.random.rand(VENVS, *get_obs_shape(SPACE)) + reward_fn = StateEntropyReward(K, SPACE) + reward_fn.set_replay_buffer(replay_buffer) + reward_fn(obs1, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) + + # Act + pickled = pickle.dumps(reward_fn) + reward_fn_deserialized = pickle.loads(pickled) + reward_fn_deserialized.set_replay_buffer(replay_buffer) + + # Assert + obs2 = np.random.rand(VENVS, *get_obs_shape(SPACE)) + expected_result = reward_fn(obs2, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) + actual_result = reward_fn_deserialized(obs2, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) + np.testing.assert_allclose(actual_result, expected_result) From 4e16c424c6a8ffda4e046b3427b096d0a6de1783 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 14:24:23 +0100 Subject: [PATCH 20/55] #625 revert change of compute_state_entropy() from tensors to numpy --- src/imitation/util/util.py | 20 ++++++++------------ tests/util/test_util.py | 11 ----------- 2 files changed, 8 insertions(+), 23 deletions(-) diff --git a/src/imitation/util/util.py b/src/imitation/util/util.py index d88f775cd..9e5815e0c 100644 --- a/src/imitation/util/util.py +++ b/src/imitation/util/util.py @@ -362,10 +362,10 @@ def get_first_iter_element(iterable: Iterable[T]) -> Tuple[T, Iterable[T]]: def compute_state_entropy( - obs: np.ndarray, - all_obs: np.ndarray, + obs: th.Tensor, + all_obs: th.Tensor, k: int, -) -> np.ndarray: +) -> th.Tensor: """Compute the state entropy given by KNN distance. 
Args: @@ -379,19 +379,15 @@ def compute_state_entropy( assert obs.shape[1:] == all_obs.shape[1:] with th.no_grad(): non_batch_dimensions = tuple(range(2, len(obs.shape) + 1)) - distances_tensor = np.linalg.norm( + distances_tensor = th.linalg.vector_norm( obs[:, None] - all_obs[None, :], - axis=non_batch_dimensions, + dim=non_batch_dimensions, ord=2, ) # Note that we take the k+1'th value because the closest neighbor to # a point is itself, which we want to skip. - knn_dists = kth_value(distances_tensor, k+1) + assert distances_tensor.shape[-1] > k + knn_dists = th.kthvalue(distances_tensor, k=k + 1, dim=1).values state_entropy = knn_dists - return np.expand_dims(state_entropy, axis=1) - - -def kth_value(x: np.ndarray, k: int): - assert k > 0 - return np.partition(x, k - 1, axis=-1)[..., k - 1] + return state_entropy.unsqueeze(1) diff --git a/tests/util/test_util.py b/tests/util/test_util.py index be2487aee..745529d2d 100644 --- a/tests/util/test_util.py +++ b/tests/util/test_util.py @@ -11,7 +11,6 @@ from imitation.util import sacred as sacred_util from imitation.util import util -from imitation.util.util import kth_value def test_endless_iter(): @@ -146,13 +145,3 @@ def test_compute_state_entropy_2d(): np.sqrt(20**2 + 2**2), ) - -def test_kth_value(): - arr1 = np.arange(0, 10, 1) - np.random.shuffle(arr1) - arr2 = np.arange(0, 100, 10) - np.random.shuffle(arr2) - arr = np.stack([arr1, arr2]) - - result = kth_value(arr, 3) - np.testing.assert_array_equal(result, np.array([2, 20])) From acb51be5c3e0922cc992e1f7f98002f95084c814 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 14:41:06 +0100 Subject: [PATCH 21/55] #625 extract _preference_feedback_schedule() --- .../algorithms/preference_comparisons.py | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/imitation/algorithms/preference_comparisons.py b/src/imitation/algorithms/preference_comparisons.py index 413cd979a..2b4a6d972 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -1670,16 +1670,9 @@ def train( A dictionary with final metrics such as loss and accuracy of the reward model. """ - initial_comparisons = int(total_comparisons * self.initial_comparison_frac) - total_comparisons -= initial_comparisons - # Compute the number of comparisons to request at each iteration in advance. 
- vec_schedule = np.vectorize(self.query_schedule) - unnormalized_probs = vec_schedule(np.linspace(0, 1, self.num_iterations)) - probs = unnormalized_probs / np.sum(unnormalized_probs) - shares = util.oric(probs * total_comparisons) - schedule = [initial_comparisons] + shares.tolist() - print(f"Query schedule: {schedule}") + preference_query_schedule = self._preference_gather_schedule(total_comparisons) + print(f"Query schedule: {preference_query_schedule}") timesteps_per_iteration, extra_timesteps = divmod( total_timesteps, @@ -1688,7 +1681,7 @@ def train( reward_loss = None reward_accuracy = None - for i, num_pairs in enumerate(schedule): + for i, num_pairs in enumerate(preference_query_schedule): ########################## # Gather new preferences # ########################## @@ -1751,3 +1744,13 @@ def train( self._iteration += 1 return {"reward_loss": reward_loss, "reward_accuracy": reward_accuracy} + + def _preference_gather_schedule(self, total_comparisons): + initial_comparisons = int(total_comparisons * self.initial_comparison_frac) + total_comparisons -= initial_comparisons + vec_schedule = np.vectorize(self.query_schedule) + unnormalized_probs = vec_schedule(np.linspace(0, 1, self.num_iterations)) + probs = unnormalized_probs / np.sum(unnormalized_probs) + shares = util.oric(probs * total_comparisons) + schedule = [initial_comparisons] + shares.tolist() + return schedule From 8143ba394e4909c6e5674767bf4594f4d061cb8c Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 15:18:28 +0100 Subject: [PATCH 22/55] #625 introduce parameter for pretraining steps --- .../algorithms/preference_comparisons.py | 24 +++++++++++++++---- .../train_preference_comparisons_pebble.py | 3 +++ 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/src/imitation/algorithms/preference_comparisons.py b/src/imitation/algorithms/preference_comparisons.py index 2b4a6d972..ad2b8b6dc 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -1495,6 +1495,7 @@ def __init__( transition_oversampling: float = 1, initial_comparison_frac: float = 0.1, initial_epoch_multiplier: float = 200.0, + initial_agent_pretrain_frac: float = 0.01, custom_logger: Optional[imit_logger.HierarchicalLogger] = None, allow_variable_horizon: bool = False, rng: Optional[np.random.Generator] = None, @@ -1544,6 +1545,9 @@ def __init__( initial_epoch_multiplier: before agent training begins, train the reward model for this many more epochs than usual (on fragments sampled from a random agent). + initial_agent_pretrain_frac: fraction of total_timesteps for which the + agent will be trained without preference gathering (and reward model + training) custom_logger: Where to log to; if None (default), creates a new logger. 
allow_variable_horizon: If False (default), algorithm will raise an exception if it detects trajectories of different length during @@ -1642,6 +1646,7 @@ def __init__( self.fragment_length = fragment_length self.initial_comparison_frac = initial_comparison_frac self.initial_epoch_multiplier = initial_epoch_multiplier + self.initial_agent_pretrain_frac = initial_agent_pretrain_frac self.num_iterations = num_iterations self.transition_oversampling = transition_oversampling if callable(query_schedule): @@ -1674,10 +1679,11 @@ def train( preference_query_schedule = self._preference_gather_schedule(total_comparisons) print(f"Query schedule: {preference_query_schedule}") - timesteps_per_iteration, extra_timesteps = divmod( - total_timesteps, - self.num_iterations, - ) + ( + agent_pretrain_timesteps, + timesteps_per_iteration, + extra_timesteps, + ) = self._compute_timesteps(total_timesteps) reward_loss = None reward_accuracy = None @@ -1754,3 +1760,13 @@ def _preference_gather_schedule(self, total_comparisons): shares = util.oric(probs * total_comparisons) schedule = [initial_comparisons] + shares.tolist() return schedule + + def _compute_timesteps(self, total_timesteps: int) -> Tuple[int, int, int]: + agent_pretrain_timesteps = int( + total_timesteps * self.initial_agent_pretrain_frac + ) + timesteps_per_iteration, extra_timesteps = divmod( + total_timesteps - agent_pretrain_timesteps, + self.num_iterations, + ) + return agent_pretrain_timesteps, timesteps_per_iteration, extra_timesteps diff --git a/src/imitation/scripts/config/train_preference_comparisons_pebble.py b/src/imitation/scripts/config/train_preference_comparisons_pebble.py index e65f38e37..a497542e7 100644 --- a/src/imitation/scripts/config/train_preference_comparisons_pebble.py +++ b/src/imitation/scripts/config/train_preference_comparisons_pebble.py @@ -68,6 +68,8 @@ def train_defaults(): initial_comparison_frac = 0.1 # fraction of sampled trajectories that will include some random actions exploration_frac = 0.0 + # fraction of total_timesteps for training before preference gathering + initial_agent_pretrain_frac = 0.05 preference_model_kwargs = {} reward_trainer_kwargs = { "epochs": 3, @@ -153,6 +155,7 @@ def fast(): total_timesteps = 50 total_comparisons = 5 initial_comparison_frac = 0.2 + initial_agent_pretrain_frac = 0.2 num_iterations = 1 fragment_length = 2 reward_trainer_kwargs = { From 184e191c4194e2f1c3105609e76cf6d43ef840b0 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 15:57:29 +0100 Subject: [PATCH 23/55] #625 add initialized callback to ReplayBufferRewardWrapper --- src/imitation/policies/replay_buffer_wrapper.py | 8 +++++++- tests/policies/test_replay_buffer_wrapper.py | 15 +++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/imitation/policies/replay_buffer_wrapper.py b/src/imitation/policies/replay_buffer_wrapper.py index a7d548165..a6f13b832 100644 --- a/src/imitation/policies/replay_buffer_wrapper.py +++ b/src/imitation/policies/replay_buffer_wrapper.py @@ -1,5 +1,6 @@ """Wrapper for reward labeling for transitions sampled from a replay buffer.""" +from typing import Callable from typing import Mapping, Type import numpy as np @@ -10,7 +11,6 @@ from imitation.rewards.reward_function import RewardFn from imitation.util import util from imitation.util.networks import RunningNorm -from typing import Callable def _samples_to_reward_fn_input( @@ -59,6 +59,7 @@ def __init__( *, replay_buffer_class: Type[ReplayBuffer], reward_fn: RewardFn, + on_initialized_callback: 
Callable[["ReplayBufferRewardWrapper"], None] = None, **kwargs, ): """Builds ReplayBufferRewardWrapper. @@ -69,6 +70,9 @@ def __init__( action_space: Action space replay_buffer_class: Class of the replay buffer. reward_fn: Reward function for reward relabeling. + on_initialized_callback: Callback called with reference to this object after + this instance is fully initialized. This provides a hook to access the + buffer after it is created from inside a Stable Baselines algorithm. **kwargs: keyword arguments for ReplayBuffer. """ # Note(yawen-d): we directly inherit ReplayBuffer and leave out the case of @@ -86,6 +90,8 @@ def __init__( self.reward_fn = reward_fn _base_kwargs = {k: v for k, v in kwargs.items() if k in ["device", "n_envs"]} super().__init__(buffer_size, observation_space, action_space, **_base_kwargs) + if on_initialized_callback is not None: + on_initialized_callback(self) @property def pos(self) -> int: diff --git a/tests/policies/test_replay_buffer_wrapper.py b/tests/policies/test_replay_buffer_wrapper.py index 668208b58..38597dbc0 100644 --- a/tests/policies/test_replay_buffer_wrapper.py +++ b/tests/policies/test_replay_buffer_wrapper.py @@ -264,3 +264,18 @@ def test_replay_buffer_view_provides_buffered_observations(): # ReplayBuffer internally uses a circular buffer expected = np.roll(observations, 1, axis=0) np.testing.assert_allclose(view.observations, expected) + + +def test_replay_buffer_reward_wrapper_calls_initialization_callback_with_itself(): + callback = Mock() + buffer = ReplayBufferRewardWrapper( + 10, + spaces.Discrete(2), + spaces.Discrete(2), + replay_buffer_class=ReplayBuffer, + reward_fn=Mock(), + n_envs=2, + handle_timeout_termination=False, + on_initialized_callback=callback, + ) + assert callback.call_args.args[0] is buffer From 52d914ab865519995e8bef550b2479c4817a43e9 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 16:26:16 +0100 Subject: [PATCH 24/55] #625 fix entropy_reward.py --- src/imitation/algorithms/pebble/entropy_reward.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/imitation/algorithms/pebble/entropy_reward.py b/src/imitation/algorithms/pebble/entropy_reward.py index a1fff0e46..01c2f9a9f 100644 --- a/src/imitation/algorithms/pebble/entropy_reward.py +++ b/src/imitation/algorithms/pebble/entropy_reward.py @@ -35,13 +35,14 @@ def __call__( all_observations = self.replay_buffer_view.observations # ReplayBuffer sampling flattens the venv dimension, let's adapt to that - all_observations = all_observations.reshape((-1, *self.obs_shape)) + all_observations = all_observations.reshape((-1, *state.shape[1:])) # TODO #625: fix self.obs_shape + # TODO #625: deal with the conversion back and forth between np and torch entropies = util.compute_state_entropy( - state, - all_observations, + th.tensor(state), + th.tensor(all_observations), self.nearest_neighbor_k, ) - normalized_entropies = self.entropy_stats.forward(th.as_tensor(entropies)) + normalized_entropies = self.entropy_stats.forward(entropies) return normalized_entropies.numpy() def __getstate__(self): From 1f01a7a0b4228bab782830148d130ff2c947a9d4 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 16:28:59 +0100 Subject: [PATCH 25/55] #625 remove ReplayBufferEntropyRewardWrapper --- .../algorithms/pebble/entropy_reward.py | 4 +- .../policies/replay_buffer_wrapper.py | 84 +--------------- src/imitation/scripts/common/rl.py | 9 +- tests/policies/test_replay_buffer_wrapper.py | 95 +------------------ tests/util/test_util.py | 1 - 
5 files changed, 11 insertions(+), 182 deletions(-) diff --git a/src/imitation/algorithms/pebble/entropy_reward.py b/src/imitation/algorithms/pebble/entropy_reward.py index 01c2f9a9f..812d1aa56 100644 --- a/src/imitation/algorithms/pebble/entropy_reward.py +++ b/src/imitation/algorithms/pebble/entropy_reward.py @@ -35,7 +35,9 @@ def __call__( all_observations = self.replay_buffer_view.observations # ReplayBuffer sampling flattens the venv dimension, let's adapt to that - all_observations = all_observations.reshape((-1, *state.shape[1:])) # TODO #625: fix self.obs_shape + all_observations = all_observations.reshape( + (-1, *state.shape[1:]) # TODO #625: fix self.obs_shape + ) # TODO #625: deal with the conversion back and forth between np and torch entropies = util.compute_state_entropy( th.tensor(state), diff --git a/src/imitation/policies/replay_buffer_wrapper.py b/src/imitation/policies/replay_buffer_wrapper.py index a6f13b832..897957296 100644 --- a/src/imitation/policies/replay_buffer_wrapper.py +++ b/src/imitation/policies/replay_buffer_wrapper.py @@ -1,7 +1,6 @@ """Wrapper for reward labeling for transitions sampled from a replay buffer.""" -from typing import Callable -from typing import Mapping, Type +from typing import Callable, Mapping, Type import numpy as np from gym import spaces @@ -10,7 +9,6 @@ from imitation.rewards.reward_function import RewardFn from imitation.util import util -from imitation.util.networks import RunningNorm def _samples_to_reward_fn_input( @@ -139,83 +137,3 @@ def _get_samples(self): "_get_samples() is intentionally not implemented." "This method should not be called.", ) - - -class ReplayBufferEntropyRewardWrapper(ReplayBufferRewardWrapper): - """Relabel the rewards from a ReplayBuffer, initially using entropy as reward.""" - - def __init__( - self, - buffer_size: int, - observation_space: spaces.Space, - action_space: spaces.Space, - *, - replay_buffer_class: Type[ReplayBuffer], - reward_fn: RewardFn, - entropy_as_reward_samples: int, - k: int = 5, - **kwargs, - ): - """Builds ReplayBufferRewardWrapper. - - Args: - buffer_size: Max number of elements in the buffer - observation_space: Observation space - action_space: Action space - replay_buffer_class: Class of the replay buffer. - reward_fn: Reward function for reward relabeling. - entropy_as_reward_samples: Number of samples to use entropy as the reward, - before switching to using the reward_fn for relabeling. - k: Use the k'th nearest neighbor's distance when computing state entropy. - **kwargs: keyword arguments for ReplayBuffer. - """ - # TODO should we limit by number of batches (as this does) - # or number of observations returned? - super().__init__( - buffer_size, - observation_space, - action_space, - replay_buffer_class=replay_buffer_class, - reward_fn=reward_fn, - **kwargs, - ) - self.sample_count = 0 - self.k = k - # TODO support n_envs > 1 - self.entropy_stats = RunningNorm(1) - self.entropy_as_reward_samples = entropy_as_reward_samples - - def sample(self, *args, **kwargs): - self.sample_count += 1 - samples = super().sample(*args, **kwargs) - # For some reason self.entropy_as_reward_samples seems to get cleared, - # and I have no idea why. - if self.sample_count > self.entropy_as_reward_samples: - return samples - # TODO we really ought to reset the reward network once we are done w/ - # the entropy based pre-training. We also have no reason to train - # or even use the reward network before then. 
- - if self.full: - all_obs = self.observations - else: - all_obs = self.observations[: self.pos] - # super().sample() flattens the venv dimension, let's do it too - all_obs = all_obs.reshape((-1, *self.obs_shape)) - entropies = util.compute_state_entropy( - samples.observations, - all_obs, - self.k, - ) - - # Normalize to have mean of 0 and standard deviation of 1 according to running stats - entropies = self.entropy_stats.forward(entropies) - assert entropies.shape == samples.rewards.shape - - return ReplayBufferSamples( - observations=samples.observations, - actions=samples.actions, - next_observations=samples.next_observations, - dones=samples.dones, - rewards=entropies, - ) diff --git a/src/imitation/scripts/common/rl.py b/src/imitation/scripts/common/rl.py index 2bd3759a2..e879bbaf8 100644 --- a/src/imitation/scripts/common/rl.py +++ b/src/imitation/scripts/common/rl.py @@ -86,10 +86,11 @@ def _maybe_add_relabel_buffer( """Use ReplayBufferRewardWrapper in rl_kwargs if relabel_reward_fn is not None.""" rl_kwargs = dict(rl_kwargs) if relabel_reward_fn: - _buffer_kwargs = dict(reward_fn=relabel_reward_fn) - _buffer_kwargs["replay_buffer_class"] = rl_kwargs.get( - "replay_buffer_class", - buffers.ReplayBuffer, + _buffer_kwargs = dict( + reward_fn=relabel_reward_fn, + replay_buffer_class=rl_kwargs.get( + "replay_buffer_class", buffers.ReplayBuffer + ), ) rl_kwargs["replay_buffer_class"] = ReplayBufferRewardWrapper diff --git a/tests/policies/test_replay_buffer_wrapper.py b/tests/policies/test_replay_buffer_wrapper.py index 38597dbc0..248018a75 100644 --- a/tests/policies/test_replay_buffer_wrapper.py +++ b/tests/policies/test_replay_buffer_wrapper.py @@ -13,14 +13,10 @@ from stable_baselines3.common import buffers, off_policy_algorithm, policies from stable_baselines3.common.buffers import ReplayBuffer from stable_baselines3.common.policies import BasePolicy -from stable_baselines3.common.preprocessing import get_obs_shape, get_action_dim +from stable_baselines3.common.preprocessing import get_action_dim, get_obs_shape from stable_baselines3.common.save_util import load_from_pkl -from stable_baselines3.common.vec_env import DummyVecEnv -from imitation.policies.replay_buffer_wrapper import ( - ReplayBufferEntropyRewardWrapper, - ReplayBufferRewardWrapper, -) +from imitation.policies.replay_buffer_wrapper import ReplayBufferRewardWrapper from imitation.util import util @@ -123,54 +119,6 @@ def test_wrapper_class(tmpdir, rng): replay_buffer_wrapper._get_samples() -# Combine this with the above test via parameterization over the buffer class -def test_entropy_wrapper_class_no_op(tmpdir, rng): - buffer_size = 15 - total_timesteps = 20 - entropy_samples = 0 - - venv = util.make_vec_env("Pendulum-v1", n_envs=1, rng=rng) - rl_algo = sb3.SAC( - policy=sb3.sac.policies.SACPolicy, - policy_kwargs=dict(), - env=venv, - seed=42, - replay_buffer_class=ReplayBufferEntropyRewardWrapper, - replay_buffer_kwargs=dict( - replay_buffer_class=buffers.ReplayBuffer, - reward_fn=zero_reward_fn, - entropy_as_reward_samples=entropy_samples, - ), - buffer_size=buffer_size, - ) - - rl_algo.learn(total_timesteps=total_timesteps) - - buffer_path = osp.join(tmpdir, "buffer.pkl") - rl_algo.save_replay_buffer(buffer_path) - replay_buffer_wrapper = load_from_pkl(buffer_path) - replay_buffer = replay_buffer_wrapper.replay_buffer - - # replay_buffer_wrapper.sample(...) 
should return zero-reward transitions - assert buffer_size == replay_buffer_wrapper.size() == replay_buffer.size() - assert (replay_buffer_wrapper.sample(total_timesteps).rewards == 0.0).all() - assert (replay_buffer.sample(total_timesteps).rewards != 0.0).all() # seed=42 - - # replay_buffer_wrapper.pos, replay_buffer_wrapper.full - assert replay_buffer_wrapper.pos == total_timesteps - buffer_size - assert replay_buffer_wrapper.full - - # reset() - replay_buffer_wrapper.reset() - assert 0 == replay_buffer_wrapper.size() == replay_buffer.size() - assert replay_buffer_wrapper.pos == 0 - assert not replay_buffer_wrapper.full - - # to_torch() - tensor = replay_buffer_wrapper.to_torch(np.ones(42)) - assert type(tensor) is th.Tensor - - class ActionIsObsEnv(gym.Env): """Simple environment where the obs is the action.""" @@ -191,45 +139,6 @@ def reset(self): return np.array([0]) -def test_entropy_wrapper_class(tmpdir, rng): - buffer_size = 20 - entropy_samples = 500 - k = 4 - - venv = DummyVecEnv([ActionIsObsEnv]) - rl_algo = sb3.SAC( - policy=sb3.sac.policies.SACPolicy, - policy_kwargs=dict(), - env=venv, - seed=42, - replay_buffer_class=ReplayBufferEntropyRewardWrapper, - replay_buffer_kwargs=dict( - replay_buffer_class=buffers.ReplayBuffer, - reward_fn=zero_reward_fn, - entropy_as_reward_samples=entropy_samples, - k=k, - ), - buffer_size=buffer_size, - ) - - rl_algo.learn(total_timesteps=buffer_size) - initial_entropy = util.compute_state_entropy( - th.Tensor(rl_algo.replay_buffer.replay_buffer.observations), - th.Tensor(rl_algo.replay_buffer.replay_buffer.observations), - k=k, - ) - - rl_algo.learn(total_timesteps=entropy_samples - buffer_size) - # Expect that the entropy of our replay buffer is now higher, - # since we trained with that as the reward. 
- trained_entropy = util.compute_state_entropy( - th.Tensor(rl_algo.replay_buffer.replay_buffer.observations), - th.Tensor(rl_algo.replay_buffer.replay_buffer.observations), - k=k, - ) - assert trained_entropy.mean() > initial_entropy.mean() - - def test_replay_buffer_view_provides_buffered_observations(): space = spaces.Box(np.array([0]), np.array([5])) n_envs = 2 diff --git a/tests/util/test_util.py b/tests/util/test_util.py index 745529d2d..28678dc8b 100644 --- a/tests/util/test_util.py +++ b/tests/util/test_util.py @@ -144,4 +144,3 @@ def test_compute_state_entropy_2d(): util.compute_state_entropy(obs, all_obs, k=3), np.sqrt(20**2 + 2**2), ) - From 1fbc590f999c62b8435a73e1c20588ddc27cf6ca Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 16:50:15 +0100 Subject: [PATCH 26/55] #625 introduce ReplayBufferAwareRewardFn --- .../algorithms/pebble/entropy_reward.py | 17 +++++++++++++---- src/imitation/policies/replay_buffer_wrapper.py | 13 ++++++------- src/imitation/rewards/reward_function.py | 6 ++++++ tests/algorithms/pebble/test_entropy_reward.py | 8 +++++--- tests/policies/test_replay_buffer_wrapper.py | 10 +++++----- 5 files changed, 35 insertions(+), 19 deletions(-) diff --git a/src/imitation/algorithms/pebble/entropy_reward.py b/src/imitation/algorithms/pebble/entropy_reward.py index 812d1aa56..f26af2479 100644 --- a/src/imitation/algorithms/pebble/entropy_reward.py +++ b/src/imitation/algorithms/pebble/entropy_reward.py @@ -1,15 +1,20 @@ +from typing import Tuple + import numpy as np import torch as th from gym.vector.utils import spaces from stable_baselines3.common.preprocessing import get_obs_shape -from imitation.policies.replay_buffer_wrapper import ReplayBufferView -from imitation.rewards.reward_function import RewardFn +from imitation.policies.replay_buffer_wrapper import ( + ReplayBufferView, + ReplayBufferRewardWrapper, +) +from imitation.rewards.reward_function import ReplayBufferAwareRewardFn from imitation.util import util from imitation.util.networks import RunningNorm -class StateEntropyReward(RewardFn): +class StateEntropyReward(ReplayBufferAwareRewardFn): def __init__(self, nearest_neighbor_k: int, observation_space: spaces.Space): self.nearest_neighbor_k = nearest_neighbor_k # TODO support n_envs > 1 @@ -20,8 +25,12 @@ def __init__(self, nearest_neighbor_k: int, observation_space: spaces.Space): np.empty(0, dtype=observation_space.dtype), lambda: slice(0) ) - def set_replay_buffer(self, replay_buffer: ReplayBufferView): + def on_replay_buffer_initialized(self, replay_buffer: ReplayBufferRewardWrapper): + self.set_replay_buffer(replay_buffer.buffer_view, replay_buffer.obs_shape) + + def set_replay_buffer(self, replay_buffer: ReplayBufferView, obs_shape:Tuple): self.replay_buffer_view = replay_buffer + self.obs_shape = obs_shape def __call__( self, diff --git a/src/imitation/policies/replay_buffer_wrapper.py b/src/imitation/policies/replay_buffer_wrapper.py index 897957296..297a6b008 100644 --- a/src/imitation/policies/replay_buffer_wrapper.py +++ b/src/imitation/policies/replay_buffer_wrapper.py @@ -7,7 +7,7 @@ from stable_baselines3.common.buffers import ReplayBuffer from stable_baselines3.common.type_aliases import ReplayBufferSamples -from imitation.rewards.reward_function import RewardFn +from imitation.rewards.reward_function import RewardFn, ReplayBufferAwareRewardFn from imitation.util import util @@ -37,13 +37,13 @@ def __init__( observations_buffer: np.ndarray, buffer_slice_provider: Callable[[], slice], ): - self._observations_buffer = 
observations_buffer.view() - self._observations_buffer.flags.writeable = False + self._observations_buffer_view = observations_buffer.view() + self._observations_buffer_view.flags.writeable = False self._buffer_slice_provider = buffer_slice_provider @property def observations(self): - return self._observations_buffer[self._buffer_slice_provider()] + return self._observations_buffer_view[self._buffer_slice_provider()] class ReplayBufferRewardWrapper(ReplayBuffer): @@ -57,7 +57,6 @@ def __init__( *, replay_buffer_class: Type[ReplayBuffer], reward_fn: RewardFn, - on_initialized_callback: Callable[["ReplayBufferRewardWrapper"], None] = None, **kwargs, ): """Builds ReplayBufferRewardWrapper. @@ -88,8 +87,8 @@ def __init__( self.reward_fn = reward_fn _base_kwargs = {k: v for k, v in kwargs.items() if k in ["device", "n_envs"]} super().__init__(buffer_size, observation_space, action_space, **_base_kwargs) - if on_initialized_callback is not None: - on_initialized_callback(self) + if isinstance(reward_fn, ReplayBufferAwareRewardFn): + reward_fn.on_replay_buffer_initialized(self) @property def pos(self) -> int: diff --git a/src/imitation/rewards/reward_function.py b/src/imitation/rewards/reward_function.py index 93761752d..e9d7bed30 100644 --- a/src/imitation/rewards/reward_function.py +++ b/src/imitation/rewards/reward_function.py @@ -32,3 +32,9 @@ def __call__( Returns: Computed rewards of shape `(batch_size,`). """ # noqa: DAR202 + + +class ReplayBufferAwareRewardFn(RewardFn, abc.ABC): + @abc.abstractmethod + def on_replay_buffer_initialized(self, replay_buffer: "ReplayBufferRewardWrapper"): + pass diff --git a/tests/algorithms/pebble/test_entropy_reward.py b/tests/algorithms/pebble/test_entropy_reward.py index 5571c304f..16314a1e1 100644 --- a/tests/algorithms/pebble/test_entropy_reward.py +++ b/tests/algorithms/pebble/test_entropy_reward.py @@ -23,8 +23,9 @@ def test_state_entropy_reward_returns_entropy(rng): obs_shape = get_obs_shape(SPACE) all_observations = rng.random((BUFFER_SIZE, VENVS, *obs_shape)) + reward_fn = StateEntropyReward(K, SPACE) - reward_fn.set_buffer_view(ReplayBufferView(all_observations, lambda: slice(None))) + reward_fn.set_replay_buffer(ReplayBufferView(all_observations, lambda: slice(None)), obs_shape) # Act observations = rng.random((BATCH_SIZE, *obs_shape)) @@ -48,7 +49,8 @@ def test_state_entropy_reward_returns_normalized_values(): reward_fn = StateEntropyReward(K, SPACE) all_observations = np.empty((BUFFER_SIZE, VENVS, *get_obs_shape(SPACE))) reward_fn.set_replay_buffer( - ReplayBufferView(all_observations, lambda: slice(None)) + ReplayBufferView(all_observations, lambda: slice(None)), + get_obs_shape(SPACE) ) dim = 8 @@ -79,7 +81,7 @@ def test_state_entropy_reward_can_pickle(): obs1 = np.random.rand(VENVS, *get_obs_shape(SPACE)) reward_fn = StateEntropyReward(K, SPACE) - reward_fn.set_replay_buffer(replay_buffer) + reward_fn.set_replay_buffer(replay_buffer, get_obs_shape(SPACE)) reward_fn(obs1, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) # Act diff --git a/tests/policies/test_replay_buffer_wrapper.py b/tests/policies/test_replay_buffer_wrapper.py index 248018a75..02bb72ce2 100644 --- a/tests/policies/test_replay_buffer_wrapper.py +++ b/tests/policies/test_replay_buffer_wrapper.py @@ -17,6 +17,7 @@ from stable_baselines3.common.save_util import load_from_pkl from imitation.policies.replay_buffer_wrapper import ReplayBufferRewardWrapper +from imitation.rewards.reward_function import ReplayBufferAwareRewardFn from imitation.util import util @@ -175,16 +176,15 @@ def 
test_replay_buffer_view_provides_buffered_observations(): np.testing.assert_allclose(view.observations, expected) -def test_replay_buffer_reward_wrapper_calls_initialization_callback_with_itself(): - callback = Mock() +def test_replay_buffer_reward_wrapper_calls_reward_initialization_callback(): + reward_fn = Mock(spec=ReplayBufferAwareRewardFn) buffer = ReplayBufferRewardWrapper( 10, spaces.Discrete(2), spaces.Discrete(2), replay_buffer_class=ReplayBuffer, - reward_fn=Mock(), + reward_fn=reward_fn, n_envs=2, handle_timeout_termination=False, - on_initialized_callback=callback, ) - assert callback.call_args.args[0] is buffer + assert reward_fn.on_replay_buffer_initialized.call_args.args[0] is buffer From e19dd85e9cecada420a5e55664ceb8df23908b1e Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 16:59:45 +0100 Subject: [PATCH 27/55] #625 rename PebbleStateEntropyReward --- src/imitation/algorithms/pebble/entropy_reward.py | 3 ++- tests/algorithms/pebble/test_entropy_reward.py | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/imitation/algorithms/pebble/entropy_reward.py b/src/imitation/algorithms/pebble/entropy_reward.py index f26af2479..ab5d424b8 100644 --- a/src/imitation/algorithms/pebble/entropy_reward.py +++ b/src/imitation/algorithms/pebble/entropy_reward.py @@ -14,7 +14,8 @@ from imitation.util.networks import RunningNorm -class StateEntropyReward(ReplayBufferAwareRewardFn): +class PebbleStateEntropyReward(ReplayBufferAwareRewardFn): + # TODO #625: get rid of the observation_space parameter def __init__(self, nearest_neighbor_k: int, observation_space: spaces.Space): self.nearest_neighbor_k = nearest_neighbor_k # TODO support n_envs > 1 diff --git a/tests/algorithms/pebble/test_entropy_reward.py b/tests/algorithms/pebble/test_entropy_reward.py index 16314a1e1..9ba4dd9cd 100644 --- a/tests/algorithms/pebble/test_entropy_reward.py +++ b/tests/algorithms/pebble/test_entropy_reward.py @@ -6,7 +6,7 @@ from gym.spaces import Discrete from stable_baselines3.common.preprocessing import get_obs_shape -from imitation.algorithms.pebble.entropy_reward import StateEntropyReward +from imitation.algorithms.pebble.entropy_reward import PebbleStateEntropyReward from imitation.policies.replay_buffer_wrapper import ReplayBufferView from imitation.util import util @@ -24,7 +24,7 @@ def test_state_entropy_reward_returns_entropy(rng): all_observations = rng.random((BUFFER_SIZE, VENVS, *obs_shape)) - reward_fn = StateEntropyReward(K, SPACE) + reward_fn = PebbleStateEntropyReward(K, SPACE) reward_fn.set_replay_buffer(ReplayBufferView(all_observations, lambda: slice(None)), obs_shape) # Act @@ -46,7 +46,7 @@ def test_state_entropy_reward_returns_normalized_values(): # mock entropy computation so that we can test only stats collection in this test m.side_effect = lambda obs, all_obs, k: obs - reward_fn = StateEntropyReward(K, SPACE) + reward_fn = PebbleStateEntropyReward(K, SPACE) all_observations = np.empty((BUFFER_SIZE, VENVS, *get_obs_shape(SPACE))) reward_fn.set_replay_buffer( ReplayBufferView(all_observations, lambda: slice(None)), @@ -80,7 +80,7 @@ def test_state_entropy_reward_can_pickle(): replay_buffer = ReplayBufferView(all_observations, lambda: slice(None)) obs1 = np.random.rand(VENVS, *get_obs_shape(SPACE)) - reward_fn = StateEntropyReward(K, SPACE) + reward_fn = PebbleStateEntropyReward(K, SPACE) reward_fn.set_replay_buffer(replay_buffer, get_obs_shape(SPACE)) reward_fn(obs1, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) From 
da77f5c0373b42b8a9e46f58621bbf6183f134e2 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 17:14:39 +0100 Subject: [PATCH 28/55] #625 PebbleStateEntropyReward can switch from unsupervised pretraining --- .../algorithms/pebble/entropy_reward.py | 26 ++++++++-- .../algorithms/pebble/test_entropy_reward.py | 51 +++++++++++++++---- 2 files changed, 63 insertions(+), 14 deletions(-) diff --git a/src/imitation/algorithms/pebble/entropy_reward.py b/src/imitation/algorithms/pebble/entropy_reward.py index ab5d424b8..81d43daa8 100644 --- a/src/imitation/algorithms/pebble/entropy_reward.py +++ b/src/imitation/algorithms/pebble/entropy_reward.py @@ -9,14 +9,21 @@ ReplayBufferView, ReplayBufferRewardWrapper, ) -from imitation.rewards.reward_function import ReplayBufferAwareRewardFn +from imitation.rewards.reward_function import ReplayBufferAwareRewardFn, RewardFn from imitation.util import util from imitation.util.networks import RunningNorm class PebbleStateEntropyReward(ReplayBufferAwareRewardFn): # TODO #625: get rid of the observation_space parameter - def __init__(self, nearest_neighbor_k: int, observation_space: spaces.Space): + # TODO #625: parametrize nearest_neighbor_k + def __init__( + self, + trained_reward_fn: RewardFn, + observation_space: spaces.Space, + nearest_neighbor_k: int = 5, + ): + self.trained_reward_fn = trained_reward_fn self.nearest_neighbor_k = nearest_neighbor_k # TODO support n_envs > 1 self.entropy_stats = RunningNorm(1) @@ -25,14 +32,20 @@ def __init__(self, nearest_neighbor_k: int, observation_space: spaces.Space): self.replay_buffer_view = ReplayBufferView( np.empty(0, dtype=observation_space.dtype), lambda: slice(0) ) + # This indicates that the training is in the "Unsupervised exploration" + # phase of the Pebble algorithm, where entropy is used as reward + self.unsupervised_exploration_active = True def on_replay_buffer_initialized(self, replay_buffer: ReplayBufferRewardWrapper): self.set_replay_buffer(replay_buffer.buffer_view, replay_buffer.obs_shape) - def set_replay_buffer(self, replay_buffer: ReplayBufferView, obs_shape:Tuple): + def set_replay_buffer(self, replay_buffer: ReplayBufferView, obs_shape: Tuple): self.replay_buffer_view = replay_buffer self.obs_shape = obs_shape + def on_unsupervised_exploration_finished(self): + self.unsupervised_exploration_active = False + def __call__( self, state: np.ndarray, @@ -40,9 +53,14 @@ def __call__( next_state: np.ndarray, done: np.ndarray, ) -> np.ndarray: + if self.unsupervised_exploration_active: + return self._entropy_reward(state) + else: + return self.trained_reward_fn(state, action, next_state, done) + + def _entropy_reward(self, state): # TODO: should this work with torch instead of numpy internally? 
# (The RewardFn protocol requires numpy) - all_observations = self.replay_buffer_view.observations # ReplayBuffer sampling flattens the venv dimension, let's adapt to that all_observations = all_observations.reshape( diff --git a/tests/algorithms/pebble/test_entropy_reward.py b/tests/algorithms/pebble/test_entropy_reward.py index 9ba4dd9cd..42496a79c 100644 --- a/tests/algorithms/pebble/test_entropy_reward.py +++ b/tests/algorithms/pebble/test_entropy_reward.py @@ -1,5 +1,5 @@ import pickle -from unittest.mock import patch +from unittest.mock import patch, Mock import numpy as np import torch as th @@ -19,13 +19,14 @@ VENVS = 2 -def test_state_entropy_reward_returns_entropy(rng): +def test_pebble_entropy_reward_returns_entropy(rng): obs_shape = get_obs_shape(SPACE) all_observations = rng.random((BUFFER_SIZE, VENVS, *obs_shape)) - - reward_fn = PebbleStateEntropyReward(K, SPACE) - reward_fn.set_replay_buffer(ReplayBufferView(all_observations, lambda: slice(None)), obs_shape) + reward_fn = PebbleStateEntropyReward(Mock(), SPACE, K) + reward_fn.set_replay_buffer( + ReplayBufferView(all_observations, lambda: slice(None)), obs_shape + ) # Act observations = rng.random((BATCH_SIZE, *obs_shape)) @@ -41,16 +42,16 @@ def test_state_entropy_reward_returns_entropy(rng): np.testing.assert_allclose(reward, expected_normalized) -def test_state_entropy_reward_returns_normalized_values(): +def test_pebble_entropy_reward_returns_normalized_values(): with patch("imitation.util.util.compute_state_entropy") as m: # mock entropy computation so that we can test only stats collection in this test m.side_effect = lambda obs, all_obs, k: obs - reward_fn = PebbleStateEntropyReward(K, SPACE) + reward_fn = PebbleStateEntropyReward(Mock(), SPACE, K) all_observations = np.empty((BUFFER_SIZE, VENVS, *get_obs_shape(SPACE))) reward_fn.set_replay_buffer( ReplayBufferView(all_observations, lambda: slice(None)), - get_obs_shape(SPACE) + get_obs_shape(SPACE), ) dim = 8 @@ -75,12 +76,12 @@ def test_state_entropy_reward_returns_normalized_values(): ) -def test_state_entropy_reward_can_pickle(): +def test_pebble_entropy_reward_can_pickle(): all_observations = np.empty((BUFFER_SIZE, VENVS, *get_obs_shape(SPACE))) replay_buffer = ReplayBufferView(all_observations, lambda: slice(None)) obs1 = np.random.rand(VENVS, *get_obs_shape(SPACE)) - reward_fn = PebbleStateEntropyReward(K, SPACE) + reward_fn = PebbleStateEntropyReward(reward_fn_stub, SPACE, K) reward_fn.set_replay_buffer(replay_buffer, get_obs_shape(SPACE)) reward_fn(obs1, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) @@ -94,3 +95,33 @@ def test_state_entropy_reward_can_pickle(): expected_result = reward_fn(obs2, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) actual_result = reward_fn_deserialized(obs2, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) np.testing.assert_allclose(actual_result, expected_result) + + +def test_pebble_entropy_reward_function_switches_to_inner(): + obs_shape = get_obs_shape(SPACE) + + expected_reward = np.ones(1) + reward_fn_mock = Mock() + reward_fn_mock.return_value = expected_reward + reward_fn = PebbleStateEntropyReward(reward_fn_mock, SPACE) + + # Act + reward_fn.on_unsupervised_exploration_finished() + observations = np.ones((BATCH_SIZE, *obs_shape)) + reward = reward_fn(observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) + + # Assert + assert reward == expected_reward + reward_fn_mock.assert_called_once_with( + observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER + ) + + +def reward_fn_stub( + self, + state: np.ndarray, + action: np.ndarray, + next_state: np.ndarray, 
+ done: np.ndarray, +) -> np.ndarray: + return state From a11e7756f2a6b4d839386c1bea10187173e96340 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 19:25:22 +0100 Subject: [PATCH 29/55] #625 add optional pretraining to PreferenceComparisons --- .../algorithms/preference_comparisons.py | 24 ++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/src/imitation/algorithms/preference_comparisons.py b/src/imitation/algorithms/preference_comparisons.py index ad2b8b6dc..e29433188 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -75,6 +75,19 @@ def sample(self, steps: int) -> Sequence[TrajectoryWithRew]: be the environment rewards, not ones from a reward model). """ # noqa: DAR202 + def unsupervised_pretrain(self, steps: int, **kwargs: Any) -> None: + """Pre-train an agent if the trajectory generator uses one that + needs pre-training. + + By default, this method does nothing and doesn't need + to be overridden in subclasses that don't require pre-training. + + Args: + steps: number of environment steps to train for. + **kwargs: additional keyword arguments to pass on to + the training procedure. + """ + def train(self, steps: int, **kwargs: Any) -> None: """Train an agent if the trajectory generator uses one. @@ -1495,7 +1508,7 @@ def __init__( transition_oversampling: float = 1, initial_comparison_frac: float = 0.1, initial_epoch_multiplier: float = 200.0, - initial_agent_pretrain_frac: float = 0.01, + initial_agent_pretrain_frac: float = 0.05, custom_logger: Optional[imit_logger.HierarchicalLogger] = None, allow_variable_horizon: bool = False, rng: Optional[np.random.Generator] = None, @@ -1687,6 +1700,15 @@ def train( reward_loss = None reward_accuracy = None + ################################################### + # Pre-training agent before gathering preferences # + ################################################### + with self.logger.accumulate_means("agent"): + self.logger.log( + f"Pre-training agent for {agent_pretrain_timesteps} timesteps" + ) + self.trajectory_generator.unsupervised_pretrain(agent_pretrain_timesteps) + for i, num_pairs in enumerate(preference_query_schedule): ########################## # Gather new preferences # From 7b12162aba277da580b344966044adea7ab6a989 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 20:30:09 +0100 Subject: [PATCH 30/55] #625 PebbleStateEntropyReward supports the initial phase before replay buffer is filled --- .../algorithms/pebble/entropy_reward.py | 79 ++++++++++----- .../algorithms/pebble/test_entropy_reward.py | 97 +++++++++++-------- 2 files changed, 109 insertions(+), 67 deletions(-) diff --git a/src/imitation/algorithms/pebble/entropy_reward.py b/src/imitation/algorithms/pebble/entropy_reward.py index 81d43daa8..04322f808 100644 --- a/src/imitation/algorithms/pebble/entropy_reward.py +++ b/src/imitation/algorithms/pebble/entropy_reward.py @@ -1,9 +1,8 @@ +from enum import Enum, auto from typing import Tuple import numpy as np import torch as th -from gym.vector.utils import spaces -from stable_baselines3.common.preprocessing import get_obs_shape from imitation.policies.replay_buffer_wrapper import ( ReplayBufferView, @@ -14,27 +13,53 @@ from imitation.util.networks import RunningNorm +class PebbleRewardPhase(Enum): + """States representing different behaviors for PebbleStateEntropyReward""" + + # Collecting samples so that we have something for entropy calculation + LEARNING_START = auto() + # Entropy 
based reward + UNSUPERVISED_EXPLORATION = auto() + # Learned reward + POLICY_AND_REWARD_LEARNING = auto() + + class PebbleStateEntropyReward(ReplayBufferAwareRewardFn): - # TODO #625: get rid of the observation_space parameter + """ + Reward function for implementation of the PEBBLE learning algorithm + (https://arxiv.org/pdf/2106.05091.pdf). + + The rewards returned by this function go through the three phases + defined in PebbleRewardPhase. To transition between these phases, + unsupervised_exploration_start() and unsupervised_exploration_finish() + need to be called. + + The second phase (UNSUPERVISED_EXPLORATION) also requires that a buffer + with observations to compare against is supplied with set_replay_buffer() + or on_replay_buffer_initialized(). + + Args: + learned_reward_fn: The learned reward function used after unsupervised + exploration is finished + nearest_neighbor_k: Parameter for entropy computation (see + compute_state_entropy()) + """ + # TODO #625: parametrize nearest_neighbor_k def __init__( self, - trained_reward_fn: RewardFn, - observation_space: spaces.Space, + learned_reward_fn: RewardFn, nearest_neighbor_k: int = 5, ): - self.trained_reward_fn = trained_reward_fn + self.trained_reward_fn = learned_reward_fn self.nearest_neighbor_k = nearest_neighbor_k # TODO support n_envs > 1 self.entropy_stats = RunningNorm(1) - self.observation_space = observation_space - self.obs_shape = get_obs_shape(observation_space) - self.replay_buffer_view = ReplayBufferView( - np.empty(0, dtype=observation_space.dtype), lambda: slice(0) - ) - # This indicates that the training is in the "Unsupervised exploration" - # phase of the Pebble algorithm, where entropy is used as reward - self.unsupervised_exploration_active = True + self.state = PebbleRewardPhase.LEARNING_START + + # These two need to be set with set_replay_buffer(): + self.replay_buffer_view = None + self.obs_shape = None def on_replay_buffer_initialized(self, replay_buffer: ReplayBufferRewardWrapper): self.set_replay_buffer(replay_buffer.buffer_view, replay_buffer.obs_shape) @@ -43,8 +68,13 @@ def set_replay_buffer(self, replay_buffer: ReplayBufferView, obs_shape: Tuple): self.replay_buffer_view = replay_buffer self.obs_shape = obs_shape - def on_unsupervised_exploration_finished(self): - self.unsupervised_exploration_active = False + def unsupervised_exploration_start(self): + assert self.state == PebbleRewardPhase.LEARNING_START + self.state = PebbleRewardPhase.UNSUPERVISED_EXPLORATION + + def unsupervised_exploration_finish(self): + assert self.state == PebbleRewardPhase.UNSUPERVISED_EXPLORATION + self.state = PebbleRewardPhase.POLICY_AND_REWARD_LEARNING def __call__( self, @@ -53,19 +83,20 @@ def __call__( next_state: np.ndarray, done: np.ndarray, ) -> np.ndarray: - if self.unsupervised_exploration_active: + if self.state == PebbleRewardPhase.UNSUPERVISED_EXPLORATION: return self._entropy_reward(state) else: return self.trained_reward_fn(state, action, next_state, done) def _entropy_reward(self, state): - # TODO: should this work with torch instead of numpy internally? 
- # (The RewardFn protocol requires numpy) + if self.replay_buffer_view is None: + raise ValueError( + "Replay buffer must be supplied before entropy reward can be used" + ) + all_observations = self.replay_buffer_view.observations # ReplayBuffer sampling flattens the venv dimension, let's adapt to that - all_observations = all_observations.reshape( - (-1, *state.shape[1:]) # TODO #625: fix self.obs_shape - ) + all_observations = all_observations.reshape((-1, *self.obs_shape)) # TODO #625: deal with the conversion back and forth between np and torch entropies = util.compute_state_entropy( th.tensor(state), @@ -82,6 +113,4 @@ def __getstate__(self): def __setstate__(self, state): self.__dict__.update(state) - self.replay_buffer_view = ReplayBufferView( - np.empty(0, self.observation_space.dtype), lambda: slice(0) - ) + self.replay_buffer_view = None diff --git a/tests/algorithms/pebble/test_entropy_reward.py b/tests/algorithms/pebble/test_entropy_reward.py index 42496a79c..3abd66752 100644 --- a/tests/algorithms/pebble/test_entropy_reward.py +++ b/tests/algorithms/pebble/test_entropy_reward.py @@ -11,7 +11,8 @@ from imitation.util import util SPACE = Discrete(4) -PLACEHOLDER = np.empty(get_obs_shape(SPACE)) +OBS_SHAPE = get_obs_shape(SPACE) +PLACEHOLDER = np.empty(OBS_SHAPE) BUFFER_SIZE = 20 K = 4 @@ -19,22 +20,59 @@ VENVS = 2 -def test_pebble_entropy_reward_returns_entropy(rng): - obs_shape = get_obs_shape(SPACE) - all_observations = rng.random((BUFFER_SIZE, VENVS, *obs_shape)) +def test_pebble_entropy_reward_function_returns_learned_reward_initially(): + expected_reward = np.ones(1) + learned_reward_mock = Mock() + learned_reward_mock.return_value = expected_reward + reward_fn = PebbleStateEntropyReward(learned_reward_mock, SPACE) + + # Act + observations = np.ones((BATCH_SIZE, *OBS_SHAPE)) + reward = reward_fn(observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) + + # Assert + assert reward == expected_reward + learned_reward_mock.assert_called_once_with( + observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER + ) + + +def test_pebble_entropy_reward_function_returns_learned_reward_after_pre_training(): + expected_reward = np.ones(1) + learned_reward_mock = Mock() + learned_reward_mock.return_value = expected_reward + reward_fn = PebbleStateEntropyReward(learned_reward_mock, SPACE) + # move all the way to the last state + reward_fn.unsupervised_exploration_start() + reward_fn.unsupervised_exploration_finish() + + # Act + observations = np.ones((BATCH_SIZE, *OBS_SHAPE)) + reward = reward_fn(observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) + + # Assert + assert reward == expected_reward + learned_reward_mock.assert_called_once_with( + observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER + ) + + +def test_pebble_entropy_reward_returns_entropy_for_pretraining(rng): + all_observations = rng.random((BUFFER_SIZE, VENVS, *(OBS_SHAPE))) reward_fn = PebbleStateEntropyReward(Mock(), SPACE, K) reward_fn.set_replay_buffer( - ReplayBufferView(all_observations, lambda: slice(None)), obs_shape + ReplayBufferView(all_observations, lambda: slice(None)), OBS_SHAPE ) + reward_fn.unsupervised_exploration_start() # Act - observations = rng.random((BATCH_SIZE, *obs_shape)) + observations = th.rand((BATCH_SIZE, *(OBS_SHAPE))) reward = reward_fn(observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) # Assert expected = util.compute_state_entropy( - observations, all_observations.reshape(-1, *obs_shape), K + observations, all_observations.reshape(-1, *(OBS_SHAPE)), K ) expected_normalized = 
reward_fn.entropy_stats.normalize( th.as_tensor(expected) @@ -42,17 +80,18 @@ def test_pebble_entropy_reward_returns_entropy(rng): np.testing.assert_allclose(reward, expected_normalized) -def test_pebble_entropy_reward_returns_normalized_values(): +def test_pebble_entropy_reward_returns_normalized_values_for_pretraining(): with patch("imitation.util.util.compute_state_entropy") as m: # mock entropy computation so that we can test only stats collection in this test m.side_effect = lambda obs, all_obs, k: obs reward_fn = PebbleStateEntropyReward(Mock(), SPACE, K) - all_observations = np.empty((BUFFER_SIZE, VENVS, *get_obs_shape(SPACE))) + all_observations = np.empty((BUFFER_SIZE, VENVS, *OBS_SHAPE)) reward_fn.set_replay_buffer( ReplayBufferView(all_observations, lambda: slice(None)), - get_obs_shape(SPACE), + OBS_SHAPE, ) + reward_fn.unsupervised_exploration_start() dim = 8 shift = 3 @@ -77,51 +116,25 @@ def test_pebble_entropy_reward_returns_normalized_values(): def test_pebble_entropy_reward_can_pickle(): - all_observations = np.empty((BUFFER_SIZE, VENVS, *get_obs_shape(SPACE))) + all_observations = np.empty((BUFFER_SIZE, VENVS, *OBS_SHAPE)) replay_buffer = ReplayBufferView(all_observations, lambda: slice(None)) - obs1 = np.random.rand(VENVS, *get_obs_shape(SPACE)) + obs1 = np.random.rand(VENVS, *OBS_SHAPE) reward_fn = PebbleStateEntropyReward(reward_fn_stub, SPACE, K) - reward_fn.set_replay_buffer(replay_buffer, get_obs_shape(SPACE)) + reward_fn.set_replay_buffer(replay_buffer, OBS_SHAPE) reward_fn(obs1, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) # Act pickled = pickle.dumps(reward_fn) reward_fn_deserialized = pickle.loads(pickled) - reward_fn_deserialized.set_replay_buffer(replay_buffer) + reward_fn_deserialized.set_replay_buffer(replay_buffer, OBS_SHAPE) # Assert - obs2 = np.random.rand(VENVS, *get_obs_shape(SPACE)) + obs2 = np.random.rand(VENVS, *OBS_SHAPE) expected_result = reward_fn(obs2, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) actual_result = reward_fn_deserialized(obs2, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) np.testing.assert_allclose(actual_result, expected_result) -def test_pebble_entropy_reward_function_switches_to_inner(): - obs_shape = get_obs_shape(SPACE) - - expected_reward = np.ones(1) - reward_fn_mock = Mock() - reward_fn_mock.return_value = expected_reward - reward_fn = PebbleStateEntropyReward(reward_fn_mock, SPACE) - - # Act - reward_fn.on_unsupervised_exploration_finished() - observations = np.ones((BATCH_SIZE, *obs_shape)) - reward = reward_fn(observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) - - # Assert - assert reward == expected_reward - reward_fn_mock.assert_called_once_with( - observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER - ) - - -def reward_fn_stub( - self, - state: np.ndarray, - action: np.ndarray, - next_state: np.ndarray, - done: np.ndarray, -) -> np.ndarray: +def reward_fn_stub(state, action, next_state, done): return state From e354e16c51bcc36be74247ece6bf1ce503f38883 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 20:54:10 +0100 Subject: [PATCH 31/55] #625 entropy_reward can automatically detect if enough observations are present --- .../algorithms/pebble/entropy_reward.py | 62 +++++++++--------- .../algorithms/pebble/test_entropy_reward.py | 64 +++++++------------ 2 files changed, 53 insertions(+), 73 deletions(-) diff --git a/src/imitation/algorithms/pebble/entropy_reward.py b/src/imitation/algorithms/pebble/entropy_reward.py index 04322f808..3d9d76b00 100644 --- a/src/imitation/algorithms/pebble/entropy_reward.py +++ 
b/src/imitation/algorithms/pebble/entropy_reward.py @@ -16,12 +16,8 @@ class PebbleRewardPhase(Enum): """States representing different behaviors for PebbleStateEntropyReward""" - # Collecting samples so that we have something for entropy calculation - LEARNING_START = auto() - # Entropy based reward - UNSUPERVISED_EXPLORATION = auto() - # Learned reward - POLICY_AND_REWARD_LEARNING = auto() + UNSUPERVISED_EXPLORATION = auto() # Entropy based reward + POLICY_AND_REWARD_LEARNING = auto() # Learned reward class PebbleStateEntropyReward(ReplayBufferAwareRewardFn): @@ -29,14 +25,19 @@ class PebbleStateEntropyReward(ReplayBufferAwareRewardFn): Reward function for implementation of the PEBBLE learning algorithm (https://arxiv.org/pdf/2106.05091.pdf). - The rewards returned by this function go through the three phases - defined in PebbleRewardPhase. To transition between these phases, - unsupervised_exploration_start() and unsupervised_exploration_finish() - need to be called. + The rewards returned by this function go through the three phases: + 1. Before enough samples are collected for entropy calculation, the + underlying function is returned. This shouldn't matter because + OffPolicyAlgorithms have an initialization period for `learning_starts` + timesteps. + 2. During the unsupervised exploration phase, entropy based reward is returned + 3. After unsupervised exploration phase is finished, the underlying learned + reward is returned. - The second phase (UNSUPERVISED_EXPLORATION) also requires that a buffer - with observations to compare against is supplied with set_replay_buffer() - or on_replay_buffer_initialized(). + The second phase requires that a buffer with observations to compare against is + supplied with set_replay_buffer() or on_replay_buffer_initialized(). + To transition to the last phase, unsupervised_exploration_finish() needs + to be called. 
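[A minimal usage sketch of the phase transitions described in the docstring above. The buffer contents, shapes, and the stand-in learned reward below are illustrative only, not part of this patch.]

import numpy as np
from imitation.algorithms.pebble.entropy_reward import PebbleStateEntropyReward
from imitation.policies.replay_buffer_wrapper import ReplayBufferView

def learned_reward(state, action, next_state, done):
    # Stand-in for the trained reward model (assumption for illustration).
    return np.zeros(len(state))

reward_fn = PebbleStateEntropyReward(learned_reward, nearest_neighbor_k=5)

# Unsupervised exploration: entropy reward, once a replay buffer is supplied.
buffer = np.random.rand(100, 1, 3)  # (buffer_size, n_envs, *obs_shape), made up
reward_fn.set_replay_buffer(ReplayBufferView(buffer, lambda: slice(None)), (3,))
obs = np.random.rand(8, 3)
dummy = np.zeros(8)
entropy_rewards = reward_fn(obs, dummy, dummy, dummy)

# Policy and reward learning: calls fall through to the learned reward.
reward_fn.unsupervised_exploration_finish()
learned_rewards = reward_fn(obs, dummy, dummy, dummy)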
Args: learned_reward_fn: The learned reward function used after unsupervised @@ -51,11 +52,10 @@ def __init__( learned_reward_fn: RewardFn, nearest_neighbor_k: int = 5, ): - self.trained_reward_fn = learned_reward_fn + self.learned_reward_fn = learned_reward_fn self.nearest_neighbor_k = nearest_neighbor_k - # TODO support n_envs > 1 self.entropy_stats = RunningNorm(1) - self.state = PebbleRewardPhase.LEARNING_START + self.state = PebbleRewardPhase.UNSUPERVISED_EXPLORATION # These two need to be set with set_replay_buffer(): self.replay_buffer_view = None @@ -68,10 +68,6 @@ def set_replay_buffer(self, replay_buffer: ReplayBufferView, obs_shape: Tuple): self.replay_buffer_view = replay_buffer self.obs_shape = obs_shape - def unsupervised_exploration_start(self): - assert self.state == PebbleRewardPhase.LEARNING_START - self.state = PebbleRewardPhase.UNSUPERVISED_EXPLORATION - def unsupervised_exploration_finish(self): assert self.state == PebbleRewardPhase.UNSUPERVISED_EXPLORATION self.state = PebbleRewardPhase.POLICY_AND_REWARD_LEARNING @@ -84,26 +80,30 @@ def __call__( done: np.ndarray, ) -> np.ndarray: if self.state == PebbleRewardPhase.UNSUPERVISED_EXPLORATION: - return self._entropy_reward(state) + return self._entropy_reward(state, action, next_state, done) else: - return self.trained_reward_fn(state, action, next_state, done) + return self.learned_reward_fn(state, action, next_state, done) - def _entropy_reward(self, state): + def _entropy_reward(self, state, action, next_state, done): if self.replay_buffer_view is None: raise ValueError( "Replay buffer must be supplied before entropy reward can be used" ) - all_observations = self.replay_buffer_view.observations # ReplayBuffer sampling flattens the venv dimension, let's adapt to that all_observations = all_observations.reshape((-1, *self.obs_shape)) - # TODO #625: deal with the conversion back and forth between np and torch - entropies = util.compute_state_entropy( - th.tensor(state), - th.tensor(all_observations), - self.nearest_neighbor_k, - ) - normalized_entropies = self.entropy_stats.forward(entropies) + + if all_observations.shape[0] < self.nearest_neighbor_k: + # not enough observations to compare to, fall back to the learned function + return self.learned_reward_fn(state, action, next_state, done) + else: + # TODO #625: deal with the conversion back and forth between np and torch + entropies = util.compute_state_entropy( + th.tensor(state), + th.tensor(all_observations), + self.nearest_neighbor_k, + ) + normalized_entropies = self.entropy_stats.forward(entropies) return normalized_entropies.numpy() def __getstate__(self): diff --git a/tests/algorithms/pebble/test_entropy_reward.py b/tests/algorithms/pebble/test_entropy_reward.py index 3abd66752..c4f127b09 100644 --- a/tests/algorithms/pebble/test_entropy_reward.py +++ b/tests/algorithms/pebble/test_entropy_reward.py @@ -20,51 +20,13 @@ VENVS = 2 -def test_pebble_entropy_reward_function_returns_learned_reward_initially(): - expected_reward = np.ones(1) - learned_reward_mock = Mock() - learned_reward_mock.return_value = expected_reward - reward_fn = PebbleStateEntropyReward(learned_reward_mock, SPACE) - - # Act - observations = np.ones((BATCH_SIZE, *OBS_SHAPE)) - reward = reward_fn(observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) - - # Assert - assert reward == expected_reward - learned_reward_mock.assert_called_once_with( - observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER - ) - - -def test_pebble_entropy_reward_function_returns_learned_reward_after_pre_training(): - 
expected_reward = np.ones(1) - learned_reward_mock = Mock() - learned_reward_mock.return_value = expected_reward - reward_fn = PebbleStateEntropyReward(learned_reward_mock, SPACE) - # move all the way to the last state - reward_fn.unsupervised_exploration_start() - reward_fn.unsupervised_exploration_finish() - - # Act - observations = np.ones((BATCH_SIZE, *OBS_SHAPE)) - reward = reward_fn(observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) - - # Assert - assert reward == expected_reward - learned_reward_mock.assert_called_once_with( - observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER - ) - - def test_pebble_entropy_reward_returns_entropy_for_pretraining(rng): all_observations = rng.random((BUFFER_SIZE, VENVS, *(OBS_SHAPE))) - reward_fn = PebbleStateEntropyReward(Mock(), SPACE, K) + reward_fn = PebbleStateEntropyReward(Mock(), K) reward_fn.set_replay_buffer( ReplayBufferView(all_observations, lambda: slice(None)), OBS_SHAPE ) - reward_fn.unsupervised_exploration_start() # Act observations = th.rand((BATCH_SIZE, *(OBS_SHAPE))) @@ -85,13 +47,12 @@ def test_pebble_entropy_reward_returns_normalized_values_for_pretraining(): # mock entropy computation so that we can test only stats collection in this test m.side_effect = lambda obs, all_obs, k: obs - reward_fn = PebbleStateEntropyReward(Mock(), SPACE, K) + reward_fn = PebbleStateEntropyReward(Mock(), K) all_observations = np.empty((BUFFER_SIZE, VENVS, *OBS_SHAPE)) reward_fn.set_replay_buffer( ReplayBufferView(all_observations, lambda: slice(None)), OBS_SHAPE, ) - reward_fn.unsupervised_exploration_start() dim = 8 shift = 3 @@ -115,12 +76,31 @@ def test_pebble_entropy_reward_returns_normalized_values_for_pretraining(): ) +def test_pebble_entropy_reward_function_returns_learned_reward_after_pre_training(): + expected_reward = np.ones(1) + learned_reward_mock = Mock() + learned_reward_mock.return_value = expected_reward + reward_fn = PebbleStateEntropyReward(learned_reward_mock) + # move all the way to the last state + reward_fn.unsupervised_exploration_finish() + + # Act + observations = np.ones((BATCH_SIZE, *OBS_SHAPE)) + reward = reward_fn(observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) + + # Assert + assert reward == expected_reward + learned_reward_mock.assert_called_once_with( + observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER + ) + + def test_pebble_entropy_reward_can_pickle(): all_observations = np.empty((BUFFER_SIZE, VENVS, *OBS_SHAPE)) replay_buffer = ReplayBufferView(all_observations, lambda: slice(None)) obs1 = np.random.rand(VENVS, *OBS_SHAPE) - reward_fn = PebbleStateEntropyReward(reward_fn_stub, SPACE, K) + reward_fn = PebbleStateEntropyReward(reward_fn_stub, K) reward_fn.set_replay_buffer(replay_buffer, OBS_SHAPE) reward_fn(obs1, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) From b8ccf2f8f2140909c47ac636167be432800c7c5e Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 21:12:18 +0100 Subject: [PATCH 32/55] #625 fix entropy shape --- src/imitation/algorithms/pebble/entropy_reward.py | 5 +++-- src/imitation/util/util.py | 3 +-- tests/algorithms/pebble/test_entropy_reward.py | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/imitation/algorithms/pebble/entropy_reward.py b/src/imitation/algorithms/pebble/entropy_reward.py index 3d9d76b00..e0d94c171 100644 --- a/src/imitation/algorithms/pebble/entropy_reward.py +++ b/src/imitation/algorithms/pebble/entropy_reward.py @@ -94,7 +94,8 @@ def _entropy_reward(self, state, action, next_state, done): all_observations = all_observations.reshape((-1, 
*self.obs_shape)) if all_observations.shape[0] < self.nearest_neighbor_k: - # not enough observations to compare to, fall back to the learned function + # not enough observations to compare to, fall back to the learned function; + # (falling back to a constant may also be ok) return self.learned_reward_fn(state, action, next_state, done) else: # TODO #625: deal with the conversion back and forth between np and torch @@ -104,7 +105,7 @@ def _entropy_reward(self, state, action, next_state, done): self.nearest_neighbor_k, ) normalized_entropies = self.entropy_stats.forward(entropies) - return normalized_entropies.numpy() + return normalized_entropies.numpy() def __getstate__(self): state = self.__dict__.copy() diff --git a/src/imitation/util/util.py b/src/imitation/util/util.py index 9e5815e0c..9bf1c1a40 100644 --- a/src/imitation/util/util.py +++ b/src/imitation/util/util.py @@ -389,5 +389,4 @@ def compute_state_entropy( # a point is itself, which we want to skip. assert distances_tensor.shape[-1] > k knn_dists = th.kthvalue(distances_tensor, k=k + 1, dim=1).values - state_entropy = knn_dists - return state_entropy.unsqueeze(1) + return knn_dists diff --git a/tests/algorithms/pebble/test_entropy_reward.py b/tests/algorithms/pebble/test_entropy_reward.py index c4f127b09..918222382 100644 --- a/tests/algorithms/pebble/test_entropy_reward.py +++ b/tests/algorithms/pebble/test_entropy_reward.py @@ -21,7 +21,7 @@ def test_pebble_entropy_reward_returns_entropy_for_pretraining(rng): - all_observations = rng.random((BUFFER_SIZE, VENVS, *(OBS_SHAPE))) + all_observations = rng.random((BUFFER_SIZE, VENVS, *OBS_SHAPE)) reward_fn = PebbleStateEntropyReward(Mock(), K) reward_fn.set_replay_buffer( @@ -29,12 +29,12 @@ def test_pebble_entropy_reward_returns_entropy_for_pretraining(rng): ) # Act - observations = th.rand((BATCH_SIZE, *(OBS_SHAPE))) + observations = th.rand((BATCH_SIZE, *OBS_SHAPE)) reward = reward_fn(observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) # Assert expected = util.compute_state_entropy( - observations, all_observations.reshape(-1, *(OBS_SHAPE)), K + observations, all_observations.reshape(-1, *OBS_SHAPE), K ) expected_normalized = reward_fn.entropy_stats.normalize( th.as_tensor(expected) From c5f1dba1bd1da18d9ed35410dd698bf1a8c9a167 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 21:15:03 +0100 Subject: [PATCH 33/55] #625 rename unsupervised_agent_pretrain_frac parameter --- .../algorithms/preference_comparisons.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/imitation/algorithms/preference_comparisons.py b/src/imitation/algorithms/preference_comparisons.py index e29433188..96af17cfd 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -1508,7 +1508,7 @@ def __init__( transition_oversampling: float = 1, initial_comparison_frac: float = 0.1, initial_epoch_multiplier: float = 200.0, - initial_agent_pretrain_frac: float = 0.05, + unsupervised_agent_pretrain_frac: float = 0.05, custom_logger: Optional[imit_logger.HierarchicalLogger] = None, allow_variable_horizon: bool = False, rng: Optional[np.random.Generator] = None, @@ -1558,7 +1558,7 @@ def __init__( initial_epoch_multiplier: before agent training begins, train the reward model for this many more epochs than usual (on fragments sampled from a random agent). 
- initial_agent_pretrain_frac: fraction of total_timesteps for which the + unsupervised_agent_pretrain_frac: fraction of total_timesteps for which the agent will be trained without preference gathering (and reward model training) custom_logger: Where to log to; if None (default), creates a new logger. @@ -1659,7 +1659,7 @@ def __init__( self.fragment_length = fragment_length self.initial_comparison_frac = initial_comparison_frac self.initial_epoch_multiplier = initial_epoch_multiplier - self.initial_agent_pretrain_frac = initial_agent_pretrain_frac + self.unsupervised_agent_pretrain_frac = unsupervised_agent_pretrain_frac self.num_iterations = num_iterations self.transition_oversampling = transition_oversampling if callable(query_schedule): @@ -1693,7 +1693,7 @@ def train( print(f"Query schedule: {preference_query_schedule}") ( - agent_pretrain_timesteps, + unsupervised_pretrain_timesteps, timesteps_per_iteration, extra_timesteps, ) = self._compute_timesteps(total_timesteps) @@ -1705,9 +1705,9 @@ def train( ################################################### with self.logger.accumulate_means("agent"): self.logger.log( - f"Pre-training agent for {agent_pretrain_timesteps} timesteps" + f"Pre-training agent for {unsupervised_pretrain_timesteps} timesteps" ) - self.trajectory_generator.unsupervised_pretrain(agent_pretrain_timesteps) + self.trajectory_generator.unsupervised_pretrain(unsupervised_pretrain_timesteps) for i, num_pairs in enumerate(preference_query_schedule): ########################## @@ -1784,11 +1784,11 @@ def _preference_gather_schedule(self, total_comparisons): return schedule def _compute_timesteps(self, total_timesteps: int) -> Tuple[int, int, int]: - agent_pretrain_timesteps = int( - total_timesteps * self.initial_agent_pretrain_frac + unsupervised_pretrain_timesteps = int( + total_timesteps * self.unsupervised_agent_pretrain_frac ) timesteps_per_iteration, extra_timesteps = divmod( - total_timesteps - agent_pretrain_timesteps, + total_timesteps - unsupervised_pretrain_timesteps, self.num_iterations, ) - return agent_pretrain_timesteps, timesteps_per_iteration, extra_timesteps + return unsupervised_pretrain_timesteps, timesteps_per_iteration, extra_timesteps From 0ba89593602d3438c0d0bf07a89e351b1cdcbe6c Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 21:32:35 +0100 Subject: [PATCH 34/55] #625 specialized PebbleAgentTrainer to distinguish from old preference comparison trainer --- .../algorithms/preference_comparisons.py | 26 ++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/src/imitation/algorithms/preference_comparisons.py b/src/imitation/algorithms/preference_comparisons.py index 96af17cfd..7d7466b26 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -33,6 +33,7 @@ from tqdm.auto import tqdm from imitation.algorithms import base +from imitation.algorithms.pebble.entropy_reward import PebbleStateEntropyReward from imitation.data import rollout, types, wrappers from imitation.data.types import ( AnyPath, @@ -329,6 +330,27 @@ def logger(self, value: imit_logger.HierarchicalLogger) -> None: self.algorithm.set_logger(self.logger) +class PebbleAgentTrainer(AgentTrainer): + """ + Specialization of AgentTrainer for PEBBLE training. + Includes unsupervised pretraining with an entropy based reward function. 
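[Aside: the pretraining budget handed to unsupervised_pretrain() comes from the _compute_timesteps() split shown above. A worked example with illustrative numbers, assuming the renamed unsupervised_agent_pretrain_frac default of 0.05 and num_iterations=5.]

total_timesteps, pretrain_frac, num_iterations = 1_000_000, 0.05, 5
unsupervised_pretrain_timesteps = int(total_timesteps * pretrain_frac)  # 50_000
timesteps_per_iteration, extra_timesteps = divmod(
    total_timesteps - unsupervised_pretrain_timesteps, num_iterations
)
assert (timesteps_per_iteration, extra_timesteps) == (190_000, 0)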
+ """ + + reward_fn: PebbleStateEntropyReward + + def __init__( + self, + *, + reward_fn: PebbleStateEntropyReward, + **kwargs, + ) -> None: + super().__init__(reward_fn=reward_fn, **kwargs) + + def unsupervised_pretrain(self, steps: int, **kwargs: Any) -> None: + self.train(steps, **kwargs) + self.reward_fn.unsupervised_exploration_finish() + + def _get_trajectories( trajectories: Sequence[TrajectoryWithRew], steps: int, @@ -1707,7 +1729,9 @@ def train( self.logger.log( f"Pre-training agent for {unsupervised_pretrain_timesteps} timesteps" ) - self.trajectory_generator.unsupervised_pretrain(unsupervised_pretrain_timesteps) + self.trajectory_generator.unsupervised_pretrain( + unsupervised_pretrain_timesteps + ) for i, num_pairs in enumerate(preference_query_schedule): ########################## From c55fee727dd100f1af0efaef983ecdb05a13428d Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 22:14:38 +0100 Subject: [PATCH 35/55] #625 merge pebble to train_preference_comparisons.py and configure only through sacred --- .../algorithms/preference_comparisons.py | 4 ++++ .../config/train_preference_comparisons.py | 24 ++++++++++++++++++- .../scripts/train_preference_comparisons.py | 8 +++++++ 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/src/imitation/algorithms/preference_comparisons.py b/src/imitation/algorithms/preference_comparisons.py index 7d7466b26..fe5dc472e 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -344,6 +344,10 @@ def __init__( reward_fn: PebbleStateEntropyReward, **kwargs, ) -> None: + if not isinstance(reward_fn, PebbleStateEntropyReward): + raise ValueError( + f"{self.__class__.__name__} expects {PebbleStateEntropyReward.__name__} reward function" + ) super().__init__(reward_fn=reward_fn, **kwargs) def unsupervised_pretrain(self, steps: int, **kwargs: Any) -> None: diff --git a/src/imitation/scripts/config/train_preference_comparisons.py b/src/imitation/scripts/config/train_preference_comparisons.py index ba4e9483c..227142814 100644 --- a/src/imitation/scripts/config/train_preference_comparisons.py +++ b/src/imitation/scripts/config/train_preference_comparisons.py @@ -1,8 +1,10 @@ """Configuration for imitation.scripts.train_preference_comparisons.""" import sacred +import stable_baselines3 as sb3 from imitation.algorithms import preference_comparisons +from imitation.policies import base from imitation.scripts.common import common, reward, rl, train train_preference_comparisons_ex = sacred.Experiment( @@ -15,7 +17,6 @@ ], ) - MUJOCO_SHARED_LOCALS = dict(rl=dict(rl_kwargs=dict(ent_coef=0.1))) ANT_SHARED_LOCALS = dict( total_timesteps=int(3e7), @@ -61,6 +62,26 @@ def train_defaults(): query_schedule = "hyperbolic" +@train_preference_comparisons_ex.named_config +def pebble(): + # fraction of total_timesteps for training before preference gathering + unsupervised_agent_pretrain_frac = 0.05 + pebble_nearest_neighbor_k = 5 + + rl = { + "rl_cls": sb3.SAC, + "batch_size": 256, # batch size for RL algorithm + "rl_kwargs": {"batch_size": None}, # make sure to set batch size to None + } + train = { + "policy_cls": base.SAC1024Policy, # noqa: F841 + } + common = {"env_name": "MountainCarContinuous-v0"} + allow_variable_horizon = True + + locals() # quieten flake8 + + @train_preference_comparisons_ex.named_config def cartpole(): common = dict(env_name="CartPole-v1") @@ -121,6 +142,7 @@ def fast(): total_timesteps = 50 total_comparisons = 5 initial_comparison_frac = 0.2 + 
unsupervised_agent_pretrain_frac = 0.2 num_iterations = 1 fragment_length = 2 reward_trainer_kwargs = { diff --git a/src/imitation/scripts/train_preference_comparisons.py b/src/imitation/scripts/train_preference_comparisons.py index 331a4797a..cfa87a960 100644 --- a/src/imitation/scripts/train_preference_comparisons.py +++ b/src/imitation/scripts/train_preference_comparisons.py @@ -82,6 +82,8 @@ def train_preference_comparisons( allow_variable_horizon: bool, checkpoint_interval: int, query_schedule: Union[str, type_aliases.Schedule], + unsupervised_agent_pretrain_frac: Optional[float], + pebble_nearest_neighbor_k: Optional[int], ) -> Mapping[str, Any]: """Train a reward model using preference comparisons. @@ -141,6 +143,11 @@ def train_preference_comparisons( be allocated to each iteration. "hyperbolic" and "inverse_quadratic" apportion fewer queries to later iterations when the policy is assumed to be better and more stable. + unsupervised_agent_pretrain_frac: fraction of total_timesteps for which the + agent will be trained without preference gathering (and reward model + training) + pebble_nearest_neighbor_k: Parameter for state entropy computation (for PEBBLE + training only) Returns: Rollout statistics from trained policy. @@ -244,6 +251,7 @@ def train_preference_comparisons( custom_logger=custom_logger, allow_variable_horizon=allow_variable_horizon, query_schedule=query_schedule, + unsupervised_agent_pretrain_frac=unsupervised_agent_pretrain_frac, ) def save_callback(iteration_num): From 1f9642a362a1e1d3d075e7944bd8952ab915bf12 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 22:56:56 +0100 Subject: [PATCH 36/55] #625 plug in pebble according to parameters --- .../config/train_preference_comparisons.py | 3 + .../train_preference_comparisons_pebble.py | 163 ---------- .../scripts/train_preference_comparisons.py | 83 +++-- .../train_preference_comparisons_pebble.py | 292 ------------------ 4 files changed, 68 insertions(+), 473 deletions(-) delete mode 100644 src/imitation/scripts/config/train_preference_comparisons_pebble.py delete mode 100644 src/imitation/scripts/train_preference_comparisons_pebble.py diff --git a/src/imitation/scripts/config/train_preference_comparisons.py b/src/imitation/scripts/config/train_preference_comparisons.py index 227142814..ca0e786ff 100644 --- a/src/imitation/scripts/config/train_preference_comparisons.py +++ b/src/imitation/scripts/config/train_preference_comparisons.py @@ -60,11 +60,14 @@ def train_defaults(): checkpoint_interval = 0 # Num epochs between saving (<0 disables, =0 final only) query_schedule = "hyperbolic" + # Whether to use the PEBBLE algorithm (https://arxiv.org/pdf/2106.05091.pdf) + pebble_enabled = False @train_preference_comparisons_ex.named_config def pebble(): # fraction of total_timesteps for training before preference gathering + pebble_enabled = True unsupervised_agent_pretrain_frac = 0.05 pebble_nearest_neighbor_k = 5 diff --git a/src/imitation/scripts/config/train_preference_comparisons_pebble.py b/src/imitation/scripts/config/train_preference_comparisons_pebble.py deleted file mode 100644 index a497542e7..000000000 --- a/src/imitation/scripts/config/train_preference_comparisons_pebble.py +++ /dev/null @@ -1,163 +0,0 @@ -"""Configuration for imitation.scripts.train_preference_comparisons_pebble.""" - -import warnings - -import sacred -import stable_baselines3 as sb3 - -from imitation.algorithms import preference_comparisons -from imitation.policies import base -from imitation.scripts.common import common, 
reward, rl, train - -train_preference_comparisons_pebble_ex = sacred.Experiment( - "train_preference_comparisons_pebble", - ingredients=[ - common.common_ingredient, - reward.reward_ingredient, - rl.rl_ingredient, - train.train_ingredient, - ], -) - -MUJOCO_SHARED_LOCALS = dict(rl=dict(rl_kwargs=dict(ent_coef=0.1))) -ANT_SHARED_LOCALS = dict( - total_timesteps=int(3e7), - rl=dict(batch_size=16384), -) - - -@rl.rl_ingredient.config -def rl_sac(): - # For recommended SAC hyperparams in each environment, see: - # https://github.com/DLR-RM/rl-baselines3-zoo/blob/master/hyperparams/sac.yml - rl_cls = sb3.SAC - warnings.warn( - "SAC currently only supports continuous action spaces. " - "Consider adding a discrete version as mentioned here: " - "https://github.com/DLR-RM/stable-baselines3/issues/505", - category=RuntimeWarning, - ) - # Default HPs are as follows: - batch_size = 256 # batch size for RL algorithm - rl_kwargs = dict(batch_size=None) # make sure to set batch size to None - locals() # quieten flake8 - - -@train.train_ingredient.config -def train_sac(): - policy_cls = base.SAC1024Policy # noqa: F841 - locals() # quieten flake8 - - -@common.common_ingredient.config -def common_mountain_car_continuous(): - env_name = "MountainCarContinuous-v0" - locals() # quieten flake8 - - -@train_preference_comparisons_pebble_ex.config -def train_defaults(): - fragment_length = 100 # timesteps per fragment used for comparisons - total_timesteps = int(1e6) # total number of environment timesteps - total_comparisons = 5000 # total number of comparisons to elicit - num_iterations = 5 # Arbitrary, should be tuned for the task - comparison_queue_size = None - # factor by which to oversample transitions before creating fragments - transition_oversampling = 1 - # fraction of total_comparisons that will be sampled right at the beginning - initial_comparison_frac = 0.1 - # fraction of sampled trajectories that will include some random actions - exploration_frac = 0.0 - # fraction of total_timesteps for training before preference gathering - initial_agent_pretrain_frac = 0.05 - preference_model_kwargs = {} - reward_trainer_kwargs = { - "epochs": 3, - } - save_preferences = False # save preference dataset at the end? 
- agent_path = None # path to a (partially) trained agent to load at the beginning - # type of PreferenceGatherer to use - gatherer_cls = preference_comparisons.SyntheticGatherer - # arguments passed on to the PreferenceGatherer specified by gatherer_cls - gatherer_kwargs = {} - active_selection = False - active_selection_oversampling = 2 - uncertainty_on = "logit" - fragmenter_kwargs = { - "warning_threshold": 0, - } - # path to a pickled sequence of trajectories used instead of training an agent - trajectory_path = None - trajectory_generator_kwargs = {} # kwargs to pass to trajectory generator - allow_variable_horizon = False - - checkpoint_interval = 0 # Num epochs between saving (<0 disables, =0 final only) - query_schedule = "hyperbolic" - - -@train_preference_comparisons_pebble_ex.named_config -def cartpole(): - common = dict(env_name="CartPole-v1") - allow_variable_horizon = True - - -@train_preference_comparisons_pebble_ex.named_config -def seals_ant(): - locals().update(**MUJOCO_SHARED_LOCALS) - locals().update(**ANT_SHARED_LOCALS) - common = dict(env_name="seals/Ant-v0") - - -@train_preference_comparisons_pebble_ex.named_config -def half_cheetah(): - locals().update(**MUJOCO_SHARED_LOCALS) - common = dict(env_name="HalfCheetah-v2") - rl = dict(batch_size=16384, rl_kwargs=dict(batch_size=1024)) - - -@train_preference_comparisons_pebble_ex.named_config -def seals_hopper(): - locals().update(**MUJOCO_SHARED_LOCALS) - common = dict(env_name="seals/Hopper-v0") - - -@train_preference_comparisons_pebble_ex.named_config -def seals_humanoid(): - locals().update(**MUJOCO_SHARED_LOCALS) - common = dict(env_name="seals/Humanoid-v0") - total_timesteps = int(4e6) - - -@train_preference_comparisons_pebble_ex.named_config -def seals_cartpole(): - common = dict(env_name="seals/CartPole-v0") - - -@train_preference_comparisons_pebble_ex.named_config -def pendulum(): - common = dict(env_name="Pendulum-v1") - - -@train_preference_comparisons_pebble_ex.named_config -def mountain_car(): - common = dict(env_name="MountainCar-v0") - allow_variable_horizon = True - - -@train_preference_comparisons_pebble_ex.named_config -def seals_mountain_car(): - common = dict(env_name="seals/MountainCar-v0") - - -@train_preference_comparisons_pebble_ex.named_config -def fast(): - # Minimize the amount of computation. Useful for test cases. - total_timesteps = 50 - total_comparisons = 5 - initial_comparison_frac = 0.2 - initial_agent_pretrain_frac = 0.2 - num_iterations = 1 - fragment_length = 2 - reward_trainer_kwargs = { - "epochs": 1, - } diff --git a/src/imitation/scripts/train_preference_comparisons.py b/src/imitation/scripts/train_preference_comparisons.py index cfa87a960..c848a6d09 100644 --- a/src/imitation/scripts/train_preference_comparisons.py +++ b/src/imitation/scripts/train_preference_comparisons.py @@ -3,24 +3,27 @@ Can be used as a CLI script, or the `train_preference_comparisons` function can be called directly. 
""" - import functools import pathlib from typing import Any, Mapping, Optional, Type, Union +import numpy as np import torch as th from sacred.observers import FileStorageObserver -from stable_baselines3.common import type_aliases +from stable_baselines3.common import type_aliases, base_class, vec_env from imitation.algorithms import preference_comparisons +from imitation.algorithms.pebble.entropy_reward import PebbleStateEntropyReward from imitation.data import types from imitation.policies import serialize +from imitation.rewards import reward_nets, reward_function from imitation.scripts.common import common, reward from imitation.scripts.common import rl as rl_common from imitation.scripts.common import train from imitation.scripts.config.train_preference_comparisons import ( train_preference_comparisons_ex, ) +from imitation.util import logger as imit_logger def save_model( @@ -57,6 +60,59 @@ def save_checkpoint( ) +@train_preference_comparisons_ex.capture +def make_reward_function( + reward_net: reward_nets.RewardNet, + *, + pebble_enabled: bool = False, + pebble_nearest_neighbor_k: Optional[int] = None, +): + relabel_reward_fn = functools.partial( + reward_net.predict_processed, + update_stats=False, + ) + if pebble_enabled: + relabel_reward_fn = PebbleStateEntropyReward( + relabel_reward_fn, pebble_nearest_neighbor_k + ) + return relabel_reward_fn + + +@train_preference_comparisons_ex.capture +def make_agent_trajectory_generator( + venv: vec_env.VecEnv, + agent: base_class.BaseAlgorithm, + reward_net: reward_nets.RewardNet, + relabel_reward_fn: reward_function.RewardFn, + rng: np.random.Generator, + custom_logger: Optional[imit_logger.HierarchicalLogger], + *, + exploration_frac: float, + pebble_enabled: bool, + trajectory_generator_kwargs: Mapping[str, Any], +) -> preference_comparisons.AgentTrainer: + if pebble_enabled: + return preference_comparisons.PebbleAgentTrainer( + algorithm=agent, + reward_fn=relabel_reward_fn, + venv=venv, + exploration_frac=exploration_frac, + rng=rng, + custom_logger=custom_logger, + **trajectory_generator_kwargs, + ) + else: + return preference_comparisons.AgentTrainer( + algorithm=agent, + reward_fn=reward_net, + venv=venv, + exploration_frac=exploration_frac, + rng=rng, + custom_logger=custom_logger, + **trajectory_generator_kwargs, + ) + + @train_preference_comparisons_ex.main def train_preference_comparisons( total_timesteps: int, @@ -83,7 +139,6 @@ def train_preference_comparisons( checkpoint_interval: int, query_schedule: Union[str, type_aliases.Schedule], unsupervised_agent_pretrain_frac: Optional[float], - pebble_nearest_neighbor_k: Optional[int], ) -> Mapping[str, Any]: """Train a reward model using preference comparisons. @@ -146,8 +201,6 @@ def train_preference_comparisons( unsupervised_agent_pretrain_frac: fraction of total_timesteps for which the agent will be trained without preference gathering (and reward model training) - pebble_nearest_neighbor_k: Parameter for state entropy computation (for PEBBLE - training only) Returns: Rollout statistics from trained policy. 
@@ -160,10 +213,8 @@ def train_preference_comparisons( with common.make_venv() as venv: reward_net = reward.make_reward_net(venv) - relabel_reward_fn = functools.partial( - reward_net.predict_processed, - update_stats=False, - ) + relabel_reward_fn = make_reward_function(reward_net) + if agent_path is None: agent = rl_common.make_rl_algo(venv, relabel_reward_fn=relabel_reward_fn) else: @@ -176,21 +227,17 @@ def train_preference_comparisons( if trajectory_path is None: # Setting the logger here is not necessary (PreferenceComparisons takes care # of it automatically) but it avoids creating unnecessary loggers. - agent_trainer = preference_comparisons.AgentTrainer( - algorithm=agent, - reward_fn=reward_net, + trajectory_generator = make_agent_trajectory_generator( venv=venv, - exploration_frac=exploration_frac, + agent=agent, + reward_net=reward_net, + relabel_reward_fn=relabel_reward_fn, rng=rng, custom_logger=custom_logger, - **trajectory_generator_kwargs, ) # Stable Baselines will automatically occupy GPU 0 if it is available. # Let's use the same device as the SB3 agent for the reward model. - reward_net = reward_net.to(agent_trainer.algorithm.device) - trajectory_generator: preference_comparisons.TrajectoryGenerator = ( - agent_trainer - ) + reward_net = reward_net.to(trajectory_generator.algorithm.device) else: if exploration_frac > 0: raise ValueError( diff --git a/src/imitation/scripts/train_preference_comparisons_pebble.py b/src/imitation/scripts/train_preference_comparisons_pebble.py deleted file mode 100644 index f34eefb9d..000000000 --- a/src/imitation/scripts/train_preference_comparisons_pebble.py +++ /dev/null @@ -1,292 +0,0 @@ -"""Train a reward model using preference comparisons. - -Can be used as a CLI script, or the `train_preference_comparisons` function -can be called directly. -""" - -import functools -import pathlib -from typing import Any, Mapping, Optional, Type, Union - -import torch as th -from sacred.observers import FileStorageObserver -from stable_baselines3.common import type_aliases - -from imitation.algorithms import preference_comparisons -from imitation.data import types -from imitation.policies import serialize -from imitation.scripts.common import common, reward -from imitation.scripts.common import rl as rl_common -from imitation.scripts.common import train -from imitation.scripts.config.train_preference_comparisons_pebble import ( - train_preference_comparisons_pebble_ex, -) - - -def save_model( - agent_trainer: preference_comparisons.AgentTrainer, - save_path: pathlib.Path, -): - """Save the model as `model.zip`.""" - serialize.save_stable_model( - output_dir=save_path / "policy", - model=agent_trainer.algorithm, - ) - - -def save_checkpoint( - trainer: preference_comparisons.PreferenceComparisons, - save_path: pathlib.Path, - allow_save_policy: Optional[bool], -): - """Save reward model and optionally policy.""" - save_path.mkdir(parents=True, exist_ok=True) - th.save(trainer.model, save_path / "reward_net.pt") - if allow_save_policy: - # Note: We should only save the model as model.zip if `trajectory_generator` - # contains one. Currently we are slightly over-conservative, by requiring - # that an AgentTrainer be used if we're saving the policy. 
- assert isinstance( - trainer.trajectory_generator, - preference_comparisons.AgentTrainer, - ) - save_model(trainer.trajectory_generator, save_path) - else: - trainer.logger.warn( - "trainer.trajectory_generator doesn't contain a policy to save.", - ) - - -@train_preference_comparisons_pebble_ex.main -def train_preference_comparisons( - total_timesteps: int, - total_comparisons: int, - num_iterations: int, - comparison_queue_size: Optional[int], - fragment_length: int, - transition_oversampling: float, - initial_comparison_frac: float, - exploration_frac: float, - trajectory_path: Optional[str], - trajectory_generator_kwargs: Mapping[str, Any], - save_preferences: bool, - agent_path: Optional[str], - preference_model_kwargs: Mapping[str, Any], - reward_trainer_kwargs: Mapping[str, Any], - gatherer_cls: Type[preference_comparisons.PreferenceGatherer], - gatherer_kwargs: Mapping[str, Any], - active_selection: bool, - active_selection_oversampling: int, - uncertainty_on: str, - fragmenter_kwargs: Mapping[str, Any], - allow_variable_horizon: bool, - checkpoint_interval: int, - query_schedule: Union[str, type_aliases.Schedule], -) -> Mapping[str, Any]: - """Train a reward model using preference comparisons. - - Args: - total_timesteps: number of environment interaction steps - total_comparisons: number of preferences to gather in total - num_iterations: number of times to train the agent against the reward model - and then train the reward model against newly gathered preferences. - comparison_queue_size: the maximum number of comparisons to keep in the - queue for training the reward model. If None, the queue will grow - without bound as new comparisons are added. - fragment_length: number of timesteps per fragment that is used to elicit - preferences - transition_oversampling: factor by which to oversample transitions before - creating fragments. Since fragments are sampled with replacement, - this is usually chosen > 1 to avoid having the same transition - in too many fragments. - initial_comparison_frac: fraction of total_comparisons that will be - sampled before the rest of training begins (using the randomly initialized - agent). This can be used to pretrain the reward model before the agent - is trained on the learned reward. - exploration_frac: fraction of trajectory samples that will be created using - partially random actions, rather than the current policy. Might be helpful - if the learned policy explores too little and gets stuck with a wrong - reward. - trajectory_path: either None, in which case an agent will be trained - and used to sample trajectories on the fly, or a path to a pickled - sequence of TrajectoryWithRew to be trained on. - trajectory_generator_kwargs: kwargs to pass to the trajectory generator. - save_preferences: if True, store the final dataset of preferences to disk. - agent_path: if given, initialize the agent using this stored policy - rather than randomly. - preference_model_kwargs: passed to PreferenceModel - reward_trainer_kwargs: passed to BasicRewardTrainer or EnsembleRewardTrainer - gatherer_cls: type of PreferenceGatherer to use (defaults to SyntheticGatherer) - gatherer_kwargs: passed to the PreferenceGatherer specified by gatherer_cls - active_selection: use active selection fragmenter instead of random fragmenter - active_selection_oversampling: factor by which to oversample random fragments - from the base fragmenter of active selection. 
- this is usually chosen > 1 to allow the active selection algorithm to pick - fragment pairs with highest uncertainty. = 1 implies no active selection. - uncertainty_on: passed to ActiveSelectionFragmenter - fragmenter_kwargs: passed to RandomFragmenter - allow_variable_horizon: If False (default), algorithm will raise an - exception if it detects trajectories of different length during - training. If True, overrides this safety check. WARNING: variable - horizon episodes leak information about the reward via termination - condition, and can seriously confound evaluation. Read - https://imitation.readthedocs.io/en/latest/guide/variable_horizon.html - before overriding this. - checkpoint_interval: Save the reward model and policy models (if - trajectory_generator contains a policy) every `checkpoint_interval` - iterations and after training is complete. If 0, then only save weights - after training is complete. If <0, then don't save weights at all. - query_schedule: one of ("constant", "hyperbolic", "inverse_quadratic"). - A function indicating how the total number of preference queries should - be allocated to each iteration. "hyperbolic" and "inverse_quadratic" - apportion fewer queries to later iterations when the policy is assumed - to be better and more stable. - - Returns: - Rollout statistics from trained policy. - - Raises: - ValueError: Inconsistency between config and deserialized policy normalization. - """ - custom_logger, log_dir = common.setup_logging() - rng = common.make_rng() - - with common.make_venv() as venv: - reward_net = reward.make_reward_net(venv) - relabel_reward_fn = functools.partial( - reward_net.predict_processed, - update_stats=False, - ) - if agent_path is None: - agent = rl_common.make_rl_algo(venv, relabel_reward_fn=relabel_reward_fn) - else: - agent = rl_common.load_rl_algo_from_path( - agent_path=agent_path, - venv=venv, - relabel_reward_fn=relabel_reward_fn, - ) - - if trajectory_path is None: - # Setting the logger here is not necessary (PreferenceComparisons takes care - # of it automatically) but it avoids creating unnecessary loggers. - agent_trainer = preference_comparisons.AgentTrainer( - algorithm=agent, - reward_fn=reward_net, - venv=venv, - exploration_frac=exploration_frac, - rng=rng, - custom_logger=custom_logger, - **trajectory_generator_kwargs, - ) - # Stable Baselines will automatically occupy GPU 0 if it is available. - # Let's use the same device as the SB3 agent for the reward model. 
- reward_net = reward_net.to(agent_trainer.algorithm.device) - trajectory_generator: preference_comparisons.TrajectoryGenerator = ( - agent_trainer - ) - else: - if exploration_frac > 0: - raise ValueError( - "exploration_frac can't be set when a trajectory dataset is used", - ) - trajectory_generator = preference_comparisons.TrajectoryDataset( - trajectories=types.load_with_rewards(trajectory_path), - rng=rng, - custom_logger=custom_logger, - **trajectory_generator_kwargs, - ) - - fragmenter: preference_comparisons.Fragmenter = ( - preference_comparisons.RandomFragmenter( - **fragmenter_kwargs, - rng=rng, - custom_logger=custom_logger, - ) - ) - preference_model = preference_comparisons.PreferenceModel( - **preference_model_kwargs, - model=reward_net, - ) - if active_selection: - fragmenter = preference_comparisons.ActiveSelectionFragmenter( - preference_model=preference_model, - base_fragmenter=fragmenter, - fragment_sample_factor=active_selection_oversampling, - uncertainty_on=uncertainty_on, - custom_logger=custom_logger, - ) - gatherer = gatherer_cls( - **gatherer_kwargs, - rng=rng, - custom_logger=custom_logger, - ) - - loss = preference_comparisons.CrossEntropyRewardLoss() - - reward_trainer = preference_comparisons._make_reward_trainer( - preference_model, - loss, - rng, - reward_trainer_kwargs, - ) - - main_trainer = preference_comparisons.PreferenceComparisons( - trajectory_generator, - reward_net, - num_iterations=num_iterations, - fragmenter=fragmenter, - preference_gatherer=gatherer, - reward_trainer=reward_trainer, - comparison_queue_size=comparison_queue_size, - fragment_length=fragment_length, - transition_oversampling=transition_oversampling, - initial_comparison_frac=initial_comparison_frac, - custom_logger=custom_logger, - allow_variable_horizon=allow_variable_horizon, - query_schedule=query_schedule, - ) - - def save_callback(iteration_num): - if checkpoint_interval > 0 and iteration_num % checkpoint_interval == 0: - save_checkpoint( - trainer=main_trainer, - save_path=log_dir / "checkpoints" / f"{iteration_num:04d}", - allow_save_policy=bool(trajectory_path is None), - ) - - results = main_trainer.train( - total_timesteps, - total_comparisons, - callback=save_callback, - ) - - # Storing and evaluating policy only useful if we generated trajectory data - if bool(trajectory_path is None): - results = dict(results) - results["rollout"] = train.eval_policy(agent, venv) - - if save_preferences: - main_trainer.dataset.save(log_dir / "preferences.pkl") - - # Save final artifacts. 
- if checkpoint_interval >= 0: - save_checkpoint( - trainer=main_trainer, - save_path=log_dir / "checkpoints" / "final", - allow_save_policy=bool(trajectory_path is None), - ) - - return results - - -def main_console(): - observer_path = ( - pathlib.Path.cwd() / "output" / "sacred" / "train_preference_comparisons_pebble" - ) - observer = FileStorageObserver(observer_path) - train_preference_comparisons_pebble_ex.observers.append(observer) - train_preference_comparisons_pebble_ex.run_commandline() - - -if __name__ == "__main__": # pragma: no cover - main_console() From 6f05b1d5d5f9b1c63ca0d2f996b759a72992fc00 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 23:04:26 +0100 Subject: [PATCH 37/55] #625 fix pre-commit errors --- .../algorithms/pebble/entropy_reward.py | 41 +++++++++++-------- .../algorithms/preference_comparisons.py | 26 ++++++++---- .../policies/replay_buffer_wrapper.py | 20 ++++----- src/imitation/rewards/reward_function.py | 7 +++- src/imitation/scripts/common/rl.py | 3 +- .../config/train_preference_comparisons.py | 2 + .../scripts/train_preference_comparisons.py | 12 +++--- .../algorithms/pebble/test_entropy_reward.py | 29 +++++++++---- 8 files changed, 88 insertions(+), 52 deletions(-) diff --git a/src/imitation/algorithms/pebble/entropy_reward.py b/src/imitation/algorithms/pebble/entropy_reward.py index e0d94c171..7570d369f 100644 --- a/src/imitation/algorithms/pebble/entropy_reward.py +++ b/src/imitation/algorithms/pebble/entropy_reward.py @@ -1,12 +1,14 @@ +"""Reward function for the PEBBLE training algorithm.""" + from enum import Enum, auto -from typing import Tuple +from typing import Dict, Optional, Tuple, Union import numpy as np import torch as th from imitation.policies.replay_buffer_wrapper import ( - ReplayBufferView, ReplayBufferRewardWrapper, + ReplayBufferView, ) from imitation.rewards.reward_function import ReplayBufferAwareRewardFn, RewardFn from imitation.util import util @@ -14,16 +16,16 @@ class PebbleRewardPhase(Enum): - """States representing different behaviors for PebbleStateEntropyReward""" + """States representing different behaviors for PebbleStateEntropyReward.""" UNSUPERVISED_EXPLORATION = auto() # Entropy based reward POLICY_AND_REWARD_LEARNING = auto() # Learned reward class PebbleStateEntropyReward(ReplayBufferAwareRewardFn): - """ - Reward function for implementation of the PEBBLE learning algorithm - (https://arxiv.org/pdf/2106.05091.pdf). + """Reward function for implementation of the PEBBLE learning algorithm. + + See https://arxiv.org/pdf/2106.05091.pdf . The rewards returned by this function go through the three phases: 1. Before enough samples are collected for entropy calculation, the @@ -38,33 +40,38 @@ class PebbleStateEntropyReward(ReplayBufferAwareRewardFn): supplied with set_replay_buffer() or on_replay_buffer_initialized(). To transition to the last phase, unsupervised_exploration_finish() needs to be called. - - Args: - learned_reward_fn: The learned reward function used after unsupervised - exploration is finished - nearest_neighbor_k: Parameter for entropy computation (see - compute_state_entropy()) """ - # TODO #625: parametrize nearest_neighbor_k def __init__( self, learned_reward_fn: RewardFn, nearest_neighbor_k: int = 5, ): + """Builds this class. 
+ + Args: + learned_reward_fn: The learned reward function used after unsupervised + exploration is finished + nearest_neighbor_k: Parameter for entropy computation (see + compute_state_entropy()) + """ self.learned_reward_fn = learned_reward_fn self.nearest_neighbor_k = nearest_neighbor_k self.entropy_stats = RunningNorm(1) self.state = PebbleRewardPhase.UNSUPERVISED_EXPLORATION # These two need to be set with set_replay_buffer(): - self.replay_buffer_view = None - self.obs_shape = None + self.replay_buffer_view: Optional[ReplayBufferView] = None + self.obs_shape: Union[Tuple[int, ...], Dict[str, Tuple[int, ...]], None] = None def on_replay_buffer_initialized(self, replay_buffer: ReplayBufferRewardWrapper): self.set_replay_buffer(replay_buffer.buffer_view, replay_buffer.obs_shape) - def set_replay_buffer(self, replay_buffer: ReplayBufferView, obs_shape: Tuple): + def set_replay_buffer( + self, + replay_buffer: ReplayBufferView, + obs_shape: Union[Tuple[int, ...], Dict[str, Tuple[int, ...]]], + ): self.replay_buffer_view = replay_buffer self.obs_shape = obs_shape @@ -87,7 +94,7 @@ def __call__( def _entropy_reward(self, state, action, next_state, done): if self.replay_buffer_view is None: raise ValueError( - "Replay buffer must be supplied before entropy reward can be used" + "Replay buffer must be supplied before entropy reward can be used", ) all_observations = self.replay_buffer_view.observations # ReplayBuffer sampling flattens the venv dimension, let's adapt to that diff --git a/src/imitation/algorithms/preference_comparisons.py b/src/imitation/algorithms/preference_comparisons.py index fe5dc472e..fade985b4 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -77,8 +77,7 @@ def sample(self, steps: int) -> Sequence[TrajectoryWithRew]: """ # noqa: DAR202 def unsupervised_pretrain(self, steps: int, **kwargs: Any) -> None: - """Pre-train an agent if the trajectory generator uses one that - needs pre-training. + """Pre-train an agent before collecting comparisons. By default, this method does nothing and doesn't need to be overridden in subclasses that don't require pre-training. @@ -331,8 +330,8 @@ def logger(self, value: imit_logger.HierarchicalLogger) -> None: class PebbleAgentTrainer(AgentTrainer): - """ - Specialization of AgentTrainer for PEBBLE training. + """Specialization of AgentTrainer for PEBBLE training. + Includes unsupervised pretraining with an entropy based reward function. """ @@ -344,9 +343,20 @@ def __init__( reward_fn: PebbleStateEntropyReward, **kwargs, ) -> None: + """Builds PebbleAgentTrainer. + + Args: + reward_fn: Pebble reward function + **kwargs: additional keyword arguments to pass on to + the parent class + + Raises: + ValueError: Unexpected type of reward_fn given. 
+ """ if not isinstance(reward_fn, PebbleStateEntropyReward): raise ValueError( - f"{self.__class__.__name__} expects {PebbleStateEntropyReward.__name__} reward function" + f"{self.__class__.__name__} expects " + f"{PebbleStateEntropyReward.__name__} reward function", ) super().__init__(reward_fn=reward_fn, **kwargs) @@ -1731,10 +1741,10 @@ def train( ################################################### with self.logger.accumulate_means("agent"): self.logger.log( - f"Pre-training agent for {unsupervised_pretrain_timesteps} timesteps" + f"Pre-training agent for {unsupervised_pretrain_timesteps} timesteps", ) self.trajectory_generator.unsupervised_pretrain( - unsupervised_pretrain_timesteps + unsupervised_pretrain_timesteps, ) for i, num_pairs in enumerate(preference_query_schedule): @@ -1813,7 +1823,7 @@ def _preference_gather_schedule(self, total_comparisons): def _compute_timesteps(self, total_timesteps: int) -> Tuple[int, int, int]: unsupervised_pretrain_timesteps = int( - total_timesteps * self.unsupervised_agent_pretrain_frac + total_timesteps * self.unsupervised_agent_pretrain_frac, ) timesteps_per_iteration, extra_timesteps = divmod( total_timesteps - unsupervised_pretrain_timesteps, diff --git a/src/imitation/policies/replay_buffer_wrapper.py b/src/imitation/policies/replay_buffer_wrapper.py index 297a6b008..414b421f5 100644 --- a/src/imitation/policies/replay_buffer_wrapper.py +++ b/src/imitation/policies/replay_buffer_wrapper.py @@ -7,7 +7,7 @@ from stable_baselines3.common.buffers import ReplayBuffer from stable_baselines3.common.type_aliases import ReplayBufferSamples -from imitation.rewards.reward_function import RewardFn, ReplayBufferAwareRewardFn +from imitation.rewards.reward_function import ReplayBufferAwareRewardFn, RewardFn from imitation.util import util @@ -24,19 +24,20 @@ def _samples_to_reward_fn_input( class ReplayBufferView: - """A read-only view over a valid records in a ReplayBuffer. - - Args: - observations_buffer: Array buffer holding observations - buffer_slice_provider: Function returning slice of buffer - with valid observations - """ + """A read-only view over a valid records in a ReplayBuffer.""" def __init__( self, observations_buffer: np.ndarray, buffer_slice_provider: Callable[[], slice], ): + """Builds ReplayBufferView. + + Args: + observations_buffer: Array buffer holding observations + buffer_slice_provider: Function returning slice of buffer + with valid observations + """ self._observations_buffer_view = observations_buffer.view() self._observations_buffer_view.flags.writeable = False self._buffer_slice_provider = buffer_slice_provider @@ -67,9 +68,6 @@ def __init__( action_space: Action space replay_buffer_class: Class of the replay buffer. reward_fn: Reward function for reward relabeling. - on_initialized_callback: Callback called with reference to this object after - this instance is fully initialized. This provides a hook to access the - buffer after it is created from inside a Stable Baselines algorithm. **kwargs: keyword arguments for ReplayBuffer. 
""" # Note(yawen-d): we directly inherit ReplayBuffer and leave out the case of diff --git a/src/imitation/rewards/reward_function.py b/src/imitation/rewards/reward_function.py index e9d7bed30..00b1da958 100644 --- a/src/imitation/rewards/reward_function.py +++ b/src/imitation/rewards/reward_function.py @@ -35,6 +35,11 @@ def __call__( class ReplayBufferAwareRewardFn(RewardFn, abc.ABC): + """Abstract class for a reward function that needs access to a replay buffer.""" + @abc.abstractmethod - def on_replay_buffer_initialized(self, replay_buffer: "ReplayBufferRewardWrapper"): + def on_replay_buffer_initialized( + self, + replay_buffer: "ReplayBufferRewardWrapper", # type: ignore[name-defined] + ): pass diff --git a/src/imitation/scripts/common/rl.py b/src/imitation/scripts/common/rl.py index e879bbaf8..d71e35211 100644 --- a/src/imitation/scripts/common/rl.py +++ b/src/imitation/scripts/common/rl.py @@ -89,7 +89,8 @@ def _maybe_add_relabel_buffer( _buffer_kwargs = dict( reward_fn=relabel_reward_fn, replay_buffer_class=rl_kwargs.get( - "replay_buffer_class", buffers.ReplayBuffer + "replay_buffer_class", + buffers.ReplayBuffer, ), ) rl_kwargs["replay_buffer_class"] = ReplayBufferRewardWrapper diff --git a/src/imitation/scripts/config/train_preference_comparisons.py b/src/imitation/scripts/config/train_preference_comparisons.py index ca0e786ff..9876ee952 100644 --- a/src/imitation/scripts/config/train_preference_comparisons.py +++ b/src/imitation/scripts/config/train_preference_comparisons.py @@ -60,8 +60,10 @@ def train_defaults(): checkpoint_interval = 0 # Num epochs between saving (<0 disables, =0 final only) query_schedule = "hyperbolic" + # Whether to use the PEBBLE algorithm (https://arxiv.org/pdf/2106.05091.pdf) pebble_enabled = False + unsupervised_agent_pretrain_frac = 0.0 @train_preference_comparisons_ex.named_config diff --git a/src/imitation/scripts/train_preference_comparisons.py b/src/imitation/scripts/train_preference_comparisons.py index c848a6d09..659b47a74 100644 --- a/src/imitation/scripts/train_preference_comparisons.py +++ b/src/imitation/scripts/train_preference_comparisons.py @@ -10,13 +10,13 @@ import numpy as np import torch as th from sacred.observers import FileStorageObserver -from stable_baselines3.common import type_aliases, base_class, vec_env +from stable_baselines3.common import base_class, type_aliases, vec_env from imitation.algorithms import preference_comparisons from imitation.algorithms.pebble.entropy_reward import PebbleStateEntropyReward from imitation.data import types from imitation.policies import serialize -from imitation.rewards import reward_nets, reward_function +from imitation.rewards import reward_function, reward_nets from imitation.scripts.common import common, reward from imitation.scripts.common import rl as rl_common from imitation.scripts.common import train @@ -65,7 +65,7 @@ def make_reward_function( reward_net: reward_nets.RewardNet, *, pebble_enabled: bool = False, - pebble_nearest_neighbor_k: Optional[int] = None, + pebble_nearest_neighbor_k: int = 5, ): relabel_reward_fn = functools.partial( reward_net.predict_processed, @@ -73,7 +73,8 @@ def make_reward_function( ) if pebble_enabled: relabel_reward_fn = PebbleStateEntropyReward( - relabel_reward_fn, pebble_nearest_neighbor_k + relabel_reward_fn, # type: ignore[assignment] + pebble_nearest_neighbor_k, ) return relabel_reward_fn @@ -92,6 +93,7 @@ def make_agent_trajectory_generator( trajectory_generator_kwargs: Mapping[str, Any], ) -> preference_comparisons.AgentTrainer: if 
pebble_enabled: + assert isinstance(relabel_reward_fn, PebbleStateEntropyReward) return preference_comparisons.PebbleAgentTrainer( algorithm=agent, reward_fn=relabel_reward_fn, @@ -138,7 +140,7 @@ def train_preference_comparisons( allow_variable_horizon: bool, checkpoint_interval: int, query_schedule: Union[str, type_aliases.Schedule], - unsupervised_agent_pretrain_frac: Optional[float], + unsupervised_agent_pretrain_frac: float, ) -> Mapping[str, Any]: """Train a reward model using preference comparisons. diff --git a/tests/algorithms/pebble/test_entropy_reward.py b/tests/algorithms/pebble/test_entropy_reward.py index 918222382..84b59107a 100644 --- a/tests/algorithms/pebble/test_entropy_reward.py +++ b/tests/algorithms/pebble/test_entropy_reward.py @@ -1,17 +1,18 @@ +"""Tests for `imitation.algorithms.entropy_reward`.""" + import pickle -from unittest.mock import patch, Mock +from unittest.mock import Mock, patch import numpy as np import torch as th from gym.spaces import Discrete -from stable_baselines3.common.preprocessing import get_obs_shape from imitation.algorithms.pebble.entropy_reward import PebbleStateEntropyReward from imitation.policies.replay_buffer_wrapper import ReplayBufferView from imitation.util import util SPACE = Discrete(4) -OBS_SHAPE = get_obs_shape(SPACE) +OBS_SHAPE = (1,) PLACEHOLDER = np.empty(OBS_SHAPE) BUFFER_SIZE = 20 @@ -25,7 +26,8 @@ def test_pebble_entropy_reward_returns_entropy_for_pretraining(rng): reward_fn = PebbleStateEntropyReward(Mock(), K) reward_fn.set_replay_buffer( - ReplayBufferView(all_observations, lambda: slice(None)), OBS_SHAPE + ReplayBufferView(all_observations, lambda: slice(None)), + OBS_SHAPE, ) # Act @@ -34,17 +36,20 @@ def test_pebble_entropy_reward_returns_entropy_for_pretraining(rng): # Assert expected = util.compute_state_entropy( - observations, all_observations.reshape(-1, *OBS_SHAPE), K + observations, + all_observations.reshape(-1, *OBS_SHAPE), + K, ) expected_normalized = reward_fn.entropy_stats.normalize( - th.as_tensor(expected) + th.as_tensor(expected), ).numpy() np.testing.assert_allclose(reward, expected_normalized) def test_pebble_entropy_reward_returns_normalized_values_for_pretraining(): with patch("imitation.util.util.compute_state_entropy") as m: - # mock entropy computation so that we can test only stats collection in this test + # mock entropy computation so that we can test + # only stats collection in this test m.side_effect = lambda obs, all_obs, k: obs reward_fn = PebbleStateEntropyReward(Mock(), K) @@ -64,7 +69,10 @@ def test_pebble_entropy_reward_returns_normalized_values_for_pretraining(): reward_fn(state, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) normalized_reward = reward_fn( - np.zeros(dim), PLACEHOLDER, PLACEHOLDER, PLACEHOLDER + np.zeros(dim), + PLACEHOLDER, + PLACEHOLDER, + PLACEHOLDER, ) # Assert @@ -91,7 +99,10 @@ def test_pebble_entropy_reward_function_returns_learned_reward_after_pre_trainin # Assert assert reward == expected_reward learned_reward_mock.assert_called_once_with( - observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER + observations, + PLACEHOLDER, + PLACEHOLDER, + PLACEHOLDER, ) From c787877389a87bf7b6c092963062a49fad400a8a Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 1 Dec 2022 23:59:56 +0100 Subject: [PATCH 38/55] #625 add test for pebble agent trainer --- .../algorithms/preference_comparisons.py | 5 ++-- src/imitation/rewards/reward_function.py | 2 +- .../algorithms/test_preference_comparisons.py | 24 ++++++++++++++++++- 3 files changed, 27 insertions(+), 4 deletions(-) 
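The new test in this commit parametrizes over fixture names and resolves them at run time with pytest's request.getfixturevalue, since fixtures cannot be handed to parametrize by value. A minimal, self-contained sketch of that pattern follows; the fixture names here are hypothetical stand-ins, not the ones used in this patch.

import pytest

@pytest.fixture
def plain_trainer():
    # Stand-in for a fixture such as a regular agent trainer.
    return {"kind": "plain"}

@pytest.fixture
def pebble_trainer():
    # Stand-in for a fixture such as a PEBBLE agent trainer.
    return {"kind": "pebble"}

@pytest.mark.parametrize("trainer_fixture", ["plain_trainer", "pebble_trainer"])
def test_trainer_kind_is_set(request, trainer_fixture):
    # Fixture names are passed as strings and looked up via the built-in
    # `request` fixture, which instantiates the named fixture on demand.
    trainer = request.getfixturevalue(trainer_fixture)
    assert trainer["kind"] in {"plain", "pebble"}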
diff --git a/src/imitation/algorithms/preference_comparisons.py b/src/imitation/algorithms/preference_comparisons.py index fade985b4..91c7e55f1 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -45,6 +45,7 @@ from imitation.policies import exploration_wrapper from imitation.regularization import regularizers from imitation.rewards import reward_function, reward_nets, reward_wrapper +from imitation.rewards.reward_function import RewardFn from imitation.util import logger as imit_logger from imitation.util import networks, util @@ -178,7 +179,7 @@ def __init__( reward_fn.action_space, ) reward_fn = reward_fn.predict_processed - self.reward_fn = reward_fn + self.reward_fn: RewardFn = reward_fn self.exploration_frac = exploration_frac self.rng = rng @@ -362,7 +363,7 @@ def __init__( def unsupervised_pretrain(self, steps: int, **kwargs: Any) -> None: self.train(steps, **kwargs) - self.reward_fn.unsupervised_exploration_finish() + self.reward_fn.unsupervised_exploration_finish() # type: ignore[attribute-error] def _get_trajectories( diff --git a/src/imitation/rewards/reward_function.py b/src/imitation/rewards/reward_function.py index 00b1da958..3e85a4fa5 100644 --- a/src/imitation/rewards/reward_function.py +++ b/src/imitation/rewards/reward_function.py @@ -40,6 +40,6 @@ class ReplayBufferAwareRewardFn(RewardFn, abc.ABC): @abc.abstractmethod def on_replay_buffer_initialized( self, - replay_buffer: "ReplayBufferRewardWrapper", # type: ignore[name-defined] + replay_buffer: "ReplayBufferRewardWrapper", # type: ignore[name-defined] # noqa ): pass diff --git a/tests/algorithms/test_preference_comparisons.py b/tests/algorithms/test_preference_comparisons.py index 12727c1c9..3dedc4482 100644 --- a/tests/algorithms/test_preference_comparisons.py +++ b/tests/algorithms/test_preference_comparisons.py @@ -17,8 +17,10 @@ import imitation.testing.reward_nets as testing_reward_nets from imitation.algorithms import preference_comparisons +from imitation.algorithms.pebble.entropy_reward import PebbleStateEntropyReward from imitation.data import types from imitation.data.types import TrajectoryWithRew +from imitation.policies.replay_buffer_wrapper import ReplayBufferView from imitation.regularization import regularizers, updaters from imitation.rewards import reward_nets from imitation.util import networks, util @@ -72,6 +74,23 @@ def agent_trainer(agent, reward_net, venv, rng): return preference_comparisons.AgentTrainer(agent, reward_net, venv, rng) +@pytest.fixture +def replay_buffer(rng): + return ReplayBufferView(rng.random((10, 8, 4)), lambda: slice(None)) + + +@pytest.fixture +def pebble_agent_trainer(agent, reward_net, venv, rng, replay_buffer): + reward_fn = PebbleStateEntropyReward(reward_net.predict_processed) + reward_fn.set_replay_buffer(replay_buffer, (4,)) + return preference_comparisons.PebbleAgentTrainer( + algorithm=agent, + reward_fn=reward_fn, + venv=venv, + rng=rng, + ) + + def assert_info_arrs_equal(arr1, arr2): # pragma: no cover def check_possibly_nested_dicts_equal(dict1, dict2): for key, val1 in dict1.items(): @@ -293,14 +312,17 @@ def build_preference_comparsions(gatherer, reward_trainer, fragmenter, rng): "schedule", ["constant", "hyperbolic", "inverse_quadratic", lambda t: 1 / (1 + t**3)], ) +@pytest.mark.parametrize("agent_fixture", ["agent_trainer", "pebble_agent_trainer"]) def test_trainer_no_crash( - agent_trainer, + request, + agent_fixture, reward_net, random_fragmenter, custom_logger, schedule, rng, ): + 
agent_trainer = request.getfixturevalue(agent_fixture) main_trainer = preference_comparisons.PreferenceComparisons( agent_trainer, reward_net, From b9c5614e3c61ca61b1a9a6882b70ce298eb7fb52 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Fri, 2 Dec 2022 00:15:56 +0100 Subject: [PATCH 39/55] #625 fix more pre-commit errors --- src/imitation/algorithms/pebble/__init__.py | 1 + src/imitation/algorithms/pebble/entropy_reward.py | 2 +- src/imitation/algorithms/preference_comparisons.py | 3 ++- src/imitation/policies/base.py | 2 +- src/imitation/rewards/reward_function.py | 6 +++++- .../scripts/config/train_preference_comparisons.py | 2 +- 6 files changed, 11 insertions(+), 5 deletions(-) create mode 100644 src/imitation/algorithms/pebble/__init__.py diff --git a/src/imitation/algorithms/pebble/__init__.py b/src/imitation/algorithms/pebble/__init__.py new file mode 100644 index 000000000..dca061476 --- /dev/null +++ b/src/imitation/algorithms/pebble/__init__.py @@ -0,0 +1 @@ +"""PEBBLE specific algorithms.""" diff --git a/src/imitation/algorithms/pebble/entropy_reward.py b/src/imitation/algorithms/pebble/entropy_reward.py index 7570d369f..08cf800c8 100644 --- a/src/imitation/algorithms/pebble/entropy_reward.py +++ b/src/imitation/algorithms/pebble/entropy_reward.py @@ -25,7 +25,7 @@ class PebbleRewardPhase(Enum): class PebbleStateEntropyReward(ReplayBufferAwareRewardFn): """Reward function for implementation of the PEBBLE learning algorithm. - See https://arxiv.org/pdf/2106.05091.pdf . + See https://arxiv.org/abs/2106.05091 . The rewards returned by this function go through the three phases: 1. Before enough samples are collected for entropy calculation, the diff --git a/src/imitation/algorithms/preference_comparisons.py b/src/imitation/algorithms/preference_comparisons.py index 91c7e55f1..3374ff136 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -363,7 +363,8 @@ def __init__( def unsupervised_pretrain(self, steps: int, **kwargs: Any) -> None: self.train(steps, **kwargs) - self.reward_fn.unsupervised_exploration_finish() # type: ignore[attribute-error] + fn = self.reward_fn + fn.unsupervised_exploration_finish() # type: ignore[attribute-error] def _get_trajectories( diff --git a/src/imitation/policies/base.py b/src/imitation/policies/base.py index 3101cf2c7..9d455ff15 100644 --- a/src/imitation/policies/base.py +++ b/src/imitation/policies/base.py @@ -76,7 +76,7 @@ class SAC1024Policy(sac_policies.SACPolicy): """Actor and value networks with two hidden layers of 1024 units respectively. This matches the implementation of SAC policies in the PEBBLE paper. See: - https://arxiv.org/pdf/2106.05091.pdf + https://arxiv.org/abs/2106.05091 https://github.com/denisyarats/pytorch_sac/blob/master/config/agent/sac.yaml Note: This differs from stable_baselines3 SACPolicy by having 1024 hidden units diff --git a/src/imitation/rewards/reward_function.py b/src/imitation/rewards/reward_function.py index 3e85a4fa5..69f2f5932 100644 --- a/src/imitation/rewards/reward_function.py +++ b/src/imitation/rewards/reward_function.py @@ -5,6 +5,8 @@ import numpy as np +import imitation.policies.replay_buffer_wrapper + class RewardFn(Protocol): """Abstract class for reward function. 
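RewardFn above is declared as a typing.Protocol, so reward functions satisfy it structurally: any callable with the matching signature type-checks against it without inheriting from it. A small sketch of that idea, using hypothetical names rather than the library's own:

from typing import Protocol

import numpy as np

class ArrayRewardFn(Protocol):
    def __call__(
        self,
        state: np.ndarray,
        action: np.ndarray,
        next_state: np.ndarray,
        done: np.ndarray,
    ) -> np.ndarray:
        ...

def constant_reward(state, action, next_state, done):
    # Matches the protocol purely by signature; no inheritance needed.
    return np.ones(len(state))

fn: ArrayRewardFn = constant_reward
print(fn(np.zeros((2, 3)), np.zeros(2), np.zeros((2, 3)), np.zeros(2)))  # [1. 1.]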
@@ -40,6 +42,8 @@ class ReplayBufferAwareRewardFn(RewardFn, abc.ABC): @abc.abstractmethod def on_replay_buffer_initialized( self, - replay_buffer: "ReplayBufferRewardWrapper", # type: ignore[name-defined] # noqa + replay_buffer: ( + "imitation.policies.replay_buffer_wrapper.ReplayBufferRewardWrapper" + ), ): pass diff --git a/src/imitation/scripts/config/train_preference_comparisons.py b/src/imitation/scripts/config/train_preference_comparisons.py index 9876ee952..0c4ed6411 100644 --- a/src/imitation/scripts/config/train_preference_comparisons.py +++ b/src/imitation/scripts/config/train_preference_comparisons.py @@ -61,7 +61,7 @@ def train_defaults(): checkpoint_interval = 0 # Num epochs between saving (<0 disables, =0 final only) query_schedule = "hyperbolic" - # Whether to use the PEBBLE algorithm (https://arxiv.org/pdf/2106.05091.pdf) + # Whether to use the PEBBLE algorithm (https://arxiv.org/abs/2106.05091) pebble_enabled = False unsupervised_agent_pretrain_frac = 0.0 From 40e73873f193f593f9d360c627217f331ef9859b Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Fri, 2 Dec 2022 01:09:54 +0100 Subject: [PATCH 40/55] #625 fix even more pre-commit errors --- src/imitation/algorithms/pebble/entropy_reward.py | 11 ++++++----- .../algorithms/preference_comparisons.py | 3 +-- src/imitation/policies/replay_buffer_wrapper.py | 15 +++++++++++++-- src/imitation/rewards/reward_function.py | 15 --------------- tests/policies/test_replay_buffer_wrapper.py | 6 ++++-- 5 files changed, 24 insertions(+), 26 deletions(-) diff --git a/src/imitation/algorithms/pebble/entropy_reward.py b/src/imitation/algorithms/pebble/entropy_reward.py index 08cf800c8..8cce6b084 100644 --- a/src/imitation/algorithms/pebble/entropy_reward.py +++ b/src/imitation/algorithms/pebble/entropy_reward.py @@ -7,10 +7,11 @@ import torch as th from imitation.policies.replay_buffer_wrapper import ( + ReplayBufferAwareRewardFn, ReplayBufferRewardWrapper, ReplayBufferView, ) -from imitation.rewards.reward_function import ReplayBufferAwareRewardFn, RewardFn +from imitation.rewards.reward_function import RewardFn from imitation.util import util from imitation.util.networks import RunningNorm @@ -29,12 +30,12 @@ class PebbleStateEntropyReward(ReplayBufferAwareRewardFn): The rewards returned by this function go through the three phases: 1. Before enough samples are collected for entropy calculation, the - underlying function is returned. This shouldn't matter because - OffPolicyAlgorithms have an initialization period for `learning_starts` - timesteps. + underlying function is returned. This shouldn't matter because + OffPolicyAlgorithms have an initialization period for `learning_starts` + timesteps. 2. During the unsupervised exploration phase, entropy based reward is returned 3. After unsupervised exploration phase is finished, the underlying learned - reward is returned. + reward is returned. The second phase requires that a buffer with observations to compare against is supplied with set_replay_buffer() or on_replay_buffer_initialized(). 
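The docstring above describes a reward that switches behaviour between an unsupervised-exploration phase and a later reward-learning phase. Stripped of the entropy computation and buffer handling, the control flow reduces to the sketch below; class and function names are hypothetical and only illustrate the phase-switching pattern.

import enum

import numpy as np

class Phase(enum.Enum):
    EXPLORATION = enum.auto()
    REWARD_LEARNING = enum.auto()

class PhaseSwitchingReward:
    def __init__(self, exploration_fn, learned_fn):
        self.exploration_fn = exploration_fn
        self.learned_fn = learned_fn
        self.phase = Phase.EXPLORATION

    def finish_exploration(self):
        # Called once when unsupervised exploration ends.
        self.phase = Phase.REWARD_LEARNING

    def __call__(self, state, action, next_state, done):
        fn = (
            self.exploration_fn
            if self.phase is Phase.EXPLORATION
            else self.learned_fn
        )
        return fn(state, action, next_state, done)

reward = PhaseSwitchingReward(
    exploration_fn=lambda s, a, ns, d: np.ones(len(s)),  # exploration bonus stand-in
    learned_fn=lambda s, a, ns, d: np.zeros(len(s)),     # learned reward stand-in
)
obs = np.zeros((4, 3))
assert reward(obs, None, None, None).sum() == 4.0
reward.finish_exploration()
assert reward(obs, None, None, None).sum() == 0.0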
diff --git a/src/imitation/algorithms/preference_comparisons.py b/src/imitation/algorithms/preference_comparisons.py index 3374ff136..77d68eff0 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -348,8 +348,7 @@ def __init__( Args: reward_fn: Pebble reward function - **kwargs: additional keyword arguments to pass on to - the parent class + **kwargs: additional keyword arguments to pass on to the parent class Raises: ValueError: Unexpected type of reward_fn given. diff --git a/src/imitation/policies/replay_buffer_wrapper.py b/src/imitation/policies/replay_buffer_wrapper.py index 414b421f5..b7a67a1c1 100644 --- a/src/imitation/policies/replay_buffer_wrapper.py +++ b/src/imitation/policies/replay_buffer_wrapper.py @@ -1,5 +1,5 @@ """Wrapper for reward labeling for transitions sampled from a replay buffer.""" - +import abc from typing import Callable, Mapping, Type import numpy as np @@ -7,7 +7,7 @@ from stable_baselines3.common.buffers import ReplayBuffer from stable_baselines3.common.type_aliases import ReplayBufferSamples -from imitation.rewards.reward_function import ReplayBufferAwareRewardFn, RewardFn +from imitation.rewards.reward_function import RewardFn from imitation.util import util @@ -134,3 +134,14 @@ def _get_samples(self): "_get_samples() is intentionally not implemented." "This method should not be called.", ) + + +class ReplayBufferAwareRewardFn(RewardFn, abc.ABC): + """Abstract class for a reward function that needs access to a replay buffer.""" + + @abc.abstractmethod + def on_replay_buffer_initialized( + self, + replay_buffer: ReplayBufferRewardWrapper, + ): + pass diff --git a/src/imitation/rewards/reward_function.py b/src/imitation/rewards/reward_function.py index 69f2f5932..93761752d 100644 --- a/src/imitation/rewards/reward_function.py +++ b/src/imitation/rewards/reward_function.py @@ -5,8 +5,6 @@ import numpy as np -import imitation.policies.replay_buffer_wrapper - class RewardFn(Protocol): """Abstract class for reward function. @@ -34,16 +32,3 @@ def __call__( Returns: Computed rewards of shape `(batch_size,`). 
""" # noqa: DAR202 - - -class ReplayBufferAwareRewardFn(RewardFn, abc.ABC): - """Abstract class for a reward function that needs access to a replay buffer.""" - - @abc.abstractmethod - def on_replay_buffer_initialized( - self, - replay_buffer: ( - "imitation.policies.replay_buffer_wrapper.ReplayBufferRewardWrapper" - ), - ): - pass diff --git a/tests/policies/test_replay_buffer_wrapper.py b/tests/policies/test_replay_buffer_wrapper.py index 02bb72ce2..7c26dd2d4 100644 --- a/tests/policies/test_replay_buffer_wrapper.py +++ b/tests/policies/test_replay_buffer_wrapper.py @@ -16,8 +16,10 @@ from stable_baselines3.common.preprocessing import get_action_dim, get_obs_shape from stable_baselines3.common.save_util import load_from_pkl -from imitation.policies.replay_buffer_wrapper import ReplayBufferRewardWrapper -from imitation.rewards.reward_function import ReplayBufferAwareRewardFn +from imitation.policies.replay_buffer_wrapper import ( + ReplayBufferAwareRewardFn, + ReplayBufferRewardWrapper, +) from imitation.util import util From aad2e7cb324164af15ddcff79af7c90e8075a6b2 Mon Sep 17 00:00:00 2001 From: Mifeet Date: Fri, 2 Dec 2022 12:39:29 +0100 Subject: [PATCH 41/55] code review - Update src/imitation/policies/replay_buffer_wrapper.py Co-authored-by: Adam Gleave --- src/imitation/policies/replay_buffer_wrapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/imitation/policies/replay_buffer_wrapper.py b/src/imitation/policies/replay_buffer_wrapper.py index b7a67a1c1..255e01f3b 100644 --- a/src/imitation/policies/replay_buffer_wrapper.py +++ b/src/imitation/policies/replay_buffer_wrapper.py @@ -24,7 +24,7 @@ def _samples_to_reward_fn_input( class ReplayBufferView: - """A read-only view over a valid records in a ReplayBuffer.""" + """A read-only view over valid records in a ReplayBuffer.""" def __init__( self, From e0aea610862c43bdcfd39cdad5d0cf93c4bc6172 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Fri, 2 Dec 2022 23:04:56 +0100 Subject: [PATCH 42/55] #625 code review --- src/imitation/algorithms/pebble/entropy_reward.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/imitation/algorithms/pebble/entropy_reward.py b/src/imitation/algorithms/pebble/entropy_reward.py index 8cce6b084..ba844c682 100644 --- a/src/imitation/algorithms/pebble/entropy_reward.py +++ b/src/imitation/algorithms/pebble/entropy_reward.py @@ -1,6 +1,6 @@ """Reward function for the PEBBLE training algorithm.""" -from enum import Enum, auto +import enum from typing import Dict, Optional, Tuple, Union import numpy as np @@ -16,11 +16,11 @@ from imitation.util.networks import RunningNorm -class PebbleRewardPhase(Enum): +class PebbleRewardPhase(enum.Enum): """States representing different behaviors for PebbleStateEntropyReward.""" - UNSUPERVISED_EXPLORATION = auto() # Entropy based reward - POLICY_AND_REWARD_LEARNING = auto() # Learned reward + UNSUPERVISED_EXPLORATION = enum.auto() # Entropy based reward + POLICY_AND_REWARD_LEARNING = enum.auto() # Learned reward class PebbleStateEntropyReward(ReplayBufferAwareRewardFn): From f0a3359f15cadac9bb89c950641e46e5daca9df7 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Fri, 2 Dec 2022 23:56:10 +0100 Subject: [PATCH 43/55] #625 code review: do not allocate timesteps for pretraining if there is no pretraining --- .../algorithms/preference_comparisons.py | 32 ++++++++++++++++--- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/src/imitation/algorithms/preference_comparisons.py 
b/src/imitation/algorithms/preference_comparisons.py index 77d68eff0..ba44e338c 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -77,17 +77,36 @@ def sample(self, steps: int) -> Sequence[TrajectoryWithRew]: be the environment rewards, not ones from a reward model). """ # noqa: DAR202 + @property + def has_pretraining(self) -> bool: + """Indicates whether this generator has a pre-training phase. + + The value can be used, e.g., when allocating time-steps for pre-training. + + By default, True is returned if the unsupervised_pretrain() method is not + overriden, bud subclasses may choose to override this behavior. + """ + orig_impl = TrajectoryGenerator.unsupervised_pretrain + return type(self).unsupervised_pretrain != orig_impl + def unsupervised_pretrain(self, steps: int, **kwargs: Any) -> None: """Pre-train an agent before collecting comparisons. - By default, this method does nothing and doesn't need - to be overridden in subclasses that don't require pre-training. + By default, this method asserts that pre-training has zero steps allocated. + Override this behavior in subclasses that implement pre-training. Args: steps: number of environment steps to train for. **kwargs: additional keyword arguments to pass on to the training procedure. """ + if steps > 0: + self._logger.warn( + f"{steps} timesteps allocated for unsupervised pre-training:" + " Trajectory generators without pre-training implementation should" + " not consume any timesteps (otherwise the total number of" + " timesteps executed may be misleading)" + ) def train(self, steps: int, **kwargs: Any) -> None: """Train an agent if the trajectory generator uses one. @@ -1823,9 +1842,12 @@ def _preference_gather_schedule(self, total_comparisons): return schedule def _compute_timesteps(self, total_timesteps: int) -> Tuple[int, int, int]: - unsupervised_pretrain_timesteps = int( - total_timesteps * self.unsupervised_agent_pretrain_frac, - ) + if self.trajectory_generator.has_pretraining: + unsupervised_pretrain_timesteps = int( + total_timesteps * self.unsupervised_agent_pretrain_frac, + ) + else: + unsupervised_pretrain_timesteps = 0 timesteps_per_iteration, extra_timesteps = divmod( total_timesteps - unsupervised_pretrain_timesteps, self.num_iterations, From 8cb244979e3ffd251a81790f2dc425eac5ffb565 Mon Sep 17 00:00:00 2001 From: Mifeet Date: Sat, 3 Dec 2022 00:01:47 +0100 Subject: [PATCH 44/55] Update src/imitation/algorithms/preference_comparisons.py Co-authored-by: Adam Gleave --- src/imitation/algorithms/preference_comparisons.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/imitation/algorithms/preference_comparisons.py b/src/imitation/algorithms/preference_comparisons.py index ba44e338c..03f1bc25c 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -1746,7 +1746,7 @@ def train( """ # Compute the number of comparisons to request at each iteration in advance. 
preference_query_schedule = self._preference_gather_schedule(total_comparisons) - print(f"Query schedule: {preference_query_schedule}") + self.logger.log(f"Query schedule: {preference_query_schedule}") ( unsupervised_pretrain_timesteps, From 378baa86eb54c4cded4cbd99e60c1dd928efa2dd Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Fri, 2 Dec 2022 23:59:38 +0100 Subject: [PATCH 45/55] #625 code review: remove ignore --- src/imitation/algorithms/preference_comparisons.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/imitation/algorithms/preference_comparisons.py b/src/imitation/algorithms/preference_comparisons.py index 03f1bc25c..cc3164182 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -382,7 +382,7 @@ def __init__( def unsupervised_pretrain(self, steps: int, **kwargs: Any) -> None: self.train(steps, **kwargs) fn = self.reward_fn - fn.unsupervised_exploration_finish() # type: ignore[attribute-error] + fn.unsupervised_exploration_finish() def _get_trajectories( From d7ad4145f48c7995e24f8226e22da9260bba8744 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Sat, 3 Dec 2022 00:03:20 +0100 Subject: [PATCH 46/55] #625 code review - skip pretrainining if zero timesteps --- .../algorithms/preference_comparisons.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/imitation/algorithms/preference_comparisons.py b/src/imitation/algorithms/preference_comparisons.py index cc3164182..c3a77e579 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -1749,7 +1749,7 @@ def train( self.logger.log(f"Query schedule: {preference_query_schedule}") ( - unsupervised_pretrain_timesteps, + unsup_pretrain_timesteps, timesteps_per_iteration, extra_timesteps, ) = self._compute_timesteps(total_timesteps) @@ -1759,13 +1759,14 @@ def train( ################################################### # Pre-training agent before gathering preferences # ################################################### - with self.logger.accumulate_means("agent"): - self.logger.log( - f"Pre-training agent for {unsupervised_pretrain_timesteps} timesteps", - ) - self.trajectory_generator.unsupervised_pretrain( - unsupervised_pretrain_timesteps, - ) + if unsup_pretrain_timesteps: + with self.logger.accumulate_means("agent"): + self.logger.log( + f"Pre-training agent for {unsup_pretrain_timesteps} timesteps", + ) + self.trajectory_generator.unsupervised_pretrain( + unsup_pretrain_timesteps, + ) for i, num_pairs in enumerate(preference_query_schedule): ########################## From 412550de84f1b75c154ea1573aa0acaa7d7a5748 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Sat, 3 Dec 2022 00:23:29 +0100 Subject: [PATCH 47/55] #625 code review: separate pebble and environment configuration --- .../scripts/config/train_preference_comparisons.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/imitation/scripts/config/train_preference_comparisons.py b/src/imitation/scripts/config/train_preference_comparisons.py index 0c4ed6411..f01a7d6c0 100644 --- a/src/imitation/scripts/config/train_preference_comparisons.py +++ b/src/imitation/scripts/config/train_preference_comparisons.py @@ -81,8 +81,6 @@ def pebble(): train = { "policy_cls": base.SAC1024Policy, # noqa: F841 } - common = {"env_name": "MountainCarContinuous-v0"} - allow_variable_horizon = True locals() # quieten flake8 @@ -141,6 +139,13 @@ def 
seals_mountain_car(): common = dict(env_name="seals/MountainCar-v0") +@train_preference_comparisons_ex.named_config +def mountain_car_continuous(): + common = {"env_name": "MountainCarContinuous-v0"} + allow_variable_horizon = True + locals() # quieten flake8 + + @train_preference_comparisons_ex.named_config def fast(): # Minimize the amount of computation. Useful for test cases. From 7c3470e584c5bf02903620f5c119fca0bd5afa8c Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Sat, 3 Dec 2022 00:25:29 +0100 Subject: [PATCH 48/55] #625 fix even even more pre-commit errors --- src/imitation/algorithms/preference_comparisons.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/imitation/algorithms/preference_comparisons.py b/src/imitation/algorithms/preference_comparisons.py index c3a77e579..ec1816143 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -85,6 +85,9 @@ def has_pretraining(self) -> bool: By default, True is returned if the unsupervised_pretrain() method is not overriden, bud subclasses may choose to override this behavior. + + Returns: + True if this generator has a pre-training phase, False otherwise """ orig_impl = TrajectoryGenerator.unsupervised_pretrain return type(self).unsupervised_pretrain != orig_impl @@ -105,7 +108,7 @@ def unsupervised_pretrain(self, steps: int, **kwargs: Any) -> None: f"{steps} timesteps allocated for unsupervised pre-training:" " Trajectory generators without pre-training implementation should" " not consume any timesteps (otherwise the total number of" - " timesteps executed may be misleading)" + " timesteps executed may be misleading)", ) def train(self, steps: int, **kwargs: Any) -> None: From 73b1e36ff968885d59ecbb804521ecfe90ad5fc1 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Sat, 3 Dec 2022 00:34:07 +0100 Subject: [PATCH 49/55] #625 fix even even more pre-commit errors --- .../algorithms/preference_comparisons.py | 3 ++- src/imitation/util/util.py | 23 +++++++++++-------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/src/imitation/algorithms/preference_comparisons.py b/src/imitation/algorithms/preference_comparisons.py index ec1816143..411dc8c65 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -84,7 +84,7 @@ def has_pretraining(self) -> bool: The value can be used, e.g., when allocating time-steps for pre-training. By default, True is returned if the unsupervised_pretrain() method is not - overriden, bud subclasses may choose to override this behavior. + overridden, bud subclasses may choose to override this behavior. Returns: True if this generator has a pre-training phase, False otherwise @@ -385,6 +385,7 @@ def __init__( def unsupervised_pretrain(self, steps: int, **kwargs: Any) -> None: self.train(steps, **kwargs) fn = self.reward_fn + assert isinstance(fn, PebbleStateEntropyReward) fn.unsupervised_exploration_finish() diff --git a/src/imitation/util/util.py b/src/imitation/util/util.py index 9bf1c1a40..cef2e6f38 100644 --- a/src/imitation/util/util.py +++ b/src/imitation/util/util.py @@ -377,16 +377,19 @@ def compute_state_entropy( A tensor containing the state entropy for `obs`. 
""" assert obs.shape[1:] == all_obs.shape[1:] + batch_size = 500 with th.no_grad(): non_batch_dimensions = tuple(range(2, len(obs.shape) + 1)) - distances_tensor = th.linalg.vector_norm( - obs[:, None] - all_obs[None, :], - dim=non_batch_dimensions, - ord=2, - ) - - # Note that we take the k+1'th value because the closest neighbor to - # a point is itself, which we want to skip. - assert distances_tensor.shape[-1] > k - knn_dists = th.kthvalue(distances_tensor, k=k + 1, dim=1).values + dists: List[th.Tensor] = [] + for idx in range(len(all_obs) // batch_size + 1): + start = idx * batch_size + end = (idx + 1) * batch_size + distances_tensor = th.linalg.vector_norm( + obs[:, None] - all_obs[None, start:end], + dim=non_batch_dimensions, + ord=2, + ) + dists.append(distances_tensor) + all_dists = th.cat(dists, dim=1) + knn_dists = th.kthvalue(all_dists, k=k + 1, dim=1).values return knn_dists From 6daa4732c63081a5f90ca91606bb16bff5e9c87e Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 8 Dec 2022 00:24:19 +0100 Subject: [PATCH 50/55] #641 code review: remove set_replay_buffer --- .../algorithms/pebble/entropy_reward.py | 13 ++++----- .../algorithms/pebble/test_entropy_reward.py | 29 ++++++++++++++----- .../algorithms/test_preference_comparisons.py | 6 +++- 3 files changed, 31 insertions(+), 17 deletions(-) diff --git a/src/imitation/algorithms/pebble/entropy_reward.py b/src/imitation/algorithms/pebble/entropy_reward.py index ba844c682..9e7958fa6 100644 --- a/src/imitation/algorithms/pebble/entropy_reward.py +++ b/src/imitation/algorithms/pebble/entropy_reward.py @@ -58,6 +58,7 @@ def __init__( """ self.learned_reward_fn = learned_reward_fn self.nearest_neighbor_k = nearest_neighbor_k + self.entropy_stats = RunningNorm(1) self.state = PebbleRewardPhase.UNSUPERVISED_EXPLORATION @@ -66,15 +67,9 @@ def __init__( self.obs_shape: Union[Tuple[int, ...], Dict[str, Tuple[int, ...]], None] = None def on_replay_buffer_initialized(self, replay_buffer: ReplayBufferRewardWrapper): - self.set_replay_buffer(replay_buffer.buffer_view, replay_buffer.obs_shape) + self.replay_buffer_view = replay_buffer.buffer_view + self.obs_shape = replay_buffer.obs_shape - def set_replay_buffer( - self, - replay_buffer: ReplayBufferView, - obs_shape: Union[Tuple[int, ...], Dict[str, Tuple[int, ...]]], - ): - self.replay_buffer_view = replay_buffer - self.obs_shape = obs_shape def unsupervised_exploration_finish(self): assert self.state == PebbleRewardPhase.UNSUPERVISED_EXPLORATION @@ -112,7 +107,9 @@ def _entropy_reward(self, state, action, next_state, done): th.tensor(all_observations), self.nearest_neighbor_k, ) + normalized_entropies = self.entropy_stats.forward(entropies) + return normalized_entropies.numpy() def __getstate__(self): diff --git a/tests/algorithms/pebble/test_entropy_reward.py b/tests/algorithms/pebble/test_entropy_reward.py index 84b59107a..bc020e86c 100644 --- a/tests/algorithms/pebble/test_entropy_reward.py +++ b/tests/algorithms/pebble/test_entropy_reward.py @@ -25,9 +25,11 @@ def test_pebble_entropy_reward_returns_entropy_for_pretraining(rng): all_observations = rng.random((BUFFER_SIZE, VENVS, *OBS_SHAPE)) reward_fn = PebbleStateEntropyReward(Mock(), K) - reward_fn.set_replay_buffer( - ReplayBufferView(all_observations, lambda: slice(None)), - OBS_SHAPE, + reward_fn.on_replay_buffer_initialized( + replay_buffer_mock( + ReplayBufferView(all_observations, lambda: slice(None)), + OBS_SHAPE, + ) ) # Act @@ -54,9 +56,11 @@ def test_pebble_entropy_reward_returns_normalized_values_for_pretraining(): 
reward_fn = PebbleStateEntropyReward(Mock(), K) all_observations = np.empty((BUFFER_SIZE, VENVS, *OBS_SHAPE)) - reward_fn.set_replay_buffer( - ReplayBufferView(all_observations, lambda: slice(None)), - OBS_SHAPE, + reward_fn.on_replay_buffer_initialized( + replay_buffer_mock( + ReplayBufferView(all_observations, lambda: slice(None)), + OBS_SHAPE, + ) ) dim = 8 @@ -112,13 +116,15 @@ def test_pebble_entropy_reward_can_pickle(): obs1 = np.random.rand(VENVS, *OBS_SHAPE) reward_fn = PebbleStateEntropyReward(reward_fn_stub, K) - reward_fn.set_replay_buffer(replay_buffer, OBS_SHAPE) + reward_fn.on_replay_buffer_initialized(replay_buffer_mock(replay_buffer, OBS_SHAPE)) reward_fn(obs1, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) # Act pickled = pickle.dumps(reward_fn) reward_fn_deserialized = pickle.loads(pickled) - reward_fn_deserialized.set_replay_buffer(replay_buffer, OBS_SHAPE) + reward_fn_deserialized.on_replay_buffer_initialized( + replay_buffer_mock(replay_buffer, OBS_SHAPE) + ) # Assert obs2 = np.random.rand(VENVS, *OBS_SHAPE) @@ -129,3 +135,10 @@ def test_pebble_entropy_reward_can_pickle(): def reward_fn_stub(state, action, next_state, done): return state + + +def replay_buffer_mock(buffer_view: ReplayBufferView, obs_shape: tuple) -> Mock: + replay_buffer_mock = Mock() + replay_buffer_mock.buffer_view = buffer_view + replay_buffer_mock.obs_shape = obs_shape + return replay_buffer_mock diff --git a/tests/algorithms/test_preference_comparisons.py b/tests/algorithms/test_preference_comparisons.py index 3dedc4482..fb63e71d0 100644 --- a/tests/algorithms/test_preference_comparisons.py +++ b/tests/algorithms/test_preference_comparisons.py @@ -3,6 +3,7 @@ import math import re from typing import Any, Sequence +from unittest.mock import Mock import gym import numpy as np @@ -81,8 +82,11 @@ def replay_buffer(rng): @pytest.fixture def pebble_agent_trainer(agent, reward_net, venv, rng, replay_buffer): + replay_buffer_mock = Mock() + replay_buffer_mock.buffer_view = replay_buffer + replay_buffer_mock.obs_shape = (4,) reward_fn = PebbleStateEntropyReward(reward_net.predict_processed) - reward_fn.set_replay_buffer(replay_buffer, (4,)) + reward_fn.on_replay_buffer_initialized(replay_buffer_mock) return preference_comparisons.PebbleAgentTrainer( algorithm=agent, reward_fn=reward_fn, From c80fb80109647e193d7afde2c0ef3bdcabf9dcfc Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Thu, 8 Dec 2022 00:28:17 +0100 Subject: [PATCH 51/55] #641 code review: fix comment --- src/imitation/algorithms/preference_comparisons.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/imitation/algorithms/preference_comparisons.py b/src/imitation/algorithms/preference_comparisons.py index 411dc8c65..72f5da5cf 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -95,8 +95,9 @@ def has_pretraining(self) -> bool: def unsupervised_pretrain(self, steps: int, **kwargs: Any) -> None: """Pre-train an agent before collecting comparisons. - By default, this method asserts that pre-training has zero steps allocated. Override this behavior in subclasses that implement pre-training. + If not overriden, this method raises ValueError when non-zero steps are + allocated for pre-training. Args: steps: number of environment steps to train for. @@ -104,7 +105,7 @@ def unsupervised_pretrain(self, steps: int, **kwargs: Any) -> None: the training procedure. 
""" if steps > 0: - self._logger.warn( + raise ValueError( f"{steps} timesteps allocated for unsupervised pre-training:" " Trajectory generators without pre-training implementation should" " not consume any timesteps (otherwise the total number of" From 50577b046752193db92182b6ed40f517e0d53eb3 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Sat, 10 Dec 2022 01:02:32 +0100 Subject: [PATCH 52/55] #641 code review: replace RunningNorm with NormalizedRewardNet --- .../algorithms/pebble/entropy_reward.py | 144 ++++++++++++++---- .../config/train_preference_comparisons.py | 1 + src/imitation/util/util.py | 5 +- .../algorithms/pebble/test_entropy_reward.py | 49 +++--- 4 files changed, 139 insertions(+), 60 deletions(-) diff --git a/src/imitation/algorithms/pebble/entropy_reward.py b/src/imitation/algorithms/pebble/entropy_reward.py index 9e7958fa6..f1bb373ba 100644 --- a/src/imitation/algorithms/pebble/entropy_reward.py +++ b/src/imitation/algorithms/pebble/entropy_reward.py @@ -1,8 +1,9 @@ """Reward function for the PEBBLE training algorithm.""" import enum -from typing import Dict, Optional, Tuple, Union +from typing import Optional, Tuple +import gym import numpy as np import torch as th @@ -12,6 +13,7 @@ ReplayBufferView, ) from imitation.rewards.reward_function import RewardFn +from imitation.rewards.reward_nets import NormalizedRewardNet, RewardNet from imitation.util import util from imitation.util.networks import RunningNorm @@ -23,6 +25,92 @@ class PebbleRewardPhase(enum.Enum): POLICY_AND_REWARD_LEARNING = enum.auto() # Learned reward +class InsufficientObservations(RuntimeError): + pass + + +class EntropyRewardNet(RewardNet): + def __init__( + self, + nearest_neighbor_k: int, + replay_buffer_view: ReplayBufferView, + observation_space: gym.Space, + action_space: gym.Space, + normalize_images: bool = True, + ): + """Initialize the RewardNet. + + Args: + observation_space: the observation space of the environment + action_space: the action space of the environment + normalize_images: whether to automatically normalize + image observations to [0, 1] (from 0 to 255). Defaults to True. + """ + super().__init__(observation_space, action_space, normalize_images) + self.nearest_neighbor_k = nearest_neighbor_k + self._replay_buffer_view = replay_buffer_view + + def set_replay_buffer(self, replay_buffer: ReplayBufferRewardWrapper): + """This method needs to be called after unpickling. 
+ + See also __getstate__() / __setstate__() + """ + assert self.observation_space == replay_buffer.observation_space + assert self.action_space == replay_buffer.action_space + self._replay_buffer_view = replay_buffer.buffer_view + + def forward( + self, + state: th.Tensor, + action: th.Tensor, + next_state: th.Tensor, + done: th.Tensor, + ) -> th.Tensor: + assert ( + self._replay_buffer_view is not None + ), "Missing replay buffer (possibly after unpickle)" + + all_observations = self._replay_buffer_view.observations + # ReplayBuffer sampling flattens the venv dimension, let's adapt to that + all_observations = all_observations.reshape( + (-1,) + self.observation_space.shape + ) + + if all_observations.shape[0] < self.nearest_neighbor_k: + raise InsufficientObservations( + "Insufficient observations for entropy calculation" + ) + + return util.compute_state_entropy( + state, all_observations, self.nearest_neighbor_k + ) + + def preprocess( + self, + state: np.ndarray, + action: np.ndarray, + next_state: np.ndarray, + done: np.ndarray, + ) -> Tuple[th.Tensor, th.Tensor, th.Tensor, th.Tensor]: + """Override default preprocessing to avoid the default one-hot encoding. + + We also know forward() only works with state, so no need to convert + other tensors. + """ + state_th = util.safe_to_tensor(state).to(self.device) + action_th = next_state_th = done_th = th.empty(0) + return state_th, action_th, next_state_th, done_th + + def __getstate__(self): + state = self.__dict__.copy() + del state["_replay_buffer_view"] + return state + + def __setstate__(self, state): + self.__dict__.update(state) + self._replay_buffer_view = None + + class PebbleStateEntropyReward(ReplayBufferAwareRewardFn): """Reward function for implementation of the PEBBLE learning algorithm. 
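The new EntropyRewardNet drops its replay-buffer view in __getstate__ and restores it as None in __setstate__, so the network can be pickled while the process-local buffer view has to be re-attached after loading. The same pattern in isolation, with a hypothetical class:

import pickle

class HasTransientView:
    def __init__(self, view=None):
        self.view = view  # transient resource, e.g. a live buffer view
        self.k = 5        # ordinary, picklable configuration

    def __getstate__(self):
        state = self.__dict__.copy()
        del state["view"]  # never serialized
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)
        self.view = None  # caller must re-attach a fresh view after unpickling

restored = pickle.loads(pickle.dumps(HasTransientView(view=object())))
assert restored.view is None and restored.k == 5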
@@ -59,17 +147,27 @@ def __init__( self.learned_reward_fn = learned_reward_fn self.nearest_neighbor_k = nearest_neighbor_k - self.entropy_stats = RunningNorm(1) self.state = PebbleRewardPhase.UNSUPERVISED_EXPLORATION # These two need to be set with set_replay_buffer(): - self.replay_buffer_view: Optional[ReplayBufferView] = None - self.obs_shape: Union[Tuple[int, ...], Dict[str, Tuple[int, ...]], None] = None + self._entropy_reward_net: Optional[EntropyRewardNet] = None + self._normalized_entropy_reward_net: Optional[RewardNet] = None def on_replay_buffer_initialized(self, replay_buffer: ReplayBufferRewardWrapper): - self.replay_buffer_view = replay_buffer.buffer_view - self.obs_shape = replay_buffer.obs_shape - + if self._normalized_entropy_reward_net is None: + self._entropy_reward_net = EntropyRewardNet( + nearest_neighbor_k=self.nearest_neighbor_k, + replay_buffer_view=replay_buffer.buffer_view, + observation_space=replay_buffer.observation_space, + action_space=replay_buffer.action_space, + normalize_images=False, + ) + self._normalized_entropy_reward_net = NormalizedRewardNet( + self._entropy_reward_net, RunningNorm + ) + else: + assert self._entropy_reward_net is not None + self._entropy_reward_net.set_replay_buffer(replay_buffer) def unsupervised_exploration_finish(self): assert self.state == PebbleRewardPhase.UNSUPERVISED_EXPLORATION @@ -88,35 +186,15 @@ def __call__( return self.learned_reward_fn(state, action, next_state, done) def _entropy_reward(self, state, action, next_state, done): - if self.replay_buffer_view is None: + if self._normalized_entropy_reward_net is None: raise ValueError( "Replay buffer must be supplied before entropy reward can be used", ) - all_observations = self.replay_buffer_view.observations - # ReplayBuffer sampling flattens the venv dimension, let's adapt to that - all_observations = all_observations.reshape((-1, *self.obs_shape)) - - if all_observations.shape[0] < self.nearest_neighbor_k: + try: + return self._normalized_entropy_reward_net.predict_processed( + state, action, next_state, done, update_stats=True + ) + except InsufficientObservations: # not enough observations to compare to, fall back to the learned function; # (falling back to a constant may also be ok) return self.learned_reward_fn(state, action, next_state, done) - else: - # TODO #625: deal with the conversion back and forth between np and torch - entropies = util.compute_state_entropy( - th.tensor(state), - th.tensor(all_observations), - self.nearest_neighbor_k, - ) - - normalized_entropies = self.entropy_stats.forward(entropies) - - return normalized_entropies.numpy() - - def __getstate__(self): - state = self.__dict__.copy() - del state["replay_buffer_view"] - return state - - def __setstate__(self, state): - self.__dict__.update(state) - self.replay_buffer_view = None diff --git a/src/imitation/scripts/config/train_preference_comparisons.py b/src/imitation/scripts/config/train_preference_comparisons.py index f01a7d6c0..3a66349c5 100644 --- a/src/imitation/scripts/config/train_preference_comparisons.py +++ b/src/imitation/scripts/config/train_preference_comparisons.py @@ -158,3 +158,4 @@ def fast(): reward_trainer_kwargs = { "epochs": 1, } + locals() # quieten flake8 diff --git a/src/imitation/util/util.py b/src/imitation/util/util.py index cef2e6f38..c56e81f4c 100644 --- a/src/imitation/util/util.py +++ b/src/imitation/util/util.py @@ -384,12 +384,15 @@ def compute_state_entropy( for idx in range(len(all_obs) // batch_size + 1): start = idx * batch_size end = (idx + 1) * 
batch_size + all_obs_batch = all_obs[start:end] distances_tensor = th.linalg.vector_norm( - obs[:, None] - all_obs[None, start:end], + obs[:, None] - all_obs_batch[None, :], dim=non_batch_dimensions, ord=2, ) + assert distances_tensor.shape == (obs.shape[0], all_obs_batch.shape[0]) dists.append(distances_tensor) all_dists = th.cat(dists, dim=1) knn_dists = th.kthvalue(all_dists, k=k + 1, dim=1).values return knn_dists + diff --git a/tests/algorithms/pebble/test_entropy_reward.py b/tests/algorithms/pebble/test_entropy_reward.py index bc020e86c..e318eced2 100644 --- a/tests/algorithms/pebble/test_entropy_reward.py +++ b/tests/algorithms/pebble/test_entropy_reward.py @@ -5,15 +5,14 @@ import numpy as np import torch as th -from gym.spaces import Discrete - +from gym.spaces import Discrete, Box +from gym.spaces.space import Space from imitation.algorithms.pebble.entropy_reward import PebbleStateEntropyReward from imitation.policies.replay_buffer_wrapper import ReplayBufferView from imitation.util import util -SPACE = Discrete(4) -OBS_SHAPE = (1,) -PLACEHOLDER = np.empty(OBS_SHAPE) +SPACE = Box(-1, 1, shape=(1,)) +PLACEHOLDER = np.empty(SPACE.shape) BUFFER_SIZE = 20 K = 4 @@ -22,30 +21,27 @@ def test_pebble_entropy_reward_returns_entropy_for_pretraining(rng): - all_observations = rng.random((BUFFER_SIZE, VENVS, *OBS_SHAPE)) + all_observations = rng.random((BUFFER_SIZE, VENVS) + SPACE.shape) reward_fn = PebbleStateEntropyReward(Mock(), K) reward_fn.on_replay_buffer_initialized( replay_buffer_mock( ReplayBufferView(all_observations, lambda: slice(None)), - OBS_SHAPE, + SPACE, ) ) # Act - observations = th.rand((BATCH_SIZE, *OBS_SHAPE)) + observations = th.rand((BATCH_SIZE, *SPACE.shape)) reward = reward_fn(observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) # Assert expected = util.compute_state_entropy( observations, - all_observations.reshape(-1, *OBS_SHAPE), + all_observations.reshape(-1, *SPACE.shape), K, ) - expected_normalized = reward_fn.entropy_stats.normalize( - th.as_tensor(expected), - ).numpy() - np.testing.assert_allclose(reward, expected_normalized) + np.testing.assert_allclose(reward, expected, rtol=0.005, atol=0.005) def test_pebble_entropy_reward_returns_normalized_values_for_pretraining(): @@ -55,11 +51,11 @@ def test_pebble_entropy_reward_returns_normalized_values_for_pretraining(): m.side_effect = lambda obs, all_obs, k: obs reward_fn = PebbleStateEntropyReward(Mock(), K) - all_observations = np.empty((BUFFER_SIZE, VENVS, *OBS_SHAPE)) + all_observations = np.empty((BUFFER_SIZE, VENVS, *SPACE.shape)) reward_fn.on_replay_buffer_initialized( replay_buffer_mock( ReplayBufferView(all_observations, lambda: slice(None)), - OBS_SHAPE, + SPACE, ) ) @@ -97,7 +93,7 @@ def test_pebble_entropy_reward_function_returns_learned_reward_after_pre_trainin reward_fn.unsupervised_exploration_finish() # Act - observations = np.ones((BATCH_SIZE, *OBS_SHAPE)) + observations = np.ones((BATCH_SIZE, *SPACE.shape)) reward = reward_fn(observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) # Assert @@ -111,23 +107,23 @@ def test_pebble_entropy_reward_function_returns_learned_reward_after_pre_trainin def test_pebble_entropy_reward_can_pickle(): - all_observations = np.empty((BUFFER_SIZE, VENVS, *OBS_SHAPE)) + all_observations = np.empty((BUFFER_SIZE, VENVS, *SPACE.shape)) replay_buffer = ReplayBufferView(all_observations, lambda: slice(None)) - obs1 = np.random.rand(VENVS, *OBS_SHAPE) + obs1 = np.random.rand(VENVS, *SPACE.shape) reward_fn = PebbleStateEntropyReward(reward_fn_stub, K) - 
reward_fn.on_replay_buffer_initialized(replay_buffer_mock(replay_buffer, OBS_SHAPE)) + reward_fn.on_replay_buffer_initialized(replay_buffer_mock(replay_buffer, SPACE)) reward_fn(obs1, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) # Act pickled = pickle.dumps(reward_fn) reward_fn_deserialized = pickle.loads(pickled) reward_fn_deserialized.on_replay_buffer_initialized( - replay_buffer_mock(replay_buffer, OBS_SHAPE) + replay_buffer_mock(replay_buffer, SPACE) ) # Assert - obs2 = np.random.rand(VENVS, *OBS_SHAPE) + obs2 = np.random.rand(VENVS, *SPACE.shape) expected_result = reward_fn(obs2, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) actual_result = reward_fn_deserialized(obs2, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) np.testing.assert_allclose(actual_result, expected_result) @@ -137,8 +133,9 @@ def reward_fn_stub(state, action, next_state, done): return state -def replay_buffer_mock(buffer_view: ReplayBufferView, obs_shape: tuple) -> Mock: - replay_buffer_mock = Mock() - replay_buffer_mock.buffer_view = buffer_view - replay_buffer_mock.obs_shape = obs_shape - return replay_buffer_mock +def replay_buffer_mock(buffer_view: ReplayBufferView, obs_space: Space) -> Mock: + mock = Mock() + mock.buffer_view = buffer_view + mock.observation_space = obs_space + mock.action_space = SPACE + return mock From 531b3532cfd6633cd57023a7ecb30b468395d97e Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Sat, 10 Dec 2022 01:56:27 +0100 Subject: [PATCH 53/55] #641 code review: refactor PebbleStateEntropyReward so that inner RewardNets can be injected from the outside --- .../algorithms/pebble/entropy_reward.py | 89 ++++----- .../scripts/train_preference_comparisons.py | 48 ++++- src/imitation/util/util.py | 1 - .../algorithms/pebble/test_entropy_reward.py | 178 ++++++++++-------- .../algorithms/test_preference_comparisons.py | 4 +- .../test_train_preference_comparisons.py | 64 +++++++ 6 files changed, 241 insertions(+), 143 deletions(-) create mode 100644 tests/scripts/test_train_preference_comparisons.py diff --git a/src/imitation/algorithms/pebble/entropy_reward.py b/src/imitation/algorithms/pebble/entropy_reward.py index f1bb373ba..074281e90 100644 --- a/src/imitation/algorithms/pebble/entropy_reward.py +++ b/src/imitation/algorithms/pebble/entropy_reward.py @@ -13,47 +13,46 @@ ReplayBufferView, ) from imitation.rewards.reward_function import RewardFn -from imitation.rewards.reward_nets import NormalizedRewardNet, RewardNet +from imitation.rewards.reward_nets import RewardNet from imitation.util import util -from imitation.util.networks import RunningNorm - - -class PebbleRewardPhase(enum.Enum): - """States representing different behaviors for PebbleStateEntropyReward.""" - - UNSUPERVISED_EXPLORATION = enum.auto() # Entropy based reward - POLICY_AND_REWARD_LEARNING = enum.auto() # Learned reward class InsufficientObservations(RuntimeError): pass -class EntropyRewardNet(RewardNet): +class EntropyRewardNet(RewardNet, ReplayBufferAwareRewardFn): def __init__( self, nearest_neighbor_k: int, - replay_buffer_view: ReplayBufferView, observation_space: gym.Space, action_space: gym.Space, normalize_images: bool = True, + replay_buffer_view: Optional[ReplayBufferView] = None, ): """Initialize the RewardNet. Args: + nearest_neighbor_k: Parameter for entropy computation (see + compute_state_entropy()) observation_space: the observation space of the environment action_space: the action space of the environment normalize_images: whether to automatically normalize image observations to [0, 1] (from 0 to 255). Defaults to True. 
+ replay_buffer_view: Replay buffer view with observations to compare + against when computing entropy. If None is given, the buffer needs to + be set with on_replay_buffer_initialized() before EntropyRewardNet can + be used """ super().__init__(observation_space, action_space, normalize_images) self.nearest_neighbor_k = nearest_neighbor_k self._replay_buffer_view = replay_buffer_view - def set_replay_buffer(self, replay_buffer: ReplayBufferRewardWrapper): - """This method needs to be called after unpickling. + def on_replay_buffer_initialized(self, replay_buffer: ReplayBufferRewardWrapper): + """Sets replay buffer. - See also __getstate__() / __setstate__() + This method needs to be called, e.g., after unpickling. + See also __getstate__() / __setstate__(). """ assert self.observation_space == replay_buffer.observation_space assert self.action_space == replay_buffer.action_space @@ -111,6 +110,13 @@ def __setstate__(self, state): self._replay_buffer_view = None +class PebbleRewardPhase(enum.Enum): + """States representing different behaviors for PebbleStateEntropyReward.""" + + UNSUPERVISED_EXPLORATION = enum.auto() # Entropy based reward + POLICY_AND_REWARD_LEARNING = enum.auto() # Learned reward + + class PebbleStateEntropyReward(ReplayBufferAwareRewardFn): """Reward function for implementation of the PEBBLE learning algorithm. @@ -126,48 +132,30 @@ class PebbleStateEntropyReward(ReplayBufferAwareRewardFn): reward is returned. The second phase requires that a buffer with observations to compare against is - supplied with set_replay_buffer() or on_replay_buffer_initialized(). - To transition to the last phase, unsupervised_exploration_finish() needs - to be called. + supplied with on_replay_buffer_initialized(). To transition to the last phase, + unsupervised_exploration_finish() needs to be called. """ def __init__( self, + entropy_reward_fn: RewardFn, learned_reward_fn: RewardFn, - nearest_neighbor_k: int = 5, ): """Builds this class. 
Args: + entropy_reward_fn: The entropy-based reward function used during + unsupervised exploration learned_reward_fn: The learned reward function used after unsupervised exploration is finished - nearest_neighbor_k: Parameter for entropy computation (see - compute_state_entropy()) """ + self.entropy_reward_fn = entropy_reward_fn self.learned_reward_fn = learned_reward_fn - self.nearest_neighbor_k = nearest_neighbor_k - self.state = PebbleRewardPhase.UNSUPERVISED_EXPLORATION - # These two need to be set with set_replay_buffer(): - self._entropy_reward_net: Optional[EntropyRewardNet] = None - self._normalized_entropy_reward_net: Optional[RewardNet] = None - def on_replay_buffer_initialized(self, replay_buffer: ReplayBufferRewardWrapper): - if self._normalized_entropy_reward_net is None: - self._entropy_reward_net = EntropyRewardNet( - nearest_neighbor_k=self.nearest_neighbor_k, - replay_buffer_view=replay_buffer.buffer_view, - observation_space=replay_buffer.observation_space, - action_space=replay_buffer.action_space, - normalize_images=False, - ) - self._normalized_entropy_reward_net = NormalizedRewardNet( - self._entropy_reward_net, RunningNorm - ) - else: - assert self._entropy_reward_net is not None - self._entropy_reward_net.set_replay_buffer(replay_buffer) + if isinstance(self.entropy_reward_fn, ReplayBufferAwareRewardFn): + self.entropy_reward_fn.on_replay_buffer_initialized(replay_buffer) def unsupervised_exploration_finish(self): assert self.state == PebbleRewardPhase.UNSUPERVISED_EXPLORATION @@ -181,20 +169,11 @@ def __call__( done: np.ndarray, ) -> np.ndarray: if self.state == PebbleRewardPhase.UNSUPERVISED_EXPLORATION: - return self._entropy_reward(state, action, next_state, done) + try: + return self.entropy_reward_fn(state, action, next_state, done) + except InsufficientObservations: + # not enough observations to compare to, fall back to the learned function; + # (falling back to a constant may also be ok) + return self.learned_reward_fn(state, action, next_state, done) else: return self.learned_reward_fn(state, action, next_state, done) - - def _entropy_reward(self, state, action, next_state, done): - if self._normalized_entropy_reward_net is None: - raise ValueError( - "Replay buffer must be supplied before entropy reward can be used", - ) - try: - return self._normalized_entropy_reward_net.predict_processed( - state, action, next_state, done, update_stats=True - ) - except InsufficientObservations: - # not enough observations to compare to, fall back to the learned function; - # (falling back to a constant may also be ok) - return self.learned_reward_fn(state, action, next_state, done) diff --git a/src/imitation/scripts/train_preference_comparisons.py b/src/imitation/scripts/train_preference_comparisons.py index 659b47a74..524734713 100644 --- a/src/imitation/scripts/train_preference_comparisons.py +++ b/src/imitation/scripts/train_preference_comparisons.py @@ -13,10 +13,18 @@ from stable_baselines3.common import base_class, type_aliases, vec_env from imitation.algorithms import preference_comparisons -from imitation.algorithms.pebble.entropy_reward import PebbleStateEntropyReward +from imitation.algorithms.pebble.entropy_reward import ( + EntropyRewardNet, + PebbleStateEntropyReward, +) from imitation.data import types from imitation.policies import serialize +from imitation.policies.replay_buffer_wrapper import ( + ReplayBufferAwareRewardFn, + ReplayBufferRewardWrapper, +) from imitation.rewards import reward_function, reward_nets +from imitation.rewards.reward_nets 
import NormalizedRewardNet from imitation.scripts.common import common, reward from imitation.scripts.common import rl as rl_common from imitation.scripts.common import train @@ -24,6 +32,7 @@ train_preference_comparisons_ex, ) from imitation.util import logger as imit_logger +from imitation.util.networks import RunningNorm def save_model( @@ -71,14 +80,47 @@ def make_reward_function( reward_net.predict_processed, update_stats=False, ) + observation_space = reward_net.observation_space + action_space = reward_net.action_space if pebble_enabled: - relabel_reward_fn = PebbleStateEntropyReward( - relabel_reward_fn, # type: ignore[assignment] + relabel_reward_fn = create_pebble_reward_fn( + relabel_reward_fn, pebble_nearest_neighbor_k, + action_space, + observation_space, ) return relabel_reward_fn +def create_pebble_reward_fn( + relabel_reward_fn, pebble_nearest_neighbor_k, action_space, observation_space +): + entropy_reward_net = EntropyRewardNet( + nearest_neighbor_k=pebble_nearest_neighbor_k, + observation_space=observation_space, + action_space=action_space, + normalize_images=False, + ) + normalized_entropy_reward_net = NormalizedRewardNet(entropy_reward_net, RunningNorm) + + class EntropyRewardFn(ReplayBufferAwareRewardFn): + """Adapter for entropy reward adding on_replay_buffer_initialized() hook.""" + + def __call__(self, *args, **kwargs) -> np.ndarray: + kwargs["update_stats"] = True + return normalized_entropy_reward_net.predict_processed(*args, **kwargs) + + def on_replay_buffer_initialized( + self, replay_buffer: ReplayBufferRewardWrapper + ): + entropy_reward_net.on_replay_buffer_initialized(replay_buffer) + + return PebbleStateEntropyReward( + EntropyRewardFn(), + relabel_reward_fn, # type: ignore[assignment] + ) + + @train_preference_comparisons_ex.capture def make_agent_trajectory_generator( venv: vec_env.VecEnv, diff --git a/src/imitation/util/util.py b/src/imitation/util/util.py index c56e81f4c..cf38cee5a 100644 --- a/src/imitation/util/util.py +++ b/src/imitation/util/util.py @@ -395,4 +395,3 @@ def compute_state_entropy( all_dists = th.cat(dists, dim=1) knn_dists = th.kthvalue(all_dists, k=k + 1, dim=1).values return knn_dists - diff --git a/tests/algorithms/pebble/test_entropy_reward.py b/tests/algorithms/pebble/test_entropy_reward.py index e318eced2..833a9ba94 100644 --- a/tests/algorithms/pebble/test_entropy_reward.py +++ b/tests/algorithms/pebble/test_entropy_reward.py @@ -1,14 +1,22 @@ """Tests for `imitation.algorithms.entropy_reward`.""" - import pickle -from unittest.mock import Mock, patch +from unittest.mock import Mock import numpy as np +import pytest import torch as th -from gym.spaces import Discrete, Box +from gym.spaces import Box from gym.spaces.space import Space -from imitation.algorithms.pebble.entropy_reward import PebbleStateEntropyReward -from imitation.policies.replay_buffer_wrapper import ReplayBufferView + +from imitation.algorithms.pebble.entropy_reward import ( + EntropyRewardNet, + InsufficientObservations, + PebbleStateEntropyReward, +) +from imitation.policies.replay_buffer_wrapper import ( + ReplayBufferAwareRewardFn, + ReplayBufferView, +) from imitation.util import util SPACE = Box(-1, 1, shape=(1,)) @@ -20,112 +28,115 @@ VENVS = 2 -def test_pebble_entropy_reward_returns_entropy_for_pretraining(rng): - all_observations = rng.random((BUFFER_SIZE, VENVS) + SPACE.shape) +def test_pebble_entropy_reward_returns_entropy_for_pretraining(): + expected_result = th.rand(BATCH_SIZE) + observations = th.rand((BATCH_SIZE,) + SPACE.shape) + entropy_fn 
= Mock() + entropy_fn.return_value = expected_result + learned_fn = Mock() - reward_fn = PebbleStateEntropyReward(Mock(), K) - reward_fn.on_replay_buffer_initialized( - replay_buffer_mock( - ReplayBufferView(all_observations, lambda: slice(None)), - SPACE, - ) + reward_fn = PebbleStateEntropyReward(entropy_fn, learned_fn) + reward = reward_fn(observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) + + np.testing.assert_allclose(reward, expected_result) + entropy_fn.assert_called_once_with( + observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER ) - # Act - observations = th.rand((BATCH_SIZE, *SPACE.shape)) + +def test_pebble_entropy_reward_returns_learned_rew_on_insufficient_observations(rng): + expected_result = th.rand(BATCH_SIZE) + observations = th.rand((BATCH_SIZE,) + SPACE.shape) + entropy_fn = Mock() + entropy_fn.side_effect = InsufficientObservations("test error") + learned_fn = Mock() + learned_fn.return_value = expected_result + + reward_fn = PebbleStateEntropyReward(entropy_fn, learned_fn) reward = reward_fn(observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) - # Assert - expected = util.compute_state_entropy( - observations, - all_observations.reshape(-1, *SPACE.shape), - K, + np.testing.assert_allclose(reward, expected_result) + learned_fn.assert_called_once_with( + observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER ) - np.testing.assert_allclose(reward, expected, rtol=0.005, atol=0.005) -def test_pebble_entropy_reward_returns_normalized_values_for_pretraining(): - with patch("imitation.util.util.compute_state_entropy") as m: - # mock entropy computation so that we can test - # only stats collection in this test - m.side_effect = lambda obs, all_obs, k: obs +def test_pebble_entropy_reward_function_returns_learned_reward_after_pre_training(): + expected_result = th.rand(BATCH_SIZE) + observations = th.rand((BATCH_SIZE,) + SPACE.shape) + entropy_fn = Mock() + learned_fn = Mock() + learned_fn.return_value = expected_result - reward_fn = PebbleStateEntropyReward(Mock(), K) - all_observations = np.empty((BUFFER_SIZE, VENVS, *SPACE.shape)) - reward_fn.on_replay_buffer_initialized( - replay_buffer_mock( - ReplayBufferView(all_observations, lambda: slice(None)), - SPACE, - ) - ) + reward_fn = PebbleStateEntropyReward(entropy_fn, learned_fn) + reward_fn.unsupervised_exploration_finish() + reward = reward_fn(observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) + + np.testing.assert_allclose(reward, expected_result) + learned_fn.assert_called_once_with( + observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER + ) - dim = 8 - shift = 3 - scale = 2 - # Act - for _ in range(1000): - state = th.randn(dim) * scale + shift - reward_fn(state, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) +def test_pebble_entropy_reward_propagates_on_replay_buffer_initialized(): + replay_buffer = replay_buffer_mock(np.empty((BUFFER_SIZE, VENVS) + SPACE.shape)) + entropy_fn = Mock(spec=ReplayBufferAwareRewardFn) + learned_fn = Mock() - normalized_reward = reward_fn( - np.zeros(dim), - PLACEHOLDER, - PLACEHOLDER, - PLACEHOLDER, - ) + reward_fn = PebbleStateEntropyReward(entropy_fn, learned_fn) + reward_fn.on_replay_buffer_initialized(replay_buffer) - # Assert - np.testing.assert_allclose( - normalized_reward, - np.repeat(-shift / scale, dim), - rtol=0.05, - atol=0.05, - ) + entropy_fn.on_replay_buffer_initialized.assert_called_once_with(replay_buffer) -def test_pebble_entropy_reward_function_returns_learned_reward_after_pre_training(): - expected_reward = np.ones(1) - learned_reward_mock = Mock() - 
learned_reward_mock.return_value = expected_reward - reward_fn = PebbleStateEntropyReward(learned_reward_mock) - # move all the way to the last state - reward_fn.unsupervised_exploration_finish() +def test_entropy_reward_net_returns_entropy_for_pretraining(rng): + observations = th.rand((BATCH_SIZE, *SPACE.shape)) + all_observations = rng.random((BUFFER_SIZE, VENVS) + SPACE.shape) + reward_net = EntropyRewardNet(K, SPACE, SPACE) + reward_net.on_replay_buffer_initialized(replay_buffer_mock(all_observations)) # Act - observations = np.ones((BATCH_SIZE, *SPACE.shape)) - reward = reward_fn(observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) + reward = reward_net.predict_processed( + observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER + ) # Assert - assert reward == expected_reward - learned_reward_mock.assert_called_once_with( + expected = util.compute_state_entropy( observations, - PLACEHOLDER, - PLACEHOLDER, - PLACEHOLDER, + all_observations.reshape(-1, *SPACE.shape), + K, ) + np.testing.assert_allclose(reward, expected, rtol=0.005, atol=0.005) -def test_pebble_entropy_reward_can_pickle(): - all_observations = np.empty((BUFFER_SIZE, VENVS, *SPACE.shape)) - replay_buffer = ReplayBufferView(all_observations, lambda: slice(None)) +def test_entropy_reward_net_raises_on_insufficient_observations(rng): + observations = th.rand((BATCH_SIZE, *SPACE.shape)) + all_observations = rng.random((K - 1, 1) + SPACE.shape) + reward_net = EntropyRewardNet(K, SPACE, SPACE) + reward_net.on_replay_buffer_initialized(replay_buffer_mock(all_observations)) + + # Act + with pytest.raises(InsufficientObservations): + reward_net.predict_processed( + observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER + ) - obs1 = np.random.rand(VENVS, *SPACE.shape) - reward_fn = PebbleStateEntropyReward(reward_fn_stub, K) - reward_fn.on_replay_buffer_initialized(replay_buffer_mock(replay_buffer, SPACE)) - reward_fn(obs1, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) + +def test_entropy_reward_net_can_pickle(rng): + all_observations = np.empty((BUFFER_SIZE, VENVS, *SPACE.shape)) + replay_buffer = replay_buffer_mock(all_observations) + reward_net = EntropyRewardNet(K, SPACE, SPACE) + reward_net.on_replay_buffer_initialized(replay_buffer) # Act - pickled = pickle.dumps(reward_fn) + pickled = pickle.dumps(reward_net) reward_fn_deserialized = pickle.loads(pickled) - reward_fn_deserialized.on_replay_buffer_initialized( - replay_buffer_mock(replay_buffer, SPACE) - ) + reward_fn_deserialized.on_replay_buffer_initialized(replay_buffer) # Assert - obs2 = np.random.rand(VENVS, *SPACE.shape) - expected_result = reward_fn(obs2, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) - actual_result = reward_fn_deserialized(obs2, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) + obs = th.rand(VENVS, *SPACE.shape) + expected_result = reward_net(obs, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) + actual_result = reward_fn_deserialized(obs, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) np.testing.assert_allclose(actual_result, expected_result) @@ -133,7 +144,8 @@ def reward_fn_stub(state, action, next_state, done): return state -def replay_buffer_mock(buffer_view: ReplayBufferView, obs_space: Space) -> Mock: +def replay_buffer_mock(all_observations: np.ndarray, obs_space: Space = SPACE) -> Mock: + buffer_view = ReplayBufferView(all_observations, lambda: slice(None)) mock = Mock() mock.buffer_view = buffer_view mock.observation_space = obs_space diff --git a/tests/algorithms/test_preference_comparisons.py b/tests/algorithms/test_preference_comparisons.py index fb63e71d0..f31fdceb8 100644 --- 
a/tests/algorithms/test_preference_comparisons.py +++ b/tests/algorithms/test_preference_comparisons.py @@ -85,7 +85,9 @@ def pebble_agent_trainer(agent, reward_net, venv, rng, replay_buffer): replay_buffer_mock = Mock() replay_buffer_mock.buffer_view = replay_buffer replay_buffer_mock.obs_shape = (4,) - reward_fn = PebbleStateEntropyReward(reward_net.predict_processed) + reward_fn = PebbleStateEntropyReward( + reward_net.predict_processed, venv.observation_space, venv.action_space + ) reward_fn.on_replay_buffer_initialized(replay_buffer_mock) return preference_comparisons.PebbleAgentTrainer( algorithm=agent, diff --git a/tests/scripts/test_train_preference_comparisons.py b/tests/scripts/test_train_preference_comparisons.py new file mode 100644 index 000000000..d05ebd27a --- /dev/null +++ b/tests/scripts/test_train_preference_comparisons.py @@ -0,0 +1,64 @@ +from unittest.mock import Mock, patch + +import numpy as np +import torch as th +from gym import Space +from gym.spaces import Box + +from imitation.policies.replay_buffer_wrapper import ReplayBufferView +from imitation.scripts.train_preference_comparisons import create_pebble_reward_fn + +K = 4 +SPACE = Box(-1, 1, shape=(1,)) +BUFFER_SIZE = 20 +VENVS = 2 +PLACEHOLDER = np.empty(SPACE.shape) + + +def test_creates_normalized_entropy_pebble_reward(): + with patch("imitation.util.util.compute_state_entropy") as m: + # mock entropy computation so that we can test + # only stats collection in this test + m.side_effect = lambda obs, all_obs, k: obs + + reward_fn = create_pebble_reward_fn(reward_fn_stub, K, SPACE, SPACE) + + all_observations = np.empty((BUFFER_SIZE, VENVS, *SPACE.shape)) + reward_fn.on_replay_buffer_initialized(replay_buffer_mock(all_observations)) + + dim = 8 + shift = 3 + scale = 2 + + # Act + for _ in range(1000): + state = th.randn(dim) * scale + shift + reward_fn(state, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) + + normalized_reward = reward_fn( + np.zeros(dim), + PLACEHOLDER, + PLACEHOLDER, + PLACEHOLDER, + ) + + # Assert + np.testing.assert_allclose( + normalized_reward, + np.repeat(-shift / scale, dim), + rtol=0.05, + atol=0.05, + ) + + +def reward_fn_stub(state, action, next_state, done): + return state + + +def replay_buffer_mock(all_observations: np.ndarray, obs_space: Space = SPACE) -> Mock: + buffer_view = ReplayBufferView(all_observations, lambda: slice(None)) + mock = Mock() + mock.buffer_view = buffer_view + mock.observation_space = obs_space + mock.action_space = SPACE + return mock From 74ba96b17d75c785ad5a8af6725f645dbd17df9e Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Sat, 10 Dec 2022 12:45:24 +0100 Subject: [PATCH 54/55] #641 fix static analysis and tests --- .../algorithms/pebble/entropy_reward.py | 32 +++++++++++++++---- .../algorithms/preference_comparisons.py | 6 +++- .../scripts/train_preference_comparisons.py | 22 +++++++------ .../algorithms/pebble/test_entropy_reward.py | 25 ++++++++++++--- .../algorithms/test_preference_comparisons.py | 12 ++++--- .../test_train_preference_comparisons.py | 2 ++ 6 files changed, 74 insertions(+), 25 deletions(-) diff --git a/src/imitation/algorithms/pebble/entropy_reward.py b/src/imitation/algorithms/pebble/entropy_reward.py index 074281e90..eba53405b 100644 --- a/src/imitation/algorithms/pebble/entropy_reward.py +++ b/src/imitation/algorithms/pebble/entropy_reward.py @@ -1,7 +1,7 @@ """Reward function for the PEBBLE training algorithm.""" import enum -from typing import Optional, Tuple +from typing import Any, Callable, Optional, Tuple import gym import 
numpy as np @@ -18,10 +18,16 @@ class InsufficientObservations(RuntimeError): + """Error signifying not enough observations for entropy calculation.""" + pass class EntropyRewardNet(RewardNet, ReplayBufferAwareRewardFn): + """RewardNet wrapping entropy reward function.""" + + __call__: Callable[..., Any] # Needed to appease pytype + def __init__( self, nearest_neighbor_k: int, @@ -53,6 +59,9 @@ def on_replay_buffer_initialized(self, replay_buffer: ReplayBufferRewardWrapper) This method needs to be called, e.g., after unpickling. See also __getstate__() / __setstate__(). + + Args: + replay_buffer: replay buffer with history of observations """ assert self.observation_space == replay_buffer.observation_space assert self.action_space == replay_buffer.action_space @@ -72,16 +81,18 @@ def forward( all_observations = self._replay_buffer_view.observations # ReplayBuffer sampling flattens the venv dimension, let's adapt to that all_observations = all_observations.reshape( - (-1,) + self.observation_space.shape + (-1,) + self.observation_space.shape, ) if all_observations.shape[0] < self.nearest_neighbor_k: raise InsufficientObservations( - "Insufficient observations for entropy calculation" + "Insufficient observations for entropy calculation", ) return util.compute_state_entropy( - state, all_observations, self.nearest_neighbor_k + state, + all_observations, + self.nearest_neighbor_k, ) def preprocess( @@ -95,6 +106,15 @@ def preprocess( We also know forward() only works with state, so no need to convert other tensors. + + Args: + state: The observation input. + action: The action input. + next_state: The observation input. + done: Whether the episode has terminated. + + Returns: + Observations preprocessed by converting them to Tensor. """ state_th = util.safe_to_tensor(state).to(self.device) action_th = next_state_th = done_th = th.empty(0) @@ -172,8 +192,8 @@ def __call__( try: return self.entropy_reward_fn(state, action, next_state, done) except InsufficientObservations: - # not enough observations to compare to, fall back to the learned function; - # (falling back to a constant may also be ok) + # not enough observations to compare to, fall back to the learned + # function; (falling back to a constant may also be ok) return self.learned_reward_fn(state, action, next_state, done) else: return self.learned_reward_fn(state, action, next_state, done) diff --git a/src/imitation/algorithms/preference_comparisons.py b/src/imitation/algorithms/preference_comparisons.py index 72f5da5cf..fccd7958d 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -96,13 +96,17 @@ def unsupervised_pretrain(self, steps: int, **kwargs: Any) -> None: """Pre-train an agent before collecting comparisons. Override this behavior in subclasses that implement pre-training. - If not overriden, this method raises ValueError when non-zero steps are + If not overridden, this method raises ValueError when non-zero steps are allocated for pre-training. Args: steps: number of environment steps to train for. **kwargs: additional keyword arguments to pass on to the training procedure. + + Raises: + ValueError: Unsupervised pre-training not implemented but non-zero + steps are allocated for pre-training. 
""" if steps > 0: raise ValueError( diff --git a/src/imitation/scripts/train_preference_comparisons.py b/src/imitation/scripts/train_preference_comparisons.py index 524734713..5e07b094c 100644 --- a/src/imitation/scripts/train_preference_comparisons.py +++ b/src/imitation/scripts/train_preference_comparisons.py @@ -7,6 +7,7 @@ import pathlib from typing import Any, Mapping, Optional, Type, Union +import gym import numpy as np import torch as th from sacred.observers import FileStorageObserver @@ -24,6 +25,7 @@ ReplayBufferRewardWrapper, ) from imitation.rewards import reward_function, reward_nets +from imitation.rewards.reward_function import RewardFn from imitation.rewards.reward_nets import NormalizedRewardNet from imitation.scripts.common import common, reward from imitation.scripts.common import rl as rl_common @@ -80,21 +82,22 @@ def make_reward_function( reward_net.predict_processed, update_stats=False, ) - observation_space = reward_net.observation_space - action_space = reward_net.action_space if pebble_enabled: relabel_reward_fn = create_pebble_reward_fn( - relabel_reward_fn, + relabel_reward_fn, # type: ignore[assignment] pebble_nearest_neighbor_k, - action_space, - observation_space, + reward_net.action_space, + reward_net.observation_space, ) return relabel_reward_fn def create_pebble_reward_fn( - relabel_reward_fn, pebble_nearest_neighbor_k, action_space, observation_space -): + relabel_reward_fn: RewardFn, + pebble_nearest_neighbor_k: int, + action_space: gym.Space, + observation_space: gym.Space, +) -> PebbleStateEntropyReward: entropy_reward_net = EntropyRewardNet( nearest_neighbor_k=pebble_nearest_neighbor_k, observation_space=observation_space, @@ -111,13 +114,14 @@ def __call__(self, *args, **kwargs) -> np.ndarray: return normalized_entropy_reward_net.predict_processed(*args, **kwargs) def on_replay_buffer_initialized( - self, replay_buffer: ReplayBufferRewardWrapper + self, + replay_buffer: ReplayBufferRewardWrapper, ): entropy_reward_net.on_replay_buffer_initialized(replay_buffer) return PebbleStateEntropyReward( EntropyRewardFn(), - relabel_reward_fn, # type: ignore[assignment] + relabel_reward_fn, ) diff --git a/tests/algorithms/pebble/test_entropy_reward.py b/tests/algorithms/pebble/test_entropy_reward.py index 833a9ba94..b598ac75e 100644 --- a/tests/algorithms/pebble/test_entropy_reward.py +++ b/tests/algorithms/pebble/test_entropy_reward.py @@ -40,7 +40,10 @@ def test_pebble_entropy_reward_returns_entropy_for_pretraining(): np.testing.assert_allclose(reward, expected_result) entropy_fn.assert_called_once_with( - observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER + observations, + PLACEHOLDER, + PLACEHOLDER, + PLACEHOLDER, ) @@ -57,7 +60,10 @@ def test_pebble_entropy_reward_returns_learned_rew_on_insufficient_observations( np.testing.assert_allclose(reward, expected_result) learned_fn.assert_called_once_with( - observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER + observations, + PLACEHOLDER, + PLACEHOLDER, + PLACEHOLDER, ) @@ -74,7 +80,10 @@ def test_pebble_entropy_reward_function_returns_learned_reward_after_pre_trainin np.testing.assert_allclose(reward, expected_result) learned_fn.assert_called_once_with( - observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER + observations, + PLACEHOLDER, + PLACEHOLDER, + PLACEHOLDER, ) @@ -97,7 +106,10 @@ def test_entropy_reward_net_returns_entropy_for_pretraining(rng): # Act reward = reward_net.predict_processed( - observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER + observations, + PLACEHOLDER, + PLACEHOLDER, + PLACEHOLDER, 
) # Assert @@ -118,7 +130,10 @@ def test_entropy_reward_net_raises_on_insufficient_observations(rng): # Act with pytest.raises(InsufficientObservations): reward_net.predict_processed( - observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER + observations, + PLACEHOLDER, + PLACEHOLDER, + PLACEHOLDER, ) diff --git a/tests/algorithms/test_preference_comparisons.py b/tests/algorithms/test_preference_comparisons.py index f31fdceb8..d863cc4b0 100644 --- a/tests/algorithms/test_preference_comparisons.py +++ b/tests/algorithms/test_preference_comparisons.py @@ -18,12 +18,12 @@ import imitation.testing.reward_nets as testing_reward_nets from imitation.algorithms import preference_comparisons -from imitation.algorithms.pebble.entropy_reward import PebbleStateEntropyReward from imitation.data import types from imitation.data.types import TrajectoryWithRew from imitation.policies.replay_buffer_wrapper import ReplayBufferView from imitation.regularization import regularizers, updaters from imitation.rewards import reward_nets +from imitation.scripts.train_preference_comparisons import create_pebble_reward_fn from imitation.util import networks, util UNCERTAINTY_ON = ["logit", "probability", "label"] @@ -84,9 +84,13 @@ def replay_buffer(rng): def pebble_agent_trainer(agent, reward_net, venv, rng, replay_buffer): replay_buffer_mock = Mock() replay_buffer_mock.buffer_view = replay_buffer - replay_buffer_mock.obs_shape = (4,) - reward_fn = PebbleStateEntropyReward( - reward_net.predict_processed, venv.observation_space, venv.action_space + replay_buffer_mock.observation_space = venv.observation_space + replay_buffer_mock.action_space = venv.action_space + reward_fn = create_pebble_reward_fn( + reward_net.predict_processed, + 5, + venv.action_space, + venv.observation_space, ) reward_fn.on_replay_buffer_initialized(replay_buffer_mock) return preference_comparisons.PebbleAgentTrainer( diff --git a/tests/scripts/test_train_preference_comparisons.py b/tests/scripts/test_train_preference_comparisons.py index d05ebd27a..c4390dd6b 100644 --- a/tests/scripts/test_train_preference_comparisons.py +++ b/tests/scripts/test_train_preference_comparisons.py @@ -1,3 +1,5 @@ +"""Tests train_preferences_comparisons helper methods.""" + from unittest.mock import Mock, patch import numpy as np From b344cbdf13f57425cc2d4487129fc0efb9484ab3 Mon Sep 17 00:00:00 2001 From: Jan Michelfeit Date: Mon, 12 Dec 2022 22:16:20 +0100 Subject: [PATCH 55/55] #641 increase coverage --- .../policies/replay_buffer_wrapper.py | 11 +++++-- .../algorithms/pebble/test_entropy_reward.py | 4 --- .../algorithms/test_preference_comparisons.py | 30 +++++++++++++++++++ tests/policies/test_replay_buffer_wrapper.py | 21 ------------- tests/scripts/test_scripts.py | 14 +++++++++ .../test_train_preference_comparisons.py | 3 ++ 6 files changed, 56 insertions(+), 27 deletions(-) diff --git a/src/imitation/policies/replay_buffer_wrapper.py b/src/imitation/policies/replay_buffer_wrapper.py index 255e01f3b..a309917c2 100644 --- a/src/imitation/policies/replay_buffer_wrapper.py +++ b/src/imitation/policies/replay_buffer_wrapper.py @@ -143,5 +143,12 @@ class ReplayBufferAwareRewardFn(RewardFn, abc.ABC): def on_replay_buffer_initialized( self, replay_buffer: ReplayBufferRewardWrapper, - ): - pass + ) -> None: + """Hook method to be called when ReplayBuffer is initialized. + + Needed to propagate the ReplayBuffer to a reward function because the buffer + is created indirectly in ReplayBufferRewardWrapper. 
+ + Args: + replay_buffer: the created ReplayBuffer + """ # noqa: DAR202 diff --git a/tests/algorithms/pebble/test_entropy_reward.py b/tests/algorithms/pebble/test_entropy_reward.py index b598ac75e..461a7dd5a 100644 --- a/tests/algorithms/pebble/test_entropy_reward.py +++ b/tests/algorithms/pebble/test_entropy_reward.py @@ -155,10 +155,6 @@ def test_entropy_reward_net_can_pickle(rng): np.testing.assert_allclose(actual_result, expected_result) -def reward_fn_stub(state, action, next_state, done): - return state - - def replay_buffer_mock(all_observations: np.ndarray, obs_space: Space = SPACE) -> Mock: buffer_view = ReplayBufferView(all_observations, lambda: slice(None)) mock = Mock() diff --git a/tests/algorithms/test_preference_comparisons.py b/tests/algorithms/test_preference_comparisons.py index d863cc4b0..c66dcc157 100644 --- a/tests/algorithms/test_preference_comparisons.py +++ b/tests/algorithms/test_preference_comparisons.py @@ -18,11 +18,16 @@ import imitation.testing.reward_nets as testing_reward_nets from imitation.algorithms import preference_comparisons +from imitation.algorithms.preference_comparisons import ( + PebbleAgentTrainer, + TrajectoryGenerator, +) from imitation.data import types from imitation.data.types import TrajectoryWithRew from imitation.policies.replay_buffer_wrapper import ReplayBufferView from imitation.regularization import regularizers, updaters from imitation.rewards import reward_nets +from imitation.rewards.reward_function import RewardFn from imitation.scripts.train_preference_comparisons import create_pebble_reward_fn from imitation.util import networks, util @@ -1120,3 +1125,28 @@ def test_that_trainer_improves( ) assert np.mean(trained_agent_rewards) > np.mean(novice_agent_rewards) + + +def test_trajectory_generator_raises_on_pretrain_if_not_implemented(): + class TrajectoryGeneratorTestImpl(TrajectoryGenerator): + def sample(self, steps: int) -> Sequence[TrajectoryWithRew]: + return [] + + generator = TrajectoryGeneratorTestImpl() + assert generator.has_pretraining is False + with pytest.raises(ValueError, match="should not consume any timesteps"): + generator.unsupervised_pretrain(1) + + generator.sample(1) # just to make coverage happy + + +def test_pebble_agent_trainer_expects_pebble_reward(agent, venv, rng): + reward_fn: RewardFn = lambda state, action, next, done: state + + with pytest.raises(ValueError, match="PebbleStateEntropyReward"): + PebbleAgentTrainer( + algorithm=agent, + reward_fn=reward_fn, # type: ignore[call-arg] + venv=venv, + rng=rng, + ) diff --git a/tests/policies/test_replay_buffer_wrapper.py b/tests/policies/test_replay_buffer_wrapper.py index 7c26dd2d4..7b92e64ba 100644 --- a/tests/policies/test_replay_buffer_wrapper.py +++ b/tests/policies/test_replay_buffer_wrapper.py @@ -4,7 +4,6 @@ from typing import Type from unittest.mock import Mock -import gym import numpy as np import pytest import stable_baselines3 as sb3 @@ -122,26 +121,6 @@ def test_wrapper_class(tmpdir, rng): replay_buffer_wrapper._get_samples() -class ActionIsObsEnv(gym.Env): - """Simple environment where the obs is the action.""" - - def __init__(self): - """Initialize environment.""" - super().__init__() - self.action_space = spaces.Box(np.array([0]), np.array([1])) - self.observation_space = spaces.Box(np.array([0]), np.array([1])) - - def step(self, action): - obs = action - reward = 0 - done = False - info = {} - return obs, reward, done, info - - def reset(self): - return np.array([0]) - - def test_replay_buffer_view_provides_buffered_observations(): 
space = spaces.Box(np.array([0]), np.array([5])) n_envs = 2 diff --git a/tests/scripts/test_scripts.py b/tests/scripts/test_scripts.py index 226b6b3c2..1f8a0d23d 100644 --- a/tests/scripts/test_scripts.py +++ b/tests/scripts/test_scripts.py @@ -254,6 +254,20 @@ def test_train_preference_comparisons_reward_named_config(tmpdir, named_configs) assert isinstance(run.result, dict) +def test_train_preference_comparisons_pebble_config(tmpdir): + config_updates = dict(common=dict(log_root=tmpdir)) + run = train_preference_comparisons.train_preference_comparisons_ex.run( + # make sure rl.sac named_config is called after rl.fast to overwrite + # rl_kwargs.batch_size to None + named_configs=ALGO_FAST_CONFIGS["preference_comparison"] + + ["pebble", "mountain_car_continuous"], + config_updates=config_updates, + ) + assert run.config["rl"]["rl_cls"] is stable_baselines3.SAC + assert run.status == "COMPLETED" + assert isinstance(run.result, dict) + + def test_train_dagger_main(tmpdir): with pytest.warns(None) as record: run = train_imitation.train_imitation_ex.run( diff --git a/tests/scripts/test_train_preference_comparisons.py b/tests/scripts/test_train_preference_comparisons.py index c4390dd6b..cf794fecf 100644 --- a/tests/scripts/test_train_preference_comparisons.py +++ b/tests/scripts/test_train_preference_comparisons.py @@ -52,6 +52,9 @@ def test_creates_normalized_entropy_pebble_reward(): atol=0.05, ) + # Just to make coverage happy: + reward_fn_stub(state, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER) + def reward_fn_stub(state, action, next_state, done): return state
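The control flow that the last three patches converge on is easiest to see outside the library: while unsupervised exploration is active, the reward is the k-NN state entropy computed against the replay buffer, falling back to the learned reward whenever the buffer holds too few observations, and switching to the learned reward for good once unsupervised_exploration_finish() is called. The sketch below mirrors that logic with plain torch/numpy under that reading; TwoPhaseReward, knn_entropy_stub, and the `<= k` guard are illustrative stand-ins, not imitation APIs.

import numpy as np
import torch as th


class InsufficientObservations(RuntimeError):
    """Raised when the buffer holds too few observations for a k-NN estimate."""


def knn_entropy_stub(state: np.ndarray, all_obs: np.ndarray, k: int = 5) -> np.ndarray:
    """k-NN proxy for state entropy: distance to the (k+1)-th nearest stored obs.

    Using k + 1 skips the zero self-distance when `state` already sits in the
    buffer, matching the `k + 1` passed to th.kthvalue in compute_state_entropy.
    """
    state_th = th.as_tensor(np.asarray(state), dtype=th.float32)
    buffer_th = th.as_tensor(np.asarray(all_obs), dtype=th.float32)
    if buffer_th.shape[0] <= k:  # simplified guard; the patch checks `< k`
        raise InsufficientObservations("not enough stored observations")
    dists = th.cdist(state_th, buffer_th)  # (batch, buffer) pairwise L2 distances
    return th.kthvalue(dists, k + 1, dim=1).values.numpy()


class TwoPhaseReward:
    """Entropy reward during exploration, learned reward afterwards."""

    def __init__(self, entropy_fn, learned_fn):
        self.entropy_fn = entropy_fn
        self.learned_fn = learned_fn
        self.exploring = True

    def finish_exploration(self) -> None:
        self.exploring = False

    def __call__(self, state: np.ndarray, all_obs: np.ndarray) -> np.ndarray:
        if self.exploring:
            try:
                return self.entropy_fn(state, all_obs)
            except InsufficientObservations:
                # Same fallback as PebbleStateEntropyReward.__call__: use the
                # learned reward until the buffer is large enough.
                return self.learned_fn(state)
        return self.learned_fn(state)


if __name__ == "__main__":
    reward = TwoPhaseReward(
        lambda s, buf: knn_entropy_stub(s, buf, k=3),
        lambda s: np.zeros(len(s)),  # stand-in for the learned reward
    )
    batch = np.random.rand(5, 4)
    print(reward(batch, np.random.rand(2, 4)))   # tiny buffer -> learned fallback
    print(reward(batch, np.random.rand(50, 4)))  # full buffer -> entropy reward
    reward.finish_exploration()
    print(reward(batch, np.random.rand(50, 4)))  # after finish -> learned reward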