Skip to content

Commit 88371e1

Browse files
author
Jan Michelfeit
committed
#625 PebbleStateEntropyReward supports the initial phase before replay buffer is filled
1 parent f957baf commit 88371e1

File tree

2 files changed

+109
-67
lines changed

2 files changed

+109
-67
lines changed
Lines changed: 54 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
1+
from enum import Enum, auto
12
from typing import Tuple
23

34
import numpy as np
45
import torch as th
5-
from gym.vector.utils import spaces
6-
from stable_baselines3.common.preprocessing import get_obs_shape
76

87
from imitation.policies.replay_buffer_wrapper import (
98
ReplayBufferView,
@@ -14,27 +13,53 @@
1413
from imitation.util.networks import RunningNorm
1514

1615

16+
class PebbleRewardPhase(Enum):
    """Phase of the PEBBLE algorithm a PebbleStateEntropyReward is in.

    Each member selects a different reward behavior; transitions between
    them are driven by PebbleStateEntropyReward's explicit state-change
    methods.
    """

    LEARNING_START = auto()  # gathering samples so entropy can be estimated
    UNSUPERVISED_EXPLORATION = auto()  # reward is the state entropy estimate
    POLICY_AND_REWARD_LEARNING = auto()  # reward comes from the learned model
25+
26+
1727
class PebbleStateEntropyReward(ReplayBufferAwareRewardFn):
18-
# TODO #625: get rid of the observation_space parameter
28+
"""
29+
Reward function for implementation of the PEBBLE learning algorithm
30+
(https://arxiv.org/pdf/2106.05091.pdf).
31+
32+
The rewards returned by this function go through the three phases
33+
defined in PebbleRewardPhase. To transition between these phases,
34+
unsupervised_exploration_start() and unsupervised_exploration_finish()
35+
need to be called.
36+
37+
The second phase (UNSUPERVISED_EXPLORATION) also requires that a buffer
38+
with observations to compare against is supplied with set_replay_buffer()
39+
or on_replay_buffer_initialized().
40+
41+
Args:
42+
learned_reward_fn: The learned reward function used after unsupervised
43+
exploration is finished
44+
nearest_neighbor_k: Parameter for entropy computation (see
45+
compute_state_entropy())
46+
"""
47+
1948
# TODO #625: parametrize nearest_neighbor_k
2049
def __init__(
    self,
    learned_reward_fn: RewardFn,
    nearest_neighbor_k: int = 5,
):
    """Create the reward function, starting in the LEARNING_START phase.

    Args:
        learned_reward_fn: reward function to delegate to once unsupervised
            exploration is finished
        nearest_neighbor_k: k used for the entropy computation (see
            compute_state_entropy())
    """
    self.state = PebbleRewardPhase.LEARNING_START
    self.trained_reward_fn = learned_reward_fn
    self.nearest_neighbor_k = nearest_neighbor_k
    # Running statistics used to normalize entropy rewards.
    # TODO support n_envs > 1
    self.entropy_stats = RunningNorm(1)
    # Both remain None until set_replay_buffer() (or
    # on_replay_buffer_initialized()) supplies them; entropy rewards
    # cannot be computed before that.
    self.replay_buffer_view = None
    self.obs_shape = None
3863

3964
def on_replay_buffer_initialized(self, replay_buffer: ReplayBufferRewardWrapper):
4065
self.set_replay_buffer(replay_buffer.buffer_view, replay_buffer.obs_shape)
@@ -43,8 +68,13 @@ def set_replay_buffer(self, replay_buffer: ReplayBufferView, obs_shape: Tuple):
4368
self.replay_buffer_view = replay_buffer
4469
self.obs_shape = obs_shape
4570

46-
def on_unsupervised_exploration_finished(self):
47-
self.unsupervised_exploration_active = False
71+
def unsupervised_exploration_start(self):
72+
assert self.state == PebbleRewardPhase.LEARNING_START
73+
self.state = PebbleRewardPhase.UNSUPERVISED_EXPLORATION
74+
75+
def unsupervised_exploration_finish(self):
76+
assert self.state == PebbleRewardPhase.UNSUPERVISED_EXPLORATION
77+
self.state = PebbleRewardPhase.POLICY_AND_REWARD_LEARNING
4878

4979
def __call__(
5080
self,
@@ -53,19 +83,20 @@ def __call__(
5383
next_state: np.ndarray,
5484
done: np.ndarray,
5585
) -> np.ndarray:
56-
if self.unsupervised_exploration_active:
86+
if self.state == PebbleRewardPhase.UNSUPERVISED_EXPLORATION:
5787
return self._entropy_reward(state)
5888
else:
5989
return self.trained_reward_fn(state, action, next_state, done)
6090

6191
def _entropy_reward(self, state):
62-
# TODO: should this work with torch instead of numpy internally?
63-
# (The RewardFn protocol requires numpy)
92+
if self.replay_buffer_view is None:
93+
raise ValueError(
94+
"Replay buffer must be supplied before entropy reward can be used"
95+
)
96+
6497
all_observations = self.replay_buffer_view.observations
6598
# ReplayBuffer sampling flattens the venv dimension, let's adapt to that
66-
all_observations = all_observations.reshape(
67-
(-1, *state.shape[1:]) # TODO #625: fix self.obs_shape
68-
)
99+
all_observations = all_observations.reshape((-1, *self.obs_shape))
69100
# TODO #625: deal with the conversion back and forth between np and torch
70101
entropies = util.compute_state_entropy(
71102
th.tensor(state),
@@ -82,6 +113,4 @@ def __getstate__(self):
82113

83114
def __setstate__(self, state):
    """Restore from pickle.

    The replay buffer view is reset to None; it must be re-attached with
    set_replay_buffer() before entropy rewards can be computed again.
    """
    self.__dict__.update(state)
    self.replay_buffer_view = None

tests/algorithms/pebble/test_entropy_reward.py

Lines changed: 55 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -11,48 +11,87 @@
1111
from imitation.util import util
1212

1313
SPACE = Discrete(4)
14-
PLACEHOLDER = np.empty(get_obs_shape(SPACE))
14+
OBS_SHAPE = get_obs_shape(SPACE)
15+
PLACEHOLDER = np.empty(OBS_SHAPE)
1516

1617
BUFFER_SIZE = 20
1718
K = 4
1819
BATCH_SIZE = 8
1920
VENVS = 2
2021

2122

22-
def test_pebble_entropy_reward_returns_entropy(rng):
23-
obs_shape = get_obs_shape(SPACE)
24-
all_observations = rng.random((BUFFER_SIZE, VENVS, *obs_shape))
23+
def test_pebble_entropy_reward_function_returns_learned_reward_initially():
    """In the initial LEARNING_START phase, the learned reward is returned."""
    expected_reward = np.ones(1)
    learned_reward_mock = Mock()
    learned_reward_mock.return_value = expected_reward
    # Fix: the constructor signature is (learned_reward_fn, nearest_neighbor_k);
    # passing SPACE would have been silently bound to nearest_neighbor_k.
    reward_fn = PebbleStateEntropyReward(learned_reward_mock)

    # Act
    observations = np.ones((BATCH_SIZE, *OBS_SHAPE))
    reward = reward_fn(observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER)

    # Assert
    assert reward == expected_reward
    learned_reward_mock.assert_called_once_with(
        observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER
    )
38+
39+
40+
def test_pebble_entropy_reward_function_returns_learned_reward_after_pre_training():
    """After both phase transitions, the learned reward is returned again."""
    expected_reward = np.ones(1)
    learned_reward_mock = Mock()
    learned_reward_mock.return_value = expected_reward
    # Fix: the constructor signature is (learned_reward_fn, nearest_neighbor_k);
    # passing SPACE would have been silently bound to nearest_neighbor_k.
    reward_fn = PebbleStateEntropyReward(learned_reward_mock)
    # move all the way to the last state
    reward_fn.unsupervised_exploration_start()
    reward_fn.unsupervised_exploration_finish()

    # Act
    observations = np.ones((BATCH_SIZE, *OBS_SHAPE))
    reward = reward_fn(observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER)

    # Assert
    assert reward == expected_reward
    learned_reward_mock.assert_called_once_with(
        observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER
    )
58+
59+
60+
def test_pebble_entropy_reward_returns_entropy_for_pretraining(rng):
    """During unsupervised exploration the reward is normalized state entropy."""
    all_observations = rng.random((BUFFER_SIZE, VENVS, *OBS_SHAPE))

    # Fix: __init__ takes (learned_reward_fn, nearest_neighbor_k) only; the
    # stray SPACE argument made this call raise TypeError.
    reward_fn = PebbleStateEntropyReward(Mock(), K)
    reward_fn.set_replay_buffer(
        ReplayBufferView(all_observations, lambda: slice(None)), OBS_SHAPE
    )
    reward_fn.unsupervised_exploration_start()

    # Act
    observations = th.rand((BATCH_SIZE, *OBS_SHAPE))
    reward = reward_fn(observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER)

    # Assert
    expected = util.compute_state_entropy(
        observations, all_observations.reshape(-1, *OBS_SHAPE), K
    )
    expected_normalized = reward_fn.entropy_stats.normalize(
        th.as_tensor(expected)
    ).numpy()
    np.testing.assert_allclose(reward, expected_normalized)
4381

4482

45-
def test_pebble_entropy_reward_returns_normalized_values():
83+
def test_pebble_entropy_reward_returns_normalized_values_for_pretraining():
4684
with patch("imitation.util.util.compute_state_entropy") as m:
4785
# mock entropy computation so that we can test only stats collection in this test
4886
m.side_effect = lambda obs, all_obs, k: obs
4987

5088
reward_fn = PebbleStateEntropyReward(Mock(), SPACE, K)
51-
all_observations = np.empty((BUFFER_SIZE, VENVS, *get_obs_shape(SPACE)))
89+
all_observations = np.empty((BUFFER_SIZE, VENVS, *OBS_SHAPE))
5290
reward_fn.set_replay_buffer(
5391
ReplayBufferView(all_observations, lambda: slice(None)),
54-
get_obs_shape(SPACE),
92+
OBS_SHAPE,
5593
)
94+
reward_fn.unsupervised_exploration_start()
5695

5796
dim = 8
5897
shift = 3
@@ -77,51 +116,25 @@ def test_pebble_entropy_reward_returns_normalized_values():
77116

78117

79118
def test_pebble_entropy_reward_can_pickle():
    """A pickled and restored reward function produces identical rewards."""
    all_observations = np.empty((BUFFER_SIZE, VENVS, *OBS_SHAPE))
    replay_buffer = ReplayBufferView(all_observations, lambda: slice(None))

    obs1 = np.random.rand(VENVS, *OBS_SHAPE)
    # Fix: __init__ takes (learned_reward_fn, nearest_neighbor_k) only; the
    # stray SPACE argument made this call raise TypeError.
    reward_fn = PebbleStateEntropyReward(reward_fn_stub, K)
    reward_fn.set_replay_buffer(replay_buffer, OBS_SHAPE)
    reward_fn(obs1, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER)

    # Act
    pickled = pickle.dumps(reward_fn)
    reward_fn_deserialized = pickle.loads(pickled)
    # The buffer view is dropped on pickling, so it must be re-attached.
    reward_fn_deserialized.set_replay_buffer(replay_buffer, OBS_SHAPE)

    # Assert
    obs2 = np.random.rand(VENVS, *OBS_SHAPE)
    expected_result = reward_fn(obs2, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER)
    actual_result = reward_fn_deserialized(obs2, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER)
    np.testing.assert_allclose(actual_result, expected_result)
98137

99138

100-
def test_pebble_entropy_reward_function_switches_to_inner():
101-
obs_shape = get_obs_shape(SPACE)
102-
103-
expected_reward = np.ones(1)
104-
reward_fn_mock = Mock()
105-
reward_fn_mock.return_value = expected_reward
106-
reward_fn = PebbleStateEntropyReward(reward_fn_mock, SPACE)
107-
108-
# Act
109-
reward_fn.on_unsupervised_exploration_finished()
110-
observations = np.ones((BATCH_SIZE, *obs_shape))
111-
reward = reward_fn(observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER)
112-
113-
# Assert
114-
assert reward == expected_reward
115-
reward_fn_mock.assert_called_once_with(
116-
observations, PLACEHOLDER, PLACEHOLDER, PLACEHOLDER
117-
)
118-
119-
120-
def reward_fn_stub(
121-
self,
122-
state: np.ndarray,
123-
action: np.ndarray,
124-
next_state: np.ndarray,
125-
done: np.ndarray,
126-
) -> np.ndarray:
139+
def reward_fn_stub(state, action, next_state, done):
    """Trivial RewardFn used as a picklable stand-in: echoes the state back."""
    del action, next_state, done  # accepted only for signature compatibility
    return state

0 commit comments

Comments
 (0)