Commit dcbfbc9

Implementing Rainbow model.
1 parent 56aa154 commit dcbfbc9

18 files changed: +434 -43 lines changed

README.md (+1)

@@ -117,6 +117,7 @@ that are ready to run and easy to modify for other similar usecases:
 - N-Step Bellman updates
 - Distributional Q-Learning
 - Noisy Networks for Exploration
+- Rainbow (combination of the above)


 # Examples

examples-configs/rl/atari/dqn_rainbow_param/asterix_rp_dqn_distributional.yaml (+1 -2)

@@ -39,8 +39,7 @@ reinforcer:
     replay_buffer:
       name: vel.rl.buffers.circular_replay_buffer

-      # buffer_initial_size: 80_000 # How many samples we need in the buffer before we start using replay buffer
-      buffer_initial_size: 200_000 # How many samples we need in the buffer before we start using replay buffer
+      buffer_initial_size: 80_000 # How many samples we need in the buffer before we start using replay buffer
       buffer_capacity: 1_000_000

       # Because env has a framestack already built-in, save memory by encoding only last frames in the replay buffer

examples-configs/rl/atari/dqn_rainbow_param/asterix_rp_dqn_raw.yaml (+1 -2)

@@ -36,8 +36,7 @@ reinforcer:
     replay_buffer:
       name: vel.rl.buffers.circular_replay_buffer

-      # buffer_initial_size: 80_000 # How many samples we need in the buffer before we start using replay buffer
-      buffer_initial_size: 200_000 # How many samples we need in the buffer before we start using replay buffer
+      buffer_initial_size: 80_000 # How many samples we need in the buffer before we start using replay buffer
       buffer_capacity: 1_000_000

       # Because env has a framestack already built-in, save memory by encoding only last frames in the replay buffer

examples-configs/rl/atari/dqn_rainbow_param/asteroids_rp_dqn_noisynet.yaml (+1 -2)

@@ -43,8 +43,7 @@ reinforcer:
     replay_buffer:
       name: vel.rl.buffers.circular_replay_buffer

-      # buffer_initial_size: 80_000 # How many samples we need in the buffer before we start using replay buffer
-      buffer_initial_size: 200_000 # How many samples we need in the buffer before we start using replay buffer
+      buffer_initial_size: 80_000 # How many samples we need in the buffer before we start using replay buffer
       buffer_capacity: 1_000_000

       # Because env has a framestack already built-in, save memory by encoding only last frames in the replay buffer

examples-configs/rl/atari/dqn_rainbow_param/asteroids_rp_dqn_raw.yaml (+1 -2)

@@ -36,8 +36,7 @@ reinforcer:
     replay_buffer:
       name: vel.rl.buffers.circular_replay_buffer

-      # buffer_initial_size: 80_000 # How many samples we need in the buffer before we start using replay buffer
-      buffer_initial_size: 200_000 # How many samples we need in the buffer before we start using replay buffer
+      buffer_initial_size: 80_000 # How many samples we need in the buffer before we start using replay buffer
       buffer_capacity: 1_000_000

       # Because env has a framestack already built-in, save memory by encoding only last frames in the replay buffer

examples-configs/rl/atari/dqn_rainbow_param/atlantis_rp_dqn_nstep.yaml (+1 -2)

@@ -40,8 +40,7 @@ reinforcer:
     replay_buffer:
       name: vel.rl.buffers.circular_replay_buffer

-      # buffer_initial_size: 80_000 # How many samples we need in the buffer before we start using replay buffer
-      buffer_initial_size: 200_000 # How many samples we need in the buffer before we start using replay buffer
+      buffer_initial_size: 80_000 # How many samples we need in the buffer before we start using replay buffer
       buffer_capacity: 1_000_000

       # Because env has a framestack already built-in, save memory by encoding only last frames in the replay buffer

examples-configs/rl/atari/dqn_rainbow_param/atlantis_rp_dqn_raw.yaml (+1 -2)

@@ -36,8 +36,7 @@ reinforcer:
     replay_buffer:
       name: vel.rl.buffers.circular_replay_buffer

-      # buffer_initial_size: 80_000 # How many samples we need in the buffer before we start using replay buffer
-      buffer_initial_size: 200_000 # How many samples we need in the buffer before we start using replay buffer
+      buffer_initial_size: 80_000 # How many samples we need in the buffer before we start using replay buffer
       buffer_capacity: 1_000_000

       # Because env has a framestack already built-in, save memory by encoding only last frames in the replay buffer
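The six config changes above all restore buffer_initial_size to 80_000 (previously 200_000), i.e. the number of transitions that must be collected before the replay buffer starts serving samples for training. As a rough illustration only (a hypothetical WarmupReplayBuffer, not the vel.rl.buffers.circular_replay_buffer implementation), such a warm-up gate can be sketched like this:

# Hypothetical sketch of a circular replay buffer with a warm-up threshold;
# names and structure are illustrative, not taken from vel.
import random
from collections import deque


class WarmupReplayBuffer:
    def __init__(self, capacity=1_000_000, initial_size=80_000):
        self.storage = deque(maxlen=capacity)  # oldest transitions are overwritten once full
        self.initial_size = initial_size

    def store(self, transition):
        self.storage.append(transition)

    def is_ready(self):
        # Sampling only starts once the warm-up threshold has been reached
        return len(self.storage) >= self.initial_size

    def sample(self, batch_size):
        assert self.is_ready(), "replay buffer is still warming up"
        return random.sample(list(self.storage), batch_size)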
New file (+102): 'breakout_rainbow' Rainbow configuration

name: 'breakout_rainbow'


env:
  name: vel.rl.env.classic_atari
  game: 'BreakoutNoFrameskip-v4'
  settings:
    max_episode_frames: 108_000


vec_env:
  name: vel.rl.vecenv.dummy
  frame_history: 4 # How many stacked frames go into a single observation


model:
  name: vel.rl.models.q_rainbow_model

  atoms: 51 # 51 bins for Distributional DQN
  vmin: -10.0
  vmax: 10.0

  initial_std_dev: 0.5
  factorized_noise: true

  input_block:
    name: vel.modules.input.image_to_tensor

  backbone:
    name: vel.rl.models.backbone.double_noisy_nature_cnn
    input_width: 84
    input_height: 84
    input_channels: 4 # The same as frame_history

    initial_std_dev: 0.5
    factorized_noise: true


reinforcer:
  name: vel.rl.reinforcers.buffered_off_policy_iteration_reinforcer

  env_roller:
    name: vel.rl.env_roller.transition_replay_env_roller

    # N-Step Q-Learning
    forward_steps: 3
    discount_factor: 0.99

    replay_buffer:
      name: vel.rl.buffers.prioritized_circular_replay_buffer

      buffer_initial_size: 80_000 # How many samples we need in the buffer before we start using replay buffer
      buffer_capacity: 1_000_000

      # Because env has a framestack already built-in, save memory by encoding only last frames in the replay buffer
      frame_stack_compensation: true
      frame_history: 4 # How many stacked frames go into a single observation

      priority_exponent: 0.5
      priority_weight:
        name: vel.schedules.linear
        initial_value: 0.4
        final_value: 1.0

      priority_epsilon: 1.0e-6

  algo:
    name: vel.rl.algo.distributional_dqn
    double_dqn: true

    target_update_frequency: 32_000 # After how many batches to update the target network
    max_grad_norm: 0.5

    discount_factor: 0.99

  rollout_steps: 4 # How many environment steps (per env) to perform per batch of training
  training_steps: 32 # How many environment steps (per env) to perform per training round
  parallel_envs: 1 # Roll out only one env in parallel, just like in DeepMind paper


optimizer:
  name: vel.optimizers.adam
  lr: 6.25e-05
  epsilon: 1.5e-4


commands:
  train:
    name: vel.rl.commands.rl_train_command
    total_frames: 1.1e7 # 11M
    batches_per_epoch: 2500

  record:
    name: vel.rl.commands.record_movie_command
    takes: 10
    videoname: 'breakout_rainbow_vid_{:04}.avi'
    fps: 15

  evaluate:
    name: vel.rl.commands.evaluate_env_command
    parallel_envs: 12
    takes: 20
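In this config, prioritized replay uses priority_exponent 0.5 and anneals the importance-sampling exponent (priority_weight) linearly from 0.4 to 1.0 over training, as in the Rainbow paper. A rough sketch of how such a schedule and the resulting importance-sampling weights interact (illustrative only, not the vel.schedules.linear implementation):

# Illustrative sketch; function names are hypothetical, not vel APIs.
import numpy as np


def linear_schedule(progress, initial_value=0.4, final_value=1.0):
    """Interpolate linearly between initial and final value as training progress goes 0 -> 1."""
    return initial_value + (final_value - initial_value) * progress


def importance_weights(priorities, beta, alpha=0.5, epsilon=1.0e-6):
    """Importance-sampling weights for prioritized replay (Schaul et al., 2016)."""
    probs = (priorities + epsilon) ** alpha        # alpha corresponds to priority_exponent
    probs = probs / probs.sum()
    weights = (len(priorities) * probs) ** (-beta)
    return weights / weights.max()                 # normalize so the largest weight is 1.0


beta = linear_schedule(progress=0.25)                          # a quarter of the way through training
weights = importance_weights(np.array([2.0, 0.5, 1.0]), beta)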

vel/rl/algo/distributional_dqn.py (+1 -1)

@@ -180,7 +180,7 @@ def metrics(self) -> list:


 def create(model: ModelFactory, discount_factor: float, target_update_frequency: int,
-           max_grad_norm: float, double_dqn: bool=False):
+           max_grad_norm: float, double_dqn: bool = False):
     """ Vel factory function """
     return DistributionalDeepQLearning(
         model_factory=model,
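For context, vel.rl.algo.distributional_dqn implements distributional (C51) Q-learning, which the Rainbow config above parametrizes with atoms: 51, vmin: -10.0, vmax: 10.0. A rough sketch of the categorical target projection that this family of algorithms performs (illustrative only, not the code from this file):

# Illustrative sketch of the C51 target projection (Bellemare et al., 2017);
# not the vel implementation.
import torch


def project_distribution(next_probs, rewards, dones, atoms=51, vmin=-10.0, vmax=10.0, gamma=0.99):
    """Project r + gamma * z onto the fixed support of `atoms` bins in [vmin, vmax]."""
    batch_size = rewards.size(0)
    delta_z = (vmax - vmin) / (atoms - 1)
    support = torch.linspace(vmin, vmax, atoms)

    # Bellman-update the support, then clamp it back into the representable range
    tz = (rewards.unsqueeze(1) + gamma * (1.0 - dones).unsqueeze(1) * support).clamp(vmin, vmax)
    b = (tz - vmin) / delta_z            # fractional atom index of each projected point
    lower = b.floor().long()
    upper = b.ceil().long()
    # Avoid losing probability mass when b lands exactly on an atom
    lower[(upper > 0) & (lower == upper)] -= 1
    upper[(lower < atoms - 1) & (lower == upper)] += 1

    # Distribute each atom's probability mass to its two neighbouring bins
    projected = torch.zeros(batch_size, atoms)
    projected.scatter_add_(1, lower, next_probs * (upper.float() - b))
    projected.scatter_add_(1, upper, next_probs * (b - lower.float()))
    return projected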
New file (+121): DoubleNoisyNatureCnn backbone (the vel.rl.models.backbone.double_noisy_nature_cnn module referenced by the config above)

"""
Code based loosely on implementation:
https://github.com/openai/baselines/blob/master/baselines/ppo2/policies.py

Under MIT license.
"""
import numpy as np

import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F

import vel.util.network as net_util

from vel.api import LinearBackboneModel, ModelFactory
from vel.rl.modules.noisy_linear import NoisyLinear


class DoubleNoisyNatureCnn(LinearBackboneModel):
    """
    Neural network as defined in the paper 'Human-level control through deep reinforcement learning'
    but with two separate heads and "noisy" linear layer.
    """
    def __init__(self, input_width, input_height, input_channels, output_dim=512, initial_std_dev=0.4,
                 factorized_noise=True):
        super().__init__()

        self._output_dim = output_dim

        self.conv1 = nn.Conv2d(
            in_channels=input_channels,
            out_channels=32,
            kernel_size=(8, 8),
            stride=4
        )

        self.conv2 = nn.Conv2d(
            in_channels=32,
            out_channels=64,
            kernel_size=(4, 4),
            stride=2
        )

        self.conv3 = nn.Conv2d(
            in_channels=64,
            out_channels=64,
            kernel_size=(3, 3),
            stride=1
        )

        self.final_width = net_util.convolutional_layer_series(input_width, [
            (8, 0, 4),
            (4, 0, 2),
            (3, 0, 1)
        ])

        self.final_height = net_util.convolutional_layer_series(input_height, [
            (8, 0, 4),
            (4, 0, 2),
            (3, 0, 1)
        ])

        self.linear_layer_one = NoisyLinear(
            # 64 is the number of channels of the last conv layer
            self.final_width * self.final_height * 64,
            self.output_dim,
            initial_std_dev=initial_std_dev,
            factorized_noise=factorized_noise
        )

        self.linear_layer_two = NoisyLinear(
            # 64 is the number of channels of the last conv layer
            self.final_width * self.final_height * 64,
            self.output_dim,
            initial_std_dev=initial_std_dev,
            factorized_noise=factorized_noise
        )

    @property
    def output_dim(self) -> int:
        """ Final dimension of model output """
        return self._output_dim

    def reset_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                # init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                init.orthogonal_(m.weight, gain=np.sqrt(2))
                init.constant_(m.bias, 0.0)
            elif isinstance(m, nn.Linear):
                # init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                init.orthogonal_(m.weight, gain=np.sqrt(2))
                init.constant_(m.bias, 0.0)
            elif isinstance(m, NoisyLinear):
                m.reset_weights()

    def forward(self, image):
        result = image
        result = F.relu(self.conv1(result))
        result = F.relu(self.conv2(result))
        result = F.relu(self.conv3(result))
        flattened = result.view(result.size(0), -1)

        output_one = F.relu(self.linear_layer_one(flattened))
        output_two = F.relu(self.linear_layer_two(flattened))

        return output_one, output_two


def create(input_width, input_height, input_channels=1, output_dim=512, initial_std_dev=0.4, factorized_noise=True):
    """ Vel factory function """
    def instantiate(**_):
        return DoubleNoisyNatureCnn(
            input_width=input_width, input_height=input_height, input_channels=input_channels,
            output_dim=output_dim, initial_std_dev=initial_std_dev, factorized_noise=factorized_noise
        )

    return ModelFactory.generic(instantiate)


DoubleNoisyNatureCnnFactory = create
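A minimal usage sketch of this backbone (assuming PyTorch and this repository's vel package are importable); the two output streams are presumably consumed by the two streams of the Rainbow model's head:

# Usage sketch; assumes torch and the vel package from this repository are installed.
import torch

from vel.rl.models.backbone.double_noisy_nature_cnn import DoubleNoisyNatureCnn

backbone = DoubleNoisyNatureCnn(
    input_width=84, input_height=84, input_channels=4,
    output_dim=512, initial_std_dev=0.5, factorized_noise=True
)
backbone.reset_weights()

frames = torch.zeros(1, 4, 84, 84)          # one stacked 84x84x4 Atari observation
stream_one, stream_two = backbone(frames)   # two (1, 512) feature vectors
print(stream_one.shape, stream_two.shape)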

vel/rl/models/q_dueling_model.py (+2 -2)

@@ -4,7 +4,7 @@
 from vel.api import LinearBackboneModel, Model, ModelFactory, BackboneModel
 from vel.modules.input.identity import IdentityFactory
 from vel.rl.api import Rollout, Evaluator
-from vel.rl.modules.dueling_q_head import DuelingQHead
+from vel.rl.modules.q_dueling_head import QDuelingHead
 from vel.rl.models.q_model import QModelEvaluator


@@ -21,7 +21,7 @@ def __init__(self, input_block: BackboneModel, backbone: LinearBackboneModel, ac

         self.input_block = input_block
         self.backbone = backbone
-        self.q_head = DuelingQHead(input_dim=backbone.output_dim, action_space=action_space)
+        self.q_head = QDuelingHead(input_dim=backbone.output_dim, action_space=action_space)

     def forward(self, observations):
         """ Model forward pass """

vel/rl/models/q_noisy_model.py (+2 -2)

@@ -5,7 +5,7 @@
 from vel.modules.input.identity import IdentityFactory
 from vel.rl.api import Rollout, RlModel, Evaluator
 from vel.rl.models.q_model import QModelEvaluator
-from vel.rl.modules.noisy_q_head import NoisyQHead
+from vel.rl.modules.q_noisy_head import QNoisyHead


 class NoisyQModel(RlModel):
@@ -22,7 +22,7 @@ def __init__(self, input_block: BackboneModel, backbone: LinearBackboneModel, ac

         self.input_block = input_block
         self.backbone = backbone
-        self.q_head = NoisyQHead(
+        self.q_head = QNoisyHead(
             input_dim=backbone.output_dim, action_space=action_space, initial_std_dev=initial_std_dev,
             factorized_noise=factorized_noise
         )
