google
diff --git a/hackable_diffusion/lib/architecture/attention.py b/hackable_diffusion/lib/architecture/attention.py
Lines changed: 14 additions & 0 deletions
diff --git a/hackable_diffusion/lib/architecture/attention_test.py b/hackable_diffusion/lib/architecture/attention_test.py
Lines changed: 96 additions & 0 deletions
diff --git a/hackable_diffusion/lib/architecture/mlp_blocks.py b/hackable_diffusion/lib/architecture/mlp_blocks.py
Lines changed: 104 additions & 39 deletions
@@ -127,6 +127,8 @@ def _dot_product_attention(
     rescale: Float["..."],
     *,
     mask: Bool["batch sequence_key"] | None = None,
+    dropout_rate: float = 0.0,
+    is_training: bool = True,
 ) -> Float["batch sequence_query head*dim"]:
   """Performs dot product attention.
 
@@ -137,6 +139,8 @@ def _dot_product_attention(
     rescale: Rescale factor for the attention scores.
     mask: Mask tensor. Mask is True for tokens we want to keep and False for
       tokens we want to mask. If None, no masking is performed.
+    dropout_rate: The dropout rate for the attention weights.
+    is_training: Whether the model is in training mode.
 
   Returns:
     The output tensor.
@@ -156,6 +160,11 @@ def _dot_product_attention(
   # Softmax and attention weights
   attn_weights = _stable_softmax(logits=attn_logits)
 
+  if dropout_rate > 0.0:
+    attn_weights = nn.Dropout(rate=dropout_rate)(
+        attn_weights, deterministic=not is_training
+    )
+
   # Calculate attention output
   attn_output = jnp.einsum("bhts,bhsd->bhtd", attn_weights, v)
 
@@ -194,6 +203,7 @@ class MultiHeadAttention(nn.Module):
       use_rope is True.
     zero_init_output: If True, the kernel of the final output projection layer
       is initialized to zeros.
+    dropout_rate: The dropout rate for the attention weights.
     dtype: The data type of the computation.
   """
 
@@ -203,6 +213,7 @@ class MultiHeadAttention(nn.Module):
   use_rope: bool = False
   rope_position_type: RoPEPositionType = RoPEPositionType.SQUARE
   zero_init_output: bool = False
+  dropout_rate: float = 0.0
   dtype: DType = jnp.float32
 
   def setup(self):
@@ -226,6 +237,7 @@ def __call__(
       c: Float["batch sequence2 dim2"] | None,
       *,
       mask: Bool["batch sequence1|sequence2"] | None = None,
+      is_training: bool = True,
   ) -> Float["batch sequence1 dim1"]:
     """Computes multi-head attention.
 
@@ -319,6 +331,8 @@ def __call__(
         v=v,
         rescale=scale,
         mask=mask,
+        dropout_rate=self.dropout_rate,
+        is_training=is_training,
     )
 
     attn_output = nn.Dense(
 
@@ -408,6 +408,102 @@ def test_multi_head_attention_invalid_mask_shape_raises_error(
     ):
       module.init(self.rng, self.x, c, mask=invalid_mask)
 
+  # MARK: Dropout Tests
+
+  def test_multi_head_attention_dropout_disabled_during_evaluation(self):
+    """Verifies dropout is inactive when is_training=False (evaluation mode)."""
+    rng1, rng2 = jax.random.split(self.rng)
+    x_rand = jax.random.normal(
+        rng1, (self.batch_size, self.seq_len_q, self.dim)
+    )
+
+    # Dropout owns no parameters, so both modules can share one variable tree.
+    module = attention.MultiHeadAttention(
+        num_heads=self.num_heads,
+        dropout_rate=0.5,
+    )
+    baseline = attention.MultiHeadAttention(
+        num_heads=self.num_heads,
+        dropout_rate=0.0,
+    )
+    variables = baseline.init(rng2, x_rand, c=None, is_training=False)
+
+    # No "dropout" RNG is supplied: evaluation mode must not need one, and the
+    # result must equal the output of the dropout-free module.
+    output_eval = module.apply(variables, x_rand, c=None, is_training=False)
+    output_ref = baseline.apply(variables, x_rand, c=None, is_training=False)
+
+    np.testing.assert_allclose(
+        output_eval, output_ref, atol=1e-6
+    )
+
+  def test_multi_head_attention_dropout_active_during_training(self):
+    """Checks that distinct dropout keys yield distinct training outputs.
+
+    Both passes share inputs and variables; only the "dropout" key differs.
+    """
+    # One key each for the inputs, the parameters and the two dropout masks.
+    data_key, init_key, *dropout_keys = jax.random.split(self.rng, 4)
+    inputs = jax.random.normal(
+        data_key, (self.batch_size, self.seq_len_q, self.dim)
+    )
+
+    attn = attention.MultiHeadAttention(
+        num_heads=self.num_heads,
+        dropout_rate=0.5,
+    )
+    params = attn.init(init_key, inputs, c=None)
+
+    def run_training_pass(dropout_key):
+      # An active nn.Dropout layer draws its mask from the "dropout" RNG
+      # stream, which has to be supplied explicitly through `rngs`.
+      return attn.apply(
+          params,
+          inputs,
+          c=None,
+          is_training=True,
+          rngs={"dropout": dropout_key},
+      )
+
+    first_out, second_out = (run_training_pass(k) for k in dropout_keys)
+
+    # The two passes used different keys and therefore different dropout
+    # masks, so their outputs should not agree to within tolerance.
+    outputs_match = jnp.allclose(first_out, second_out, atol=1e-5)
+    self.assertFalse(outputs_match)
+
+  def test_multi_head_attention_dropout_scales_retained_activations(self):
+    """Verifies training dropout rescales kept weights by 1 / (1 - rate).
+
+    Everything downstream of the attention weights is linear, so inverted
+    dropout must leave the expected output equal to the evaluation output.
+    Without the rescaling the mean would shrink by a factor of (1 - rate).
+    """
+    rate = 0.5
+    module = attention.MultiHeadAttention(
+        num_heads=self.num_heads,
+        dropout_rate=rate,
+    )
+
+    rng1, rng2, rng_dropout = jax.random.split(self.rng, 3)
+    x_rand = jax.random.normal(
+        rng1, (self.batch_size, self.seq_len_q, self.dim)
+    )
+    variables = module.init(rng2, x_rand, c=None, is_training=False)
+    output_eval = module.apply(variables, x_rand, c=None, is_training=False)
+
+    def apply_train(key):
+      return module.apply(
+          variables, x_rand, c=None, is_training=True, rngs={"dropout": key}
+      )
+
+    # Average over many independent masks to estimate the expected output.
+    keys = jax.random.split(rng_dropout, 256)
+    mean_train = jnp.mean(jax.vmap(apply_train)(keys), axis=0)
+    error = float(jnp.linalg.norm(mean_train - output_eval))
+    # Unscaled dropout would leave an error of about rate * ||output_eval||.
+    self.assertLess(error, 0.5 * rate * float(jnp.linalg.norm(output_eval)))
+
 
 if __name__ == "__main__":
   absltest.main()
@@ -88,67 +88,132 @@ def __call__(
 
 
 ################################################################################
-# MARK: SwiGLU
+# MARK: LinearSwiGLU
 ################################################################################
 
 
-class SwiGLU(nn.Module):
-  """SwiGLU feed-forward network.
+class LinearSwiGLU(nn.Module):
+  """A Dense layer variant that outputs SwiGLU gating directly.
 
   A gated feed-forward network using SiLU (Swish) activation for the gate,
   following "GLU Variants Improve Transformer" (Shazeer, 2020):
   https://arxiv.org/abs/2002.05202
 
-  The forward pass is:
-
-    gate_and_val = x @ W_up           # (*, hidden_size) -> (*, ff_size * 2)
-    val, gate = split(gate_and_val)   # (*, ff_size) each
-    x = val * SiLU(gate)              # (*, ff_size)
-    x = dropout(x)
-    x = x @ W_down                    # (*, ff_size) -> (*, hidden_size)
-
-  Attributes:
-    hidden_size: Output dimension (residual stream width).
-    ff_size: Intermediate dimension (before gating).
-    zero_init_output: If True, the down-projection kernel is initialized to
-      zeros so the block starts as identity.
-    dropout_rate: Dropout rate applied after gating.
-    dtype: Data type for computation.
+  Applies one Dense projection from the input width to `features * 2`, splits
+  it in half along the last axis as (val, gate), and returns val * SiLU(gate).
   """
 
-  hidden_size: int
-  ff_size: int
-  zero_init_output: bool = False
-  dropout_rate: float = 0.0
+  features: int
+  use_bias: bool = False
   dtype: DType = jnp.float32
 
   @nn.compact
   @kt.typechecked
-  def __call__(
-      self, x: Float['batch *other_dims hidden_size'], *, is_training: bool
-  ) -> Float['batch *other_dims hidden_size']:
-    # Up-projection: (*, hidden_size) -> (*, ff_size * 2).
+  def __call__(self, x: Float["*batch d_in"]) -> Float["*batch features"]:
+    # Single projection producing both halves: (*, d_in) -> (*, features * 2).
     gate_and_val = nn.Dense(
-        features=self.ff_size * 2,
-        use_bias=False,
+        features=self.features * 2,
+        use_bias=self.use_bias,
         dtype=self.dtype,
-        name='Dense_Up',
+        name="Dense_Gate_Val",
     )(x)
-    # Split into value and gate, apply SiLU gating.
+
+    # First half is the value, second half the gate (like torch.chunk(2, -1)).
     val, gate = jnp.split(gate_and_val, 2, axis=-1)
-    x = val * nn.silu(gate)
-    x = nn.Dropout(rate=self.dropout_rate, deterministic=not is_training)(x)
-    # Down-projection: (*, ff_size) -> (*, hidden_size).
+    return val * nn.silu(gate)
+
+
+################################################################################
+# MARK: FeedForward Unified Block
+################################################################################
+
+
+class FeedForward(nn.Module):
+  """Feed-forward block with either a SwiGLU or a standard up-projection.
+
+  Attributes:
+    output_size: Output dimension (residual stream width).
+    hidden_size: Width of the intermediate activation fed to `Dense_Down`.
+    ffn_type: Either 'swiglu' (bias-free projection to `hidden_size * 2`,
+      split into value and gate, `value * silu(gate)`) or 'standard' (biased
+      projection to `hidden_size` followed by `activation`).
+    activation: Name of an activation function exposed by `nn` (e.g., 'gelu',
+      'silu', 'relu'), applied when `ffn_type='standard'`. Ignored when
+      `ffn_type='swiglu'`, which always gates with SiLU. An unknown name
+      raises a ValueError at call time.
+    zero_init_output: If True, the `Dense_Down` kernel is initialized to zeros;
+      otherwise it uses LeCun-normal initialization.
+    dropout_rate: Dropout rate applied between the up- and down-projection.
+    dtype: Data type for computation.
+  """
+
+  output_size: int
+  hidden_size: int
+  ffn_type: str = "standard"
+  activation: str = "gelu"
+  zero_init_output: bool = False
+  dropout_rate: float = 0.0
+  dtype: DType = jnp.float32
+
+  def setup(self):
+    if self.ffn_type not in ("standard", "swiglu"):
+      raise ValueError(
+          f"Unknown ffn_type: {self.ffn_type}. Must be 'standard' or 'swiglu'."
+      )
+    # Dropout applied to the hidden activation before the down-projection.
+    self.dropout = nn.Dropout(rate=self.dropout_rate)
+
+    # Optionally zero-initialize the down-projection kernel.
     down_kernel_init = (
         nn.initializers.zeros_init()
         if self.zero_init_output
         else nn.initializers.lecun_normal()
     )
-    x = nn.Dense(
-        features=self.hidden_size,
-        use_bias=False,
-        dtype=self.dtype,
+    # The SwiGLU variant is bias-free; the standard variant keeps its bias.
+    self.use_down_bias = self.ffn_type != "swiglu"
+
+    self.down_proj = nn.Dense(
+        features=self.output_size,
+        use_bias=self.use_down_bias,
         kernel_init=down_kernel_init,
-        name='Dense_Down',
-    )(x)
+        dtype=self.dtype,
+        name="Dense_Down",
+    )
+
+  @nn.compact
+  @kt.typechecked
+  def __call__(
+      self, x: Float["batch *other_dims output_size"], *, is_training: bool
+  ) -> Float["batch *other_dims output_size"]:
+    # Up-projection step
+    if self.ffn_type == "swiglu":
+      # Project to double feature width
+      gate_and_val = nn.Dense(
+          features=self.hidden_size * 2,
+          use_bias=False,
+          dtype=self.dtype,
+          name="Dense_Up",
+      )(x)
+      # Split and apply SiLU gating
+      val, gate = jnp.split(gate_and_val, 2, axis=-1)
+      x = val * nn.silu(gate)
+    else:  # "standard"; setup() has already rejected any other ffn_type.
+      x = nn.Dense(
+          features=self.hidden_size,
+          use_bias=True,
+          dtype=self.dtype,
+          name="Dense_Up",
+      )(x)
+      # Apply the configured activation function
+      activation_fn = getattr(nn, self.activation, None)
+      if not callable(activation_fn):
+        raise ValueError(f"Unknown activation: {self.activation!r}.")
+      x = activation_fn(x)
+
+    # Dropout is a no-op unless is_training is True.
+    x = self.dropout(x, deterministic=not is_training)
+
+    # Final down-projection step
+    x = self.down_proj(x)
+
     return x