Correct the DiT blocks to match their corresponding implementations

agalashov · Hackable Diffusion Authors · commit 7d22496bb0ac · 2026-05-21T08:00:09.000-07:00
PiperOrigin-RevId: 919043097
diff --git a/hackable_diffusion/lib/architecture/attention.py b/hackable_diffusion/lib/architecture/attention.py
@@ -14,7 +14,7 @@
 
 """Attention layers and utils."""
 
-from typing import Callable
+from typing import Callable, Literal
 import warnings
 
 import flax.linen as nn
@@ -37,6 +37,8 @@
 RoPEPositionType = arch_typing.RoPEPositionType
 INVALID_INT = arch_typing.INVALID_INT
 
+AttnQKNormMethod = Literal["l2", "rms_norm"]
+
 ################################################################################
 # MARK: Constants
 ################################################################################
@@ -211,6 +213,7 @@ class MultiHeadAttention(nn.Module):
   num_heads: int = INVALID_INT
   head_dim: int = INVALID_INT
   normalize_qk: bool = False
+  qk_norm_method: AttnQKNormMethod = "l2"
   use_rope: bool = False
   rope_position_type: RoPEPositionType = RoPEPositionType.SQUARE
   use_bias: bool = True
@@ -303,6 +306,32 @@ def __call__(
     v = v.reshape(b, seq_len_kv, num_heads, head_d).transpose(0, 2, 1, 3)
     # shape is [batch, num_heads, sequence_length, head_dim]
 
+    if self.normalize_qk:
+      if self.qk_norm_method == "rms_norm":
+        q = nn.RMSNorm(name="RMSNorm_Q")(q)
+        k = nn.RMSNorm(name="RMSNorm_K")(k)
+        scale = 1.0 / jnp.sqrt(jnp.float32(head_d))
+      # QK L2 normalization: https://arxiv.org/abs/2010.04245
+      elif self.qk_norm_method == "l2":
+        scale = self.param(
+            "norm_qk_scale",
+            nn.initializers.constant(
+                jnp.log2(seq_len_kv**2 - seq_len_kv + SAFETY_EPSILON)
+            ),
+            (1, 1, 1, 1),
+        )
+
+        norm_q = jnp.linalg.norm(q, ord=2, axis=-1, keepdims=True)
+        norm_k = jnp.linalg.norm(k, ord=2, axis=-1, keepdims=True)
+        q = q / (norm_q + SAFETY_EPSILON)
+        k = k / (norm_k + SAFETY_EPSILON)
+      else:
+        raise ValueError(
+            f"Unsupported QK normalization method: {self.qk_norm_method}."
+        )
+    else:
+      scale = 1.0 / jnp.sqrt(jnp.float32(head_d))
+
     # RoPE: https://arxiv.org/abs/2104.09864
     if self.use_rope:
       q = sequence_embedders.RoPESequenceEmbedding(
@@ -313,23 +342,6 @@ def __call__(
       )(k)
       # shape is [batch, num_heads, sequence_length, head_dim]
 
-    # QK normalization: https://arxiv.org/abs/2010.04245.
-    if self.normalize_qk:
-      scale = self.param(
-          "norm_qk_scale",
-          nn.initializers.constant(
-              jnp.log2(seq_len_kv**2 - seq_len_kv + SAFETY_EPSILON)
-          ),
-          (1, 1, 1, 1),
-      )
-
-      norm_q = jnp.linalg.norm(q, ord=2, axis=-1, keepdims=True)
-      norm_k = jnp.linalg.norm(k, ord=2, axis=-1, keepdims=True)
-      q = q / (norm_q + SAFETY_EPSILON)
-      k = k / (norm_k + SAFETY_EPSILON)
-    else:
-      scale = 1.0 / jnp.sqrt(head_d)
-
     attn_output = _dot_product_attention(
         q=q,
         k=k,
diff --git a/hackable_diffusion/lib/architecture/attention_test.py b/hackable_diffusion/lib/architecture/attention_test.py
@@ -555,6 +555,107 @@ def test_multi_head_attention_no_bias_param_shapes(self):
     }
     self.assertDictEqual(expected, variables_shapes)
 
+  # MARK: qk_norm_method tests
+
+  @parameterized.named_parameters(
+      ("l2", "l2"),
+      ("rms_norm", "rms_norm"),
+  )
+  def test_qk_norm_method_output_shape(self, qk_norm_method):
+    """Verifies output shape is correct for each qk_norm_method."""
+    module = attention.MultiHeadAttention(
+        num_heads=self.num_heads,
+        normalize_qk=True,
+        qk_norm_method=qk_norm_method,
+    )
+    variables = module.init(self.rng, self.x, c=None)
+    output = module.apply(variables, self.x, c=None, is_training=False)
+    self.assertEqual(output.shape, self.x.shape)
+
+  def test_qk_norm_l2_param_shapes(self):
+    """Verifies L2 QK normalization creates a norm_qk_scale parameter."""
+    module = attention.MultiHeadAttention(
+        num_heads=self.num_heads,
+        normalize_qk=True,
+        qk_norm_method="l2",
+    )
+    variables = module.init(self.rng, self.x, c=None)
+    leaves = test_helpers.get_leaves_with_paths(variables)
+    # L2 method should have a norm_qk_scale param
+    self.assertIn("params/norm_qk_scale", leaves)
+    self.assertEqual(leaves["params/norm_qk_scale"].shape, (1, 1, 1, 1))
+    # Should NOT have RMSNorm_Q/K
+    rms_paths = [p for p in leaves if "RMSNorm" in p]
+    self.assertEmpty(rms_paths)
+
+  def test_qk_norm_rms_norm_param_shapes(self):
+    """Verifies RMSNorm QK normalization creates RMSNorm_Q/K scale params."""
+    module = attention.MultiHeadAttention(
+        num_heads=self.num_heads,
+        normalize_qk=True,
+        qk_norm_method="rms_norm",
+    )
+    variables = module.init(self.rng, self.x, c=None)
+    leaves = test_helpers.get_leaves_with_paths(variables)
+    # RMSNorm method should have RMSNorm_Q/scale and RMSNorm_K/scale
+    self.assertIn("params/RMSNorm_Q/scale", leaves)
+    self.assertIn("params/RMSNorm_K/scale", leaves)
+    self.assertEqual(leaves["params/RMSNorm_Q/scale"].shape, (self.head_dim,))
+    self.assertEqual(leaves["params/RMSNorm_K/scale"].shape, (self.head_dim,))
+    # Should NOT have norm_qk_scale
+    self.assertNotIn("params/norm_qk_scale", leaves)
+
+  def test_qk_norm_rms_norm_with_rope(self):
+    """Verifies RMSNorm QK norm works with RoPE (norm before RoPE)."""
+    module = attention.MultiHeadAttention(
+        num_heads=self.num_heads,
+        normalize_qk=True,
+        qk_norm_method="rms_norm",
+        use_rope=True,
+        rope_position_type=RoPEPositionType.SQUARE,
+    )
+    x = jnp.ones((self.batch_size, self.seq_len_kv, self.dim))
+    variables = module.init(self.rng, x, c=None)
+    output = module.apply(variables, x, c=None, is_training=False)
+    self.assertEqual(output.shape, x.shape)
+
+  def test_qk_norm_l2_with_rope(self):
+    """Verifies L2 QK norm works with RoPE (norm before RoPE)."""
+    module = attention.MultiHeadAttention(
+        num_heads=self.num_heads,
+        normalize_qk=True,
+        qk_norm_method="l2",
+        use_rope=True,
+        rope_position_type=RoPEPositionType.SQUARE,
+    )
+    x = jnp.ones((self.batch_size, self.seq_len_kv, self.dim))
+    variables = module.init(self.rng, x, c=None)
+    output = module.apply(variables, x, c=None, is_training=False)
+    self.assertEqual(output.shape, x.shape)
+
+  def test_qk_norm_disabled_has_no_norm_params(self):
+    """Verifies that normalize_qk=False creates no norm params."""
+    module = attention.MultiHeadAttention(
+        num_heads=self.num_heads,
+        normalize_qk=False,
+    )
+    variables = module.init(self.rng, self.x, c=None)
+    leaves = test_helpers.get_leaves_with_paths(variables)
+    norm_paths = [p for p in leaves if "norm_qk" in p or "RMSNorm" in p]
+    self.assertEmpty(norm_paths)
+
+  def test_qk_norm_invalid_method_raises_error(self):
+    """Verifies that an invalid qk_norm_method raises ValueError."""
+    module = attention.MultiHeadAttention(
+        num_heads=self.num_heads,
+        normalize_qk=True,
+        qk_norm_method="invalid_method",  # pytype: disable=wrong-arg-types
+    )
+    with self.assertRaisesRegex(
+        ValueError, "Unsupported QK normalization method"
+    ):
+      module.init(self.rng, self.x, c=None)
+
 
 if __name__ == "__main__":
   absltest.main()
diff --git a/hackable_diffusion/lib/architecture/dit_blocks.py b/hackable_diffusion/lib/architecture/dit_blocks.py
@@ -133,6 +133,7 @@ class DiTBlock(nn.Module):
       to avoid bias in the FFN.
     ffn_activation: Activation function for the FFN.
     attn_normalize_qk: Whether to normalize query and key in attention.
+    attn_qk_norm_method: Normalization method for query and key in attention.
     attn_use_bias: Whether to use bias in the attention QKV and output
       projections.
     mlp_ratio: The ratio of the MLP hidden dimension to the hidden size.
@@ -149,10 +150,11 @@ class DiTBlock(nn.Module):
   num_heads: int = INVALID_INT
   head_dim: int = INVALID_INT
   use_gates: bool = True
-  ffn_type: mlp_blocks.FFNType = 'swiglu'
-  ffn_use_bias: bool = False
+  ffn_type: mlp_blocks.FFNType = 'dense'
+  ffn_use_bias: bool = True
   ffn_activation: str = 'gelu'
   attn_normalize_qk: bool = True
+  attn_qk_norm_method: attention.AttnQKNormMethod = 'l2'
   attn_use_bias: bool = True
   mlp_ratio: float = 4.0
   use_rope: bool = False
@@ -194,6 +196,7 @@ def setup(self):
         zero_init_output=self.zero_init_output,
         dtype=self.dtype,
         normalize_qk=self.attn_normalize_qk,
+        qk_norm_method=self.attn_qk_norm_method,
         use_bias=self.attn_use_bias,
         dropout_rate=self.dropout_rate,
     )
@@ -282,7 +285,11 @@ class DiTBlockFlux(DiTBlock):
   use_gates: bool = dataclasses.field(init=False, default=False)
   zero_init_output: bool = dataclasses.field(init=False, default=True)
   attn_normalize_qk: bool = dataclasses.field(init=False, default=True)
+  attn_qk_norm_method: attention.AttnQKNormMethod = dataclasses.field(
+      init=False, default='rms_norm'
+  )
   attn_use_bias: bool = dataclasses.field(init=False, default=False)
+  ffn_use_bias: bool = dataclasses.field(init=False, default=False)
 
   def __post_init__(self):
     self.norm_factory = normalization.NormalizationLayerFactory(
@@ -305,7 +312,11 @@ class DiTBlockSD3(DiTBlock):
   use_gates: bool = dataclasses.field(init=False, default=True)
   zero_init_output: bool = dataclasses.field(init=False, default=False)
   attn_normalize_qk: bool = dataclasses.field(init=False, default=True)
-  attn_use_bias: bool = dataclasses.field(init=False, default=False)
+  attn_qk_norm_method: attention.AttnQKNormMethod = dataclasses.field(
+      init=False, default='rms_norm'
+  )
+  attn_use_bias: bool = dataclasses.field(init=False, default=True)
+  ffn_use_bias: bool = dataclasses.field(init=False, default=True)
 
   def __post_init__(self):
     self.norm_factory = normalization.NormalizationLayerFactory(
@@ -330,6 +341,7 @@ class DiTBlockAdaLNZero(DiTBlock):
   zero_init_output: bool = dataclasses.field(init=False, default=False)
   attn_normalize_qk: bool = dataclasses.field(init=False, default=False)
   attn_use_bias: bool = dataclasses.field(init=False, default=True)
+  ffn_use_bias: bool = dataclasses.field(init=False, default=True)
 
   def __post_init__(self):
     self.norm_factory = normalization.NormalizationLayerFactory(
diff --git a/hackable_diffusion/lib/architecture/dit_blocks_test.py b/hackable_diffusion/lib/architecture/dit_blocks_test.py
@@ -170,9 +170,11 @@ def test_variable_shapes_ada_rms_norm(self):
             'ffn': {
                 'Dense_Up': {
                     'kernel': (self.d, mlp_hidden * 2),
+                    'bias': (mlp_hidden * 2,),
                 },
                 'Dense_Down': {
                     'kernel': (mlp_hidden, self.d),
+                    'bias': (self.d,),
                 },
             },
             'attn': {
@@ -226,9 +228,11 @@ def test_variable_shapes_ada_ln_zero(self):
             'ffn': {
                 'Dense_Up': {
                     'kernel': (self.d, mlp_hidden),
+                    'bias': (mlp_hidden,),
                 },
                 'Dense_Down': {
                     'kernel': (mlp_hidden, self.d),
+                    'bias': (self.d,),
                 },
             },
             'ConditionalNorm_MLP': {
@@ -376,6 +380,98 @@ def test_ada_ln_zero_has_gates(self):
     gate_paths = [p for p in leaves_with_paths if 'Gate' in p]
     self.assertNotEmpty(gate_paths)
 
+  # MARK: qk_norm_method tests
+
+  @parameterized.named_parameters(
+      ('l2', 'l2'),
+      ('rms_norm', 'rms_norm'),
+  )
+  def test_preset_qk_norm_method_output_shape(self, qk_norm_method):
+    """Tests that DiTBlock with each qk_norm_method produces correct shape."""
+    x = jnp.ones((self.batch, self.n, self.d))
+    cond = jnp.ones((self.batch, self.c))
+    module = dit_blocks.DiTBlock(
+        hidden_size=self.d,
+        num_heads=4,
+        norm_factory=normalization.NormalizationLayerFactory(
+            normalization_method=NormalizationType.RMS_NORM,
+            use_conditional_shift=False,
+        ),
+        use_gates=False,
+        attn_normalize_qk=True,
+        attn_qk_norm_method=qk_norm_method,
+    )
+    variables = module.init(self.key, x, cond, is_training=False)
+    output = module.apply(variables, x, cond, is_training=False)
+    self.assertEqual(output.shape, (self.batch, self.n, self.d))
+
+  def test_flux_uses_rms_norm_qk(self):
+    """Verifies DiTBlockFlux uses RMSNorm QK normalization."""
+    x = jnp.ones((self.batch, self.n, self.d))
+    cond = jnp.ones((self.batch, self.c))
+    module = dit_blocks.DiTBlockFlux(hidden_size=self.d, num_heads=4)
+    variables = module.init(self.key, x, cond, is_training=False)
+    leaves = test_helpers.get_leaves_with_paths(variables)
+    # Flux uses rms_norm method: should have RMSNorm_Q/K, no norm_qk_scale
+    rms_paths = [p for p in leaves if 'RMSNorm_Q' in p or 'RMSNorm_K' in p]
+    self.assertNotEmpty(rms_paths)
+    l2_paths = [p for p in leaves if 'norm_qk_scale' in p]
+    self.assertEmpty(l2_paths)
+
+  def test_sd3_uses_rms_norm_qk(self):
+    """Verifies DiTBlockSD3 uses RMSNorm QK normalization."""
+    x = jnp.ones((self.batch, self.n, self.d))
+    cond = jnp.ones((self.batch, self.c))
+    module = dit_blocks.DiTBlockSD3(hidden_size=self.d, num_heads=4)
+    variables = module.init(self.key, x, cond, is_training=False)
+    leaves = test_helpers.get_leaves_with_paths(variables)
+    # SD3 uses rms_norm method: should have RMSNorm_Q/K, no norm_qk_scale
+    rms_paths = [p for p in leaves if 'RMSNorm_Q' in p or 'RMSNorm_K' in p]
+    self.assertNotEmpty(rms_paths)
+    l2_paths = [p for p in leaves if 'norm_qk_scale' in p]
+    self.assertEmpty(l2_paths)
+
+  def test_ada_ln_zero_has_no_qk_norm(self):
+    """Verifies DiTBlockAdaLNZero has no QK normalization params."""
+    x = jnp.ones((self.batch, self.n, self.d))
+    cond = jnp.ones((self.batch, self.c))
+    module = dit_blocks.DiTBlockAdaLNZero(hidden_size=self.d, num_heads=4)
+    variables = module.init(self.key, x, cond, is_training=False)
+    leaves = test_helpers.get_leaves_with_paths(variables)
+    norm_paths = [
+        p
+        for p in leaves
+        if 'norm_qk' in p or 'RMSNorm_Q' in p or 'RMSNorm_K' in p
+    ]
+    self.assertEmpty(norm_paths)
+
+  def test_dit_block_no_attn_bias_with_rms_norm_qk(self):
+    """Verifies DiTBlock with attn_use_bias=False and rms_norm QK norm."""
+    x = jnp.ones((self.batch, self.n, self.d))
+    cond = jnp.ones((self.batch, self.c))
+    module = dit_blocks.DiTBlock(
+        hidden_size=self.d,
+        num_heads=4,
+        norm_factory=normalization.NormalizationLayerFactory(
+            normalization_method=NormalizationType.RMS_NORM,
+            use_conditional_shift=False,
+        ),
+        use_gates=False,
+        attn_normalize_qk=True,
+        attn_qk_norm_method='rms_norm',
+        attn_use_bias=False,
+    )
+    variables = module.init(self.key, x, cond, is_training=False)
+    leaves = test_helpers.get_leaves_with_paths(variables)
+    # No bias in attention
+    attn_bias_paths = [
+        p for p in leaves if p.startswith('params/attn/') and 'bias' in p
+    ]
+    self.assertEmpty(attn_bias_paths)
+    # Has RMSNorm_Q/K
+    rms_paths = [p for p in leaves if 'RMSNorm_Q' in p or 'RMSNorm_K' in p]
+    self.assertNotEmpty(rms_paths)
+
 
 class PositionalEmbeddingTest(parameterized.TestCase):