@@ -236,10 +236,12 @@ def __init__(
         )
 
     def __call__(self, hidden_states: jax.Array, deterministic: bool = True, rngs: nnx.Rngs = None) -> jax.Array:
-        hidden_states = self.act_fn(hidden_states)  # Output is (4, 75600, 13824)
-        hidden_states = checkpoint_name(hidden_states, "ffn_activation")
-        hidden_states = self.drop_out(hidden_states, deterministic=deterministic, rngs=rngs)
-        return self.proj_out(hidden_states)  # output is (4, 75600, 5120)
+        with jax.named_scope("mlp_up_proj_and_gelu"):
+            hidden_states = self.act_fn(hidden_states)  # Output is (4, 75600, 13824)
+            hidden_states = checkpoint_name(hidden_states, "ffn_activation")
+            hidden_states = self.drop_out(hidden_states, deterministic=deterministic, rngs=rngs)
+        with jax.named_scope("mlp_down_proj"):
+            return self.proj_out(hidden_states)  # output is (4, 75600, 5120)
 
 
 class WanTransformerBlock(nnx.Module):
@@ -331,41 +333,55 @@ def __call__(
         deterministic: bool = True,
         rngs: nnx.Rngs = None,
     ):
-        shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = jnp.split(
-            (self.adaln_scale_shift_table + temb.astype(jnp.float32)), 6, axis=1
-        )
-        hidden_states = jax.lax.with_sharding_constraint(hidden_states, PartitionSpec("data", "fsdp", "tensor"))
-        encoder_hidden_states = jax.lax.with_sharding_constraint(encoder_hidden_states, PartitionSpec("data", "fsdp", None))
-
-        # 1. Self-attention
-        norm_hidden_states = (self.norm1(hidden_states.astype(jnp.float32)) * (1 + scale_msa) + shift_msa).astype(
-            hidden_states.dtype
-        )
-        attn_output = self.attn1(
-            hidden_states=norm_hidden_states,
-            encoder_hidden_states=norm_hidden_states,
-            rotary_emb=rotary_emb,
-            deterministic=deterministic,
-            rngs=rngs,
-        )
-        hidden_states = (hidden_states.astype(jnp.float32) + attn_output * gate_msa).astype(hidden_states.dtype)
-
-        # 2. Cross-attention
-        norm_hidden_states = self.norm2(hidden_states.astype(jnp.float32)).astype(hidden_states.dtype)
-        attn_output = self.attn2(
-            hidden_states=norm_hidden_states, encoder_hidden_states=encoder_hidden_states, deterministic=deterministic, rngs=rngs
-        )
-        hidden_states = hidden_states + attn_output
-
-        # 3. Feed-forward
-        norm_hidden_states = (self.norm3(hidden_states.astype(jnp.float32)) * (1 + c_scale_msa) + c_shift_msa).astype(
-            hidden_states.dtype
-        )
-        ff_output = self.ffn(norm_hidden_states, deterministic=deterministic, rngs=rngs)
-        hidden_states = (hidden_states.astype(jnp.float32) + ff_output.astype(jnp.float32) * c_gate_msa).astype(
-            hidden_states.dtype
-        )
-        return hidden_states
+        with jax.named_scope("transformer_block"):
+            with jax.named_scope("adaln"):
+                shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = jnp.split(
+                    (self.adaln_scale_shift_table + temb.astype(jnp.float32)), 6, axis=1
+                )
+            hidden_states = jax.lax.with_sharding_constraint(hidden_states, PartitionSpec("data", "fsdp", "tensor"))
+            encoder_hidden_states = jax.lax.with_sharding_constraint(encoder_hidden_states, PartitionSpec("data", "fsdp", None))
+
+            # 1. Self-attention
+            with jax.named_scope("self_attention"):
+                with jax.named_scope("self_attention_norm"):
+                    norm_hidden_states = (self.norm1(hidden_states.astype(jnp.float32)) * (1 + scale_msa) + shift_msa).astype(
+                        hidden_states.dtype
+                    )
+                with jax.named_scope("self_attention_attn"):
+                    attn_output = self.attn1(
+                        hidden_states=norm_hidden_states,
+                        encoder_hidden_states=norm_hidden_states,
+                        rotary_emb=rotary_emb,
+                        deterministic=deterministic,
+                        rngs=rngs,
+                    )
+                with jax.named_scope("self_attention_residual"):
+                    hidden_states = (hidden_states.astype(jnp.float32) + attn_output * gate_msa).astype(hidden_states.dtype)
+
+            # 2. Cross-attention
+            with jax.named_scope("cross_attention"):
+                with jax.named_scope("cross_attention_norm"):
+                    norm_hidden_states = self.norm2(hidden_states.astype(jnp.float32)).astype(hidden_states.dtype)
+                with jax.named_scope("cross_attention_attn"):
+                    attn_output = self.attn2(
+                        hidden_states=norm_hidden_states, encoder_hidden_states=encoder_hidden_states, deterministic=deterministic, rngs=rngs
+                    )
+                with jax.named_scope("cross_attention_residual"):
+                    hidden_states = hidden_states + attn_output
+
+            # 3. Feed-forward
+            with jax.named_scope("mlp"):
+                with jax.named_scope("mlp_norm"):
+                    norm_hidden_states = (self.norm3(hidden_states.astype(jnp.float32)) * (1 + c_scale_msa) + c_shift_msa).astype(
+                        hidden_states.dtype
+                    )
+                with jax.named_scope("mlp_ffn"):
+                    ff_output = self.ffn(norm_hidden_states, deterministic=deterministic, rngs=rngs)
+                with jax.named_scope("mlp_residual"):
+                    hidden_states = (hidden_states.astype(jnp.float32) + ff_output.astype(jnp.float32) * c_gate_msa).astype(
+                        hidden_states.dtype
+                    )
+            return hidden_states
 
 
 class WanModel(nnx.Module, FlaxModelMixin, ConfigMixin):
@@ -522,13 +538,15 @@ def __call__(
 
         hidden_states = jnp.transpose(hidden_states, (0, 2, 3, 4, 1))
         rotary_emb = self.rope(hidden_states)
-
-        hidden_states = self.patch_embedding(hidden_states)
-        hidden_states = jax.lax.collapse(hidden_states, 1, -1)
-
-        temb, timestep_proj, encoder_hidden_states, encoder_hidden_states_image = self.condition_embedder(
-            timestep, encoder_hidden_states, encoder_hidden_states_image
-        )
+
+        with jax.named_scope("patch_embedding"):
+            hidden_states = self.patch_embedding(hidden_states)
+            hidden_states = jax.lax.collapse(hidden_states, 1, -1)
+
+        with jax.named_scope("condition_embedding"):
+            temb, timestep_proj, encoder_hidden_states, encoder_hidden_states_image = self.condition_embedder(
+                timestep, encoder_hidden_states, encoder_hidden_states_image
+            )
         timestep_proj = timestep_proj.reshape(timestep_proj.shape[0], 6, -1)
 
         if encoder_hidden_states_image is not None:
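For context, a minimal standalone sketch (not part of this commit; shapes, the trace directory, and function names are illustrative) of what these annotations buy: ops emitted inside a jax.named_scope block are grouped under that label in a captured profiler trace, which is what makes the per-block regions above visible in TensorBoard/Perfetto.

# Hypothetical sketch, assuming a toy MLP rather than the Wan feed-forward layer.
import jax
import jax.numpy as jnp

@jax.jit
def mlp(x, w_up, w_down):
    with jax.named_scope("mlp_up_proj_and_gelu"):
        h = jax.nn.gelu(x @ w_up)  # ops traced here carry this scope name
    with jax.named_scope("mlp_down_proj"):
        return h @ w_down

x = jnp.ones((8, 512))
w_up, w_down = jnp.ones((512, 2048)), jnp.ones((2048, 512))

# Capture a trace; the scope names appear as nested regions in the viewer.
with jax.profiler.trace("/tmp/jax-trace"):
    jax.block_until_ready(mlp(x, w_up, w_down))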