import math
from dataclasses import InitVar, dataclass, field
from typing import Any, Tuple

import jax
import jax.numpy as jnp
from flax import nnx
from flax.typing import Sharding
from jax.experimental import shard_map
from jax.sharding import Mesh
from jax.sharding import PartitionSpec as P
from jaxtyping import Float

from tpu_inference import utils
from tpu_inference.kernels.ragged_paged_attention.v3.kernel import \
    ragged_paged_attention
from tpu_inference.layers.common.attention_metadata import AttentionMetadata
from tpu_inference.layers.jax.base import create_param
from tpu_inference.layers.jax.layers import RMSNorm
from tpu_inference.layers.jax.rope import GptOssRotaryEmbedding

KVCache = Tuple[jax.Array, jax.Array]

@dataclass(kw_only=True)
class GptOssAttention(nnx.Module):
    """
    JAX implementation of the GPT-OSS attention block.

    Array name suffixes follow einsum dimension labels: T = query tokens,
    S = key/value tokens, D = hidden_size, N = num_attention_heads,
    K = num_key_value_heads, H = head_dim.
    """
    hidden_size: int
    num_attention_heads: int
    num_key_value_heads: int
    head_dim: int
    dtype: jnp.dtype
    rngs: InitVar[nnx.Rngs]

    rope_theta: float
    initial_context_length: int = 4096
    rope_scaling_factor: float = 32.0
    rope_ntk_alpha: float = 1.0
    rope_ntk_beta: float = 32.0

    query_tnh: P = P()
    keyvalue_skh: P = P()
    attn_o_tnh: P = P()
    dnh_sharding: Sharding = ()
    dkh_sharding: Sharding = ()
    nhd_sharding: Sharding = ()
    n_sharding: Sharding = ()
    nh_sharding: Sharding = ()
    kh_sharding: Sharding = ()
    d_sharding: Sharding = ()

    random_init: bool = False
    mesh: Mesh

    def __post_init__(self, rngs: nnx.Rngs):
        """Initializes weights, biases, and the RoPE module."""
        # Dimension shorthands: D = hidden_size, N = num_attention_heads,
        # K = num_key_value_heads, H = head_dim.

        self.sm_scale = 1.0 / (self.head_dim**0.5)

        # Per-head attention sink logits (GPT-OSS adds a learned sink slot
        # to each head's softmax).
        self.sinks_N = create_param(
            rngs, shape=(self.num_attention_heads,), dtype=jnp.float32,
            sharding=self.n_sharding, random_init=self.random_init
        )

        # Q, K, V projection kernels and biases
        self.kernel_q_DNH = create_param(
            rngs, shape=(self.hidden_size, self.num_attention_heads, self.head_dim),
            dtype=self.dtype, sharding=self.dnh_sharding, random_init=self.random_init
        )
        self.bias_q_NH = create_param(
            rngs, shape=(self.num_attention_heads, self.head_dim),
            dtype=self.dtype, sharding=self.nh_sharding, random_init=self.random_init
        )
        self.kernel_k_DKH = create_param(
            rngs, shape=(self.hidden_size, self.num_key_value_heads, self.head_dim),
            dtype=self.dtype, sharding=self.dkh_sharding, random_init=self.random_init
        )
        self.bias_k_KH = create_param(
            rngs, shape=(self.num_key_value_heads, self.head_dim),
            dtype=self.dtype, sharding=self.kh_sharding, random_init=self.random_init
        )
        self.kernel_v_DKH = create_param(
            rngs, shape=(self.hidden_size, self.num_key_value_heads, self.head_dim),
            dtype=self.dtype, sharding=self.dkh_sharding, random_init=self.random_init
        )
        self.bias_v_KH = create_param(
            rngs, shape=(self.num_key_value_heads, self.head_dim),
            dtype=self.dtype, sharding=self.kh_sharding, random_init=self.random_init
        )

        # Output projection kernel and bias
        self.kernel_o_proj_NHD = create_param(
            rngs, shape=(self.num_attention_heads, self.head_dim, self.hidden_size),
            dtype=self.dtype, sharding=self.nhd_sharding, random_init=self.random_init
        )
        self.bias_o_D = create_param(
            rngs, shape=(self.hidden_size,),
            dtype=self.dtype, sharding=self.d_sharding, random_init=self.random_init
        )

        # RoPE module with NTK-aware scaling parameters
        self.rope = GptOssRotaryEmbedding(
            head_dim=self.head_dim,
            rope_theta=self.rope_theta,
            dtype=self.dtype,
            initial_context_length=self.initial_context_length,
            rope_scaling_factor=self.rope_scaling_factor,
            rope_ntk_alpha=self.rope_ntk_alpha,
            rope_ntk_beta=self.rope_ntk_beta,
        )

    def attention(
        self,
        kv_cache: KVCache,
        q_TNH: jax.Array,
        k_SKH: jax.Array,
        v_SKH: jax.Array,
        sinks: jax.Array,
        attention_metadata: AttentionMetadata,
        mesh: Mesh,
    ) -> Tuple[KVCache, jax.Array]:
        """Performs scaled dot-product attention by calling the ragged_paged_attention kernel."""
        md = attention_metadata
        kv_cache_spec = P(None, None, "model")

        in_specs = (
            self.query_tnh,     # q
            self.keyvalue_skh,  # k
            self.keyvalue_skh,  # v
            kv_cache_spec,      # kv_cache
            P(),                # md.seq_lens: replicated
            P(),                # page_indices_flat: replicated
            P(),                # query_start_loc: replicated
            P(),                # distribution: replicated
            P("model"),         # sinks: sharded over heads
        )
        out_specs = (self.attn_o_tnh, kv_cache_spec)

        def _ragged_paged_attention_wrapper(*args):
            # Pass the GPT-OSS specific parameters to the kernel
            return ragged_paged_attention(
                *args,
                sm_scale=self.sm_scale,
                sliding_window=md.sliding_window,
            )

        output_TNH, kv_cache = jax.jit(
            shard_map.shard_map(
                _ragged_paged_attention_wrapper,
                mesh=mesh,
                in_specs=in_specs,
                out_specs=out_specs,
                check_rep=False,
            ))(
                q_TNH,
                k_SKH,
                v_SKH,
                kv_cache,
                md.seq_lens,
                md.block_tables,
                md.query_start_loc,
                md.request_distribution,
                sinks,
            )
        return kv_cache, output_TNH

    def __call__(self,
                 x_TD,
                 is_prefill,
                 kv_cache: KVCache,
                 attention_metadata: AttentionMetadata,
                 use_attention_rope: bool = True):
        """Forward pass for the attention module using 3D projection kernels."""
        md = attention_metadata
        x_TD = jnp.asarray(x_TD, self.dtype)

        with jax.named_scope("q_proj"):
            q_TNH = jnp.einsum("TD,DNH->TNH", x_TD, self.kernel_q_DNH.value)
            q_TNH += self.bias_q_NH.value

        with jax.named_scope("k_proj"):
            k_TKH = jnp.einsum("TD,DKH->TKH", x_TD, self.kernel_k_DKH.value)
            k_TKH += self.bias_k_KH.value

        with jax.named_scope("v_proj"):
            v_TKH = jnp.einsum("TD,DKH->TKH", x_TD, self.kernel_v_DKH.value)
            v_TKH += self.bias_v_KH.value

        if use_attention_rope:
            q_TNH, k_TKH = self.rope(q_TNH, k_TKH, md.input_positions)

        with jax.named_scope("attn_op"):
            # Pad the head dim (H) of q, k, v up to the next multiple of 128
            padded_head_dim = ((self.head_dim - 1) // 128 + 1) * 128
            q_TNH = jnp.pad(q_TNH, ((0, 0), (0, 0),
                                    (0, padded_head_dim - self.head_dim)))
            k_TKH = jnp.pad(k_TKH, ((0, 0), (0, 0),
                                    (0, padded_head_dim - self.head_dim)))
            v_TKH = jnp.pad(v_TKH, ((0, 0), (0, 0),
                                    (0, padded_head_dim - self.head_dim)))
            new_kv_cache, attn_out_TNH = self.attention(
                kv_cache,
                q_TNH,
                k_TKH,
                v_TKH,
                self.sinks_N.value,
                md,
                self.mesh,
            )
            # Drop the head-dim padding before the output projection
            attn_out_TNH = attn_out_TNH[..., :self.head_dim]

        with jax.named_scope("o_proj"):
            output_TD = jnp.einsum("TNH,NHD->TD", attn_out_TNH, self.kernel_o_proj_NHD.value)
            output_TD += self.bias_o_D.value

        return new_kv_cache, output_TD
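

# Minimal usage sketch (commented out; not part of the module). It shows how this
# attention block might be constructed. The config numbers below are illustrative
# assumptions rather than values taken from this repository; `Mesh`, `nnx`, and
# `jnp` are the imports at the top of this file.
#
#     mesh = Mesh(jax.devices(), ("model",))
#     attn = GptOssAttention(
#         hidden_size=2880,
#         num_attention_heads=64,
#         num_key_value_heads=8,
#         head_dim=64,
#         dtype=jnp.bfloat16,
#         rngs=nnx.Rngs(0),
#         rope_theta=150000.0,
#         random_init=True,
#         mesh=mesh,
#     )
#     # new_kv_cache, out_TD = attn(x_TD, is_prefill, kv_cache, attention_metadata)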