
Commit d989dc3

Authored by bzgoogle and co-author

[GPT-OSS] uncomment sink-related changes as kernel_hd64.py was merged (#966)

Signed-off-by: bzgoogle <beinuoz_google_com@t1v-n-fa0da4f0-w-0.us-central1-c.c.cloud-tpu-inference-test.internal>
Co-authored-by: bzgoogle <beinuoz_google_com@t1v-n-fa0da4f0-w-0.us-central1-c.c.cloud-tpu-inference-test.internal>

1 parent: 03d76de

File tree: 1 file changed (+5 / -13 lines)

tpu_inference/layers/jax/attention/gpt_oss_attention.py (5 additions, 13 deletions)
@@ -9,8 +9,8 @@
 from jax.sharding import Mesh
 from jax.sharding import PartitionSpec as P

-from tpu_inference.kernels.ragged_paged_attention.v3.kernel import \
-    ragged_paged_attention
+from tpu_inference.kernels.ragged_paged_attention.v3.kernel_hd64 import \
+    ragged_paged_attention_hd64
 from tpu_inference.layers.common.attention_metadata import AttentionMetadata
 from tpu_inference.layers.jax.base import create_param
 from tpu_inference.layers.jax.rope import GptOssRotaryEmbedding
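
Note on the sink changes re-enabled below: GPT-OSS attention uses one learnable sink logit per head, which competes in the softmax normalizer but contributes no value. The following is a hedged, single-head sketch of that mechanism with toy shapes; it is illustrative only and not the kernel's actual implementation (the attend_with_sink name and all shapes are made up here).

import jax
import jax.numpy as jnp

def attend_with_sink(q, k, v, sink, scale):
    # q: (T, H), k/v: (S, H), sink: scalar logit for this head (illustrative).
    logits = (q @ k.T) * scale
    sink_col = jnp.full((q.shape[0], 1), sink)
    # The sink competes in the softmax, but its probability mass is dropped,
    # so the output can effectively "attend to nothing".
    probs = jax.nn.softmax(jnp.concatenate([logits, sink_col], axis=-1), axis=-1)
    return probs[:, :-1] @ v

q = jnp.ones((4, 64))
k = jnp.ones((6, 64))
v = jnp.ones((6, 64))
out = attend_with_sink(q, k, v, sink=0.5, scale=64 ** -0.5)
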
@@ -155,13 +155,13 @@ def attention(
             P(), # page_indices_flat: Replicated
             P(), # query_start_loc: Replicated
             P(), # distribution: Replicated
-            #P(('model')), # sinks
+            P(('model')), # sinks
         )
         out_specs = (self.attn_o_tnh, kv_cache_spec)

         def _ragged_paged_attention_wrapper(*args):
             # Pass the GPT-OSS specific parameters to the kernel
-            return ragged_paged_attention(
+            return ragged_paged_attention_hd64(
                 *args,
                 sm_scale=self.sm_scale,
                 sliding_window=md.sliding_window,
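
The uncommented P(('model')) entry gives the sinks vector its own partition spec so it can travel through the sharded kernel call alongside the head-sharded inputs. Below is a minimal, self-contained sketch of the idea, assuming a shard_map-style invocation; the mesh size, head count, and per_shard function are hypothetical and not the repo's code.

import functools

import jax
import jax.numpy as jnp
import numpy as np
from jax.experimental.shard_map import shard_map
from jax.sharding import Mesh, PartitionSpec as P

# Toy 1-D mesh over all local devices, named 'model' like the spec above.
mesh = Mesh(np.array(jax.devices()), ("model",))

# One sink logit per attention head; head count chosen to divide evenly
# across the mesh (hypothetical numbers, not the model's real config).
num_heads = 8 * len(jax.devices())
sinks = jnp.zeros((num_heads,), dtype=jnp.float32)

@functools.partial(
    shard_map,
    mesh=mesh,
    in_specs=(P("model"),),   # sinks sharded over 'model', like the heads
    out_specs=P("model"),
)
def per_shard(sinks_local):
    # Each device sees only its slice of the sinks vector.
    return sinks_local + 1.0

_ = per_shard(sinks)
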
@@ -183,7 +183,7 @@ def _ragged_paged_attention_wrapper(*args):
             md.block_tables,
             md.query_start_loc,
             md.request_distribution,
-            #sinks,
+            sinks,
         )
         return kv_cache, output_TNH

@@ -213,14 +213,6 @@ def __call__(self,
         q_TNH, k_TKH = self.rope(q_TNH, k_TKH, md.input_positions)

         with jax.named_scope("attn_op"):
-            # Padding H dim of q,k,v to be the multiple of 128
-            multiple_of_128 = ((self.head_dim - 1) // 128 + 1) * 128
-            q_TNH = jnp.pad(q_TNH, ((0, 0), (0, 0),
-                                    (0, multiple_of_128 - self.head_dim)))
-            k_TKH = jnp.pad(k_TKH, ((0, 0), (0, 0),
-                                    (0, multiple_of_128 - self.head_dim)))
-            v_TKH = jnp.pad(v_TKH, ((0, 0), (0, 0),
-                                    (0, multiple_of_128 - self.head_dim)))
             new_kv_cache, attn_out_TNH = self.attention(
                 kv_cache, q_TNH, k_TKH, v_TKH, self.sinks_N.value, md,
                 self.mesh)
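
For reference, the deleted block rounded the head dimension up to the next multiple of 128 and zero-padded q/k/v so the generic kernel could consume them; kernel_hd64 takes the narrower GPT-OSS heads directly. A standalone sketch of what that padding did, with toy shapes; head_dim = 64 is an assumption suggested by the kernel_hd64 name.

import jax.numpy as jnp

head_dim = 64  # assumed GPT-OSS head size, as the kernel_hd64 name suggests

# Round up to the next multiple of 128, then zero-pad the trailing (head)
# axis -- this mirrors the deleted lines above.
multiple_of_128 = ((head_dim - 1) // 128 + 1) * 128          # -> 128
q_TNH = jnp.zeros((4, 8, head_dim))                          # toy (T, N, H)
q_padded = jnp.pad(q_TNH, ((0, 0), (0, 0),
                           (0, multiple_of_128 - head_dim)))
assert q_padded.shape == (4, 8, 128)
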
