diff --git a/fla/layers/nsa.py b/fla/layers/nsa.py
index ae3849846..37de20663 100644
--- a/fla/layers/nsa.py
+++ b/fla/layers/nsa.py
@@ -10,6 +10,7 @@
 from einops import rearrange
 from transformers.utils import logging
 
+from fla.layers.utils import pad_input, unpad_input
 from fla.modules import RotaryEmbedding
 from fla.ops.nsa.parallel import parallel_nsa
 from fla.ops.utils.index import prepare_lens_from_mask
@@ -80,17 +81,16 @@ def forward(
                 "Arbitrary attention masks of shape [batch_size, seq_len, seq_len] are not allowed."
             )
 
-        batch_size, seq_len, _ = hidden_states.size()
+        batch_size, q_len, _ = hidden_states.size()
 
         q = rearrange(self.q_proj(hidden_states), '... (h d) -> ... h d', d=self.head_dim)
         k = rearrange(self.k_proj(hidden_states), '... (h d) -> ... h d', d=self.head_dim)
         v = rearrange(self.v_proj(hidden_states), '... (h d) -> ... h d', d=self.head_dim)
         g = rearrange(self.g_proj(hidden_states), '... (h d) -> ... h d', d=3)
-        g_cmp, g_slc, g_swa = g.sigmoid().unbind(-1)
 
         cu_seqlens = kwargs.get('cu_seqlens', None)
 
-        seqlen_offset, max_seqlen = 0, seq_len
+        seqlen_offset, max_seqlen = 0, q_len
         if past_key_values is not None:
             seqlen_offset = past_key_values.get_seq_length(self.layer_idx)
             max_seqlen = q.shape[1] + seqlen_offset
@@ -109,27 +109,46 @@ def forward(
             k_cached, v_cached = past_key_values.update(
                 attn_state=(k.flatten(-2, -1), v.flatten(-2, -1)),
                 layer_idx=self.layer_idx,
-                offset=seq_len,
-                cache_kwargs=dict(window_size=self.window_size)
+                offset=q_len,
             )['attn_state']
             if cache_has_content:
                 k, v = k_cached, v_cached
                 k = rearrange(k, '... (h d) -> ... h d', d=self.head_dim)
                 v = rearrange(v, '... (h d) -> ... h d', d=self.head_dim)
 
-        o = parallel_nsa(
-            q=q,
-            k=k,
-            v=v,
-            g_cmp=g_cmp,
-            g_slc=g_slc,
-            g_swa=g_swa,
-            block_size=self.block_size,
-            block_counts=self.block_counts,
-            window_size=self.window_size,
-            cu_seqlens=cu_seqlens,
-        )
-        o = o.reshape(batch_size, seq_len, -1)
+        if attention_mask is not None:
+            (q, g), (k, v), indices_q, cu_seqlens, max_seq_lens = unpad_input(
+                (q, g), (k, v), attention_mask, q_len, keepdim=True)
+            g_cmp, g_slc, g_swa = g.sigmoid().unbind(-1)
+            o = parallel_nsa(
+                q=q,
+                k=k,
+                v=v,
+                g_cmp=g_cmp,
+                g_slc=g_slc,
+                g_swa=g_swa,
+                block_size=self.block_size,
+                block_counts=self.block_counts,
+                window_size=self.window_size,
+                cu_seqlens=cu_seqlens,
+            ).squeeze(0)
+            o = pad_input(o, indices_q, batch_size, q_len)
+        else:
+            g_cmp, g_slc, g_swa = g.sigmoid().unbind(-1)
+            o = parallel_nsa(
+                q=q,
+                k=k,
+                v=v,
+                g_cmp=g_cmp,
+                g_slc=g_slc,
+                g_swa=g_swa,
+                block_size=self.block_size,
+                block_counts=self.block_counts,
+                window_size=self.window_size,
+                cu_seqlens=cu_seqlens,
+            )
+
+        o = o.reshape(batch_size, q_len, -1)
         o = self.o_proj(o)
 
         if not output_attentions:
diff --git a/fla/layers/utils.py b/fla/layers/utils.py
index 73b7554aa..6c2368228 100644
--- a/fla/layers/utils.py
+++ b/fla/layers/utils.py
@@ -3,7 +3,7 @@
 
 # Code is adapted from flash-attn.bert_padding.py
 
-from typing import Tuple
+from typing import Tuple, Union
 
 import torch
 from einops import rearrange, repeat
@@ -99,7 +99,7 @@ def get_unpad_data(
 
 
 def unpad_input(
-    q: torch.Tensor,
+    q: Union[torch.Tensor, Tuple[torch.Tensor, ...]],
     states: Tuple[torch.Tensor],
     attention_mask: torch.Tensor,
     q_len: int,
@@ -111,8 +111,9 @@ def unpad_input(
 
 
     Arguments:
-        q (`torch.Tensor`):
+        q (`torch.Tensor` or `Tuple[torch.Tensor]`):
             Query state with padding. Shape: [batch_size, q_len, ...].
+            When it is a tuple, do unpadding for each tensor in the tuple.
         states (`Tuple[torch.Tensor]`):
             Attention state with padding. Shape: [batch_size, seq_len, ...].
         attention_mask (`torch.Tensor`):
@@ -123,19 +124,20 @@ def unpad_input(
             Whether to keep the batch dimension. Default: `False`.
 
     Return:
-        q (`torch.Tensor`):
+        q (`torch.Tensor` or `Tuple[torch.Tensor]`):
             Query state without padding.
             Shape: [1, total_target_length, ...] if `keepdim=True` else [total_target_length, ...].
+            When the `q` passed in is a tuple, return a tuple of such unpadded tensors.
         states (`Tuple[torch.Tensor]`):
             Attention state without padding.
             Shape: [1, total_source_length, ...] if `keepdim=True` else [total_source_length, ...].
         indices_q (`torch.Tensor`):
             The indices of non-masked tokens from the flattened input target sequence.
-        (cu_seqlens_q, cu_seqlens_k) (`Tuple[int]`):
+        (cu_seqlens_q, cu_seqlens_k) (`Tuple[torch.LongTensor, torch.LongTensor]`):
             The cumulative sequence lengths for the target (query) and source (key, value),
             used to index into ragged (unpadded) tensors.
             `cu_seqlens` shape is [batch_size + 1].
-        (max_seqlen_in_batch_q, max_seqlen_in_batch_k) (`Tuple[int]`):
+        (max_seqlen_in_batch_q, max_seqlen_in_batch_k) (`Tuple[int, int]`):
             Maximum sequence length in batch (`max_seqlen_in_batch_q` for the target sequence
             i.e. query, `max_seqlen_in_batch_k` for the source sequence i.e. key/value).
     """
@@ -146,23 +148,30 @@ def unpad_input(
         index_first_axis(rearrange(s, "b s ... -> (b s) ..."), indices_k)
         for s in states
     )
+    if isinstance(q, torch.Tensor):
+        q = (q,)
+        cast_tuple = True
+    else:
+        cast_tuple = False
 
     if q_len == seq_len:
-        q = index_first_axis(rearrange(q, "b s ... -> (b s) ..."), indices_k)
+        q = tuple(index_first_axis(rearrange(q_, "b s ... -> (b s) ..."), indices_k) for q_ in q)
         cu_seqlens_q = cu_seqlens_k
         max_seqlen_in_batch_q = max_seqlen_in_batch_k
         indices_q = indices_k
     elif q_len == 1:
         max_seqlen_in_batch_q = 1
-        cu_seqlens_q = torch.arange(batch_size + 1, dtype=torch.int32, device=q.device)
+        cu_seqlens_q = torch.arange(batch_size + 1, dtype=torch.int32, device=q[0].device)
         indices_q = cu_seqlens_q[:-1]
-        q = q.squeeze(1)
+        q = tuple(q_.squeeze(1) for q_ in q)
     else:
         raise NotImplementedError("We only support either q_len == k_len (prefilling) or q_len == 1 (decoding)")
 
     if keepdim:
-        q = q.unsqueeze(0)
+        q = tuple(q_.unsqueeze(0) for q_ in q)
         state = tuple(s.unsqueeze(0) for s in state)
+    if cast_tuple:
+        q = q[0]
 
     return (
         q,
diff --git a/fla/ops/nsa/compression.py b/fla/ops/nsa/compression.py
index eda416190..8ace86eec 100644
--- a/fla/ops/nsa/compression.py
+++ b/fla/ops/nsa/compression.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
 
-from typing import Optional
+from typing import Optional, Tuple, Union
 
 import torch
 import triton
@@ -14,7 +14,7 @@
 
 
 @triton.heuristics({
-    'IS_VARLEN': lambda args: args['cu_seqlens'] is not None
+    'IS_VARLEN': lambda args: args['cu_seqlens_q'] is not None
 })
 @triton.autotune(
     configs=[
@@ -31,10 +31,12 @@ def parallel_nsa_compression_fwd_kernel(
     o,
     lse,
     scale,
-    cu_seqlens,
-    token_indices,
+    cu_seqlens_q,
+    cu_seqlens_k,
+    token_indices_q,
     chunk_offsets,
-    T,
+    TQ,
+    TK,
     H: tl.constexpr,
     HQ: tl.constexpr,
     G: tl.constexpr,
@@ -50,28 +52,31 @@ def parallel_nsa_compression_fwd_kernel(
     i_b, i_h = i_bh // H, i_bh % H
 
     if IS_VARLEN:
-        i_n, i_t = tl.load(token_indices + i_t * 2).to(tl.int32), tl.load(token_indices + i_t * 2 + 1).to(tl.int32)
-        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
-        T = eos - bos
+        i_n, i_t = tl.load(token_indices_q + i_t * 2).to(tl.int32), tl.load(token_indices_q + i_t * 2 + 1).to(tl.int32)
+        bos_q, eos_q = tl.load(cu_seqlens_q + i_n).to(tl.int32), tl.load(cu_seqlens_q + i_n + 1).to(tl.int32)
+        bos_k, eos_k = tl.load(cu_seqlens_k + i_n).to(tl.int32), tl.load(cu_seqlens_k + i_n + 1).to(tl.int32)
+        TQ = eos_q - bos_q
+        TK = eos_k - bos_k
+        TC = tl.cdiv(TK, BS)
         boc = tl.load(chunk_offsets + i_n).to(tl.int32)
     else:
-        bos, eos = i_b * T, i_b * T + T
-        boc = i_b * tl.cdiv(T, BS)
-
-    p_q = tl.make_block_ptr(q + (bos + i_t) * HQ*K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), (1, 0))
+        bos_q, eos_q = i_b * TQ, i_b * TQ + TQ
+        TC = tl.cdiv(TK, BS)
+        boc = i_b * TC
 
+    p_q = tl.make_block_ptr(q + (bos_q + i_t) * HQ*K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), (1, 0))
+    Q_OFFSET = TK - TQ
     # the Q block is kept in the shared memory throughout the whole kernel
     # [G, BK]
     b_q = tl.load(p_q, boundary_check=(0, 1))
     b_q = (b_q * scale).to(b_q.dtype)
 
-    # the number of compression representations in total
-    TC = tl.cdiv(T, BS)
     # the number of compression representations required to iterate over
     # incomplete compression blocks are not included
-    NC = (i_t + 1) // BS
+    # Here we assume that q tokens are last TQ tokens
+    NC = (i_t + Q_OFFSET + 1) // BS
 
-    p_o = tl.make_block_ptr(o + (bos + i_t) * HQ*V, (HQ, V), (V, 1), (i_h * G, i_v * BV), (G, BV), (1, 0))
+    p_o = tl.make_block_ptr(o + (bos_q + i_t) * HQ*V, (HQ, V), (V, 1), (i_h * G, i_v * BV), (G, BV), (1, 0))
     # [G, BV]
     b_o = tl.zeros([G, BV], dtype=tl.float32)
     # max scores for the current block
@@ -80,7 +85,7 @@ def parallel_nsa_compression_fwd_kernel(
     b_acc = tl.zeros([G], dtype=tl.float32)
 
     for i_c in range(0, NC, BC):
-        o_c = i_c + tl.arange(0, BC)
+        o_c = i_c + tl.arange(0, BC)  # block idx
 
         p_k = tl.make_block_ptr(k + (boc * H + i_h) * K, (K, TC), (1, H*K), (0, i_c), (BK, BC), (0, 1))
         p_v = tl.make_block_ptr(v + (boc * H + i_h) * V, (TC, V), (H*V, 1), (i_c, i_v * BV), (BC, BV), (1, 0))
@@ -90,6 +95,8 @@ def parallel_nsa_compression_fwd_kernel(
         b_v = tl.load(p_v, boundary_check=(0, 1))
         # [G, BC]
         b_s = tl.dot(b_q, b_k)
+        # Causal mask; note that NC is the compressed-block idx of q_idx + 1,
+        # i.e. number of blocks that need to be attended to
         b_s = tl.where((o_c < NC)[None, :], b_s, float('-inf'))
 
         # [G]
@@ -99,11 +106,10 @@ def parallel_nsa_compression_fwd_kernel(
         b_p = exp(b_s - b_m[:, None])
         # [G]
         b_acc = b_acc * b_r + tl.sum(b_p, 1)
-
         # [G, BV]
         b_o = b_o * b_r[:, None] + tl.dot(b_p.to(b_q.dtype), b_v)
 
-        b_mp = b_m
+        # b_mp = b_m
     if NC == 0:
         b_lse = tl.zeros([G], dtype=tl.float32)
     else:
@@ -112,7 +118,7 @@ def parallel_nsa_compression_fwd_kernel(
 
     tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1))
     if i_v == 0:
-        tl.store(lse + (bos + i_t) * HQ + i_h * G + tl.arange(0, G), b_lse.to(lse.dtype.element_ty))
+        tl.store(lse + (bos_q + i_t) * HQ + i_h * G + tl.arange(0, G), b_lse.to(lse.dtype.element_ty))
 
 
 @triton.heuristics({
@@ -320,12 +326,14 @@ def parallel_nsa_compression_fwd(
     q: torch.Tensor,
     k: torch.Tensor,
     v: torch.Tensor,
+    TK: int,
     block_size: int,
     scale: float,
-    cu_seqlens: Optional[torch.LongTensor] = None,
-    token_indices: Optional[torch.LongTensor] = None,
+    cu_seqlens_q: Optional[torch.LongTensor] = None,
+    cu_seqlens_k: Optional[torch.LongTensor] = None,
+    token_indices_q: Optional[torch.LongTensor] = None,
 ):
-    B, T, HQ, K, V = *q.shape, v.shape[-1]
+    B, TQ, HQ, K, V = *q.shape, v.shape[-1]
     H = k.shape[2]
     G = HQ // H
     BC = BS = block_size
@@ -339,11 +347,11 @@ def parallel_nsa_compression_fwd(
     NV = triton.cdiv(V, BV)
     assert NK == 1, "The key dimension can not be larger than 256"
 
-    chunk_offsets = prepare_chunk_offsets(cu_seqlens, BS) if cu_seqlens is not None else None
+    chunk_offsets = prepare_chunk_offsets(cu_seqlens_k, BS) if cu_seqlens_k is not None else None
 
-    grid = (T, NV, B * H)
-    o = torch.empty(B, T, HQ, V, dtype=v.dtype, device=q.device)
-    lse = torch.empty(B, T, HQ, dtype=torch.float, device=q.device)
+    grid = (TQ, NV, B * H)
+    o = torch.empty(B, TQ, HQ, V, dtype=v.dtype, device=q.device)
+    lse = torch.empty(B, TQ, HQ, dtype=torch.float, device=q.device)
 
     parallel_nsa_compression_fwd_kernel[grid](
         q=q,
@@ -352,10 +360,12 @@ def parallel_nsa_compression_fwd(
         o=o,
         lse=lse,
         scale=scale,
-        cu_seqlens=cu_seqlens,
-        token_indices=token_indices,
+        cu_seqlens_q=cu_seqlens_q,
+        cu_seqlens_k=cu_seqlens_k,
+        token_indices_q=token_indices_q,
         chunk_offsets=chunk_offsets,
-        T=T,
+        TQ=TQ,
+        TK=TK,
         H=H,
         HQ=HQ,
         G=G,
@@ -468,6 +478,7 @@ def forward(
         q,
         k,
         v,
+        TK,
         block_size,
         scale,
         cu_seqlens
@@ -478,20 +489,30 @@ def forward(
         # for example, if the passed `cu_seqlens` is [0, 2, 6],
         # then there are 2 and 4 tokens in the 1st and 2nd sequences respectively, and `token_indices` will be
         # [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [1, 3]]
-        token_indices = prepare_token_indices(cu_seqlens) if cu_seqlens is not None else None
+        if cu_seqlens is not None:
+            if isinstance(cu_seqlens, tuple):
+                cu_seqlens_q, cu_seqlens_k = cu_seqlens
+            else:
+                cu_seqlens_q = cu_seqlens_k = cu_seqlens
+            token_indices_q = prepare_token_indices(cu_seqlens_q)
+        else:
+            cu_seqlens_q = cu_seqlens_k = token_indices_q = None
 
         o, lse = parallel_nsa_compression_fwd(
             q=q,
             k=k,
             v=v,
+            TK=TK,
             block_size=block_size,
             scale=scale,
-            cu_seqlens=cu_seqlens,
-            token_indices=token_indices
+            cu_seqlens_q=cu_seqlens_q,
+            cu_seqlens_k=cu_seqlens_k,
+            token_indices_q=token_indices_q
         )
         ctx.save_for_backward(q, k, v, o, lse)
-        ctx.cu_seqlens = cu_seqlens
-        ctx.token_indices = token_indices
+        # Use cu_seqlens of q in backward, as cu_seqlens for q & k are different only for inference
+        ctx.cu_seqlens = cu_seqlens_q
+        ctx.token_indices = token_indices_q
         ctx.block_size = block_size
         ctx.scale = scale
         return o.to(q.dtype), lse
@@ -513,16 +534,17 @@ def backward(ctx, do, *args):
             cu_seqlens=ctx.cu_seqlens,
             token_indices=ctx.token_indices
         )
-        return dq.to(q), dk.to(k), dv.to(v), None, None, None
+        return dq.to(q), dk.to(k), dv.to(v), None, None, None, None
 
 
 def parallel_nsa_compression(
     q: torch.Tensor,
     k: torch.Tensor,
     v: torch.Tensor,
+    TK: int,
     block_size: int = 64,
     scale: float = None,
-    cu_seqlens: Optional[torch.LongTensor] = None
+    cu_seqlens: Union[None, torch.LongTensor, Tuple[torch.LongTensor, torch.LongTensor]] = None
 ):
     if scale is None:
         scale = k.shape[-1] ** -0.5
@@ -530,6 +552,7 @@ def parallel_nsa_compression(
         q,
         k,
         v,
+        TK,
         block_size,
         scale,
         cu_seqlens
diff --git a/fla/ops/nsa/naive.py b/fla/ops/nsa/naive.py
index 94afef52c..be141b233 100644
--- a/fla/ops/nsa/naive.py
+++ b/fla/ops/nsa/naive.py
@@ -2,49 +2,63 @@
 # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
 
 import warnings
-from typing import Optional
+from typing import Optional, Tuple, Union
 
 import torch
 from einops import repeat
+from torch.nn.attention.flex_attention import and_masks, create_block_mask, flex_attention
 
+from fla.ops.utils import prepare_chunk_offsets, prepare_token_indices
+from fla.ops.utils.pooling import mean_pooling
 
-def naive_nsa(
+try:
+    from flash_attn import flash_attn_func, flash_attn_varlen_func
+except ImportError:
+    warnings.warn(
+        "Flash Attention is not installed. Please install it via `pip install flash-attn --no-build-isolation`",
+        category=ImportWarning
+    )
+    flash_attn_func = None
+
+
+def naive_nsa_sel(
     q: torch.Tensor,
     k: torch.Tensor,
     v: torch.Tensor,
     block_indices: torch.LongTensor,
     block_size: int = 64,
     scale: Optional[float] = None,
-    cu_seqlens: Optional[torch.LongTensor] = None,
+    cu_seqlens: Union[None, torch.LongTensor, Tuple[torch.LongTensor, torch.LongTensor]] = None,
     head_first: bool = False
 ) -> torch.Tensor:
     r"""
     Args:
         q (torch.Tensor):
-            queries of shape `[B, T, HQ, K]`..
+            queries of shape `[B, TQ, HQ, K]`..
         k (torch.Tensor):
             keys of shape `[B, T, H, K]`.
             GQA is enforced here. The ratio of query heads (HQ) to key/value heads (H) must be a power of 2 and >=16.
         v (torch.Tensor):
             values of shape `[B, T, H, V]`.
         block_indices (torch.LongTensor):
-            Block indices of shape `[B, T, H, S]` if `head_first=False` else `[B, H, T, S]`.
+            Block indices of shape `[B, TQ, H, S]` if `head_first=False` else `[B, H, TQ, S]`.
             `S` is the number of selected blocks for each query token, which is set to 16 in the paper.
         block_size (int):
             Selected block size. Default: 64.
         scale (Optional[float]):
             Scale factor for attention scores.
             If not provided, it will default to `1 / sqrt(K)`. Default: `None`.
-        cu_seqlens (torch.LongTensor):
+        cu_seqlens (torch.LongTensor, Tuple[torch.LongTensor, torch.LongTensor] or None):
             Cumulative sequence lengths of shape `[N+1]` used for variable-length training,
             consistent with the FlashAttention API.
+            When a tuple is provided, it should contain two tensors: `(cu_seqlens_q, cu_seqlens_k)`.
         head_first (Optional[bool]):
             Whether the inputs are in the head-first format. Default: `False`.
             This argument has been deprecated.
 
     Returns:
         o (torch.Tensor):
-            Outputs of shape `[B, T, HQ, V]`.
+            Outputs of shape `[B, TQ, HQ, V]`.
     """
     if scale is None:
         scale = k.shape[-1] ** -0.5
@@ -66,38 +80,307 @@ def naive_nsa(
     BS = block_size
     k, v, block_indices = (repeat(x, 'b t h d -> b t (h g) d', g=G) for x in (k, v, block_indices))
     q, k, v = map(lambda x: x.float(), (q, k, v))
+    B = q.shape[0]
 
     o = torch.zeros_like(v)
     varlen = True
     if cu_seqlens is None:
         varlen = False
-        B, T = q.shape[:2]
-        cu_seqlens = torch.cat([
-            block_indices.new_tensor(range(0, B*T, T)), block_indices.new_tensor([B*T])
-        ])
+        Tq = Tk = q.shape[1]
+        cu_q = torch.cat([
+            block_indices.new_tensor(range(0, B * Tq, Tq)), block_indices.new_tensor([B * Tq])
+        ]).to(device=q.device)
+        cu_k = torch.cat([
+            block_indices.new_tensor(range(0, B * Tk, Tk)), block_indices.new_tensor([B * Tk])
+        ]).to(device=q.device)
+    else:
+        if isinstance(cu_seqlens, tuple):
+            cu_q, cu_k = cu_seqlens
+        else:
+            cu_q = cu_k = cu_seqlens
 
-    for i in range(len(cu_seqlens) - 1):
+    for i in range(len(cu_q) - 1):
         if not varlen:
             q_b, k_b, v_b, i_b = q[i], k[i], v[i], block_indices[i]
         else:
-            T = cu_seqlens[i+1] - cu_seqlens[i]
-            q_b, k_b, v_b, i_b = map(lambda x: x[0][cu_seqlens[i]:cu_seqlens[i+1]], (q, k, v, block_indices))
-
+            Tq = cu_q[i+1] - cu_q[i]
+            Tk = cu_k[i+1] - cu_k[i]
+            q_b, k_b, v_b, i_b = (q[0][cu_q[i]:cu_q[i+1]], k[0][cu_k[i]:cu_k[i+1]],
+                                  v[0][cu_k[i]:cu_k[i+1]], block_indices[0][cu_q[i]:cu_q[i+1]])
+        assert Tq == Tk, "TQ != TK case is not supported in naive_nsa_sel"
         i_b = i_b.unsqueeze(-1) * BS + i_b.new_tensor(range(BS))
         # [T, S*BS, HQ]
-        i_b = i_b.view(T, block_indices.shape[2], -1).transpose(1, 2)
-        for i_q in range(T):
+        i_b = i_b.view(Tq, block_indices.shape[2], -1).transpose(1, 2)
+        for i_q in range(Tq):
             # [HQ, D]
             q_i = q_b[i_q] * scale
             # [S*BS, HQ]
             i_i = i_b[i_q]
             # [S*BS, HQ, -1]
-            k_i, v_i = map(lambda x: x.gather(0, i_i.clamp(0, T-1).unsqueeze(-1).expand(*i_i.shape, x.shape[-1])), (k_b, v_b))
+            k_i, v_i = map(lambda x: x.gather(0, i_i.clamp(0, Tk-1).unsqueeze(-1).expand(*i_i.shape, x.shape[-1])),
+                           (k_b, v_b))
             # [S*BS, HQ]
-            attn = torch.einsum('h d, n h d -> n h', q_i, k_i).masked_fill(i_i > i_q, float('-inf')).softmax(0)
+            attn = torch.einsum('h d, n h d -> n h', q_i, k_i).masked_fill(
+                torch.logical_or(i_i > i_q, i_i < 0), float('-inf')).softmax(0)
             if not varlen:
                 o[i, i_q] = torch.einsum('n h, n h v -> h v', attn, v_i)
             else:
-                o[0][cu_seqlens[i]+i_q] = torch.einsum('n h, n h v -> h v', attn, v_i)
+                o[0][cu_q[i] + i_q] = torch.einsum('n h, n h v -> h v', attn, v_i)
 
     return o.to(dtype)
+
+
+def naive_nsa_cmp(q, k_cmp, v_cmp, block_size, scale, cu_seqlens=None):
+    if cu_seqlens is not None:
+        seq_indices = prepare_token_indices(cu_seqlens)
+        kv_cu_seqlens = prepare_chunk_offsets(cu_seqlens, block_size)
+        kv_indices = prepare_token_indices(kv_cu_seqlens)
+        q_b, q_i = seq_indices[:, 0], seq_indices[:, 1]
+        kv_b, kv_i = kv_indices[:, 0], kv_indices[:, 1]
+
+        @torch.compile
+        def varlen_mask(b, h, q_idx, kv_idx):
+            return q_b[q_idx] == kv_b[kv_idx]
+
+        @torch.compile
+        def shifted_varlen_mask(b, h, q_idx, kv_idx):
+            return q_i[q_idx] >= (kv_i[kv_idx] + 1) * block_size - 1
+
+        cmp_mask = and_masks(varlen_mask, shifted_varlen_mask)
+    else:
+        @torch.compile
+        def cmp_mask(b, h, q_idx, kv_idx):
+            return q_idx >= (kv_idx + 1) * block_size - 1
+    B, H, TQ, TKV = q.shape[0], k_cmp.shape[1], q.shape[1], k_cmp.shape[1]
+    block_mask = create_block_mask(cmp_mask, B, H, TQ, TKV)
+
+    o_cmp, lse_cmp = flex_attention(
+        q.transpose(1, 2),
+        k_cmp.transpose(1, 2),
+        v_cmp.transpose(1, 2),
+        block_mask=block_mask,
+        enable_gqa=True,
+        return_lse=True,
+        scale=scale,
+    )
+    return o_cmp.transpose(1, 2), lse_cmp.transpose(1, 2)
+
+
+def naive_nsa_topk(
+    q: torch.Tensor,               # [B, T_q, Hq, D]
+    k_cmp: torch.Tensor,           # [B, T_C, Hkv, D]  (T_C = #compressed blocks)
+    block_counts: Union[int, torch.Tensor],  # int or [B, T_q, Hkv]
+    block_size: int,
+    scale: float,
+    cu_seqlens: Union[None, torch.LongTensor, Tuple[torch.LongTensor, torch.LongTensor]] = None,
+) -> torch.Tensor:
+    B, Tq, Hq, _ = q.shape
+    Hkv = k_cmp.shape[2]
+    G = Hq // Hkv
+    k_cmp = repeat(k_cmp, 'b t h d -> b t (h g) d', g=G)
+
+    device = q.device
+    varlen = True
+    if cu_seqlens is None:
+        varlen = False
+        Tq = q.shape[1]
+        Tc = k_cmp.shape[1]
+        cu_q = torch.cat([
+            torch.arange(0, B * Tq, Tq), torch.tensor([B * Tq])
+        ])
+        cu_k = torch.cat([
+            torch.arange(0, B * Tc, Tc), torch.tensor([B * Tc])
+        ])
+    else:
+        assert B == 1
+        if isinstance(cu_seqlens, tuple):
+            cu_q, cu_k = cu_seqlens
+        else:
+            cu_q = cu_k = cu_seqlens
+        cu_k = prepare_chunk_offsets(cu_k, block_size)
+
+    if isinstance(block_counts, int):
+        S = int(block_counts)
+        assert S >= 0, "block_counts (int) must be >= 0"
+    elif torch.is_tensor(block_counts):
+        S = int(block_counts.max().item())
+    result = torch.full((B, Tq, Hkv, S), -1, device=device, dtype=torch.long)
+
+    for i in range(len(cu_q) - 1):
+        if not varlen:
+            q_b, k_b = q[i], k_cmp[i]
+        else:
+            Tq = (cu_q[i+1] - cu_q[i]).item()
+            Tc = (cu_k[i+1] - cu_k[i]).item()
+            q_b, k_b = q[0][cu_q[i]:cu_q[i+1]], k_cmp[0][cu_k[i]:cu_k[i+1]]
+
+        logits = torch.einsum('t h d, s h d -> t h s', q_b, k_b) * scale  # [Tq, Hq, Tc]
+        logits = logits.reshape(Tq, Hkv, G, Tc)
+        t = torch.arange(Tq, device=device).unsqueeze(1)
+        s = torch.arange(Tc, device=device).unsqueeze(0)
+        block_last_pos = (s + 1) * block_size - 1
+        base_allow = (block_last_pos <= t)  # [Tq,Tc]
+
+        i_qb = (t // block_size)                                                 # [Tq,1]
+        is_current_block = (s == i_qb) | (s == 0) | (s == i_qb - 1)              # [Tq,Tc]
+        logits = logits.masked_fill(~base_allow[:, None, None, :], float("-inf"))
+        allow = base_allow | is_current_block  # [Tq,Tc]
+
+        probs_q = torch.softmax(logits, dim=-1)  # [Tq, Hkv, G, Tc]
+        probs_q = torch.nan_to_num(probs_q, nan=0.0)  # rows with no valid blocks -> 0
+        scores = probs_q.mean(dim=2)  # [Tq, Hkv, Tc]
+        scores = torch.where(is_current_block[:, None, :], 1.0, scores)
+
+        if isinstance(block_counts, int):
+            desired_k = torch.full((Tq, Hkv), S, dtype=torch.long, device=device)
+        elif torch.is_tensor(block_counts):
+            if varlen:
+                assert block_counts.shape == (1, Tq, Hkv)
+                desired_k = block_counts[0].to(device=device, dtype=torch.long)
+            else:
+                assert block_counts.shape == (B, Tq, Hkv)
+                desired_k = block_counts[i].to(device=device, dtype=torch.long)
+        else:
+            raise TypeError("block_counts must be int or torch.Tensor")
+
+        _, topi = torch.topk(scores, k=min(S, Tc), dim=-1)                             # [Tq,Hkv,S]
+
+        # Validate selections against allow mask; pad with -1 where invalid or beyond quota
+        allow_kv = allow[:, None, :].expand(Tq, Hkv, Tc)                      # [Tq,Hkv,Tc]
+        sel_allowed = torch.gather(allow_kv.long(), dim=-1, index=topi).bool()    # [Tq,Hkv,S]
+
+        idx = torch.arange(S, device=device).view(1, 1, S)
+        within_quota = (idx < desired_k.unsqueeze(-1))[:, :, :Tc]                              # [Tq,Hkv,S]
+
+        keep = sel_allowed & within_quota
+        out = torch.full_like(topi, fill_value=-1)                                # pad with -1
+        out = torch.where(keep, topi, out)
+
+        if S > Tc:
+            out = torch.cat((out, torch.full((Tq, Hkv, S - Tc), -1,
+                                             device=device, dtype=topi.dtype)), dim=-1)
+        if varlen:
+            result[0, cu_q[i]:cu_q[i+1]] = out
+        else:
+            result[i] = out
+    return result
+
+
+def naive_nsa(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g_cmp: Optional[torch.Tensor] = None,
+    g_slc: Optional[torch.Tensor] = None,
+    g_swa: Optional[torch.Tensor] = None,
+    block_indices: Optional[torch.LongTensor] = None,
+    block_counts: Union[torch.LongTensor, int] = 16,
+    block_size: int = 64,
+    window_size: int = 0,
+    scale: Optional[float] = None,
+    cu_seqlens: Union[None, torch.LongTensor, Tuple[torch.LongTensor, torch.LongTensor]] = None,
+    return_block_indices: bool = False,
+) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.LongTensor]]:
+    r"""
+    Args:
+        q (torch.Tensor):
+            queries of shape `[B, TQ, HQ, K]`.
+        k (torch.Tensor):
+            keys of shape `[B, T, H, K]`.
+            GQA is enforced here. The ratio of query heads (HQ) to key/value heads (H) must be a power of 2 and >=16.
+        v (torch.Tensor):
+            values of shape `[B, T, H, V]`.
+        g_cmp (torch.Tensor):
+            Gate score for compressed attention of shape `[B, TQ, HQ]`.
+        g_slc (torch.Tensor):
+            Gate score for selected attention of shape `[B, TQ, HQ]`.
+        g_swa (torch.Tensor):
+            Gate score for sliding attentionof shape `[B, TQ, HQ]`.
+        block_indices (torch.LongTensor):
+            Block indices of shape `[B, TQ, H, S]`.
+            `S` is the number of selected blocks for each query token, which is set to 16 in the paper.
+            If `g_cmp` is provided, the passed `block_indices` will be ignored.
+        block_counts (Optional[Union[torch.LongTensor, int]]):
+            Number of selected blocks for each query.
+            If a tensor is provided, with shape `[B, TQ, H]`,
+            each query can select the same number of blocks.
+            If not provided, it will default to 16.
+        block_size (int):
+            Selected block size. Default: 64.
+        window_size (int):
+            Sliding window size. Default: 0.
+        scale (Optional[float]):
+            Scale factor for attention scores.
+            If not provided, it will default to `1 / sqrt(K)`. Default: `None`.
+        cu_seqlens (torch.LongTensor, Tuple[torch.LongTensor, torch.LongTensor] or None):
+            Cumulative sequence lengths of shape `[N+1]` used for variable-length training,
+            consistent with the FlashAttention API.
+            When a tuple is provided, it should contain two tensors: `(cu_seqlens_q, cu_seqlens_k)`.
+
+    Returns:
+        o (torch.Tensor):
+            Outputs of shape `[B, T, HQ, V]`.
+    """
+    assert block_counts is not None, "block counts must be provided for selection"
+    if scale is None:
+        scale = k.shape[-1] ** -0.5
+    if cu_seqlens is not None:
+        assert q.shape[0] == 1, "batch size must be 1 when cu_seqlens are provided"
+    assert q.shape[2] % (k.shape[2] * 16) == 0, "Group size must be a multiple of 16 in NSA"
+
+    if cu_seqlens is not None:
+        if isinstance(cu_seqlens, tuple):
+            cu_seqlens_q, cu_seqlens_k = cu_seqlens
+        else:
+            cu_seqlens_q = cu_seqlens_k = cu_seqlens
+    else:
+        cu_seqlens_q = cu_seqlens_k = None
+
+    k_cmp, v_cmp = mean_pooling(k, block_size, cu_seqlens), mean_pooling(v, block_size, cu_seqlens)
+    o_cmp, lse_cmp = None, None
+    if g_cmp is not None:
+        o_cmp, lse_cmp = naive_nsa_cmp(
+            q=q,
+            k_cmp=k_cmp,
+            v_cmp=v_cmp,
+            block_size=block_size,
+            scale=scale,
+            cu_seqlens=cu_seqlens
+        )
+        if block_indices is not None:
+            warnings.warn("`block_indices` will be ignored when `g_cmp` is provided")
+        block_indices = naive_nsa_topk(
+            q=q,
+            k_cmp=k_cmp,
+            block_counts=block_counts,
+            block_size=block_size,
+            scale=scale,
+            cu_seqlens=cu_seqlens
+        )
+    o = o_slc = naive_nsa_sel(q, k, v, block_indices, block_size, scale, cu_seqlens)
+    if g_slc is not None:
+        o = o_slc * g_slc.unsqueeze(-1)
+    if o_cmp is not None:
+        o = torch.addcmul(o, o_cmp, g_cmp.unsqueeze(-1))
+    if window_size > 0:
+        if cu_seqlens is not None:
+            o_swa = flash_attn_varlen_func(
+                q.squeeze(0), k.squeeze(0), v.squeeze(0),
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_k=cu_seqlens_k,
+                max_seqlen_q=q.shape[1],
+                max_seqlen_k=k.shape[1],
+                causal=True,
+                window_size=(window_size-1, 0)
+            ).unsqueeze(0)
+        else:
+            o_swa = flash_attn_func(
+                q, k, v,
+                causal=True,
+                window_size=(window_size-1, 0)
+            )
+        o = torch.addcmul(o, o_swa, g_swa.unsqueeze(-1))
+    if return_block_indices:
+        return o, block_indices
+    else:
+        return o
diff --git a/fla/ops/nsa/parallel.py b/fla/ops/nsa/parallel.py
index badf15502..bf7bf1f3a 100644
--- a/fla/ops/nsa/parallel.py
+++ b/fla/ops/nsa/parallel.py
@@ -2,7 +2,7 @@
 # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
 
 import warnings
-from typing import Optional, Union
+from typing import Optional, Tuple, Union
 
 import torch
 import triton
@@ -27,7 +27,7 @@
 
 
 @triton.heuristics({
-    'IS_VARLEN': lambda args: args['cu_seqlens'] is not None
+    'IS_VARLEN': lambda args: args['cu_seqlens_q'] is not None
 })
 @triton.autotune(
     configs=[
@@ -43,10 +43,12 @@ def parallel_nsa_kernel_topk(
     lse,
     scale,
     block_indices,
-    cu_seqlens,
-    token_indices,
+    cu_seqlens_q,
+    cu_seqlens_k,
+    token_indices_q,
     chunk_offsets,
-    T,
+    TQ,
+    TK,
     H: tl.constexpr,
     HQ: tl.constexpr,
     G: tl.constexpr,
@@ -61,31 +63,35 @@ def parallel_nsa_kernel_topk(
     i_b, i_h = i_bh // H, i_bh % H
 
     if IS_VARLEN:
-        i_n, i_t = tl.load(token_indices + i_t * 2).to(tl.int32), tl.load(token_indices + i_t * 2 + 1).to(tl.int32)
-        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
-        T = eos - bos
+        i_n, i_t = tl.load(token_indices_q + i_t * 2).to(tl.int32), tl.load(token_indices_q + i_t * 2 + 1).to(tl.int32)
+        bos_q, eos_q = tl.load(cu_seqlens_q + i_n).to(tl.int32), tl.load(cu_seqlens_q + i_n + 1).to(tl.int32)
+        bos_k, eos_k = tl.load(cu_seqlens_k + i_n).to(tl.int32), tl.load(cu_seqlens_k + i_n + 1).to(tl.int32)
+        TQ = eos_q - bos_q
+        TK = eos_k - bos_k
+        TC = tl.cdiv(TK, BS)
         boc = tl.load(chunk_offsets + i_n).to(tl.int32)
     else:
-        bos, eos = i_b * T, i_b * T + T
-        boc = i_b * tl.cdiv(T, BS)
-
-    p_q = tl.make_block_ptr(q + (bos + i_t) * HQ*K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), (1, 0))
+        bos_q, eos_q = i_b * TQ, i_b * TQ + TQ
+        TC = tl.cdiv(TK, BS)
+        boc = i_b * TC
+    # boc is the start of the current sequence at [B, TC] dimensions
+    p_q = tl.make_block_ptr(q + (bos_q + i_t) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), (1, 0))
+    Q_OFFSET = TK - TQ
 
     # the Q block is kept in the shared memory throughout the whole kernel
     # [G, BK]
     b_q = tl.load(p_q, boundary_check=(0, 1))
     b_q = (b_q * scale).to(b_q.dtype)
 
-    # the number of compression representations in total
-    TC = tl.cdiv(T, BS)
     # the number of compression representations required to iterate over
-    # incomplete compression blocks are not included
-    NC = (i_t + 1) // BS
+    # incomplete compression blocks are not included; hence if i_t is the last token in a block, the block will be included
+    # Here we assume that q tokens are last TQ tokens
+    NC = (i_t + Q_OFFSET + 1) // BS
     ################################
     # 1. lse computation
     ################################
     if lse is not None:
-        b_lse = tl.load(lse + (bos + i_t) * HQ + i_h * G + tl.arange(0, G))
+        b_lse = tl.load(lse + (bos_q + i_t) * HQ + i_h * G + tl.arange(0, G))
     else:
         # max scores for the current block
         b_m = tl.full([G], float('-inf'), dtype=tl.float32)
@@ -124,10 +130,11 @@ def parallel_nsa_kernel_topk(
     o_i = tl.zeros([BC], dtype=tl.int32)
     m_i = tl.arange(0, BC) < BC//2
 
-    IC = i_t // BS
-    for i_c in range(0, tl.cdiv(i_t + 1, BS), BC):
+    IC = (i_t + Q_OFFSET) // BS  # Idx of the current query block
+    for i_c in range(0, IC + 1, BC):  # +1, because the current block might be also included
         o_c = i_c + tl.arange(0, BC)
-
+        # Recall k: [B, TC, H, K], boc = i_b * TC
+        # we first shift to k[i_b, 0, i_h], and read a block of transposed keys from k[i_b, i_c, i_h]
         p_k = tl.make_block_ptr(k + (boc * H + i_h) * K, (K, TC), (1, H*K), (0, i_c), (BK, BC), (0, 1))
         # [BK, BC]
         b_k = tl.load(p_k, boundary_check=(0, 1))
@@ -135,10 +142,10 @@ def parallel_nsa_kernel_topk(
         b_s = tl.dot(b_q, b_k)
         b_s = tl.where(o_c < IC, b_s, float('-inf'))
         # [G, BC]
-        # the 1st and the last 2 blocks are always selected
+        # the 1st and the last 2 blocks are always selected, set normalized scores to 1.0
         b_p = tl.where((o_c == 0) | ((o_c == IC - 1) | (o_c == IC)), 1., exp(b_s - b_lse[:, None]))
         # the importance scores of the current block
-        # [BC]
+        # [BC], take the sum of attention scores over all heads within the current group (KV head)
         b_i, b_ip = tl.sum(b_p, 0), b_i
         # blocks with index < 0 will be skipped
         o_i, o_ip = tl.where(o_c <= IC, o_c, -1), o_i
@@ -155,15 +162,15 @@ def parallel_nsa_kernel_topk(
         else:
             b_i, o_i = _bitonic_merge(b_i, o_i.to(tl.int32), n_dims, True, n_dims)
 
-    m_top = tl.arange(0, BC//S) == 0
-    b_top = tl.sum(m_top[:, None] * tl.reshape(o_i, [BC//S, S]), 0)
+    m_top = tl.arange(0, BC // S) == 0
+    b_top = tl.sum(m_top[:, None] * tl.reshape(o_i, [BC // S, S]), 0)
 
-    p_b = tl.make_block_ptr(block_indices + (bos + i_t) * H*S, (H*S,), (1,), (i_h * S,), (S,), (0,))
+    p_b = tl.make_block_ptr(block_indices + (bos_q + i_t) * H*S, (H*S,), (1,), (i_h * S,), (S,), (0,))
     tl.store(p_b, b_top.to(p_b.dtype.element_ty))
 
 
 @triton.heuristics({
-    'IS_VARLEN': lambda args: args['cu_seqlens'] is not None,
+    'IS_VARLEN': lambda args: args['cu_seqlens_q'] is not None,
     'USE_BLOCK_COUNTS': lambda args: isinstance(args['block_counts'], torch.Tensor),
 })
 @triton.autotune(
@@ -183,9 +190,11 @@ def parallel_nsa_fwd_kernel(
     scale,
     block_indices,
     block_counts,
-    cu_seqlens,
-    token_indices,
-    T,
+    cu_seqlens_q,
+    cu_seqlens_k,
+    token_indices_q,
+    TQ,
+    TK,
     H: tl.constexpr,
     HQ: tl.constexpr,
     G: tl.constexpr,
@@ -198,50 +207,98 @@ def parallel_nsa_fwd_kernel(
     IS_VARLEN: tl.constexpr,
     USE_BLOCK_COUNTS: tl.constexpr
 ):
-    i_t, i_v, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+    i_t, i_v, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)  # i_t: token, i_v: value dim, i_bh: batch * kv head
     i_b, i_h = i_bh // H, i_bh % H
+    # k: [B, TK, H, K], v: [B, TK, H, V], q: [B, TQ, HQ, K]
+    # block_indices: [B, TQ, H, S]
+    # G = HQ // H, number of groups of heads
+    # lse: [B, TQ, HQ]
 
     if IS_VARLEN:
-        i_n, i_t = tl.load(token_indices + i_t * 2).to(tl.int32), tl.load(token_indices + i_t * 2 + 1).to(tl.int32)
-        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
-        T = eos - bos
+        # 2-d sequence indices denoting the cu_seqlens of tokens in each sequence
+        # for example, if the passed `cu_seqlens` is [0, 2, 6],
+        # then there are 2 and 4 tokens in the 1st and 2nd sequences respectively, and `token_indices` will be
+        # [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [1, 3]]
+        i_n, i_t = tl.load(token_indices_q + i_t * 2).to(tl.int32), tl.load(token_indices_q + i_t * 2 + 1).to(tl.int32)
+        # Then i_t becomes the token index inside the sequence, and i_n is the sequence index.
+        bos_q, eos_q = tl.load(cu_seqlens_q + i_n).to(tl.int32), tl.load(cu_seqlens_q + i_n + 1).to(tl.int32)
+        bos_k, eos_k = tl.load(cu_seqlens_k + i_n).to(tl.int32), tl.load(cu_seqlens_k + i_n + 1).to(tl.int32)
+        TQ = eos_q - bos_q
+        TK = eos_k - bos_k
     else:
-        bos, eos = i_b * T, i_b * T + T
+        bos_q, eos_q = i_b * TQ, i_b * TQ + TQ
+        bos_k, eos_k = i_b * TK, i_b * TK + TK
+    # Then i_t is always the token_idx inside each sequence
+    # bos, eos are the token_idx for * flattened * tokens, of the current sequence
 
-    k += (bos * H + i_h) * K
-    v += (bos * H + i_h) * V
-    block_indices += (bos + i_t) * H*S + i_h * S
+    # We assume that q tokens are logically the last TQ tokens in the current sequence
+    Q_OFFSET = TK - TQ
+
+    k += (bos_k * H + i_h) * K
+    v += (bos_k * H + i_h) * V
+    block_indices += (bos_q + i_t) * H * S + i_h * S
+
+    # k, v: shifted to the start of the current sequence at head i_h, i.e. k[i_b, 0, i_h]
+    # because bos is at the start of the current sequence at [B, T] dimensions
+    # bos + i_t is the index of the current token on [B, T] dimensions
+    # block_indices: shifted to block_indices[i_b, i_t, i_h]
 
+    # block_counts: [B, TQ, H]
     if USE_BLOCK_COUNTS:
-        NS = tl.load(block_counts + (bos + i_t) * H + i_h)
+        NS = tl.load(block_counts + (bos_q + i_t) * H + i_h)
     else:
         NS = S
 
-    p_q = tl.make_block_ptr(q + (bos + i_t) * HQ*K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), (1, 0))
+    # q: [B, TQ, HQ, K]
+    p_q = tl.make_block_ptr(
+        q + (bos_q + i_t) * HQ * K,  # base
+        (HQ, K), (K, 1),               # full tensor shape & strides
+        (i_h * G, 0), (G, BK), (1, 0),
+    )
+    # Note that i_h is the head index in KV, which corresponds to G heads in Q starting from i_h * G
+    # p_q then reads the BK dimensions at the last dimension
     # the Q block is kept in the shared memory throughout the whole kernel
     # [G, BK]
-    b_q = tl.load(p_q, boundary_check=(0, 1))
+    b_q = tl.load(p_q, boundary_check=(0, 1))  # note that BK >= K, but there is boundary check
     b_q = (b_q * scale).to(b_q.dtype)
 
-    p_o = tl.make_block_ptr(o + (bos + i_t) * HQ*V, (HQ, V), (V, 1), (i_h * G, i_v * BV), (G, BV), (1, 0))
-    p_lse = lse + (bos + i_t) * HQ + i_h * G + tl.arange(0, G)
+    p_o = tl.make_block_ptr(
+        o + (bos_q + i_t) * HQ * V,
+        (HQ, V), (V, 1),
+        (i_h * G, i_v * BV), (G, BV), (1, 0),
+    )
+    # Similar to p_q; but BV can be smaller than V, so it can be a sub-block of V
+    p_lse = lse + (bos_q + i_t) * HQ + i_h * G + tl.arange(0, G)
+    # lse + (bos + i_t) * HQ is the start of the current sequence at head i_h
+    # i_h * G is the offset for the current head, and tl.arange(0, G) is the offset for the group of heads
+
     # [G, BV]
     b_o = tl.zeros([G, BV], dtype=tl.float32)
 
-    b_m = tl.full([G], float('-inf'), dtype=tl.float32)
-    b_acc = tl.zeros([G], dtype=tl.float32)
-    for i in range(NS):
-        i_s = tl.load(block_indices + i).to(tl.int32) * BS
-        if i_s <= i_t and i_s >= 0:
-            p_k = tl.make_block_ptr(k, (K, T), (1, H*K), (0, i_s), (BK, BS), (0, 1))
-            p_v = tl.make_block_ptr(v, (T, V), (H*V, 1), (i_s, i_v * BV), (BS, BV), (1, 0))
-            # [BK, BS]
+    b_m = tl.full([G], float('-inf'), dtype=tl.float32)  # running maximum
+    b_acc = tl.zeros([G], dtype=tl.float32)  # sumexp
+    for i in range(NS):  # number of blocks
+        i_s = tl.load(block_indices + i).to(tl.int32) * BS  # i_s is the start token index of the current KV block
+        # Here we assume that q tokens are last TQ tokens
+        if i_s <= Q_OFFSET + i_t and i_s >= 0:
+            # Recall: k ([B, T, H, K]) already shifted to the start of the current sequence at head i_h, i.e. k[i_b, 0, i_h]
+            # k is loaded transponsed for the convenience of dot product
+            p_k = tl.make_block_ptr(k, (K, TK), (1, H*K), (0, i_s), (BK, BS), (0, 1))
+            p_v = tl.make_block_ptr(v, (TK, V), (H*V, 1), (i_s, i_v * BV), (BS, BV), (1, 0))
+            # [BK, BS], essentially read keys of all tokens in the block
             b_k = tl.load(p_k, boundary_check=(0, 1))
             # [BS, BV]
             b_v = tl.load(p_v, boundary_check=(0, 1))
-            # [G, BS]
+            # [G, BS], dot-product per head; recall b_q: [G, BK]
             b_s = tl.dot(b_q, b_k)
-            b_s = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s, float('-inf'))
+            # Ensure causal mask; note that i_t may be inside the current block
+            b_s = tl.where((Q_OFFSET + i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s, float('-inf'))
+
+            # Recall stable softmax:
+            # o_i = \sum_{j<=i} exp(s_j - m_i) * v_j
+            #     = exp(m_{i-1} - m_i) * o_{i-1} + exp(s_i - m_i) * v_i
+            # a_i = \sum_{j<=i} exp(s_j - m_i)
+            #     = exp(m_{i-1} - m_i) * a_{i-1} + exp(s_i - m_i)
 
             # [G]
             b_m, b_mp = tl.maximum(b_m, tl.max(b_s, 1)), b_m
@@ -249,11 +306,13 @@ def parallel_nsa_fwd_kernel(
             # [G, BS]
             b_p = exp(b_s - b_m[:, None])
             # [G]
-            b_acc = b_acc * b_r + tl.sum(b_p, 1)
-            # [G, BV]
+            b_acc = b_acc * b_r + tl.sum(b_p, 1)  # summed over T dimension
+            # [G, BV]; note that b_p is fp32, while b_q may not
             b_o = b_o * b_r[:, None] + tl.dot(b_p.to(b_q.dtype), b_v)
 
-            b_mp = b_m
+    # o = o_n / a_n
+    # lse = log( exp(m_n) * a_n )
+
     b_o = b_o / b_acc[:, None]
     b_m += log(b_acc)
     tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1))
@@ -490,17 +549,32 @@ def parallel_nsa_bwd_kernel_dkv(
     tl.store(p_dv, b_dv.to(p_dv.dtype.element_ty), boundary_check=(0, 1))
 
 
+@contiguous
 def parallel_nsa_topk(
     q: torch.Tensor,
     k: torch.Tensor,
-    lse: torch.Tensor,
+    TK: int,
+    lse: Optional[torch.Tensor],
     block_counts: Union[torch.LongTensor, int],
     block_size: int = 64,
     scale: float = None,
-    cu_seqlens: Optional[torch.LongTensor] = None,
+    cu_seqlens: Union[None, torch.LongTensor, Tuple[torch.LongTensor, torch.LongTensor]] = None,
 ) -> torch.LongTensor:
-    B, T, HQ, K = q.shape
-    H = k.shape[2]
+    B, TQ, HQ, K = q.shape
+    _, TC, H, _ = k.shape
+
+    assert k.shape[0] == q.shape[0] and k.shape[-1] == q.shape[-1], "The last dimension of k and q must match"
+    assert lse is None or lse.shape == (B, TQ, HQ), "The shape of lse must be (B, TQ, HQ)"
+
+    if cu_seqlens is not None:
+        if isinstance(cu_seqlens, tuple):
+            cu_seqlens_q, cu_seqlens_k = cu_seqlens
+        else:
+            cu_seqlens_q = cu_seqlens_k = cu_seqlens
+        token_indices_q = prepare_token_indices(cu_seqlens_q)
+    else:
+        cu_seqlens_q = cu_seqlens_k = token_indices_q = None
+
     G = HQ // H
     # the number of selected blocks for each token
     S = block_counts if isinstance(block_counts, int) else block_counts.max().item()
@@ -510,10 +584,9 @@ def parallel_nsa_topk(
     BK = max(triton.next_power_of_2(K), 16)
     assert BC >= 2 * S, f"BC ({BC}) must be greater than or equal to 2 * S ({S})"
 
-    block_indices = torch.zeros(B, T, H, S, dtype=torch.int32, device=q.device)
-    token_indices = prepare_token_indices(cu_seqlens) if cu_seqlens is not None else None
-    chunk_offsets = prepare_chunk_offsets(cu_seqlens, BS) if cu_seqlens is not None else None
-    grid = (T, B * H)
+    block_indices = torch.zeros(B, TQ, H, S, dtype=torch.int32, device=q.device)
+    chunk_offsets = prepare_chunk_offsets(cu_seqlens_k, BS) if cu_seqlens_k is not None else None
+    grid = (TQ, B * H)
     # the 1st and the last 2 blocks are always selected
     parallel_nsa_kernel_topk[grid](
         q=q,
@@ -521,10 +594,12 @@ def parallel_nsa_topk(
         lse=lse,
         scale=scale,
         block_indices=block_indices,
-        cu_seqlens=cu_seqlens,
-        token_indices=token_indices,
+        cu_seqlens_q=cu_seqlens_q,
+        cu_seqlens_k=cu_seqlens_k,
+        token_indices_q=token_indices_q,
         chunk_offsets=chunk_offsets,
-        T=T,
+        TQ=TQ,
+        TK=TK,
         H=H,
         HQ=HQ,
         G=G,
@@ -537,6 +612,7 @@ def parallel_nsa_topk(
     return block_indices
 
 
+@contiguous
 def parallel_nsa_fwd(
     q: torch.Tensor,
     k: torch.Tensor,
@@ -545,11 +621,12 @@ def parallel_nsa_fwd(
     block_counts: Union[torch.LongTensor, int],
     block_size: int,
     scale: float,
-    cu_seqlens: Optional[torch.LongTensor] = None,
-    token_indices: Optional[torch.LongTensor] = None,
+    cu_seqlens_q: Optional[torch.LongTensor] = None,
+    cu_seqlens_k: Optional[torch.LongTensor] = None,
+    token_indices_q: Optional[torch.LongTensor] = None,
 ):
-    B, T, H, K, V, S = *k.shape, v.shape[-1], block_indices.shape[-1]
-    HQ = q.shape[2]
+    B, T_kv, H, K, V, S = *k.shape, v.shape[-1], block_indices.shape[-1]
+    _, T_q, HQ, _ = q.shape
     G = HQ // H
     BS = block_size
     if check_shared_mem('hopper', q.device.index):
@@ -562,9 +639,9 @@ def parallel_nsa_fwd(
     NV = triton.cdiv(V, BV)
     assert NK == 1, "The key dimension can not be larger than 256"
 
-    grid = (T, NV, B * H)
-    o = torch.empty(B, T, HQ, V, dtype=v.dtype, device=q.device)
-    lse = torch.empty(B, T, HQ, dtype=torch.float, device=q.device)
+    grid = (T_q, NV, B * H)
+    o = torch.empty(B, T_q, HQ, V, dtype=v.dtype, device=q.device)
+    lse = torch.empty(B, T_q, HQ, dtype=torch.float, device=q.device)
 
     parallel_nsa_fwd_kernel[grid](
         q=q,
@@ -575,9 +652,11 @@ def parallel_nsa_fwd(
         scale=scale,
         block_indices=block_indices,
         block_counts=block_counts,
-        cu_seqlens=cu_seqlens,
-        token_indices=token_indices,
-        T=T,
+        cu_seqlens_q=cu_seqlens_q,
+        cu_seqlens_k=cu_seqlens_k,
+        token_indices_q=token_indices_q,
+        TQ=T_q,
+        TK=T_kv,
         H=H,
         HQ=HQ,
         G=G,
@@ -726,7 +805,14 @@ def forward(ctx, q, k, v, block_indices, block_counts, block_size, scale, cu_seq
         # for example, if the passed `cu_seqlens` is [0, 2, 6],
         # then there are 2 and 4 tokens in the 1st and 2nd sequences respectively, and `token_indices` will be
         # [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [1, 3]]
-        token_indices = prepare_token_indices(cu_seqlens) if cu_seqlens is not None else None
+        if cu_seqlens is not None:
+            if isinstance(cu_seqlens, tuple):
+                cu_seqlens_q, cu_seqlens_k = cu_seqlens
+            else:
+                cu_seqlens_q = cu_seqlens_k = cu_seqlens
+            token_indices_q = prepare_token_indices(cu_seqlens_q)
+        else:
+            cu_seqlens_q = cu_seqlens_k = token_indices_q = None
 
         o, lse = parallel_nsa_fwd(
             q=q,
@@ -736,14 +822,16 @@ def forward(ctx, q, k, v, block_indices, block_counts, block_size, scale, cu_seq
             block_counts=block_counts,
             block_size=block_size,
             scale=scale,
-            cu_seqlens=cu_seqlens,
-            token_indices=token_indices
+            cu_seqlens_q=cu_seqlens_q,
+            cu_seqlens_k=cu_seqlens_k,
+            token_indices_q=token_indices_q
         )
         ctx.save_for_backward(q, k, v, o, lse)
         ctx.block_indices = block_indices
         ctx.block_counts = block_counts
-        ctx.cu_seqlens = cu_seqlens
-        ctx.token_indices = token_indices
+        # Use cu_seqlens of q in backward, as cu_seqlens for q & k are different only for inference
+        ctx.cu_seqlens = cu_seqlens_q
+        ctx.token_indices = token_indices_q
         ctx.block_size = block_size
         ctx.scale = scale
         return o.to(q.dtype)
@@ -770,6 +858,7 @@ def backward(ctx, do):
         return dq.to(q), dk.to(k), dv.to(v), None, None, None, None, None, None, None, None
 
 
+@contiguous
 def parallel_nsa(
     q: torch.Tensor,
     k: torch.Tensor,
@@ -778,34 +867,34 @@ def parallel_nsa(
     g_slc: Optional[torch.Tensor] = None,
     g_swa: Optional[torch.Tensor] = None,
     block_indices: Optional[torch.LongTensor] = None,
-    block_counts: Union[torch.LongTensor, int] = 16,
+    block_counts: Optional[Union[torch.LongTensor, int]] = None,
     block_size: int = 64,
     window_size: int = 0,
     scale: Optional[float] = None,
-    cu_seqlens: Optional[torch.LongTensor] = None,
+    cu_seqlens: Union[None, torch.LongTensor, Tuple[torch.LongTensor, torch.LongTensor]] = None,
 ) -> torch.Tensor:
     r"""
     Args:
         q (torch.Tensor):
-            queries of shape `[B, T, HQ, K]`.
+            queries of shape `[B, TQ, HQ, K]`.
         k (torch.Tensor):
             keys of shape `[B, T, H, K]`.
             GQA is enforced here. The ratio of query heads (HQ) to key/value heads (H) must be a power of 2 and >=16.
         v (torch.Tensor):
             values of shape `[B, T, H, V]`.
         g_cmp (torch.Tensor):
-            Gate score for compressed attention of shape `[B, T, HQ]`.
+            Gate score for compressed attention of shape `[B, TQ, HQ]`.
         g_slc (torch.Tensor):
-            Gate score for selected attention of shape `[B, T, HQ]`.
+            Gate score for selected attention of shape `[B, TQ, HQ]`.
         g_swa (torch.Tensor):
-            Gate score for sliding attentionof shape `[B, T, HQ]`.
+            Gate score for sliding attentionof shape `[B, TQ, HQ]`.
         block_indices (torch.LongTensor):
-            Block indices of shape `[B, T, H, S]`.
+            Block indices of shape `[B, TQ, H, S]`.
             `S` is the number of selected blocks for each query token, which is set to 16 in the paper.
-            If `g_cmp` is provided, the passed `block_indices` will be ignored.
+            Will override the computed block indices from compression if provided.
         block_counts (Optional[Union[torch.LongTensor, int]]):
             Number of selected blocks for each query.
-            If a tensor is provided, with shape `[B, T, H]`,
+            If a tensor is provided, with shape `[B, TQ, H]`,
             each query can select the same number of blocks.
             If not provided, it will default to 16.
         block_size (int):
@@ -815,9 +904,10 @@ def parallel_nsa(
         scale (Optional[float]):
             Scale factor for attention scores.
             If not provided, it will default to `1 / sqrt(K)`. Default: `None`.
-        cu_seqlens (torch.LongTensor):
+        cu_seqlens (torch.LongTensor, Tuple[torch.LongTensor, torch.LongTensor] or None):
             Cumulative sequence lengths of shape `[N+1]` used for variable-length training,
             consistent with the FlashAttention API.
+            When a tuple is provided, it should contain two tensors: `(cu_seqlens_q, cu_seqlens_k)`.
 
     Returns:
         o (torch.Tensor):
@@ -830,28 +920,38 @@ def parallel_nsa(
         assert q.shape[0] == 1, "batch size must be 1 when cu_seqlens are provided"
     assert q.shape[2] % (k.shape[2] * 16) == 0, "Group size must be a multiple of 16 in NSA"
 
-    k_cmp, v_cmp = mean_pooling(k, block_size, cu_seqlens), mean_pooling(v, block_size, cu_seqlens)
+    if cu_seqlens is not None:
+        if isinstance(cu_seqlens, tuple):
+            cu_seqlens_q, cu_seqlens_k = cu_seqlens
+        else:
+            cu_seqlens_q = cu_seqlens_k = cu_seqlens
+    else:
+        cu_seqlens_q = cu_seqlens_k = None
+    k_cmp, v_cmp = mean_pooling(k, block_size, cu_seqlens_k), mean_pooling(v, block_size, cu_seqlens_k)
     o_cmp, lse_cmp = None, None
     if g_cmp is not None:
         o_cmp, lse_cmp = parallel_nsa_compression(
             q=q,
             k=k_cmp,
             v=v_cmp,
+            TK=k.shape[1],
             block_size=block_size,
             scale=scale,
             cu_seqlens=cu_seqlens
         )
-        if block_indices is not None:
-            warnings.warn("`block_indices` will be ignored when `g_cmp` is provided")
-        block_indices = parallel_nsa_topk(
-            q=q,
-            k=k_cmp,
-            lse=lse_cmp,
-            block_counts=block_counts,
-            block_size=block_size,
-            scale=scale,
-            cu_seqlens=cu_seqlens
-        )
+        if block_indices is None:
+            block_indices = parallel_nsa_topk(
+                q=q,
+                k=k_cmp,
+                lse=lse_cmp,
+                TK=k.shape[1],
+                block_counts=block_counts,
+                block_size=block_size,
+                scale=scale,
+                cu_seqlens=cu_seqlens
+            )
+        else:
+            warnings.warn("`block_indices` computed from compression is overridden")
     o = o_slc = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, scale, cu_seqlens)
     if g_slc is not None:
         o = o_slc * g_slc.unsqueeze(-1)
@@ -859,13 +959,12 @@ def parallel_nsa(
         o = torch.addcmul(o, o_cmp, g_cmp.unsqueeze(-1))
     if window_size > 0:
         if cu_seqlens is not None:
-            max_seqlen = q.shape[1]
             o_swa = flash_attn_varlen_func(
                 q.squeeze(0), k.squeeze(0), v.squeeze(0),
-                cu_seqlens_q=cu_seqlens,
-                cu_seqlens_k=cu_seqlens,
-                max_seqlen_q=max_seqlen,
-                max_seqlen_k=max_seqlen,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_k=cu_seqlens_k,
+                max_seqlen_q=q.shape[1],
+                max_seqlen_k=k.shape[1],
                 causal=True,
                 window_size=(window_size-1, 0)
             ).unsqueeze(0)
diff --git a/tests/ops/test_nsa.py b/tests/ops/test_nsa.py
index 851ddb10e..542e6d04f 100644
--- a/tests/ops/test_nsa.py
+++ b/tests/ops/test_nsa.py
@@ -1,19 +1,50 @@
 # -*- coding: utf-8 -*-
 
 import os
+import warnings
 from typing import List
 
 import pytest
-import torch
-import triton
 
-from fla.ops.nsa.naive import naive_nsa
-from fla.ops.nsa.parallel import parallel_nsa
-from fla.ops.utils import prepare_token_indices
-from fla.utils import assert_close, device
+os.environ['TRITON_F32_DEFAULT'] = 'ieee'
 
+import torch  # noqa: E402
+import triton  # noqa: E402
 
-# FIXME
+from fla.ops.nsa.compression import parallel_nsa_compression  # noqa: E402
+from fla.ops.nsa.naive import naive_nsa, naive_nsa_cmp, naive_nsa_sel, naive_nsa_topk  # noqa: E402
+from fla.ops.nsa.parallel import parallel_nsa, parallel_nsa_fwd, parallel_nsa_topk  # noqa: E402
+from fla.ops.utils import prepare_chunk_offsets, prepare_token_indices  # noqa: E402
+from fla.ops.utils.pooling import mean_pooling  # noqa: E402
+from fla.utils import assert_close, device  # noqa: E402
+
+
+def build_block_indices(B, T, H, S, block_size, seq_indices=None):
+    block_indices = torch.full((B, T, H, S), -1, dtype=torch.long, device=device)
+    for b in range(B):
+        for i in range(T):
+            if seq_indices is None:
+                t = i
+            else:
+                _, t = seq_indices[i]
+            for h in range(H):
+                i_i = torch.randperm(triton.cdiv(t + 1, block_size))[:S]
+                block_indices[b, i, h, :len(i_i)] = i_i
+    block_indices = block_indices.sort(-1)[0]
+    return block_indices
+
+
+def build_partial_varlen(x, cu_seqlens, q_lens):
+    partial_x = torch.cat([x[:, cu_seqlens[i + 1] - q_lens[i]: cu_seqlens[i + 1]] for i in range(len(q_lens))], dim=1)
+    return partial_x
+
+
+# Tests on individual ops are skipped as tests on the whole NSA function are added;
+# see `test_parallel_decode` and `test_parallel_decode_varlen`.
+@pytest.mark.skipif(
+    True,
+    reason='Skipping redundant individual tests'
+)
 @pytest.mark.parametrize(
     ('B', 'T', 'H', 'HQ', 'D', 'S', 'block_size', 'scale', 'dtype'),
     [
@@ -28,39 +59,32 @@
     ]
 )
 def test_parallel(
-    B: int,
-    T: int,
-    H: int,
-    HQ: int,
-    D: int,
-    S: int,
-    block_size: int,
-    scale: float,
-    dtype: torch.dtype,
+        B: int,
+        T: int,
+        H: int,
+        HQ: int,
+        D: int,
+        S: int,
+        block_size: int,
+        scale: float,
+        dtype: torch.dtype,
 ):
     torch.manual_seed(42)
-    os.environ['TRITON_F32_DEFAULT'] = 'ieee'
 
     q = torch.randn((B, T, HQ, D), dtype=dtype, device=device).requires_grad_(True)
     k = torch.randn((B, T, H, D), dtype=dtype, device=device).requires_grad_(True)
     v = torch.randn((B, T, H, D), dtype=dtype, device=device).requires_grad_(True)
     do = torch.randn((B, T, HQ, D), dtype=dtype, device=device)
 
-    block_indices = torch.full((B, T, H, S), T, dtype=torch.long, device=device)
-    for b in range(B):
-        for t in range(T):
-            for h in range(H):
-                i_i = torch.randperm(max(1, triton.cdiv(t, block_size)))[:S]
-                block_indices[b, t, h, :len(i_i)] = i_i
-    block_indices = block_indices.sort(-1)[0]
+    block_indices = build_block_indices(B, T, H, S, block_size)
 
-    ref = naive_nsa(q=q, k=k, v=v, block_indices=block_indices, block_size=block_size, scale=scale)
+    ref = naive_nsa_sel(q=q, k=k, v=v, block_indices=block_indices, block_size=block_size, scale=scale)
     ref.backward(do)
     ref_dq, q.grad = q.grad.clone(), None
     ref_dk, k.grad = k.grad.clone(), None
     ref_dv, v.grad = v.grad.clone(), None
 
-    tri = parallel_nsa(q=q, k=k, v=v, block_indices=block_indices, block_size=block_size, scale=scale)
+    tri = parallel_nsa(q=q, k=k, v=v, block_indices=block_indices, block_counts=S, block_size=block_size, scale=scale)
     tri.backward(do)
     tri_dq, q.grad = q.grad.clone(), None
     tri_dk, k.grad = k.grad.clone(), None
@@ -72,12 +96,17 @@ def test_parallel(
     assert_close("dv", ref_dv, tri_dv, 0.005)
 
 
+@pytest.mark.skipif(
+    True,
+    reason='Skipping redundant individual tests'
+)
 @pytest.mark.parametrize(
     ('H', 'HQ', 'D', 'S', 'block_size', 'cu_seqlens', 'dtype'),
     [
         pytest.param(*test, id="H{}-HQ{}-D{}-S{}-block_size{}-cu_seqlens{}-{}".format(*test))
         for test in [
             (1, 16, 64, 16, 32, [0, 15], torch.float16),
+            (1, 16, 64, 8, 16, [0, 15, 205, 550, 800], torch.float16),
             (2, 32, 64, 16, 32, [0, 256, 500, 1000], torch.float16),
             (2, 32, 100, 16, 32, [0, 15, 100, 300, 1200, 2000], torch.float16),
         ]
@@ -88,16 +117,15 @@ def test_parallel(
     reason='Skipping test because SKIP_TEST_CHUNK_VARLEN is set'
 )
 def test_parallel_varlen(
-    H: int,
-    HQ: int,
-    D: int,
-    S: int,
-    block_size: int,
-    cu_seqlens: List[int],
-    dtype: torch.dtype,
+        H: int,
+        HQ: int,
+        D: int,
+        S: int,
+        block_size: int,
+        cu_seqlens: List[int],
+        dtype: torch.dtype,
 ):
     torch.manual_seed(42)
-    os.environ['TRITON_F32_DEFAULT'] = 'ieee'
 
     T = cu_seqlens[-1]
     cu_seqlens = torch.tensor(cu_seqlens, dtype=torch.int32, device=device)
@@ -108,17 +136,10 @@ def test_parallel_varlen(
     v = torch.randn((1, T, H, D), dtype=dtype, device=device).requires_grad_()
     do = torch.randn((1, T, HQ, D), dtype=dtype, device=device)
 
-    block_indices = torch.full((1, T, H, S), T, dtype=torch.long, device=device)
-    seq_indices = prepare_token_indices(cu_seqlens).tolist()
-
-    for i in range(T):
-        _, t = seq_indices[i]
-        for h in range(H):
-            i_i = torch.randperm(max(1, triton.cdiv(t, block_size)))[:S]
-            block_indices[0, i, h, :len(i_i)] = i_i
-    block_indices = block_indices.sort(-1)[0]
+    seq_indices = prepare_token_indices(cu_seqlens)
+    block_indices = build_block_indices(1, T, H, S, block_size, seq_indices.tolist())
 
-    ref = naive_nsa(
+    ref = naive_nsa_sel(
         q=q,
         k=k,
         v=v,
@@ -136,6 +157,7 @@ def test_parallel_varlen(
         k=k,
         v=v,
         block_indices=block_indices,
+        block_counts=S,
         block_size=block_size,
         cu_seqlens=cu_seqlens
     )
@@ -148,3 +170,775 @@ def test_parallel_varlen(
     assert_close('dq', ref_dq, tri_dq, 0.005)
     assert_close('dk', ref_dk, tri_dk, 0.005)
     assert_close('dv', ref_dv, tri_dv, 0.005)
+
+
+@pytest.mark.skipif(
+    True,
+    reason='Skipping redundant individual tests'
+)
+@pytest.mark.parametrize(
+    ('B', 'T', 'Tq', 'H', 'HQ', 'D', 'S', 'block_size', 'scale', 'dtype'),
+    [
+        pytest.param(*test, id="B{}-T{}-Tq{}-H{}-HQ{}-D{}-S{}-block_size{}-scale{}-{}".format(*test))
+        for test in [
+            (1, 63, 1, 1, 16, 64, 16, 32, 1.0, torch.float16),
+            (3, 111, 15, 1, 32, 100, 16, 32, 1.0, torch.float16),
+            (3, 1024, 3, 2, 32, 60, 16, 32, 0.1, torch.float16),
+            (3, 1024, 33, 2, 32, 128, 16, 32, 0.1, torch.float16),
+            (4, 2048, 25, 2, 32, 64, 16, 32, 0.1, torch.float16)
+        ]
+    ]
+)
+def test_parallel_selective_decode(
+        B: int,
+        T: int,
+        Tq: int,
+        H: int,
+        HQ: int,
+        D: int,
+        S: int,
+        block_size: int,
+        scale: float,
+        dtype: torch.dtype,
+):
+    torch.manual_seed(42)
+
+    q = torch.randn((B, T, HQ, D), dtype=dtype, device=device)
+    k = torch.randn((B, T, H, D), dtype=dtype, device=device)
+    v = torch.randn((B, T, H, D), dtype=dtype, device=device)
+
+    block_indices = build_block_indices(B, T, H, S, block_size)
+
+    o_full, lse_full = parallel_nsa_fwd(
+        q, k, v,
+        block_indices,
+        S,
+        block_size,
+        scale,
+    )
+
+    o_short, lse_short = parallel_nsa_fwd(
+        q[:, -Tq:], k, v, block_indices[:, -Tq:],
+        S,
+        block_size,
+        scale,
+    )
+
+    o_naive_fla = naive_nsa_sel(
+        q, k, v, block_indices, block_size, scale
+    )
+
+    assert_close(
+        'outputs: full-vs-naive',
+        o_naive_fla, o_full, 0.005
+    )
+    assert_close(
+        'outputs: full-vs-cached',
+        o_short, o_full[:, -Tq:], 0.005
+    )
+    assert_close(
+        'log-sum-exp: full-vs-cached',
+        lse_short, lse_full[:, -Tq:], 0.005
+    )
+
+
+@pytest.mark.skipif(
+    True,
+    reason='Skipping redundant individual tests'
+)
+@pytest.mark.parametrize(
+    ('B', 'T', 'Tq', 'H', 'HQ', 'D', 'block_size', 'scale', 'dtype'),
+    [
+        pytest.param(*test, id="B{}-T{}-Tq{}-H{}-HQ{}-D{}-block_size{}-scale{}-{}".format(*test))
+        for test in [
+            # Can't pass this as rel grad error bloats with short inputs. Numerical issue?
+            # (1, 63, 1, 1, 16, 64, 32, 1.0, torch.float16),
+            (3, 111, 15, 1, 32, 100, 32, 1.0, torch.float16),
+            (3, 1024, 3, 2, 32, 60, 32, 0.1, torch.float16),
+            (3, 1024, 33, 2, 32, 128, 32, 0.1, torch.float16),
+            (4, 2048, 25, 2, 32, 64, 32, 0.1, torch.float16)
+        ]
+    ]
+)
+def test_parallel_compressive(
+        B: int,
+        T: int,
+        Tq: int,
+        H: int,
+        HQ: int,
+        D: int,
+        block_size: int,
+        scale: float,
+        dtype: torch.dtype,
+):
+    torch.manual_seed(42)
+
+    q = torch.randn((B, T, HQ, D), dtype=dtype, device=device).requires_grad_(True)
+    k = torch.randn((B, T, H, D), dtype=dtype, device=device).requires_grad_(True)
+    v = torch.randn((B, T, H, D), dtype=dtype, device=device).requires_grad_(True)
+    do = torch.randn((B, T, HQ, D), dtype=dtype, device=device)
+
+    k_cmp, v_cmp = mean_pooling(k, block_size), mean_pooling(v, block_size)
+    o_full, lse_full = parallel_nsa_compression(
+        q=q,
+        k=k_cmp,
+        v=v_cmp,
+        TK=T,
+        block_size=block_size,
+        scale=scale,
+    )
+    o_full.backward(do)
+    tri_dq, q.grad = q.grad.clone(), None
+    tri_dk, k.grad = k.grad.clone(), None
+    tri_dv, v.grad = v.grad.clone(), None
+
+    o_naive, lse_naive = naive_nsa_cmp(
+        q=q,
+        k_cmp=k_cmp,
+        v_cmp=v_cmp,
+        block_size=block_size,
+        scale=scale,
+    )
+    o_naive.backward(do)
+    ref_dq, q.grad = q.grad.clone(), None
+    ref_dk, k.grad = k.grad.clone(), None
+    ref_dv, v.grad = v.grad.clone(), None
+
+    assert_close(
+        'outputs: full-vs-naive',
+        o_full, o_naive, 0.005
+    )
+    # For positions not attending to any token, the log-sum-exp should be -inf; the kernel returns 0 instead, it is
+    # OK as those positions will not be used in the compressive attention anyway.
+    assert_close(
+        'log-sum-exp: full-vs-naive',
+        lse_full, torch.where(lse_naive == float('-inf'), 0, lse_naive), 0.005
+    )
+    assert_close('dq', ref_dq, tri_dq, 0.005)
+    assert_close('dk', ref_dk, tri_dk, 0.005)
+    assert_close('dv', ref_dv, tri_dv, 0.005)
+
+    o_short, lse_short = parallel_nsa_compression(
+        q[:, -Tq:], k_cmp, v_cmp, T, block_size, scale,
+    )
+
+    assert_close(
+        'outputs: full-vs-cached',
+        o_short, o_full[:, -Tq:], 0.005
+    )
+
+    assert_close(
+        'log-sum-exp: full-vs-cached',
+        lse_short, lse_full[:, -Tq:], 0.005
+    )
+
+
+@pytest.mark.skipif(
+    True,
+    reason='Skipping redundant individual tests'
+)
+@pytest.mark.parametrize(
+    ('B', 'T', 'Tq', 'H', 'HQ', 'D', 'S', 'block_size', 'scale', 'dtype', 'reuse_lse'),
+    [
+        pytest.param(*test, id="B{}-T{}-Tq{}-H{}-HQ{}-D{}-S{}-block_size{}-scale{}-{}-reuse_lse{}".format(*test))
+        for test in [
+            (1, 1, 1, 1, 16, 64, 16, 32, 1.0, torch.float16, True),
+            (3, 111, 15, 1, 32, 100, 16, 32, 1.0, torch.float16, False),
+            (3, 1024, 3, 2, 32, 60, 16, 32, 0.1, torch.float32, True),
+            (3, 1024, 33, 2, 32, 128, 16, 32, 0.1, torch.float32, False),
+            (4, 2048, 25, 2, 32, 64, 16, 32, 0.1, torch.float32, True)  # Use FP32 to reduce numerical issues
+        ]
+    ]
+)
+def test_parallel_topk_decode(
+        B: int,
+        T: int,
+        Tq: int,
+        H: int,
+        HQ: int,
+        D: int,
+        S: int,
+        block_size: int,
+        scale: float,
+        dtype: torch.dtype,
+        reuse_lse: bool,
+):
+    torch.manual_seed(42)
+    # Use a wider range to reduce numerical issues, otherwise there will be too many mismatches due to close scores.
+    q = torch.rand((B, T, HQ, D), dtype=dtype, device=device) * 10 - 5
+    k = torch.rand((B, T, H, D), dtype=dtype, device=device) * 10 - 5
+    v = torch.rand((B, T, H, D), dtype=dtype, device=device) * 10 - 5
+
+    k_cmp, v_cmp = mean_pooling(k, block_size), mean_pooling(v, block_size)
+
+    if reuse_lse:
+        # For positions not attending to any token, the log-sum-exp should be -inf; the kernel returns 0 instead, it is
+        # OK as those positions will not be used in the compressive attention anyway.
+        _, lse_full = naive_nsa_cmp(
+            q=q,
+            k_cmp=k_cmp,
+            v_cmp=v_cmp,
+            block_size=block_size,
+            scale=scale,
+        )
+        lse_full = torch.where(lse_full == float('-inf'), 0, lse_full)
+    else:
+        lse_full = None
+
+    block_indices = parallel_nsa_topk(
+        q=q,
+        k=k_cmp,
+        TK=T,
+        lse=lse_full,
+        block_counts=S,
+        block_size=block_size,
+        scale=scale,
+    )
+
+    block_indices_naive = naive_nsa_topk(
+        q, k_cmp, block_counts=S, block_size=block_size, scale=scale,
+    )
+
+    # Separate checks for forcefully selected blocks (0, -1, -2)
+    fixed_block_indices, free_block_indices = block_indices[:, :, :, :3], block_indices[:, :, :, 3:]
+    fixed_block_indices_naive, free_block_indices_naive = (
+        block_indices_naive[:, :, :, :3], block_indices_naive[:, :, :, 3:])
+
+    fixed_block_indices, _ = torch.sort(fixed_block_indices, dim=-1)
+    fixed_block_indices_naive, _ = torch.sort(fixed_block_indices_naive, dim=-1)
+
+    assert (fixed_block_indices == fixed_block_indices_naive).all(), \
+        "Different in forcefully selected block indices compared to naive"
+
+    if not (free_block_indices == free_block_indices_naive).all():
+        indices = torch.nonzero(free_block_indices != free_block_indices_naive, as_tuple=False)
+        for idx in range(indices.shape[0]):
+            b_i, t_i, h_i, s_i = indices[idx]
+            q_vals = q[b_i.item(), t_i.item(), h_i * (HQ // H): (h_i + 1) * (HQ // H), :]
+            k_vals = k_cmp[b_i.item(), :, h_i.item()]
+            a_s = torch.einsum('h k, s k -> s h', q_vals, k_vals) * scale
+            a_s[t_i // block_size + ((t_i + 1) % block_size == 0).int():] = float('-inf')
+            a_sn = torch.softmax(a_s, dim=0)
+            a_snm = a_sn.mean(-1)
+            m = a_s.max(dim=0, keepdim=True).values
+            a_lse = torch.log(torch.exp(a_s - m).sum(0)) + m.squeeze(0)
+            if lse_full is not None:
+                k_lse = lse_full[b_i.item(), t_i.item(), h_i * (HQ // H): (h_i + 1) * (HQ // H)]
+                assert_close('lse vs naive ' + str(indices[idx]), a_lse, k_lse, ratio=0.005)
+
+            assert_close('block-score vs naive ' + str(indices[idx]), a_snm[free_block_indices[b_i, t_i, h_i, s_i]],
+                         a_snm[free_block_indices_naive[b_i, t_i, h_i, s_i]], ratio=0.005)
+        warnings.warn(f"Block indices mismatch: {len(indices)}/{block_indices.numel()} "
+                      f"({len(indices) / free_block_indices_naive.numel():.2f}), seemingly due to numerical issues.")
+
+    block_indices_short = parallel_nsa_topk(
+        q=q[:, -Tq:],
+        k=k_cmp,
+        lse=lse_full[:, -Tq:] if lse_full is not None else None,
+        TK=T,
+        block_counts=S,
+        block_size=block_size,
+        scale=scale,
+    )
+
+    fixed_block_indices_short, free_block_indices_short = (
+        block_indices_short[:, :, :, :3], block_indices_short[:, :, :, 3:])
+    fixed_block_indices_short, _ = torch.sort(fixed_block_indices_short, dim=-1)
+    assert (fixed_block_indices_short == fixed_block_indices[:, -Tq:]).all(), \
+        "Different in forcefully selected block indices compared to full"
+    assert (free_block_indices_short == free_block_indices[:, -Tq:]).all(), \
+        "Different in free block indices compared to full"
+
+
+# Numerical issues are intensified by discrete block selection; hence we need to use FP32 and/or to reuse block indices
+@pytest.mark.parametrize(
+    ('B', 'T', 'Tq', 'H', 'HQ', 'D', 'S', 'block_size', 'scale', 'window_size', 'dtype', 'reuse_index'),
+    [
+        pytest.param(*test, id="B{}-T{}-Tq{}-H{}-HQ{}-D{}-S{}-block_size{}-scale{}-W{}-{}-reuse_index{}".format(*test))
+        for test in [
+            (1, 1, 1, 1, 16, 64, 16, 32, 1.0, 0, torch.float16, False),
+            (3, 111, 15, 1, 32, 100, 16, 32, 1.0, 128, torch.float16, False),
+            (3, 1024, 280, 1, 32, 100, 16, 32, 1.0, 0, torch.float32, False),
+            (4, 1024, 256, 1, 32, 100, 16, 32, 1.0, 16, torch.float16, True),
+            (3, 1024, 3, 2, 32, 60, 16, 32, 0.1, 128, torch.float16, True),
+            (3, 1024, 33, 2, 32, 128, 16, 32, 0.1, 0, torch.float32, False),
+            (4, 2048, 25, 2, 32, 64, 16, 32, 0.1, 512, torch.float16, True)
+        ]
+    ]
+)
+def test_parallel_decode(
+        B: int,
+        T: int,
+        Tq: int,
+        H: int,
+        HQ: int,
+        D: int,
+        S: int,
+        block_size: int,
+        scale: float,
+        window_size: int,
+        dtype: torch.dtype,
+        reuse_index: bool
+):
+    torch.manual_seed(42)
+
+    q = (torch.rand((B, T, HQ, D), dtype=dtype, device=device) * 3 - 2).requires_grad_(True)
+    k = (torch.rand((B, T, H, D), dtype=dtype, device=device) * 3 - 2).requires_grad_(True)
+    v = (torch.rand((B, T, H, D), dtype=dtype, device=device) * 3 - 2).requires_grad_(True)
+    do = torch.randn((B, T, HQ, D), dtype=dtype, device=device)
+
+    g = torch.randn((B, T, HQ, 3), dtype=dtype, device=device)
+    g_cmp, g_slc, g_swa = g.sigmoid().unbind(-1)
+
+    if reuse_index:
+        o_naive, block_indices = naive_nsa(
+            q, k, v, g_cmp, g_slc, g_swa,
+            block_counts=S, block_size=block_size, scale=scale, window_size=window_size, return_block_indices=True)
+    else:
+        o_naive = naive_nsa(
+            q, k, v, g_cmp, g_slc, g_swa,
+            block_counts=S, block_size=block_size, scale=scale, window_size=window_size)
+        block_indices = None
+
+    o_naive.backward(do)
+    ref_dq, q.grad = q.grad.clone(), None
+    ref_dk, k.grad = k.grad.clone(), None
+    ref_dv, v.grad = v.grad.clone(), None
+
+    o_full = parallel_nsa(q, k, v, g_cmp, g_slc, g_swa, block_indices=block_indices,
+                          block_counts=S, block_size=block_size, scale=scale, window_size=window_size)
+    o_full.backward(do)
+    tri_dq, q.grad = q.grad.clone(), None
+    tri_dk, k.grad = k.grad.clone(), None
+    tri_dv, v.grad = v.grad.clone(), None
+
+    assert_close('full vs naive', o_full, o_naive, 0.005)
+    assert_close('dq', ref_dq, tri_dq, 0.005)
+    assert_close('dk', ref_dk, tri_dk, 0.005)
+    assert_close('dv', ref_dv, tri_dv, 0.005)
+
+    o_short = parallel_nsa(
+        q[:, -Tq:], k, v, g_cmp[:, -Tq:], g_slc[:, -Tq:], g_swa[:, -Tq:],
+        block_indices=block_indices[:, -Tq:] if reuse_index else None,
+        block_counts=S,
+        block_size=block_size,
+        scale=scale,
+        window_size=window_size
+    )
+
+    assert_close('short vs full', o_short, o_full[:, -Tq:], 0.005)
+
+
+@pytest.mark.skipif(
+    True,
+    reason='Skipping redundant individual tests'
+)
+@pytest.mark.parametrize(
+    ('H', 'HQ', 'D', 'S', 'block_size', 'cu_seqlens', 'q_lens', 'dtype'),
+    [
+        pytest.param(*test, id="H{}-HQ{}-D{}-S{}-block_size{}-cu_seqlens{}-q_lens{}-{}".format(*test))
+        for test in [
+            (1, 16, 64, 16, 32, [0, 15], [1, ], torch.float16),
+            (1, 16, 64, 8, 16, [0, 15, 205, 550, 800], [3, 15, 30, 8], torch.float16),
+            (2, 32, 64, 16, 32, [0, 256, 500, 1000], [1, 15, 4], torch.float16),
+            (2, 32, 100, 16, 32, [0, 15, 100, 300, 1200, 2000], [5, 3, 1, 1, 128], torch.float16),
+        ]
+    ]
+)
+@pytest.mark.skipif(
+    os.getenv('SKIP_TEST_CHUNK_VARLEN') == '1',
+    reason='Skipping test because SKIP_TEST_CHUNK_VARLEN is set'
+)
+def test_parallel_selective_varlen_decode(
+        H: int,
+        HQ: int,
+        D: int,
+        S: int,
+        block_size: int,
+        cu_seqlens,
+        q_lens,
+        dtype: torch.dtype,
+):
+    torch.manual_seed(42)
+
+    T = cu_seqlens[-1]
+    cu_seqlens = torch.tensor(cu_seqlens, dtype=torch.int32, device=device)
+
+    # seq-first required for inputs with variable lengths
+    q = torch.randn((1, T, HQ, D), dtype=dtype, device=device)
+    k = torch.randn((1, T, H, D), dtype=dtype, device=device)
+    v = torch.randn((1, T, H, D), dtype=dtype, device=device)
+    scale = 1.0 / (D ** 0.5)
+
+    seq_indices = prepare_token_indices(cu_seqlens)
+    block_indices = build_block_indices(1, T, H, S, block_size, seq_indices.tolist())
+
+    o_full, lse_full = parallel_nsa_fwd(
+        q, k, v,
+        block_indices,
+        S,
+        block_size,
+        cu_seqlens_q=cu_seqlens,
+        cu_seqlens_k=cu_seqlens,
+        scale=scale,
+        token_indices_q=seq_indices,
+    )
+
+    ref = naive_nsa_sel(
+        q=q,
+        k=k,
+        v=v,
+        block_indices=block_indices,
+        block_size=block_size,
+        cu_seqlens=cu_seqlens
+    )
+
+    q_short = build_partial_varlen(q, cu_seqlens, q_lens)
+    block_indices_short = build_partial_varlen(block_indices, cu_seqlens, q_lens)
+    cu_seqlens_q = torch.cumsum(torch.tensor([0] + q_lens), dim=0).to(device)
+    token_indices_q = prepare_token_indices(cu_seqlens_q)
+
+    o_short_ref = build_partial_varlen(o_full, cu_seqlens, q_lens)
+    lse_short_ref = build_partial_varlen(lse_full, cu_seqlens, q_lens)
+
+    o_short, lse_short = parallel_nsa_fwd(
+        q_short, k, v,
+        block_indices_short,
+        S,
+        block_size,
+        cu_seqlens_q=cu_seqlens_q,
+        cu_seqlens_k=cu_seqlens,
+        scale=1.0 / (D ** 0.5),
+        token_indices_q=token_indices_q
+    )
+
+    assert_close('outputs: full vs naive', ref, o_full, 0.005)
+    assert_close('outputs: full vs short', o_short, o_short_ref, 0.005)
+    assert_close('lse: full vs short', lse_short, lse_short_ref, 0.005)
+
+
+@pytest.mark.skipif(
+    True,
+    reason='Skipping redundant individual tests'
+)
+@pytest.mark.parametrize(
+    ('H', 'HQ', 'D', 'block_size', 'cu_seqlens', 'q_lens', 'dtype'),
+    [
+        pytest.param(*test, id="H{}-HQ{}-D{}-block_size{}-cu_seqlens{}-q_lens{}-{}".format(*test))
+        for test in [
+            (1, 16, 64, 32, [0, 15], [1, ], torch.float16),
+            (1, 16, 64, 16, [0, 15, 205, 550, 800], [3, 15, 30, 8], torch.float16),
+            (2, 32, 64, 32, [0, 256, 500, 1000], [1, 15, 4], torch.float16),
+            (2, 32, 100, 32, [0, 15, 100, 300, 1200, 2000], [5, 3, 1, 1, 128], torch.float16),
+        ]
+    ]
+)
+@pytest.mark.skipif(
+    os.getenv('SKIP_TEST_CHUNK_VARLEN') == '1',
+    reason='Skipping test because SKIP_TEST_CHUNK_VARLEN is set'
+)
+def test_parallel_compressive_varlen(
+        H: int,
+        HQ: int,
+        D: int,
+        block_size: int,
+        cu_seqlens,
+        q_lens,
+        dtype: torch.dtype,
+):
+    torch.manual_seed(42)
+
+    T = cu_seqlens[-1]
+    cu_seqlens = torch.tensor(cu_seqlens, dtype=torch.int32, device=device)
+
+    # seq-first required for inputs with variable lengths
+    q = torch.randn((1, T, HQ, D), dtype=dtype, device=device).requires_grad_(True)
+    k = torch.randn((1, T, H, D), dtype=dtype, device=device).requires_grad_(True)
+    v = torch.randn((1, T, H, D), dtype=dtype, device=device).requires_grad_(True)
+    do = torch.randn((1, T, HQ, D), dtype=dtype, device=device)
+
+    scale = 1.0 / (D ** 0.5)
+    k_cmp, v_cmp = mean_pooling(k, block_size, cu_seqlens), mean_pooling(v, block_size, cu_seqlens)
+
+    o_full, lse_full = parallel_nsa_compression(
+        q=q,
+        k=k_cmp,
+        v=v_cmp,
+        TK=T,
+        block_size=block_size,
+        scale=scale,
+        cu_seqlens=cu_seqlens,
+    )
+    o_full.backward(do)
+    tri_dq, q.grad = q.grad.clone(), None
+    tri_dk, k.grad = k.grad.clone(), None
+    tri_dv, v.grad = v.grad.clone(), None
+
+    o_naive, lse_naive = naive_nsa_cmp(
+        q=q,
+        k_cmp=k_cmp,
+        v_cmp=v_cmp,
+        block_size=block_size,
+        scale=scale,
+        cu_seqlens=cu_seqlens,
+    )
+    o_naive.backward(do)
+    ref_dq, q.grad = q.grad.clone(), None
+    ref_dk, k.grad = k.grad.clone(), None
+    ref_dv, v.grad = v.grad.clone(), None
+
+    assert_close('outputs: full vs naive', o_naive, o_full, 0.005)
+    assert_close('lse: full vs naive', torch.where(lse_naive == float('-inf'), 0, lse_naive), lse_full, 0.005)
+    assert_close('dq', ref_dq, tri_dq, 0.005)
+    assert_close('dk', ref_dk, tri_dk, 0.005)
+    assert_close('dv', ref_dv, tri_dv, 0.005)
+
+    q_short = build_partial_varlen(q, cu_seqlens, q_lens)
+    cu_seqlens_q = torch.cumsum(torch.tensor([0] + q_lens), dim=0).to(device)
+
+    o_short_ref = build_partial_varlen(o_full, cu_seqlens, q_lens)
+    lse_short_ref = build_partial_varlen(lse_full, cu_seqlens, q_lens)
+
+    o_short, lse_short = parallel_nsa_compression(
+        q_short,
+        k_cmp, v_cmp,
+        T,
+        block_size,
+        scale,
+        cu_seqlens=(cu_seqlens_q, cu_seqlens),
+    )
+
+    assert_close('outputs: full vs short', o_short, o_short_ref, 0.005)
+    assert_close('lse: full vs short', lse_short, lse_short_ref, 0.005)
+
+
+@pytest.mark.skipif(
+    True,
+    reason='Skipping redundant individual tests'
+)
+@pytest.mark.parametrize(
+    ('H', 'HQ', 'D', 'S', 'block_size', 'scale', 'cu_seqlens', 'q_lens', 'dtype', 'reuse_lse'),
+    [
+        pytest.param(*test,
+                     id="H{}-HQ{}-D{}-S{}-block_size{}-scale{}-cu_seqlens{}-q_lens{}-{}-reuse_lse{}".format(*test))
+        for test in [
+            (1, 16, 64, 16, 32, 1.0, [0, 15], [1, ], torch.float16, True),
+            (1, 16, 64, 8, 16, 0.1, [0, 15, 205, 550, 800], [3, 15, 30, 8], torch.float16, False),
+            (2, 32, 64, 16, 32, 1.0, [0, 256, 500, 1000], [1, 15, 4], torch.float32, True),
+            (2, 32, 100, 16, 32, 0.1, [0, 15, 100, 300, 1200, 2000], [5, 3, 1, 1, 128], torch.float32, False),
+        ]
+    ]
+)
+def test_parallel_topk_varlen(
+        H: int,
+        HQ: int,
+        D: int,
+        S: int,
+        block_size: int,
+        scale: float,
+        cu_seqlens,
+        q_lens,
+        dtype: torch.dtype,
+        reuse_lse: bool,
+):
+    torch.manual_seed(42)
+
+    T = cu_seqlens[-1]
+    cu_seqlens = torch.tensor(cu_seqlens, dtype=torch.int32, device=device)
+
+    # Use a wider range to reduce numerical issues, otherwise there will be too many mismatches due to close scores.
+    q = torch.rand((1, T, HQ, D), dtype=dtype, device=device) * 10 - 5
+    k = torch.rand((1, T, H, D), dtype=dtype, device=device) * 10 - 5
+    v = torch.rand((1, T, H, D), dtype=dtype, device=device) * 10 - 5
+
+    k_cmp, v_cmp = mean_pooling(k, block_size, cu_seqlens), mean_pooling(v, block_size, cu_seqlens)
+    seq_indices = prepare_token_indices(cu_seqlens)
+
+    kv_cu_seqlens = prepare_chunk_offsets(cu_seqlens, block_size)
+
+    if reuse_lse:
+        # For positions not attending to any token, the log-sum-exp should be -inf; the kernel returns 0 instead, it is
+        # OK as those positions will not be used in the compressive attention anyway.
+        _, lse_full = naive_nsa_cmp(
+            q=q,
+            k_cmp=k_cmp,
+            v_cmp=v_cmp,
+            block_size=block_size,
+            scale=scale,
+            cu_seqlens=cu_seqlens
+        )
+        lse_full = torch.where(lse_full == float('-inf'), 0, lse_full)
+    else:
+        lse_full = None
+
+    block_indices = parallel_nsa_topk(
+        q=q,
+        k=k_cmp,
+        TK=T,
+        lse=lse_full,
+        block_counts=S,
+        block_size=block_size,
+        scale=scale,
+        cu_seqlens=cu_seqlens,
+    )
+
+    block_indices_naive = naive_nsa_topk(
+        q, k_cmp, block_counts=S, block_size=block_size, scale=scale, cu_seqlens=cu_seqlens,
+    )
+
+    # Separate checks for forcefully selected blocks (0, -1, -2)
+    fixed_block_indices, free_block_indices = block_indices[:, :, :, :3], block_indices[:, :, :, 3:]
+    fixed_block_indices_naive, free_block_indices_naive = (
+        block_indices_naive[:, :, :, :3], block_indices_naive[:, :, :, 3:])
+
+    fixed_block_indices, _ = torch.sort(fixed_block_indices, dim=-1)
+    fixed_block_indices_naive, _ = torch.sort(fixed_block_indices_naive, dim=-1)
+
+    assert (fixed_block_indices == fixed_block_indices_naive).all(), \
+        "Different in forcefully selected block indices compared to naive"
+
+    if not (free_block_indices == free_block_indices_naive).all():
+        indices = torch.nonzero(free_block_indices != free_block_indices_naive, as_tuple=False)
+        for idx in range(indices.shape[0]):
+            _, t_i, h_i, s_i = indices[idx]
+            q_vals = q[0, t_i.item(), h_i * (HQ // H): (h_i + 1) * (HQ // H), :]
+
+            i_n = seq_indices[t_i.item(), 0]
+            t = seq_indices[t_i.item(), 1]  # in-sequence index
+            bos_k = kv_cu_seqlens[i_n]
+            eos_k = kv_cu_seqlens[i_n + 1]
+
+            k_vals = k_cmp[0, bos_k: eos_k, h_i.item()]
+            a_s = torch.einsum('h k, s k -> s h', q_vals, k_vals) * scale
+            a_s[t // block_size + ((t + 1) % block_size == 0).int():] = float('-inf')
+            a_sn = torch.softmax(a_s, dim=0)
+            a_snm = a_sn.mean(-1)
+            m = a_s.max(dim=0, keepdim=True).values
+            a_lse = torch.log(torch.exp(a_s - m).sum(0)) + m.squeeze(0)
+            if lse_full is not None:
+                k_lse = lse_full[0, t_i.item(), h_i * (HQ // H): (h_i + 1) * (HQ // H)]
+                assert_close('block lse vs naive ' + str(indices[idx]), a_lse, k_lse, ratio=0.005)
+            assert_close('block-score vs naive ' + str(indices[idx]),
+                         a_snm[free_block_indices[0, t_i, h_i, s_i]],
+                         a_snm[free_block_indices_naive[0, t_i, h_i, s_i]], ratio=0.005)
+        warnings.warn(f"Block indices mismatch: {len(indices)}/{block_indices.numel()} "
+                      f"({len(indices) / free_block_indices_naive.numel():.2f}), seemingly due to numerical issues.")
+
+    q_short = build_partial_varlen(q, cu_seqlens, q_lens)
+    cu_seqlens_q = torch.cumsum(torch.tensor([0] + q_lens), dim=0).to(device)
+
+    fixed_block_indices_short_ref = build_partial_varlen(fixed_block_indices, cu_seqlens, q_lens)
+    free_block_indices_short_ref = build_partial_varlen(free_block_indices, cu_seqlens, q_lens)
+    lse_short_ref = build_partial_varlen(lse_full, cu_seqlens, q_lens) if lse_full is not None else None
+
+    block_indices_short = parallel_nsa_topk(
+        q=q_short,
+        k=k_cmp,
+        lse=lse_short_ref,
+        TK=T,
+        block_counts=S,
+        block_size=block_size,
+        scale=scale,
+        cu_seqlens=(cu_seqlens_q, cu_seqlens),
+    )
+
+    fixed_block_indices_short, free_block_indices_short = (
+        block_indices_short[:, :, :, :3], block_indices_short[:, :, :, 3:])
+    fixed_block_indices_short, _ = torch.sort(fixed_block_indices_short, dim=-1)
+    assert (fixed_block_indices_short == fixed_block_indices_short_ref).all(), \
+        "Different in forcefully selected block indices compared to full"
+    assert (free_block_indices_short == free_block_indices_short_ref).all(), \
+        "Different in free block indices compared to full"
+
+
+@pytest.mark.parametrize(
+    ('H', 'HQ', 'D', 'S', 'block_size', 'scale', 'window_size', 'cu_seqlens', 'q_lens', 'dtype', 'reuse_index'),
+    [
+        pytest.param(
+            *test,
+            id=(
+                    "H{}-HQ{}-D{}-S{}-block_size{}-scale{}-W{}-cu_seqlens{}-q_lens{}-{}-reuse_index{}".format(*test)
+            ),
+        )
+        for test in [
+            (1, 16, 64, 16, 32, 0.1, 128, [0, 15], [1, ], torch.float16, False),
+            (1, 16, 64, 8, 16, 1.0, 32, [0, 15, 205, 550, 800], [3, 15, 30, 8], torch.float16, False),
+            (2, 32, 64, 16, 32, 0.1, 64, [0, 256, 500, 1000], [1, 15, 4], torch.float16, False),
+            (2, 32, 100, 16, 32, 1.0, 0, [0, 15, 100, 300, 1200, 2000], [5, 3, 1, 1, 128], torch.float32, False),
+            (2, 32, 100, 16, 32, 1.0, 64, [0, 15, 100, 300, 1200, 2000], [5, 3, 1, 1, 128], torch.float16, True),
+        ]
+    ]
+)
+@pytest.mark.skipif(
+    os.getenv('SKIP_TEST_CHUNK_VARLEN') == '1',
+    reason='Skipping test because SKIP_TEST_CHUNK_VARLEN is set'
+)
+def test_parallel_varlen_decode(
+        H: int,
+        HQ: int,
+        D: int,
+        S: int,
+        block_size: int,
+        scale: float,
+        window_size: int,
+        cu_seqlens,
+        q_lens,
+        dtype: torch.dtype,
+        reuse_index: bool,
+):
+    torch.manual_seed(42)
+
+    T = cu_seqlens[-1]
+    cu_seqlens = torch.tensor(cu_seqlens, dtype=torch.int32, device=device)
+
+    q = (torch.rand((1, T, HQ, D), dtype=dtype, device=device) * 3 - 2).requires_grad_(True)
+    k = (torch.rand((1, T, H, D), dtype=dtype, device=device) * 3 - 2).requires_grad_(True)
+    v = (torch.rand((1, T, H, D), dtype=dtype, device=device) * 3 - 2).requires_grad_(True)
+    do = torch.randn((1, T, HQ, D), dtype=dtype, device=device)
+
+    g = torch.randn((1, T, HQ, 3), dtype=dtype, device=device)
+    g_cmp, g_slc, g_swa = g.sigmoid().unbind(-1)
+
+    if reuse_index:
+        o_naive, block_indices = naive_nsa(
+            q, k, v, g_cmp, g_slc, g_swa, block_counts=S, block_size=block_size,
+            scale=scale, window_size=window_size, cu_seqlens=cu_seqlens, return_block_indices=True)
+    else:
+        o_naive = naive_nsa(
+            q, k, v, g_cmp, g_slc, g_swa, block_counts=S, block_size=block_size,
+            scale=scale, window_size=window_size, cu_seqlens=cu_seqlens)
+        block_indices = None
+
+    o_naive.backward(do)
+    ref_dq, q.grad = q.grad.clone(), None
+    ref_dk, k.grad = k.grad.clone(), None
+    ref_dv, v.grad = v.grad.clone(), None
+
+    o_full = parallel_nsa(
+        q, k, v, g_cmp, g_slc, g_swa, block_indices=block_indices, block_counts=S, block_size=block_size,
+        scale=scale, window_size=window_size, cu_seqlens=cu_seqlens)
+    o_full.backward(do)
+    tri_dq, q.grad = q.grad.clone(), None
+    tri_dk, k.grad = k.grad.clone(), None
+    tri_dv, v.grad = v.grad.clone(), None
+
+    assert_close('full vs naive', o_full, o_naive, 0.005)
+    assert_close('dq', ref_dq, tri_dq, 0.005)
+    assert_close('dk', ref_dk, tri_dk, 0.005)
+    assert_close('dv', ref_dv, tri_dv, 0.005)
+
+    q_short = build_partial_varlen(q, cu_seqlens, q_lens)
+    g_short = build_partial_varlen(g, cu_seqlens, q_lens)
+    g_cmp, g_slc, g_swa = g_short.sigmoid().unbind(-1)
+    cu_seqlens_q = torch.cumsum(torch.tensor([0] + q_lens), dim=0).int().to(device)
+
+    if block_indices is not None:
+        block_indices = build_partial_varlen(block_indices, cu_seqlens, q_lens)
+
+    o_short_ref = build_partial_varlen(o_full, cu_seqlens, q_lens)
+
+    o_short = parallel_nsa(
+        q_short, k, v, g_cmp, g_slc, g_swa, block_indices=block_indices, block_counts=S, block_size=block_size,
+        scale=scale, window_size=window_size, cu_seqlens=(cu_seqlens_q, cu_seqlens), )
+
+    assert_close('outputs: full vs short', o_short, o_short_ref, 0.005)