4 changes: 4 additions & 0 deletions fla/__init__.py
@@ -26,6 +26,7 @@
    RodimusAttention,
    RWKV6Attention,
    RWKV7Attention,
    StickBreakingAttention,
)
from fla.models import (
    ABCForCausalLM,
@@ -74,6 +75,8 @@
    RWKV6Model,
    RWKV7ForCausalLM,
    RWKV7Model,
    StickBreakingAttentionForCausalLM,
    StickBreakingAttentionModel,
    TransformerForCausalLM,
    TransformerModel,
)
@@ -105,6 +108,7 @@
    'RodimusAttention', 'RodimusForCausalLM', 'RodimusModel',
    'RWKV6Attention', 'RWKV6ForCausalLM', 'RWKV6Model',
    'RWKV7Attention', 'RWKV7ForCausalLM', 'RWKV7Model',
    'StickBreakingAttention', 'StickBreakingAttentionForCausalLM', 'StickBreakingAttentionModel',
]

__version__ = '0.4.0'
2 changes: 2 additions & 0 deletions fla/layers/__init__.py
@@ -30,6 +30,7 @@
from .rodimus import RodimusAttention, SlidingWindowSharedKeyAttention
from .rwkv6 import RWKV6Attention
from .rwkv7 import RWKV7Attention
from .stickbreaking_attn import StickBreakingAttention

__all__ = [
    'ABCAttention',
@@ -61,6 +62,7 @@
    'RodimusAttention',
    'RWKV6Attention',
    'RWKV7Attention',
    'StickBreakingAttention',
    'SlidingWindowSharedKeyAttention',
    'DeltaFormerAttention',
]
108 changes: 108 additions & 0 deletions fla/layers/stickbreaking_attn.py
@@ -0,0 +1,108 @@
# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang

from __future__ import annotations

import warnings
from typing import TYPE_CHECKING

import torch
import torch.nn as nn
from einops import rearrange
from transformers.utils import logging

from fla.modules import RMSNorm
from fla.ops.stickbreaking_attn import parallel_stickbreaking_attn

if TYPE_CHECKING:
from fla.models.utils import Cache


logger = logging.get_logger(__name__)


class StickBreakingAttention(nn.Module):

    def __init__(
        self,
        hidden_size: int = 2048,
        num_heads: int = 32,
        num_kv_heads: int | None = None,
        qkv_bias: bool = False,
        qk_norm: bool = False,
        window_size: int | None = None,
        max_position_embeddings: int | None = None,
        layer_idx: int | None = None,
    ):
        super().__init__()

        if parallel_stickbreaking_attn is None:
            raise ImportError(
                "StickBreakingAttention kernels are not available. Ensure Triton is installed and ops are importable.",
            )

        self.hidden_size = hidden_size
        self.num_heads = num_heads
        if num_kv_heads is None:
            self.num_kv_heads = self.num_heads
        else:
            self.num_kv_heads = num_kv_heads
        self.num_kv_groups = self.num_heads // self.num_kv_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.kv_dim = self.num_kv_heads * self.head_dim
        self.qkv_bias = qkv_bias
Comment on lines +49 to +52
⚠️ Potential issue

Validate head divisibility and GQA/MQA support.

Avoid silent mis-sharding. Either implement KV grouping or gate it for now.

Apply this diff:

-        self.num_kv_groups = self.num_heads // self.num_kv_heads
-        self.head_dim = self.hidden_size // self.num_heads
+        if self.hidden_size % self.num_heads != 0:
+            raise ValueError(f"hidden_size ({self.hidden_size}) must be divisible by num_heads ({self.num_heads}).")
+        if self.num_heads % self.num_kv_heads != 0:
+            raise ValueError(f"num_heads ({self.num_heads}) must be divisible by num_kv_heads ({self.num_kv_heads}).")
+        self.num_kv_groups = self.num_heads // self.num_kv_heads
+        self.head_dim = self.hidden_size // self.num_heads

If GQA/MQA is not yet supported by the kernel, also add:

+        if self.num_kv_heads != self.num_heads:
+            raise NotImplementedError("GQA/MQA (num_kv_heads != num_heads) is not supported yet for StickBreakingAttention.")

        self.qk_norm = qk_norm

        self.window_size = window_size
        self.max_position_embeddings = max_position_embeddings
        self.layer_idx = layer_idx

        self.q_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=self.qkv_bias)
        self.k_proj = nn.Linear(self.hidden_size, self.kv_dim, bias=self.qkv_bias)
        self.v_proj = nn.Linear(self.hidden_size, self.kv_dim, bias=self.qkv_bias)
        self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)

        if qk_norm:
            self.q_norm = RMSNorm(self.head_dim)
            self.k_norm = RMSNorm(self.head_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        **kwargs,
    ) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]:
        if attention_mask is not None:
            assert len(attention_mask.shape) == 2, (
                "Expected attention_mask as a 0-1 matrix with shape [batch_size, seq_len] "
                "for padding purposes (0 indicating padding). "
                "Arbitrary attention masks of shape [batch_size, seq_len, seq_len] are not allowed."
            )

        if use_cache:
            warnings.warn(
                "StickBreakingAttention does not support KV cache yet; falling back to use_cache=False.")
Comment on lines +77 to +86
⚠️ Potential issue | 🟠 Major

Do not silently ignore attention_mask

We assert that attention_mask is 2-D, but we never use it afterward. When padding is present in the batch, the kernel still allocates stick mass to the padded positions, so the outputs (and gradients) depend on garbage tokens. That violates the Hugging Face contract for attention layers and breaks training/inference with padded batches.

Until the kernel can consume the mask (e.g. by translating it to cu_seqlens or masking logits), we should not silently continue when zeros are present. Please either wire up proper masking or explicitly fail to avoid corrupt results. For example:

         if attention_mask is not None:
             assert len(attention_mask.shape) == 2, (
                 "Expected attention_mask as a 0-1 matrix with shape [batch_size, seq_len] "
                 "for padding purposes (0 indicating padding). "
                 "Arbitrary attention masks of shape [batch_size, seq_len, seq_len] are not allowed."
             )
+            valid = attention_mask.to(dtype=torch.bool)
+            if not torch.all(valid):
+                raise NotImplementedError(
+                    "Padding masks are not yet supported. Please supply packed sequences via `cu_seqlens` "
+                    "or implement masking before calling StickBreakingAttention."
+                )
🤖 Prompt for AI Agents
In fla/layers/stickbreaking_attn.py around lines 79-88, the code asserts
attention_mask is 2-D but never uses it so padded positions still receive stick
mass; either wire the mask into the kernel (e.g., convert [batch, seq_len] mask
into cu_seqlens or apply it to attention logits before the stick-breaking
kernel) or explicitly fail when any padding is present. Implement one of two
fixes: (1) translate the 0/1 attention_mask to the kernel-friendly format and
ensure masked positions are excluded from stick mass allocation, or (2) check
for any zeros in attention_mask and raise a clear ValueError (or
NotImplementedError) stating that StickBreakingAttention does not support padded
masks yet. Ensure the failure case includes guidance to use a mask-free batch or
a compatible attention implementation.

            use_cache = False

        batch_size, q_len, _ = hidden_states.size()

        q = rearrange(self.q_proj(hidden_states), '... (h d) -> ... h d', d=self.head_dim)
        k = rearrange(self.k_proj(hidden_states), '... (h d) -> ... h d', d=self.head_dim)
        v = rearrange(self.v_proj(hidden_states), '... (h d) -> ... h d', d=self.head_dim)

        if self.qk_norm:
            q, k = self.q_norm(q), self.k_norm(k)

        cu_seqlens = kwargs.get('cu_seqlens')
        o, _rem = parallel_stickbreaking_attn(
            q=q,
            k=k,
            v=v,
            cu_seqlens=cu_seqlens,
        )
        o = o.reshape(batch_size, q_len, -1)
        o = self.o_proj(o)

        return o, None, past_key_values
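
A minimal usage sketch for the layer above (not part of the PR). The shapes, dtype, and the CUDA/Triton requirement are illustrative assumptions, and the packed-sequence call assumes `cu_seqlens` follows the FlashAttention varlen convention:

import torch
import torch.nn.functional as F

from fla.layers.stickbreaking_attn import StickBreakingAttention

# Hypothetical smoke test; requires a CUDA device with Triton, since the layer
# raises ImportError when the stick-breaking kernels cannot be imported.
layer = StickBreakingAttention(hidden_size=2048, num_heads=32, layer_idx=0)
layer = layer.cuda().to(torch.bfloat16)

x = torch.randn(2, 128, 2048, device='cuda', dtype=torch.bfloat16)  # [batch, seq_len, hidden]
o, attn, cache = layer(x)  # attention weights and cache are always None for now
assert o.shape == x.shape

# Packed (varlen) call: two sequences of lengths 128 and 96 flattened into one
# batch row, with cu_seqlens marking the sequence boundaries ([0, 128, 224]).
lengths = torch.tensor([128, 96], dtype=torch.int32, device='cuda')
cu_seqlens = F.pad(lengths.cumsum(0, dtype=torch.int32), (1, 0))
packed = torch.randn(1, int(lengths.sum()), 2048, device='cuda', dtype=torch.bfloat16)
o_packed, _, _ = layer(packed, cu_seqlens=cu_seqlens)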
6 changes: 6 additions & 0 deletions fla/models/__init__.py
@@ -31,6 +31,11 @@
from fla.models.rwkv6 import RWKV6Config, RWKV6ForCausalLM, RWKV6Model
from fla.models.rwkv7 import RWKV7Config, RWKV7ForCausalLM, RWKV7Model
from fla.models.samba import SambaConfig, SambaForCausalLM, SambaModel
from fla.models.stickbreaking_attn import (
    StickBreakingAttentionConfig,
    StickBreakingAttentionForCausalLM,
    StickBreakingAttentionModel,
)
from fla.models.transformer import TransformerConfig, TransformerForCausalLM, TransformerModel

__all__ = [
@@ -63,4 +68,5 @@
    'RWKV7Config', 'RWKV7ForCausalLM', 'RWKV7Model',
    'SambaConfig', 'SambaForCausalLM', 'SambaModel',
    'TransformerConfig', 'TransformerForCausalLM', 'TransformerModel',
    'StickBreakingAttentionConfig', 'StickBreakingAttentionForCausalLM', 'StickBreakingAttentionModel',
]
15 changes: 15 additions & 0 deletions fla/models/stickbreaking_attn/__init__.py
@@ -0,0 +1,15 @@

from transformers import AutoConfig, AutoModel, AutoModelForCausalLM

from fla.models.stickbreaking_attn.configuration_stickbreaking_attn import StickBreakingAttentionConfig
from fla.models.stickbreaking_attn.modeling_stickbreaking_attn import (
    StickBreakingAttentionForCausalLM,
    StickBreakingAttentionModel,
)

AutoConfig.register(StickBreakingAttentionConfig.model_type, StickBreakingAttentionConfig, exist_ok=True)
AutoModel.register(StickBreakingAttentionConfig, StickBreakingAttentionModel, exist_ok=True)
AutoModelForCausalLM.register(StickBreakingAttentionConfig, StickBreakingAttentionForCausalLM, exist_ok=True)


__all__ = ['StickBreakingAttentionConfig', 'StickBreakingAttentionForCausalLM', 'StickBreakingAttentionModel']
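
A short sketch of what the registration above enables (not part of the PR; the config values are illustrative, and building the model still requires the Triton kernels to be importable, per the layer's constructor check):

from transformers import AutoModelForCausalLM

from fla.models import StickBreakingAttentionConfig  # importing fla.models runs the Auto* registration above

config = StickBreakingAttentionConfig(
    hidden_size=256,       # illustrative small values
    num_hidden_layers=2,
    num_heads=4,
    vocab_size=1000,
)
model = AutoModelForCausalLM.from_config(config)
print(type(model).__name__)  # StickBreakingAttentionForCausalLM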
82 changes: 82 additions & 0 deletions fla/models/stickbreaking_attn/configuration_stickbreaking_attn.py
@@ -0,0 +1,82 @@
import warnings

from transformers.configuration_utils import PretrainedConfig


class StickBreakingAttentionConfig(PretrainedConfig):

    model_type = 'stickbreaking_attn'
    keys_to_ignore_at_inference = ['past_key_values']

    def __init__(
        self,
        hidden_size: int = 2048,
        num_hidden_layers: int = 24,
        num_heads: int = 32,
        num_kv_heads: int | None = None,
        qkv_bias: bool = False,
        qk_norm: bool = False,
        window_size: int | None = None,
        max_position_embeddings: int = 2048,
        hidden_ratio: int | None = 4,
        intermediate_size: int | None = None,
        hidden_act: str = "swish",
        initializer_range: float = 0.02,
        elementwise_affine: bool | None = True,
        norm_eps: float = 1e-6,
        use_cache: bool = True,
        pad_token_id: int | None = None,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        tie_word_embeddings: bool = False,
        fuse_norm: bool = True,
        fuse_swiglu: bool = True,
        fuse_cross_entropy: bool = True,
        fuse_linear_cross_entropy: bool = False,
        use_l2warp: bool = False,
        vocab_size: int = 32000,
        **kwargs,
    ):
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_heads = num_heads
        self.num_kv_heads = num_kv_heads
        self.qkv_bias = qkv_bias
        self.qk_norm = qk_norm
        self.window_size = window_size
        self.max_position_embeddings = max_position_embeddings

        self.hidden_ratio = hidden_ratio
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act

        self.initializer_range = initializer_range
        self.elementwise_affine = elementwise_affine
        self.norm_eps = norm_eps
        self.use_cache = use_cache

        self.fuse_norm = fuse_norm
        self.fuse_swiglu = fuse_swiglu
        self.fuse_cross_entropy = fuse_cross_entropy
        self.fuse_linear_cross_entropy = fuse_linear_cross_entropy
        self.use_l2warp = use_l2warp
        self.vocab_size = vocab_size

        if fuse_cross_entropy and fuse_linear_cross_entropy:
            raise ValueError(
                "`fuse_cross_entropy` and `fuse_linear_cross_entropy` cannot be True at the same time.",
            )
        if fuse_linear_cross_entropy:
            warnings.warn(
                "`fuse_linear_cross_entropy` is enabled, which can improve memory efficiency "
                "at the potential cost of reduced precision. "
                "If you observe issues like loss divergence, consider disabling this setting.",
            )

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
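
A quick sketch of the mutual-exclusion check above (illustrative, not part of the PR):

from fla.models.stickbreaking_attn import StickBreakingAttentionConfig

try:
    # Hypothetical misuse: enabling both fused losses is rejected by the config.
    StickBreakingAttentionConfig(fuse_cross_entropy=True, fuse_linear_cross_entropy=True)
except ValueError as exc:
    print(exc)  # `fuse_cross_entropy` and `fuse_linear_cross_entropy` cannot be True at the same time.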