Skip to content

Commit 1700f8d

Browse files
committed
add input size assertions. fix kda doc
1 parent feb153a commit 1700f8d

File tree

2 files changed

+9
-0
lines changed

2 files changed

+9
-0
lines changed

fla/ops/gla/chunk.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1316,5 +1316,9 @@ def chunk_gla(
13161316
)
13171317
if scale is None:
13181318
scale = q.shape[-1] ** -0.5
1319+
if initial_state is not None:
1320+
assert initial_state.dtype == torch.float32, "initial_state must be in float32."
1321+
assert q.shape == k.shape == g.shape, "q, k, g must have the same shape."
1322+
assert v.shape == (q.shape[0], q.shape[1], q.shape[2], v.shape[-1]), "v must be of shape (batch size, seq len, num of head, head dim)."
13191323
o, final_state = ChunkGLAFunction.apply(q, k, v, g, scale, initial_state, output_final_state, cu_seqlens)
13201324
return o, final_state

fla/ops/kda/chunk.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,11 @@ def chunk_kda(
334334
f"The number of initial states is expected to be equal to the number of input sequences, "
335335
f"i.e., {len(cu_seqlens) - 1} rather than {initial_state.shape[0]}.",
336336
)
337+
if initial_state is not None:
338+
assert initial_state.dtype == torch.float32, "initial_state must be in float32."
339+
assert q.shape == k.shape == g.shape, "q, k, g must have the same shape."
340+
assert beta.shape == (q.shape[0], q.shape[1], q.shape[2]), "beta must be of shape (batch size, seq len, num of head)."
341+
assert v.shape == (q.shape[0], q.shape[1], q.shape[2], v.shape[-1]), "v must be of shape (batch size, seq len, num of head, head dim)."
337342
if scale is None:
338343
scale = k.shape[-1] ** -0.5
339344
o, final_state = ChunkKDAFunction.apply(

0 commit comments

Comments
 (0)