@@ -225,9 +225,7 @@ def flash_v3(
         q = q.transpose(1, 2).contiguous()
         k = k.transpose(1, 2).contiguous()
         v = v.transpose(1, 2).contiguous()
-        fn = lambda: flashattn_hopper_cuda.fwd(
-            q, k, v, None, self.sm_scale, self.causal
-        )
+        fn = lambda: flash_attn_v3(q, k, v, self.sm_scale, self.causal)
         return fn
 
     @register_benchmark()
@@ -360,6 +358,25 @@ def sdpa_flash_attention(q, k, v):
                 v,
             )
 
+    @register_benchmark()
+    def flex_attention(self, q, k, v):
+        from torch.nn.attention.flex_attention import create_block_mask, flex_attention
+
+        def causal_mask(b, h, q_idx, kv_idx):
+            return q_idx >= kv_idx
+
+        flex_attention = torch.compile(flex_attention, dynamic=False)
+
+        if self.causal:
+            B, H, S, D = q.shape
+            block_mask = create_block_mask(
+                causal_mask, B=None, H=None, Q_LEN=S, KV_LEN=S
+            )
+        else:
+            block_mask = None
+
+        return lambda: flex_attention(q, k, v, block_mask=block_mask)
+
     @register_metric()
     def tflops(
         self, fn_name: str, example_inputs: Any, metrics: BenchmarkOperatorMetrics
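
For context, here is a minimal standalone sketch of the FlexAttention call that the new benchmark wraps. The shapes, dtype, and CUDA device below are illustrative assumptions, not values from the diff, and it requires a PyTorch build that ships `torch.nn.attention.flex_attention`:

```python
import torch
from torch.nn.attention.flex_attention import create_block_mask, flex_attention

# Illustrative [B, H, S, D] inputs; the benchmark uses whatever shapes it was configured with.
B, H, S, D = 4, 8, 1024, 64
device = "cuda"
q = torch.randn(B, H, S, D, device=device, dtype=torch.bfloat16)
k = torch.randn(B, H, S, D, device=device, dtype=torch.bfloat16)
v = torch.randn(B, H, S, D, device=device, dtype=torch.bfloat16)

# mask_mod: keep a (q_idx, kv_idx) pair only when it lies on or below the diagonal.
def causal_mask(b, h, q_idx, kv_idx):
    return q_idx >= kv_idx

# Compiling flex_attention fuses the mask logic into the generated attention kernel.
compiled_flex = torch.compile(flex_attention, dynamic=False)

# B=None, H=None broadcasts the same mask over batch and heads.
block_mask = create_block_mask(causal_mask, B=None, H=None, Q_LEN=S, KV_LEN=S)

out = compiled_flex(q, k, v, block_mask=block_mask)
print(out.shape)  # torch.Size([4, 8, 1024, 64])
```

As in the diff, no explicit scale is passed, so flex_attention falls back to its default 1/sqrt(D) softmax scaling.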