
Commit b9e5954

[do NOT land] llama-3 8B w/ flex_attention

ghstack-source-id: c2d400e
Pull Request resolved: #1181
1 parent a4ed09c

File tree

2 files changed: +11 -6 lines


torchtitan/models/llama3/__init__.py

Lines changed: 11 additions, 0 deletions

@@ -47,6 +47,17 @@
         multiple_of=1024,
         rope_theta=500000,
     ),
+    "8B_flex_attn": TransformerModelArgs(
+        dim=4096,
+        n_layers=32,
+        n_heads=32,
+        n_kv_heads=8,
+        ffn_dim_multiplier=1.3,
+        multiple_of=1024,
+        rope_theta=500000,
+        use_flex_attn=True,
+        attn_mask_type="block_causal",
+    ),
     "70B": TransformerModelArgs(
         dim=8192,
         n_layers=80,
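
The new "8B_flex_attn" flavor matches "8B" except for the last two fields, which switch attention to FlexAttention with a block-causal mask. As a minimal sketch of what those two flags correspond to at the PyTorch level (not torchtitan's actual attention path; the shapes, the document_id tensor, and the block_causal helper below are made up for illustration), a block-causal mask can be built with create_block_mask and handed to flex_attention:

import torch
from torch.nn.attention.flex_attention import create_block_mask, flex_attention

# Illustrative shapes only; the 8B model itself uses dim=4096, n_heads=32, n_kv_heads=8.
B, H, S, D = 1, 8, 256, 64
device = "cuda" if torch.cuda.is_available() else "cpu"

# Hypothetical packed batch: document_id[i] is the document that token i belongs to.
document_id = torch.zeros(S, dtype=torch.int32, device=device)
document_id[S // 2 :] = 1  # two packed documents of equal length

def block_causal(b, h, q_idx, kv_idx):
    # Attend only within the same document, and only to current/earlier positions.
    return (document_id[q_idx] == document_id[kv_idx]) & (q_idx >= kv_idx)

block_mask = create_block_mask(block_causal, B=None, H=None, Q_LEN=S, KV_LEN=S, device=device)

q = torch.randn(B, H, S, D, device=device)
k = torch.randn(B, H, S, D, device=device)
v = torch.randn(B, H, S, D, device=device)
out = flex_attention(q, k, v, block_mask=block_mask)

In practice flex_attention is meant to be wrapped in torch.compile, so the block sparsity encoded in the mask turns into skipped work rather than merely masked-out scores.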

torchtitan/models/llama3/model.py

Lines changed: 0 additions & 6 deletions
@@ -51,12 +51,6 @@ def update_from_config(self, job_config: JobConfig, tokenizer: Tokenizer) -> None:
                 "See https://github.com/pytorch/pytorch/issues/147879"
             )

-        if job_config.parallelism.context_parallel_degree > 1 and self.use_flex_attn:
-            raise ValueError(
-                "FlexAttention is not compatible with CP yet. "
-                "We are still working on this."
-            )
-
     def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]:
         nparams = sum(p.numel() for p in model.parameters())
         nparams_embedding = sum(
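
For reference, the deleted block was only an up-front compatibility check. If a setup still needs to reject context parallelism together with FlexAttention, an equivalent standalone guard (the function name is hypothetical; the attribute names mirror the diff above) would be:

def check_cp_flex_attn(job_config, model_args) -> None:
    # Hypothetical helper reproducing the removed validation: reject CP + FlexAttention.
    if job_config.parallelism.context_parallel_degree > 1 and model_args.use_flex_attn:
        raise ValueError(
            "FlexAttention is not compatible with CP yet. "
            "We are still working on this."
        )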
