Commit d11c0cf

feat: support gptoss in aiconfigurator sdk (#56)
* rebase gpt oss related changes
* revert change to b_list
* fallback to sol estimation when moe_dict is none
* update sol calculation of context attention
* update test to support variable head_size and window_size
* update context attention test case to query specific head_size and window_size
* update gen attention test case
* update load attention data tests
* update edge test case
* update test_correct_generation_attention_data
1 parent e46d908 commit d11c0cf

11 files changed: +220 −120 lines


collector/trtllm/collect_attn.py

Lines changed: 7 additions & 2 deletions

@@ -272,11 +272,14 @@ def get_context_attention_test_cases():
         #print(f'collecting heads: {n} kv_heads: {num_kv_heads} seq: {s} batchsize: {b}')
         # use fp8 kv cache, fp8 context fmha, is_context_phase. in torch flow, int8 kvcache is not supported yet.
         # fp16 kv cache, fp16 context fmha, is_context_phase
-        if head_dim == 64:
+        if h == 64:
             test_cases.append([b, s, n, num_kv_heads, h, 128, False, False, True, 'context_attention_perf.txt'])
+            test_cases.append([b, s, n, num_kv_heads, h, 0, False, False, True, 'context_attention_perf.txt'])
             if has_fp8:
                 test_cases.append([b, s, n, num_kv_heads, h, 128, True, False, True, 'context_attention_perf.txt'])
                 test_cases.append([b, s, n, num_kv_heads, h, 128, True, True, True, 'context_attention_perf.txt'])
+                test_cases.append([b, s, n, num_kv_heads, h, 0, True, False, True, 'context_attention_perf.txt'])
+                test_cases.append([b, s, n, num_kv_heads, h, 0, True, True, True, 'context_attention_perf.txt'])
         else:
             test_cases.append([b, s, n, num_kv_heads, h, 0, False, False, True, 'context_attention_perf.txt'])
             if has_fp8:
@@ -375,10 +378,12 @@ def get_generation_attention_test_cases():
         maxNumHeadsQPerKvInCta = 32
         if mNumHeadsQPerKv >= maxNumHeadsQPerKvInCta and mNumHeadsQPerKv % maxNumHeadsQPerKvInCta != 0:
             continue
-        if head_dim == 64:
+        if h == 64:
             test_cases.append([b, s, n, n_kv, h, 128, False, False, False, 'generation_attention_perf.txt'])
+            test_cases.append([b, s, n, n_kv, h, 0, False, False, False, 'generation_attention_perf.txt'])
            if has_fp8:
                 test_cases.append([b, s, n, n_kv, h, 128, True, False, False, 'generation_attention_perf.txt'])
+                test_cases.append([b, s, n, n_kv, h, 0, True, False, False, 'generation_attention_perf.txt'])
             # currently, fp8 is not for generation compute
             #test_cases.append([b, s, n, n_kv, 128, True, True, False, 'generation_attention_perf.txt'])
        else:
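
Reading these call sites, the sixth field of each test case appears to be the attention window size (128 for the sliding-window layers gpt-oss uses, 0 for full attention), with `h` now carrying the head size, so the head_size == 64 branch sweeps both attention variants. A minimal sketch of that shape under those assumptions; `sliding_window_cases` is a hypothetical helper, not part of the collector:

```python
# Hypothetical helper sketching the shape of the new head_size == 64 branch.
# Assumed field order: [batch, seq, n_heads, n_kv_heads, head_size,
#                       window_size, fp8_kv, fp8_fmha, is_context, out_file].
def sliding_window_cases(b, s, n, n_kv, head_size, has_fp8, out_file):
    cases = []
    for window in (128, 0):  # 128-token sliding window and full attention
        cases.append([b, s, n, n_kv, head_size, window, False, False, True, out_file])
        if has_fp8:
            cases.append([b, s, n, n_kv, head_size, window, True, False, True, out_file])
    return cases

print(len(sliding_window_cases(1, 2048, 64, 8, 64, True, 'context_attention_perf.txt')))  # 4
```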

collector/trtllm/collect_moe.py

Lines changed: 4 additions & 0 deletions

@@ -242,6 +242,10 @@ def run_moe_torch(moe_type, num_tokens_lists, hidden_size, inter_size, topk, num
             swiglu_limit = torch.tensor(
                 [7.0] * (num_experts // moe_ep_size),
                 dtype=torch.float32).cuda()
+            if 86 < getSMVersion() < 100:
+                model_config.moe_backend = 'triton'
+            else:
+                model_config.moe_backend = 'cutlass' if not min_latency_mode else 'trtllm'
         else:
             model_config.moe_backend = 'cutlass' if not min_latency_mode else 'trtllm'
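
The added branch selects the MoE backend from the GPU's SM version on this code path: compute capabilities strictly between 86 and 100 (e.g. SM 89 Ada, SM 90 Hopper) use the Triton kernels, everything else keeps cutlass, or trtllm in min-latency mode. A standalone sketch of that gate; `pick_moe_backend` is a hypothetical stand-in for the inline assignment to `model_config.moe_backend`:

```python
# Standalone sketch of the SM-version gate added above.
def pick_moe_backend(sm_version: int, min_latency_mode: bool) -> str:
    if 86 < sm_version < 100:    # e.g. SM 89 (Ada) or SM 90 (Hopper) -> Triton MoE kernels
        return 'triton'
    return 'cutlass' if not min_latency_mode else 'trtllm'

assert pick_moe_backend(90, False) == 'triton'
assert pick_moe_backend(100, False) == 'cutlass'   # SM 100 keeps the cutlass/trtllm path
assert pick_moe_backend(100, True) == 'trtllm'
```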

src/aiconfigurator/sdk/common.py

Lines changed: 4 additions & 1 deletion

@@ -70,7 +70,9 @@ class BlockConfig:
         BlockConfig(None, True, 3.28125, False, 1),
         BlockConfig(None, True, 5.25, False, 1)
         ]
-    ]
+    ],
+    'GPT_OSS_120B':['MOE',36,64,8,64,2880,2880,201088,131072,4,128,2880,None],
+    'GPT_OSS_20B':['MOE',24,64,8,64,2880,2880,201088,131072,4,32,2880,None]
 }
 
 """
@@ -186,6 +188,7 @@ class MoEQuantMode(Enum):
     fp8_block = QuantMapping(1, 2, 'fp8_block') # specific for trtllm torch ds fp8
     w4afp8 = QuantMapping(0.5, 2, 'w4afp8') # specific for trtllm torch ds w4a8
     nvfp4 = QuantMapping(0.5, 4, 'nvfp4') # nvfp4 on blackwell
+    w4a16_mxfp4 = QuantMapping(0.5, 1, 'w4a16_mxfp4') # native data format for gpt oss
 
 class FMHAQuantMode(Enum):
     """

src/aiconfigurator/sdk/models.py

Lines changed: 27 additions & 2 deletions

@@ -266,10 +266,33 @@ def __init__(self, topk: int, num_experts: int, moe_inter_size: int, *args) -> N
         fmha_quant_mode = self.config.fmha_quant_mode
         workload_distribution = self.config.workload_distribution + f"_{self._power_law_alpha}"
 
+        if self.model_name in ['GPT_OSS_120B','GPT_OSS_20B']:
+            attn_scale_factor = 2
+            window_size = 128
+            self.context_ops.append(ops.ContextAttention(f'context_attention',
+                                                         self._num_layers/attn_scale_factor,
+                                                         self._num_heads//tp_size,
+                                                         num_kv_heads_per_GPU,
+                                                         kvcache_quant_mode,
+                                                         fmha_quant_mode,
+                                                         window_size,
+                                                         self._head_size))
+            self.generation_ops.append(ops.GenerationAttention(f'generation_attention',
+                                                               self._num_layers/attn_scale_factor,
+                                                               self._num_heads//tp_size,
+                                                               num_kv_heads_per_GPU,
+                                                               kvcache_quant_mode,
+                                                               window_size,
+                                                               self._head_size))
+        else:
+            attn_scale_factor = 1
+
         self.context_ops.extend([ops.Embedding(f'context_embedding', 1, self._vocab_size, h, 0.3),
                                  ops.ElementWise(f'context_add_norm_1', self._num_layers, 2*h, 2*h, 0.8),
                                  ops.GEMM(f'context_qkv_gemm', self._num_layers, self._num_heads*self._head_size//tp_size+self._head_size*num_kv_heads_per_GPU*2, h, gemm_quant_mode),
-                                 ops.ContextAttention(f'context_attention', self._num_layers, self._num_heads//tp_size, num_kv_heads_per_GPU, kvcache_quant_mode, fmha_quant_mode),
+                                 ops.ContextAttention(f'context_attention', self._num_layers/attn_scale_factor,
+                                                      self._num_heads//tp_size, num_kv_heads_per_GPU, kvcache_quant_mode,
+                                                      fmha_quant_mode, head_size=self._head_size),
                                  ops.GEMM(f'context_proj_gemm', self._num_layers, h, self._num_heads*self._head_size//tp_size, gemm_quant_mode),
                                  ops.ElementWise(f'context_add_norm_2', self._num_layers, 2*h, 2*h, 0.8)])
 
@@ -290,7 +313,9 @@ def __init__(self, topk: int, num_experts: int, moe_inter_size: int, *args) -> N
         self.generation_ops.extend([ops.Embedding(f'generation_embedding', 1, self._vocab_size, h, 0.3),
                                     ops.ElementWise(f'generation_add_norm_1', self._num_layers, 2*h, 2*h, 0.8),
                                     ops.GEMM(f'generation_qkv_gemm', self._num_layers, self._num_heads*self._head_size//tp_size+self._head_size*num_kv_heads_per_GPU*2, h, gemm_quant_mode),
-                                    ops.GenerationAttention(f'generation_attention', self._num_layers, self._num_heads//tp_size, num_kv_heads_per_GPU, kvcache_quant_mode),
+                                    ops.GenerationAttention(f'generation_attention', self._num_layers/attn_scale_factor,
+                                                            self._num_heads//tp_size, num_kv_heads_per_GPU, kvcache_quant_mode,
+                                                            head_size=self._head_size),
                                     ops.GEMM(f'generation_proj_gemm', self._num_layers, h, self._num_heads*self._head_size//tp_size, gemm_quant_mode),
                                     ops.ElementWise(f'generation_add_norm_2', self._num_layers, 2*h, 2*h, 0.8)])
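
GPT-OSS interleaves sliding-window and full-attention layers, so the model now splits the attention cost in two: the new branch adds a dedicated op for the windowed half of the layers (num_layers / attn_scale_factor with window_size=128), while the shared path below keeps the full-attention op, also scaled by attn_scale_factor; all other models keep attn_scale_factor = 1 and a single attention op. A small illustrative sketch of that split (hypothetical helper, not SDK code):

```python
# Illustrative split of attention layers between the two ops built above.
def attention_layer_split(model_name: str, num_layers: int) -> dict:
    if model_name in ('GPT_OSS_120B', 'GPT_OSS_20B'):
        scale = 2
        return {'sliding_window_128': num_layers / scale,
                'full_attention': num_layers / scale}
    return {'full_attention': num_layers}

print(attention_layer_split('GPT_OSS_120B', 36))  # 18 windowed + 18 full-attention layers
print(attention_layer_split('GPT_OSS_20B', 24))   # 12 + 12
```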

src/aiconfigurator/sdk/operations.py

Lines changed: 18 additions & 4 deletions

@@ -317,18 +317,25 @@ def __init__(self,
                  n: int,
                  n_kv: int,
                  kvcache_quant_mode: common.KVCacheQuantMode,
-                 fmha_quant_mode: common.FMHAQuantMode) -> None:
+                 fmha_quant_mode: common.FMHAQuantMode,
+                 window_size: int = 0,
+                 head_size: int = 128) -> None:
         super().__init__(name, scale_factor)
         self._n = n
         self._weights = 0.0
         self._n_kv = n_kv
         self._kvcache_quant_mode = kvcache_quant_mode
         self._fmha_quant_mode = fmha_quant_mode
+        self._window_size = window_size
+        self._head_size = head_size
 
     def query(self, database:PerfDatabase, **kwargs):
         batch_size = kwargs.get('batch_size')
         isl = kwargs.get('s')
-        return database.query_context_attention(batch_size, isl, self._n, self._n_kv, self._kvcache_quant_mode, self._fmha_quant_mode)*self._scale_factor
+        return database.query_context_attention(batch_size, isl, self._n, self._n_kv,
+                                                self._kvcache_quant_mode, self._fmha_quant_mode,
+                                                window_size=self._window_size,
+                                                head_size=self._head_size)*self._scale_factor
 
     def get_weights(self, **kwargs):
         return self._weights * self._scale_factor
@@ -342,19 +349,26 @@ def __init__(self,
                  scale_factor: float,
                  n: int,
                  n_kv: int,
-                 kv_cache_dtype: common.KVCacheQuantMode) -> None:
+                 kv_cache_dtype: common.KVCacheQuantMode,
+                 window_size: int = 0,
+                 head_size: int = 128) -> None:
         super().__init__(name, scale_factor)
         self._n = n
         self._weights = 0.0
         self._n_kv = n_kv
         self._kv_cache_dtype = kv_cache_dtype
+        self._window_size = window_size
+        self._head_size = head_size
 
     def query(self, database:PerfDatabase, **kwargs):
         beam_width = kwargs.get('beam_width')
         assert(beam_width == 1), "only support beam_width=1"
         batch_size = kwargs.get('batch_size')
         s = kwargs.get('s')
-        return database.query_generation_attention(batch_size, s, self._n, self._n_kv, self._kv_cache_dtype)*self._scale_factor
+        return database.query_generation_attention(batch_size, s, self._n, self._n_kv,
+                                                   self._kv_cache_dtype,
+                                                   window_size=self._window_size,
+                                                   head_size=self._head_size)*self._scale_factor
 
     def get_weights(self, **kwargs):
         return self._weights * self._scale_factor
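
Both attention ops keep existing call sites working by defaulting window_size to 0 (no sliding window) and head_size to 128; they just store the two values and forward them to the database query as keyword arguments. A self-contained sketch of that plumbing, using stand-in classes rather than the real ContextAttention / PerfDatabase:

```python
# Stand-in classes sketching how window_size / head_size flow into the perf query;
# FakePerfDB only echoes what the real PerfDatabase would receive.
from dataclasses import dataclass

@dataclass
class ContextAttentionSketch:
    n: int                    # query heads per GPU
    n_kv: int                 # kv heads per GPU
    window_size: int = 0      # 0 keeps the previous full-attention behaviour
    head_size: int = 128      # previous implicit default

    def query(self, database, batch_size: int, s: int) -> float:
        return database.query_context_attention(batch_size, s, self.n, self.n_kv,
                                                window_size=self.window_size,
                                                head_size=self.head_size)

class FakePerfDB:
    def query_context_attention(self, b, s, n, n_kv, *, window_size, head_size):
        print(f"query: b={b} s={s} n={n} n_kv={n_kv} window={window_size} head={head_size}")
        return 0.0

ContextAttentionSketch(64, 8, window_size=128, head_size=64).query(FakePerfDB(), 4, 2048)
```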
