Update code formatting (pass 3): whitespace and line-wrapping changes only

YijiaZhao · YijiaZhao · commit 19fa3cf3fd87 · 2025-11-05T11:00:04.000+08:00
diff --git a/collector/collect.py b/collector/collect.py
@@ -267,9 +267,9 @@ def create_process_exit_error(device_id, exit_code):
     # Wait for processes
     for p in processes:
         if "moe" in func.__name__:
-            p.join(timeout = 2000)
+            p.join(timeout=2000)
         else:
-            p.join(timeout = 10)
+            p.join(timeout=10)
         if p.is_alive():
             logger.warning(f"Process {p.pid} did not terminate, forcing...")
             p.terminate()
@@ -376,7 +376,8 @@ def collect_trtllm(num_processes: int, ops: list[str] | None = None):
             "get_func": "get_context_mla_test_cases",
             "run_func": "run_mla",
             "version_handler": lambda v: "trtllm.collect_mla_1_1rc2"
-            if v.startswith(("1.1.0", "1.2.0")) else "trtllm.collect_mla",
+            if v.startswith(("1.1.0", "1.2.0"))
+            else "trtllm.collect_mla",
         },
         {
             "name": "trtllm",
@@ -385,7 +386,8 @@ def collect_trtllm(num_processes: int, ops: list[str] | None = None):
             "get_func": "get_generation_mla_test_cases",
             "run_func": "run_mla",
             "version_handler": lambda v: "trtllm.collect_mla_1_1rc2"
-            if v.startswith(("1.1.0", "1.2.0")) else "trtllm.collect_mla",
+            if v.startswith(("1.1.0", "1.2.0"))
+            else "trtllm.collect_mla",
         },
         # Attention collections - separate entries for context and generation
         {
diff --git a/collector/trtllm/collect_mla.py b/collector/trtllm/collect_mla.py
@@ -21,7 +21,7 @@
 
 
 def get_context_mla_test_cases():
-    dtype_list = [tensorrt_llm.bindings.DataType.BF16] # not support f8 for trt < v1.1
+    dtype_list = [tensorrt_llm.bindings.DataType.BF16]  # not support f8 for trt < v1.1
     test_cases = []
     n_list = [128]
     b_list = [1, 2, 4, 8, 16, 32, 64, 128, 256]
@@ -72,7 +72,7 @@ def get_context_mla_test_cases():
 
 
 def get_generation_mla_test_cases():
-    dtype_list = [tensorrt_llm.bindings.DataType.BF16] # not support f8 for trt < v1.1
+    dtype_list = [tensorrt_llm.bindings.DataType.BF16]  # not support f8 for trt < v1.1
     test_cases = []
     n_list = [128]
     for n in n_list:
diff --git a/collector/trtllm/collect_mla_1_1rc2.py b/collector/trtllm/collect_mla_1_1rc2.py
@@ -126,6 +126,7 @@ def get_generation_mla_test_cases():
                         )
     return test_cases
 
+
 # Copied from transformers.models.llama.modeling_llama.rotate_half
 def rotate_half(x):
     """Rotates half the hidden dims of the input."""
diff --git a/src/aiconfigurator/sdk/perf_database.py b/src/aiconfigurator/sdk/perf_database.py
@@ -1916,9 +1916,10 @@ def get_sol(
                 * b
                 * (
                     n * s * h  # Q read, assuming 16 bits
-                    + n * s * h # Output write, assuming 16 bits
-                ) +  kvcache_quant_mode.value.memory * b * (2 * n_kv * s * h)  # K,V read
-            ) #TODO fp8 io
+                    + n * s * h  # Output write, assuming 16 bits
+                )
+                + kvcache_quant_mode.value.memory * b * (2 * n_kv * s * h)  # K,V read
+            )  # TODO fp8 io
             sol_math = ops / self.system_spec["gpu"]["float16_tc_flops"] * 1000 / fmha_quant_mode.value.compute
             sol_mem = mem_bytes / self.system_spec["gpu"]["mem_bw"] * 1000
             sol_time = max(sol_math, sol_mem)
@@ -2035,9 +2036,9 @@ def get_sol(
             ops = (
                 b * num_heads * 2 / 2 * (s * s * 192 + s * s * 128)
             )  # 2 for fma, 2 for causality. num_heads, for local heads
-            mem_bytes = b * num_heads * (
-                kvcache_quant_mode.value.memory * (s * 192 + s * 128) + 2 * (s * 192 + s * 128)
-            ) # fp16 io + fp16/fp8 kv cache, TODO fp8 io
+            mem_bytes = (
+                b * num_heads * (kvcache_quant_mode.value.memory * (s * 192 + s * 128) + 2 * (s * 192 + s * 128))
+            )  # fp16 io + fp16/fp8 kv cache, TODO fp8 io
             sol_math = ops / self.system_spec["gpu"]["float16_tc_flops"] * 1000 / fmha_quant_mode.value.compute
             sol_mem = mem_bytes / self.system_spec["gpu"]["mem_bw"] * 1000
             sol_time = max(sol_math, sol_mem)

Original file line number	Diff line number	Diff line change
`@@ -126,6 +126,7 @@ def get_generation_mla_test_cases():`
`126`	`126`	`)`
`127`	`127`	`return test_cases`
`128`	`128`
	`129`	`+`
`129`	`130`	`# Copied from transformers.models.llama.modeling_llama.rotate_half`
`130`	`131`	`def rotate_half(x):`
`131`	`132`	`"""Rotates half the hidden dims of the input."""`