
Commit ff7a82a

Re-use perf_database function to calculate SOL in collector
Signed-off-by: Kai Ma <[email protected]>
1 parent 844cc25 commit ff7a82a

5 files changed: 254 additions and 83 deletions

collector/collect.py

Lines changed: 11 additions & 1 deletion

@@ -134,6 +134,16 @@ def worker(
        worker_logger.exception("Failed to initialize NVML power monitor")
        raise  # Fail if power measurement requested but NVML unavailable

+    # Get default power limit if measuring power but no limits specified
+    default_power_limit = None
+    if measure_power and not power_limits:
+        try:
+            from nvml_power_monitor import get_power_management_limit
+            default_power_limit = get_power_management_limit(device_id)
+            worker_logger.info(f"Auto-detected power limit: {default_power_limit}W on device {device_id}")
+        except Exception as e:
+            worker_logger.warning(f"Could not get power limit, power data will not be recorded: {e}")
+
     # Process tasks
     while True:
         task_info = queue.get()
@@ -150,7 +160,7 @@ def worker(
         task_id = create_test_case_id(task, "unknown", module_name)

         # Sweep power limits
-        for power_limit in power_limits or [None]:
+        for power_limit in power_limits or [default_power_limit]:
             with lock:
                 progress_value.value += 1

collector/helper.py

Lines changed: 161 additions & 0 deletions

@@ -383,3 +383,164 @@ def measure_kernel_power(
     avg_power_watts = total_energy_j / (total_time_ms / 1000)  # J / seconds

     return avg_latency_ms, avg_power_watts
+
+
+def get_system_spec_from_device(device_name: str) -> dict:
+    """Load full system spec from device name.
+
+    Args:
+        device_name: GPU device name
+
+    Returns:
+        Full system_spec dict with 'gpu' key
+    """
+    device_upper = device_name.upper()
+    # Check "GB200" before "B200", since "B200" is a substring of "GB200".
+    if "H100" in device_upper:
+        system_file = "h100_sxm.yaml"
+    elif "H200" in device_upper:
+        system_file = "h200_sxm.yaml"
+    elif "A100" in device_upper:
+        system_file = "a100_sxm.yaml"
+    elif "GB200" in device_upper:
+        system_file = "gb200_sxm.yaml"
+    elif "B200" in device_upper:
+        system_file = "b200_sxm.yaml"
+    else:
+        raise ValueError(f"Unsupported GPU: {device_name}")
+
+    systems_dir = pkg_resources.files("aiconfigurator") / "systems"
+    yaml_path = systems_dir / system_file
+
+    with open(yaml_path) as f:
+        system_spec = yaml.safe_load(f)
+
+    return system_spec
+
+
+def _get_gemm_quant_mode(dtype_str: str):
+    """Map dtype string to GEMMQuantMode enum."""
+    from aiconfigurator.sdk import common
+
+    dtype_map = {
+        "float16": common.GEMMQuantMode.float16,
+        "fp8": common.GEMMQuantMode.fp8,
+        "fp8_block": common.GEMMQuantMode.fp8_block,
+        "nvfp4": common.GEMMQuantMode.nvfp4,
+    }
+
+    if dtype_str not in dtype_map:
+        raise ValueError(f"Unsupported dtype: {dtype_str}")
+
+    return dtype_map[dtype_str]
+
+
+def _get_kvcache_quant_mode(dtype_str: str, use_fp8_kv_cache: bool):
+    """Map dtype and fp8 flag to KVCacheQuantMode enum."""
+    from aiconfigurator.sdk import common
+
+    if use_fp8_kv_cache or "fp8" in dtype_str.lower():
+        return common.KVCacheQuantMode.fp8
+    else:
+        return common.KVCacheQuantMode.float16
+
+
+def _get_fmha_quant_mode(dtype_str: str, use_fp8_context_fmha: bool):
+    """Map dtype and fp8 flag to FMHAQuantMode enum."""
+    from aiconfigurator.sdk import common
+
+    if use_fp8_context_fmha or "fp8" in dtype_str.lower():
+        return common.FMHAQuantMode.fp8
+    else:
+        return common.FMHAQuantMode.float16
+
+
+def is_gemm_compute_bound_collector(m: int, n: int, k: int, dtype: str, device_name: str) -> bool:
+    """Determine if a GEMM operation is compute-bound.
+
+    Wrapper for use in collectors.
+
+    Args:
+        m, n, k: GEMM dimensions (C = A @ B, A is m x k, B is k x n)
+        dtype: Data type (e.g., 'float16', 'fp8')
+        device_name: GPU device name
+
+    Returns:
+        True if compute-bound, False if memory-bound
+    """
+    from aiconfigurator.sdk import common
+    from aiconfigurator.sdk.perf_database import PerfDatabase
+
+    system_spec = get_system_spec_from_device(device_name)
+    quant_mode = _get_gemm_quant_mode(dtype)
+
+    # Create a minimal PerfDatabase instance just to call query_gemm with SOL_FULL
+    db = PerfDatabase.__new__(PerfDatabase)
+    db.system_spec = system_spec
+
+    sol_time, sol_math, sol_mem = db.query_gemm(m, n, k, quant_mode, sol_mode=common.SOLMode.SOL_FULL)
+    return sol_math > sol_mem
+
+
+def is_context_attention_compute_bound_collector(
+    b: int,
+    s: int,
+    num_heads: int,
+    num_key_value_heads: int,
+    head_dim: int,
+    dtype: str,
+    kv_cache_dtype: str,
+    use_fp8_kv_cache: bool,
+    use_fp8_context_fmha: bool,
+    device_name: str,
+    attention_window_size: int = 0,
+) -> bool:
+    """Determine if context (prefill) attention is compute-bound.
+
+    Wrapper for use in collectors.
+
+    Args:
+        b: Batch size
+        s: Sequence length (input)
+        num_heads: Number of query heads (H_q)
+        num_key_value_heads: Number of key/value heads (H_kv)
+        head_dim: Head dimension
+        dtype: Activation dtype
+        kv_cache_dtype: KV cache dtype
+        use_fp8_kv_cache: Whether using FP8 for KV cache
+        use_fp8_context_fmha: Whether using FP8 for context FMHA
+        device_name: GPU device name
+        attention_window_size: Attention window size
+
+    Returns:
+        True if compute-bound, False if memory-bound
+    """
+    from aiconfigurator.sdk import common
+    from aiconfigurator.sdk.perf_database import PerfDatabase
+
+    system_spec = get_system_spec_from_device(device_name)
+    kvcache_quant_mode = _get_kvcache_quant_mode(kv_cache_dtype, use_fp8_kv_cache)
+    fmha_quant_mode = _get_fmha_quant_mode(dtype, use_fp8_context_fmha)
+
+    # Create a minimal PerfDatabase instance just to call query_context_attention with SOL_FULL
+    db = PerfDatabase.__new__(PerfDatabase)
+    db.system_spec = system_spec
+
+    sol_time, sol_math, sol_mem = db.query_context_attention(
+        b, s, num_heads, num_key_value_heads,
+        kvcache_quant_mode, fmha_quant_mode,
+        sol_mode=common.SOLMode.SOL_FULL,
+        window_size=attention_window_size,
+        head_size=head_dim,
+    )
+    return sol_math > sol_mem
+
+
+def is_generation_attention_compute_bound_collector() -> bool:
+    """Determine if generation (decode) attention is compute-bound.
+
+    Generation attention is ALWAYS memory-bound.
+
+    Returns:
+        False (always memory-bound)
+    """
+    return False
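
A hedged usage sketch of the new wrappers (the import path, device string, and shapes below are illustrative assumptions; the verdicts come from the SOL math/memory estimates that PerfDatabase derives from the packaged system YAMLs):

    # Illustrative only: module path, device string, and shapes are assumptions.
    from helper import (
        is_context_attention_compute_bound_collector,
        is_gemm_compute_bound_collector,
        is_generation_attention_compute_bound_collector,
    )

    # Large square fp16 GEMM on an H100-class device: typically compute-bound.
    print(is_gemm_compute_bound_collector(4096, 4096, 4096, "float16", "NVIDIA H100"))

    # Prefill GQA attention: batch 1, 4k tokens, 32 query / 8 KV heads, head_dim 128.
    print(is_context_attention_compute_bound_collector(
        b=1, s=4096, num_heads=32, num_key_value_heads=8, head_dim=128,
        dtype="float16", kv_cache_dtype="float16",
        use_fp8_kv_cache=False, use_fp8_context_fmha=False,
        device_name="NVIDIA H100",
    ))

    # Decode attention takes no arguments: it is treated as always memory-bound.
    print(is_generation_attention_compute_bound_collector())  # False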

collector/trtllm/collect_attn.py

Lines changed: 15 additions & 51 deletions

@@ -23,59 +23,13 @@
     get_dtype_size,
     get_gpu_specs_from_device,
     get_sm_version,
+    is_context_attention_compute_bound_collector,
+    is_generation_attention_compute_bound_collector,
     log_perf,
     measure_kernel_power,
 )


-def is_context_attention_compute_bound(b, s, num_heads, num_key_value_heads, d, dtype, kv_cache_dtype, device_name):
-    """
-    Determine if context (prefill) attention is compute-bound with Grouped-Query Attention.
-
-    Args:
-        b: Batch size
-        s: Sequence length (input)
-        num_heads: Number of query heads (H_q)
-        num_key_value_heads: Number of key/value heads (H_kv)
-        d: Head dimension
-        dtype: Activation dtype
-        kv_cache_dtype: KV cache dtype
-        device_name: GPU device name
-
-    Returns:
-        True if compute-bound, False if memory-bound
-    """
-    gpu_specs = get_gpu_specs_from_device(device_name)
-    dtype_size = get_dtype_size(dtype)
-    kv_dtype_size = get_dtype_size(kv_cache_dtype)
-
-    # Hardware intensity
-    if "fp8" in dtype.lower():
-        hardware_tflops = gpu_specs["fp8_tflops"]
-    else:
-        hardware_tflops = gpu_specs["float16_tflops"]
-
-    hardware_intensity = (hardware_tflops * 1e12) / (gpu_specs["mem_bw_gbs"] * 1e9)
-
-    # GQA Attention FLOPs
-    total_flops = 4 * b * num_heads * s * s * d
-
-    # GQA Attention Memory Movement
-    memory_bytes = (
-        dtype_size * b * s * num_heads * d  # Q read (all query heads)
-        + kv_dtype_size * b * s * num_key_value_heads * d * 2  # K read and write (KV heads)
-        + kv_dtype_size * b * s * num_key_value_heads * d * 2  # V read and write (KV heads)
-        + dtype_size * b * s * num_heads * d  # Output write
-    )
-
-    arithmetic_intensity = total_flops / memory_bytes
-
-    return arithmetic_intensity > hardware_intensity
-
-
-def is_generation_attention_compute_bound():
-    """Generation (decode) attention is ALWAYS memory-bound"""
-    return False


 def run_attention_torch(
@@ -303,14 +257,24 @@ def run_attention_torch(

     # Determine if compute-bound
     if is_context_phase:
-        compute_bound = is_context_attention_compute_bound(
-            batch_size, input_len, num_heads, num_key_value_heads, head_dim, dtype_str, kv_cache_dtype_str, device_name
+        compute_bound = is_context_attention_compute_bound_collector(
+            batch_size,
+            input_len,
+            num_heads,
+            num_key_value_heads,
+            head_dim,
+            dtype_str,
+            kv_cache_dtype_str,
+            use_fp8_kv_cache,
+            use_fp8_context_fmha,
+            device_name,
+            attention_window_size,
         )
         isl = input_len
         step = 0
         op_name = "context_attention"
     else:
-        compute_bound = is_generation_attention_compute_bound()
+        compute_bound = is_generation_attention_compute_bound_collector()
         isl = 1
         step = input_len
         op_name = "generation_attention"
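
As a sanity check on the heuristic being replaced, a back-of-the-envelope roofline calculation in the spirit of the deleted is_context_attention_compute_bound, using approximate H100 SXM numbers (~989 dense FP16 TFLOPS, ~3.35 TB/s HBM bandwidth; these are assumptions, not values from the repo's spec files):

    # Roofline sketch of prefill GQA attention (approximate hardware numbers).
    b, s, num_heads, num_kv_heads, d = 1, 4096, 32, 8, 128
    dtype_size = kv_dtype_size = 2  # fp16 bytes per element

    flops = 4 * b * num_heads * s * s * d                    # ~2.7e11
    bytes_moved = (
        dtype_size * b * s * num_heads * d                   # Q read
        + kv_dtype_size * b * s * num_kv_heads * d * 2       # K read + write
        + kv_dtype_size * b * s * num_kv_heads * d * 2       # V read + write
        + dtype_size * b * s * num_heads * d                 # output write
    )                                                        # ~1.0e8
    arithmetic_intensity = flops / bytes_moved               # ~2.7e3 FLOPs/byte
    hardware_intensity = 989e12 / 3.35e12                    # ~295 FLOPs/byte

    print(arithmetic_intensity > hardware_intensity)         # True: prefill is compute-bound here

For decode, the query length is 1 while the whole KV cache is streamed, so the FLOPs-per-byte ratio collapses to order 1; that is why is_generation_attention_compute_bound_collector simply returns False.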

collector/trtllm/collect_gemm.py

Lines changed: 2 additions & 31 deletions

@@ -12,6 +12,7 @@
     get_dtype_size,
     get_gpu_specs_from_device,
     get_sm_version,
+    is_gemm_compute_bound_collector,
     log_perf,
     measure_kernel_power,
 )
@@ -86,36 +87,6 @@ def get_gemm_test_cases():
     return test_cases


-def is_gemm_compute_bound(m, n, k, dtype, device_name):
-    """
-    Determine if a GEMM operation is compute-bound.
-
-    Args:
-        m, n, k: GEMM dimensions (C = A @ B, A is m x k, B is k x n)
-        dtype: Data type (e.g., 'float16', 'fp8')
-        device_name: GPU device name
-
-    Returns:
-        True if compute-bound, False if memory-bound
-    """
-    gpu_specs = get_gpu_specs_from_device(device_name)
-    dtype_size = get_dtype_size(dtype)
-
-    # Hardware intensity (FLOPs per byte)
-    if "fp8" in dtype.lower():
-        hardware_tflops = gpu_specs["fp8_tflops"]
-    else:
-        hardware_tflops = gpu_specs["float16_tflops"]
-
-    hardware_intensity = (hardware_tflops * 1e12) / (gpu_specs["mem_bw_gbs"] * 1e9)
-
-    # GEMM arithmetic intensity
-    total_flops = 2 * m * n * k
-    memory_bytes = dtype_size * (m * k + k * n + m * n)
-    arithmetic_intensity = total_flops / memory_bytes
-
-    # Compute-bound if arithmetic intensity > hardware intensity
-    return arithmetic_intensity > hardware_intensity


 def run_gemm(
@@ -222,7 +193,7 @@ def run_gemm(
         op.forward(x)

     # Determine if compute-bound
-    compute_bound = is_gemm_compute_bound(m, n, k, gemm_type, device_name)
+    compute_bound = is_gemm_compute_bound_collector(m, n, k, gemm_type, device_name)

     # Benchmarking
     if measure_power and power_monitor is not None and not compute_bound:
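
The same kind of roofline reasoning, in the spirit of the deleted is_gemm_compute_bound, shows why the check matters across GEMM shapes (again with approximate H100 SXM fp16 numbers as assumptions):

    # GEMM roofline sketch: arithmetic intensity vs. an approximate hardware ridge point.
    def gemm_arithmetic_intensity(m, n, k, dtype_size=2):
        flops = 2 * m * n * k
        bytes_moved = dtype_size * (m * k + k * n + m * n)
        return flops / bytes_moved

    hardware_intensity = 989e12 / 3.35e12               # ~295 FLOPs/byte

    print(gemm_arithmetic_intensity(4096, 4096, 4096))  # ~1365 -> compute-bound
    print(gemm_arithmetic_intensity(1, 4096, 4096))     # ~1    -> memory-bound (GEMV-like)

The SOL-based wrapper should reach the same verdicts, but from the same system specs the rest of aiconfigurator already uses rather than a second copy of the roofline math, which is the point of re-using the perf_database query here.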
