@@ -74,7 +74,7 @@ def import_trtllm():
 
 
 def benchmark_trtllm_allreduce(
-    dtype: str, test_range: str, world_size: int, rank: int, use_slurm: bool, perf_filename: str
+    dtype: str, test_range: str, world_size: int, rank: int, use_slurm: bool, perf_filename: str, measure_power: bool = False
 ):
     """Benchmark TensorRT-LLM AllReduce implementation"""
     trtllm_mods = import_trtllm()
@@ -92,6 +92,20 @@ def benchmark_trtllm_allreduce(
     trtllm_mods["cudart"].cudaSetDevice(local_rank)
     mapping = trtllm_mods["Mapping"](world_size=world_size, rank=rank, gpus_per_node=gpus_per_node, tp_size=world_size)
 
+    # Initialize NVML power monitor if power measurement is enabled
+    power_monitor = None
+    if measure_power:
+        try:
+            from nvml_power_monitor import NVMLPowerMonitor
+
+            power_monitor = NVMLPowerMonitor(gpu_indices=[local_rank])
+            if rank == 0:
+                print("NVML power monitoring enabled on all ranks")
+        except Exception as e:
+            if rank == 0:
+                print(f"Warning: Failed to initialize NVML power monitor: {e}")
+            raise  # Fail if power measurement requested but NVML unavailable
+
     # Parse test range
     min_size, max_size, ratio = [int(i) for i in test_range.split(",")]
     torch_dtype = tllm._utils.str_dtype_to_torch(dtype)
@@ -140,30 +154,61 @@ def benchmark_trtllm_allreduce(
         g.replay()
         torch.cuda.synchronize()
 
-        start_event.record()
-        for i in range(num_runs):
-            g.replay()
-        end_event.record()
-        torch.cuda.synchronize()
+        # Power measurement for AllReduce operations
+        if measure_power and power_monitor is not None:
+            power_monitor.begin_window("allreduce", sync_execution=True)
+            start_event.record()
+            for i in range(num_runs):
+                g.replay()
+            end_event.record()
+            measurement = power_monitor.end_window("allreduce", sync_execution=True)
+            torch.cuda.synchronize()
+            # Calculate average power across this rank
+            power_rank = measurement.gpu_energy[local_rank] / measurement.time
+        else:
+            start_event.record()
+            for i in range(num_runs):
+                g.replay()
+            end_event.record()
+            torch.cuda.synchronize()
+            power_rank = None
 
         latency = start_event.elapsed_time(end_event) / num_runs / repeat_n
 
+        # Collect power data from all ranks if measuring
+        if measure_power and power_rank is not None:
+            # Gather per-rank power onto every rank, then average on rank 0
+            power_list = [None] * world_size  # all ranks must pass a full-sized output list
+            torch.distributed.all_gather_object(power_list, power_rank)
+            if rank == 0:
+                # Average power across all GPUs
+                avg_power = sum(power_list) / len(power_list)
+            else:
+                avg_power = None
+        else:
+            avg_power = None
+
         if rank == 0 and local_rank == 0:
-            print(f"[TensorRT-LLM] Size: {size}, Latency: {latency:.4f} ms")
+            print(f"[TensorRT-LLM] Size: {size}, Latency: {latency:.4f} ms" + (f", Power: {avg_power:.2f} W" if avg_power is not None else ""))
 
         # Get TensorRT-LLM version
         trtllm_version = tllm.__version__ if hasattr(tllm, "__version__") else "unknown"
 
+        # Build result item
+        item = {
+            "allreduce_dtype": dtype,
+            "num_gpus": world_size,
+            "message_size": size,
+            "latency": latency,
+            "implementation": "trtllm",
+        }
+
+        if avg_power is not None:
+            item["power"] = avg_power
+            item["compute_bound"] = 0  # Communication is always memory/bandwidth-bound
+
         log_perf(
-            item_list=[
-                {
-                    "allreduce_dtype": dtype,
-                    "num_gpus": world_size,
-                    "message_size": size,
-                    "latency": latency,
-                    "implementation": "trtllm",
-                }
-            ],
+            item_list=[item],
             framework="TRTLLM",
             version=trtllm_version,
             device_name=torch.cuda.get_device_name(),
@@ -244,11 +289,25 @@ def setup_vllm_distributed(world_size, rank, use_slurm):
 
 
 def benchmark_vllm_allreduce(
-    dtype: str, test_range: str, world_size: int, rank: int, use_slurm: bool, perf_filename: str
+    dtype: str, test_range: str, world_size: int, rank: int, use_slurm: bool, perf_filename: str, measure_power: bool = False
 ):
     """Benchmark vLLM custom AllReduce backend"""
     vllm_mods, local_rank = setup_vllm_distributed(world_size, rank, use_slurm)
 
+    # Initialize NVML power monitor if power measurement is enabled
+    power_monitor = None
+    if measure_power:
+        try:
+            from nvml_power_monitor import NVMLPowerMonitor
+
+            power_monitor = NVMLPowerMonitor(gpu_indices=[local_rank])
+            if rank == 0:
+                print("NVML power monitoring enabled on all ranks")
+        except Exception as e:
+            if rank == 0:
+                print(f"Warning: Failed to initialize NVML power monitor: {e}")
+            raise  # Fail if power measurement requested but NVML unavailable
+
     # Parse test range
     min_size, max_size, ratio = [int(i) for i in test_range.split(",")]
 
@@ -303,11 +362,23 @@ def benchmark_vllm_allreduce(
             graph.replay()
             torch.cuda.synchronize()
 
-            start_event.record()
-            for i in range(num_runs):
-                graph.replay()
-            end_event.record()
-            torch.cuda.synchronize()
+            # Power measurement for graph mode
+            if measure_power and power_monitor is not None:
+                power_monitor.begin_window("allreduce_graph", sync_execution=True)
+                start_event.record()
+                for i in range(num_runs):
+                    graph.replay()
+                end_event.record()
+                measurement = power_monitor.end_window("allreduce_graph", sync_execution=True)
+                torch.cuda.synchronize()
+                power_rank = measurement.gpu_energy[local_rank] / measurement.time
+            else:
+                start_event.record()
+                for i in range(num_runs):
+                    graph.replay()
+                end_event.record()
+                torch.cuda.synchronize()
+                power_rank = None
 
         else:
             # Eager mode
@@ -324,17 +395,43 @@ def benchmark_vllm_allreduce(
             start_event = torch.cuda.Event(enable_timing=True)
             end_event = torch.cuda.Event(enable_timing=True)
 
-            start_event.record()
-            for _ in range(num_runs):
-                for _ in range(repeat_n):
-                    _ = vllm_mods["tensor_model_parallel_all_reduce"](input_tensor.clone())
-            end_event.record()
-            torch.cuda.synchronize()
+            # Power measurement for eager mode
+            if measure_power and power_monitor is not None:
+                power_monitor.begin_window("allreduce_eager", sync_execution=True)
+                start_event.record()
+                for _ in range(num_runs):
+                    for _ in range(repeat_n):
+                        _ = vllm_mods["tensor_model_parallel_all_reduce"](input_tensor.clone())
+                end_event.record()
+                measurement = power_monitor.end_window("allreduce_eager", sync_execution=True)
+                torch.cuda.synchronize()
+                power_rank = measurement.gpu_energy[local_rank] / measurement.time
+            else:
+                start_event.record()
+                for _ in range(num_runs):
+                    for _ in range(repeat_n):
+                        _ = vllm_mods["tensor_model_parallel_all_reduce"](input_tensor.clone())
+                end_event.record()
+                torch.cuda.synchronize()
+                power_rank = None
 
         latency = start_event.elapsed_time(end_event) / num_runs / repeat_n
 
+        # Collect power data from all ranks if measuring
+        if measure_power and power_rank is not None:
+            # Gather per-rank power onto every rank, then average on rank 0
+            power_list = [None] * world_size  # all ranks must pass a full-sized output list
+            torch.distributed.all_gather_object(power_list, power_rank)
+            if rank == 0:
+                # Average power across all GPUs
+                avg_power = sum(power_list) / len(power_list)
+            else:
+                avg_power = None
+        else:
+            avg_power = None
+
         if rank == 0:
-            print(f"[vLLM-{mode_str}] Size: {size}, Latency: {latency:.4f} ms")
+            print(f"[vLLM-{mode_str}] Size: {size}, Latency: {latency:.4f} ms" + (f", Power: {avg_power:.2f} W" if avg_power is not None else ""))
 
         # Get vLLM version
         try:
@@ -344,16 +441,21 @@ def benchmark_vllm_allreduce(
         except:
             vllm_version = "unknown"
 
+        # Build result item
+        item = {
+            "allreduce_dtype": dtype,
+            "num_gpus": world_size,
+            "message_size": size,
+            "latency": latency,
+            "backend": f"vllm_{mode_str}",
+        }
+
+        if avg_power is not None:
+            item["power"] = avg_power
+            item["compute_bound"] = 0  # Communication is always memory/bandwidth-bound
+
         log_perf(
-            item_list=[
-                {
-                    "allreduce_dtype": dtype,
-                    "num_gpus": world_size,
-                    "message_size": size,
-                    "latency": latency,
-                    "backend": f"vllm_{mode_str}",
-                }
-            ],
+            item_list=[item],
             framework="vLLM",
             version=vllm_version,
             device_name=torch.cuda.get_device_name(),
@@ -376,9 +478,10 @@ def allreduce_benchmark(
     perf_filename: str = "custom_allreduce_perf.txt",
     world_size: Optional[int] = None,
     rank: Optional[int] = None,
+    measure_power: bool = False,
 ):
     """
-    CUDA Graph based AllReduce benchmark method supporting multiple backends
+    CUDA Graph-based AllReduce benchmark supporting multiple backends, with optional power measurement
     """
     # Setup distributed environment based on backend
     if backend == "trtllm":
@@ -396,7 +499,7 @@ def allreduce_benchmark(
         if world_size == 1:
             raise RuntimeError("Benchmark must run with world_size > 1")
 
-        benchmark_trtllm_allreduce(dtype, test_range, world_size, rank, use_slurm, perf_filename)
+        benchmark_trtllm_allreduce(dtype, test_range, world_size, rank, use_slurm, perf_filename, measure_power)
 
     elif backend == "vllm":
         if use_slurm:
@@ -418,7 +521,7 @@ def allreduce_benchmark(
         if world_size == 1:
             raise RuntimeError("Benchmark must run with world_size > 1")
 
-        benchmark_vllm_allreduce(dtype, test_range, world_size, rank, use_slurm, perf_filename)
+        benchmark_vllm_allreduce(dtype, test_range, world_size, rank, use_slurm, perf_filename, measure_power)
     else:
         raise ValueError(f"Unknown backend: {backend}")
 
@@ -445,9 +548,22 @@ def allreduce_benchmark(
     # Additional arguments for vLLM when not using MPI/SLURM
     parser.add_argument("--world-size", default=8, type=int, help="World size for distributed setup (vLLM)")
     parser.add_argument("--rank", default=0, type=int, help="Rank for distributed setup (vLLM)")
+    parser.add_argument(
+        "--measure_power",
+        action="store_true",
+        default=False,
+        help="Enable power measurement during AllReduce benchmark",
+    )
 
     args = parser.parse_args()
 
     allreduce_benchmark(
-        args.backend, args.dtype, args.range, args.use_slurm, args.perf_filename, args.world_size, args.rank
+        args.backend,
+        args.dtype,
+        args.range,
+        args.use_slurm,
+        args.perf_filename,
+        args.world_size,
+        args.rank,
+        args.measure_power,
     )
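
The nvml_power_monitor module imported in this diff is not shown here. For readers, below is a minimal, hypothetical sketch of the interface the benchmark assumes: begin_window/end_window bracket a timed region and return per-GPU energy in joules plus elapsed time in seconds, so that measurement.gpu_energy[idx] / measurement.time yields average watts. It is built on pynvml's total-energy counter and only illustrates the assumed contract; it is not the module this PR actually uses.

import time
from types import SimpleNamespace

import pynvml
import torch


class NVMLPowerMonitor:
    """Sketch of the energy-window interface assumed by the benchmark above."""

    def __init__(self, gpu_indices):
        pynvml.nvmlInit()
        self._handles = {i: pynvml.nvmlDeviceGetHandleByIndex(i) for i in gpu_indices}
        self._windows = {}

    def _energy_joules(self):
        # nvmlDeviceGetTotalEnergyConsumption reports millijoules since driver load
        return {i: pynvml.nvmlDeviceGetTotalEnergyConsumption(h) / 1e3
                for i, h in self._handles.items()}

    def begin_window(self, name, sync_execution=False):
        if sync_execution:
            torch.cuda.synchronize()
        self._windows[name] = (time.monotonic(), self._energy_joules())

    def end_window(self, name, sync_execution=False):
        if sync_execution:
            torch.cuda.synchronize()
        start_time, start_energy = self._windows.pop(name)
        end_energy = self._energy_joules()
        return SimpleNamespace(
            time=time.monotonic() - start_time,                 # seconds
            gpu_energy={i: end_energy[i] - start_energy[i]      # joules per GPU
                        for i in end_energy},
        )

Note that the total-energy counter requires Volta-or-newer GPUs; on older hardware a sampling loop over nvmlDeviceGetPowerUsage would be needed instead.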
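
A note on the power-gathering path: torch.distributed.all_gather_object writes one entry per rank into the output list on every rank, so every rank must pass a list of length world_size; passing an empty list on non-zero ranks fails. The gather in the diff therefore allocates the full-sized list on all ranks and only averages on rank 0. A small self-contained demonstration, using the gloo backend, two local processes, and a hypothetical rendezvous port (not part of the benchmark):

import torch.distributed as dist
import torch.multiprocessing as mp


def _worker(rank, world_size):
    dist.init_process_group(
        "gloo", init_method="tcp://127.0.0.1:29555", rank=rank, world_size=world_size
    )
    power_rank = 100.0 + rank             # stand-in for this rank's measured power (W)
    power_list = [None] * world_size      # full-sized output list on EVERY rank
    dist.all_gather_object(power_list, power_rank)
    if rank == 0:
        print(f"avg power: {sum(power_list) / len(power_list):.2f} W")
    dist.destroy_process_group()


if __name__ == "__main__":
    mp.spawn(_worker, args=(2,), nprocs=2)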