Commit 844cc25

Patch: Migrate NVML from Zeus and fix the collect_comm.sh bug
Signed-off-by: Kai Ma <[email protected]>
1 parent d7c5fb5 commit 844cc25

8 files changed: +192 additions, -76 deletions

collector/README.md

Lines changed: 1 addition & 1 deletion

@@ -135,7 +135,7 @@ NVML power/energy counters only update once 100 ms, which is significantly longe
 
 ### Requirements
 
-- Zeus (`pip install zeus`) for measurement & GPU power limit control
+- pynvml (`pip install nvidia-ml-py`, already included in aiconfigurator dependencies) for measurement & GPU power limit control
 - Setting power limits requires root access. Easiest way is to run in a Docker container run with `--cap-add SYS_ADMIN`.
 
 ### Relevant Columns in Result CSV

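As a point of reference for the root-access note above, a hypothetical container launch that grants the capability needed for setting power limits might look like the line below; the image name is a placeholder and not part of this commit:

    docker run --rm --gpus all --cap-add SYS_ADMIN -it <collector-image> bash
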
collector/collect.py

Lines changed: 10 additions & 10 deletions

@@ -122,17 +122,17 @@ def worker(
     torch.cuda.set_device(device_id)
     worker_logger.info(f"Worker {device_id} initialized for {module_name}")
 
-    # Initialize Zeus monitor if power measurement is enabled
-    zeus_monitor = None
+    # Initialize NVML power monitor if power measurement is enabled
+    power_monitor = None
     if measure_power:
         try:
-            from zeus.monitor import ZeusMonitor
+            from nvml_power_monitor import NVMLPowerMonitor
 
-            zeus_monitor = ZeusMonitor(gpu_indices=[device_id])
-            worker_logger.info(f"Zeus power monitoring enabled on device {device_id}")
+            power_monitor = NVMLPowerMonitor(gpu_indices=[device_id])
+            worker_logger.info(f"NVML power monitoring enabled on device {device_id}")
         except Exception:
-            worker_logger.exception("Failed to initialize Zeus")
-            raise  # Fail if power measurement requested but Zeus unavailable
+            worker_logger.exception("Failed to initialize NVML power monitor")
+            raise  # Fail if power measurement requested but NVML unavailable
 
     # Process tasks
     while True:
@@ -157,9 +157,9 @@ def worker(
         # Set power limit if specified
         if power_limit is not None:
             try:
-                from zeus.device import get_gpus
+                from nvml_power_monitor import set_power_management_limit
 
-                get_gpus().set_power_management_limit(device_id, power_limit)
+                set_power_management_limit(device_id, power_limit)
                 worker_logger.debug(f"Set power limit to {power_limit}W on device {device_id}")
             except Exception as e:
                 worker_logger.warning(f"Failed to set power limit: {e}")
@@ -169,7 +169,7 @@ def worker(
             func(
                 *task,
                 device,
-                zeus_monitor=zeus_monitor,
+                power_monitor=power_monitor,
                 power_limit=power_limit,
                 measure_power=measure_power,
                 kernel_power_measurement_duration=kernel_power_measurement_duration,

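The new nvml_power_monitor helper module is among the 8 changed files but is not shown in this excerpt. The sketch below is a hypothetical reconstruction of the interface the callers above rely on (an NVMLPowerMonitor whose begin_window/end_window return a measurement with gpu_energy in joules and time in seconds, plus a set_power_management_limit helper), built on pynvml calls from nvidia-ml-py; the module actually added by the commit may differ.

    # Hypothetical sketch of collector/nvml_power_monitor.py -- not the committed code.
    import time
    from dataclasses import dataclass, field

    import pynvml  # provided by the nvidia-ml-py package


    @dataclass
    class Measurement:
        time: float                                     # window duration in seconds
        gpu_energy: dict = field(default_factory=dict)  # gpu index -> energy in joules


    class NVMLPowerMonitor:
        def __init__(self, gpu_indices):
            pynvml.nvmlInit()
            self._handles = {i: pynvml.nvmlDeviceGetHandleByIndex(i) for i in gpu_indices}
            self._windows = {}

        def _energy_joules(self, handle):
            # NVML reports cumulative energy in millijoules since the driver was loaded.
            return pynvml.nvmlDeviceGetTotalEnergyConsumption(handle) / 1000.0

        def begin_window(self, name, sync_execution=False):
            if sync_execution:
                import torch
                torch.cuda.synchronize()
            self._windows[name] = (
                time.monotonic(),
                {i: self._energy_joules(h) for i, h in self._handles.items()},
            )

        def end_window(self, name, sync_execution=False):
            if sync_execution:
                import torch
                torch.cuda.synchronize()
            start_t, start_e = self._windows.pop(name)
            elapsed = time.monotonic() - start_t
            energy = {i: self._energy_joules(h) - start_e[i] for i, h in self._handles.items()}
            return Measurement(time=elapsed, gpu_energy=energy)


    def set_power_management_limit(device_index, limit_watts):
        # NVML expects milliwatts; changing the limit needs root / CAP_SYS_ADMIN.
        pynvml.nvmlInit()
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
        pynvml.nvmlDeviceSetPowerManagementLimit(handle, int(limit_watts * 1000))

With an interface of this shape, the callers' `measurement.gpu_energy[idx] / measurement.time` expression yields average power in watts over the measured window, which is how the power column is computed in the diffs here.
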
collector/collect_all_reduce.py

Lines changed: 158 additions & 42 deletions

@@ -74,7 +74,7 @@ def import_trtllm():
 
 
 def benchmark_trtllm_allreduce(
-    dtype: str, test_range: str, world_size: int, rank: int, use_slurm: bool, perf_filename: str
+    dtype: str, test_range: str, world_size: int, rank: int, use_slurm: bool, perf_filename: str, measure_power: bool = False
 ):
     """Benchmark TensorRT-LLM AllReduce implementation"""
     trtllm_mods = import_trtllm()
@@ -92,6 +92,20 @@ def benchmark_trtllm_allreduce(
     trtllm_mods["cudart"].cudaSetDevice(local_rank)
     mapping = trtllm_mods["Mapping"](world_size=world_size, rank=rank, gpus_per_node=gpus_per_node, tp_size=world_size)
 
+    # Initialize NVML power monitor if power measurement is enabled
+    power_monitor = None
+    if measure_power:
+        try:
+            from nvml_power_monitor import NVMLPowerMonitor
+
+            power_monitor = NVMLPowerMonitor(gpu_indices=[local_rank])
+            if rank == 0:
+                print(f"NVML power monitoring enabled on all ranks")
+        except Exception as e:
+            if rank == 0:
+                print(f"Warning: Failed to initialize NVML power monitor: {e}")
+            raise  # Fail if power measurement requested but NVML unavailable
+
     # Parse test range
     min_size, max_size, ratio = [int(i) for i in test_range.split(",")]
     torch_dtype = tllm._utils.str_dtype_to_torch(dtype)
@@ -140,30 +154,61 @@ def benchmark_trtllm_allreduce(
         g.replay()
         torch.cuda.synchronize()
 
-        start_event.record()
-        for i in range(num_runs):
-            g.replay()
-        end_event.record()
-        torch.cuda.synchronize()
+        # Power measurement for AllReduce operations
+        if measure_power and power_monitor is not None:
+            power_monitor.begin_window("allreduce", sync_execution=True)
+            start_event.record()
+            for i in range(num_runs):
+                g.replay()
+            end_event.record()
+            measurement = power_monitor.end_window("allreduce", sync_execution=True)
+            torch.cuda.synchronize()
+            # Calculate average power across this rank
+            power_rank = measurement.gpu_energy[local_rank] / measurement.time
+        else:
+            start_event.record()
+            for i in range(num_runs):
+                g.replay()
+            end_event.record()
+            torch.cuda.synchronize()
+            power_rank = None
 
         latency = start_event.elapsed_time(end_event) / num_runs / repeat_n
 
+        # Collect power data from all ranks if measuring
+        if measure_power and power_rank is not None:
+            # Gather power from all ranks to rank 0
+            power_list = [None] * world_size if rank == 0 else None
+            torch.distributed.all_gather_object(power_list if rank == 0 else [], power_rank)
+            if rank == 0:
+                # Average power across all GPUs
+                avg_power = sum(power_list) / len(power_list)
+            else:
+                avg_power = None
+        else:
+            avg_power = None
+
         if rank == 0 and local_rank == 0:
-            print(f"[TensorRT-LLM] Size: {size}, Latency: {latency:.4f} ms")
+            print(f"[TensorRT-LLM] Size: {size}, Latency: {latency:.4f} ms" + (f", Power: {avg_power:.2f} W" if avg_power is not None else ""))
 
         # Get TensorRT-LLM version
         trtllm_version = tllm.__version__ if hasattr(tllm, "__version__") else "unknown"
 
+        # Build result item
+        item = {
+            "allreduce_dtype": dtype,
+            "num_gpus": world_size,
+            "message_size": size,
+            "latency": latency,
+            "implementation": "trtllm",
+        }
+
+        if avg_power is not None:
+            item["power"] = avg_power
+            item["compute_bound"] = 0  # Communication is always memory/bandwidth-bound
+
         log_perf(
-            item_list=[
-                {
-                    "allreduce_dtype": dtype,
-                    "num_gpus": world_size,
-                    "message_size": size,
-                    "latency": latency,
-                    "implementation": "trtllm",
-                }
-            ],
+            item_list=[item],
             framework="TRTLLM",
             version=trtllm_version,
             device_name=torch.cuda.get_device_name(),
@@ -244,11 +289,25 @@ def setup_vllm_distributed(world_size, rank, use_slurm):
 
 
 def benchmark_vllm_allreduce(
-    dtype: str, test_range: str, world_size: int, rank: int, use_slurm: bool, perf_filename: str
+    dtype: str, test_range: str, world_size: int, rank: int, use_slurm: bool, perf_filename: str, measure_power: bool = False
 ):
     """Benchmark vLLM custom AllReduce backend"""
     vllm_mods, local_rank = setup_vllm_distributed(world_size, rank, use_slurm)
 
+    # Initialize NVML power monitor if power measurement is enabled
+    power_monitor = None
+    if measure_power:
+        try:
+            from nvml_power_monitor import NVMLPowerMonitor
+
+            power_monitor = NVMLPowerMonitor(gpu_indices=[local_rank])
+            if rank == 0:
+                print(f"NVML power monitoring enabled on all ranks")
+        except Exception as e:
+            if rank == 0:
+                print(f"Warning: Failed to initialize NVML power monitor: {e}")
+            raise  # Fail if power measurement requested but NVML unavailable
+
     # Parse test range
     min_size, max_size, ratio = [int(i) for i in test_range.split(",")]
 
@@ -303,11 +362,23 @@ def benchmark_vllm_allreduce(
             graph.replay()
             torch.cuda.synchronize()
 
-            start_event.record()
-            for i in range(num_runs):
-                graph.replay()
-            end_event.record()
-            torch.cuda.synchronize()
+            # Power measurement for graph mode
+            if measure_power and power_monitor is not None:
+                power_monitor.begin_window("allreduce_graph", sync_execution=True)
+                start_event.record()
+                for i in range(num_runs):
+                    graph.replay()
+                end_event.record()
+                measurement = power_monitor.end_window("allreduce_graph", sync_execution=True)
+                torch.cuda.synchronize()
+                power_rank = measurement.gpu_energy[local_rank] / measurement.time
+            else:
+                start_event.record()
+                for i in range(num_runs):
+                    graph.replay()
+                end_event.record()
+                torch.cuda.synchronize()
+                power_rank = None
 
         else:
             # Eager mode
@@ -324,17 +395,43 @@ def benchmark_vllm_allreduce(
             start_event = torch.cuda.Event(enable_timing=True)
            end_event = torch.cuda.Event(enable_timing=True)
 
-            start_event.record()
-            for _ in range(num_runs):
-                for _ in range(repeat_n):
-                    _ = vllm_mods["tensor_model_parallel_all_reduce"](input_tensor.clone())
-            end_event.record()
-            torch.cuda.synchronize()
+            # Power measurement for eager mode
+            if measure_power and power_monitor is not None:
+                power_monitor.begin_window("allreduce_eager", sync_execution=True)
+                start_event.record()
+                for _ in range(num_runs):
+                    for _ in range(repeat_n):
+                        _ = vllm_mods["tensor_model_parallel_all_reduce"](input_tensor.clone())
+                end_event.record()
+                measurement = power_monitor.end_window("allreduce_eager", sync_execution=True)
+                torch.cuda.synchronize()
+                power_rank = measurement.gpu_energy[local_rank] / measurement.time
+            else:
+                start_event.record()
+                for _ in range(num_runs):
+                    for _ in range(repeat_n):
+                        _ = vllm_mods["tensor_model_parallel_all_reduce"](input_tensor.clone())
+                end_event.record()
+                torch.cuda.synchronize()
+                power_rank = None
 
         latency = start_event.elapsed_time(end_event) / num_runs / repeat_n
 
+        # Collect power data from all ranks if measuring
+        if measure_power and power_rank is not None:
+            # Gather power from all ranks to rank 0
+            power_list = [None] * world_size if rank == 0 else None
+            torch.distributed.all_gather_object(power_list if rank == 0 else [], power_rank)
+            if rank == 0:
+                # Average power across all GPUs
+                avg_power = sum(power_list) / len(power_list)
+            else:
+                avg_power = None
+        else:
+            avg_power = None
+
         if rank == 0:
-            print(f"[vLLM-{mode_str}] Size: {size}, Latency: {latency:.4f} ms")
+            print(f"[vLLM-{mode_str}] Size: {size}, Latency: {latency:.4f} ms" + (f", Power: {avg_power:.2f} W" if avg_power is not None else ""))
 
         # Get vLLM version
         try:
@@ -344,16 +441,21 @@ def benchmark_vllm_allreduce(
         except:
             vllm_version = "unknown"
 
+        # Build result item
+        item = {
+            "allreduce_dtype": dtype,
+            "num_gpus": world_size,
+            "message_size": size,
+            "latency": latency,
+            "backend": f"vllm_{mode_str}",
+        }
+
+        if avg_power is not None:
+            item["power"] = avg_power
+            item["compute_bound"] = 0  # Communication is always memory/bandwidth-bound
+
         log_perf(
-            item_list=[
-                {
-                    "allreduce_dtype": dtype,
-                    "num_gpus": world_size,
-                    "message_size": size,
-                    "latency": latency,
-                    "backend": f"vllm_{mode_str}",
-                }
-            ],
+            item_list=[item],
             framework="vLLM",
             version=vllm_version,
             device_name=torch.cuda.get_device_name(),
@@ -376,9 +478,10 @@ def allreduce_benchmark(
     perf_filename: str = "custom_allreduce_perf.txt",
     world_size: Optional[int] = None,
     rank: Optional[int] = None,
+    measure_power: bool = False,
 ):
     """
-    CUDA Graph based AllReduce benchmark method supporting multiple backends
+    CUDA Graph based AllReduce benchmark method supporting multiple backends with optional power measurement
     """
     # Setup distributed environment based on backend
     if backend == "trtllm":
@@ -396,7 +499,7 @@
         if world_size == 1:
             raise RuntimeError("Benchmark must run with world_size > 1")
 
-        benchmark_trtllm_allreduce(dtype, test_range, world_size, rank, use_slurm, perf_filename)
+        benchmark_trtllm_allreduce(dtype, test_range, world_size, rank, use_slurm, perf_filename, measure_power)
 
     elif backend == "vllm":
         if use_slurm:
@@ -418,7 +521,7 @@
         if world_size == 1:
             raise RuntimeError("Benchmark must run with world_size > 1")
 
-        benchmark_vllm_allreduce(dtype, test_range, world_size, rank, use_slurm, perf_filename)
+        benchmark_vllm_allreduce(dtype, test_range, world_size, rank, use_slurm, perf_filename, measure_power)
     else:
         raise ValueError(f"Unknown backend: {backend}")
 
@@ -445,9 +548,22 @@ def allreduce_benchmark(
     # Additional arguments for vLLM when not using MPI/SLURM
     parser.add_argument("--world-size", default=8, type=int, help="World size for distributed setup (vLLM)")
     parser.add_argument("--rank", default=0, type=int, help="Rank for distributed setup (vLLM)")
+    parser.add_argument(
+        "--measure_power",
+        action="store_true",
+        default=False,
+        help="Enable power measurement during AllReduce benchmark",
+    )
 
     args = parser.parse_args()
 
     allreduce_benchmark(
-        args.backend, args.dtype, args.range, args.use_slurm, args.perf_filename, args.world_size, args.rank
+        args.backend,
+        args.dtype,
+        args.range,
+        args.use_slurm,
+        args.perf_filename,
+        args.world_size,
+        args.rank,
+        args.measure_power,
     )

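For reference, the updated script can also be launched by hand with power measurement turned on; a hypothetical invocation on an 8-GPU node, mirroring the launchers used in collect_comm.sh below, would be:

    # TensorRT-LLM backend via MPI
    mpirun -n 8 --allow-run-as-root python3 collect_all_reduce.py \
        --perf-filename "custom_allreduce_perf.txt" --measure_power

    # vLLM backend via torchrun
    torchrun --nproc_per_node=8 collect_all_reduce.py --backend vllm \
        --perf-filename "custom_allreduce_perf.txt" --measure_power
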
collector/collect_comm.sh

Lines changed: 2 additions & 2 deletions

@@ -63,14 +63,14 @@ if [[ "$all_reduce_backend" == "trtllm" ]]; then
     for n in "${num_gpus_allreduce[@]}"; do
         echo "Running TRTLLM AllReduce benchmark with $n GPUs using CUDA Graph method"
         mpirun -n "$n" --allow-run-as-root python3 collect_all_reduce.py \
-            --perf-filename "custom_allreduce_perf.txt"
+            --perf-filename "custom_allreduce_perf.txt" ${measure_power:+--measure_power}
     done
 elif [[ "$all_reduce_backend" == "vllm" ]]; then
     # VLLM allreduce implementation
     for n in "${num_gpus_allreduce[@]}"; do
         echo "Running VLLM AllReduce benchmark with $n GPUs"
         torchrun --nproc_per_node=$n collect_all_reduce.py --backend vllm \
-            --perf-filename "custom_allreduce_perf.txt" ${measure_power:+--measure_power}
+            --perf-filename "custom_allreduce_perf.txt" ${measure_power:+--measure_power}
     done
 fi
 

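The collect_comm.sh fix shown here comes down to the `${measure_power:+--measure_power}` expansion, which forwards the flag only when the script's `measure_power` variable (assumed to be set elsewhere in the script, e.g. from a CLI option) is non-empty. A minimal illustration of that expansion:

    # ${var:+word} expands to "word" only if var is set and non-empty.
    measure_power=1
    echo python3 collect_all_reduce.py ${measure_power:+--measure_power}
    # -> python3 collect_all_reduce.py --measure_power

    measure_power=""
    echo python3 collect_all_reduce.py ${measure_power:+--measure_power}
    # -> python3 collect_all_reduce.py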