From 8b8a685308ead7e0be833eaf7773a1d48cd827f1 Mon Sep 17 00:00:00 2001 From: btaanish <87608292+btaanish@users.noreply.github.com> Date: Mon, 10 Nov 2025 20:15:01 +0800 Subject: [PATCH 1/3] Update README with multinode and accuracy check details Added instructions for multinode runs and accuracy checks. --- closed/NVIDIA/README.md | 44 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/closed/NVIDIA/README.md b/closed/NVIDIA/README.md index 3ae3ffebc..efb99400d 100644 --- a/closed/NVIDIA/README.md +++ b/closed/NVIDIA/README.md @@ -382,6 +382,50 @@ $ make run RUN_ARGS="--benchmarks=resnet50,bert --scenarios=offline,server" ``` **If you run into issues, invalid results, or would like to improve your performance,** **read** `documentation/performance_tuning_guide.md`. +### Multinode runs + +From within the container, install the Triton software: + +``` +$ make clone_triton && make build_triton +``` + +Generate the Triton config files (after generating the engines): + +``` +$ make generate_triton_config RUN_ARGS="--benchmarks=llama2-70b \ #or other benchmarks + --scenarios=Offline \ +--harness_type=triton \ +--accuracy_target=0.999 \ # or 0.99 +--engine_dir=/path/to/engines" +``` + +After modifying `start_triton.sh` for your specific node configuration, start the Triton engines on each node: + +``` +$ /work/start_triton.sh +``` + +While keeping the engines running, enter the container on one node (considered the master node from here on) from a different shell: + +``` +$ docker exec -it "container-name" bash +``` + +Start the accuracy/throughput runs through separate triton-client frontends for each node: + +``` +$ make run_harness RUN_ARGS="\ + --benchmarks=llama2-70b \ # or the other benchmarks + --scenarios=Offline \ + --harness_type=triton \ + --inference_server=triton \ + --accuracy_target=0.999 \ # or 0.99 + --triton_skip_server_spawn \ + --triton_grpc_ports='ip_of_master_node:8001|ip_of_worker_node_1:8001|ip_of_worker_node_2:8001...'" +``` 
+ + ### How do I run the accuracy checks? You can run the harness for accuracy checks using the `--test_mode=AccuracyOnly` flag: From aed2c29a9d0b23c7dece2e0c0ebd8a97dd631e39 Mon Sep 17 00:00:00 2001 From: btaanish <87608292+btaanish@users.noreply.github.com> Date: Mon, 10 Nov 2025 20:22:35 +0800 Subject: [PATCH 2/3] Create start_triton.sh --- closed/NVIDIA/start_triton.sh | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 closed/NVIDIA/start_triton.sh diff --git a/closed/NVIDIA/start_triton.sh b/closed/NVIDIA/start_triton.sh new file mode 100644 index 000000000..5581f5224 --- /dev/null +++ b/closed/NVIDIA/start_triton.sh @@ -0,0 +1,15 @@ +#!/bin/bash +MODEL_REPO=/path/to/repo_0 #created by triton config builds +TRITON_BIN=/opt/tritonserver/bin/tritonserver +GRPC_PORT=8001 +HTTP_PORT=8000 +METRICS_PORT=8002 + +WORLD_SIZE= #number of GPUs per node + +exec mpirun -n ${WORLD_SIZE} --allow-run-as-root \ + ${TRITON_BIN} \ + --model-repository=${MODEL_REPO} \ + --grpc-port=${GRPC_PORT} \ + --http-port=${HTTP_PORT} \ + --metrics-port=${METRICS_PORT} From 70d39df1f090700c8202389c449904f421f3566b Mon Sep 17 00:00:00 2001 From: btaanish <87608292+btaanish@users.noreply.github.com> Date: Tue, 11 Nov 2025 00:02:22 +0800 Subject: [PATCH 3/3] init --- .../configs/llama2-70b/Offline/__init__.py | 93 +++++++++++++++---- 1 file changed, 73 insertions(+), 20 deletions(-) diff --git a/closed/NVIDIA/configs/llama2-70b/Offline/__init__.py b/closed/NVIDIA/configs/llama2-70b/Offline/__init__.py index 5ea7d44a5..eecfa0cd1 100644 --- a/closed/NVIDIA/configs/llama2-70b/Offline/__init__.py +++ b/closed/NVIDIA/configs/llama2-70b/Offline/__init__.py @@ -41,7 +41,7 @@ class HopperOfflineGPUBaseConfig(OfflineGPUBaseConfig): trtllm_build_flags = { 'tensor_parallelism': 1, - 'pipeline_parallelism': 1, + 'pipeline_parallelism': 2, } @@ -54,11 +54,10 @@ class BlackwellOfflineGPUBaseConfig(OfflineGPUBaseConfig): trtllm_build_flags = { 'tensor_parallelism': 1, - 
'pipeline_parallelism': 1, + 'pipeline_parallelism': 2, 'norm_quant_fusion': 'enable' } - @ConfigRegistry.register(HarnessType.Custom, AccuracyTarget.k_99, PowerSetting.MaxP) class GH200_144GB_aarch64x1(HopperOfflineGPUBaseConfig): system = KnownSystem.GH200_144GB_ARMx1 @@ -87,14 +86,12 @@ class GH200_144GB_aarch64x2(GH200_144GB_aarch64x1): class GH200_144GB_aarch64x2_HighAccuracy(GH200_144GB_aarch64x2): pass - -@ConfigRegistry.register(HarnessType.Custom, AccuracyTarget.k_99, PowerSetting.MaxP, "PP2") class H100_SXM_80GB_PP2x1(HopperOfflineGPUBaseConfig): system = KnownSystem.H100_SXM_80GBx2 vboost_slider = 0 gpu_batch_size = {'llama2-70b': 1024} - offline_expected_qps = 27.5 + offline_expected_qps = 75 trtllm_build_flags = { 'max_num_tokens': 1024, 'tensor_parallelism': 1, @@ -117,7 +114,7 @@ class H100_SXM_80GB_Triton_PP2x1(HopperOfflineGPUBaseConfig): triton_num_frontends_per_model = 1 gpu_batch_size = {'llama2-70b': 2048} - offline_expected_qps = 25 + offline_expected_qps = 75 trtllm_build_flags = { 'max_num_tokens': 1024, 'tensor_parallelism': 1, @@ -125,13 +122,50 @@ class H100_SXM_80GB_Triton_PP2x1(HopperOfflineGPUBaseConfig): } trtllm_runtime_flags = {'max_num_tokens': 1024} +@ConfigRegistry.register( + HarnessType.Custom, + AccuracyTarget.k_99_9, + PowerSetting.MaxP +) +class H100_SXM_80GB_Custom_HighAccuracy(H100_SXM_80GB_Triton_PP2x1): + system = KnownSystem.H100_SXM_80GBx1 + use_triton = True + triton_num_clients_per_frontend = 1 + triton_num_frontends_per_model = 1 + gpu_batch_size = {'llama2-70b': 1024} + offline_expected_qps = 75 + trtllm_build_flags = { + 'max_num_tokens': 1024, + 'tensor_parallelism': 1, + 'pipeline_parallelism': 2, + 'reduce_fusion': 'enable', + 'gemm_swiglu_plugin': 'fp8', + } + trtllm_runtime_flags = { + 'max_num_tokens': 1024, + 'kvcache_free_gpu_mem_frac': 0.95, + } + + +@ConfigRegistry.register( + HarnessType.Triton, + AccuracyTarget.k_99_9, + PowerSetting.MaxP +) +class 
H100_SXM_80GB_Triton_PP2x1_HighAccuracy(H100_SXM_80GB_Triton_PP2x1): + pass +@ConfigRegistry.register( + HarnessType.Custom, + AccuracyTarget.k_99_9, + PowerSetting.MaxP +) +class H100_SXM_80GB_Triton_PP2x1_HighAccuracy_CustomAlias(H100_SXM_80GB_Triton_PP2x1_HighAccuracy): + pass @ConfigRegistry.register(HarnessType.Triton, AccuracyTarget.k_99, PowerSetting.MaxP, "PP2") class H100_SXM_80GB_Triton_PP2x4(H100_SXM_80GB_Triton_PP2x1): system = KnownSystem.H100_SXM_80GBx8 offline_expected_qps = 25 * 4 - - @ConfigRegistry.register(HarnessType.Custom, AccuracyTarget.k_99, PowerSetting.MaxP, "PP2") class H100_SXM_80GB_PP2x2(H100_SXM_80GB_PP2x1): system = KnownSystem.H100_SXM_80GBx4 @@ -182,21 +216,13 @@ class H100_NVL_94GB_TP2x1(HopperOfflineGPUBaseConfig): @ConfigRegistry.register(HarnessType.Custom, AccuracyTarget.k_99, PowerSetting.MaxP, "TP2") class H100_NVL_94GB_TP2x2(H100_NVL_94GB_TP2x1): system = KnownSystem.H100_NVL_94GBx4 - offline_expected_qps = 25 + offline_expected_qps = 10 @ConfigRegistry.register(HarnessType.Custom, AccuracyTarget.k_99, PowerSetting.MaxP, "TP2") class H100_NVL_94GB_TP2x4(H100_NVL_94GB_TP2x2): system = KnownSystem.H100_NVL_94GBx8 offline_expected_qps = 50 - - -@ConfigRegistry.register(HarnessType.Custom, AccuracyTarget.k_99, PowerSetting.MaxQ, "TP2") -class H100_NVL_94GB_MaxQ_TP2x4(H100_NVL_94GB_TP2x4): - offline_expected_qps = 45 - power_limit = 350 - - @ConfigRegistry.register(HarnessType.Custom, AccuracyTarget.k_99_9, PowerSetting.MaxP, "TP2") class H100_NVL_94GB_HighAccuracy_TP2x1(H100_NVL_94GB_TP2x1): pass @@ -362,7 +388,34 @@ class B200_SXM_180GBx8(B200_SXM_180GBx1): system = KnownSystem.B200_SXM_180GBx8 offline_expected_qps = B200_SXM_180GBx1.offline_expected_qps * 8 +@ConfigRegistry.register( + HarnessType.Triton, + AccuracyTarget.k_99, + PowerSetting.MaxP +) +class DGX_H100_H100_SXM_80GBx2_Triton(H100_SXM_80GB_Triton_PP2x1): + system = KnownSystem.H100_SXM_80GBx2 -@ConfigRegistry.register(HarnessType.Custom, AccuracyTarget.k_99_9, 
PowerSetting.MaxP) -class B200_SXM_180GBx8_HighAccuracy(B200_SXM_180GBx8): +@ConfigRegistry.register( + HarnessType.Triton, + AccuracyTarget.k_99_9, +PowerSetting.MaxP +) +class DGX_H100_H100_SXM_80GBx2_Triton_HA(DGX_H100_H100_SXM_80GBx2_Triton): + pass + +@ConfigRegistry.register( + HarnessType.Triton, + AccuracyTarget.k_99, + PowerSetting.MaxP +) +class DGX_H100_H100_SXM_80GBx1_Triton(H100_SXM_80GB_Triton_PP2x1): + system = KnownSystem.H100_SXM_80GBx1 + +@ConfigRegistry.register( + HarnessType.Triton, + AccuracyTarget.k_99_9, + PowerSetting.MaxP +) +class DGX_H100_H100_SXM_80GBx1_Triton_HA(DGX_H100_H100_SXM_80GBx1_Triton): pass