
Commit 5fd6bc4

feat: Configurable grpc infer thread count (#8061)

1 parent fd19783
File tree

5 files changed: +69, -5 lines


docs/customization_guide/inference_protocols.md

Lines changed: 9 additions & 1 deletion

@@ -1,5 +1,5 @@
 <!--
-# Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2018-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -140,6 +140,14 @@ GRPC status codes can be used for better visibility and monitoring. For more det
 
 For client-side documentation, see [Client-Side GRPC Status Codes](https://github.com/triton-inference-server/client/tree/main#GRPC-Status-Codes)
 
+#### GRPC Inference Handler Threads
+
+In general, using 2 threads per completion queue seems to give the best performance; see [gRPC Performance Best Practices](https://grpc.io/docs/guides/performance/#c). However, in cases where the performance bottleneck is at the request-handling step (e.g. ensemble models), increasing the number of gRPC inference handler threads may lead to higher throughput.
+
+* `--grpc-infer-thread-count`: 2 by default.
+
+Note: More threads don't always mean better performance.
+
 ### Limit Endpoint Access (BETA)
 
 Triton users may want to restrict access to protocols or APIs that are
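The flag documented above is passed directly on the server command line. A minimal sketch, assuming a local model repository at /models (the path is a placeholder):

    # Start Triton with 8 gRPC inference handler threads instead of the
    # default 2 (useful when request handling, e.g. ensemble scheduling,
    # is the bottleneck rather than model compute).
    tritonserver --model-repository=/models --grpc-infer-thread-count=8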

qa/L0_sequence_stress/test.sh

Lines changed: 40 additions & 1 deletion

@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2019-2025, NVIDIA CORPORATION. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -85,6 +85,45 @@ for model_trial in 1 2 4 ; do
     wait $SERVER_PID
 done
 
+# Test invalid gRPC infer handler thread count
+for thread_cnt in -1 0 1 129; do
+    MODEL_DIR=models1
+    SERVER_ARGS="--model-repository=`pwd`/$MODEL_DIR --grpc-infer-thread-count=$thread_cnt"
+    SERVER_LOG="./$MODEL_DIR.server.log"
+    run_server
+    if [ "$SERVER_PID" != "0" ]; then
+        echo -e "\n***\n*** Failed: $SERVER started successfully when it was expected to fail\n***"
+        RET=1
+        kill $SERVER_PID
+        wait $SERVER_PID
+    fi
+done
+
+# Test gRPC infer handler thread count under stress
+thread_cnt=128
+for model_trial in 1 2 4 ; do
+    MODEL_DIR=models${model_trial}
+    SERVER_ARGS="--model-repository=`pwd`/$MODEL_DIR --grpc-infer-thread-count=$thread_cnt"
+    SERVER_LOG="./$MODEL_DIR.server.log"
+    run_server
+    if [ "$SERVER_PID" == "0" ]; then
+        echo -e "\n***\n*** Failed to start $SERVER\n***"
+        cat $SERVER_LOG
+        exit 1
+    fi
+
+    set +e
+    python $STRESS_TEST >>$CLIENT_LOG 2>&1
+    if [ $? -ne 0 ]; then
+        echo -e "\n***\n*** Test Failed\n***"
+        RET=1
+    fi
+    set -e
+
+    kill $SERVER_PID
+    wait $SERVER_PID
+done
+
 if [ $RET -eq 0 ]; then
     echo -e "\n***\n*** Test Passed\n***"
 else

src/command_line_parser.cc

Lines changed: 14 additions & 0 deletions

@@ -305,6 +305,7 @@ enum TritonOptionId {
   OPTION_REUSE_GRPC_PORT,
   OPTION_GRPC_ADDRESS,
   OPTION_GRPC_HEADER_FORWARD_PATTERN,
+  OPTION_GRPC_INFER_THREAD_COUNT,
   OPTION_GRPC_INFER_ALLOCATION_POOL_SIZE,
   OPTION_GRPC_MAX_RESPONSE_POOL_SIZE,
   OPTION_GRPC_USE_SSL,
@@ -530,6 +531,10 @@ TritonParser::SetupOptions()
        Option::ArgStr,
        "The regular expression pattern that will be used for forwarding GRPC "
        "headers as inference request parameters."});
+  grpc_options_.push_back(
+      {OPTION_GRPC_INFER_THREAD_COUNT, "grpc-infer-thread-count",
+       Option::ArgInt,
+       "The number of gRPC inference handler threads. Default is 2."});
   grpc_options_.push_back(
       {OPTION_GRPC_INFER_ALLOCATION_POOL_SIZE,
        "grpc-infer-allocation-pool-size", Option::ArgInt,
@@ -1441,6 +1446,15 @@ TritonParser::Parse(int argc, char** argv)
       case OPTION_GRPC_ADDRESS:
         lgrpc_options.socket_.address_ = optarg;
         break;
+      case OPTION_GRPC_INFER_THREAD_COUNT:
+        lgrpc_options.infer_thread_count_ = ParseOption<int>(optarg);
+        if (lgrpc_options.infer_thread_count_ < 2 ||
+            lgrpc_options.infer_thread_count_ > 128) {
+          throw ParseException(
+              "invalid argument for --grpc-infer-thread-count. Must be in "
+              "the range 2 to 128.");
+        }
+        break;
       case OPTION_GRPC_INFER_ALLOCATION_POOL_SIZE:
         lgrpc_options.infer_allocation_pool_size_ = ParseOption<int>(optarg);
         break;
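Given the validation above, out-of-range values are rejected before the server starts. A hedged illustration (the repository path is a placeholder; the exact error text comes from the ParseException above):

    # Rejected at startup: 1 is below the minimum of 2
    tritonserver --model-repository=/models --grpc-infer-thread-count=1
    # Rejected at startup: 129 exceeds the maximum of 128
    tritonserver --model-repository=/models --grpc-infer-thread-count=129
    # Accepted: any value in the range 2 to 128
    tritonserver --model-repository=/models --grpc-infer-thread-count=64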

src/grpc/grpc_server.cc

Lines changed: 3 additions & 3 deletions

@@ -62,8 +62,6 @@
 #include "../tracer.h"
 #endif  // TRITON_ENABLE_TRACING
 
-#define REGISTER_GRPC_INFER_THREAD_COUNT 2
-
 namespace triton { namespace server { namespace grpc {
 
 namespace {
@@ -2390,7 +2388,7 @@ Server::Server(
   // Handler for model inference requests.
   std::pair<std::string, std::string> restricted_kv =
       options.restricted_protocols_.Get(RestrictedCategory::INFERENCE);
-  for (int i = 0; i < REGISTER_GRPC_INFER_THREAD_COUNT; ++i) {
+  for (int i = 0; i < options.infer_thread_count_; ++i) {
     model_infer_handlers_.emplace_back(new ModelInferHandler(
         "ModelInferHandler", tritonserver_, trace_manager_, shm_manager_,
         &service_, model_infer_cq_.get(),
@@ -2469,6 +2467,8 @@ Server::GetOptions(Options& options, UnorderedMapType& options_map)
   options.infer_compression_level_ =
       static_cast<grpc_compression_level>(infer_compression_level_key);
 
+  RETURN_IF_ERR(GetValue(
+      options_map, "infer_thread_count", &options.infer_thread_count_));
   RETURN_IF_ERR(GetValue(
       options_map, "infer_allocation_pool_size",
       &options.infer_allocation_pool_size_));

src/grpc/grpc_server.h

Lines changed: 3 additions & 0 deletions

@@ -84,6 +84,9 @@ struct Options {
   SslOptions ssl_;
   KeepAliveOptions keep_alive_;
   grpc_compression_level infer_compression_level_{GRPC_COMPRESS_LEVEL_NONE};
+  // The number of gRPC inference handler threads. Useful for
+  // throughput tuning of models that are request-handling bound.
+  int infer_thread_count_{2};
   // The maximum number of inference request/response objects that
   // remain allocated for reuse. As long as the number of in-flight
   // requests doesn't exceed this value there will be no
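Since extra handler threads only help when request handling, not model compute, is the bottleneck, a sensible way to pick a value is to measure throughput at a few thread counts. A rough sketch using Triton's perf_analyzer client (model name, repository path, and the sleep-based readiness wait are assumptions for illustration):

    # Compare throughput at two handler thread counts.
    for n in 2 8; do
        tritonserver --model-repository=/models --grpc-infer-thread-count=$n &
        SERVER_PID=$!
        sleep 10   # crude readiness wait; a real harness should poll the health endpoint
        perf_analyzer -m my_ensemble -i grpc -u localhost:8001 --concurrency-range 32
        kill $SERVER_PID
        wait $SERVER_PID
    done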
