
Commit 5fd6bc4

feat: Configurable grpc infer thread count (#8061)

1 parent fd19783
File tree

5 files changed: +69, -5 lines


docs/customization_guide/inference_protocols.md

Lines changed: 9 additions & 1 deletion

@@ -1,5 +1,5 @@
 <!--
-# Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2018-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -140,6 +140,14 @@ GRPC status codes can be used for better visibility and monitoring. For more det
 
 For client-side documentation, see [Client-Side GRPC Status Codes](https://github.com/triton-inference-server/client/tree/main#GRPC-Status-Codes)
 
+#### GRPC Inference Handler Threads
+
+In general, using 2 threads per completion queue seems to give the best performance; see [gRPC Performance Best Practices](https://grpc.io/docs/guides/performance/#c). However, in cases where the performance bottleneck is at the request-handling step (e.g. ensemble models), increasing the number of gRPC inference handler threads may lead to higher throughput.
+
+* `--grpc-infer-thread-count`: 2 by default.
+
+Note: More threads don't always mean better performance.
+
 ### Limit Endpoint Access (BETA)
 
 Triton users may want to restrict access to protocols or APIs that are
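The flag documented above is passed directly on the server command line. A minimal sketch, assuming a local model repository at /models (the path is a placeholder):

    # Start Triton with 8 gRPC inference handler threads instead of the
    # default 2 (useful when request handling, e.g. ensemble scheduling,
    # is the bottleneck rather than model compute).
    tritonserver --model-repository=/models --grpc-infer-thread-count=8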

qa/L0_sequence_stress/test.sh

Lines changed: 40 additions & 1 deletion

@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2019-2025, NVIDIA CORPORATION. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -85,6 +85,45 @@ for model_trial in 1 2 4 ; do
     wait $SERVER_PID
 done
 
+# Test invalid gRPC infer handler thread count
+for thread_cnt in -1 0 1 129; do
+    MODEL_DIR=models1
+    SERVER_ARGS="--model-repository=`pwd`/$MODEL_DIR --grpc-infer-thread-count=$thread_cnt"
+    SERVER_LOG="./$MODEL_DIR.server.log"
+    run_server
+    if [ "$SERVER_PID" != "0" ]; then
+        echo -e "\n***\n*** Failed: $SERVER started successfully when it was expected to fail\n***"
+        RET=1
+        kill $SERVER_PID
+        wait $SERVER_PID
+    fi
+done
+
+# Test gRPC infer handler thread count under stress
+thread_cnt=128
+for model_trial in 1 2 4 ; do
+    MODEL_DIR=models${model_trial}
+    SERVER_ARGS="--model-repository=`pwd`/$MODEL_DIR --grpc-infer-thread-count=$thread_cnt"
+    SERVER_LOG="./$MODEL_DIR.server.log"
+    run_server
+    if [ "$SERVER_PID" == "0" ]; then
+        echo -e "\n***\n*** Failed to start $SERVER\n***"
+        cat $SERVER_LOG
+        exit 1
+    fi
+
+    set +e
+    python $STRESS_TEST >>$CLIENT_LOG 2>&1
+    if [ $? -ne 0 ]; then
+        echo -e "\n***\n*** Test Failed\n***"
+        RET=1
+    fi
+    set -e
+
+    kill $SERVER_PID
+    wait $SERVER_PID
+done
+
 if [ $RET -eq 0 ]; then
     echo -e "\n***\n*** Test Passed\n***"
 else

src/command_line_parser.cc

Lines changed: 14 additions & 0 deletions

@@ -305,6 +305,7 @@ enum TritonOptionId {
   OPTION_REUSE_GRPC_PORT,
   OPTION_GRPC_ADDRESS,
   OPTION_GRPC_HEADER_FORWARD_PATTERN,
+  OPTION_GRPC_INFER_THREAD_COUNT,
   OPTION_GRPC_INFER_ALLOCATION_POOL_SIZE,
   OPTION_GRPC_MAX_RESPONSE_POOL_SIZE,
   OPTION_GRPC_USE_SSL,
@@ -530,6 +531,10 @@ TritonParser::SetupOptions()
        Option::ArgStr,
        "The regular expression pattern that will be used for forwarding GRPC "
        "headers as inference request parameters."});
+  grpc_options_.push_back(
+      {OPTION_GRPC_INFER_THREAD_COUNT, "grpc-infer-thread-count",
+       Option::ArgInt,
+       "The number of gRPC inference handler threads. Default is 2."});
   grpc_options_.push_back(
       {OPTION_GRPC_INFER_ALLOCATION_POOL_SIZE,
        "grpc-infer-allocation-pool-size", Option::ArgInt,
@@ -1441,6 +1446,15 @@ TritonParser::Parse(int argc, char** argv)
       case OPTION_GRPC_ADDRESS:
         lgrpc_options.socket_.address_ = optarg;
         break;
+      case OPTION_GRPC_INFER_THREAD_COUNT:
+        lgrpc_options.infer_thread_count_ = ParseOption<int>(optarg);
+        if (lgrpc_options.infer_thread_count_ < 2 ||
+            lgrpc_options.infer_thread_count_ > 128) {
+          throw ParseException(
+              "invalid argument for --grpc-infer-thread-count. Must be in "
+              "the range 2 to 128.");
+        }
+        break;
       case OPTION_GRPC_INFER_ALLOCATION_POOL_SIZE:
         lgrpc_options.infer_allocation_pool_size_ = ParseOption<int>(optarg);
         break;
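Given the validation above, out-of-range values are rejected before the server starts. A hedged illustration (the repository path is a placeholder; the exact error text comes from the ParseException above):

    # Rejected at startup: 1 is below the minimum of 2
    tritonserver --model-repository=/models --grpc-infer-thread-count=1
    # Rejected at startup: 129 exceeds the maximum of 128
    tritonserver --model-repository=/models --grpc-infer-thread-count=129
    # Accepted: any value in the range 2 to 128
    tritonserver --model-repository=/models --grpc-infer-thread-count=64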

src/grpc/grpc_server.cc

Lines changed: 3 additions & 3 deletions

@@ -62,8 +62,6 @@
 #include "../tracer.h"
 #endif  // TRITON_ENABLE_TRACING
 
-#define REGISTER_GRPC_INFER_THREAD_COUNT 2
-
 namespace triton { namespace server { namespace grpc {
 
 namespace {
@@ -2390,7 +2388,7 @@ Server::Server(
   // Handler for model inference requests.
   std::pair<std::string, std::string> restricted_kv =
       options.restricted_protocols_.Get(RestrictedCategory::INFERENCE);
-  for (int i = 0; i < REGISTER_GRPC_INFER_THREAD_COUNT; ++i) {
+  for (int i = 0; i < options.infer_thread_count_; ++i) {
     model_infer_handlers_.emplace_back(new ModelInferHandler(
         "ModelInferHandler", tritonserver_, trace_manager_, shm_manager_,
         &service_, model_infer_cq_.get(),
@@ -2469,6 +2467,8 @@ Server::GetOptions(Options& options, UnorderedMapType& options_map)
   options.infer_compression_level_ =
       static_cast<grpc_compression_level>(infer_compression_level_key);
 
+  RETURN_IF_ERR(GetValue(
+      options_map, "infer_thread_count", &options.infer_thread_count_));
   RETURN_IF_ERR(GetValue(
       options_map, "infer_allocation_pool_size",
       &options.infer_allocation_pool_size_));

src/grpc/grpc_server.h

Lines changed: 3 additions & 0 deletions

@@ -84,6 +84,9 @@ struct Options {
   SslOptions ssl_;
   KeepAliveOptions keep_alive_;
   grpc_compression_level infer_compression_level_{GRPC_COMPRESS_LEVEL_NONE};
+  // The number of gRPC inference handler threads. Useful for
+  // throughput tuning of models that are request-handling bound.
+  int infer_thread_count_{2};
   // The maximum number of inference request/response objects that
   // remain allocated for reuse. As long as the number of in-flight
   // requests doesn't exceed this value there will be no
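Since extra handler threads only help when request handling, not model compute, is the bottleneck, a sensible way to pick a value is to measure throughput at a few thread counts. A rough sketch using Triton's perf_analyzer client (model name, repository path, and the sleep-based readiness wait are assumptions for illustration):

    # Compare throughput at two handler thread counts.
    for n in 2 8; do
        tritonserver --model-repository=/models --grpc-infer-thread-count=$n &
        SERVER_PID=$!
        sleep 10   # crude readiness wait; a real harness should poll the health endpoint
        perf_analyzer -m my_ensemble -i grpc -u localhost:8001 --concurrency-range 32
        kill $SERVER_PID
        wait $SERVER_PID
    done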
