From 61dc6a12464599770122b51d9cf08482cee5bc92 Mon Sep 17 00:00:00 2001
From: Chi Lo
Date: Mon, 10 Mar 2025 16:34:21 -0700
Subject: [PATCH 1/5] update

---
 .../core/providers/tensorrt/tensorrt_execution_provider.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
index 523ebbfae807a..cd98fd3acb59f 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@@ -3538,6 +3538,7 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
   // Create compute function
   compute_info.compute_func = [this](FunctionState state, const OrtApi* api, OrtKernelContext* context) {
+    cudaSetDevice(device_id_);
     Ort::KernelContext ctx(context);
 
     TensorrtFuncState* trt_state = reinterpret_cast<TensorrtFuncState*>(state);
@@ -4212,6 +4213,7 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(con
   // Create compute function
   compute_info.compute_func = [this](FunctionState state, const OrtApi* api, OrtKernelContext* context) {
+    cudaSetDevice(device_id_);
     Ort::KernelContext ctx(context);
 
     TensorrtShortFuncState* trt_state = reinterpret_cast<TensorrtShortFuncState*>(state);

From 4e4f26ae57a6b28d744f0a2b32099e146f051d7a Mon Sep 17 00:00:00 2001
From: Chi Lo
Date: Wed, 12 Mar 2025 11:30:09 -0700
Subject: [PATCH 2/5] update

---
 .../tensorrt/tensorrt_execution_provider.cc | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
index cd98fd3acb59f..cc9445b8549c5 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@@ -3538,7 +3538,13 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
   // Create compute function
   compute_info.compute_func = [this](FunctionState state, const OrtApi* api, OrtKernelContext* context) {
+    // The GPU device is set again here to handle multithreading scenarios.
+    // Consider the following:
+    // Users can create multiple threads to initialize separate inference sessions on different devices (not just the default device 0)
+    // Later, additional threads may be spawned to execute inference_session.Run(), which calls this compute function.
+    // Since new threads default to using device 0, it’s necessary to explicitly set the correct device to ensure computations run on the intended GPU.
     cudaSetDevice(device_id_);
+
     Ort::KernelContext ctx(context);
 
     TensorrtFuncState* trt_state = reinterpret_cast<TensorrtFuncState*>(state);
@@ -4213,7 +4219,13 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(con
   // Create compute function
   compute_info.compute_func = [this](FunctionState state, const OrtApi* api, OrtKernelContext* context) {
+    // The GPU device is set again here to handle multithreading scenarios.
+    // Consider the following:
+    // Users can create multiple threads to initialize separate inference sessions on different devices (not just the default device 0)
+    // Later, additional threads may be spawned to execute inference_session.Run(), which calls this compute function.
+    // Since new threads default to using device 0, it’s necessary to explicitly set the correct device to ensure computations run on the intended GPU.
     cudaSetDevice(device_id_);
+
     Ort::KernelContext ctx(context);
 
     TensorrtShortFuncState* trt_state = reinterpret_cast<TensorrtShortFuncState*>(state);

From 1654c17ca822568477a70bfce4beef66e51113bd Mon Sep 17 00:00:00 2001
From: Chi Lo
Date: Wed, 12 Mar 2025 12:18:07 -0700
Subject: [PATCH 3/5] update

---
 .../core/providers/tensorrt/tensorrt_execution_provider.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
index cc9445b8549c5..22982b6a1a320 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@@ -3543,6 +3543,8 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
     // Users can create multiple threads to initialize separate inference sessions on different devices (not just the default device 0)
     // Later, additional threads may be spawned to execute inference_session.Run(), which calls this compute function.
     // Since new threads default to using device 0, it’s necessary to explicitly set the correct device to ensure computations run on the intended GPU.
+    // Note: Based on our measurements on the A100 GPU with CUDA 12, the execution time for cudaSetDevice is approximately 0.004 ms, which is negligible
+    // and does not impact runtime performance.
     cudaSetDevice(device_id_);
 
     Ort::KernelContext ctx(context);
@@ -4224,6 +4226,8 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(con
     // Users can create multiple threads to initialize separate inference sessions on different devices (not just the default device 0)
     // Later, additional threads may be spawned to execute inference_session.Run(), which calls this compute function.
     // Since new threads default to using device 0, it’s necessary to explicitly set the correct device to ensure computations run on the intended GPU.
+    // Note: Based on our measurements on the A100 GPU with CUDA 12, the execution time for cudaSetDevice is approximately 0.004 ms, which is negligible
+    // and does not impact runtime performance.
     cudaSetDevice(device_id_);
 
     Ort::KernelContext ctx(context);

From c5c527e3a7c623d45a5d0f2f6adb06fa2b33906e Mon Sep 17 00:00:00 2001
From: Chi Lo
Date: Wed, 12 Mar 2025 15:46:04 -0700
Subject: [PATCH 4/5] move cudaSetDevice to OnRunStart()

---
 .../tensorrt/tensorrt_execution_provider.cc | 26 ++++++------------
 1 file changed, 8 insertions(+), 18 deletions(-)

diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
index 22982b6a1a320..b75cd88ad3fb9 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@@ -1865,6 +1865,14 @@ std::unique_ptr<IDataTransfer> TensorrtExecutionProvider::GetDataTransfer() cons
 }
 
 Status TensorrtExecutionProvider::OnRunStart(const onnxruntime::RunOptions& /*run_options*/) {
+  // The GPU device is set again here to handle multithreading scenarios.
+  // Consider the following:
+  // Users can create multiple threads to initialize separate inference sessions on different devices (not just the default device 0)
+  // Later, additional threads may be spawned to execute inference_session.Run(), which calls this compute function.
+  // Since new threads default to using device 0, it’s necessary to explicitly set the correct device to ensure computations run on the intended GPU.
+  // Note: Based on our measurements on the A100 GPU with CUDA 12, the execution time for cudaSetDevice is approximately 0.004 ms, which is negligible
+  // and does not impact runtime performance.
+  cudaSetDevice(device_id_);
   return Status::OK();
 }
 
@@ -3538,15 +3546,6 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
   // Create compute function
   compute_info.compute_func = [this](FunctionState state, const OrtApi* api, OrtKernelContext* context) {
-    // The GPU device is set again here to handle multithreading scenarios.
-    // Consider the following:
-    // Users can create multiple threads to initialize separate inference sessions on different devices (not just the default device 0)
-    // Later, additional threads may be spawned to execute inference_session.Run(), which calls this compute function.
-    // Since new threads default to using device 0, it’s necessary to explicitly set the correct device to ensure computations run on the intended GPU.
-    // Note: Based on our measurements on the A100 GPU with CUDA 12, the execution time for cudaSetDevice is approximately 0.004 ms, which is negligible
-    // and does not impact runtime performance.
-    cudaSetDevice(device_id_);
-
     Ort::KernelContext ctx(context);
 
     TensorrtFuncState* trt_state = reinterpret_cast<TensorrtFuncState*>(state);
@@ -4221,15 +4220,6 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(con
   // Create compute function
   compute_info.compute_func = [this](FunctionState state, const OrtApi* api, OrtKernelContext* context) {
-    // The GPU device is set again here to handle multithreading scenarios.
-    // Consider the following:
-    // Users can create multiple threads to initialize separate inference sessions on different devices (not just the default device 0)
-    // Later, additional threads may be spawned to execute inference_session.Run(), which calls this compute function.
-    // Since new threads default to using device 0, it’s necessary to explicitly set the correct device to ensure computations run on the intended GPU.
-    // Note: Based on our measurements on the A100 GPU with CUDA 12, the execution time for cudaSetDevice is approximately 0.004 ms, which is negligible
-    // and does not impact runtime performance.
-    cudaSetDevice(device_id_);
-
     Ort::KernelContext ctx(context);
 
     TensorrtShortFuncState* trt_state = reinterpret_cast<TensorrtShortFuncState*>(state);

From 1c2ff53a64f44ae03c86fdedb672405585423e11 Mon Sep 17 00:00:00 2001
From: Chi Lo
Date: Wed, 12 Mar 2025 17:47:11 -0700
Subject: [PATCH 5/5] move cudaSetDevice to compute_func

---
 .../tensorrt/tensorrt_execution_provider.cc | 26 +++++++++++++------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
index b75cd88ad3fb9..00f53b96f931a 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@@ -1865,14 +1865,6 @@ std::unique_ptr<IDataTransfer> TensorrtExecutionProvider::GetDataTransfer() cons
 }
 
 Status TensorrtExecutionProvider::OnRunStart(const onnxruntime::RunOptions& /*run_options*/) {
-  // The GPU device is set again here to handle multithreading scenarios.
-  // Consider the following:
-  // Users can create multiple threads to initialize separate inference sessions on different devices (not just the default device 0)
-  // Later, additional threads may be spawned to execute inference_session.Run(), which calls this compute function.
-  // Since new threads default to using device 0, it’s necessary to explicitly set the correct device to ensure computations run on the intended GPU.
-  // Note: Based on our measurements on the A100 GPU with CUDA 12, the execution time for cudaSetDevice is approximately 0.004 ms, which is negligible
-  // and does not impact runtime performance.
-  cudaSetDevice(device_id_);
   return Status::OK();
 }
 
@@ -3546,6 +3538,15 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
   // Create compute function
   compute_info.compute_func = [this](FunctionState state, const OrtApi* api, OrtKernelContext* context) {
+    // The GPU device is set again here to handle multithreading scenarios.
+    // Consider the following:
+    // Users can create multiple threads to initialize separate inference sessions on different devices (not just the default device 0)
+    // Later, additional threads may be spawned to execute inference_session.Run(), which calls this compute function.
+    // Since new threads default to using device 0, it’s necessary to explicitly set the correct device to ensure computations run on the intended GPU.
+    // Note: Based on our measurements on the A100 GPU with CUDA 12, the execution time for cudaSetDevice is approximately 0.004 ms, which is negligible
+    // and does not impact runtime performance.
+    CUDA_CALL_THROW(cudaSetDevice(device_id_));
+
     Ort::KernelContext ctx(context);
 
     TensorrtFuncState* trt_state = reinterpret_cast<TensorrtFuncState*>(state);
@@ -4220,6 +4221,15 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(con
   // Create compute function
   compute_info.compute_func = [this](FunctionState state, const OrtApi* api, OrtKernelContext* context) {
+    // The GPU device is set again here to handle multithreading scenarios.
+    // Consider the following:
+    // Users can create multiple threads to initialize separate inference sessions on different devices (not just the default device 0)
+    // Later, additional threads may be spawned to execute inference_session.Run(), which calls this compute function.
+    // Since new threads default to using device 0, it’s necessary to explicitly set the correct device to ensure computations run on the intended GPU.
+    // Note: Based on our measurements on the A100 GPU with CUDA 12, the execution time for cudaSetDevice is approximately 0.004 ms, which is negligible
+    // and does not impact runtime performance.
+    CUDA_CALL_THROW(cudaSetDevice(device_id_));
+
     Ort::KernelContext ctx(context);
 
     TensorrtShortFuncState* trt_state = reinterpret_cast<TensorrtShortFuncState*>(state);
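
---
Reviewer note: below is a minimal sketch (not part of the patch) of the multithreading
scenario the new comment describes. Two sessions are pinned to different GPUs via
OrtTensorRTProviderOptions, and Run() is invoked from freshly spawned threads, which
start on CUDA device 0 by default. The model path, input shape, and the I/O names
"input"/"output" are placeholder assumptions; it presumes a machine with two GPUs.

// Hypothetical repro sketch; assumes a 2-GPU machine and a placeholder model.
#include <onnxruntime_cxx_api.h>

#include <thread>
#include <vector>

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "multi_gpu_trt");

  // Build a session whose TensorRT EP is pinned to the given GPU.
  auto make_session = [&env](int device_id) {
    Ort::SessionOptions so;
    OrtTensorRTProviderOptions trt_options{};
    trt_options.device_id = device_id;
    so.AppendExecutionProvider_TensorRT(trt_options);
    return Ort::Session(env, "model.onnx", so);  // placeholder model path
  };

  Ort::Session s0 = make_session(0);
  Ort::Session s1 = make_session(1);

  // Each Run() happens on a brand-new thread. New threads see CUDA device 0
  // by default, which is why compute_func must call cudaSetDevice(device_id_)
  // before touching the engine that was built for device 1.
  auto run = [](Ort::Session& session) {
    std::vector<int64_t> shape{1, 3, 224, 224};  // placeholder input shape
    std::vector<float> data(1 * 3 * 224 * 224, 0.0f);
    auto mem = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
    Ort::Value input = Ort::Value::CreateTensor<float>(
        mem, data.data(), data.size(), shape.data(), shape.size());
    const char* in_names[] = {"input"};    // placeholder I/O names
    const char* out_names[] = {"output"};
    session.Run(Ort::RunOptions{nullptr}, in_names, &input, 1, out_names, 1);
  };

  std::thread t0(run, std::ref(s0));
  std::thread t1(run, std::ref(s1));
  t0.join();
  t1.join();
  return 0;
}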