From 61dc6a12464599770122b51d9cf08482cee5bc92 Mon Sep 17 00:00:00 2001
From: Chi Lo
Date: Mon, 10 Mar 2025 16:34:21 -0700
Subject: [PATCH 1/5] update

---
 .../core/providers/tensorrt/tensorrt_execution_provider.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
index 523ebbfae807a..cd98fd3acb59f 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@@ -3538,6 +3538,7 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
   // Create compute function
   compute_info.compute_func = [this](FunctionState state, const OrtApi* api, OrtKernelContext* context) {
+    cudaSetDevice(device_id_);
     Ort::KernelContext ctx(context);
 
     TensorrtFuncState* trt_state = reinterpret_cast<TensorrtFuncState*>(state);
@@ -4212,6 +4213,7 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(con
   // Create compute function
   compute_info.compute_func = [this](FunctionState state, const OrtApi* api, OrtKernelContext* context) {
+    cudaSetDevice(device_id_);
     Ort::KernelContext ctx(context);
 
     TensorrtShortFuncState* trt_state = reinterpret_cast<TensorrtShortFuncState*>(state);

From 4e4f26ae57a6b28d744f0a2b32099e146f051d7a Mon Sep 17 00:00:00 2001
From: Chi Lo
Date: Wed, 12 Mar 2025 11:30:09 -0700
Subject: [PATCH 2/5] update

---
 .../tensorrt/tensorrt_execution_provider.cc | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
index cd98fd3acb59f..cc9445b8549c5 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@@ -3538,7 +3538,13 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
   // Create compute function
   compute_info.compute_func = [this](FunctionState state, const OrtApi* api, OrtKernelContext* context) {
+    // The GPU device is set again here to handle multithreading scenarios.
+    // Consider the following:
+    // Users can create multiple threads to initialize separate inference sessions on different devices (not just the default device 0)
+    // Later, additional threads may be spawned to execute inference_session.Run(), which calls this compute function.
+    // Since new threads default to using device 0, it’s necessary to explicitly set the correct device to ensure computations run on the intended GPU.
     cudaSetDevice(device_id_);
+
     Ort::KernelContext ctx(context);
 
     TensorrtFuncState* trt_state = reinterpret_cast<TensorrtFuncState*>(state);
@@ -4213,7 +4219,13 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(con
   // Create compute function
   compute_info.compute_func = [this](FunctionState state, const OrtApi* api, OrtKernelContext* context) {
+    // The GPU device is set again here to handle multithreading scenarios.
+    // Consider the following:
+    // Users can create multiple threads to initialize separate inference sessions on different devices (not just the default device 0)
+    // Later, additional threads may be spawned to execute inference_session.Run(), which calls this compute function.
+    // Since new threads default to using device 0, it’s necessary to explicitly set the correct device to ensure computations run on the intended GPU.
     cudaSetDevice(device_id_);
+
     Ort::KernelContext ctx(context);
 
     TensorrtShortFuncState* trt_state = reinterpret_cast<TensorrtShortFuncState*>(state);

From 1654c17ca822568477a70bfce4beef66e51113bd Mon Sep 17 00:00:00 2001
From: Chi Lo
Date: Wed, 12 Mar 2025 12:18:07 -0700
Subject: [PATCH 3/5] update

---
 .../core/providers/tensorrt/tensorrt_execution_provider.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
index cc9445b8549c5..22982b6a1a320 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@@ -3543,6 +3543,8 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
     // Users can create multiple threads to initialize separate inference sessions on different devices (not just the default device 0)
     // Later, additional threads may be spawned to execute inference_session.Run(), which calls this compute function.
     // Since new threads default to using device 0, it’s necessary to explicitly set the correct device to ensure computations run on the intended GPU.
+    // Note: Based on our measurements on the A100 GPU with CUDA 12, the execution time for cudaSetDevice is approximately 0.004 ms, which is negligible
+    // and does not impact runtime performance.
     cudaSetDevice(device_id_);
 
     Ort::KernelContext ctx(context);
@@ -4224,6 +4226,8 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(con
     // Users can create multiple threads to initialize separate inference sessions on different devices (not just the default device 0)
     // Later, additional threads may be spawned to execute inference_session.Run(), which calls this compute function.
     // Since new threads default to using device 0, it’s necessary to explicitly set the correct device to ensure computations run on the intended GPU.
+    // Note: Based on our measurements on the A100 GPU with CUDA 12, the execution time for cudaSetDevice is approximately 0.004 ms, which is negligible
+    // and does not impact runtime performance.
     cudaSetDevice(device_id_);
 
     Ort::KernelContext ctx(context);

From c5c527e3a7c623d45a5d0f2f6adb06fa2b33906e Mon Sep 17 00:00:00 2001
From: Chi Lo
Date: Wed, 12 Mar 2025 15:46:04 -0700
Subject: [PATCH 4/5] move cudaSetDevice to OnRunStart()

---
 .../tensorrt/tensorrt_execution_provider.cc | 26 ++++++------------
 1 file changed, 8 insertions(+), 18 deletions(-)

diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
index 22982b6a1a320..b75cd88ad3fb9 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@@ -1865,6 +1865,14 @@ std::unique_ptr<IDataTransfer> TensorrtExecutionProvider::GetDataTransfer() cons
 }
 
 Status TensorrtExecutionProvider::OnRunStart(const onnxruntime::RunOptions& /*run_options*/) {
+  // The GPU device is set again here to handle multithreading scenarios.
+  // Consider the following:
+  // Users can create multiple threads to initialize separate inference sessions on different devices (not just the default device 0)
+  // Later, additional threads may be spawned to execute inference_session.Run(), which calls this compute function.
+  // Since new threads default to using device 0, it’s necessary to explicitly set the correct device to ensure computations run on the intended GPU.
+  // Note: Based on our measurements on the A100 GPU with CUDA 12, the execution time for cudaSetDevice is approximately 0.004 ms, which is negligible
+  // and does not impact runtime performance.
+  cudaSetDevice(device_id_);
   return Status::OK();
 }
 
@@ -3538,15 +3546,6 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
   // Create compute function
   compute_info.compute_func = [this](FunctionState state, const OrtApi* api, OrtKernelContext* context) {
-    // The GPU device is set again here to handle multithreading scenarios.
-    // Consider the following:
-    // Users can create multiple threads to initialize separate inference sessions on different devices (not just the default device 0)
-    // Later, additional threads may be spawned to execute inference_session.Run(), which calls this compute function.
-    // Since new threads default to using device 0, it’s necessary to explicitly set the correct device to ensure computations run on the intended GPU.
-    // Note: Based on our measurements on the A100 GPU with CUDA 12, the execution time for cudaSetDevice is approximately 0.004 ms, which is negligible
-    // and does not impact runtime performance.
-    cudaSetDevice(device_id_);
-
     Ort::KernelContext ctx(context);
 
     TensorrtFuncState* trt_state = reinterpret_cast<TensorrtFuncState*>(state);
@@ -4221,15 +4220,6 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(con
   // Create compute function
   compute_info.compute_func = [this](FunctionState state, const OrtApi* api, OrtKernelContext* context) {
-    // The GPU device is set again here to handle multithreading scenarios.
-    // Consider the following:
-    // Users can create multiple threads to initialize separate inference sessions on different devices (not just the default device 0)
-    // Later, additional threads may be spawned to execute inference_session.Run(), which calls this compute function.
-    // Since new threads default to using device 0, it’s necessary to explicitly set the correct device to ensure computations run on the intended GPU.
-    // Note: Based on our measurements on the A100 GPU with CUDA 12, the execution time for cudaSetDevice is approximately 0.004 ms, which is negligible
-    // and does not impact runtime performance.
-    cudaSetDevice(device_id_);
-
     Ort::KernelContext ctx(context);
 
     TensorrtShortFuncState* trt_state = reinterpret_cast<TensorrtShortFuncState*>(state);

From 1c2ff53a64f44ae03c86fdedb672405585423e11 Mon Sep 17 00:00:00 2001
From: Chi Lo
Date: Wed, 12 Mar 2025 17:47:11 -0700
Subject: [PATCH 5/5] move cudaSetDevice to compute_func

---
 .../tensorrt/tensorrt_execution_provider.cc | 26 +++++++++++++------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
index b75cd88ad3fb9..00f53b96f931a 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@@ -1865,14 +1865,6 @@ std::unique_ptr<IDataTransfer> TensorrtExecutionProvider::GetDataTransfer() cons
 }
 
 Status TensorrtExecutionProvider::OnRunStart(const onnxruntime::RunOptions& /*run_options*/) {
-  // The GPU device is set again here to handle multithreading scenarios.
-  // Consider the following:
-  // Users can create multiple threads to initialize separate inference sessions on different devices (not just the default device 0)
-  // Later, additional threads may be spawned to execute inference_session.Run(), which calls this compute function.
-  // Since new threads default to using device 0, it’s necessary to explicitly set the correct device to ensure computations run on the intended GPU.
-  // Note: Based on our measurements on the A100 GPU with CUDA 12, the execution time for cudaSetDevice is approximately 0.004 ms, which is negligible
-  // and does not impact runtime performance.
-  cudaSetDevice(device_id_);
   return Status::OK();
 }
 
@@ -3546,6 +3538,15 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
   // Create compute function
   compute_info.compute_func = [this](FunctionState state, const OrtApi* api, OrtKernelContext* context) {
+    // The GPU device is set again here to handle multithreading scenarios.
+    // Consider the following:
+    // Users can create multiple threads to initialize separate inference sessions on different devices (not just the default device 0)
+    // Later, additional threads may be spawned to execute inference_session.Run(), which calls this compute function.
+    // Since new threads default to using device 0, it’s necessary to explicitly set the correct device to ensure computations run on the intended GPU.
+    // Note: Based on our measurements on the A100 GPU with CUDA 12, the execution time for cudaSetDevice is approximately 0.004 ms, which is negligible
+    // and does not impact runtime performance.
+    CUDA_CALL_THROW(cudaSetDevice(device_id_));
+
     Ort::KernelContext ctx(context);
 
     TensorrtFuncState* trt_state = reinterpret_cast<TensorrtFuncState*>(state);
@@ -4220,6 +4221,15 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(con
   // Create compute function
   compute_info.compute_func = [this](FunctionState state, const OrtApi* api, OrtKernelContext* context) {
+    // The GPU device is set again here to handle multithreading scenarios.
+    // Consider the following:
+    // Users can create multiple threads to initialize separate inference sessions on different devices (not just the default device 0)
+    // Later, additional threads may be spawned to execute inference_session.Run(), which calls this compute function.
+    // Since new threads default to using device 0, it’s necessary to explicitly set the correct device to ensure computations run on the intended GPU.
+    // Note: Based on our measurements on the A100 GPU with CUDA 12, the execution time for cudaSetDevice is approximately 0.004 ms, which is negligible
+    // and does not impact runtime performance.
+    CUDA_CALL_THROW(cudaSetDevice(device_id_));
+
     Ort::KernelContext ctx(context);
 
     TensorrtShortFuncState* trt_state = reinterpret_cast<TensorrtShortFuncState*>(state);
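
---
Reviewer note: below is a minimal sketch (not part of the patch) of the multithreading
scenario the new comment describes. Two sessions are pinned to different GPUs via
OrtTensorRTProviderOptions, and Run() is invoked from freshly spawned threads, which
start on CUDA device 0 by default. The model path, input shape, and the I/O names
"input"/"output" are placeholder assumptions; it presumes a machine with two GPUs.

// Hypothetical repro sketch; assumes a 2-GPU machine and a placeholder model.
#include <onnxruntime_cxx_api.h>

#include <thread>
#include <vector>

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "multi_gpu_trt");

  // Build a session whose TensorRT EP is pinned to the given GPU.
  auto make_session = [&env](int device_id) {
    Ort::SessionOptions so;
    OrtTensorRTProviderOptions trt_options{};
    trt_options.device_id = device_id;
    so.AppendExecutionProvider_TensorRT(trt_options);
    return Ort::Session(env, "model.onnx", so);  // placeholder model path
  };

  Ort::Session s0 = make_session(0);
  Ort::Session s1 = make_session(1);

  // Each Run() happens on a brand-new thread. New threads see CUDA device 0
  // by default, which is why compute_func must call cudaSetDevice(device_id_)
  // before touching the engine that was built for device 1.
  auto run = [](Ort::Session& session) {
    std::vector<int64_t> shape{1, 3, 224, 224};  // placeholder input shape
    std::vector<float> data(1 * 3 * 224 * 224, 0.0f);
    auto mem = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
    Ort::Value input = Ort::Value::CreateTensor<float>(
        mem, data.data(), data.size(), shape.data(), shape.size());
    const char* in_names[] = {"input"};    // placeholder I/O names
    const char* out_names[] = {"output"};
    session.Run(Ort::RunOptions{nullptr}, in_names, &input, 1, out_names, 1);
  };

  std::thread t0(run, std::ref(s0));
  std::thread t1(run, std::ref(s1));
  t0.join();
  t1.join();
  return 0;
}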