Merged
Changes from all commits (62 commits)
9e97e26
Add TensorRT decoder plugin for quantum error correction
wsttiger Sep 30, 2025
79a7e19
Formatting
wsttiger Sep 30, 2025
d88452f
Removed hardcoded paths to TensorRT installation
wsttiger Oct 1, 2025
e7ec736
Merge branch 'main' into add_trt_decoder
wsttiger Oct 1, 2025
7cbbeb1
Incorrect URL
wsttiger Oct 1, 2025
5287b09
Fixed up the references to cuda in CMake
wsttiger Oct 2, 2025
88b3cc1
Switched to finding cuda toolkit instead of hardcoding cuda headers
wsttiger Oct 3, 2025
1bfbb3d
Disabled trt_decoder for ARM
wsttiger Oct 6, 2025
4c040dc
Redo platform check for x86
wsttiger Oct 6, 2025
e01de62
Added include directory for the Arm64 arch
wsttiger Oct 7, 2025
ce0f24c
Removed cudaqx namespace
wsttiger Oct 7, 2025
e641f49
Added copyright notice
wsttiger Oct 7, 2025
f3d7a95
Added CUDAQ logging + minor details
wsttiger Oct 7, 2025
6533797
Handled CUDA (potential) errors + formatting
wsttiger Oct 8, 2025
83e957b
Removed block_size from trt_decoder logic (there's no parity check ma…
wsttiger Oct 8, 2025
cdb1754
Default initialization + formatting
wsttiger Oct 8, 2025
b6cfa6f
Added LFS (no assets yet), added training for E2E test with test AI d…
wsttiger Oct 12, 2025
aea8d56
Added test AI model (onnx)
wsttiger Oct 12, 2025
036b331
Formatting
wsttiger Oct 12, 2025
1deb4f5
Added test_trt_decoder.py - for the python path
wsttiger Oct 14, 2025
5db5c88
Added trt-decoder optional dependency to cudaq_qec pyproject.toml
wsttiger Oct 15, 2025
2fb89c7
Added platform detection to test_trt_decoder.py
wsttiger Oct 15, 2025
8fa4ef8
Modified platform checks
wsttiger Oct 15, 2025
4f133f9
Formatting
wsttiger Oct 15, 2025
fb16b36
Merge branch 'main' into add_trt_decoder
wsttiger Oct 16, 2025
c9e563f
DCO Remediation Commit for Scott Thornton <[email protected]>
wsttiger Oct 16, 2025
392f5de
Added installation of TensorRT to build_wheels.yaml
wsttiger Oct 17, 2025
42c2b32
Made minor mods to build_wheels.yaml
wsttiger Oct 17, 2025
5ad505b
Hardcoding TensorRT package name for now
wsttiger Oct 21, 2025
2d08b88
Added debugging info
wsttiger Oct 21, 2025
4defcfd
Edits to build_wheel.sh
wsttiger Oct 21, 2025
62cdbac
more edits of build_wheel.yaml for debugging
wsttiger Oct 21, 2025
d4e79a9
Added TensorRT library path to LD_LIBRARY_PATH for auditwheel
wsttiger Oct 21, 2025
d8489f7
modified test_wheels to check for GPU's
wsttiger Oct 21, 2025
6ba9191
Merge from main - fixed conflict in test_wheels.sh
wsttiger Oct 22, 2025
eea3198
Removed the hardcoding of TensorRT version and path from the build_wh…
wsttiger Oct 22, 2025
33359f5
Added all optional dependencies to cudaq_qec
wsttiger Oct 22, 2025
5e38f6a
Fixed small bug in pyproject.toml for QEC
wsttiger Oct 23, 2025
be6a52f
Added extra NVIDIA index for installing dependencies (wheels)
wsttiger Oct 23, 2025
f152a6c
Fixes for the build_wheels.yaml CI/CD pipeline
wsttiger Oct 24, 2025
55ae990
Fixed RPATH in trt_decoder (plugin)
wsttiger Oct 24, 2025
07ed875
Remove extra bloat
wsttiger Oct 24, 2025
108a8b9
Installed and enabled Git LFS for trt_decoder tests
wsttiger Oct 25, 2025
d090aca
Moved build_engine_from_onnx.py to scripts directory
wsttiger Oct 27, 2025
2e727c0
Parameterize CUDA version for TensorRT installation in CI workflows
wsttiger Oct 27, 2025
626ee9a
Map CUDA 12.6 to 12.9 for TensorRT downloads in build_wheels workflow
wsttiger Oct 27, 2025
65fdcee
Fixed minor bug
wsttiger Oct 27, 2025
ac18d2b
Removed the x86_64 constraint on the trt_decoder tests, also removed …
wsttiger Oct 28, 2025
b645807
Formatting
wsttiger Oct 28, 2025
8e0ab06
Re-enabled building trt_decoder for ARM architecture
wsttiger Oct 28, 2025
60e703d
Added try/catch around instantiating decoder
wsttiger Oct 28, 2025
e9825b3
Formatting
wsttiger Oct 28, 2025
c2c61db
Re-enabled ARM in pyproject.toml.cu13
wsttiger Oct 28, 2025
2b39242
no tensorrt-cu13
wsttiger Oct 28, 2025
663ba48
tensorrt-cu13 is back in
wsttiger Oct 28, 2025
9626a12
Re-calibrated the warning messages
wsttiger Oct 31, 2025
13e5c69
Created a more focused set of test data
wsttiger Oct 31, 2025
6ea6ec9
Added check for CUDA
wsttiger Oct 31, 2025
32e8e64
Formatting
wsttiger Oct 31, 2025
3642fc8
Merge branch 'main' into add_trt_decoder
wsttiger Oct 31, 2025
30da7ce
Reduced the number of syndromes to 30
wsttiger Oct 31, 2025
5389216
nvidia-cublas-cuXXX -> nvidia-cublas
wsttiger Nov 1, 2025
1 change: 1 addition & 0 deletions .gitattributes
@@ -0,0 +1 @@
*.onnx filter=lfs diff=lfs merge=lfs -text
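For context, the attribute line above is what "git lfs track" generates; a fresh clone needs Git LFS set up before the ONNX test asset will materialize. A minimal sketch using standard Git LFS commands (not part of this diff):

# One-time setup: install the LFS hooks, then track ONNX models.
# 'git lfs track "*.onnx"' writes exactly the .gitattributes entry above.
git lfs install
git lfs track "*.onnx"
# In an existing clone, replace LFS pointer files with the real assets.
git lfs pull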
18 changes: 16 additions & 2 deletions .github/workflows/all_libs.yaml
@@ -64,7 +64,21 @@ jobs:

- name: Install build requirements
run: |
apt install -y --no-install-recommends gfortran libblas-dev
apt install -y --no-install-recommends gfortran libblas-dev wget

- name: Install TensorRT (amd64)
if: matrix.platform == 'amd64'
run: |
apt-cache search tensorrt | awk '{print "Package: "$1"\nPin: version *+cuda${{ matrix.cuda_version }}\nPin-Priority: 1001\n"}' | tee /etc/apt/preferences.d/tensorrt-cuda${{ matrix.cuda_version }}.pref > /dev/null
apt update
apt install -y tensorrt-dev

- name: Install TensorRT (arm64)
if: matrix.platform == 'arm64'
run: |
apt-cache search tensorrt | awk '{print "Package: "$1"\nPin: version *+cuda13.0\nPin-Priority: 1001\n"}' | tee /etc/apt/preferences.d/tensorrt-cuda13.0.pref > /dev/null
Collaborator:
This line is installing CUDA 13 regardless of what ${{matrix.cuda_version}} is, so it is installing it in our 12.6 images, too. I believe (?) this should not be installed for CUDA 12.6 because we are not supporting CUDA 12 + ARM for this, right?

Point of reference: https://github.com/NVIDIA/cudaqx/actions/runs/18989982159/job/54240883357#step:12:41 shows the CUDA 13 version being installed in an ARM CUDA 12.6 job. (I found this because our GitLab pipeline is broken for ARM right now, and I am still investigating.)

Collaborator:
This may be addressed by #332. @wsttiger take a look and let me know what you think.

apt update
apt install -y tensorrt-dev
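One hedged sketch of a fix for the reviewer's concern above (the actual resolution may come from #332 instead): gate the arm64 install on the matrix CUDA version so CUDA 12.6 images never receive the CUDA 13 TensorRT packages. This assumes the step exports matrix.cuda_version into a CUDA_VERSION shell variable:

# Sketch only: on arm64, apply the CUDA 13 TensorRT pin only on CUDA 13
# images, since CUDA 12 + ARM is not a supported combination for this plugin.
if [ "${CUDA_VERSION%%.*}" = "13" ]; then
  apt-cache search tensorrt \
    | awk '{print "Package: "$1"\nPin: version *+cuda13.0\nPin-Priority: 1001\n"}' \
    > /etc/apt/preferences.d/tensorrt-cuda13.0.pref
  apt update && apt install -y tensorrt-dev
fi

An equivalent workflow-level guard would be a step condition such as if: matrix.platform == 'arm64' && matrix.cuda_version == '13.0'.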

- name: Build
id: build
@@ -92,7 +106,7 @@ jobs:
LD_LIBRARY_PATH: ${{ env.MPI_PATH }}/lib:${{ env.LD_LIBRARY_PATH }}
shell: bash
run: |
pip install numpy pytest cupy-cuda${{ steps.config.outputs.cuda_major }}x cuquantum-cu${{ steps.config.outputs.cuda_major }} torch lightning ml_collections mpi4py transformers quimb opt_einsum torch nvidia-cublas-cu${{ steps.config.outputs.cuda_major }} cuquantum-python-cu${{ steps.config.outputs.cuda_major }}==25.09
pip install numpy pytest cupy-cuda${{ steps.config.outputs.cuda_major }}x cuquantum-cu${{ steps.config.outputs.cuda_major }} torch lightning ml_collections mpi4py transformers quimb opt_einsum torch nvidia-cublas cuquantum-python-cu${{ steps.config.outputs.cuda_major }}==25.09
# The following tests are needed for docs/sphinx/examples/qec/python/tensor_network_decoder.py.
if [ "$(uname -m)" == "x86_64" ]; then
# Stim is not currently available on manylinux ARM wheels, so only
64 changes: 58 additions & 6 deletions .github/workflows/build_wheels.yaml
@@ -63,16 +63,59 @@ jobs:
build-type: Release

steps:
- name: Get code
uses: actions/checkout@v4
with:
set-safe-directory: true

- name: Configure
id: config
run: |
cuda_major=`echo ${{ matrix.cuda_version }} | cut -d . -f1`
echo "cuda_major=$cuda_major" >> $GITHUB_OUTPUT
# Map CUDA 12.6 to 12.9 for TensorRT filename
if [ "${{ matrix.cuda_version }}" == "12.6" ]; then
tensorrt_cuda_version="12.9"
tensorrt_cuda_major="12"
else
tensorrt_cuda_version="${{ matrix.cuda_version }}"
tensorrt_cuda_major="$cuda_major"
fi
echo "tensorrt_cuda_version=$tensorrt_cuda_version" >> $GITHUB_OUTPUT
echo "tensorrt_cuda_major=$tensorrt_cuda_major" >> $GITHUB_OUTPUT
tensorrt_major_version="10.13.3"
tensorrt_minor_version="9"
tensorrt_version="${tensorrt_major_version}.${tensorrt_minor_version}"
echo "tensorrt_major_version=$tensorrt_major_version" >> $GITHUB_OUTPUT
echo "tensorrt_version=$tensorrt_version" >> $GITHUB_OUTPUT

- name: Install TensorRT (amd64)
shell: bash
if: matrix.platform == 'amd64'
run: |
mkdir -p /trt_download
pushd /trt_download
pwd
wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/${{ steps.config.outputs.tensorrt_major_version }}/tars/TensorRT-${{ steps.config.outputs.tensorrt_version }}.Linux.x86_64-gnu.cuda-${{ steps.config.outputs.tensorrt_cuda_version }}.tar.gz
tar -zxvf TensorRT-${{ steps.config.outputs.tensorrt_version }}.Linux.x86_64-gnu.cuda-${{ steps.config.outputs.tensorrt_cuda_version }}.tar.gz
pwd
popd
find /trt_download/TensorRT-${{ steps.config.outputs.tensorrt_version }} -name "NvInfer.h"
find /trt_download/TensorRT-${{ steps.config.outputs.tensorrt_version }} -name "NvInferRuntime.h"

- name: Install TensorRT (arm64)
shell: bash
if: matrix.platform == 'arm64'
run: |
mkdir -p /trt_download
pushd /trt_download
pwd
wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/${{ steps.config.outputs.tensorrt_major_version }}/tars/TensorRT-${{ steps.config.outputs.tensorrt_version }}.Linux.aarch64-gnu.cuda-13.0.tar.gz
tar -zxvf TensorRT-${{ steps.config.outputs.tensorrt_version }}.Linux.aarch64-gnu.cuda-13.0.tar.gz
pwd
popd
find /trt_download/TensorRT-${{ steps.config.outputs.tensorrt_version }} -name "NvInfer.h"
find /trt_download/TensorRT-${{ steps.config.outputs.tensorrt_version }} -name "NvInferRuntime.h"

- name: Get code
uses: actions/checkout@v4
with:
set-safe-directory: true

# Do this early to help validate user inputs (if present)
- name: Fetch assets
@@ -123,6 +166,7 @@ jobs:
--cudaq-prefix /usr/local/cudaq \
--build-type ${{ inputs.build_type }} \
--python-version ${{ matrix.python }} \
--tensorrt-path /trt_download/TensorRT-${{ steps.config.outputs.tensorrt_version }} \
--version ${{ inputs.version || '0.99.99' }}

- name: Upload artifact
@@ -332,11 +376,19 @@ jobs:
cuda_version: ['12.6', '13.0']

steps:

- name: Install git for LFS
shell: bash
run: |
apt update
apt install -y --no-install-recommends git git-lfs

- name: Get code
uses: actions/checkout@v4
with:
set-safe-directory: true

lfs: true # download assets file(s) for TRT tests

- name: Configure
id: config
run: |
19 changes: 18 additions & 1 deletion .github/workflows/lib_qec.yaml
@@ -61,6 +61,23 @@ jobs:
# ========================================================================
# Build library
# ========================================================================
- name: Install build requirements
run: |
apt install -y --no-install-recommends gfortran libblas-dev wget

- name: Install TensorRT (amd64)
if: matrix.platform == 'amd64'
run: |
apt-cache search tensorrt | awk '{print "Package: "$1"\nPin: version *+cuda${{ matrix.cuda_version }}\nPin-Priority: 1001\n"}' | tee /etc/apt/preferences.d/tensorrt-cuda${{ matrix.cuda_version }}.pref > /dev/null
apt update
apt install -y tensorrt-dev

- name: Install TensorRT (arm64)
if: matrix.platform == 'arm64'
run: |
apt-cache search tensorrt | awk '{print "Package: "$1"\nPin: version *+cuda13.0\nPin-Priority: 1001\n"}' | tee /etc/apt/preferences.d/tensorrt-cuda13.0.pref > /dev/null
apt update
apt install -y tensorrt-dev

- name: Build
id: build
@@ -86,7 +103,7 @@
- name: Install python requirements
shell: bash
run: |
pip install numpy pytest cupy-cuda${{ steps.config.outputs.cuda_major }}x cuquantum-cu${{ steps.config.outputs.cuda_major }} quimb opt_einsum torch nvidia-cublas-cu${{ steps.config.outputs.cuda_major }} cuquantum-python-cu${{ steps.config.outputs.cuda_major }}==25.09
pip install numpy pytest cupy-cuda${{ steps.config.outputs.cuda_major }}x cuquantum-cu${{ steps.config.outputs.cuda_major }} quimb opt_einsum torch nvidia-cublas cuquantum-python-cu${{ steps.config.outputs.cuda_major }}==25.09
# The following tests are needed for docs/sphinx/examples/qec/python/tensor_network_decoder.py.
if [ "$(uname -m)" == "x86_64" ]; then
# Stim is not currently available on manylinux ARM wheels, so only
19 changes: 17 additions & 2 deletions .github/workflows/scripts/build_wheels.sh
@@ -22,6 +22,8 @@ show_help() {
echo " --cudaq-prefix Path to CUDA-Q's install prefix"
echo " (default: \$HOME/.cudaq)"
echo " --python-version Python version to build wheel for (e.g. 3.11)"
echo " --tensorrt-path Path to TensorRT installation directory"
echo " (default: /trt_download/TensorRT-10.13.3.9)"
echo " --devdeps Build wheels suitable for internal testing"
echo " (not suitable for distribution but sometimes"
echo " helpful for debugging)"
@@ -68,6 +70,15 @@ parse_options() {
exit 1
fi
;;
--tensorrt-path)
if [[ -n "$2" && "$2" != -* ]]; then
tensorrt_path=("$2")
shift 2
else
echo "Error: Argument for $1 is missing" >&2
exit 1
fi
;;
--devdeps)
devdeps=true
shift 1
@@ -99,6 +110,7 @@ parse_options() {
cudaq_prefix=$HOME/.cudaq
build_type=Release
python_version=3.11
tensorrt_path=/trt_download/TensorRT-10.13.3.9
devdeps=false
wheels_version=0.0.0
cuda_version=12
@@ -136,7 +148,7 @@ export CUDAQX_SOLVERS_VERSION=$wheels_version
cd libs/qec
cp pyproject.toml.cu${cuda_version} pyproject.toml

SKBUILD_CMAKE_ARGS="-DCUDAQ_DIR=$cudaq_prefix/lib/cmake/cudaq"
SKBUILD_CMAKE_ARGS="-DCUDAQ_DIR=$cudaq_prefix/lib/cmake/cudaq;-DTENSORRT_ROOT=$tensorrt_path"
if ! $devdeps; then
SKBUILD_CMAKE_ARGS+=";-DCMAKE_CXX_COMPILER_EXTERNAL_TOOLCHAIN=/opt/rh/gcc-toolset-11/root/usr/lib/gcc/${ARCH}-redhat-linux/11/"
fi
@@ -146,9 +158,12 @@ $python -m build --wheel

CUDAQ_EXCLUDE_LIST=$(for f in $(find $cudaq_prefix/lib -name "*.so" -printf "%P\n" | sort); do echo "--exclude $f"; done | tr '\n' ' ')

LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$(pwd)/_skbuild/lib" \
LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$(pwd)/_skbuild/lib:$tensorrt_path/lib" \
$python -m auditwheel -v repair dist/*.whl $CUDAQ_EXCLUDE_LIST \
--wheel-dir /wheels \
--exclude libcudart.so.${cuda_version} \
--exclude libnvinfer.so.10 \
--exclude libnvonnxparser.so.10 \
${PLAT_STR}

# ==============================================================================
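For reference, a local invocation of this script with the new flag might look like the following; the paths mirror the script defaults and the CI invocation in build_wheels.yaml above, but are otherwise illustrative:

# Build the QEC wheel against a locally unpacked TensorRT tarball.
bash .github/workflows/scripts/build_wheels.sh \
  --cudaq-prefix /usr/local/cudaq \
  --build-type Release \
  --python-version 3.11 \
  --tensorrt-path /trt_download/TensorRT-10.13.3.9 \
  --version 0.99.99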
3 changes: 3 additions & 0 deletions assets/tests/surface_code_decoder.onnx
Git LFS file not shown
56 changes: 56 additions & 0 deletions libs/qec/include/cudaq/qec/trt_decoder_internal.h
@@ -0,0 +1,56 @@
/*******************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA Corporation & Affiliates. *
* All rights reserved. *
* *
* This source code and the accompanying materials are made available under *
* the terms of the Apache License 2.0 which accompanies this distribution. *
******************************************************************************/

#pragma once

#include "cudaq/qec/decoder.h"
#include <memory>
#include <string>
#include <vector>

#include "NvInfer.h"
#include "NvOnnxParser.h"

namespace cudaq::qec::trt_decoder_internal {

/// @brief Validates TRT decoder parameters
/// @param params The parameter map to validate
/// @throws std::runtime_error if parameters are invalid
void validate_trt_decoder_parameters(const cudaqx::heterogeneous_map &params);

/// @brief Loads a binary file into memory
/// @param filename Path to the file to load
/// @return Vector containing the file contents
/// @throws std::runtime_error if file cannot be opened
std::vector<char> load_file(const std::string &filename);

/// @brief Builds a TensorRT engine from an ONNX model
/// @param onnx_model_path Path to the ONNX model file
/// @param params Configuration parameters
/// @param logger TensorRT logger instance
/// @return Unique pointer to the built TensorRT engine
/// @throws std::runtime_error if engine building fails
std::unique_ptr<nvinfer1::ICudaEngine>
build_engine_from_onnx(const std::string &onnx_model_path,
const cudaqx::heterogeneous_map &params,
nvinfer1::ILogger &logger);

/// @brief Saves a TensorRT engine to a file
/// @param engine The engine to save
/// @param file_path Path where to save the engine
/// @throws std::runtime_error if saving fails
void save_engine_to_file(nvinfer1::ICudaEngine *engine,
const std::string &file_path);

/// @brief Parses and configures precision settings for TensorRT
/// @param precision The precision string (fp16, bf16, int8, fp8, noTF32, best)
/// @param config TensorRT builder config instance
void parse_precision(const std::string &precision,
nvinfer1::IBuilderConfig *config);

} // namespace cudaq::qec::trt_decoder_internal
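A hedged sketch of what a standalone consumer of this header would need at compile and link time; the real build goes through the plugin's CMake (TENSORRT_ROOT), and every path here, as well as my_tool.cpp itself, is hypothetical. The library names match the auditwheel excludes above (libnvinfer.so.10, libnvonnxparser.so.10):

# Assumes TensorRT unpacked at /trt_download/TensorRT-10.13.3.9 and the
# CUDA-QX QEC headers installed under /usr/local/cudaqx (illustrative paths).
g++ -std=c++17 my_tool.cpp \
  -I/usr/local/cudaqx/include \
  -I/trt_download/TensorRT-10.13.3.9/include \
  -L/trt_download/TensorRT-10.13.3.9/lib \
  -lnvinfer -lnvonnxparser \
  -o my_tool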
1 change: 1 addition & 0 deletions libs/qec/lib/CMakeLists.txt
@@ -25,6 +25,7 @@ add_library(${LIBRARY_NAME} SHARED
)

add_subdirectory(decoders/plugins/example)
add_subdirectory(decoders/plugins/trt_decoder)
add_subdirectory(codes)
add_subdirectory(device)
