67 changes: 67 additions & 0 deletions .github/workflows/arm.yml
@@ -0,0 +1,67 @@
name: ARM-build

on:
push:
branches:
- 'main'
- 'master'
- 'develop'
- 'release/**'
tags:
- '**'
pull_request:
types: [opened,synchronize]
paths-ignore:
- 'doc/**'
workflow_dispatch:
inputs:
debug_enabled:
description: 'Run the build with tmate debugging enabled (set via the debug_enabled input; see https://github.com/marketplace/actions/debugging-with-tmate)'
required: false
default: false

concurrency:
group: ${{ github.workflow }}-${{ (github.head_ref && github.ref) || github.run_id }}
cancel-in-progress: true

jobs:
arm-omp:
strategy:
fail-fast: false
matrix:
config:
- {shared: "ON", build_type: "Debug", name: "arm/omp/debug/shared", mixed: "OFF", half: "ON", bfloat16: "OFF"}
- {shared: "OFF", build_type: "Release", name: "arm/omp/release/static", mixed: "ON", half: "ON", bfloat16: "OFF"}
- {shared: "ON", build_type: "Release", name: "arm/omp/release/shared", mixed: "ON", half: "OFF", bfloat16: "ON"}
- {shared: "ON", build_type: "Release", name: "arm/omp/release/shared-16bit", mixed: "ON", half: "ON", bfloat16: "ON"}
name: ${{ matrix.config.name }}
runs-on: [ubuntu-24.04-arm]

steps:
- name: Checkout the latest code (shallow clone)
uses: actions/checkout@v4

- name: info
run: |
g++ -v
cmake --version

- name: Debug over SSH (tmate)
uses: mxschmitt/action-tmate@v3
if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.debug_enabled }}

- name: configure
run: |
mkdir build
mkdir install
export INSTALL_PREFIX=`pwd`/install
cd build
cmake .. -DCMAKE_CXX_FLAGS="-Wpedantic -ffp-contract=off" -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }} -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }} -DGINKGO_ENABLE_HALF=${{ matrix.config.half }} -DGINKGO_ENABLE_BFLOAT16=${{ matrix.config.bfloat16 }}
make -j4
ctest -j4 --output-on-failure

- name: install
run: |
cd build
make install
make test_install
29 changes: 29 additions & 0 deletions .gitlab-ci.yml
@@ -218,6 +218,20 @@ build/cuda126/nompi/gcc/cuda/release/shared:
BUILD_TYPE: "Release"
MODULE_LOAD: "cmake/3.30.8 cuda/12.6.3 gcc/13.3.0"

build/cuda130/nompi/gcc/cuda/release/shared:
extends:
- .build_and_test_tum_template
- .default_variables
- .full_test_condition
- .use_tum-nvidia
variables:
BUILD_CUDA: "ON"
BUILD_HWLOC: "OFF"
ENABLE_HALF: "ON"
ENABLE_BFLOAT16: "ON"
BUILD_TYPE: "Release"
MODULE_LOAD: "cmake/3.30.8 cuda/13.0.2 gcc/14.3.0"

# ROCm 4.5 and friends
build/amd/nompi/gcc/rocm45/release/shared:
extends:
@@ -341,6 +355,21 @@ build/amd/openmpi/gcc/rocm634_wo_omp/release/shared:
BUILD_TYPE: "Release"
MODULE_LOAD: "cmake/3.29.6 rocm/6.3.4 gcc/13.3.0 openmpi/5.0.7"

# mi50 is not officially supported by ROCm >= 7
build/amd/nompi/gcc/rocm710/release/shared:
extends:
- .build_and_test_tum_template
- .default_variables
- .full_test_condition
- .use_tum-amd-mi210
variables:
BUILD_HIP: "ON"
BUILD_HWLOC: "OFF"
BUILD_MPI: "OFF"
BUILD_OMP: "OFF"
BUILD_TYPE: "Release"
MODULE_LOAD: "cmake/3.29.6 rocm/7.1.0 gcc/14.3.0"

# no cuda but latest gcc and clang
build/nocuda/nompi/gcc/core/debug/static:
extends:
6 changes: 6 additions & 0 deletions .gitlab/image.yml
@@ -79,6 +79,12 @@
- amd-gpus
- tum

.use_tum-amd-mi210:
image: rocky_tum
tags:
- amd-gpus-mi210
- tum

.use_tum-intel:
image: rocky_tum_intel
tags:
8 changes: 7 additions & 1 deletion .gitlab/scripts.yml
@@ -12,7 +12,13 @@
- export CCACHE_DIR=${CCACHE_DIR}
- export CCACHE_MAXSIZE=${CCACHE_MAXSIZE}
- source /storage/apps/opt/spack/share/spack/setup-env.sh
- export MODULEPATH=/storage/apps/opt/rocm-modules:/storage/apps/opt/spack/share/spack/lmod/linux-rocky9-x86_64/Core
- mkdir -p lmod/cuda
- echo 'prepend_path("PATH","/storage/apps/usr/local/cuda-13.0.2/bin")' > lmod/cuda/13.0.2.lua
- echo 'prepend_path("CMAKE_PREFIX_PATH","/storage/apps/usr/local/cuda-13.0.2/.")' >> lmod/cuda/13.0.2.lua
- echo 'setenv("CUDA_HOME","/storage/apps/usr/local/cuda-13.0.2")' >> lmod/cuda/13.0.2.lua
- echo 'setenv("NVHPC_CUDA_HOME","/storage/apps/usr/local/cuda-13.0.2")' >> lmod/cuda/13.0.2.lua
- export MODULEPATH="$(pwd)/lmod":/storage/apps/opt/rocm-modules:/storage/apps/opt/spack/share/spack/lmod/linux-rocky9-x86_64/Core
- module av

.before_script_git_template:
before_script:
12 changes: 12 additions & 0 deletions CMakeLists.txt
@@ -234,6 +234,18 @@ if(GINKGO_BUILD_HIP)
"Disable custom thrust namespace for hip before 5.7 because hip does not fully support it before 5.7"
)
set(GINKGO_HIP_CUSTOM_THRUST_NAMESPACE OFF)
elseif(
GINKGO_HIP_PLATFORM_AMD
AND GINKGO_HIP_VERSION VERSION_GREATER_EQUAL 7.1
AND GINKGO_HIP_VERSION VERSION_LESS 7.2
)
# https://github.com/ROCm/rocm-libraries/pull/1769 should fix this issue in ROCm 7.1.1.
# The HIP version does not exactly match the ROCm version number, so we need to
# wait for ROCm 7.1.1 before we can set the proper range for ROCm 7.1.0.
message(
STATUS
"Disable custom thrust namespace for hip 7.1 because hip does not fully support the custom namespace."
)
set(GINKGO_HIP_CUSTOM_THRUST_NAMESPACE OFF)
else()
message(STATUS "Enable custom thrust namespace for hip")
set(GINKGO_HIP_CUSTOM_THRUST_NAMESPACE ON)
3 changes: 3 additions & 0 deletions benchmark/CMakeLists.txt
@@ -40,6 +40,9 @@ function(ginkgo_benchmark_cusparse_linops type def)
cusparse_linops_${type}
PRIVATE Ginkgo::ginkgo CUDA::cudart CUDA::cublas CUDA::cusparse
)
if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 13)
target_link_libraries(cusparse_linops_${type} PRIVATE Thrust)
endif()
ginkgo_compile_features(cusparse_linops_${type})
endfunction()

5 changes: 5 additions & 0 deletions cmake/cuda.cmake
@@ -14,6 +14,11 @@ endif()

find_package(NVTX REQUIRED)

if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 13)
find_package(Thrust REQUIRED)
thrust_create_target(Thrust)
[inline review comment, Member] Shouldn't we also add this to ginkgo_cuda?

[reply, Member/Author] I think we do not need it, because those sources are compiled by NVCC. I still apply it there to get a consistent setup.

endif()

if(
CMAKE_CUDA_HOST_COMPILER
AND NOT CMAKE_CXX_COMPILER STREQUAL CMAKE_CUDA_HOST_COMPILER
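On the consumer side of this change: with CUDA >= 13 the Thrust/CCCL headers are no longer picked up implicitly by host-compiled translation units, which is presumably why the benchmark and cuda targets in this PR link the Thrust interface target explicitly. A minimal sketch of that pattern for a hypothetical host-compiled target (uses_thrust_host and host_only.cpp are illustrative names, not part of this PR); it assumes the find_package(Thrust)/thrust_create_target(Thrust) setup from cmake/cuda.cmake above:

# host_only.cpp is a plain C++ file that includes <thrust/...> headers
add_library(uses_thrust_host host_only.cpp)
target_link_libraries(uses_thrust_host PRIVATE CUDA::cudart)
if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 13)
    # NVCC-compiled sources find Thrust on their own; host compilers need
    # the interface target to get the include paths.
    target_link_libraries(uses_thrust_host PRIVATE Thrust)
endif()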
120 changes: 67 additions & 53 deletions common/cuda_hip/solver/cb_gmres_kernels.cpp
@@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

@@ -623,14 +623,18 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
const auto block_dim = default_block_size;
constexpr auto block_size = default_block_size;

initialize_kernel<block_size>
<<<grid_dim, block_dim, 0, exec->get_stream()>>>(
b->get_size()[0], b->get_size()[1], krylov_dim,
as_device_type(b->get_const_values()), b->get_stride(),
as_device_type(residual->get_values()), residual->get_stride(),
as_device_type(givens_sin->get_values()), givens_sin->get_stride(),
as_device_type(givens_cos->get_values()), givens_cos->get_stride(),
as_device_type(stop_status->get_data()));
if (grid_dim != 0) {
initialize_kernel<block_size>
<<<grid_dim, block_dim, 0, exec->get_stream()>>>(
b->get_size()[0], b->get_size()[1], krylov_dim,
as_device_type(b->get_const_values()), b->get_stride(),
as_device_type(residual->get_values()), residual->get_stride(),
as_device_type(givens_sin->get_values()),
givens_sin->get_stride(),
as_device_type(givens_cos->get_values()),
givens_cos->get_stride(),
as_device_type(stop_status->get_data()));
}
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_BASE(
@@ -661,12 +665,14 @@ void restart(std::shared_ptr<const DefaultExecutor> exec,
constexpr auto block_size = default_block_size;
const auto stride_arnoldi = arnoldi_norm->get_stride();

restart_1_kernel<block_size>
<<<grid_dim_1, block_dim, 0, exec->get_stream()>>>(
residual->get_size()[0], residual->get_size()[1], krylov_dim,
acc::as_device_range(krylov_bases),
as_device_type(residual_norm_collection->get_values()),
residual_norm_collection->get_stride());
if (grid_dim_1 != 0) {
restart_1_kernel<block_size>
<<<grid_dim_1, block_dim, 0, exec->get_stream()>>>(
residual->get_size()[0], residual->get_size()[1], krylov_dim,
acc::as_device_range(krylov_bases),
as_device_type(residual_norm_collection->get_values()),
residual_norm_collection->get_stride());
}
kernels::GKO_DEVICE_NAMESPACE::dense::compute_norm2_dispatch(
exec, residual, residual_norm, reduction_tmp);

@@ -695,21 +701,23 @@ void restart(std::shared_ptr<const DefaultExecutor> exec,
2 * stride_arnoldi),
stride_arnoldi, acc::as_device_range(krylov_bases));
}

const auto grid_dim_2 =
ceildiv(std::max<size_type>(num_rows, 1) * krylov_stride[1],
default_block_size);
restart_2_kernel<block_size>
<<<grid_dim_2, block_dim, 0, exec->get_stream()>>>(
residual->get_size()[0], residual->get_size()[1],
as_device_type(residual->get_const_values()),
residual->get_stride(),
as_device_type(residual_norm->get_const_values()),
as_device_type(residual_norm_collection->get_values()),
acc::as_device_range(krylov_bases),
as_device_type(next_krylov_basis->get_values()),
next_krylov_basis->get_stride(),
as_device_type(final_iter_nums->get_data()));

if (grid_dim_2 != 0) {
restart_2_kernel<block_size>
<<<grid_dim_2, block_dim, 0, exec->get_stream()>>>(
residual->get_size()[0], residual->get_size()[1],
as_device_type(residual->get_const_values()),
residual->get_stride(),
as_device_type(residual_norm->get_const_values()),
as_device_type(residual_norm_collection->get_values()),
acc::as_device_range(krylov_bases),
as_device_type(next_krylov_basis->get_values()),
next_krylov_basis->get_stride(),
as_device_type(final_iter_nums->get_data()));
}
}

GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_RESTART_KERNEL);
@@ -919,18 +927,21 @@ void givens_rotation(std::shared_ptr<const DefaultExecutor> exec,
const auto block_dim = block_size;
const auto grid_dim =
static_cast<unsigned int>(ceildiv(num_cols, block_size));

givens_rotation_kernel<block_size>
<<<grid_dim, block_dim, 0, exec->get_stream()>>>(
hessenberg_iter->get_size()[0], hessenberg_iter->get_size()[1],
iter, as_device_type(hessenberg_iter->get_values()),
hessenberg_iter->get_stride(),
as_device_type(givens_sin->get_values()), givens_sin->get_stride(),
as_device_type(givens_cos->get_values()), givens_cos->get_stride(),
as_device_type(residual_norm->get_values()),
as_device_type(residual_norm_collection->get_values()),
residual_norm_collection->get_stride(),
stop_status->get_const_data());
if (grid_dim != 0) {
givens_rotation_kernel<block_size>
<<<grid_dim, block_dim, 0, exec->get_stream()>>>(
hessenberg_iter->get_size()[0], hessenberg_iter->get_size()[1],
iter, as_device_type(hessenberg_iter->get_values()),
hessenberg_iter->get_stride(),
as_device_type(givens_sin->get_values()),
givens_sin->get_stride(),
as_device_type(givens_cos->get_values()),
givens_cos->get_stride(),
as_device_type(residual_norm->get_values()),
as_device_type(residual_norm_collection->get_values()),
residual_norm_collection->get_stride(),
stop_status->get_const_data());
}
}


@@ -949,12 +960,14 @@ void arnoldi(std::shared_ptr<const DefaultExecutor> exec,
array<stopping_status>* reorth_status,
array<size_type>* num_reorth)
{
increase_final_iteration_numbers_kernel<<<
static_cast<unsigned int>(
ceildiv(final_iter_nums->get_size(), default_block_size)),
default_block_size, 0, exec->get_stream()>>>(
as_device_type(final_iter_nums->get_data()),
stop_status->get_const_data(), final_iter_nums->get_size());
if (final_iter_nums->get_size() != 0) {
increase_final_iteration_numbers_kernel<<<
static_cast<unsigned int>(
ceildiv(final_iter_nums->get_size(), default_block_size)),
default_block_size, 0, exec->get_stream()>>>(
as_device_type(final_iter_nums->get_data()),
stop_status->get_const_data(), final_iter_nums->get_size());
}
finish_arnoldi_CGS(exec, next_krylov_basis, krylov_bases, hessenberg_iter,
buffer_iter, arnoldi_norm, iter,
stop_status->get_const_data(), reorth_status->get_data(),
@@ -1007,14 +1020,15 @@ void calculate_qy(std::shared_ptr<const DefaultExecutor> exec,
const auto grid_dim = static_cast<unsigned int>(
ceildiv(num_rows * stride_before_preconditioner, block_size));
const auto block_dim = block_size;

calculate_Qy_kernel<block_size>
<<<grid_dim, block_dim, 0, exec->get_stream()>>>(
num_rows, num_cols, acc::as_device_range(krylov_bases),
as_device_type(y->get_const_values()), y->get_stride(),
as_device_type(before_preconditioner->get_values()),
stride_before_preconditioner,
as_device_type(final_iter_nums->get_const_data()));
if (grid_dim != 0) {
calculate_Qy_kernel<block_size>
<<<grid_dim, block_dim, 0, exec->get_stream()>>>(
num_rows, num_cols, acc::as_device_range(krylov_bases),
as_device_type(y->get_const_values()), y->get_stride(),
as_device_type(before_preconditioner->get_values()),
stride_before_preconditioner,
as_device_type(final_iter_nums->get_const_data()));
}
// Calculate qy
// before_preconditioner = krylov_bases * y
}
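The recurring change in this file wraps every raw kernel launch in an if (grid_dim != 0) guard: for empty inputs, ceildiv yields a grid dimension of zero, and a <<<0, ...>>> launch is an invalid configuration that the CUDA/HIP runtime rejects. A minimal self-contained sketch of the pattern in a .cu translation unit (scale_kernel and scale are illustrative names, not from the PR):

__global__ void scale_kernel(int n, double* x, double alpha)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        x[i] *= alpha;
    }
}

void scale(int n, double* x, double alpha, cudaStream_t stream)
{
    constexpr int block_size = 256;
    // equivalent of ceildiv(n, block_size): zero when n == 0
    const auto grid_dim =
        static_cast<unsigned int>((n + block_size - 1) / block_size);
    // skip the launch entirely for empty inputs instead of triggering
    // an invalid-configuration error from the runtime
    if (grid_dim != 0) {
        scale_kernel<<<grid_dim, block_size, 0, stream>>>(n, x, alpha);
    }
}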
3 changes: 3 additions & 0 deletions cuda/CMakeLists.txt
@@ -156,6 +156,9 @@ target_link_libraries(
CUDA::cufft
nvtx::nvtx
)
if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 13)
target_link_libraries(ginkgo_cuda PRIVATE Thrust)
endif()
# NVTX3 is header-only and requires dlopen/dlclose in static builds
target_link_libraries(ginkgo_cuda PUBLIC ginkgo_device ${CMAKE_DL_LIBS})

8 changes: 6 additions & 2 deletions hip/base/config.hip.hpp
@@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

@@ -32,7 +32,11 @@ struct config {
* `device_functions.h`.
*/
#if GINKGO_HIP_PLATFORM_HCC
static constexpr uint32 warp_size = warpSize;
// Workaround for ROCm >= 7, which no longer provides warpSize as a
// compile-time constant. We cannot define warpSize via a compiler flag,
// because amd_warp_functions.h also defines a struct variable called
// warpSize. There is no support for warp size 32 on AMD GPUs yet.
static constexpr uint32 warp_size = 64;
#else // GINKGO_HIP_PLATFORM_NVCC
static constexpr uint32 warp_size = 32;
#endif
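Since the constant is now hard-coded rather than taken from the compiler, a runtime sanity check against the actual device is one way to catch a mismatch on future hardware. A minimal sketch (check_warp_size is illustrative, not part of the PR), assuming HIP's runtime API:

#include <hip/hip_runtime.h>
#include <cassert>

void check_warp_size(int device_id)
{
    hipDeviceProp_t prop;
    // hipGetDeviceProperties reports the warp size of the physical device
    if (hipGetDeviceProperties(&prop, device_id) == hipSuccess) {
        // must match the hard-coded config::warp_size (64 on AMD GPUs)
        assert(prop.warpSize == 64);
    }
}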