diff --git a/.github/workflows/arm.yml b/.github/workflows/arm.yml
new file mode 100644
index 00000000000..7e1d357a11a
--- /dev/null
+++ b/.github/workflows/arm.yml
@@ -0,0 +1,67 @@
+name: ARM-build
+
+on:
+  push:
+    branches:
+      - 'main'
+      - 'master'
+      - 'develop'
+      - 'release/**'
+    tags:
+      - '**'
+  pull_request:
+    types: [opened, synchronize]
+    paths-ignore:
+      - 'doc/**'
+  workflow_dispatch:
+    inputs:
+      debug_enabled:
+        description: 'Run the build with tmate debugging enabled via the `debug_enabled` input (https://github.com/marketplace/actions/debugging-with-tmate)'
+        required: false
+        default: false
+
+concurrency:
+  group: ${{ github.workflow }}-${{ (github.head_ref && github.ref) || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  arm-omp:
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+          - {shared: "ON", build_type: "Debug", name: "arm/omp/debug/shared", mixed: "OFF", half: "ON", bfloat16: "OFF"}
+          - {shared: "OFF", build_type: "Release", name: "arm/omp/release/static", mixed: "ON", half: "ON", bfloat16: "OFF"}
+          - {shared: "ON", build_type: "Release", name: "arm/omp/release/shared", mixed: "ON", half: "OFF", bfloat16: "ON"}
+          - {shared: "ON", build_type: "Release", name: "arm/omp/release/shared-16bit", mixed: "ON", half: "ON", bfloat16: "ON"}
+    name: ${{ matrix.config.name }}
+    runs-on: [ubuntu-24.04-arm]
+
+    steps:
+      - name: Checkout the latest code (shallow clone)
+        uses: actions/checkout@v4
+
+      - name: info
+        run: |
+          g++ -v
+          cmake --version
+
+      - name: Debug over SSH (tmate)
+        uses: mxschmitt/action-tmate@v3.5
+        if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.debug_enabled }}
+
+      - name: configure
+        run: |
+          mkdir build
+          mkdir install
+          export INSTALL_PREFIX=`pwd`/install
+          cd build
+          cmake .. -DCMAKE_CXX_FLAGS="-Wpedantic -ffp-contract=off" -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }} -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }} -DGINKGO_ENABLE_HALF=${{ matrix.config.half }} -DGINKGO_ENABLE_BFLOAT16=${{ matrix.config.bfloat16 }}
+          make -j4
+          ctest -j4 --output-on-failure
+
+      - name: install
+        run: |
+          cd build
+          make install
+          make test_install
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index d2bce32c72a..58350cd6674
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -218,6 +218,20 @@ build/cuda126/nompi/gcc/cuda/release/shared:
     BUILD_TYPE: "Release"
     MODULE_LOAD: "cmake/3.30.8 cuda/12.6.3 gcc/13.3.0"
 
+build/cuda130/nompi/gcc/cuda/release/shared:
+  extends:
+    - .build_and_test_tum_template
+    - .default_variables
+    - .full_test_condition
+    - .use_tum-nvidia
+  variables:
+    BUILD_CUDA: "ON"
+    BUILD_HWLOC: "OFF"
+    ENABLE_HALF: "ON"
+    ENABLE_BFLOAT16: "ON"
+    BUILD_TYPE: "Release"
+    MODULE_LOAD: "cmake/3.30.8 cuda/13.0.2 gcc/14.3.0"
+
 # ROCm 4.5 and friends
 build/amd/nompi/gcc/rocm45/release/shared:
   extends:
@@ -341,6 +355,21 @@ build/amd/openmpi/gcc/rocm634_wo_omp/release/shared:
     BUILD_TYPE: "Release"
     MODULE_LOAD: "cmake/3.29.6 rocm/6.3.4 gcc/13.3.0 openmpi/5.0.7"
 
+# mi50 is not officially supported by ROCm >= 7
+build/amd/nompi/gcc/rocm710/release/shared:
+  extends:
+    - .build_and_test_tum_template
+    - .default_variables
+    - .full_test_condition
+    - .use_tum-amd-mi210
+  variables:
+    BUILD_HIP: "ON"
+    BUILD_HWLOC: "OFF"
+    BUILD_MPI: "OFF"
+    BUILD_OMP: "OFF"
+    BUILD_TYPE: "Release"
+    MODULE_LOAD: "cmake/3.29.6 rocm/7.1.0 gcc/14.3.0"
+
 # no cuda but latest gcc and clang
 build/nocuda/nompi/gcc/core/debug/static:
   extends:
diff --git a/.gitlab/image.yml b/.gitlab/image.yml
index 019bc9b9524..a609a26fdb5
--- a/.gitlab/image.yml
+++ b/.gitlab/image.yml
@@ -79,6 +79,12 @@
     - amd-gpus
     - tum
 
+.use_tum-amd-mi210:
+  image: rocky_tum
+  tags:
+    - amd-gpus-mi210
+    - tum
+
 .use_tum-intel:
   image: rocky_tum_intel
   tags:
diff --git a/.gitlab/scripts.yml b/.gitlab/scripts.yml
index 095519afac2..b3d1a84cf8e
--- a/.gitlab/scripts.yml
+++ b/.gitlab/scripts.yml
@@ -12,7 +12,13 @@
     - export CCACHE_DIR=${CCACHE_DIR}
     - export CCACHE_MAXSIZE=${CCACHE_MAXSIZE}
     - source /storage/apps/opt/spack/share/spack/setup-env.sh
-    - export MODULEPATH=/storage/apps/opt/rocm-modules:/storage/apps/opt/spack/share/spack/lmod/linux-rocky9-x86_64/Core
+    - mkdir -p lmod/cuda
+    - echo 'prepend_path("PATH","/storage/apps/usr/local/cuda-13.0.2/bin")' > lmod/cuda/13.0.2.lua
+    - echo 'prepend_path("CMAKE_PREFIX_PATH","/storage/apps/usr/local/cuda-13.0.2/.")' >> lmod/cuda/13.0.2.lua
+    - echo 'setenv("CUDA_HOME","/storage/apps/usr/local/cuda-13.0.2")' >> lmod/cuda/13.0.2.lua
+    - echo 'setenv("NVHPC_CUDA_HOME","/storage/apps/usr/local/cuda-13.0.2")' >> lmod/cuda/13.0.2.lua
+    - export MODULEPATH="$(pwd)/lmod":/storage/apps/opt/rocm-modules:/storage/apps/opt/spack/share/spack/lmod/linux-rocky9-x86_64/Core
+    - module av
 
 .before_script_git_template:
   before_script:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0596d106412..46a1d979443
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -234,6 +234,18 @@ if(GINKGO_BUILD_HIP)
             "Disable custom thrust namespace for hip before 5.7 because hip does not fully support it before 5.7"
         )
         set(GINKGO_HIP_CUSTOM_THRUST_NAMESPACE OFF)
+    elseif(
+        GINKGO_HIP_PLATFORM_AMD
+        AND GINKGO_HIP_VERSION VERSION_GREATER_EQUAL 7.1
+        AND GINKGO_HIP_VERSION VERSION_LESS 7.2
+    )
+        # https://github.com/ROCm/rocm-libraries/pull/1769 should fix this issue in ROCm 7.1.1.
+        # The HIP version number is not exactly the same as the ROCm version, so we need to wait for ROCm 7.1.1 before setting a proper range around ROCm 7.1.0.
+        message(
+            STATUS
+            "Disable custom thrust namespace for hip 7.1 because hip does not fully support the custom namespace."
+        )
+        set(GINKGO_HIP_CUSTOM_THRUST_NAMESPACE OFF)
     else()
         message(STATUS "Enable custom thrust namespace for hip")
        set(GINKGO_HIP_CUSTOM_THRUST_NAMESPACE ON)
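A note on the option being toggled here: GINKGO_HIP_CUSTOM_THRUST_NAMESPACE wraps the Thrust/rocThrust symbols Ginkgo uses in an outer namespace, so Ginkgo's copy cannot collide with a different Thrust pulled into the same binary by the application. A minimal sketch of the mechanism, assuming it is implemented via Thrust's THRUST_CUB_WRAPPED_NAMESPACE macro (the wiring itself is not part of this diff):

    // Sketch only: the macro must be defined before any Thrust header is
    // included; all Thrust symbols then live in the chosen outer namespace.
    #define THRUST_CUB_WRAPPED_NAMESPACE gko
    #include <thrust/device_vector.h>

    int main()
    {
        // Thrust is now reachable as gko::thrust instead of ::thrust, so a
        // second, unwrapped Thrust in the application cannot clash with it.
        gko::thrust::device_vector<int> v(8, 1);
        return v.size() == 8 ? 0 : 1;
    }

The elseif branch above simply falls back to the unwrapped namespace on HIP 7.1, where the wrapping is known to be broken.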
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index d9cee938a2e..606daf9363d
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -40,6 +40,9 @@ function(ginkgo_benchmark_cusparse_linops type def)
         cusparse_linops_${type}
         PRIVATE Ginkgo::ginkgo CUDA::cudart CUDA::cublas CUDA::cusparse
     )
+    if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 13)
+        target_link_libraries(cusparse_linops_${type} PRIVATE Thrust)
+    endif()
     ginkgo_compile_features(cusparse_linops_${type})
 endfunction()
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index e6de143e0a9..e5f63833662
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -14,6 +14,11 @@ endif()
 
 find_package(NVTX REQUIRED)
 
+if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 13)
+    find_package(Thrust REQUIRED)
+    thrust_create_target(Thrust)
+endif()
+
 if(
     CMAKE_CUDA_HOST_COMPILER
     AND NOT CMAKE_CXX_COMPILER STREQUAL CMAKE_CUDA_HOST_COMPILER
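The explicit Thrust target is needed because CUDA 13 ships Thrust/CUB as part of the CCCL package and moves their headers out of the toolkit's default include directory, so they are no longer reachable just by linking CUDA::cudart. A translation unit like the following sketch (a hypothetical file, shown only to illustrate the breakage) builds under CUDA 12 without extra setup, but under CUDA 13 it only builds when the include paths of the target created by thrust_create_target(Thrust) are attached:

    // thrust_smoke.cu (hypothetical): representative of the Thrust usage in
    // ginkgo_cuda and the cusparse_linops benchmark helpers.
    #include <thrust/device_vector.h>
    #include <thrust/reduce.h>

    int main()
    {
        thrust::device_vector<int> v(100, 1);          // 100 ones on the GPU
        int sum = thrust::reduce(v.begin(), v.end());  // parallel sum
        return sum == 100 ? 0 : 1;
    }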
diff --git a/common/cuda_hip/solver/cb_gmres_kernels.cpp b/common/cuda_hip/solver/cb_gmres_kernels.cpp
index bdf6de03f38..0ee6d1b5321
--- a/common/cuda_hip/solver/cb_gmres_kernels.cpp
+++ b/common/cuda_hip/solver/cb_gmres_kernels.cpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
@@ -623,14 +623,18 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
     const auto block_dim = default_block_size;
     constexpr auto block_size = default_block_size;
 
-    initialize_kernel<block_size>
-        <<<grid_dim, block_dim, 0, exec->get_stream()>>>(
-            b->get_size()[0], b->get_size()[1], krylov_dim,
-            as_device_type(b->get_const_values()), b->get_stride(),
-            as_device_type(residual->get_values()), residual->get_stride(),
-            as_device_type(givens_sin->get_values()), givens_sin->get_stride(),
-            as_device_type(givens_cos->get_values()), givens_cos->get_stride(),
-            as_device_type(stop_status->get_data()));
+    if (grid_dim != 0) {
+        initialize_kernel<block_size>
+            <<<grid_dim, block_dim, 0, exec->get_stream()>>>(
+                b->get_size()[0], b->get_size()[1], krylov_dim,
+                as_device_type(b->get_const_values()), b->get_stride(),
+                as_device_type(residual->get_values()), residual->get_stride(),
+                as_device_type(givens_sin->get_values()),
+                givens_sin->get_stride(),
+                as_device_type(givens_cos->get_values()),
+                givens_cos->get_stride(),
+                as_device_type(stop_status->get_data()));
+    }
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_BASE(
@@ -661,12 +665,14 @@ void restart(std::shared_ptr<const DefaultExecutor> exec,
     constexpr auto block_size = default_block_size;
     const auto stride_arnoldi = arnoldi_norm->get_stride();
 
-    restart_1_kernel<block_size>
-        <<<grid_dim_1, block_dim, 0, exec->get_stream()>>>(
-            residual->get_size()[0], residual->get_size()[1], krylov_dim,
-            acc::as_device_range(krylov_bases),
-            as_device_type(residual_norm_collection->get_values()),
-            residual_norm_collection->get_stride());
+    if (grid_dim_1 != 0) {
+        restart_1_kernel<block_size>
+            <<<grid_dim_1, block_dim, 0, exec->get_stream()>>>(
+                residual->get_size()[0], residual->get_size()[1], krylov_dim,
+                acc::as_device_range(krylov_bases),
+                as_device_type(residual_norm_collection->get_values()),
+                residual_norm_collection->get_stride());
+    }
     kernels::GKO_DEVICE_NAMESPACE::dense::compute_norm2_dispatch(
         exec, residual, residual_norm, reduction_tmp);
@@ -695,21 +701,23 @@ void restart(std::shared_ptr<const DefaultExecutor> exec,
                 2 * stride_arnoldi),
             stride_arnoldi, acc::as_device_range(krylov_bases));
     }
     const auto grid_dim_2 =
         ceildiv(std::max<size_type>(num_rows, 1) * krylov_stride[1],
                 default_block_size);
-    restart_2_kernel<block_size>
-        <<<grid_dim_2, block_dim, 0, exec->get_stream()>>>(
-            residual->get_size()[0], residual->get_size()[1],
-            as_device_type(residual->get_const_values()),
-            residual->get_stride(),
-            as_device_type(residual_norm->get_const_values()),
-            as_device_type(residual_norm_collection->get_values()),
-            acc::as_device_range(krylov_bases),
-            as_device_type(next_krylov_basis->get_values()),
-            next_krylov_basis->get_stride(),
-            as_device_type(final_iter_nums->get_data()));
+
+    if (grid_dim_2 != 0) {
+        restart_2_kernel<block_size>
+            <<<grid_dim_2, block_dim, 0, exec->get_stream()>>>(
+                residual->get_size()[0], residual->get_size()[1],
+                as_device_type(residual->get_const_values()),
+                residual->get_stride(),
+                as_device_type(residual_norm->get_const_values()),
+                as_device_type(residual_norm_collection->get_values()),
+                acc::as_device_range(krylov_bases),
+                as_device_type(next_krylov_basis->get_values()),
+                next_krylov_basis->get_stride(),
+                as_device_type(final_iter_nums->get_data()));
+    }
 }
 
 GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_RESTART_KERNEL);
@@ -919,18 +927,21 @@ void givens_rotation(std::shared_ptr<const DefaultExecutor> exec,
     const auto block_dim = block_size;
     const auto grid_dim =
         static_cast<unsigned int>(ceildiv(num_cols, block_size));
-
-    givens_rotation_kernel<block_size>
-        <<<grid_dim, block_dim, 0, exec->get_stream()>>>(
-            hessenberg_iter->get_size()[0], hessenberg_iter->get_size()[1],
-            iter, as_device_type(hessenberg_iter->get_values()),
-            hessenberg_iter->get_stride(),
-            as_device_type(givens_sin->get_values()), givens_sin->get_stride(),
-            as_device_type(givens_cos->get_values()), givens_cos->get_stride(),
-            as_device_type(residual_norm->get_values()),
-            as_device_type(residual_norm_collection->get_values()),
-            residual_norm_collection->get_stride(),
-            stop_status->get_const_data());
+    if (grid_dim != 0) {
+        givens_rotation_kernel<block_size>
+            <<<grid_dim, block_dim, 0, exec->get_stream()>>>(
+                hessenberg_iter->get_size()[0], hessenberg_iter->get_size()[1],
+                iter, as_device_type(hessenberg_iter->get_values()),
+                hessenberg_iter->get_stride(),
+                as_device_type(givens_sin->get_values()),
+                givens_sin->get_stride(),
+                as_device_type(givens_cos->get_values()),
+                givens_cos->get_stride(),
+                as_device_type(residual_norm->get_values()),
+                as_device_type(residual_norm_collection->get_values()),
+                residual_norm_collection->get_stride(),
+                stop_status->get_const_data());
+    }
 }
 
 
@@ -949,12 +960,14 @@ void arnoldi(std::shared_ptr<const DefaultExecutor> exec,
             array<bool>* reorth_status, array<size_type>* num_reorth)
 {
-    increase_final_iteration_numbers_kernel<<<
-        static_cast<unsigned int>(
-            ceildiv(final_iter_nums->get_size(), default_block_size)),
-        default_block_size, 0, exec->get_stream()>>>(
-        as_device_type(final_iter_nums->get_data()),
-        stop_status->get_const_data(), final_iter_nums->get_size());
+    if (final_iter_nums->get_size() != 0) {
+        increase_final_iteration_numbers_kernel<<<
+            static_cast<unsigned int>(
+                ceildiv(final_iter_nums->get_size(), default_block_size)),
+            default_block_size, 0, exec->get_stream()>>>(
+            as_device_type(final_iter_nums->get_data()),
+            stop_status->get_const_data(), final_iter_nums->get_size());
+    }
     finish_arnoldi_CGS(exec, next_krylov_basis, krylov_bases, hessenberg_iter,
                        buffer_iter, arnoldi_norm, iter,
                        stop_status->get_const_data(), reorth_status->get_data(),
@@ -1007,14 +1020,15 @@ void calculate_qy(std::shared_ptr<const DefaultExecutor> exec,
     const auto grid_dim = static_cast<unsigned int>(
         ceildiv(num_rows * stride_before_preconditioner, block_size));
     const auto block_dim = block_size;
-
-    calculate_Qy_kernel<block_size>
-        <<<grid_dim, block_dim, 0, exec->get_stream()>>>(
-            num_rows, num_cols, acc::as_device_range(krylov_bases),
-            as_device_type(y->get_const_values()), y->get_stride(),
-            as_device_type(before_preconditioner->get_values()),
-            stride_before_preconditioner,
-            as_device_type(final_iter_nums->get_const_data()));
+    if (grid_dim != 0) {
+        calculate_Qy_kernel<block_size>
+            <<<grid_dim, block_dim, 0, exec->get_stream()>>>(
+                num_rows, num_cols, acc::as_device_range(krylov_bases),
+                as_device_type(y->get_const_values()), y->get_stride(),
+                as_device_type(before_preconditioner->get_values()),
+                stride_before_preconditioner,
+                as_device_type(final_iter_nums->get_const_data()));
+    }
     // Calculate qy
     // before_preconditioner = krylov_bases * y
 }
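All of the cb_gmres changes above apply the same guard: CUDA (and HIP) treat a kernel launch with a zero-sized grid as an invalid launch configuration rather than a no-op, and the dimensions here can legitimately be zero, for instance with an empty right-hand side. A standalone sketch of the pattern, with illustrative names rather than Ginkgo's:

    #include <cuda_runtime.h>

    __global__ void scale_kernel(int n, double* x, double alpha)
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            x[i] *= alpha;
        }
    }

    void scale(int n, double* x, double alpha, cudaStream_t stream)
    {
        constexpr int block_size = 256;
        const int grid_dim = (n + block_size - 1) / block_size;  // ceildiv
        // A launch with grid_dim == 0 fails with an invalid-configuration
        // error instead of doing nothing, so skip it when there is no work.
        if (grid_dim != 0) {
            scale_kernel<<<grid_dim, block_size, 0, stream>>>(n, x, alpha);
        }
    }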
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index dde0a387d73..3cddd9a8d05
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -156,6 +156,9 @@ target_link_libraries(
     CUDA::cufft
     nvtx::nvtx
 )
+if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 13)
+    target_link_libraries(ginkgo_cuda PRIVATE Thrust)
+endif()
 # NVTX3 is header-only and requires dlopen/dlclose in static builds
 target_link_libraries(ginkgo_cuda PUBLIC ginkgo_device ${CMAKE_DL_LIBS})
diff --git a/hip/base/config.hip.hpp b/hip/base/config.hip.hpp
index 832b750f0fd..f5ca4ab2f83
--- a/hip/base/config.hip.hpp
+++ b/hip/base/config.hip.hpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
@@ -32,7 +32,11 @@ struct config {
      * `device_functions.h`.
     */
 #if GINKGO_HIP_PLATFORM_HCC
-    static constexpr uint32 warp_size = warpSize;
+    // Workaround for ROCm >= 7, which no longer provides warpSize as a
+    // compile-time constant. We cannot define warpSize via the compiler,
+    // because amd_warp_functions.h itself defines a variable called warpSize.
+    // There is no support for warp size 32 on AMD GPUs yet.
+    static constexpr uint32 warp_size = 64;
 #else  // GINKGO_HIP_PLATFORM_NVCC
     static constexpr uint32 warp_size = 32;
 #endif
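The hard-coded 64 works around warpSize no longer being a compile-time constant on ROCm >= 7: Ginkgo needs warp_size in constant expressions (template arguments, array bounds), and per the comment only wavefront-64 AMD GPUs are supported, so pinning the value is safe there. A small sketch of what changed, assuming a plain HIP translation unit:

    #include <hip/hip_runtime.h>

    __global__ void lane_kernel(int* out)
    {
        // On ROCm >= 7 the following line no longer compiles, since warpSize
        // is a runtime value there, not a constant expression:
        //     constexpr int ws = warpSize;
        // Reading it as an ordinary runtime value still works:
        const int ws = warpSize;
        out[threadIdx.x] = threadIdx.x % ws;
    }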
diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp
index e11e1c11554..ce098dc446d
--- a/hip/base/types.hip.hpp
+++ b/hip/base/types.hip.hpp
@@ -181,12 +181,20 @@ struct hipblas_type_impl {
 template <>
 struct hipblas_type_impl<std::complex<float>> {
+#if HIP_VERSION >= 70000000
+    using type = hipFloatComplex;
+#else
     using type = hipblasComplex;
+#endif
 };
 
 template <>
 struct hipblas_type_impl<std::complex<double>> {
+#if HIP_VERSION >= 70000000
+    using type = hipDoubleComplex;
+#else
     using type = hipblasDoubleComplex;
+#endif
 };
 
 template <typename T>
@@ -280,46 +288,81 @@ struct hip_type_impl<matrix_data_entry<ValueType, IndexType>> {
                                    IndexType>;
 };
 
+#if HIP_VERSION >= 70000000
+using hipblasDataType = hipDataType;
+#else
+using hipblasDataType = hipblasDatatype_t;
+#endif
+
 template <typename T>
-constexpr hipblasDatatype_t hip_data_type_impl()
+constexpr hipblasDataType hip_data_type_impl()
 {
+#if HIP_VERSION >= 70000000
+    return HIP_C_16F;
+#else
     return HIPBLAS_C_16F;
+#endif
 }
 
 template <>
-constexpr hipblasDatatype_t hip_data_type_impl<float16>()
+constexpr hipblasDataType hip_data_type_impl<float16>()
 {
+#if HIP_VERSION >= 70000000
+    return HIP_R_16F;
+#else
     return HIPBLAS_R_16F;
+#endif
 }
 
 template <>
-constexpr hipblasDatatype_t hip_data_type_impl<bfloat16>()
+constexpr hipblasDataType hip_data_type_impl<bfloat16>()
 {
-    return HIPBLAS_R_16B;
+#if HIP_VERSION >= 70000000
+    return HIP_R_16BF;
+#else
+    return HIPBLAS_R_16B;
+#endif
 }
 
 template <>
-constexpr hipblasDatatype_t hip_data_type_impl<float>()
+constexpr hipblasDataType hip_data_type_impl<float>()
 {
+#if HIP_VERSION >= 70000000
+    return HIP_R_32F;
+#else
     return HIPBLAS_R_32F;
+#endif
 }
 
 template <>
-constexpr hipblasDatatype_t hip_data_type_impl<double>()
+constexpr hipblasDataType hip_data_type_impl<double>()
 {
+#if HIP_VERSION >= 70000000
+    return HIP_R_64F;
+#else
     return HIPBLAS_R_64F;
+#endif
 }
 
 template <>
-constexpr hipblasDatatype_t hip_data_type_impl<std::complex<float>>()
+constexpr hipblasDataType hip_data_type_impl<std::complex<float>>()
 {
+#if HIP_VERSION >= 70000000
+    return HIP_C_32F;
+#else
     return HIPBLAS_C_32F;
+#endif
 }
 
 template <>
-constexpr hipblasDatatype_t hip_data_type_impl<std::complex<double>>()
+constexpr hipblasDataType hip_data_type_impl<std::complex<double>>()
 {
-    return HIPBLAS_C_64F;
+#if HIP_VERSION >= 70000000
+    return HIP_C_64F;
+#else
+    return HIPBLAS_C_64F;
+#endif
 }
 
 
@@ -335,7 +378,7 @@ constexpr hipblasDatatype_t hip_data_type_impl<std::complex<double>>()
  * @returns the actual `hipblasDatatype_t`
  */
 template <typename T>
-constexpr hipblasDatatype_t hip_data_type()
+constexpr detail::hipblasDataType hip_data_type()
 {
     return detail::hip_data_type_impl<T>();
 }
diff --git a/hip/test/base/hip_executor.hip.cpp b/hip/test/base/hip_executor.hip.cpp
index b923be021dd..649cb7e0ca4
--- a/hip/test/base/hip_executor.hip.cpp
+++ b/hip/test/base/hip_executor.hip.cpp
@@ -154,6 +154,13 @@ TEST_F(HipExecutor, FailsWhenOverallocating)
         gko::AllocationError);
 
     hip->free(ptr);
+#if HIP_VERSION >= 70000000
+    // After ROCm 7.0.0, hipSPARSE handle creation still observes the last
+    // error even though it has already been returned to the caller. We call
+    // hipGetLastError() to consume it. Related issue:
+    // https://github.com/ROCm/rocm-libraries/issues/2801
+    hipGetLastError();
+#endif
 }
diff --git a/test/solver/solver.cpp b/test/solver/solver.cpp
index 45ed3cc9120..e2b5c511f46
--- a/test/solver/solver.cpp
+++ b/test/solver/solver.cpp
@@ -947,7 +947,7 @@ TYPED_TEST(Solver, ApplyIsEquivalentToRef)
             solver.ref->apply(b.ref, x.ref);
             solver.dev->apply(b.dev, x.dev);
 
-            GKO_ASSERT_MTX_NEAR(x.ref, x.dev, this->tol(x));
+            GKO_ASSERT_MTX_NEAR(x.ref, x.dev, 1.2 * this->tol(x));
         });
     });
 }
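The hipGetLastError() call in the executor test deals with a "sticky" error: on ROCm >= 7 the failed allocation stays latched as the last error and resurfaces in later, unrelated status queries (such as hipSPARSE handle creation) until something reads it. A self-contained sketch of the pattern, with error handling trimmed for brevity:

    #include <hip/hip_runtime.h>

    #include <cstddef>
    #include <cstdio>

    int main()
    {
        void* ptr = nullptr;
        // Deliberately over-allocate; hipMalloc reports an out-of-memory
        // error to the caller.
        hipError_t err = hipMalloc(&ptr, ~std::size_t{0});
        std::printf("hipMalloc: %s\n", hipGetErrorString(err));
        // The failure also remains latched as the "last error". Reading it
        // once resets the latch, so subsequent calls start from a clean state.
        (void)hipGetLastError();
        return 0;
    }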