diff --git a/.github/workflows/arm.yml b/.github/workflows/arm.yml
new file mode 100644
index 00000000000..7e1d357a11a
--- /dev/null
+++ b/.github/workflows/arm.yml
@@ -0,0 +1,67 @@
+name: ARM-build
+
+on:
+  push:
+    branches:
+      - 'main'
+      - 'master'
+      - 'develop'
+      - 'release/**'
+    tags:
+      - '**'
+  pull_request:
+    types: [opened, synchronize]
+    paths-ignore:
+      - 'doc/**'
+  workflow_dispatch:
+    inputs:
+      debug_enabled:
+        description: 'Run the build with tmate debugging enabled via the `debug_enabled` input (https://github.com/marketplace/actions/debugging-with-tmate)'
+        required: false
+        default: false
+
+concurrency:
+  group: ${{ github.workflow }}-${{ (github.head_ref && github.ref) || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  arm-omp:
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+          - {shared: "ON", build_type: "Debug", name: "arm/omp/debug/shared", mixed: "OFF", half: "ON", bfloat16: "OFF"}
+          - {shared: "OFF", build_type: "Release", name: "arm/omp/release/static", mixed: "ON", half: "ON", bfloat16: "OFF"}
+          - {shared: "ON", build_type: "Release", name: "arm/omp/release/shared", mixed: "ON", half: "OFF", bfloat16: "ON"}
+          - {shared: "ON", build_type: "Release", name: "arm/omp/release/shared-16bit", mixed: "ON", half: "ON", bfloat16: "ON"}
+    name: ${{ matrix.config.name }}
+    runs-on: [ubuntu-24.04-arm]
+
+    steps:
+      - name: Checkout the latest code (shallow clone)
+        uses: actions/checkout@v4
+
+      - name: info
+        run: |
+          g++ -v
+          cmake --version
+
+      - name: Debug over SSH (tmate)
+        uses: mxschmitt/action-tmate@v3.5
+        if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.debug_enabled }}
+
+      - name: configure
+        run: |
+          mkdir build
+          mkdir install
+          export INSTALL_PREFIX=`pwd`/install
+          cd build
+          cmake .. -DCMAKE_CXX_FLAGS="-Wpedantic -ffp-contract=off" -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }} -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }} -DGINKGO_ENABLE_HALF=${{ matrix.config.half }} -DGINKGO_ENABLE_BFLOAT16=${{ matrix.config.bfloat16 }}
+          make -j4
+          ctest -j4 --output-on-failure
+
+      - name: install
+        run: |
+          cd build
+          make install
+          make test_install
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index d2bce32c72a..58350cd6674
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -218,6 +218,20 @@ build/cuda126/nompi/gcc/cuda/release/shared:
     BUILD_TYPE: "Release"
     MODULE_LOAD: "cmake/3.30.8 cuda/12.6.3 gcc/13.3.0"
 
+build/cuda130/nompi/gcc/cuda/release/shared:
+  extends:
+    - .build_and_test_tum_template
+    - .default_variables
+    - .full_test_condition
+    - .use_tum-nvidia
+  variables:
+    BUILD_CUDA: "ON"
+    BUILD_HWLOC: "OFF"
+    ENABLE_HALF: "ON"
+    ENABLE_BFLOAT16: "ON"
+    BUILD_TYPE: "Release"
+    MODULE_LOAD: "cmake/3.30.8 cuda/13.0.2 gcc/14.3.0"
+
 # ROCm 4.5 and friends
 build/amd/nompi/gcc/rocm45/release/shared:
   extends:
@@ -341,6 +355,21 @@ build/amd/openmpi/gcc/rocm634_wo_omp/release/shared:
     BUILD_TYPE: "Release"
     MODULE_LOAD: "cmake/3.29.6 rocm/6.3.4 gcc/13.3.0 openmpi/5.0.7"
 
+# mi50 is not officially supported by ROCm >= 7
+build/amd/nompi/gcc/rocm710/release/shared:
+  extends:
+    - .build_and_test_tum_template
+    - .default_variables
+    - .full_test_condition
+    - .use_tum-amd-mi210
+  variables:
+    BUILD_HIP: "ON"
+    BUILD_HWLOC: "OFF"
+    BUILD_MPI: "OFF"
+    BUILD_OMP: "OFF"
+    BUILD_TYPE: "Release"
+    MODULE_LOAD: "cmake/3.29.6 rocm/7.1.0 gcc/14.3.0"
+
 # no cuda but latest gcc and clang
 build/nocuda/nompi/gcc/core/debug/static:
   extends:
diff --git a/.gitlab/image.yml b/.gitlab/image.yml
index 019bc9b9524..a609a26fdb5
--- a/.gitlab/image.yml
+++ b/.gitlab/image.yml
@@ -79,6 +79,12 @@
     - amd-gpus
     - tum
 
+.use_tum-amd-mi210:
+  image: rocky_tum
+  tags:
+    - amd-gpus-mi210
+    - tum
+
 .use_tum-intel:
   image: rocky_tum_intel
   tags:
diff --git a/.gitlab/scripts.yml b/.gitlab/scripts.yml
index 095519afac2..b3d1a84cf8e
--- a/.gitlab/scripts.yml
+++ b/.gitlab/scripts.yml
@@ -12,7 +12,13 @@
     - export CCACHE_DIR=${CCACHE_DIR}
     - export CCACHE_MAXSIZE=${CCACHE_MAXSIZE}
     - source /storage/apps/opt/spack/share/spack/setup-env.sh
-    - export MODULEPATH=/storage/apps/opt/rocm-modules:/storage/apps/opt/spack/share/spack/lmod/linux-rocky9-x86_64/Core
+    - mkdir -p lmod/cuda
+    - echo 'prepend_path("PATH","/storage/apps/usr/local/cuda-13.0.2/bin")' > lmod/cuda/13.0.2.lua
+    - echo 'prepend_path("CMAKE_PREFIX_PATH","/storage/apps/usr/local/cuda-13.0.2/.")' >> lmod/cuda/13.0.2.lua
+    - echo 'setenv("CUDA_HOME","/storage/apps/usr/local/cuda-13.0.2")' >> lmod/cuda/13.0.2.lua
+    - echo 'setenv("NVHPC_CUDA_HOME","/storage/apps/usr/local/cuda-13.0.2")' >> lmod/cuda/13.0.2.lua
+    - export MODULEPATH="$(pwd)/lmod":/storage/apps/opt/rocm-modules:/storage/apps/opt/spack/share/spack/lmod/linux-rocky9-x86_64/Core
+    - module av
 
 .before_script_git_template:
   before_script:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0596d106412..46a1d979443
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -234,6 +234,18 @@ if(GINKGO_BUILD_HIP)
             "Disable custom thrust namespace for hip before 5.7 because hip does not fully support it before 5.7"
         )
         set(GINKGO_HIP_CUSTOM_THRUST_NAMESPACE OFF)
+    elseif(
+        GINKGO_HIP_PLATFORM_AMD
+        AND GINKGO_HIP_VERSION VERSION_GREATER_EQUAL 7.1
+        AND GINKGO_HIP_VERSION VERSION_LESS 7.2
+    )
+        # https://github.com/ROCm/rocm-libraries/pull/1769 should fix this issue in ROCm 7.1.1.
+        # The HIP version number is not exactly the same as the ROCm version, so we need to wait for ROCm 7.1.1 before setting a proper range around ROCm 7.1.0.
+        message(
+            STATUS
+            "Disable custom thrust namespace for hip 7.1 because hip does not fully support the custom namespace."
+        )
+        set(GINKGO_HIP_CUSTOM_THRUST_NAMESPACE OFF)
     else()
         message(STATUS "Enable custom thrust namespace for hip")
        set(GINKGO_HIP_CUSTOM_THRUST_NAMESPACE ON)
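A note on the option being toggled here: GINKGO_HIP_CUSTOM_THRUST_NAMESPACE wraps the Thrust/rocThrust symbols Ginkgo uses in an outer namespace, so Ginkgo's copy cannot collide with a different Thrust pulled into the same binary by the application. A minimal sketch of the mechanism, assuming it is implemented via Thrust's THRUST_CUB_WRAPPED_NAMESPACE macro (the wiring itself is not part of this diff):

    // Sketch only: the macro must be defined before any Thrust header is
    // included; all Thrust symbols then live in the chosen outer namespace.
    #define THRUST_CUB_WRAPPED_NAMESPACE gko
    #include <thrust/device_vector.h>

    int main()
    {
        // Thrust is now reachable as gko::thrust instead of ::thrust, so a
        // second, unwrapped Thrust in the application cannot clash with it.
        gko::thrust::device_vector<int> v(8, 1);
        return v.size() == 8 ? 0 : 1;
    }

The elseif branch above simply falls back to the unwrapped namespace on HIP 7.1, where the wrapping is known to be broken.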
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index d9cee938a2e..606daf9363d
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -40,6 +40,9 @@ function(ginkgo_benchmark_cusparse_linops type def)
         cusparse_linops_${type}
         PRIVATE Ginkgo::ginkgo CUDA::cudart CUDA::cublas CUDA::cusparse
     )
+    if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 13)
+        target_link_libraries(cusparse_linops_${type} PRIVATE Thrust)
+    endif()
     ginkgo_compile_features(cusparse_linops_${type})
 endfunction()
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index e6de143e0a9..e5f63833662
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -14,6 +14,11 @@ endif()
 
 find_package(NVTX REQUIRED)
 
+if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 13)
+    find_package(Thrust REQUIRED)
+    thrust_create_target(Thrust)
+endif()
+
 if(
     CMAKE_CUDA_HOST_COMPILER
     AND NOT CMAKE_CXX_COMPILER STREQUAL CMAKE_CUDA_HOST_COMPILER
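The explicit Thrust target is needed because CUDA 13 ships Thrust/CUB as part of the CCCL package and moves their headers out of the toolkit's default include directory, so they are no longer reachable just by linking CUDA::cudart. A translation unit like the following sketch (a hypothetical file, shown only to illustrate the breakage) builds under CUDA 12 without extra setup, but under CUDA 13 it only builds when the include paths of the target created by thrust_create_target(Thrust) are attached:

    // thrust_smoke.cu (hypothetical): representative of the Thrust usage in
    // ginkgo_cuda and the cusparse_linops benchmark helpers.
    #include <thrust/device_vector.h>
    #include <thrust/reduce.h>

    int main()
    {
        thrust::device_vector<int> v(100, 1);          // 100 ones on the GPU
        int sum = thrust::reduce(v.begin(), v.end());  // parallel sum
        return sum == 100 ? 0 : 1;
    }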
diff --git a/common/cuda_hip/solver/cb_gmres_kernels.cpp b/common/cuda_hip/solver/cb_gmres_kernels.cpp
index bdf6de03f38..0ee6d1b5321
--- a/common/cuda_hip/solver/cb_gmres_kernels.cpp
+++ b/common/cuda_hip/solver/cb_gmres_kernels.cpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
@@ -623,14 +623,18 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
     const auto block_dim = default_block_size;
     constexpr auto block_size = default_block_size;
 
-    initialize_kernel<block_size>
-        <<<grid_dim, block_dim, 0, exec->get_stream()>>>(
-            b->get_size()[0], b->get_size()[1], krylov_dim,
-            as_device_type(b->get_const_values()), b->get_stride(),
-            as_device_type(residual->get_values()), residual->get_stride(),
-            as_device_type(givens_sin->get_values()), givens_sin->get_stride(),
-            as_device_type(givens_cos->get_values()), givens_cos->get_stride(),
-            as_device_type(stop_status->get_data()));
+    if (grid_dim != 0) {
+        initialize_kernel<block_size>
+            <<<grid_dim, block_dim, 0, exec->get_stream()>>>(
+                b->get_size()[0], b->get_size()[1], krylov_dim,
+                as_device_type(b->get_const_values()), b->get_stride(),
+                as_device_type(residual->get_values()), residual->get_stride(),
+                as_device_type(givens_sin->get_values()),
+                givens_sin->get_stride(),
+                as_device_type(givens_cos->get_values()),
+                givens_cos->get_stride(),
+                as_device_type(stop_status->get_data()));
+    }
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_BASE(
@@ -661,12 +665,14 @@ void restart(std::shared_ptr<const DefaultExecutor> exec,
     constexpr auto block_size = default_block_size;
     const auto stride_arnoldi = arnoldi_norm->get_stride();
 
-    restart_1_kernel<block_size>
-        <<<grid_dim_1, block_dim, 0, exec->get_stream()>>>(
-            residual->get_size()[0], residual->get_size()[1], krylov_dim,
-            acc::as_device_range(krylov_bases),
-            as_device_type(residual_norm_collection->get_values()),
-            residual_norm_collection->get_stride());
+    if (grid_dim_1 != 0) {
+        restart_1_kernel<block_size>
+            <<<grid_dim_1, block_dim, 0, exec->get_stream()>>>(
+                residual->get_size()[0], residual->get_size()[1], krylov_dim,
+                acc::as_device_range(krylov_bases),
+                as_device_type(residual_norm_collection->get_values()),
+                residual_norm_collection->get_stride());
+    }
     kernels::GKO_DEVICE_NAMESPACE::dense::compute_norm2_dispatch(
         exec, residual, residual_norm, reduction_tmp);
@@ -695,21 +701,23 @@ void restart(std::shared_ptr<const DefaultExecutor> exec,
                 2 * stride_arnoldi),
             stride_arnoldi, acc::as_device_range(krylov_bases));
     }
     const auto grid_dim_2 =
         ceildiv(std::max<size_type>(num_rows, 1) * krylov_stride[1],
                 default_block_size);
-    restart_2_kernel<block_size>
-        <<<grid_dim_2, block_dim, 0, exec->get_stream()>>>(
-            residual->get_size()[0], residual->get_size()[1],
-            as_device_type(residual->get_const_values()),
-            residual->get_stride(),
-            as_device_type(residual_norm->get_const_values()),
-            as_device_type(residual_norm_collection->get_values()),
-            acc::as_device_range(krylov_bases),
-            as_device_type(next_krylov_basis->get_values()),
-            next_krylov_basis->get_stride(),
-            as_device_type(final_iter_nums->get_data()));
+
+    if (grid_dim_2 != 0) {
+        restart_2_kernel<block_size>
+            <<<grid_dim_2, block_dim, 0, exec->get_stream()>>>(
+                residual->get_size()[0], residual->get_size()[1],
+                as_device_type(residual->get_const_values()),
+                residual->get_stride(),
+                as_device_type(residual_norm->get_const_values()),
+                as_device_type(residual_norm_collection->get_values()),
+                acc::as_device_range(krylov_bases),
+                as_device_type(next_krylov_basis->get_values()),
+                next_krylov_basis->get_stride(),
+                as_device_type(final_iter_nums->get_data()));
+    }
 }
 
 GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_RESTART_KERNEL);
@@ -919,18 +927,21 @@ void givens_rotation(std::shared_ptr<const DefaultExecutor> exec,
     const auto block_dim = block_size;
     const auto grid_dim =
         static_cast<unsigned int>(ceildiv(num_cols, block_size));
-
-    givens_rotation_kernel<block_size>
-        <<<grid_dim, block_dim, 0, exec->get_stream()>>>(
-            hessenberg_iter->get_size()[0], hessenberg_iter->get_size()[1],
-            iter, as_device_type(hessenberg_iter->get_values()),
-            hessenberg_iter->get_stride(),
-            as_device_type(givens_sin->get_values()), givens_sin->get_stride(),
-            as_device_type(givens_cos->get_values()), givens_cos->get_stride(),
-            as_device_type(residual_norm->get_values()),
-            as_device_type(residual_norm_collection->get_values()),
-            residual_norm_collection->get_stride(),
-            stop_status->get_const_data());
+    if (grid_dim != 0) {
+        givens_rotation_kernel<block_size>
+            <<<grid_dim, block_dim, 0, exec->get_stream()>>>(
+                hessenberg_iter->get_size()[0], hessenberg_iter->get_size()[1],
+                iter, as_device_type(hessenberg_iter->get_values()),
+                hessenberg_iter->get_stride(),
+                as_device_type(givens_sin->get_values()),
+                givens_sin->get_stride(),
+                as_device_type(givens_cos->get_values()),
+                givens_cos->get_stride(),
+                as_device_type(residual_norm->get_values()),
+                as_device_type(residual_norm_collection->get_values()),
+                residual_norm_collection->get_stride(),
+                stop_status->get_const_data());
+    }
 }
 
 
@@ -949,12 +960,14 @@ void arnoldi(std::shared_ptr<const DefaultExecutor> exec,
             array<bool>* reorth_status, array<size_type>* num_reorth)
 {
-    increase_final_iteration_numbers_kernel<<<
-        static_cast<unsigned int>(
-            ceildiv(final_iter_nums->get_size(), default_block_size)),
-        default_block_size, 0, exec->get_stream()>>>(
-        as_device_type(final_iter_nums->get_data()),
-        stop_status->get_const_data(), final_iter_nums->get_size());
+    if (final_iter_nums->get_size() != 0) {
+        increase_final_iteration_numbers_kernel<<<
+            static_cast<unsigned int>(
+                ceildiv(final_iter_nums->get_size(), default_block_size)),
+            default_block_size, 0, exec->get_stream()>>>(
+            as_device_type(final_iter_nums->get_data()),
+            stop_status->get_const_data(), final_iter_nums->get_size());
+    }
     finish_arnoldi_CGS(exec, next_krylov_basis, krylov_bases, hessenberg_iter,
                        buffer_iter, arnoldi_norm, iter,
                        stop_status->get_const_data(), reorth_status->get_data(),
@@ -1007,14 +1020,15 @@ void calculate_qy(std::shared_ptr<const DefaultExecutor> exec,
     const auto grid_dim = static_cast<unsigned int>(
         ceildiv(num_rows * stride_before_preconditioner, block_size));
     const auto block_dim = block_size;
-
-    calculate_Qy_kernel<block_size>
-        <<<grid_dim, block_dim, 0, exec->get_stream()>>>(
-            num_rows, num_cols, acc::as_device_range(krylov_bases),
-            as_device_type(y->get_const_values()), y->get_stride(),
-            as_device_type(before_preconditioner->get_values()),
-            stride_before_preconditioner,
-            as_device_type(final_iter_nums->get_const_data()));
+    if (grid_dim != 0) {
+        calculate_Qy_kernel<block_size>
+            <<<grid_dim, block_dim, 0, exec->get_stream()>>>(
+                num_rows, num_cols, acc::as_device_range(krylov_bases),
+                as_device_type(y->get_const_values()), y->get_stride(),
+                as_device_type(before_preconditioner->get_values()),
+                stride_before_preconditioner,
+                as_device_type(final_iter_nums->get_const_data()));
+    }
     // Calculate qy
     // before_preconditioner = krylov_bases * y
 }
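All of the cb_gmres changes above apply the same guard: CUDA (and HIP) treat a kernel launch with a zero-sized grid as an invalid launch configuration rather than a no-op, and the dimensions here can legitimately be zero, for instance with an empty right-hand side. A standalone sketch of the pattern, with illustrative names rather than Ginkgo's:

    #include <cuda_runtime.h>

    __global__ void scale_kernel(int n, double* x, double alpha)
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            x[i] *= alpha;
        }
    }

    void scale(int n, double* x, double alpha, cudaStream_t stream)
    {
        constexpr int block_size = 256;
        const int grid_dim = (n + block_size - 1) / block_size;  // ceildiv
        // A launch with grid_dim == 0 fails with an invalid-configuration
        // error instead of doing nothing, so skip it when there is no work.
        if (grid_dim != 0) {
            scale_kernel<<<grid_dim, block_size, 0, stream>>>(n, x, alpha);
        }
    }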
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index dde0a387d73..3cddd9a8d05
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -156,6 +156,9 @@ target_link_libraries(
     CUDA::cufft
     nvtx::nvtx
 )
+if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 13)
+    target_link_libraries(ginkgo_cuda PRIVATE Thrust)
+endif()
 # NVTX3 is header-only and requires dlopen/dlclose in static builds
 target_link_libraries(ginkgo_cuda PUBLIC ginkgo_device ${CMAKE_DL_LIBS})
diff --git a/hip/base/config.hip.hpp b/hip/base/config.hip.hpp
index 832b750f0fd..f5ca4ab2f83
--- a/hip/base/config.hip.hpp
+++ b/hip/base/config.hip.hpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
@@ -32,7 +32,11 @@ struct config {
      * `device_functions.h`.
     */
 #if GINKGO_HIP_PLATFORM_HCC
-    static constexpr uint32 warp_size = warpSize;
+    // Workaround for ROCm >= 7, which no longer provides warpSize as a
+    // compile-time constant. We cannot define warpSize via the compiler,
+    // because amd_warp_functions.h itself defines a variable called warpSize.
+    // There is no support for warp size 32 on AMD GPUs yet.
+    static constexpr uint32 warp_size = 64;
 #else  // GINKGO_HIP_PLATFORM_NVCC
     static constexpr uint32 warp_size = 32;
 #endif
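The hard-coded 64 works around warpSize no longer being a compile-time constant on ROCm >= 7: Ginkgo needs warp_size in constant expressions (template arguments, array bounds), and per the comment only wavefront-64 AMD GPUs are supported, so pinning the value is safe there. A small sketch of what changed, assuming a plain HIP translation unit:

    #include <hip/hip_runtime.h>

    __global__ void lane_kernel(int* out)
    {
        // On ROCm >= 7 the following line no longer compiles, since warpSize
        // is a runtime value there, not a constant expression:
        //     constexpr int ws = warpSize;
        // Reading it as an ordinary runtime value still works:
        const int ws = warpSize;
        out[threadIdx.x] = threadIdx.x % ws;
    }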
diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp
index e11e1c11554..ce098dc446d
--- a/hip/base/types.hip.hpp
+++ b/hip/base/types.hip.hpp
@@ -181,12 +181,20 @@ struct hipblas_type_impl {
 template <>
 struct hipblas_type_impl<std::complex<float>> {
+#if HIP_VERSION >= 70000000
+    using type = hipFloatComplex;
+#else
     using type = hipblasComplex;
+#endif
 };
 
 template <>
 struct hipblas_type_impl<std::complex<double>> {
+#if HIP_VERSION >= 70000000
+    using type = hipDoubleComplex;
+#else
     using type = hipblasDoubleComplex;
+#endif
 };
 
 template <typename T>
@@ -280,46 +288,81 @@ struct hip_type_impl<matrix_data_entry<ValueType, IndexType>> {
                                    IndexType>;
 };
 
+#if HIP_VERSION >= 70000000
+using hipblasDataType = hipDataType;
+#else
+using hipblasDataType = hipblasDatatype_t;
+#endif
+
 template <typename T>
-constexpr hipblasDatatype_t hip_data_type_impl()
+constexpr hipblasDataType hip_data_type_impl()
 {
+#if HIP_VERSION >= 70000000
+    return HIP_C_16F;
+#else
     return HIPBLAS_C_16F;
+#endif
 }
 
 template <>
-constexpr hipblasDatatype_t hip_data_type_impl<float16>()
+constexpr hipblasDataType hip_data_type_impl<float16>()
 {
+#if HIP_VERSION >= 70000000
+    return HIP_R_16F;
+#else
     return HIPBLAS_R_16F;
+#endif
 }
 
 template <>
-constexpr hipblasDatatype_t hip_data_type_impl<bfloat16>()
+constexpr hipblasDataType hip_data_type_impl<bfloat16>()
 {
-    return HIPBLAS_R_16B;
+#if HIP_VERSION >= 70000000
+    return HIP_R_16BF;
+#else
+    return HIPBLAS_R_16B;
+#endif
 }
 
 template <>
-constexpr hipblasDatatype_t hip_data_type_impl<float>()
+constexpr hipblasDataType hip_data_type_impl<float>()
 {
+#if HIP_VERSION >= 70000000
+    return HIP_R_32F;
+#else
     return HIPBLAS_R_32F;
+#endif
 }
 
 template <>
-constexpr hipblasDatatype_t hip_data_type_impl<double>()
+constexpr hipblasDataType hip_data_type_impl<double>()
 {
+#if HIP_VERSION >= 70000000
+    return HIP_R_64F;
+#else
     return HIPBLAS_R_64F;
+#endif
 }
 
 template <>
-constexpr hipblasDatatype_t hip_data_type_impl<std::complex<float>>()
+constexpr hipblasDataType hip_data_type_impl<std::complex<float>>()
 {
+#if HIP_VERSION >= 70000000
+    return HIP_C_32F;
+#else
     return HIPBLAS_C_32F;
+#endif
 }
 
 template <>
-constexpr hipblasDatatype_t hip_data_type_impl<std::complex<double>>()
+constexpr hipblasDataType hip_data_type_impl<std::complex<double>>()
 {
-    return HIPBLAS_C_64F;
+#if HIP_VERSION >= 70000000
+    return HIP_C_64F;
+#else
+    return HIPBLAS_C_64F;
+#endif
 }
 
 
@@ -335,7 +378,7 @@ constexpr hipblasDatatype_t hip_data_type_impl<std::complex<double>>()
  * @returns the actual `hipblasDatatype_t`
  */
 template <typename T>
-constexpr hipblasDatatype_t hip_data_type()
+constexpr detail::hipblasDataType hip_data_type()
 {
     return detail::hip_data_type_impl<T>();
 }
diff --git a/hip/test/base/hip_executor.hip.cpp b/hip/test/base/hip_executor.hip.cpp
index b923be021dd..649cb7e0ca4
--- a/hip/test/base/hip_executor.hip.cpp
+++ b/hip/test/base/hip_executor.hip.cpp
@@ -154,6 +154,13 @@ TEST_F(HipExecutor, FailsWhenOverallocating)
         gko::AllocationError);
 
     hip->free(ptr);
+#if HIP_VERSION >= 70000000
+    // After ROCm 7.0.0, hipSPARSE handle creation still observes the last
+    // error even though it has already been returned to the caller. We call
+    // hipGetLastError() to consume it. Related issue:
+    // https://github.com/ROCm/rocm-libraries/issues/2801
+    hipGetLastError();
+#endif
 }
diff --git a/test/solver/solver.cpp b/test/solver/solver.cpp
index 45ed3cc9120..e2b5c511f46
--- a/test/solver/solver.cpp
+++ b/test/solver/solver.cpp
@@ -947,7 +947,7 @@ TYPED_TEST(Solver, ApplyIsEquivalentToRef)
             solver.ref->apply(b.ref, x.ref);
             solver.dev->apply(b.dev, x.dev);
 
-            GKO_ASSERT_MTX_NEAR(x.ref, x.dev, this->tol(x));
+            GKO_ASSERT_MTX_NEAR(x.ref, x.dev, 1.2 * this->tol(x));
         });
     });
 }
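The hipGetLastError() call in the executor test deals with a "sticky" error: on ROCm >= 7 the failed allocation stays latched as the last error and resurfaces in later, unrelated status queries (such as hipSPARSE handle creation) until something reads it. A self-contained sketch of the pattern, with error handling trimmed for brevity:

    #include <hip/hip_runtime.h>

    #include <cstddef>
    #include <cstdio>

    int main()
    {
        void* ptr = nullptr;
        // Deliberately over-allocate; hipMalloc reports an out-of-memory
        // error to the caller.
        hipError_t err = hipMalloc(&ptr, ~std::size_t{0});
        std::printf("hipMalloc: %s\n", hipGetErrorString(err));
        // The failure also remains latched as the "last error". Reading it
        // once resets the latch, so subsequent calls start from a clean state.
        (void)hipGetLastError();
        return 0;
    }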