67 changes: 67 additions & 0 deletions .github/workflows/arm.yml
@@ -0,0 +1,67 @@
name: ARM-build

on:
push:
branches:
- 'main'
- 'master'
- 'develop'
- 'release/**'
tags:
- '**'
pull_request:
types: [opened,synchronize]
paths-ignore:
- 'doc/**'
workflow_dispatch:
inputs:
debug_enabled:
description: 'Run the build with tmate debugging enabled (set via the debug_enabled input; see https://github.com/marketplace/actions/debugging-with-tmate)'
required: false
default: false

concurrency:
group: ${{ github.workflow }}-${{ (github.head_ref && github.ref) || github.run_id }}
cancel-in-progress: true

jobs:
arm-omp:
strategy:
fail-fast: false
matrix:
config:
- {shared: "ON", build_type: "Debug", name: "arm/omp/debug/shared", mixed: "OFF", half: "ON", bfloat16: "OFF"}
- {shared: "OFF", build_type: "Release", name: "arm/omp/release/static", mixed: "ON", half: "ON", bfloat16: "OFF"}
- {shared: "ON", build_type: "Release", name: "arm/omp/release/shared", mixed: "ON", half: "OFF", bfloat16: "ON"}
- {shared: "ON", build_type: "Release", name: "arm/omp/release/shared-16bit", mixed: "ON", half: "ON", bfloat16: "ON"}
name: ${{ matrix.config.name }}
runs-on: [ubuntu-24.04-arm]

steps:
- name: Checkout the latest code (shallow clone)
uses: actions/checkout@v4

- name: info
run: |
g++ -v
cmake --version

- name: Debug over SSH (tmate)
uses: mxschmitt/action-tmate@v3
if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.debug_enabled }}

- name: configure
run: |
mkdir build
mkdir install
export INSTALL_PREFIX=`pwd`/install
cd build
cmake .. -DCMAKE_CXX_FLAGS="-Wpedantic -ffp-contract=off" -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }} -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }} -DGINKGO_ENABLE_HALF=${{ matrix.config.half }} -DGINKGO_ENABLE_BFLOAT16=${{ matrix.config.bfloat16 }}
make -j4
ctest -j4 --output-on-failure

- name: install
run: |
cd build
make install
make test_install
29 changes: 29 additions & 0 deletions .gitlab-ci.yml
@@ -218,6 +218,20 @@ build/cuda126/nompi/gcc/cuda/release/shared:
BUILD_TYPE: "Release"
MODULE_LOAD: "cmake/3.30.8 cuda/12.6.3 gcc/13.3.0"

build/cuda130/nompi/gcc/cuda/release/shared:
extends:
- .build_and_test_tum_template
- .default_variables
- .full_test_condition
- .use_tum-nvidia
variables:
BUILD_CUDA: "ON"
BUILD_HWLOC: "OFF"
ENABLE_HALF: "ON"
ENABLE_BFLOAT16: "ON"
BUILD_TYPE: "Release"
MODULE_LOAD: "cmake/3.30.8 cuda/13.0.2 gcc/14.3.0"

# ROCm 4.5 and friends
build/amd/nompi/gcc/rocm45/release/shared:
extends:
@@ -341,6 +355,21 @@ build/amd/openmpi/gcc/rocm634_wo_omp/release/shared:
BUILD_TYPE: "Release"
MODULE_LOAD: "cmake/3.29.6 rocm/6.3.4 gcc/13.3.0 openmpi/5.0.7"

# mi50 is not officially supported by ROCm >= 7
build/amd/nompi/gcc/rocm710/release/shared:
extends:
- .build_and_test_tum_template
- .default_variables
- .full_test_condition
- .use_tum-amd-mi210
variables:
BUILD_HIP: "ON"
BUILD_HWLOC: "OFF"
BUILD_MPI: "OFF"
BUILD_OMP: "OFF"
BUILD_TYPE: "Release"
MODULE_LOAD: "cmake/3.29.6 rocm/7.1.0 gcc/14.3.0"

# no cuda but latest gcc and clang
build/nocuda/nompi/gcc/core/debug/static:
extends:
6 changes: 6 additions & 0 deletions .gitlab/image.yml
@@ -79,6 +79,12 @@
- amd-gpus
- tum

.use_tum-amd-mi210:
image: rocky_tum
tags:
- amd-gpus-mi210
- tum

.use_tum-intel:
image: rocky_tum_intel
tags:
8 changes: 7 additions & 1 deletion .gitlab/scripts.yml
@@ -12,7 +12,13 @@
- export CCACHE_DIR=${CCACHE_DIR}
- export CCACHE_MAXSIZE=${CCACHE_MAXSIZE}
- source /storage/apps/opt/spack/share/spack/setup-env.sh
- export MODULEPATH=/storage/apps/opt/rocm-modules:/storage/apps/opt/spack/share/spack/lmod/linux-rocky9-x86_64/Core
- mkdir -p lmod/cuda
- echo 'prepend_path("PATH","/storage/apps/usr/local/cuda-13.0.2/bin")' > lmod/cuda/13.0.2.lua
- echo 'prepend_path("CMAKE_PREFIX_PATH","/storage/apps/usr/local/cuda-13.0.2/.")' >> lmod/cuda/13.0.2.lua
- echo 'setenv("CUDA_HOME","/storage/apps/usr/local/cuda-13.0.2")' >> lmod/cuda/13.0.2.lua
- echo 'setenv("NVHPC_CUDA_HOME","/storage/apps/usr/local/cuda-13.0.2")' >> lmod/cuda/13.0.2.lua
- export MODULEPATH="$(pwd)/lmod":/storage/apps/opt/rocm-modules:/storage/apps/opt/spack/share/spack/lmod/linux-rocky9-x86_64/Core
- module av

.before_script_git_template:
before_script:
12 changes: 12 additions & 0 deletions CMakeLists.txt
@@ -234,6 +234,18 @@ if(GINKGO_BUILD_HIP)
"Disable custom thrust namespace for hip before 5.7 because hip does not fully support it before 5.7"
)
set(GINKGO_HIP_CUSTOM_THRUST_NAMESPACE OFF)
elseif(
GINKGO_HIP_PLATFORM_AMD
AND GINKGO_HIP_VERSION VERSION_GREATER_EQUAL 7.1
AND GINKGO_HIP_VERSION VERSION_LESS 7.2
)
# https://github.com/ROCm/rocm-libraries/pull/1769 should fix this issue in ROCm 7.1.1.
# The HIP version does not exactly match the ROCm version number, so we need to
# wait for ROCm 7.1.1 before we can set the proper range for ROCm 7.1.0.
message(
STATUS
"Disable custom thrust namespace for hip 7.1 because hip does not fully support the custom namespace."
)
set(GINKGO_HIP_CUSTOM_THRUST_NAMESPACE OFF)
else()
message(STATUS "Enable custom thrust namespace for hip")
set(GINKGO_HIP_CUSTOM_THRUST_NAMESPACE ON)
3 changes: 3 additions & 0 deletions benchmark/CMakeLists.txt
@@ -40,6 +40,9 @@ function(ginkgo_benchmark_cusparse_linops type def)
cusparse_linops_${type}
PRIVATE Ginkgo::ginkgo CUDA::cudart CUDA::cublas CUDA::cusparse
)
if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 13)
target_link_libraries(cusparse_linops_${type} PRIVATE Thrust)
endif()
ginkgo_compile_features(cusparse_linops_${type})
endfunction()

5 changes: 5 additions & 0 deletions cmake/cuda.cmake
@@ -14,6 +14,11 @@ endif()

find_package(NVTX REQUIRED)

if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 13)
find_package(Thrust REQUIRED)
thrust_create_target(Thrust)
[inline review comment, Member] Shouldn't we also add this to ginkgo_cuda?

[reply, Member/Author] I think we do not need it, because those sources are compiled by NVCC. I still apply it there to get a consistent setup.

endif()

if(
CMAKE_CUDA_HOST_COMPILER
AND NOT CMAKE_CXX_COMPILER STREQUAL CMAKE_CUDA_HOST_COMPILER
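On the consumer side of this change: with CUDA >= 13 the Thrust/CCCL headers are no longer picked up implicitly by host-compiled translation units, which is presumably why the benchmark and cuda targets in this PR link the Thrust interface target explicitly. A minimal sketch of that pattern for a hypothetical host-compiled target (uses_thrust_host and host_only.cpp are illustrative names, not part of this PR); it assumes the find_package(Thrust)/thrust_create_target(Thrust) setup from cmake/cuda.cmake above:

# host_only.cpp is a plain C++ file that includes <thrust/...> headers
add_library(uses_thrust_host host_only.cpp)
target_link_libraries(uses_thrust_host PRIVATE CUDA::cudart)
if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 13)
    # NVCC-compiled sources find Thrust on their own; host compilers need
    # the interface target to get the include paths.
    target_link_libraries(uses_thrust_host PRIVATE Thrust)
endif()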
120 changes: 67 additions & 53 deletions common/cuda_hip/solver/cb_gmres_kernels.cpp
@@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

@@ -623,14 +623,18 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
const auto block_dim = default_block_size;
constexpr auto block_size = default_block_size;

initialize_kernel<block_size>
<<<grid_dim, block_dim, 0, exec->get_stream()>>>(
b->get_size()[0], b->get_size()[1], krylov_dim,
as_device_type(b->get_const_values()), b->get_stride(),
as_device_type(residual->get_values()), residual->get_stride(),
as_device_type(givens_sin->get_values()), givens_sin->get_stride(),
as_device_type(givens_cos->get_values()), givens_cos->get_stride(),
as_device_type(stop_status->get_data()));
if (grid_dim != 0) {
initialize_kernel<block_size>
<<<grid_dim, block_dim, 0, exec->get_stream()>>>(
b->get_size()[0], b->get_size()[1], krylov_dim,
as_device_type(b->get_const_values()), b->get_stride(),
as_device_type(residual->get_values()), residual->get_stride(),
as_device_type(givens_sin->get_values()),
givens_sin->get_stride(),
as_device_type(givens_cos->get_values()),
givens_cos->get_stride(),
as_device_type(stop_status->get_data()));
}
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_BASE(
@@ -661,12 +665,14 @@ void restart(std::shared_ptr<const DefaultExecutor> exec,
constexpr auto block_size = default_block_size;
const auto stride_arnoldi = arnoldi_norm->get_stride();

restart_1_kernel<block_size>
<<<grid_dim_1, block_dim, 0, exec->get_stream()>>>(
residual->get_size()[0], residual->get_size()[1], krylov_dim,
acc::as_device_range(krylov_bases),
as_device_type(residual_norm_collection->get_values()),
residual_norm_collection->get_stride());
if (grid_dim_1 != 0) {
restart_1_kernel<block_size>
<<<grid_dim_1, block_dim, 0, exec->get_stream()>>>(
residual->get_size()[0], residual->get_size()[1], krylov_dim,
acc::as_device_range(krylov_bases),
as_device_type(residual_norm_collection->get_values()),
residual_norm_collection->get_stride());
}
kernels::GKO_DEVICE_NAMESPACE::dense::compute_norm2_dispatch(
exec, residual, residual_norm, reduction_tmp);

@@ -695,21 +701,23 @@ void restart(std::shared_ptr<const DefaultExecutor> exec,
2 * stride_arnoldi),
stride_arnoldi, acc::as_device_range(krylov_bases));
}

const auto grid_dim_2 =
ceildiv(std::max<size_type>(num_rows, 1) * krylov_stride[1],
default_block_size);
restart_2_kernel<block_size>
<<<grid_dim_2, block_dim, 0, exec->get_stream()>>>(
residual->get_size()[0], residual->get_size()[1],
as_device_type(residual->get_const_values()),
residual->get_stride(),
as_device_type(residual_norm->get_const_values()),
as_device_type(residual_norm_collection->get_values()),
acc::as_device_range(krylov_bases),
as_device_type(next_krylov_basis->get_values()),
next_krylov_basis->get_stride(),
as_device_type(final_iter_nums->get_data()));

if (grid_dim_2 != 0) {
restart_2_kernel<block_size>
<<<grid_dim_2, block_dim, 0, exec->get_stream()>>>(
residual->get_size()[0], residual->get_size()[1],
as_device_type(residual->get_const_values()),
residual->get_stride(),
as_device_type(residual_norm->get_const_values()),
as_device_type(residual_norm_collection->get_values()),
acc::as_device_range(krylov_bases),
as_device_type(next_krylov_basis->get_values()),
next_krylov_basis->get_stride(),
as_device_type(final_iter_nums->get_data()));
}
}

GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_RESTART_KERNEL);
@@ -919,18 +927,21 @@ void givens_rotation(std::shared_ptr<const DefaultExecutor> exec,
const auto block_dim = block_size;
const auto grid_dim =
static_cast<unsigned int>(ceildiv(num_cols, block_size));

givens_rotation_kernel<block_size>
<<<grid_dim, block_dim, 0, exec->get_stream()>>>(
hessenberg_iter->get_size()[0], hessenberg_iter->get_size()[1],
iter, as_device_type(hessenberg_iter->get_values()),
hessenberg_iter->get_stride(),
as_device_type(givens_sin->get_values()), givens_sin->get_stride(),
as_device_type(givens_cos->get_values()), givens_cos->get_stride(),
as_device_type(residual_norm->get_values()),
as_device_type(residual_norm_collection->get_values()),
residual_norm_collection->get_stride(),
stop_status->get_const_data());
if (grid_dim != 0) {
givens_rotation_kernel<block_size>
<<<grid_dim, block_dim, 0, exec->get_stream()>>>(
hessenberg_iter->get_size()[0], hessenberg_iter->get_size()[1],
iter, as_device_type(hessenberg_iter->get_values()),
hessenberg_iter->get_stride(),
as_device_type(givens_sin->get_values()),
givens_sin->get_stride(),
as_device_type(givens_cos->get_values()),
givens_cos->get_stride(),
as_device_type(residual_norm->get_values()),
as_device_type(residual_norm_collection->get_values()),
residual_norm_collection->get_stride(),
stop_status->get_const_data());
}
}


@@ -949,12 +960,14 @@ void arnoldi(std::shared_ptr<const DefaultExecutor> exec,
array<stopping_status>* reorth_status,
array<size_type>* num_reorth)
{
increase_final_iteration_numbers_kernel<<<
static_cast<unsigned int>(
ceildiv(final_iter_nums->get_size(), default_block_size)),
default_block_size, 0, exec->get_stream()>>>(
as_device_type(final_iter_nums->get_data()),
stop_status->get_const_data(), final_iter_nums->get_size());
if (final_iter_nums->get_size() != 0) {
increase_final_iteration_numbers_kernel<<<
static_cast<unsigned int>(
ceildiv(final_iter_nums->get_size(), default_block_size)),
default_block_size, 0, exec->get_stream()>>>(
as_device_type(final_iter_nums->get_data()),
stop_status->get_const_data(), final_iter_nums->get_size());
}
finish_arnoldi_CGS(exec, next_krylov_basis, krylov_bases, hessenberg_iter,
buffer_iter, arnoldi_norm, iter,
stop_status->get_const_data(), reorth_status->get_data(),
@@ -1007,14 +1020,15 @@ void calculate_qy(std::shared_ptr<const DefaultExecutor> exec,
const auto grid_dim = static_cast<unsigned int>(
ceildiv(num_rows * stride_before_preconditioner, block_size));
const auto block_dim = block_size;

calculate_Qy_kernel<block_size>
<<<grid_dim, block_dim, 0, exec->get_stream()>>>(
num_rows, num_cols, acc::as_device_range(krylov_bases),
as_device_type(y->get_const_values()), y->get_stride(),
as_device_type(before_preconditioner->get_values()),
stride_before_preconditioner,
as_device_type(final_iter_nums->get_const_data()));
if (grid_dim != 0) {
calculate_Qy_kernel<block_size>
<<<grid_dim, block_dim, 0, exec->get_stream()>>>(
num_rows, num_cols, acc::as_device_range(krylov_bases),
as_device_type(y->get_const_values()), y->get_stride(),
as_device_type(before_preconditioner->get_values()),
stride_before_preconditioner,
as_device_type(final_iter_nums->get_const_data()));
}
// Calculate qy
// before_preconditioner = krylov_bases * y
}
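The recurring change in this file wraps every raw kernel launch in an if (grid_dim != 0) guard: for empty inputs, ceildiv yields a grid dimension of zero, and a <<<0, ...>>> launch is an invalid configuration that the CUDA/HIP runtime rejects. A minimal self-contained sketch of the pattern in a .cu translation unit (scale_kernel and scale are illustrative names, not from the PR):

__global__ void scale_kernel(int n, double* x, double alpha)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        x[i] *= alpha;
    }
}

void scale(int n, double* x, double alpha, cudaStream_t stream)
{
    constexpr int block_size = 256;
    // equivalent of ceildiv(n, block_size): zero when n == 0
    const auto grid_dim =
        static_cast<unsigned int>((n + block_size - 1) / block_size);
    // skip the launch entirely for empty inputs instead of triggering
    // an invalid-configuration error from the runtime
    if (grid_dim != 0) {
        scale_kernel<<<grid_dim, block_size, 0, stream>>>(n, x, alpha);
    }
}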
3 changes: 3 additions & 0 deletions cuda/CMakeLists.txt
@@ -156,6 +156,9 @@ target_link_libraries(
CUDA::cufft
nvtx::nvtx
)
if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 13)
target_link_libraries(ginkgo_cuda PRIVATE Thrust)
endif()
# NVTX3 is header-only and requires dlopen/dlclose in static builds
target_link_libraries(ginkgo_cuda PUBLIC ginkgo_device ${CMAKE_DL_LIBS})

8 changes: 6 additions & 2 deletions hip/base/config.hip.hpp
@@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

@@ -32,7 +32,11 @@ struct config {
* `device_functions.h`.
*/
#if GINKGO_HIP_PLATFORM_HCC
static constexpr uint32 warp_size = warpSize;
// Workaround for ROCm >= 7, which no longer provides warpSize as a
// compile-time constant. We cannot define warpSize via a compiler flag,
// because amd_warp_functions.h also defines a struct variable called
// warpSize. There is no support for warp size 32 on AMD GPUs yet.
static constexpr uint32 warp_size = 64;
#else // GINKGO_HIP_PLATFORM_NVCC
static constexpr uint32 warp_size = 32;
#endif
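Since the constant is now hard-coded rather than taken from the compiler, a runtime sanity check against the actual device is one way to catch a mismatch on future hardware. A minimal sketch (check_warp_size is illustrative, not part of the PR), assuming HIP's runtime API:

#include <hip/hip_runtime.h>
#include <cassert>

void check_warp_size(int device_id)
{
    hipDeviceProp_t prop;
    // hipGetDeviceProperties reports the warp size of the physical device
    if (hipGetDeviceProperties(&prop, device_id) == hipSuccess) {
        // must match the hard-coded config::warp_size (64 on AMD GPUs)
        assert(prop.warpSize == 64);
    }
}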