diff --git a/CMakeLists.txt b/CMakeLists.txt index 601570c24ab..cb95f8cdcb2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -101,6 +101,45 @@ set(USE_ACCEL CACHE STRING "Build with acceleration support (default: none)") set_property(CACHE USE_ACCEL PROPERTY STRINGS "" opencl cuda hip) +# Add new multi-architecture option +option(MULTI_ARCH "Build for multiple GPU architectures" OFF) + +# Multi-architecture support (only when enabled) +if (MULTI_ARCH AND MULTI_GPU_BUILD) + set(WITH_GPU_LIST + "P100" + CACHE STRING + "List of GPU architectures to build for (semicolon-separated)") + + # Define supported architectures for the property + set(SUPPORTED_CUDA_ARCHITECTURES + K20X + K40 + K80 + P100 + V100 + A100 + H100) + set(SUPPORTED_HIP_ARCHITECTURES Mi50 Mi100 Mi250 Mi300) + + set_property( + CACHE WITH_GPU_LIST PROPERTY STRINGS ${SUPPORTED_CUDA_ARCHITECTURES} + ${SUPPORTED_HIP_ARCHITECTURES}) + + # Parse the list + string(REPLACE ";" " " WITH_GPU_LIST_STR "${WITH_GPU_LIST}") + list(LENGTH WITH_GPU_LIST GPU_COUNT) + + if (GPU_COUNT GREATER 1) + set(MULTI_GPU_BUILD ON) + message(STATUS "Multi-GPU build enabled for: ${WITH_GPU_LIST_STR}") + else () + set(MULTI_GPU_BUILD OFF) + list(GET WITH_GPU_LIST 0 WITH_GPU) + message(STATUS "Single GPU build for: ${WITH_GPU}") + endif () +endif () + set(SUPPORTED_CUDA_ARCHITECTURES K20X K40 @@ -110,16 +149,24 @@ set(SUPPORTED_CUDA_ARCHITECTURES A100 H100) set(SUPPORTED_HIP_ARCHITECTURES Mi50 Mi100 Mi250 Mi300) -set(WITH_GPU - $,"","P100"> - CACHE - STRING - "Select GPU arch. and embed parameters (default: CUDA/HIP=P100, OPENCL=all)" -) -set(WITH_GPU_PARAMS "${WITH_GPU}") -set_property(CACHE WITH_GPU PROPERTY STRINGS ${SUPPORTED_CUDA_ARCHITECTURES} - ${SUPPORTED_HIP_ARCHITECTURES}) +if (NOT MULTI_ARCH) + set(WITH_GPU + $,"","P100"> + CACHE + STRING + "Select GPU arch. and embed parameters (default: CUDA/HIP=P100, OPENCL=all)" + ) + set(WITH_GPU_PARAMS "${WITH_GPU}") + set_property(CACHE WITH_GPU PROPERTY STRINGS ${SUPPORTED_CUDA_ARCHITECTURES} + ${SUPPORTED_HIP_ARCHITECTURES}) +else () + # For multi-arch builds, set WITH_GPU_PARAMS to first architecture for + # compatibility + list(GET WITH_GPU_LIST 0 WITH_GPU_PARAMS) + list(GET WITH_GPU_LIST 0 WITH_GPU) # Set WITH_GPU for compatibility with + # existing code +endif () option(WITH_CUDA_PROFILING "Enable profiling within CUDA" OFF) option(WITH_HIP_PROFILING "Enable profiling within HIP" OFF) @@ -298,12 +345,42 @@ if (USE_ACCEL MATCHES "hip") endif () enable_language(HIP) - # Make sure the GPU required is supported - list(FIND SUPPORTED_HIP_ARCHITECTURES ${WITH_GPU} GPU_SUPPORTED) - if (GPU_SUPPORTED EQUAL -1) - message( - FATAL_ERROR "GPU architecture requested (${WITH_GPU}) is not supported. " - "Please choose from: ${SUPPORTED_HIP_ARCHITECTURES}") + if (MULTI_ARCH AND MULTI_GPU_BUILD) + # Validate all GPU architectures in the list + foreach (GPU_ARCH IN LISTS WITH_GPU_LIST) + list(FIND SUPPORTED_HIP_ARCHITECTURES ${GPU_ARCH} GPU_SUPPORTED) + if (GPU_SUPPORTED EQUAL -1) + message( + FATAL_ERROR + "GPU architecture requested (${GPU_ARCH}) is not supported. " + "Please choose from: ${SUPPORTED_HIP_ARCHITECTURES}") + endif () + endforeach () + + set(ACC_ARCH_NUMBERS "") + foreach (GPU_ARCH IN LISTS WITH_GPU_LIST) + list(APPEND ACC_ARCH_NUMBERS ${GPU_ARCH_NUMBER_${GPU_ARCH}}) + endforeach () + + message(STATUS "Multi-GPU HIP build for architectures: ${WITH_GPU_LIST}") + message(STATUS "HIP architecture numbers: ${ACC_ARCH_NUMBERS}") + message(STATUS "Kernel parameters will be generated for: ${WITH_GPU_LIST}") + + else () + # Make sure the GPU required is supported + list(FIND SUPPORTED_HIP_ARCHITECTURES ${WITH_GPU} GPU_SUPPORTED) + if (GPU_SUPPORTED EQUAL -1) + message( + FATAL_ERROR + "GPU architecture requested (${WITH_GPU}) is not supported. " + "Please choose from: ${SUPPORTED_HIP_ARCHITECTURES}") + endif () + + set(ACC_ARCH_NUMBER ${GPU_ARCH_NUMBER_${WITH_GPU}}) + message(STATUS "GPU target architecture: " ${WITH_GPU}) + message(STATUS "Kernel parameters: " ${WITH_GPU_PARAMS}) + message(STATUS "GPU architecture number: " ${ACC_ARCH_NUMBER}) + message(STATUS "GPU profiling enabled: " ${WITH_HIP_PROFILING}) endif () # ROCm is typically installed in /opt/rocm; otherwise let the user set @@ -329,12 +406,6 @@ if (USE_ACCEL MATCHES "hip") message(FATAL_ERROR "HIP version >= 4.4.0 is required.") endif () - set(ACC_ARCH_NUMBER ${GPU_ARCH_NUMBER_${WITH_GPU}}) - message(STATUS "GPU target architecture: " ${WITH_GPU}) - message(STATUS "Kernel parameters: " ${WITH_GPU_PARAMS}) - message(STATUS "GPU architecture number: " ${ACC_ARCH_NUMBER}) - message(STATUS "GPU profiling enabled: " ${WITH_HIP_PROFILING}) - # =================================== BLAS on GPU backend find_package(hipblas CONFIG REQUIRED HINTS ${ROCM_PATH}) endif () diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4602195f8d9..67e8be86583 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -105,11 +105,7 @@ add_fypp_sources( utils/dbcsr_toollib.F work/dbcsr_work_operations.F) -set(DBCSR_HIP_AND_CUDA_SRCS - acc/libsmm_acc/libsmm_acc_benchmark.cpp - acc/libsmm_acc/libsmm_acc_init.cpp - acc/libsmm_acc/libsmm_acc.cpp - acc/cuda_hip/calculate_norms.cpp +set(DBCSR_HIP_AND_CUDA_COMMON_SRCS acc/cuda_hip/acc_blas.cpp acc/cuda_hip/acc_dev.cpp acc/cuda_hip/acc_error.cpp @@ -119,10 +115,16 @@ set(DBCSR_HIP_AND_CUDA_SRCS acc/cuda_hip/acc_mem.cpp acc/cuda_hip/acc_stream.cpp) -set(DBCSR_CUDA_SRCS ${DBCSR_HIP_AND_CUDA_SRCS} acc/cuda/acc_cuda.cpp - acc/cuda/dbcsr_cuda_nvtx_cu.cpp) +set(DBCSR_ARCH_DEPENDENT_SRCS + acc/cuda_hip/calculate_norms.cpp acc/libsmm_acc/libsmm_acc_benchmark.cpp + acc/libsmm_acc/libsmm_acc_init.cpp acc/libsmm_acc/libsmm_acc.cpp) -set(DBCSR_HIP_SRCS ${DBCSR_HIP_AND_CUDA_SRCS} acc/hip/acc_hip.cpp) +set(DBCSR_CUDA_SRCS + ${DBCSR_HIP_AND_CUDA_COMMON_SRCS} ${DBCSR_ARCH_DEPENDENT_SRCS} + acc/cuda/acc_cuda.cpp acc/cuda/dbcsr_cuda_nvtx_cu.cpp) + +set(DBCSR_HIP_SRCS ${DBCSR_HIP_AND_CUDA_COMMON_SRCS} + ${DBCSR_ARCH_DEPENDENT_SRCS} acc/hip/acc_hip.cpp) if (USE_ACCEL MATCHES "hip") set_source_files_properties(acc/cuda_hip/calculate_norms.cpp @@ -168,123 +170,237 @@ endif () # ================================================================================================= # DBCSR LIBRARY -add_library(dbcsr ${DBCSR_SRCS}) - -# -fPIC can also be used in the static case. Addresses are resolved during the -# linking process -set_target_properties( - dbcsr - PROPERTIES VERSION ${dbcsr_VERSION} - SOVERSION ${dbcsr_APIVERSION} - POSITION_INDEPENDENT_CODE ON) +if (MULTI_ARCH AND MULTI_GPU_BUILD) + # Multi-architecture build: create separate libraries per GPU architecture + + # First, create a common library with all architecture-independent code + set(DBCSR_COMMON_SRCS ${DBCSR_FORTRAN_SRCS}) + if (USE_ACCEL MATCHES "cuda") + # Add only common CUDA sources (manually specify to exclude arch-specific + # ones) + list(APPEND DBCSR_COMMON_SRCS ${DBCSR_HIP_AND_CUDA_COMMON_SRCS} + acc/cuda/acc_cuda.cpp acc/cuda/dbcsr_cuda_nvtx_cu.cpp) + # Note: calculate_norms.cpp and libsmm_acc files are intentionally excluded + elseif (USE_ACCEL MATCHES "hip") + # Add only common HIP sources (manually specify to exclude arch-specific + # ones) + list(APPEND DBCSR_COMMON_SRCS ${DBCSR_HIP_AND_CUDA_COMMON_SRCS} + acc/hip/acc_hip.cpp) + # Note: calculate_norms.cpp and libsmm_acc files are intentionally excluded + elseif (USE_ACCEL MATCHES "opencl") + list(APPEND DBCSR_COMMON_SRCS ${DBCSR_OPENCL_SRCS}) + endif () -if (USE_ACCEL MATCHES "hip") - set_target_properties(dbcsr PROPERTIES HIP_ARCHITECTURES "${ACC_ARCH_NUMBER}") -elseif (USE_ACCEL MATCHES "cuda") - set_target_properties(dbcsr PROPERTIES CUDA_ARCHITECTURES - "${ACC_ARCH_NUMBER}") -endif () + # Create common object library (compiled once, reused for all architectures) + add_library(dbcsr_common OBJECT ${DBCSR_COMMON_SRCS}) + + # Set common properties + set_target_properties(dbcsr_common PROPERTIES POSITION_INDEPENDENT_CODE ON) + + # Now create separate libraries for each GPU architecture + foreach (GPU_ARCH IN LISTS WITH_GPU_LIST) + # Get architecture number for this GPU + set(ARCH_NUM ${GPU_ARCH_NUMBER_${GPU_ARCH}}) + + # Create architecture-specific object library for calculate_norms.cpp + add_library(dbcsr_arch_${GPU_ARCH} OBJECT) + + if (USE_ACCEL MATCHES "hip|cuda") + target_sources(dbcsr_arch_${GPU_ARCH} + PRIVATE ${DBCSR_ARCH_DEPENDENT_SRCS}) + + target_include_directories( + dbcsr_arch_${GPU_ARCH} + PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/acc/libsmm_acc + ${CMAKE_CURRENT_SOURCE_DIR}/acc/libsmm_acc) + + if (USE_ACCEL MATCHES "hip") + set_source_files_properties(acc/cuda_hip/calculate_norms.cpp + PROPERTIES LANGUAGE HIP) + set_source_files_properties(acc/cuda_hip/calculate_norms.cpp + PROPERTIES COMPILE_FLAGS "-fPIE") + set_target_properties(dbcsr_arch_${GPU_ARCH} + PROPERTIES HIP_ARCHITECTURES "${ARCH_NUM}") + elseif (USE_ACCEL MATCHES "cuda") + set_source_files_properties(acc/cuda_hip/calculate_norms.cpp + PROPERTIES LANGUAGE CUDA) + set_source_files_properties(acc/cuda_hip/calculate_norms.cpp + PROPERTIES COMPILE_FLAGS "--x cu") + set_target_properties(dbcsr_arch_${GPU_ARCH} + PROPERTIES CUDA_ARCHITECTURES "${ARCH_NUM}") + endif () + + # Set architecture-specific compile definitions + target_compile_definitions( + dbcsr_arch_${GPU_ARCH} + PRIVATE __DBCSR_ACC + $<$:__CUDA> + $<$:__HIP> + ARCH_NUMBER=${ARCH_NUM} + $<$:__CUDA_PROFILING> + $<$:__HIP_PROFILING>) + endif () + + # Create the final library combining common + arch-specific code + add_library(dbcsr_${GPU_ARCH} $ + $) + + # Set library properties + set_target_properties( + dbcsr_${GPU_ARCH} + PROPERTIES VERSION ${dbcsr_VERSION} + SOVERSION ${dbcsr_APIVERSION} + POSITION_INDEPENDENT_CODE ON + OUTPUT_NAME "dbcsr_${GPU_ARCH}" + EXPORT_NAME "dbcsr_${GPU_ARCH}") + + set_target_properties(dbcsr_${GPU_ARCH} PROPERTIES LINKER_LANGUAGE Fortran) + + message( + STATUS + "Created library target: dbcsr_${GPU_ARCH} for architecture ${GPU_ARCH} (${ARCH_NUM})" + ) + endforeach () -if (USE_SMM MATCHES "libxsmm" OR (USE_SMM MATCHES "auto" AND LIBXSMM_FOUND)) - target_compile_definitions(dbcsr PRIVATE __LIBXSMM) - target_link_directories(dbcsr PUBLIC ${LIBXSMM_LIBRARY_DIRS}) - if (USE_OPENMP) - target_link_libraries(dbcsr PRIVATE PkgConfig::LIBXSMMEXT) - endif () - target_link_libraries(dbcsr PRIVATE PkgConfig::LIBXSMM) - target_link_libraries(dbcsr PRIVATE ${BLAS_LIBRARIES}) -endif () + # Create a convenience target that builds all GPU variants + add_custom_target(dbcsr_all_gpus) + foreach (GPU_ARCH IN LISTS WITH_GPU_LIST) + add_dependencies(dbcsr_all_gpus dbcsr_${GPU_ARCH}) + endforeach () -if (BLAS_LIBRARIES MATCHES "mkl_") - target_compile_definitions(dbcsr PRIVATE __MKL) -endif () + # Set the first GPU architecture as the default "dbcsr" target for + # compatibility + list(GET WITH_GPU_LIST 0 FIRST_GPU_ARCH) + add_library(dbcsr ALIAS dbcsr_${FIRST_GPU_ARCH}) + message(STATUS "Default 'dbcsr' target aliased to: dbcsr_${FIRST_GPU_ARCH}") -if (APPLE) - # fix /proc/self/statm can not be opened on macOS - target_compile_definitions(dbcsr PRIVATE __NO_STATM_ACCESS) +else () + # Single architecture build (existing logic) + add_library(dbcsr ${DBCSR_SRCS}) - if (BLAS_LIBRARIES MATCHES "Accelerate") - target_compile_definitions(dbcsr PRIVATE __ACCELERATE) - endif () -endif () + # Set properties for single arch build + set_target_properties( + dbcsr + PROPERTIES VERSION ${dbcsr_VERSION} + SOVERSION ${dbcsr_APIVERSION} + POSITION_INDEPENDENT_CODE ON) -# set -DNDEBUG for Release builds -target_compile_definitions(dbcsr PRIVATE $<$:NDEBUG>) - -target_link_libraries(dbcsr PRIVATE ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) -target_include_directories( - dbcsr PRIVATE base) # do not export those includes, but some srcs do an - # unprefixed include -# make sure dependencies of dbcsr find the dbcsr_api.mod file plus some files -# they usually include: -target_include_directories( - dbcsr - PUBLIC $ - $ - $) -target_compile_definitions(dbcsr PRIVATE __STATM_TOTAL) -set_target_properties(dbcsr PROPERTIES LINKER_LANGUAGE Fortran) - -if (MPI_FOUND) - # once built, a user of the dbcsr library can not influence anything anymore - # by setting those flags: - target_compile_definitions(dbcsr PRIVATE __parallel) - - # If requested, use the MPI_F08 module - if (USE_MPI_F08) - target_compile_definitions(dbcsr PRIVATE __USE_MPI_F08) + if (USE_ACCEL MATCHES "hip") + set_target_properties(dbcsr PROPERTIES HIP_ARCHITECTURES + "${ACC_ARCH_NUMBER}") + elseif (USE_ACCEL MATCHES "cuda") + set_target_properties(dbcsr PROPERTIES CUDA_ARCHITECTURES + "${ACC_ARCH_NUMBER}") endif () - # Instead of resetting the compiler for MPI, we are adding the compiler flags - # otherwise added by the mpifort-wrapper directly; based on hints from: - # https://cmake.org/pipermail/cmake/2012-June/050991.html Here we assume that - # the MPI implementation found uses the same compiler as the Fortran compiler - # we found prior. Otherwise we might be adding incompatible compiler flags at - # this point. when built against MPI, a dbcsr consumer has to specify the MPI - # flags as well, therefore: PUBLIC - target_link_libraries(dbcsr PUBLIC MPI::MPI_Fortran) + set_target_properties(dbcsr PROPERTIES LINKER_LANGUAGE Fortran) endif () -target_link_libraries( - dbcsr - PRIVATE $<$:OpenMP::OpenMP_C> - $<$:OpenMP::OpenMP_CXX> - $<$:OpenMP::OpenMP_Fortran>) +# ================================================================================================= +# APPLY COMMON SETTINGS TO ALL TARGETS + +# Function to apply common settings to a target +function (apply_common_dbcsr_settings target_name) + # Apply all the existing settings... + if (USE_SMM MATCHES "libxsmm" OR (USE_SMM MATCHES "auto" AND LIBXSMM_FOUND)) + target_compile_definitions(${target_name} PRIVATE __LIBXSMM) + target_link_directories(${target_name} PUBLIC ${LIBXSMM_LIBRARY_DIRS}) + if (USE_OPENMP) + target_link_libraries(${target_name} PRIVATE PkgConfig::LIBXSMMEXT) + endif () + target_link_libraries(${target_name} PRIVATE PkgConfig::LIBXSMM) + target_link_libraries(${target_name} PRIVATE ${BLAS_LIBRARIES}) + endif () -# todo, make this a bit better with opencl. -if (USE_ACCEL MATCHES "cuda|hip") - add_subdirectory(acc/libsmm_acc) -endif () + if (BLAS_LIBRARIES MATCHES "mkl_") + target_compile_definitions(${target_name} PRIVATE __MKL) + endif () -if (USE_ACCEL MATCHES "opencl") - add_subdirectory(acc/opencl/smm) -endif () + if (APPLE) + target_compile_definitions(${target_name} PRIVATE __NO_STATM_ACCESS) + if (BLAS_LIBRARIES MATCHES "Accelerate") + target_compile_definitions(${target_name} PRIVATE __ACCELERATE) + endif () + endif () -if (USE_ACCEL) - target_compile_definitions( - dbcsr - PRIVATE __DBCSR_ACC - $<$:__CUDA> - $<$:__OPENCL> - $<$:ARCH_NUMBER=${ACC_ARCH_NUMBER}> - $<$:__HIP> - $<$:ARCH_NUMBER=${ACC_ARCH_NUMBER}> - $<$:__CUDA_PROFILING> - $<$:__HIP_PROFILING>) + target_compile_definitions(${target_name} PRIVATE $<$:NDEBUG>) + target_link_libraries(${target_name} PRIVATE ${BLAS_LIBRARIES} + ${LAPACK_LIBRARIES}) + target_include_directories(${target_name} PRIVATE base) + target_include_directories( + ${target_name} + PUBLIC $ + $ + $) + target_compile_definitions(${target_name} PRIVATE __STATM_TOTAL) + + if (MPI_FOUND) + target_compile_definitions(${target_name} PRIVATE __parallel) + if (USE_MPI_F08) + target_compile_definitions(${target_name} PRIVATE __USE_MPI_F08) + endif () + target_link_libraries(${target_name} PUBLIC MPI::MPI_Fortran) + endif () target_link_libraries( - dbcsr - PRIVATE $<$:CUDA::cudart> - $<$:CUDA::cuda_driver> - $<$:CUDA::cublas> - $<$:CUDA::nvrtc> - $<$:CUDA::nvToolsExt> - $<$:roc::hipblas> - $<$:hiprtc> - $<$:hip::host> - $<$:roctx64> - $<$:roctracer64> - $<$:OpenCL::OpenCL>) + ${target_name} + PRIVATE $<$:OpenMP::OpenMP_C> + $<$:OpenMP::OpenMP_CXX> + $<$:OpenMP::OpenMP_Fortran>) + + if (USE_ACCEL) + # For multi-arch, we already set ARCH_NUMBER per target, for single-arch use + # existing logic + if (NOT MULTI_ARCH) + target_compile_definitions( + ${target_name} + PRIVATE __DBCSR_ACC + $<$:__CUDA> + $<$:__OPENCL> + $<$:ARCH_NUMBER=${ACC_ARCH_NUMBER}> + $<$:__HIP> + $<$:ARCH_NUMBER=${ACC_ARCH_NUMBER}> + $<$:__CUDA_PROFILING> + $<$:__HIP_PROFILING>) + else () + # For multi-arch common target, set basic acceleration flags + target_compile_definitions( + ${target_name} + PRIVATE __DBCSR_ACC + $<$:__CUDA> + $<$:__OPENCL> + $<$:__HIP> + $<$:__CUDA_PROFILING> + $<$:__HIP_PROFILING>) + endif () + + target_link_libraries( + ${target_name} + PRIVATE $<$:CUDA::cudart> + $<$:CUDA::cuda_driver> + $<$:CUDA::cublas> + $<$:CUDA::nvrtc> + $<$:CUDA::nvToolsExt> + $<$:roc::hipblas> + $<$:hiprtc> + $<$:hip::host> + $<$:roctx64> + $<$:roctracer64> + $<$:OpenCL::OpenCL>) + endif () +endfunction () + +# Apply settings to all targets +if (MULTI_ARCH AND MULTI_GPU_BUILD) + # Apply to common target + apply_common_dbcsr_settings(dbcsr_common) + # Apply to all GPU-specific targets + foreach (GPU_ARCH IN LISTS WITH_GPU_LIST) + apply_common_dbcsr_settings(dbcsr_${GPU_ARCH}) + endforeach () +else () + # Apply to single target + apply_common_dbcsr_settings(dbcsr) endif () # ================================================================================================= @@ -322,11 +438,27 @@ set(config_install_dir "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}") set(config_namespace "DBCSR::") # Install targets -install( - TARGETS dbcsr - EXPORT DBCSRTargets - LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}" - ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}") +if (MULTI_ARCH AND MULTI_GPU_BUILD) + # Install all GPU-specific libraries + foreach (GPU_ARCH IN LISTS WITH_GPU_LIST) + install( + TARGETS dbcsr_${GPU_ARCH} + EXPORT DBCSRTargets + LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}" + ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}") + endforeach () + message( + STATUS + "Multi-arch install: Installing libraries for all GPU architectures: ${WITH_GPU_LIST}" + ) +else () + # Single architecture install (existing logic) + install( + TARGETS dbcsr + EXPORT DBCSRTargets + LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}" + ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}") +endif () # See https://gitlab.kitware.com/cmake/cmake/-/issues/19608 # CMAKE_INSTALL_Fortran_MODULES may not be an "official" variable if (NOT CMAKE_INSTALL_Fortran_MODULES) diff --git a/src/acc/libsmm_acc/CMakeLists.txt b/src/acc/libsmm_acc/CMakeLists.txt index e18143d33f5..a06757d8ec0 100644 --- a/src/acc/libsmm_acc/CMakeLists.txt +++ b/src/acc/libsmm_acc/CMakeLists.txt @@ -7,29 +7,89 @@ set(SMM_ACC_KERNELS kernels/smm_acc_dnt_tiny.h kernels/smm_acc_transpose.h) -add_custom_target( - parameters ALL - COMMAND - ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/generate_parameters.py - --gpu_version=${WITH_GPU} --base_dir=${CMAKE_CURRENT_SOURCE_DIR}/parameters - DEPENDS generate_parameters.py parameters/parameters_${WITH_GPU_PARAMS}.json - BYPRODUCTS parameters.h - COMMENT "libsmm_acc: generating parameters for GPU ${WITH_GPU_PARAMS}") - -add_custom_target( - smm_acc_kernels ALL - COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/generate_kernels.py - ${CMAKE_CURRENT_SOURCE_DIR}/kernels - DEPENDS generate_kernels.py ${SMM_ACC_KERNELS} - BYPRODUCTS smm_acc_kernels.h - COMMENT "libsmm_acc: generating kernels") - -add_dependencies(dbcsr smm_acc_kernels parameters) -target_include_directories(dbcsr PRIVATE ${CMAKE_CURRENT_BINARY_DIR} - ${CMAKE_CURRENT_SOURCE_DIR}) - -# Note: this library is only used in some of the tests, it's just to get include -# paths to generated header files. -add_library(libsmm_acc INTERFACE) -target_include_directories(libsmm_acc INTERFACE ${CMAKE_CURRENT_BINARY_DIR} - ${CMAKE_CURRENT_SOURCE_DIR}) +if (MULTI_ARCH AND MULTI_GPU_BUILD) + # Multi-architecture build: create separate targets for each GPU + foreach (GPU_ARCH IN LISTS WITH_GPU_LIST) + # Set GPU_PARAMS for this architecture + set(GPU_PARAMS ${GPU_ARCH}) + + add_custom_target( + parameters_${GPU_ARCH} ALL + COMMAND + ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/generate_parameters.py + --gpu_version=${GPU_ARCH} + --base_dir=${CMAKE_CURRENT_SOURCE_DIR}/parameters + DEPENDS generate_parameters.py parameters/parameters_${GPU_PARAMS}.json + BYPRODUCTS parameters_${GPU_ARCH}.h + COMMENT "libsmm_acc: generating parameters for GPU ${GPU_ARCH}") + + add_custom_target( + smm_acc_kernels_${GPU_ARCH} ALL + COMMAND + ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/generate_kernels.py + ${CMAKE_CURRENT_SOURCE_DIR}/kernels + DEPENDS generate_kernels.py ${SMM_ACC_KERNELS} + BYPRODUCTS smm_acc_kernels_${GPU_ARCH}.h + COMMENT "libsmm_acc: generating kernels for GPU ${GPU_ARCH}") + + # Create the libsmm_acc target for this architecture + add_custom_target(libsmm_acc_${GPU_ARCH}) + add_dependencies(libsmm_acc_${GPU_ARCH} smm_acc_kernels_${GPU_ARCH} + parameters_${GPU_ARCH}) + + # Create interface library for this architecture + add_library(libsmm_acc_interface_${GPU_ARCH} INTERFACE) + target_include_directories( + libsmm_acc_interface_${GPU_ARCH} INTERFACE ${CMAKE_CURRENT_BINARY_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}) + + message( + STATUS "Created libsmm_acc targets for GPU architecture: ${GPU_ARCH}") + endforeach () + + # Create a convenience target that builds all GPU variants + add_custom_target(libsmm_acc_all_gpus) + foreach (GPU_ARCH IN LISTS WITH_GPU_LIST) + add_dependencies(libsmm_acc_all_gpus libsmm_acc_${GPU_ARCH}) + endforeach () + + # For backward compatibility, make dbcsr depend on all libsmm_acc targets + foreach (GPU_ARCH IN LISTS WITH_GPU_LIST) + if (TARGET dbcsr_${GPU_ARCH}) + add_dependencies(dbcsr_${GPU_ARCH} libsmm_acc_${GPU_ARCH}) + target_include_directories( + dbcsr_${GPU_ARCH} PRIVATE ${CMAKE_CURRENT_BINARY_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}) + endif () + endforeach () + +else () + # Single architecture build (existing logic) + add_custom_target( + parameters ALL + COMMAND + ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/generate_parameters.py + --gpu_version=${WITH_GPU} + --base_dir=${CMAKE_CURRENT_SOURCE_DIR}/parameters + DEPENDS generate_parameters.py parameters/parameters_${WITH_GPU_PARAMS}.json + BYPRODUCTS parameters.h + COMMENT "libsmm_acc: generating parameters for GPU ${WITH_GPU_PARAMS}") + + add_custom_target( + smm_acc_kernels ALL + COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/generate_kernels.py + ${CMAKE_CURRENT_SOURCE_DIR}/kernels + DEPENDS generate_kernels.py ${SMM_ACC_KERNELS} + BYPRODUCTS smm_acc_kernels.h + COMMENT "libsmm_acc: generating kernels") + + add_dependencies(dbcsr smm_acc_kernels parameters) + target_include_directories(dbcsr PRIVATE ${CMAKE_CURRENT_BINARY_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}) + + # Note: this library is only used in some of the tests, it's just to get + # include paths to generated header files. + add_library(libsmm_acc INTERFACE) + target_include_directories(libsmm_acc INTERFACE ${CMAKE_CURRENT_BINARY_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}) +endif ()