diff --git a/cudax/include/cuda/experimental/__memory_resource/device_memory_pool.cuh b/cudax/include/cuda/experimental/__memory_resource/device_memory_pool.cuh deleted file mode 100644 index 1eea0ea9596..00000000000 --- a/cudax/include/cuda/experimental/__memory_resource/device_memory_pool.cuh +++ /dev/null @@ -1,107 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of CUDA Experimental in CUDA C++ Core Libraries, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#ifndef _CUDAX__MEMORY_RESOURCE_DEVICE_MEMORY_POOL_CUH -#define _CUDAX__MEMORY_RESOURCE_DEVICE_MEMORY_POOL_CUH - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#if _CCCL_CUDA_COMPILER(CLANG) -# include -# include -#endif // _CCCL_CUDA_COMPILER(CLANG) - -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -//! @file -//! The \c device_memory_pool class provides a wrapper around a `cudaMempool_t`. -namespace cuda::experimental -{ - -class device_memory_resource; - -//! @brief \c device_memory_pool is an owning wrapper around a -//! cudaMemPool_t. -//! -//! It handles creation and destruction of the underlying pool utilizing the provided \c memory_pool_properties. -class device_memory_pool : public __memory_pool_base -{ - //! @brief Constructs a \c device_memory_pool from a handle taking ownership of the pool - //! @param __handle The handle to the existing pool - _CCCL_HOST_API explicit device_memory_pool(__memory_pool_base::__from_handle_t, ::cudaMemPool_t __handle) noexcept - : __memory_pool_base(__memory_pool_base::__from_handle_t{}, __handle) - {} - -public: - //! @brief Constructs a \c device_memory_pool with the optionally specified initial pool size and release threshold. - //! If the pool size grows beyond the release threshold, unused memory held by the pool will be released at the next - //! synchronization event. - //! @throws cuda_error if the CUDA version does not support ``cudaMallocAsync``. - //! @param __device_id The device id of the device the stream pool is constructed on. - //! @param __pool_properties Optional, additional properties of the pool to be created. - _CCCL_HOST_API explicit device_memory_pool(const ::cuda::device_ref __device_id, - memory_pool_properties __properties = {}) - : __memory_pool_base(__properties, - ::CUmemLocation{::CU_MEM_LOCATION_TYPE_DEVICE, __device_id.get()}, - ::CU_MEM_ALLOCATION_TYPE_PINNED) - {} - - //! @brief Disables construction from a plain `cudaMemPool_t`. We want to ensure clean ownership semantics. - device_memory_pool(::cudaMemPool_t) = delete; - - device_memory_pool(device_memory_pool const&) = delete; - device_memory_pool(device_memory_pool&&) = delete; - device_memory_pool& operator=(device_memory_pool const&) = delete; - device_memory_pool& operator=(device_memory_pool&&) = delete; - - //! @brief Construct an `device_memory_pool` object from a native `cudaMemPool_t` handle. - //! - //! @param __handle The native handle - //! - //! 
@return The constructed `device_memory_pool` object
-  //!
-  //! @note The constructed `device_memory_pool` object takes ownership of the native handle.
-  [[nodiscard]] static device_memory_pool from_native_handle(::cudaMemPool_t __handle) noexcept
-  {
-    return device_memory_pool(__memory_pool_base::__from_handle_t{}, __handle);
-  }
-
-  // Disallow construction from an `int`, e.g., `0`.
-  static device_memory_pool from_native_handle(int) = delete;
-
-  // Disallow construction from `nullptr`.
-  static device_memory_pool from_native_handle(::cuda::std::nullptr_t) = delete;
-
-  using resource_type = device_memory_resource;
-};
-
-} // namespace cuda::experimental
-
-#include
-
-#endif // _CUDAX__MEMORY_RESOURCE_DEVICE_MEMORY_POOL_CUH
diff --git a/cudax/include/cuda/experimental/__memory_resource/device_memory_resource.cuh b/cudax/include/cuda/experimental/__memory_resource/device_memory_resource.cuh
index b59c1c7f605..d7a27470f1c 100644
--- a/cudax/include/cuda/experimental/__memory_resource/device_memory_resource.cuh
+++ b/cudax/include/cuda/experimental/__memory_resource/device_memory_resource.cuh
@@ -31,7 +31,6 @@
 #include
 #include
-#include
 #include
 #include
@@ -48,9 +47,10 @@ namespace cuda::experimental
 //! Stream ordered memory resource
 //! ------------------------------
 //!
-//! ``device_memory_resource`` uses `cudaMallocFromPoolAsync / cudaFreeAsync
+//! ``device_memory_resource`` allocates device memory using `cudaMallocFromPoolAsync / cudaFreeAsync
 //! `__ for allocation/deallocation. A
-//! ``device_memory_resource`` is a thin wrapper around a \c cudaMemPool_t.
+//! ``device_memory_resource`` is a thin wrapper around a \c cudaMemPool_t with the location type set to \c
+//! cudaMemLocationTypeDevice.
 //!
 //! .. warning::
 //!
@@ -90,19 +90,65 @@ public:
     : __memory_resource_base(__pool)
   {}
 
-  //! @brief Constructs the device_memory_resource from a \c device_memory_pool by calling get().
-  //! @param __pool The \c device_memory_pool used to allocate memory.
-  _CCCL_HOST_API explicit device_memory_resource(device_memory_pool& __pool) noexcept
-    : __memory_resource_base(__pool.get())
-  {}
-
   //! @brief Enables the \c device_accessible property for \c device_memory_resource.
   //! @relates device_memory_resource
   _CCCL_HOST_API friend constexpr void get_property(device_memory_resource const&, device_accessible) noexcept {}
 
   using default_queries = properties_list;
 };
+
+//! @rst
+//! .. _cudax-device-memory-pool:
+//!
+//! Stream ordered memory pool
+//! --------------------------
+//!
+//! ``device_memory_pool`` allocates device memory using `cudaMallocFromPoolAsync / cudaFreeAsync
+//! `__ for allocation/deallocation.
+//! When constructed it creates an underlying \c cudaMemPool_t with the location type set to \c
+//! cudaMemLocationTypeDevice and owns it.
+//!
+//! @endrst
+struct device_memory_pool : device_memory_resource
+{
+  using reference_type = device_memory_resource;
+
+  //! @brief Constructs a \c device_memory_pool with the optionally specified initial pool size and release
+  //! threshold. If the pool size grows beyond the release threshold, unused memory held by the pool will be released
+  //! at the next synchronization event.
+  //! @throws cuda_error if the CUDA version does not support ``cudaMallocAsync``.
+  //! @param __device_id The device id of the device the memory pool is constructed on.
+  //! @param __properties Optional, additional properties of the pool to be created.
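+  //!
+  //! A minimal usage sketch (illustrative only; the device index and sizes are placeholders):
+  //! @code
+  //! cuda::experimental::device_memory_pool pool{cuda::devices[0]};
+  //! cuda::experimental::stream stream{cuda::devices[0]};
+  //! void* ptr = pool.allocate(stream, 256);
+  //! pool.deallocate(stream, ptr, 256);
+  //! @endcode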
+ _CCCL_HOST_API device_memory_pool(::cuda::device_ref __device_id, memory_pool_properties __properties = {}) + : device_memory_resource(__create_cuda_mempool( + __properties, + ::CUmemLocation{::CU_MEM_LOCATION_TYPE_DEVICE, __device_id.get()}, + ::CU_MEM_ALLOCATION_TYPE_PINNED)) + {} + + ~device_memory_pool() noexcept + { + ::cuda::__driver::__mempoolDestroy(__pool_); + } + + _CCCL_HOST_API static device_memory_pool from_native_handle(::cudaMemPool_t __pool) noexcept + { + return device_memory_pool(__pool); + } + + device_memory_pool(const device_memory_pool&) = delete; + device_memory_pool& operator=(const device_memory_pool&) = delete; + +private: + device_memory_pool(::cudaMemPool_t __pool) noexcept + : device_memory_resource(__pool) + {} +}; + static_assert(::cuda::mr::synchronous_resource_with, ""); + +static_assert(::cuda::mr::resource_with, ""); + } // namespace cuda::experimental #include diff --git a/cudax/include/cuda/experimental/__memory_resource/legacy_pinned_memory_resource.cuh b/cudax/include/cuda/experimental/__memory_resource/legacy_pinned_memory_resource.cuh index d6e30e33af7..786b1ddc8ee 100644 --- a/cudax/include/cuda/experimental/__memory_resource/legacy_pinned_memory_resource.cuh +++ b/cudax/include/cuda/experimental/__memory_resource/legacy_pinned_memory_resource.cuh @@ -32,7 +32,6 @@ #include #include -#include #include #include diff --git a/cudax/include/cuda/experimental/__memory_resource/managed_memory_pool.cuh b/cudax/include/cuda/experimental/__memory_resource/managed_memory_pool.cuh deleted file mode 100644 index 3aeebecdce3..00000000000 --- a/cudax/include/cuda/experimental/__memory_resource/managed_memory_pool.cuh +++ /dev/null @@ -1,109 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of CUDA Experimental in CUDA C++ Core Libraries, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#ifndef _CUDAX__MEMORY_RESOURCE_MANAGED_MEMORY_POOL_CUH -#define _CUDAX__MEMORY_RESOURCE_MANAGED_MEMORY_POOL_CUH - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#if _CCCL_CTK_AT_LEAST(13, 0) - -# if _CCCL_CUDA_COMPILER(CLANG) -# include -# include -# endif // _CCCL_CUDA_COMPILER(CLANG) - -# include -# include - -# include - -//! @file -//! The \c managed_memory_pool class provides a wrapper around a `cudaMempool_t`. -namespace cuda::experimental -{ - -class managed_memory_resource; - -[[nodiscard]] static ::cudaMemPool_t __get_default_managed_pool() -{ - return ::cuda::__driver::__getDefaultMemPool( - ::CUmemLocation{::CU_MEM_LOCATION_TYPE_NONE, 0}, ::CU_MEM_ALLOCATION_TYPE_MANAGED); -} - -//! @brief \c managed_memory_pool is an owning wrapper around a -//! cudaMemPool_t. -//! -//! It handles creation and destruction of the underlying pool utilizing the provided \c memory_pool_properties. -class managed_memory_pool : public __memory_pool_base -{ -private: - //! @brief Constructs a \c managed_memory_pool from a handle taking ownership of the pool - //! 
@param __handle The handle to the existing pool - _CCCL_HOST_API explicit managed_memory_pool(__memory_pool_base::__from_handle_t, ::cudaMemPool_t __handle) noexcept - : __memory_pool_base(__memory_pool_base::__from_handle_t{}, __handle) - {} - -public: - //! @brief Constructs a \c managed_memory_pool with optional properties. - //! Properties include the initial pool size and the release threshold. If the pool size grows beyond the release - //! threshold, unused memory held by the pool will be released at the next synchronization event. - //! @param __properties Optional, additional properties of the pool to be created. - _CCCL_HOST_API explicit managed_memory_pool(memory_pool_properties __properties = {}) - : __memory_pool_base( - __properties, ::CUmemLocation{::CU_MEM_LOCATION_TYPE_NONE, 0}, ::CU_MEM_ALLOCATION_TYPE_MANAGED) - {} - - // TODO add a constructor that accepts memory location one a type for it is added - - //! @brief Disables construction from a plain `cudaMemPool_t`. We want to ensure clean ownership semantics. - managed_memory_pool(::cudaMemPool_t) = delete; - - managed_memory_pool(managed_memory_pool const&) = delete; - managed_memory_pool(managed_memory_pool&&) = delete; - managed_memory_pool& operator=(managed_memory_pool const&) = delete; - managed_memory_pool& operator=(managed_memory_pool&&) = delete; - - //! @brief Construct an `pinned_memory_pool` object from a native `cudaMemPool_t` handle. - //! - //! @param __handle The native handle - //! - //! @return The constructed `pinned_memory_pool` object - //! - //! @note The constructed `pinned_memory_pool` object takes ownership of the native handle. - [[nodiscard]] static managed_memory_pool from_native_handle(::cudaMemPool_t __handle) noexcept - { - return managed_memory_pool(__memory_pool_base::__from_handle_t{}, __handle); - } - - // Disallow construction from an `int`, e.g., `0`. - static managed_memory_pool from_native_handle(int) = delete; - - // Disallow construction from `nullptr`. - static managed_memory_pool from_native_handle(::cuda::std::nullptr_t) = delete; - - using resource_type = managed_memory_resource; -}; - -} // namespace cuda::experimental - -# include - -#endif // _CCCL_CTK_AT_LEAST(13, 0) - -#endif // _CUDAX__MEMORY_RESOURCE_MANAGED_MEMORY_POOL_CUH diff --git a/cudax/include/cuda/experimental/__memory_resource/managed_memory_resource.cuh b/cudax/include/cuda/experimental/__memory_resource/managed_memory_resource.cuh index af1a32ac6a0..d6196d2a756 100644 --- a/cudax/include/cuda/experimental/__memory_resource/managed_memory_resource.cuh +++ b/cudax/include/cuda/experimental/__memory_resource/managed_memory_resource.cuh @@ -28,7 +28,6 @@ # include # include -# include # include # include @@ -38,15 +37,22 @@ namespace cuda::experimental { +[[nodiscard]] static ::cudaMemPool_t __get_default_managed_pool() +{ + return ::cuda::__driver::__getDefaultMemPool( + ::CUmemLocation{::CU_MEM_LOCATION_TYPE_NONE, 0}, ::CU_MEM_ALLOCATION_TYPE_MANAGED); +} + //! @rst //! .. _cudax-memory-resource-async: //! //! Stream ordered memory resource //! ------------------------------ //! -//! ``managed_memory_resource`` uses `cudaMallocFromPoolAsync / cudaFreeAsync +//! ``managed_memory_resource`` allocates managed memory using `cudaMallocFromPoolAsync / cudaFreeAsync //! `__ for allocation/deallocation. A -//! ``managed_memory_resource`` is a thin wrapper around a \c cudaMemPool_t. +//! ``managed_memory_resource`` is a thin wrapper around a \c cudaMemPool_t with the allocation type set to \c +//! 
cudaMemAllocationTypeManaged.
 //!
 //! .. warning::
 //!
@@ -69,12 +75,6 @@ public:
     : __memory_resource_base(__pool)
   {}
 
-  //! @brief Constructs the managed_memory_resource from a \c managed_memory_pool by calling get().
-  //! @param __pool The \c managed_memory_pool used to allocate memory.
-  _CCCL_HOST_API explicit managed_memory_resource(managed_memory_pool& __pool) noexcept
-    : __memory_resource_base(__pool.get())
-  {}
-
   //! @brief Enables the \c device_accessible property
   _CCCL_HOST_API friend constexpr void get_property(managed_memory_resource const&, device_accessible) noexcept {}
   //! @brief Enables the \c host_accessible property
@@ -83,9 +83,58 @@ public:
 
   using default_queries = properties_list;
 };
+//! @rst
+//! .. _cudax-managed-memory-pool:
+//!
+//! Stream ordered memory pool
+//! --------------------------
+//!
+//! ``managed_memory_pool`` allocates managed memory using `cudaMallocFromPoolAsync / cudaFreeAsync
+//! `__ for allocation/deallocation.
+//! When constructed it creates an underlying \c cudaMemPool_t with the allocation type set to \c
+//! cudaMemAllocationTypeManaged and owns it.
+//!
+//! @endrst
+struct managed_memory_pool : managed_memory_resource
+{
+  using reference_type = managed_memory_resource;
+
+  //! @brief Constructs a \c managed_memory_pool with optional properties.
+  //! Properties include the initial pool size and the release threshold. If the pool size grows beyond the release
+  //! threshold, unused memory held by the pool will be released at the next synchronization event.
+  //! @param __properties Optional, additional properties of the pool to be created.
+  _CCCL_HOST_API managed_memory_pool(memory_pool_properties __properties = {})
+    : managed_memory_resource(__create_cuda_mempool(
+        __properties, ::CUmemLocation{::CU_MEM_LOCATION_TYPE_NONE, 0}, ::CU_MEM_ALLOCATION_TYPE_MANAGED))
+  {}
+
+  // TODO add a constructor that accepts a memory location once a type for it is added
+
+  ~managed_memory_pool() noexcept
+  {
+    ::cuda::__driver::__mempoolDestroy(__pool_);
+  }
+
+  _CCCL_HOST_API static managed_memory_pool from_native_handle(::cudaMemPool_t __pool) noexcept
+  {
+    return managed_memory_pool(__pool);
+  }
+
+  managed_memory_pool(const managed_memory_pool&)            = delete;
+  managed_memory_pool& operator=(const managed_memory_pool&) = delete;
+
+private:
+  managed_memory_pool(::cudaMemPool_t __pool) noexcept
+    : managed_memory_resource(__pool)
+  {}
+};
+
 static_assert(::cuda::mr::resource_with, "");
 static_assert(::cuda::mr::resource_with, "");
+static_assert(::cuda::mr::resource_with, "");
+static_assert(::cuda::mr::resource_with, "");
+
 } // namespace cuda::experimental
 
 # include
diff --git a/cudax/include/cuda/experimental/__memory_resource/memory_pool_base.cuh b/cudax/include/cuda/experimental/__memory_resource/memory_pool_base.cuh
deleted file mode 100644
index 82614ad337e..00000000000
--- a/cudax/include/cuda/experimental/__memory_resource/memory_pool_base.cuh
+++ /dev/null
@@ -1,496 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of CUDA Experimental in CUDA C++ Core Libraries,
-// under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
-// -//===----------------------------------------------------------------------===// - -#ifndef _CUDAX__MEMORY_RESOURCE_MEMORY_POOL_BASE_CUH -#define _CUDAX__MEMORY_RESOURCE_MEMORY_POOL_BASE_CUH - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#include - -#if _CCCL_CUDA_COMPILER(CLANG) -# include -# include -#endif // _CCCL_CUDA_COMPILER(CLANG) - -#include -#include -#include -#include - -#include - -#include - -//! @file -//! The \c __memory_pool_base class provides a wrapper around a `cudaMempool_t`. -namespace cuda::experimental -{ - -namespace __detail -{ - -enum class __pool_attr_settable : bool -{ -}; - -template <::cudaMemPoolAttr _Attr, typename _Type, __pool_attr_settable _Settable> -struct __pool_attr_impl -{ - using type = _Type; - - [[nodiscard]] _CCCL_HOST_API constexpr operator ::cudaMemPoolAttr() const noexcept - { - return _Attr; - } - - [[nodiscard]] _CCCL_HOST_API type operator()(::cudaMemPool_t __pool) const - { - size_t __value = ::cuda::__driver::__mempoolGetAttribute(__pool, static_cast<::CUmemPool_attribute>(_Attr)); - return static_cast(__value); - } - - static void set(::cudaMemPool_t __pool, type __value) - { - size_t __value_copy = __value; - if constexpr (_Settable == __pool_attr_settable{true}) - { - ::cuda::__driver::__mempoolSetAttribute(__pool, static_cast<::CUmemPool_attribute>(_Attr), &__value_copy); - } - else - { - ::cuda::std::__throw_invalid_argument("This attribute can't be set"); - } - } -}; - -template <::cudaMemPoolAttr _Attr> -struct __pool_attr : __pool_attr_impl<_Attr, size_t, __pool_attr_settable{true}> -{}; - -template <> -struct __pool_attr<::cudaMemPoolReuseFollowEventDependencies> - : __pool_attr_impl<::cudaMemPoolReuseFollowEventDependencies, bool, __pool_attr_settable{true}> -{}; - -template <> -struct __pool_attr<::cudaMemPoolReuseAllowOpportunistic> - : __pool_attr_impl<::cudaMemPoolReuseAllowOpportunistic, bool, __pool_attr_settable{true}> -{}; - -template <> -struct __pool_attr<::cudaMemPoolReuseAllowInternalDependencies> - : __pool_attr_impl<::cudaMemPoolReuseAllowInternalDependencies, bool, __pool_attr_settable{true}> -{}; - -template <> -struct __pool_attr<::cudaMemPoolAttrReservedMemCurrent> - : __pool_attr_impl<::cudaMemPoolAttrReservedMemCurrent, size_t, __pool_attr_settable{false}> -{}; - -template <> -struct __pool_attr<::cudaMemPoolAttrUsedMemCurrent> - : __pool_attr_impl<::cudaMemPoolAttrUsedMemCurrent, size_t, __pool_attr_settable{false}> -{}; - -inline void __set_attribute_non_zero_only(::cudaMemPool_t __pool, ::CUmemPool_attribute __attr, size_t __value) -{ - if (__value != 0) - { - ::cuda::std::__throw_invalid_argument("This attribute can't be set to a non-zero value."); - } - ::cuda::__driver::__mempoolSetAttribute(__pool, __attr, &__value); -} - -template <> -struct __pool_attr<::cudaMemPoolAttrReservedMemHigh> - : __pool_attr_impl<::cudaMemPoolAttrReservedMemHigh, size_t, __pool_attr_settable{true}> -{ - static void set(::cudaMemPool_t __pool, type __value) - { - ::cuda::experimental::__detail::__set_attribute_non_zero_only(__pool, ::CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH, __value); - } -}; - -template <> -struct __pool_attr<::cudaMemPoolAttrUsedMemHigh> - : __pool_attr_impl<::cudaMemPoolAttrUsedMemHigh, size_t, __pool_attr_settable{true}> -{ - static void set(::cudaMemPool_t __pool, type __value) - 
{ - ::cuda::experimental::__detail::__set_attribute_non_zero_only(__pool, ::CU_MEMPOOL_ATTR_USED_MEM_HIGH, __value); - } -}; - -} // namespace __detail - -namespace memory_pool_attributes -{ -// The threshold at which the pool will release memory. -using release_threshold_t = __detail::__pool_attr<::cudaMemPoolAttrReleaseThreshold>; -static constexpr release_threshold_t release_threshold{}; - -// Allow the pool to reuse the memory across streams as long as there is a stream ordering dependency between the -// streams. -using reuse_follow_event_dependencies_t = __detail::__pool_attr<::cudaMemPoolReuseFollowEventDependencies>; -static constexpr reuse_follow_event_dependencies_t reuse_follow_event_dependencies{}; - -// Allow the pool to reuse already completed frees when there is no dependency between the streams. -using reuse_allow_opportunistic_t = __detail::__pool_attr<::cudaMemPoolReuseAllowOpportunistic>; -static constexpr reuse_allow_opportunistic_t reuse_allow_opportunistic{}; - -// Allow the pool to insert stream dependencies to reuse the memory across streams. -using reuse_allow_internal_dependencies_t = __detail::__pool_attr<::cudaMemPoolReuseAllowInternalDependencies>; -static constexpr reuse_allow_internal_dependencies_t reuse_allow_internal_dependencies{}; - -// The current amount of memory reserved in the pool. -using reserved_mem_current_t = __detail::__pool_attr<::cudaMemPoolAttrReservedMemCurrent>; -static constexpr reserved_mem_current_t reserved_mem_current{}; - -// The high water mark for the reserved memory in the pool. -using reserved_mem_high_t = __detail::__pool_attr<::cudaMemPoolAttrReservedMemHigh>; -static constexpr reserved_mem_high_t reserved_mem_high{}; - -// The current amount of memory used in the pool. -using used_mem_current_t = __detail::__pool_attr<::cudaMemPoolAttrUsedMemCurrent>; -static constexpr used_mem_current_t used_mem_current{}; - -// The high water mark for the used memory in the pool. -using used_mem_high_t = __detail::__pool_attr<::cudaMemPoolAttrUsedMemHigh>; -static constexpr used_mem_high_t used_mem_high{}; -}; // namespace memory_pool_attributes - -//! @brief Checks whether the current device supports \c cudaMallocAsync. -//! @param __device The id of the device for which to query support. -//! @throws cuda_error if \c cudaDeviceGetAttribute failed. -//! @returns true if \c cudaDevAttrMemoryPoolsSupported is not zero. -inline void __verify_device_supports_stream_ordered_allocations(device_ref __device) -{ - if (!__device.attribute(::cuda::device_attributes::memory_pools_supported)) - { - ::cuda::__throw_cuda_error(::cudaErrorNotSupported, "cudaMallocAsync is not supported on the given device"); - } -} - -//! @brief Check whether the specified `cudaMemAllocationHandleType` is supported on the present -//! CUDA driver/runtime version. -//! @param __device The id of the device to check for support. -//! @param __handle_type An IPC export handle type to check for support. -//! @throws cuda_error if the specified `cudaMemAllocationHandleType` is not supported on the specified device. 
-inline void __verify_device_supports_export_handle_type( - device_ref __device, ::cudaMemAllocationHandleType __handle_type, ::CUmemLocation __location) -{ - if (__handle_type == ::cudaMemAllocationHandleType::cudaMemHandleTypeNone) - { - return; - } - if (__location.type != ::CU_MEM_LOCATION_TYPE_DEVICE -#if _CCCL_CTK_AT_LEAST(12, 6) - && __location.type != ::CU_MEM_LOCATION_TYPE_HOST_NUMA -#endif - ) - { - ::cuda::__throw_cuda_error( - ::cudaErrorNotSupported, "Requested IPC memory handle type not supported for the given location"); - } - auto __supported_handles = __device.attribute(::cuda::device_attributes::memory_pool_supported_handle_types); - if ((static_cast(__handle_type) & __supported_handles) != static_cast(__handle_type)) - { - ::cuda::__throw_cuda_error( - ::cudaErrorNotSupported, "Requested IPC memory handle type not supported on a given device"); - } -} - -//! @brief Enable access to this memory pool from the supplied devices -//! -//! Device on which this pool resides can be included in the span. -//! -//! @param __pool The memory pool to set access for -//! @param __devices A span of `device_ref`s listing devices to enable access for -//! @param __flags The access flags to set -//! @throws cuda_error if ``cudaMemPoolSetAccess`` fails. -inline void -__mempool_set_access(::CUmemoryPool __pool, ::cuda::std::span __devices, ::CUmemAccess_flags __flags) -{ - ::std::vector<::CUmemAccessDesc> __descs; - __descs.reserve(__devices.size()); - for (size_t __i = 0; __i < __devices.size(); ++__i) - { - __descs.push_back({::CUmemLocation{::CU_MEM_LOCATION_TYPE_DEVICE, __devices[__i].get()}, __flags}); - } - ::cuda::__driver::__mempoolSetAccess(__pool, __descs.data(), __descs.size()); -} - -//! @brief Query if memory from a pool is accessible by the supplied device -//! -//! @param __pool The memory pool to query access for -//! @param __dev The device to query access for -//! @returns true if the memory pool is accessible from the device -[[nodiscard]] inline bool __mempool_get_access(::cudaMemPool_t __pool, device_ref __dev) -{ - ::CUmemAccess_flags __result; - ::CUmemLocation __loc; - __loc.type = ::CU_MEM_LOCATION_TYPE_DEVICE; - __loc.id = __dev.get(); - __result = ::cuda::__driver::__mempoolGetAccess(__pool, &__loc); - return __result == ::CU_MEM_ACCESS_FLAGS_PROT_READWRITE; -} - -//! @brief \c memory_pool_properties is a wrapper around properties passed to \c __memory_pool_base to create a -//! cudaMemPool_t. -struct memory_pool_properties -{ - size_t initial_pool_size = 0; - size_t release_threshold = ::cuda::std::numeric_limits::max(); - cudaMemAllocationHandleType allocation_handle_type = cudaMemAllocationHandleType::cudaMemHandleTypeNone; -}; - -class __memory_pool_base -{ -private: - ::cudaMemPool_t __pool_handle_ = nullptr; - - //! @brief Creates the CUDA memory pool from the passed in arguments. - //! @throws cuda_error If the creation of the CUDA memory pool failed. - //! @returns The created CUDA memory pool. 
- [[nodiscard]] static cudaMemPool_t __create_cuda_mempool( - memory_pool_properties __properties, ::CUmemLocation __location, CUmemAllocationType __allocation_type) noexcept - { - ::CUmemPoolProps __pool_properties{}; - __pool_properties.allocType = __allocation_type; - __pool_properties.handleTypes = ::CUmemAllocationHandleType(__properties.allocation_handle_type); - __pool_properties.location = __location; - - if (__properties.initial_pool_size > __properties.release_threshold) - { - ::cuda::std::__throw_invalid_argument("Initial pool size must be less than the release threshold"); - } - - ::CUmemoryPool __cuda_pool_handle{}; - ::cudaError_t __error = ::cuda::__driver::__mempoolCreateNoThrow(&__cuda_pool_handle, &__pool_properties); - if (__error != ::cudaSuccess) - { - // Mempool creation failed, lets try to figure out why - ::cuda::experimental::__verify_device_supports_stream_ordered_allocations(__location.id); - ::cuda::experimental::__verify_device_supports_export_handle_type( - __location.id, __properties.allocation_handle_type, __location); - - // Could not find the reason, throw a generic error - ::cuda::__throw_cuda_error(__error, "Failed to create a memory pool"); - } - - ::cuda::__driver::__mempoolSetAttribute( - __cuda_pool_handle, ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, &__properties.release_threshold); - - // allocate the requested initial size to prime the pool. - // We need to use a new stream so we do not wait on other work - if (__properties.initial_pool_size != 0) - { - ::CUdeviceptr __ptr = ::cuda::__driver::__mallocFromPoolAsync( - __properties.initial_pool_size, __cuda_pool_handle, __cccl_allocation_stream().get()); - if (::cuda::__driver::__freeAsyncNoThrow(__ptr, __cccl_allocation_stream().get()) != ::cudaSuccess) - { - ::cuda::__throw_cuda_error(::cudaErrorMemoryAllocation, "Failed to allocate initial pool size"); - } - } - return __cuda_pool_handle; - } - -protected: - struct __from_handle_t - {}; - - //! @brief Constructs a \c __memory_pool_base from a handle taking ownership of the pool - //! @param __handle The handle to the existing pool - explicit __memory_pool_base(__from_handle_t, ::cudaMemPool_t __handle) noexcept - : __pool_handle_(__handle) - {} - -public: - //! @brief Constructs a \c __memory_pool_base with the optionally specified initial pool size and release threshold. - //! If the pool size grows beyond the release threshold, unused memory held by the pool will be released at the next - //! synchronization event. - //! @throws cuda_error if the CUDA version does not support ``cudaMallocAsync``. - //! @param __id The device id of the device the stream pool is constructed on. - //! @param __pool_properties Optional, additional properties of the pool to be created. - explicit __memory_pool_base( - memory_pool_properties __properties, ::CUmemLocation __location, CUmemAllocationType __allocation_type) - : __pool_handle_(__create_cuda_mempool(__properties, __location, __allocation_type)) - {} - - __memory_pool_base(__memory_pool_base const&) = delete; - __memory_pool_base(__memory_pool_base&&) = delete; - __memory_pool_base& operator=(__memory_pool_base const&) = delete; - __memory_pool_base& operator=(__memory_pool_base&&) = delete; - - ~__memory_pool_base() noexcept - { - ::cuda::__driver::__mempoolDestroy(__pool_handle_); - } - - //! @brief Tries to release memory. - //! @param __min_bytes_to_keep the minimal guaranteed size of the pool. - //! @note If the pool has less than \p __minBytesToKeep reserved, the trim_to operation is a no-op. 
Otherwise the - //! pool will be guaranteed to have at least \p __minBytesToKeep bytes reserved after the operation. - _CCCL_HOST_API void trim_to(const size_t __min_bytes_to_keep) - { - ::cuda::__driver::__mempoolTrimTo(__pool_handle_, __min_bytes_to_keep); - } - - //! @brief Gets the value of an attribute of the pool. - //! @param __attr the attribute to get. - //! @return The value of the attribute. - template - [[nodiscard]] _CCCL_HOST_API auto attribute(_Attr __attr) const - { - return __attr(__pool_handle_); - } - - //! @brief Gets the value of an attribute of the pool. - //! @param __attribute the attribute to get. - //! @return The value of the attribute. - template <::cudaMemPoolAttr _Attr> - _CCCL_HOST_API auto attribute() const - { - return attribute(__detail::__pool_attr<_Attr>()); - } - - //! @brief Sets an attribute of the pool to a given value. - //! @param __attribute the attribute to be set. - //! @param __value the new value of that attribute. - template - _CCCL_HOST_API void set_attribute(_Attr __attr, typename _Attr::type __value) - { - __attr.set(__pool_handle_, __value); - } - - //! @brief Sets an attribute of the pool to a given value. - //! @param __attribute the attribute to be set. - //! @param __value the new value of that attribute. - template <::cudaMemPoolAttr _Attr> - _CCCL_HOST_API void set_attribute(typename __detail::__pool_attr<_Attr>::type __value) - { - return set_attribute(__detail::__pool_attr<_Attr>(), __value); - } - - //! @brief Enable access to this memory pool from the supplied devices - //! - //! Device on which this pool resides can be included in the span. - //! - //! @param __devices A span of `device_ref`s listing devices to enable access for - _CCCL_HOST_API void enable_access_from(::cuda::std::span __devices) - { - ::cuda::experimental::__mempool_set_access( - __pool_handle_, {__devices.data(), __devices.size()}, ::CU_MEM_ACCESS_FLAGS_PROT_READWRITE); - } - - //! @brief Enable access to this memory pool from the supplied device - //! - //! @param __device device_ref indicating for which device the access should be enabled - _CCCL_HOST_API void enable_access_from(device_ref __device) - { - ::cuda::experimental::__mempool_set_access(__pool_handle_, {&__device, 1}, ::CU_MEM_ACCESS_FLAGS_PROT_READWRITE); - } - - //! @brief Disable access to this memory pool from the supplied devices - //! - //! Device on which this pool resides can be included in the span. - //! - //! @param __devices A span of `device_ref`s listing devices to disable access for - _CCCL_HOST_API void disable_access_from(::cuda::std::span __devices) - { - ::cuda::experimental::__mempool_set_access( - __pool_handle_, {__devices.data(), __devices.size()}, ::CU_MEM_ACCESS_FLAGS_PROT_NONE); - } - - //! @brief Disable access to this memory pool from the supplied device - //! - //! @param __device device_ref indicating for which device the access should be disable - _CCCL_HOST_API void disable_access_from(device_ref __device) - { - ::cuda::experimental::__mempool_set_access(__pool_handle_, {&__device, 1}, ::CU_MEM_ACCESS_FLAGS_PROT_NONE); - } - - //! @brief Query if memory allocated through this memory resource is accessible by the supplied device - //! - //! @param __device device for which the access is queried - [[nodiscard]] _CCCL_HOST_API bool is_accessible_from(device_ref __device) - { - return ::cuda::experimental::__mempool_get_access(__pool_handle_, __device); - } - - //! @brief Equality comparison with another \c __memory_pool_base. - //! 
@returns true if the stored ``cudaMemPool_t`` are equal. - [[nodiscard]] _CCCL_HOST_API constexpr bool operator==(__memory_pool_base const& __rhs) const noexcept - { - return __pool_handle_ == __rhs.__pool_handle_; - } - -#if _CCCL_STD_VER <= 2017 - //! @brief Inequality comparison with another \c __memory_pool_base. - //! @returns true if the stored ``cudaMemPool_t`` are not equal. - [[nodiscard]] _CCCL_HOST_API constexpr bool operator!=(__memory_pool_base const& __rhs) const noexcept - { - return __pool_handle_ != __rhs.__pool_handle_; - } -#endif // _CCCL_STD_VER <= 2017 - - //! @brief Equality comparison with a \c cudaMemPool_t. - //! @param __rhs A \c cudaMemPool_t. - //! @returns true if the stored ``cudaMemPool_t`` is equal to \p __rhs. - [[nodiscard]] _CCCL_HOST_API friend constexpr bool - operator==(__memory_pool_base const& __lhs, ::cudaMemPool_t __rhs) noexcept - { - return __lhs.__pool_handle_ == __rhs; - } - -#if _CCCL_STD_VER <= 2017 - //! @copydoc __memory_pool_base::operator==(__memory_pool_base const&, ::cudaMemPool_t) - [[nodiscard]] _CCCL_HOST_API friend constexpr bool - operator==(::cudaMemPool_t __lhs, __memory_pool_base const& __rhs) noexcept - { - return __rhs.__pool_handle_ == __lhs; - } - - //! @copydoc __memory_pool_base::operator==(__memory_pool_base const&, ::cudaMemPool_t) - [[nodiscard]] _CCCL_HOST_API friend constexpr bool - operator!=(__memory_pool_base const& __lhs, ::cudaMemPool_t __rhs) noexcept - { - return __lhs.__pool_handle_ != __rhs; - } - - //! @copydoc __memory_pool_base::operator==(__memory_pool_base const&, ::cudaMemPool_t) - [[nodiscard]] _CCCL_HOST_API friend constexpr bool - operator!=(::cudaMemPool_t __lhs, __memory_pool_base const& __rhs) noexcept - { - return __rhs.__pool_handle_ != __lhs; - } -#endif // _CCCL_STD_VER <= 2017 - - //! @brief Returns the underlying handle to the CUDA memory pool. 
- [[nodiscard]] _CCCL_HOST_API constexpr cudaMemPool_t get() const noexcept - { - return __pool_handle_; - } -}; - -} // namespace cuda::experimental - -#include - -#endif // _CUDAX__MEMORY_RESOURCE_MEMORY_POOL_BASE_CUH diff --git a/cudax/include/cuda/experimental/__memory_resource/memory_resource_base.cuh b/cudax/include/cuda/experimental/__memory_resource/memory_resource_base.cuh index 987ca1f8558..dc7167de01c 100644 --- a/cudax/include/cuda/experimental/__memory_resource/memory_resource_base.cuh +++ b/cudax/include/cuda/experimental/__memory_resource/memory_resource_base.cuh @@ -26,6 +26,7 @@ # include #endif // _CCCL_CUDA_COMPILER(CLANG) +#include #include #include #include @@ -33,7 +34,6 @@ #include #include -#include #include #include #include @@ -43,6 +43,266 @@ namespace cuda::experimental { +enum class __pool_attr_settable : bool +{ +}; + +template <::cudaMemPoolAttr _Attr, typename _Type, __pool_attr_settable _Settable> +struct __pool_attr_impl +{ + using type = _Type; + + [[nodiscard]] _CCCL_HOST_API constexpr operator ::cudaMemPoolAttr() const noexcept + { + return _Attr; + } + + [[nodiscard]] _CCCL_HOST_API type operator()(::cudaMemPool_t __pool) const + { + size_t __value = ::cuda::__driver::__mempoolGetAttribute(__pool, static_cast<::CUmemPool_attribute>(_Attr)); + return static_cast(__value); + } + + static void set(::cudaMemPool_t __pool, type __value) + { + size_t __value_copy = __value; + if constexpr (_Settable == __pool_attr_settable{true}) + { + ::cuda::__driver::__mempoolSetAttribute(__pool, static_cast<::CUmemPool_attribute>(_Attr), &__value_copy); + } + else + { + ::cuda::std::__throw_invalid_argument("This attribute can't be set"); + } + } +}; + +template <::cudaMemPoolAttr _Attr> +struct __pool_attr : __pool_attr_impl<_Attr, size_t, __pool_attr_settable{true}> +{}; + +template <> +struct __pool_attr<::cudaMemPoolReuseFollowEventDependencies> + : __pool_attr_impl<::cudaMemPoolReuseFollowEventDependencies, bool, __pool_attr_settable{true}> +{}; + +template <> +struct __pool_attr<::cudaMemPoolReuseAllowOpportunistic> + : __pool_attr_impl<::cudaMemPoolReuseAllowOpportunistic, bool, __pool_attr_settable{true}> +{}; + +template <> +struct __pool_attr<::cudaMemPoolReuseAllowInternalDependencies> + : __pool_attr_impl<::cudaMemPoolReuseAllowInternalDependencies, bool, __pool_attr_settable{true}> +{}; + +template <> +struct __pool_attr<::cudaMemPoolAttrReservedMemCurrent> + : __pool_attr_impl<::cudaMemPoolAttrReservedMemCurrent, size_t, __pool_attr_settable{false}> +{}; + +template <> +struct __pool_attr<::cudaMemPoolAttrUsedMemCurrent> + : __pool_attr_impl<::cudaMemPoolAttrUsedMemCurrent, size_t, __pool_attr_settable{false}> +{}; + +inline void __set_attribute_non_zero_only(::cudaMemPool_t __pool, ::CUmemPool_attribute __attr, size_t __value) +{ + if (__value != 0) + { + ::cuda::std::__throw_invalid_argument("This attribute can't be set to a non-zero value."); + } + ::cuda::__driver::__mempoolSetAttribute(__pool, __attr, &__value); +} + +template <> +struct __pool_attr<::cudaMemPoolAttrReservedMemHigh> + : __pool_attr_impl<::cudaMemPoolAttrReservedMemHigh, size_t, __pool_attr_settable{true}> +{ + static void set(::cudaMemPool_t __pool, type __value) + { + ::cuda::experimental::__set_attribute_non_zero_only(__pool, ::CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH, __value); + } +}; + +template <> +struct __pool_attr<::cudaMemPoolAttrUsedMemHigh> + : __pool_attr_impl<::cudaMemPoolAttrUsedMemHigh, size_t, __pool_attr_settable{true}> +{ + static void set(::cudaMemPool_t __pool, type 
__value)
+  {
+    ::cuda::experimental::__set_attribute_non_zero_only(__pool, ::CU_MEMPOOL_ATTR_USED_MEM_HIGH, __value);
+  }
+};
+
+namespace memory_pool_attributes
+{
+// The threshold at which the pool will release memory.
+using release_threshold_t = __pool_attr<::cudaMemPoolAttrReleaseThreshold>;
+static constexpr release_threshold_t release_threshold{};
+
+// Allow the pool to reuse the memory across streams as long as there is a stream ordering dependency between the
+// streams.
+using reuse_follow_event_dependencies_t = __pool_attr<::cudaMemPoolReuseFollowEventDependencies>;
+static constexpr reuse_follow_event_dependencies_t reuse_follow_event_dependencies{};
+
+// Allow the pool to reuse already completed frees when there is no dependency between the streams.
+using reuse_allow_opportunistic_t = __pool_attr<::cudaMemPoolReuseAllowOpportunistic>;
+static constexpr reuse_allow_opportunistic_t reuse_allow_opportunistic{};
+
+// Allow the pool to insert stream dependencies to reuse the memory across streams.
+using reuse_allow_internal_dependencies_t = __pool_attr<::cudaMemPoolReuseAllowInternalDependencies>;
+static constexpr reuse_allow_internal_dependencies_t reuse_allow_internal_dependencies{};
+
+// The current amount of memory reserved in the pool.
+using reserved_mem_current_t = __pool_attr<::cudaMemPoolAttrReservedMemCurrent>;
+static constexpr reserved_mem_current_t reserved_mem_current{};
+
+// The high water mark for the reserved memory in the pool.
+using reserved_mem_high_t = __pool_attr<::cudaMemPoolAttrReservedMemHigh>;
+static constexpr reserved_mem_high_t reserved_mem_high{};
+
+// The current amount of memory used in the pool.
+using used_mem_current_t = __pool_attr<::cudaMemPoolAttrUsedMemCurrent>;
+static constexpr used_mem_current_t used_mem_current{};
+
+// The high water mark for the used memory in the pool.
+using used_mem_high_t = __pool_attr<::cudaMemPoolAttrUsedMemHigh>;
+static constexpr used_mem_high_t used_mem_high{};
+} // namespace memory_pool_attributes
+
+//! @brief Checks whether the given device supports stream-ordered allocations.
+//! @param __device The device for which to query support.
+//! @throws cuda_error if \c cudaDeviceGetAttribute failed.
+//! @throws cuda_error if \c cudaDevAttrMemoryPoolsSupported is zero for the device.
+inline void __verify_device_supports_stream_ordered_allocations(const device_ref __device)
+{
+  if (!__device.attribute(::cuda::device_attributes::memory_pools_supported))
+  {
+    ::cuda::__throw_cuda_error(
+      ::cudaErrorNotSupported, "stream-ordered allocations are not supported on the given device");
+  }
+}
+
+//! @brief Check whether the specified `cudaMemAllocationHandleType` is supported on the present
+//! CUDA driver/runtime version.
+//! @param __device The device to check for support.
+//! @param __handle_type An IPC export handle type to check for support.
+//! @throws cuda_error if the specified `cudaMemAllocationHandleType` is not supported on the specified device.
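+//! @note A handle type of \c cudaMemHandleTypeNone is always accepted; the check below returns early in that case.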
+inline void __verify_device_supports_export_handle_type(
+  const device_ref __device, ::cudaMemAllocationHandleType __handle_type, ::CUmemLocation __location)
+{
+  if (__handle_type == ::cudaMemAllocationHandleType::cudaMemHandleTypeNone)
+  {
+    return;
+  }
+  if (__location.type != ::CU_MEM_LOCATION_TYPE_DEVICE
+#if _CCCL_CTK_AT_LEAST(12, 6)
+      && __location.type != ::CU_MEM_LOCATION_TYPE_HOST_NUMA
+#endif
+  )
+  {
+    ::cuda::__throw_cuda_error(
+      ::cudaErrorNotSupported, "Requested IPC memory handle type not supported for the given location");
+  }
+  auto __supported_handles = __device.attribute(::cuda::device_attributes::memory_pool_supported_handle_types);
+  if ((static_cast(__handle_type) & __supported_handles) != static_cast(__handle_type))
+  {
+    ::cuda::__throw_cuda_error(
+      ::cudaErrorNotSupported, "Requested IPC memory handle type not supported on the given device");
+  }
+}
+
+//! @brief Enable access to this memory pool from the supplied devices
+//!
+//! The device on which this pool resides may be included in the span.
+//!
+//! @param __pool The memory pool to set access for
+//! @param __devices A span of `device_ref`s listing devices to enable access for
+//! @param __flags The access flags to set
+//! @throws cuda_error if ``cudaMemPoolSetAccess`` fails.
+inline void
+__mempool_set_access(::CUmemoryPool __pool, ::cuda::std::span __devices, ::CUmemAccess_flags __flags)
+{
+  ::std::vector<::CUmemAccessDesc> __descs;
+  __descs.reserve(__devices.size());
+  for (size_t __i = 0; __i < __devices.size(); ++__i)
+  {
+    __descs.push_back({::CUmemLocation{::CU_MEM_LOCATION_TYPE_DEVICE, __devices[__i].get()}, __flags});
+  }
+  ::cuda::__driver::__mempoolSetAccess(__pool, __descs.data(), __descs.size());
+}
+
+//! @brief Query if memory from a pool is accessible by the supplied device
+//!
+//! @param __pool The memory pool to query access for
+//! @param __dev The device to query access for
+//! @returns true if the memory pool is accessible from the device
+[[nodiscard]] inline bool __mempool_get_access(::cudaMemPool_t __pool, device_ref __dev)
+{
+  ::CUmemAccess_flags __result;
+  ::CUmemLocation __loc;
+  __loc.type = ::CU_MEM_LOCATION_TYPE_DEVICE;
+  __loc.id   = __dev.get();
+  __result   = ::cuda::__driver::__mempoolGetAccess(__pool, &__loc);
+  return __result == ::CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+}
+
+//! @brief \c memory_pool_properties bundles the options used when creating a memory pool.
+//! Unlike attributes, properties cannot be changed after the pool is created.
+struct memory_pool_properties
+{
+  size_t initial_pool_size                           = 0;
+  size_t release_threshold                           = ::cuda::std::numeric_limits::max();
+  cudaMemAllocationHandleType allocation_handle_type = ::cudaMemAllocationHandleType::cudaMemHandleTypeNone;
+};
+
+//! @brief Creates the CUDA memory pool from the passed in arguments.
+//! @throws cuda_error If the creation of the CUDA memory pool failed.
+//! @returns The created CUDA memory pool.
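+//!
+//! (Illustrative) The pool constructors below call this helper roughly like so; the sizes are
+//! placeholders:
+//! @code
+//! memory_pool_properties __props{};
+//! __props.initial_pool_size = size_t{1} << 20; // prime the pool with 1 MiB
+//! __props.release_threshold = size_t{1} << 26; // keep up to 64 MiB alive across syncs
+//! ::cudaMemPool_t __handle  = __create_cuda_mempool(
+//!   __props, ::CUmemLocation{::CU_MEM_LOCATION_TYPE_DEVICE, 0}, ::CU_MEM_ALLOCATION_TYPE_PINNED);
+//! @endcode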
+[[nodiscard]] static cudaMemPool_t __create_cuda_mempool(
+  memory_pool_properties __properties, ::CUmemLocation __location, CUmemAllocationType __allocation_type)
+{
+  ::CUmemPoolProps __pool_properties{};
+  __pool_properties.allocType   = __allocation_type;
+  __pool_properties.handleTypes = ::CUmemAllocationHandleType(__properties.allocation_handle_type);
+  __pool_properties.location    = __location;
+
+  if (__properties.initial_pool_size > __properties.release_threshold)
+  {
+    ::cuda::std::__throw_invalid_argument("Initial pool size must not exceed the release threshold");
+  }
+
+  ::CUmemoryPool __cuda_pool_handle{};
+  ::cudaError_t __error = ::cuda::__driver::__mempoolCreateNoThrow(&__cuda_pool_handle, &__pool_properties);
+  if (__error != ::cudaSuccess)
+  {
+    // Mempool creation failed, let's try to figure out why
+    ::cuda::experimental::__verify_device_supports_stream_ordered_allocations(__location.id);
+    ::cuda::experimental::__verify_device_supports_export_handle_type(
+      __location.id, __properties.allocation_handle_type, __location);
+
+    // Could not find the reason, throw a generic error
+    ::cuda::__throw_cuda_error(__error, "Failed to create a memory pool");
+  }
+
+  ::cuda::__driver::__mempoolSetAttribute(
+    __cuda_pool_handle, ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, &__properties.release_threshold);
+
+  // Allocate the requested initial size to prime the pool.
+  // We need to use a new stream so we do not wait on other work
+  if (__properties.initial_pool_size != 0)
+  {
+    ::CUdeviceptr __ptr = ::cuda::__driver::__mallocFromPoolAsync(
+      __properties.initial_pool_size, __cuda_pool_handle, __cccl_allocation_stream().get());
+    if (::cuda::__driver::__freeAsyncNoThrow(__ptr, __cccl_allocation_stream().get()) != ::cudaSuccess)
+    {
+      ::cuda::__throw_cuda_error(::cudaErrorMemoryAllocation, "Failed to allocate initial pool size");
+    }
+  }
+  return __cuda_pool_handle;
+}
+
 class __memory_resource_base
 {
 protected:
@@ -62,7 +322,7 @@ public:
   //! @brief Constructs the __memory_resource_base from a \c cudaMemPool_t.
   //! @param __pool The \c cudaMemPool_t used to allocate memory.
-  explicit __memory_resource_base(::cudaMemPool_t __pool) noexcept
+  _CCCL_HOST_API explicit __memory_resource_base(::cudaMemPool_t __pool) noexcept
     : __pool_(__pool)
   {}
 
@@ -155,6 +415,57 @@ public:
     deallocate(__stream, __ptr, __bytes);
   }
 
+  //! @brief Tries to release memory.
+  //! @param __min_bytes_to_keep the minimal guaranteed size of the pool.
+  //! @note If the pool has less than \p __min_bytes_to_keep reserved, the trim_to operation is a no-op. Otherwise
+  //! the pool will be guaranteed to have at least \p __min_bytes_to_keep bytes reserved after the operation.
+  _CCCL_HOST_API void trim_to(const size_t __min_bytes_to_keep)
+  {
+    ::cuda::__driver::__mempoolTrimTo(__pool_, __min_bytes_to_keep);
+  }
+
+  //! @brief Gets the value of an attribute of the pool.
+  //! @param __attr the attribute to get.
+  //! @return The value of the attribute.
+  template
+  [[nodiscard]] _CCCL_HOST_API auto attribute(_Attr __attr) const
+  {
+    return __attr(__pool_);
+  }
+
+  //! @brief Gets the value of an attribute of the pool.
+  //! @tparam _Attr the attribute to get.
+  //! @return The value of the attribute.
+  template <::cudaMemPoolAttr _Attr>
+  _CCCL_HOST_API auto attribute() const
+  {
+    return attribute(__pool_attr<_Attr>());
+  }
+
+  //! @brief Sets an attribute of the pool to a given value.
+  //! @param __attr the attribute to be set.
+  //! @param __value the new value of that attribute.
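+  //!
+  //! (Illustrative) With the tags from \c memory_pool_attributes, for example:
+  //! @code
+  //! __resource.set_attribute(memory_pool_attributes::release_threshold, size_t{1} << 26);
+  //! size_t __reserved = __resource.attribute(memory_pool_attributes::reserved_mem_current);
+  //! @endcode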
+ template + _CCCL_HOST_API void set_attribute(_Attr __attr, typename _Attr::type __value) + { + __attr.set(__pool_, __value); + } + + //! @brief Sets an attribute of the pool to a given value. + //! @param __attribute the attribute to be set. + //! @param __value the new value of that attribute. + template <::cudaMemPoolAttr _Attr> + _CCCL_HOST_API void set_attribute(typename __pool_attr<_Attr>::type __value) + { + return set_attribute(__pool_attr<_Attr>(), __value); + } + + //! @brief Returns the underlying handle to the CUDA memory pool. + [[nodiscard]] _CCCL_API constexpr cudaMemPool_t get() const noexcept + { + return __pool_; + } + //! @brief Deallocate memory pointed to by \p __ptr. //! @param __ptr Pointer to be deallocated. Must have been allocated through a call to `allocate`. //! @param __bytes The number of bytes that was passed to the allocation call that returned \p __ptr. @@ -240,11 +551,6 @@ public: return __pool_ != __rhs.__pool_; } #endif // _CCCL_STD_VER <= 2017 - - [[nodiscard]] _CCCL_HOST_API constexpr cudaMemPool_t get() const noexcept - { - return __pool_; - } }; } // namespace cuda::experimental diff --git a/cudax/include/cuda/experimental/__memory_resource/pinned_memory_pool.cuh b/cudax/include/cuda/experimental/__memory_resource/pinned_memory_pool.cuh deleted file mode 100644 index a863c7037b1..00000000000 --- a/cudax/include/cuda/experimental/__memory_resource/pinned_memory_pool.cuh +++ /dev/null @@ -1,143 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of CUDA Experimental in CUDA C++ Core Libraries, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#ifndef _CUDAX__MEMORY_RESOURCE_PINNED_MEMORY_POOL_CUH -#define _CUDAX__MEMORY_RESOURCE_PINNED_MEMORY_POOL_CUH - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#if _CCCL_CTK_AT_LEAST(12, 6) - -# if _CCCL_CUDA_COMPILER(CLANG) -# include -# include -# endif // _CCCL_CUDA_COMPILER(CLANG) - -# include -# include - -# include - -//! @file -//! The \c pinned_memory_pool class provides a wrapper around a `cudaMempool_t`. -namespace cuda::experimental -{ - -class pinned_memory_resource; - -//! @brief \c pinned_memory_pool is an owning wrapper around a -//! cudaMemPool_t. -//! -//! It handles creation and destruction of the underlying pool utilizing the provided \c memory_pool_properties. -class pinned_memory_pool : public __memory_pool_base -{ -private: - //! @brief Constructs a \c pinned_memory_pool from a handle taking ownership of the pool - //! @param __handle The handle to the existing pool - _CCCL_HOST_API explicit pinned_memory_pool(__memory_pool_base::__from_handle_t, ::cudaMemPool_t __handle) noexcept - : __memory_pool_base(__memory_pool_base::__from_handle_t{}, __handle) - {} - -public: -# if _CCCL_CTK_AT_LEAST(13, 0) - //! @brief Constructs a \c pinned_memory_pool with optional properties. - //! Properties include the initial pool size and the release threshold. If the pool size grows beyond the release - //! 
threshold, unused memory held by the pool will be released at the next synchronization event. - - //! @note Memory from this pool is accessible from all devices right away, which differs from the default behavior of - //! pinned memory pools where memory is not accessible from devices until `cudaMemPoolSetAccess` is called. - //! - //! @param __properties Optional, additional properties of the pool to be created. - _CCCL_HOST_API explicit pinned_memory_pool(memory_pool_properties __properties = {}) - : __memory_pool_base( - __properties, ::CUmemLocation{::CU_MEM_LOCATION_TYPE_HOST, 0}, ::CU_MEM_ALLOCATION_TYPE_PINNED) - { - enable_access_from(devices); - } -# endif // _CCCL_CTK_AT_LEAST(13, 0) - - //! @brief Constructs a \c pinned_memory_pool with the specified NUMA node id and optional properties. - //! Properties include the initial pool size and the release threshold. If the pool size grows beyond the release - //! threshold, unused memory held by the pool will be released at the next synchronization event. - //! - //! @note Memory from this pool is accessible from all devices right away, which differs from the default behavior of - //! pinned memory pools where memory is not accessible from devices until `cudaMemPoolSetAccess` is called. - //! - //! @param __numa_id The NUMA node id of the NUMA node the pool is constructed on. - //! @param __pool_properties Optional, additional properties of the pool to be created. - _CCCL_HOST_API explicit pinned_memory_pool(int __numa_id, memory_pool_properties __properties = {}) - : __memory_pool_base( - __properties, ::CUmemLocation{::CU_MEM_LOCATION_TYPE_HOST_NUMA, __numa_id}, ::CU_MEM_ALLOCATION_TYPE_PINNED) - { - enable_access_from(devices); - } - - //! @brief Disables construction from a plain `cudaMemPool_t`. We want to ensure clean ownership semantics. - pinned_memory_pool(::cudaMemPool_t) = delete; - - pinned_memory_pool(pinned_memory_pool const&) = delete; - pinned_memory_pool(pinned_memory_pool&&) = delete; - pinned_memory_pool& operator=(pinned_memory_pool const&) = delete; - pinned_memory_pool& operator=(pinned_memory_pool&&) = delete; - - //! @brief Construct an `pinned_memory_pool` object from a native `cudaMemPool_t` handle. - //! - //! @param __handle The native handle - //! - //! @return The constructed `pinned_memory_pool` object - //! - //! @note The constructed `pinned_memory_pool` object takes ownership of the native handle. - [[nodiscard]] static pinned_memory_pool from_native_handle(::cudaMemPool_t __handle) noexcept - { - return pinned_memory_pool(__memory_pool_base::__from_handle_t{}, __handle); - } - - // Disallow construction from an `int`, e.g., `0`. - static pinned_memory_pool from_native_handle(int) = delete; - - // Disallow construction from `nullptr`. - static pinned_memory_pool from_native_handle(::cuda::std::nullptr_t) = delete; - - using resource_type = pinned_memory_resource; -}; - -[[nodiscard]] static ::cudaMemPool_t __get_default_host_pinned_pool() -{ -# if _CCCL_CTK_AT_LEAST(13, 0) - static ::cudaMemPool_t __default_pool = []() { - ::cudaMemPool_t __pool = ::cuda::__driver::__getDefaultMemPool( - ::CUmemLocation{::CU_MEM_LOCATION_TYPE_HOST, 0}, ::CU_MEM_ALLOCATION_TYPE_PINNED); - // TODO should we be more careful with setting access from all devices? Maybe only if it was not set for any device? 
-    ::cuda::experimental::__mempool_set_access(__pool, ::cuda::devices, ::CU_MEM_ACCESS_FLAGS_PROT_READWRITE);
-    return __pool;
-  }();
-
-  return __default_pool;
-# else // _CCCL_CTK_BELOW(13, 0)
-  static pinned_memory_pool __default_pool(0);
-  return __default_pool.get();
-# endif // _CCCL_CTK_BELOW(13, 0)
-}
-
-} // namespace cuda::experimental
-
-# include
-
-#endif // _CCCL_CTK_AT_LEAST(12, 6)
-
-#endif // _CUDAX__MEMORY_RESOURCE_PINNED_MEMORY_POOL_CUH
diff --git a/cudax/include/cuda/experimental/__memory_resource/pinned_memory_resource.cuh b/cudax/include/cuda/experimental/__memory_resource/pinned_memory_resource.cuh
index 1fa0e32ba56..8493155550d 100644
--- a/cudax/include/cuda/experimental/__memory_resource/pinned_memory_resource.cuh
+++ b/cudax/include/cuda/experimental/__memory_resource/pinned_memory_resource.cuh
@@ -32,7 +32,6 @@
 #include
 #include
-#include
 #include
 
@@ -43,15 +42,18 @@ namespace cuda::experimental
 
 #if _CCCL_CUDACC_AT_LEAST(12, 6)
 
+static ::cudaMemPool_t __get_default_host_pinned_pool();
+
 //! @rst
 //! .. _cudax-memory-resource-async:
 //!
 //! Stream ordered memory resource
 //! ------------------------------
 //!
-//! ``pinned_memory_resource`` uses `cudaMallocFromPoolAsync / cudaFreeAsync
+//! ``pinned_memory_resource`` allocates pinned memory using `cudaMallocFromPoolAsync / cudaFreeAsync
 //! `__ for allocation/deallocation. A
-//! ``pinned_memory_resource`` is a thin wrapper around a \c cudaMemPool_t.
+//! ``pinned_memory_resource`` is a thin wrapper around a \c cudaMemPool_t with the location type set to \c
+//! cudaMemLocationTypeHost or \c cudaMemLocationTypeHostNuma.
 //!
 //! .. warning::
 //!
@@ -74,12 +76,6 @@ public:
     : __memory_resource_base(__pool)
   {}
 
-  //! @brief Constructs the pinned_memory_resource from a \c pinned_memory_pool by calling get().
-  //! @param __pool The \c pinned_memory_pool used to allocate memory.
-  _CCCL_HOST_API explicit pinned_memory_resource(pinned_memory_pool& __pool) noexcept
-    : __memory_resource_base(__pool.get())
-  {}
-
   //! @brief Enables the \c device_accessible property
   _CCCL_HOST_API friend constexpr void get_property(pinned_memory_resource const&, device_accessible) noexcept {}
   //! @brief Enables the \c host_accessible property
@@ -88,9 +84,98 @@ public:
 
   using default_queries = properties_list;
 };
+//! @rst
+//! .. _cudax-pinned-memory-pool:
+//!
+//! Stream ordered memory pool
+//! --------------------------
+//!
+//! ``pinned_memory_pool`` allocates pinned memory using `cudaMallocFromPoolAsync / cudaFreeAsync
+//! `__ for allocation/deallocation.
+//! When constructed it creates an underlying \c cudaMemPool_t with the location type set to \c cudaMemLocationTypeHost
+//! or \c cudaMemLocationTypeHostNuma and owns it.
+//!
+//! @endrst
+struct pinned_memory_pool : pinned_memory_resource
+{
+  using reference_type = pinned_memory_resource;
+
+# if _CCCL_CTK_AT_LEAST(13, 0)
+  //! @brief Constructs a \c pinned_memory_pool with optional properties.
+  //! Properties include the initial pool size and the release threshold. If the pool size grows beyond the release
+  //! threshold, unused memory held by the pool will be released at the next synchronization event.
+  //!
+  //! @note Memory from this pool is accessible from all devices right away, which differs from the default behavior of
+  //! pinned memory pools where memory is not accessible from devices until `cudaMemPoolSetAccess` is called.
+  //!
+  //! @param __properties Optional, additional properties of the pool to be created.
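+  //!
+  //! A minimal usage sketch (illustrative only; sizes and names are placeholders):
+  //! @code
+  //! cuda::experimental::pinned_memory_pool pool{};
+  //! cuda::experimental::stream stream{cuda::devices[0]};
+  //! void* ptr = pool.allocate(stream, 256); // host-pinned, accessible from all devices
+  //! pool.deallocate(stream, ptr, 256);
+  //! @endcode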
+  _CCCL_HOST_API explicit pinned_memory_pool(memory_pool_properties __properties = {})
+      : pinned_memory_resource(__create_cuda_mempool(
+          __properties, ::CUmemLocation{::CU_MEM_LOCATION_TYPE_HOST, 0}, ::CU_MEM_ALLOCATION_TYPE_PINNED))
+  {
+    enable_access_from(cuda::devices);
+  }
+# endif // _CCCL_CTK_AT_LEAST(13, 0)
+
+  //! @brief Constructs a \c pinned_memory_pool with the specified NUMA node id and optional properties.
+  //! Properties include the initial pool size and the release threshold. If the pool size grows beyond the release
+  //! threshold, unused memory held by the pool will be released at the next synchronization event.
+  //!
+  //! @note Memory from this pool is accessible from all devices right away, which differs from the default behavior of
+  //! pinned memory pools where memory is not accessible from devices until `cudaMemPoolSetAccess` is called.
+  //!
+  //! @param __numa_id The NUMA node id of the NUMA node the pool is constructed on.
+  //! @param __properties Optional, additional properties of the pool to be created.
+  _CCCL_HOST_API explicit pinned_memory_pool(int __numa_id, memory_pool_properties __properties = {})
+      : pinned_memory_resource(__create_cuda_mempool(
+          __properties, ::CUmemLocation{::CU_MEM_LOCATION_TYPE_HOST_NUMA, __numa_id}, ::CU_MEM_ALLOCATION_TYPE_PINNED))
+  {
+    enable_access_from(cuda::devices);
+  }
+
+  //! @brief Destroys the owned \c cudaMemPool_t.
+  ~pinned_memory_pool() noexcept
+  {
+    ::cuda::__driver::__mempoolDestroy(__pool_);
+  }
+
+  //! @brief Constructs a \c pinned_memory_pool from a native `cudaMemPool_t` handle, taking ownership of it.
+  //! @param __pool The native handle.
+  //! @return The constructed \c pinned_memory_pool object.
+  [[nodiscard]] _CCCL_HOST_API static pinned_memory_pool from_native_handle(::cudaMemPool_t __pool) noexcept
+  {
+    return pinned_memory_pool(__pool);
+  }
+
+  pinned_memory_pool(const pinned_memory_pool&)            = delete;
+  pinned_memory_pool& operator=(const pinned_memory_pool&) = delete;
+
+private:
+  pinned_memory_pool(::cudaMemPool_t __pool) noexcept
+      : pinned_memory_resource(__pool)
+  {}
+};
+
 static_assert(::cuda::mr::resource_with<pinned_memory_resource, device_accessible>, "");
 static_assert(::cuda::mr::resource_with<pinned_memory_resource, host_accessible>, "");
+static_assert(::cuda::mr::resource_with<pinned_memory_pool, device_accessible>, "");
+static_assert(::cuda::mr::resource_with<pinned_memory_pool, host_accessible>, "");
+
+[[nodiscard]] static ::cudaMemPool_t __get_default_host_pinned_pool()
+{
+# if _CCCL_CTK_AT_LEAST(13, 0)
+  static ::cudaMemPool_t __default_pool = []() {
+    ::cudaMemPool_t __pool = ::cuda::__driver::__getDefaultMemPool(
+      ::CUmemLocation{::CU_MEM_LOCATION_TYPE_HOST, 0}, ::CU_MEM_ALLOCATION_TYPE_PINNED);
+    // TODO should we be more careful with setting access from all devices? Maybe only if it was not set for any device?
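+    // A sketch of the more conservative option mentioned above (not implemented here): query the
+    // current flags with cudaMemPoolGetAccess per device and only upgrade devices that still report
+    // cudaMemAccessFlagsProtNone, leaving any explicitly configured access untouched.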
+    ::cuda::experimental::__mempool_set_access(__pool, ::cuda::devices, ::CU_MEM_ACCESS_FLAGS_PROT_READWRITE);
+    return __pool;
+  }();
+
+  return __default_pool;
+# else // _CCCL_CTK_BELOW(13, 0)
+  static pinned_memory_pool __default_pool(0);
+  return __default_pool.get();
+# endif // _CCCL_CTK_BELOW(13, 0)
+}
+
 #endif // _CCCL_CUDACC_AT_LEAST(12, 6)
 } // namespace cuda::experimental
diff --git a/cudax/include/cuda/experimental/memory_resource.cuh b/cudax/include/cuda/experimental/memory_resource.cuh
index a3f5070c5b1..55c8f2491ef 100644
--- a/cudax/include/cuda/experimental/memory_resource.cuh
+++ b/cudax/include/cuda/experimental/memory_resource.cuh
@@ -23,13 +23,10 @@
 #endif // !LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE
 #include
-#include
 #include
 #include
 #include
-#include
 #include
-#include
 #include
 #include
 #include
diff --git a/cudax/test/memory_resource/device_memory_resource.cu b/cudax/test/memory_resource/device_memory_resource.cu
index 55ab54a7922..2cbf4667a1e 100644
--- a/cudax/test/memory_resource/device_memory_resource.cu
+++ b/cudax/test/memory_resource/device_memory_resource.cu
@@ -144,8 +144,7 @@ C2H_CCCLRT_TEST("device_memory_resource construction", "[memory_resource]")
     cudax::memory_pool_properties props = {
       42,
     };
-    cudax::device_memory_pool pool{current_device, props};
-    test_resource from_initial_pool_size{pool};
+    cudax::device_memory_pool from_initial_pool_size{current_device, props};
     ::cudaMemPool_t get = from_initial_pool_size.get();
     CHECK(get != current_default_pool);
@@ -166,8 +165,7 @@ C2H_CCCLRT_TEST("device_memory_resource construction", "[memory_resource]")
       20,
       42,
     };
-    cudax::device_memory_pool pool{current_device, props};
-    test_resource with_threshold{pool};
+    cudax::device_memory_pool with_threshold{current_device, props};
     ::cudaMemPool_t get = with_threshold.get();
     CHECK(get != current_default_pool);
@@ -190,8 +188,7 @@ C2H_CCCLRT_TEST("device_memory_resource construction", "[memory_resource]")
       42,
       ::cudaMemHandleTypePosixFileDescriptor,
     };
-    cudax::device_memory_pool pool{current_device, props};
-    test_resource with_allocation_handle{pool};
+    cudax::device_memory_pool with_allocation_handle{current_device, props};
     ::cudaMemPool_t get = with_allocation_handle.get();
     CHECK(get != current_default_pool);
@@ -462,9 +459,8 @@ C2H_CCCLRT_TEST("Async memory resource access", "")
   if (peers.size() > 0)
   {
     cudax::device_memory_pool pool{cuda::devices[0]};
-    cudax::device_memory_resource resource{pool};
     cudax::stream stream{peers.front()};
-    CUDAX_CHECK(resource.is_accessible_from(cuda::devices[0]));
+    CUDAX_CHECK(pool.is_accessible_from(cuda::devices[0]));
 
     auto allocate_and_check_access = [&](auto& resource) {
       auto* ptr1 = resource.allocate(stream, sizeof(int));
@@ -477,42 +473,41 @@ C2H_CCCLRT_TEST("Async memory resource access", "")
       resource.deallocate_sync(ptr2, sizeof(int));
     };
-    resource.enable_access_from(peers);
+    pool.enable_access_from(peers);
     CUDAX_CHECK(pool.is_accessible_from(peers.front()));
+    allocate_and_check_access(pool);
+
+    cudax::device_memory_resource resource{pool};
     CUDAX_CHECK(resource.is_accessible_from(peers.front()));
     allocate_and_check_access(resource);
-    cudax::device_memory_resource another_resource{pool};
-    CUDAX_CHECK(another_resource.is_accessible_from(peers.front()));
-    allocate_and_check_access(another_resource);
-
-    resource.disable_access_from(peers.front());
+    pool.disable_access_from(peers.front());
+    CUDAX_CHECK(!pool.is_accessible_from(peers.front()));
     CUDAX_CHECK(!resource.is_accessible_from(peers.front()));
-    CUDAX_CHECK(!another_resource.is_accessible_from(peers.front()));
     if (peers.size() > 1)
     {
-      CUDAX_CHECK(resource.is_accessible_from(peers[1]));
+      CUDAX_CHECK(pool.is_accessible_from(peers[1]));
     }
-    resource.disable_access_from(peers);
+    pool.disable_access_from(peers);
 
-    resource.enable_access_from(peers.front());
+    pool.enable_access_from(peers.front());
+    CUDAX_CHECK(pool.is_accessible_from(peers.front()));
     CUDAX_CHECK(resource.is_accessible_from(peers.front()));
-    CUDAX_CHECK(another_resource.is_accessible_from(peers.front()));
 
     // Check if enable can include the device on which the pool resides
     {
       std::vector peers_ext(peers.begin(), peers.end());
       peers_ext.push_back(cuda::devices[0]);
-      resource.enable_access_from(peers_ext);
+      pool.enable_access_from(peers_ext);
 
       // Check the resource using the default pool
       cudax::device_memory_resource default_pool_resource{cuda::device_ref{0}};
       cudax::device_memory_resource another_default_pool_resource{cuda::device_ref{0}};
-      default_pool_resource.enable_access_from(peers_ext.front());
+      pool.enable_access_from(peers_ext.front());
       CUDAX_CHECK(default_pool_resource.is_accessible_from(peers_ext.front()));
       allocate_and_check_access(default_pool_resource);
diff --git a/cudax/test/memory_resource/memory_pools.cu b/cudax/test/memory_resource/memory_pools.cu
index bffe71947c3..778fd340a92 100644
--- a/cudax/test/memory_resource/memory_pools.cu
+++ b/cudax/test/memory_resource/memory_pools.cu
@@ -69,12 +69,12 @@ PoolType construct_pool([[maybe_unused]] int device_id, cudax::memory_pool_properties
 #if _CCCL_CTK_AT_LEAST(12, 6)
   if constexpr (cuda::std::is_same_v<PoolType, cudax::pinned_memory_pool>)
   {
-    return PoolType(0, props);
+    return cudax::pinned_memory_pool(0, props);
   }
   else
   {
 # if _CCCL_CTK_AT_LEAST(13, 0)
-    return PoolType(props);
+    return cudax::managed_memory_pool(props);
 # endif // _CCCL_CTK_AT_LEAST(13, 0)
   }
 #endif // _CCCL_CTK_AT_LEAST(12, 6)
@@ -259,13 +259,6 @@ C2H_CCCLRT_TEST_LIST("device_memory_pool comparison", "[memory_resource]", TEST_TYPES)
     CHECK(first == first);
     CHECK(first != second);
   }
-
-  { // comparison against a cudaMemPool_t
-    CHECK(first == first.get());
-    CHECK(first.get() == first);
-    CHECK(first != current_default_pool);
-    CHECK(current_default_pool != first);
-  }
 }
 
 C2H_CCCLRT_TEST_LIST("device_memory_pool accessors", "[memory_resource]", TEST_TYPES)
@@ -281,7 +274,7 @@
   }
 
   using memory_pool     = TestType;
-  using memory_resource = typename memory_pool::resource_type;
+  using memory_resource = typename memory_pool::reference_type;
 
   SECTION("device_memory_pool::set_attribute")
   {
     memory_pool pool = construct_pool(current_device);
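
Taken together, the test changes above illustrate the new ownership model: each `*_memory_pool` type now owns its `cudaMemPool_t` and is itself a resource, while the matching `*_memory_resource` (exposed as `reference_type`) is a non-owning, copyable view of the same pool. A minimal sketch of the migration, assuming device 0 is present; the header names and the `demo` wrapper are illustrative, and the allocate / deallocate_sync calls follow the interface used in the tests:

```cpp
#include <cuda/experimental/memory_resource.cuh>
#include <cuda/experimental/stream.cuh>

namespace cudax = cuda::experimental;

void demo()
{
  cudax::stream stream{cuda::devices[0]};           // stream on device 0
  cudax::device_memory_pool pool{cuda::devices[0]}; // owns the underlying cudaMemPool_t

  // The pool itself satisfies the resource interface now.
  void* p = pool.allocate(stream, 1024);
  pool.deallocate_sync(p, 1024);

  // A non-owning, copyable reference to the same pool, usable wherever a
  // resource handle is needed (reference_type == device_memory_resource).
  cudax::device_memory_pool::reference_type ref{pool};
  void* q = ref.allocate(stream, 1024);
  ref.deallocate_sync(q, 1024);
} // pool's destructor releases the underlying cudaMemPool_t
```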