Drop experimental TMA exposure in cuda::barrier #6225
@@ -38,230 +38,4 @@
#include <cuda/ptx>
#include <cuda/std/barrier>

// Forward-declare CUtensorMap for use in cp_async_bulk_tensor_* PTX wrapping
// functions. These functions take a pointer to CUtensorMap, so do not need to
// know its size. This type is defined in cuda.h (driver API) as:
//
//   typedef struct CUtensorMap_st { [ .. snip .. ] } CUtensorMap;
//
// We need to forward-declare both CUtensorMap_st (the struct) and CUtensorMap
// (the typedef):
struct CUtensorMap_st;
typedef struct CUtensorMap_st CUtensorMap;

#include <cuda/std/__cccl/prologue.h>

_CCCL_BEGIN_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL

// Experimental exposure of TMA PTX:
//
// - cp_async_bulk_global_to_shared
// - cp_async_bulk_shared_to_global
// - cp_async_bulk_tensor_{1,2,3,4,5}d_global_to_shared
// - cp_async_bulk_tensor_{1,2,3,4,5}d_shared_to_global
// - fence_proxy_async_shared_cta
// - cp_async_bulk_commit_group
// - cp_async_bulk_wait_group_read<0, …, 7>

// These PTX wrappers are only available when the code is compiled compute
// capability 9.0 and above. The check for (!defined(__CUDA_MINIMUM_ARCH__)) is
// necessary to prevent cudafe from ripping out the device functions before
// device compilation begins.
#ifdef __cccl_lib_experimental_ctk12_cp_async_exposure

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
inline _CCCL_DEVICE void cp_async_bulk_global_to_shared(
  void* __dest, const void* __src, ::cuda::std::uint32_t __size, ::cuda::barrier<::cuda::thread_scope_block>& __bar)
{
  _CCCL_ASSERT(__size % 16 == 0, "Size must be multiple of 16.");
  _CCCL_ASSERT(::cuda::device::is_address_from(__dest, ::cuda::device::address_space::shared),
               "Destination must be shared memory address.");
  _CCCL_ASSERT(::cuda::device::is_address_from(__src, ::cuda::device::address_space::global),
               "Source must be global memory address.");

  ::cuda::ptx::cp_async_bulk(
    ::cuda::ptx::space_cluster,
    ::cuda::ptx::space_global,
    __dest,
    __src,
    __size,
    ::cuda::device::barrier_native_handle(__bar));
}
Comment on lines -73 to -89

I find it a bit sad to leave some of the assertions behind here. But those functions were not used anyway in our

Adding assertions in the code generator could be very hard because they are specific for each instruction. For example, I wrote the code for
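For context, a hedged sketch of what a caller migrating off the removed wrapper might write, keeping the dropped checks as device-side asserts: the helper name is hypothetical, the __isShared/__isGlobal intrinsics stand in for the is_address_from checks shown above, and sm_90 plus a CUDA 12 toolkit are assumed.

#include <cassert>
#include <cuda/barrier>
#include <cuda/ptx>
#include <cuda/std/cstdint>

// Hypothetical user-side replacement for the removed wrapper (sketch, not CCCL API).
__device__ void bulk_copy_global_to_shared(
  void* dest, const void* src, ::cuda::std::uint32_t size,
  ::cuda::barrier<::cuda::thread_scope_block>& bar)
{
  assert(size % 16 == 0);   // same size requirement as the removed assertion
  assert(__isShared(dest)); // destination must be a shared-memory address
  assert(__isGlobal(src));  // source must be a global-memory address

  // Same cuda::ptx call the removed wrapper forwarded to.
  ::cuda::ptx::cp_async_bulk(
    ::cuda::ptx::space_cluster,
    ::cuda::ptx::space_global,
    dest,
    src,
    size,
    ::cuda::device::barrier_native_handle(bar));
}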
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
inline _CCCL_DEVICE void cp_async_bulk_shared_to_global(void* __dest, const void* __src, ::cuda::std::uint32_t __size)
{
  _CCCL_ASSERT(__size % 16 == 0, "Size must be multiple of 16.");
  _CCCL_ASSERT(::cuda::device::is_address_from(__dest, ::cuda::device::address_space::global),
               "Destination must be global memory address.");
  _CCCL_ASSERT(::cuda::device::is_address_from(__src, ::cuda::device::address_space::shared),
               "Source must be shared memory address.");

  ::cuda::ptx::cp_async_bulk(::cuda::ptx::space_global, ::cuda::ptx::space_shared, __dest, __src, __size);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void cp_async_bulk_tensor_1d_global_to_shared(
  void* __dest, const CUtensorMap* __tensor_map, int __c0, ::cuda::barrier<::cuda::thread_scope_block>& __bar)
{
  const ::cuda::std::int32_t __coords[]{__c0};

  ::cuda::ptx::cp_async_bulk_tensor(
    ::cuda::ptx::space_cluster,
    ::cuda::ptx::space_global,
    __dest,
    __tensor_map,
    __coords,
    ::cuda::device::barrier_native_handle(__bar));
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void cp_async_bulk_tensor_2d_global_to_shared(
  void* __dest, const CUtensorMap* __tensor_map, int __c0, int __c1, ::cuda::barrier<::cuda::thread_scope_block>& __bar)
{
  const ::cuda::std::int32_t __coords[]{__c0, __c1};

  ::cuda::ptx::cp_async_bulk_tensor(
    ::cuda::ptx::space_cluster,
    ::cuda::ptx::space_global,
    __dest,
    __tensor_map,
    __coords,
    ::cuda::device::barrier_native_handle(__bar));
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void cp_async_bulk_tensor_3d_global_to_shared(
  void* __dest,
  const CUtensorMap* __tensor_map,
  int __c0,
  int __c1,
  int __c2,
  ::cuda::barrier<::cuda::thread_scope_block>& __bar)
{
  const ::cuda::std::int32_t __coords[]{__c0, __c1, __c2};

  ::cuda::ptx::cp_async_bulk_tensor(
    ::cuda::ptx::space_cluster,
    ::cuda::ptx::space_global,
    __dest,
    __tensor_map,
    __coords,
    ::cuda::device::barrier_native_handle(__bar));
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void cp_async_bulk_tensor_4d_global_to_shared(
  void* __dest,
  const CUtensorMap* __tensor_map,
  int __c0,
  int __c1,
  int __c2,
  int __c3,
  ::cuda::barrier<::cuda::thread_scope_block>& __bar)
{
  const ::cuda::std::int32_t __coords[]{__c0, __c1, __c2, __c3};

  ::cuda::ptx::cp_async_bulk_tensor(
    ::cuda::ptx::space_cluster,
    ::cuda::ptx::space_global,
    __dest,
    __tensor_map,
    __coords,
    ::cuda::device::barrier_native_handle(__bar));
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void cp_async_bulk_tensor_5d_global_to_shared(
  void* __dest,
  const CUtensorMap* __tensor_map,
  int __c0,
  int __c1,
  int __c2,
  int __c3,
  int __c4,
  ::cuda::barrier<::cuda::thread_scope_block>& __bar)
{
  const ::cuda::std::int32_t __coords[]{__c0, __c1, __c2, __c3, __c4};

  ::cuda::ptx::cp_async_bulk_tensor(
    ::cuda::ptx::space_cluster,
    ::cuda::ptx::space_global,
    __dest,
    __tensor_map,
    __coords,
    ::cuda::device::barrier_native_handle(__bar));
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void
cp_async_bulk_tensor_1d_shared_to_global(const CUtensorMap* __tensor_map, int __c0, const void* __src)
{
  const ::cuda::std::int32_t __coords[]{__c0};

  ::cuda::ptx::cp_async_bulk_tensor(::cuda::ptx::space_global, ::cuda::ptx::space_shared, __tensor_map, __coords, __src);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void
cp_async_bulk_tensor_2d_shared_to_global(const CUtensorMap* __tensor_map, int __c0, int __c1, const void* __src)
{
  const ::cuda::std::int32_t __coords[]{__c0, __c1};

  ::cuda::ptx::cp_async_bulk_tensor(::cuda::ptx::space_global, ::cuda::ptx::space_shared, __tensor_map, __coords, __src);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void cp_async_bulk_tensor_3d_shared_to_global(
  const CUtensorMap* __tensor_map, int __c0, int __c1, int __c2, const void* __src)
{
  const ::cuda::std::int32_t __coords[]{__c0, __c1, __c2};

  ::cuda::ptx::cp_async_bulk_tensor(::cuda::ptx::space_global, ::cuda::ptx::space_shared, __tensor_map, __coords, __src);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void cp_async_bulk_tensor_4d_shared_to_global(
  const CUtensorMap* __tensor_map, int __c0, int __c1, int __c2, int __c3, const void* __src)
{
  const ::cuda::std::int32_t __coords[]{__c0, __c1, __c2, __c3};

  ::cuda::ptx::cp_async_bulk_tensor(::cuda::ptx::space_global, ::cuda::ptx::space_shared, __tensor_map, __coords, __src);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void cp_async_bulk_tensor_5d_shared_to_global(
  const CUtensorMap* __tensor_map, int __c0, int __c1, int __c2, int __c3, int __c4, const void* __src)
{
  const ::cuda::std::int32_t __coords[]{__c0, __c1, __c2, __c3, __c4};

  ::cuda::ptx::cp_async_bulk_tensor(::cuda::ptx::space_global, ::cuda::ptx::space_shared, __tensor_map, __coords, __src);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar
inline _CCCL_DEVICE void fence_proxy_async_shared_cta()
{
  ::cuda::ptx::fence_proxy_async(::cuda::ptx::space_shared);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group
inline _CCCL_DEVICE void cp_async_bulk_commit_group()
{
  ::cuda::ptx::cp_async_bulk_commit_group();
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group
template <int __n_prior>
inline _CCCL_DEVICE void cp_async_bulk_wait_group_read()
{
  static_assert(__n_prior <= 63, "cp_async_bulk_wait_group_read: waiting for more than 63 groups is not supported.");
  ::cuda::ptx::cp_async_bulk_wait_group_read(::cuda::ptx::n32_t<__n_prior>{});
}

#endif // __cccl_lib_experimental_ctk12_cp_async_exposure

_CCCL_END_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL

#include <cuda/std/__cccl/epilogue.h>

#endif // _CUDA_BARRIER
This file was deleted.
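For reference, the last three wrappers in the removed file are one-liners over cuda::ptx, so callers can invoke the PTX functions directly. A hedged sketch of the equivalent sequence in user code follows; the function and variable names are hypothetical, and sm_90 with a CUDA 12 toolkit is assumed.

#include <cuda/ptx>
#include <cuda/std/cstdint>

// Hypothetical sketch: a shared-to-global TMA store followed by the fence /
// commit / wait sequence that the removed one-line wrappers forwarded to.
__device__ void store_tile_and_wait(void* gmem_dst, const void* smem_src, ::cuda::std::uint32_t bytes)
{
  // was: fence_proxy_async_shared_cta()
  ::cuda::ptx::fence_proxy_async(::cuda::ptx::space_shared);
  // was: cp_async_bulk_shared_to_global(gmem_dst, smem_src, bytes)
  ::cuda::ptx::cp_async_bulk(::cuda::ptx::space_global, ::cuda::ptx::space_shared, gmem_dst, smem_src, bytes);
  // was: cp_async_bulk_commit_group()
  ::cuda::ptx::cp_async_bulk_commit_group();
  // was: cp_async_bulk_wait_group_read<0>()
  ::cuda::ptx::cp_async_bulk_wait_group_read(::cuda::ptx::n32_t<0>{});
}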
This is a borderline breaking change. In principle, we require users to include the headers for what they should use, so I think they need to ensure they have the right header to use CUtensorMap. But I am also fine leaving the forward declaration in.
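In practice this means a translation unit that passes a CUtensorMap* to the cuda::ptx tensor copies now has to provide the type itself. A hedged sketch of hypothetical user code, assuming a CUDA 12 toolkit:

// Option 1: include the driver API header, which defines CUtensorMap.
#include <cuda.h>
// Option 2: forward-declare it exactly as the removed header used to:
//   struct CUtensorMap_st;
//   typedef struct CUtensorMap_st CUtensorMap;

#include <cuda/barrier>
#include <cuda/ptx>
#include <cuda/std/cstdint>

// Hypothetical helper mirroring the removed 2D wrapper.
__device__ void copy_tile_2d(
  void* smem_dst, const CUtensorMap* tensor_map, int c0, int c1,
  ::cuda::barrier<::cuda::thread_scope_block>& bar)
{
  const ::cuda::std::int32_t coords[]{c0, c1};

  ::cuda::ptx::cp_async_bulk_tensor(
    ::cuda::ptx::space_cluster,
    ::cuda::ptx::space_global,
    smem_dst,
    tensor_map,
    coords,
    ::cuda::device::barrier_native_handle(bar));
}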