226 changes: 0 additions & 226 deletions libcudacxx/include/cuda/barrier
@@ -38,230 +38,4 @@
#include <cuda/ptx>
#include <cuda/std/barrier>

// Forward-declare CUtensorMap for use in cp_async_bulk_tensor_* PTX wrapping
// functions. These functions take a pointer to CUtensorMap, so do not need to
// know its size. This type is defined in cuda.h (driver API) as:
//
// typedef struct CUtensorMap_st { [ .. snip .. ] } CUtensorMap;
//
// We need to forward-declare both CUtensorMap_st (the struct) and CUtensorMap
// (the typedef):
struct CUtensorMap_st;
typedef struct CUtensorMap_st CUtensorMap;
Comment on lines -41 to -50
Contributor Author:
This is a borderline breaking change. In principle, we require users to include the headers for whatever they use, so they would need to make sure they include the right header to get CUtensorMap. But I am also fine with leaving the forward declaration in.
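
For illustration, a minimal sketch (not part of this diff) of what user code might look like once the forward declaration is gone: the CUtensorMap definition would have to come from the driver API header, cuda.h, or from the user's own forward declaration, before calling the cuda::ptx tensor-map wrappers. The helper name copy_1d_tile is hypothetical; the cuda::ptx call mirrors the removed cp_async_bulk_tensor_1d_global_to_shared wrapper below.

```cpp
// Hypothetical user code after this change: <cuda/barrier> no longer
// forward-declares CUtensorMap, so the type must come from the driver API
// header (cuda.h) or from an equivalent user-side forward declaration.
#include <cuda.h> // defines CUtensorMap
#include <cuda/barrier>
#include <cuda/ptx>
#include <cuda/std/cstdint>

__device__ void copy_1d_tile( // hypothetical helper name
  void* smem_dest,
  const CUtensorMap* tensor_map,
  int c0,
  cuda::barrier<cuda::thread_scope_block>& bar)
{
  const cuda::std::int32_t coords[]{c0};
  // Same call the removed cp_async_bulk_tensor_1d_global_to_shared wrapper issued:
  cuda::ptx::cp_async_bulk_tensor(
    cuda::ptx::space_cluster,
    cuda::ptx::space_global,
    smem_dest,
    tensor_map,
    coords,
    cuda::device::barrier_native_handle(bar));
}
```

Whether requiring that extra include is acceptable is exactly the trade-off this comment raises.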


#include <cuda/std/__cccl/prologue.h>

_CCCL_BEGIN_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL

// Experimental exposure of TMA PTX:
//
// - cp_async_bulk_global_to_shared
// - cp_async_bulk_shared_to_global
// - cp_async_bulk_tensor_{1,2,3,4,5}d_global_to_shared
// - cp_async_bulk_tensor_{1,2,3,4,5}d_shared_to_global
// - fence_proxy_async_shared_cta
// - cp_async_bulk_commit_group
// - cp_async_bulk_wait_group_read<0, …, 7>

// These PTX wrappers are only available when the code is compiled for compute
// capability 9.0 and above. The check for (!defined(__CUDA_MINIMUM_ARCH__)) is
// necessary to prevent cudafe from ripping out the device functions before
// device compilation begins.
#ifdef __cccl_lib_experimental_ctk12_cp_async_exposure

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
inline _CCCL_DEVICE void cp_async_bulk_global_to_shared(
void* __dest, const void* __src, ::cuda::std::uint32_t __size, ::cuda::barrier<::cuda::thread_scope_block>& __bar)
{
_CCCL_ASSERT(__size % 16 == 0, "Size must be multiple of 16.");
_CCCL_ASSERT(::cuda::device::is_address_from(__dest, ::cuda::device::address_space::shared),
"Destination must be shared memory address.");
_CCCL_ASSERT(::cuda::device::is_address_from(__src, ::cuda::device::address_space::global),
"Source must be global memory address.");

::cuda::ptx::cp_async_bulk(
::cuda::ptx::space_cluster,
::cuda::ptx::space_global,
__dest,
__src,
__size,
::cuda::device::barrier_native_handle(__bar));
}
Comment on lines -73 to -89
Contributor Author:
I find it a bit sad to leave some of the assertions behind here. But those functions were not used in our barrier and memcpy_async implementations, etc., anyway. I think we should consider adding some assertions to the PTX exposure in cuda::ptx. @ahendriksen, do you think we can add such assertions there?

Contributor:
Adding assertions in the code generator could be very hard because they are specific to each instruction. For example, I wrote the code for warp_shuffle by hand for this reason.
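
As a rough illustration of the hand-written approach suggested here (in the spirit of warp_shuffle), a thin, assertion-carrying wrapper over the generated cuda::ptx exposure could look like the sketch below. This is not part of this PR; the name checked_cp_async_bulk_g2s is hypothetical, the checks are lifted from the wrapper removed above, and user code would substitute its own assert macro for the CCCL-internal _CCCL_ASSERT.

```cpp
// Sketch only (not part of this PR): a hand-written wrapper that re-adds the
// removed checks on top of the generated cuda::ptx exposure. Name is hypothetical.
#include <cuda/barrier>
#include <cuda/ptx>
#include <cuda/std/cstdint>

__device__ inline void checked_cp_async_bulk_g2s(
  void* dest, const void* src, cuda::std::uint32_t size, cuda::barrier<cuda::thread_scope_block>& bar)
{
  // Preconditions previously asserted by cp_async_bulk_global_to_shared:
  _CCCL_ASSERT(size % 16 == 0, "Size must be multiple of 16.");
  _CCCL_ASSERT(cuda::device::is_address_from(dest, cuda::device::address_space::shared),
               "Destination must be shared memory address.");
  _CCCL_ASSERT(cuda::device::is_address_from(src, cuda::device::address_space::global),
               "Source must be global memory address.");
  // Forward to the generated PTX exposure:
  cuda::ptx::cp_async_bulk(
    cuda::ptx::space_cluster,
    cuda::ptx::space_global,
    dest,
    src,
    size,
    cuda::device::barrier_native_handle(bar));
}
```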


// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
inline _CCCL_DEVICE void cp_async_bulk_shared_to_global(void* __dest, const void* __src, ::cuda::std::uint32_t __size)
{
_CCCL_ASSERT(__size % 16 == 0, "Size must be multiple of 16.");
_CCCL_ASSERT(::cuda::device::is_address_from(__dest, ::cuda::device::address_space::global),
"Destination must be global memory address.");
_CCCL_ASSERT(::cuda::device::is_address_from(__src, ::cuda::device::address_space::shared),
"Source must be shared memory address.");

::cuda::ptx::cp_async_bulk(::cuda::ptx::space_global, ::cuda::ptx::space_shared, __dest, __src, __size);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void cp_async_bulk_tensor_1d_global_to_shared(
void* __dest, const CUtensorMap* __tensor_map, int __c0, ::cuda::barrier<::cuda::thread_scope_block>& __bar)
{
const ::cuda::std::int32_t __coords[]{__c0};

::cuda::ptx::cp_async_bulk_tensor(
::cuda::ptx::space_cluster,
::cuda::ptx::space_global,
__dest,
__tensor_map,
__coords,
::cuda::device::barrier_native_handle(__bar));
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void cp_async_bulk_tensor_2d_global_to_shared(
void* __dest, const CUtensorMap* __tensor_map, int __c0, int __c1, ::cuda::barrier<::cuda::thread_scope_block>& __bar)
{
const ::cuda::std::int32_t __coords[]{__c0, __c1};

::cuda::ptx::cp_async_bulk_tensor(
::cuda::ptx::space_cluster,
::cuda::ptx::space_global,
__dest,
__tensor_map,
__coords,
::cuda::device::barrier_native_handle(__bar));
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void cp_async_bulk_tensor_3d_global_to_shared(
void* __dest,
const CUtensorMap* __tensor_map,
int __c0,
int __c1,
int __c2,
::cuda::barrier<::cuda::thread_scope_block>& __bar)
{
const ::cuda::std::int32_t __coords[]{__c0, __c1, __c2};

::cuda::ptx::cp_async_bulk_tensor(
::cuda::ptx::space_cluster,
::cuda::ptx::space_global,
__dest,
__tensor_map,
__coords,
::cuda::device::barrier_native_handle(__bar));
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void cp_async_bulk_tensor_4d_global_to_shared(
void* __dest,
const CUtensorMap* __tensor_map,
int __c0,
int __c1,
int __c2,
int __c3,
::cuda::barrier<::cuda::thread_scope_block>& __bar)
{
const ::cuda::std::int32_t __coords[]{__c0, __c1, __c2, __c3};

::cuda::ptx::cp_async_bulk_tensor(
::cuda::ptx::space_cluster,
::cuda::ptx::space_global,
__dest,
__tensor_map,
__coords,
::cuda::device::barrier_native_handle(__bar));
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void cp_async_bulk_tensor_5d_global_to_shared(
void* __dest,
const CUtensorMap* __tensor_map,
int __c0,
int __c1,
int __c2,
int __c3,
int __c4,
::cuda::barrier<::cuda::thread_scope_block>& __bar)
{
const ::cuda::std::int32_t __coords[]{__c0, __c1, __c2, __c3, __c4};

::cuda::ptx::cp_async_bulk_tensor(
::cuda::ptx::space_cluster,
::cuda::ptx::space_global,
__dest,
__tensor_map,
__coords,
::cuda::device::barrier_native_handle(__bar));
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void
cp_async_bulk_tensor_1d_shared_to_global(const CUtensorMap* __tensor_map, int __c0, const void* __src)
{
const ::cuda::std::int32_t __coords[]{__c0};

::cuda::ptx::cp_async_bulk_tensor(::cuda::ptx::space_global, ::cuda::ptx::space_shared, __tensor_map, __coords, __src);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void
cp_async_bulk_tensor_2d_shared_to_global(const CUtensorMap* __tensor_map, int __c0, int __c1, const void* __src)
{
const ::cuda::std::int32_t __coords[]{__c0, __c1};

::cuda::ptx::cp_async_bulk_tensor(::cuda::ptx::space_global, ::cuda::ptx::space_shared, __tensor_map, __coords, __src);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void cp_async_bulk_tensor_3d_shared_to_global(
const CUtensorMap* __tensor_map, int __c0, int __c1, int __c2, const void* __src)
{
const ::cuda::std::int32_t __coords[]{__c0, __c1, __c2};

::cuda::ptx::cp_async_bulk_tensor(::cuda::ptx::space_global, ::cuda::ptx::space_shared, __tensor_map, __coords, __src);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void cp_async_bulk_tensor_4d_shared_to_global(
const CUtensorMap* __tensor_map, int __c0, int __c1, int __c2, int __c3, const void* __src)
{
const ::cuda::std::int32_t __coords[]{__c0, __c1, __c2, __c3};

::cuda::ptx::cp_async_bulk_tensor(::cuda::ptx::space_global, ::cuda::ptx::space_shared, __tensor_map, __coords, __src);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void cp_async_bulk_tensor_5d_shared_to_global(
const CUtensorMap* __tensor_map, int __c0, int __c1, int __c2, int __c3, int __c4, const void* __src)
{
const ::cuda::std::int32_t __coords[]{__c0, __c1, __c2, __c3, __c4};

::cuda::ptx::cp_async_bulk_tensor(::cuda::ptx::space_global, ::cuda::ptx::space_shared, __tensor_map, __coords, __src);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar
inline _CCCL_DEVICE void fence_proxy_async_shared_cta()
{
::cuda::ptx::fence_proxy_async(::cuda::ptx::space_shared);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group
inline _CCCL_DEVICE void cp_async_bulk_commit_group()
{
::cuda::ptx::cp_async_bulk_commit_group();
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group
template <int __n_prior>
inline _CCCL_DEVICE void cp_async_bulk_wait_group_read()
{
static_assert(__n_prior <= 63, "cp_async_bulk_wait_group_read: waiting for more than 63 groups is not supported.");
::cuda::ptx::cp_async_bulk_wait_group_read(::cuda::ptx::n32_t<__n_prior>{});
}

#endif // __cccl_lib_experimental_ctk12_cp_async_exposure

_CCCL_END_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL

#include <cuda/std/__cccl/epilogue.h>

#endif // _CUDA_BARRIER
1 change: 0 additions & 1 deletion libcudacxx/include/cuda/std/__cccl/ptx_isa.h
@@ -105,7 +105,6 @@
#if __cccl_ptx_isa >= 800
# if (!defined(__CUDA_MINIMUM_ARCH__)) || (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__)
# define __cccl_lib_local_barrier_arrive_tx
# define __cccl_lib_experimental_ctk12_cp_async_exposure
# endif
#endif // __cccl_ptx_isa >= 800

19 changes: 13 additions & 6 deletions libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk.pass.cpp
@@ -14,6 +14,7 @@
// <cuda/barrier>

#include <cuda/barrier>
#include <cuda/ptx>
#include <cuda/std/utility> // cuda::std::move

#include "test_macros.h" // TEST_NV_DIAG_SUPPRESS
@@ -22,7 +23,6 @@
TEST_NV_DIAG_SUPPRESS(static_var_with_dynamic_init)

using barrier = cuda::barrier<cuda::thread_scope_block>;
namespace cde = cuda::device::experimental;

static constexpr int buf_len = 1024;
__device__ alignas(128) int gmem_buffer[buf_len];
@@ -57,7 +57,13 @@ __device__ void test()
uint64_t token;
if (threadIdx.x == 0)
{
cde::cp_async_bulk_global_to_shared(smem_buffer, gmem_buffer, sizeof(smem_buffer), bar);
cuda::ptx::cp_async_bulk(
cuda::ptx::space_cluster,
cuda::ptx::space_global,
smem_buffer,
gmem_buffer,
sizeof(smem_buffer),
cuda::device::barrier_native_handle(bar));
token = cuda::device::barrier_arrive_tx(bar, 1, sizeof(smem_buffer));
}
else
@@ -71,15 +77,16 @@ __device__ void test()
{
smem_buffer[i] += i;
}
cde::fence_proxy_async_shared_cta();
cuda::ptx::fence_proxy_async(::cuda::ptx::space_shared);
__syncthreads();

// Write back to global memory:
if (threadIdx.x == 0)
{
cde::cp_async_bulk_shared_to_global(gmem_buffer, smem_buffer, sizeof(smem_buffer));
cde::cp_async_bulk_commit_group();
cde::cp_async_bulk_wait_group_read<0>();
cuda::ptx::cp_async_bulk(
cuda::ptx::space_global, cuda::ptx::space_shared, gmem_buffer, smem_buffer, sizeof(smem_buffer));
cuda::ptx::cp_async_bulk_commit_group();
cuda::ptx::cp_async_bulk_wait_group_read<0>({});
}
__threadfence();
__syncthreads();

This file was deleted.
