226 changes: 0 additions & 226 deletions libcudacxx/include/cuda/barrier
@@ -38,230 +38,4 @@
#include <cuda/ptx>
#include <cuda/std/barrier>

// Forward-declare CUtensorMap for use in cp_async_bulk_tensor_* PTX wrapping
// functions. These functions take a pointer to CUtensorMap, so do not need to
// know its size. This type is defined in cuda.h (driver API) as:
//
// typedef struct CUtensorMap_st { [ .. snip .. ] } CUtensorMap;
//
// We need to forward-declare both CUtensorMap_st (the struct) and CUtensorMap
// (the typedef):
struct CUtensorMap_st;
typedef struct CUtensorMap_st CUtensorMap;
Comment on lines -41 to -50
Contributor Author:
This is a borderline breaking change. In principle, we require users to include the headers for whatever they use, so they would need to make sure they include the right header to get CUtensorMap. But I am also fine with leaving the forward declaration in.
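
For illustration, a minimal sketch (not part of this diff) of what user code might look like once the forward declaration is gone: the CUtensorMap definition would have to come from the driver API header, cuda.h, or from the user's own forward declaration, before calling the cuda::ptx tensor-map wrappers. The helper name copy_1d_tile is hypothetical; the cuda::ptx call mirrors the removed cp_async_bulk_tensor_1d_global_to_shared wrapper below.

```cpp
// Hypothetical user code after this change: <cuda/barrier> no longer
// forward-declares CUtensorMap, so the type must come from the driver API
// header (cuda.h) or from an equivalent user-side forward declaration.
#include <cuda.h> // defines CUtensorMap
#include <cuda/barrier>
#include <cuda/ptx>
#include <cuda/std/cstdint>

__device__ void copy_1d_tile( // hypothetical helper name
  void* smem_dest,
  const CUtensorMap* tensor_map,
  int c0,
  cuda::barrier<cuda::thread_scope_block>& bar)
{
  const cuda::std::int32_t coords[]{c0};
  // Same call the removed cp_async_bulk_tensor_1d_global_to_shared wrapper issued:
  cuda::ptx::cp_async_bulk_tensor(
    cuda::ptx::space_cluster,
    cuda::ptx::space_global,
    smem_dest,
    tensor_map,
    coords,
    cuda::device::barrier_native_handle(bar));
}
```

Whether requiring that extra include is acceptable is exactly the trade-off this comment raises.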


#include <cuda/std/__cccl/prologue.h>

_CCCL_BEGIN_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL

// Experimental exposure of TMA PTX:
//
// - cp_async_bulk_global_to_shared
// - cp_async_bulk_shared_to_global
// - cp_async_bulk_tensor_{1,2,3,4,5}d_global_to_shared
// - cp_async_bulk_tensor_{1,2,3,4,5}d_shared_to_global
// - fence_proxy_async_shared_cta
// - cp_async_bulk_commit_group
// - cp_async_bulk_wait_group_read<0, …, 7>

// These PTX wrappers are only available when the code is compiled for compute
// capability 9.0 and above. The check for (!defined(__CUDA_MINIMUM_ARCH__)) is
// necessary to prevent cudafe from ripping out the device functions before
// device compilation begins.
#ifdef __cccl_lib_experimental_ctk12_cp_async_exposure

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
inline _CCCL_DEVICE void cp_async_bulk_global_to_shared(
void* __dest, const void* __src, ::cuda::std::uint32_t __size, ::cuda::barrier<::cuda::thread_scope_block>& __bar)
{
_CCCL_ASSERT(__size % 16 == 0, "Size must be multiple of 16.");
_CCCL_ASSERT(::cuda::device::is_address_from(__dest, ::cuda::device::address_space::shared),
"Destination must be shared memory address.");
_CCCL_ASSERT(::cuda::device::is_address_from(__src, ::cuda::device::address_space::global),
"Source must be global memory address.");

::cuda::ptx::cp_async_bulk(
::cuda::ptx::space_cluster,
::cuda::ptx::space_global,
__dest,
__src,
__size,
::cuda::device::barrier_native_handle(__bar));
}
Comment on lines -73 to -89
Contributor Author:
I find it a bit sad to leave some of the assertions behind here. But those functions were not used in our barrier and memcpy_async implementations, etc., anyway. I think we should consider adding some assertions to the PTX exposure in cuda::ptx. @ahendriksen, do you think we can add such assertions there?

Contributor:
Adding assertions in the code generator could be very hard because they are specific to each instruction. For example, I wrote the code for warp_shuffle by hand for this reason.
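
As a rough illustration of the hand-written approach suggested here (in the spirit of warp_shuffle), a thin, assertion-carrying wrapper over the generated cuda::ptx exposure could look like the sketch below. This is not part of this PR; the name checked_cp_async_bulk_g2s is hypothetical, the checks are lifted from the wrapper removed above, and user code would substitute its own assert macro for the CCCL-internal _CCCL_ASSERT.

```cpp
// Sketch only (not part of this PR): a hand-written wrapper that re-adds the
// removed checks on top of the generated cuda::ptx exposure. Name is hypothetical.
#include <cuda/barrier>
#include <cuda/ptx>
#include <cuda/std/cstdint>

__device__ inline void checked_cp_async_bulk_g2s(
  void* dest, const void* src, cuda::std::uint32_t size, cuda::barrier<cuda::thread_scope_block>& bar)
{
  // Preconditions previously asserted by cp_async_bulk_global_to_shared:
  _CCCL_ASSERT(size % 16 == 0, "Size must be multiple of 16.");
  _CCCL_ASSERT(cuda::device::is_address_from(dest, cuda::device::address_space::shared),
               "Destination must be shared memory address.");
  _CCCL_ASSERT(cuda::device::is_address_from(src, cuda::device::address_space::global),
               "Source must be global memory address.");
  // Forward to the generated PTX exposure:
  cuda::ptx::cp_async_bulk(
    cuda::ptx::space_cluster,
    cuda::ptx::space_global,
    dest,
    src,
    size,
    cuda::device::barrier_native_handle(bar));
}
```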


// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
inline _CCCL_DEVICE void cp_async_bulk_shared_to_global(void* __dest, const void* __src, ::cuda::std::uint32_t __size)
{
_CCCL_ASSERT(__size % 16 == 0, "Size must be multiple of 16.");
_CCCL_ASSERT(::cuda::device::is_address_from(__dest, ::cuda::device::address_space::global),
"Destination must be global memory address.");
_CCCL_ASSERT(::cuda::device::is_address_from(__src, ::cuda::device::address_space::shared),
"Source must be shared memory address.");

::cuda::ptx::cp_async_bulk(::cuda::ptx::space_global, ::cuda::ptx::space_shared, __dest, __src, __size);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void cp_async_bulk_tensor_1d_global_to_shared(
void* __dest, const CUtensorMap* __tensor_map, int __c0, ::cuda::barrier<::cuda::thread_scope_block>& __bar)
{
const ::cuda::std::int32_t __coords[]{__c0};

::cuda::ptx::cp_async_bulk_tensor(
::cuda::ptx::space_cluster,
::cuda::ptx::space_global,
__dest,
__tensor_map,
__coords,
::cuda::device::barrier_native_handle(__bar));
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void cp_async_bulk_tensor_2d_global_to_shared(
void* __dest, const CUtensorMap* __tensor_map, int __c0, int __c1, ::cuda::barrier<::cuda::thread_scope_block>& __bar)
{
const ::cuda::std::int32_t __coords[]{__c0, __c1};

::cuda::ptx::cp_async_bulk_tensor(
::cuda::ptx::space_cluster,
::cuda::ptx::space_global,
__dest,
__tensor_map,
__coords,
::cuda::device::barrier_native_handle(__bar));
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void cp_async_bulk_tensor_3d_global_to_shared(
void* __dest,
const CUtensorMap* __tensor_map,
int __c0,
int __c1,
int __c2,
::cuda::barrier<::cuda::thread_scope_block>& __bar)
{
const ::cuda::std::int32_t __coords[]{__c0, __c1, __c2};

::cuda::ptx::cp_async_bulk_tensor(
::cuda::ptx::space_cluster,
::cuda::ptx::space_global,
__dest,
__tensor_map,
__coords,
::cuda::device::barrier_native_handle(__bar));
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void cp_async_bulk_tensor_4d_global_to_shared(
void* __dest,
const CUtensorMap* __tensor_map,
int __c0,
int __c1,
int __c2,
int __c3,
::cuda::barrier<::cuda::thread_scope_block>& __bar)
{
const ::cuda::std::int32_t __coords[]{__c0, __c1, __c2, __c3};

::cuda::ptx::cp_async_bulk_tensor(
::cuda::ptx::space_cluster,
::cuda::ptx::space_global,
__dest,
__tensor_map,
__coords,
::cuda::device::barrier_native_handle(__bar));
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void cp_async_bulk_tensor_5d_global_to_shared(
void* __dest,
const CUtensorMap* __tensor_map,
int __c0,
int __c1,
int __c2,
int __c3,
int __c4,
::cuda::barrier<::cuda::thread_scope_block>& __bar)
{
const ::cuda::std::int32_t __coords[]{__c0, __c1, __c2, __c3, __c4};

::cuda::ptx::cp_async_bulk_tensor(
::cuda::ptx::space_cluster,
::cuda::ptx::space_global,
__dest,
__tensor_map,
__coords,
::cuda::device::barrier_native_handle(__bar));
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void
cp_async_bulk_tensor_1d_shared_to_global(const CUtensorMap* __tensor_map, int __c0, const void* __src)
{
const ::cuda::std::int32_t __coords[]{__c0};

::cuda::ptx::cp_async_bulk_tensor(::cuda::ptx::space_global, ::cuda::ptx::space_shared, __tensor_map, __coords, __src);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void
cp_async_bulk_tensor_2d_shared_to_global(const CUtensorMap* __tensor_map, int __c0, int __c1, const void* __src)
{
const ::cuda::std::int32_t __coords[]{__c0, __c1};

::cuda::ptx::cp_async_bulk_tensor(::cuda::ptx::space_global, ::cuda::ptx::space_shared, __tensor_map, __coords, __src);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void cp_async_bulk_tensor_3d_shared_to_global(
const CUtensorMap* __tensor_map, int __c0, int __c1, int __c2, const void* __src)
{
const ::cuda::std::int32_t __coords[]{__c0, __c1, __c2};

::cuda::ptx::cp_async_bulk_tensor(::cuda::ptx::space_global, ::cuda::ptx::space_shared, __tensor_map, __coords, __src);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void cp_async_bulk_tensor_4d_shared_to_global(
const CUtensorMap* __tensor_map, int __c0, int __c1, int __c2, int __c3, const void* __src)
{
const ::cuda::std::int32_t __coords[]{__c0, __c1, __c2, __c3};

::cuda::ptx::cp_async_bulk_tensor(::cuda::ptx::space_global, ::cuda::ptx::space_shared, __tensor_map, __coords, __src);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
inline _CCCL_DEVICE void cp_async_bulk_tensor_5d_shared_to_global(
const CUtensorMap* __tensor_map, int __c0, int __c1, int __c2, int __c3, int __c4, const void* __src)
{
const ::cuda::std::int32_t __coords[]{__c0, __c1, __c2, __c3, __c4};

::cuda::ptx::cp_async_bulk_tensor(::cuda::ptx::space_global, ::cuda::ptx::space_shared, __tensor_map, __coords, __src);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar
inline _CCCL_DEVICE void fence_proxy_async_shared_cta()
{
::cuda::ptx::fence_proxy_async(::cuda::ptx::space_shared);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group
inline _CCCL_DEVICE void cp_async_bulk_commit_group()
{
::cuda::ptx::cp_async_bulk_commit_group();
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group
template <int __n_prior>
inline _CCCL_DEVICE void cp_async_bulk_wait_group_read()
{
static_assert(__n_prior <= 63, "cp_async_bulk_wait_group_read: waiting for more than 63 groups is not supported.");
::cuda::ptx::cp_async_bulk_wait_group_read(::cuda::ptx::n32_t<__n_prior>{});
}

#endif // __cccl_lib_experimental_ctk12_cp_async_exposure

_CCCL_END_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL

#include <cuda/std/__cccl/epilogue.h>

#endif // _CUDA_BARRIER
1 change: 0 additions & 1 deletion libcudacxx/include/cuda/std/__cccl/ptx_isa.h
@@ -105,7 +105,6 @@
#if __cccl_ptx_isa >= 800
# if (!defined(__CUDA_MINIMUM_ARCH__)) || (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__)
# define __cccl_lib_local_barrier_arrive_tx
# define __cccl_lib_experimental_ctk12_cp_async_exposure
# endif
#endif // __cccl_ptx_isa >= 800

19 changes: 13 additions & 6 deletions libcudacxx/test/libcudacxx/cuda/barrier/cp_async_bulk.pass.cpp
@@ -14,6 +14,7 @@
// <cuda/barrier>

#include <cuda/barrier>
#include <cuda/ptx>
#include <cuda/std/utility> // cuda::std::move

#include "test_macros.h" // TEST_NV_DIAG_SUPPRESS
@@ -22,7 +23,6 @@
TEST_NV_DIAG_SUPPRESS(static_var_with_dynamic_init)

using barrier = cuda::barrier<cuda::thread_scope_block>;
namespace cde = cuda::device::experimental;

static constexpr int buf_len = 1024;
__device__ alignas(128) int gmem_buffer[buf_len];
@@ -57,7 +57,13 @@ __device__ void test()
uint64_t token;
if (threadIdx.x == 0)
{
cde::cp_async_bulk_global_to_shared(smem_buffer, gmem_buffer, sizeof(smem_buffer), bar);
cuda::ptx::cp_async_bulk(
cuda::ptx::space_cluster,
cuda::ptx::space_global,
smem_buffer,
gmem_buffer,
sizeof(smem_buffer),
cuda::device::barrier_native_handle(bar));
token = cuda::device::barrier_arrive_tx(bar, 1, sizeof(smem_buffer));
}
else
@@ -71,15 +77,16 @@ __device__ void test()
{
smem_buffer[i] += i;
}
cde::fence_proxy_async_shared_cta();
cuda::ptx::fence_proxy_async(::cuda::ptx::space_shared);
__syncthreads();

// Write back to global memory:
if (threadIdx.x == 0)
{
cde::cp_async_bulk_shared_to_global(gmem_buffer, smem_buffer, sizeof(smem_buffer));
cde::cp_async_bulk_commit_group();
cde::cp_async_bulk_wait_group_read<0>();
cuda::ptx::cp_async_bulk(
cuda::ptx::space_global, cuda::ptx::space_shared, gmem_buffer, smem_buffer, sizeof(smem_buffer));
cuda::ptx::cp_async_bulk_commit_group();
cuda::ptx::cp_async_bulk_wait_group_read<0>({});
}
__threadfence();
__syncthreads();

This file was deleted.
