Skip to content
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 32 additions & 32 deletions cudax/examples/stdexec_stream.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,48 +17,54 @@
#include <cuda_runtime_api.h>

namespace cudax = cuda::experimental;
namespace task = cudax::execution;
namespace ex = cudax::execution;

// This example demonstrates how to use the experimental CUDA implementation of
// C++26's std::execution async tasking framework.

struct say_hello
int main()
{
__device__ int operator()() const
try
{
printf("Hello from lambda on device!\n");
return value;
}
auto tctx = ex::thread_context{};
auto sctx = ex::stream_context{cuda::device_ref{0}};
auto gpu = sctx.get_scheduler();

int value;
};
const auto bulk_shape = 10;
const auto bulk_fn = [] __device__(const int index, int i) noexcept {
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < bulk_shape)
{
printf("Hello from bulk task on device! index = %d, i = %d\n", index, i);
}
};

__host__ void run()
{
/*
try
{
task::thread_context tctx;
task::stream_context sctx{cuda::device_ref{0}};
auto sch = sctx.get_scheduler();
auto start =
// begin work on the GPU:
ex::schedule(gpu)

auto start = //
task::schedule(sch) // begin work on the GPU
| task::then(say_hello{42}) // enqueue a function object on the GPU
| task::then([] __device__(int i) noexcept -> int { // enqueue a lambda on the GPU
printf("Hello again from lambda on device! i = %d\n", i);
return i + 1;
// execute a device lambda on the GPU:
| ex::then([] __device__() noexcept -> int {
printf("Hello from lambda on device!\n");
return 42;
})
| task::continues_on(tctx.get_scheduler()) // continue work on the CPU
| task::then([] __host__ __device__(int i) noexcept -> int { // run a lambda on the CPU

// do some parallel work on the GPU:
| ex::bulk(ex::par, bulk_shape, bulk_fn) //

// transfer execution back to the CPU:
| ex::continues_on(tctx.get_scheduler())

// execute a host/device lambda on the CPU:
| ex::then([] __host__ __device__(int i) noexcept -> int {
NV_IF_TARGET(NV_IS_HOST,
(printf("Hello from lambda on host! i = %d\n", i);),
(printf("OOPS! still on the device! i = %d\n", i);))
return i;
return i + 1;
});

// run the task, wait for it to finish, and get the result
auto [i] = task::sync_wait(std::move(start)).value();
auto [i] = ex::sync_wait(std::move(start)).value();
printf("All done on the host! result = %d\n", i);
}
catch (cuda::cuda_error const& e)
Expand All @@ -73,10 +79,4 @@ __host__ void run()
{
std::printf("Unknown exception\n");
}
*/
}

int main()
{
run();
}
Original file line number Diff line number Diff line change
Expand Up @@ -669,7 +669,11 @@ _CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT
// Size and value make function
template <class _Tp, class... _Properties, class _Env = ::cuda::std::execution::env<>>
async_buffer<_Tp, _Properties...> make_async_buffer(
stream_ref __stream, any_resource<_Properties...> __mr, size_t __size, const _Tp& __value, const _Env& __env = {})
stream_ref __stream,
any_resource<_Properties...> __mr,
size_t __size,
const _Tp& __value,
[[maybe_unused]] const _Env& __env = {})
{
auto __res = async_buffer<_Tp, _Properties...>{__stream, __mr, __size, no_init};
__fill_n<_Tp, !::cuda::mr::__is_device_accessible<_Properties...>>(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ public:
//! @throws Any exception thrown by the underlying domain's `apply_sender`.
_CCCL_EXEC_CHECK_DISABLE
template <class _Domain, class _Tag, class _Sndr, class... _Args>
_CCCL_NODEBUG_API constexpr auto operator()(_Domain, _Tag, _Sndr&& __sndr, _Args&&... __args) const
_CCCL_API constexpr auto operator()(_Domain, _Tag, _Sndr&& __sndr, _Args&&... __args) const
noexcept(noexcept(__apply_domain_t<_Domain, _Tag, _Sndr, _Args...>{}.apply_sender(
_Tag{}, static_cast<_Sndr&&>(__sndr), static_cast<_Args&&>(__args)...)))
-> __apply_sender_result_t<__apply_domain_t<_Domain, _Tag, _Sndr, _Args...>, _Tag, _Sndr, _Args...>
Expand Down
25 changes: 17 additions & 8 deletions cudax/include/cuda/experimental/__execution/bulk.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -68,13 +68,22 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __state_t
template <class _Sndr, class _Shape>
struct _CCCL_TYPE_VISIBILITY_DEFAULT __attrs_t
{
[[nodiscard]] _CCCL_HOST_API constexpr auto query(get_launch_config_t) const noexcept
[[nodiscard]] _CCCL_HOST_API static constexpr auto __get_launch_config(_Shape __shape) noexcept
{
constexpr int __block_threads = 256;
const int __grid_blocks = ::cuda::ceil_div(static_cast<int>(__shape_), __block_threads);
const int __grid_blocks = ::cuda::ceil_div(static_cast<int>(__shape), __block_threads);
return experimental::make_config(block_dims<__block_threads>(), grid_dims(__grid_blocks));
}

using __launch_config_t = decltype(__get_launch_config(_Shape()));

[[nodiscard]] _CCCL_API constexpr auto query(get_launch_config_t) const noexcept -> __launch_config_t
{
NV_IF_TARGET(NV_IS_HOST,
(return __get_launch_config(__shape_);),
(_CCCL_ASSERT(false, "cannot get a launch configuration from device"); ::cuda::std::terminate();))
}

_CCCL_EXEC_CHECK_DISABLE
_CCCL_TEMPLATE(class _Query, class... _Args)
_CCCL_REQUIRES(__forwarding_query<_Query> _CCCL_AND __queryable_with<env_of_t<_Sndr>, _Query, _Args...>)
Expand Down Expand Up @@ -152,7 +161,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __bulk_t
execution::set_stopped(static_cast<_Rcvr&&>(__state_->__rcvr_));
}

[[nodiscard]] _CCCL_NODEBUG_API constexpr auto get_env() const noexcept -> __fwd_env_t<env_of_t<_Rcvr>>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do not understand the consequences of this macro change.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It changes the functions' attributes so that debuggers won't step over them. Otherwise it has no effect.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ericniebler You mean, it stops inlining. We do need to restore this at some point : - )

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i would like to keep these changes to the function attributes. i learned many things while stepping through the experimental execution code in cuda-gdb. it turns out i rarely want "nodebug" apis. i very occasionally want "trivial" (force-inline nodebug) apis, but most apis should just be _CCCL_API.

[[nodiscard]] _CCCL_API constexpr auto get_env() const noexcept -> __fwd_env_t<env_of_t<_Rcvr>>
{
return __fwd_env(execution::get_env(__state_->__rcvr_));
}
Expand Down Expand Up @@ -188,7 +197,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __bulk_t
struct _CCCL_TYPE_VISIBILITY_DEFAULT __closure_base_t
{
template <class _Sndr>
[[nodiscard]] _CCCL_NODEBUG_API friend constexpr auto operator|(_Sndr&& __sndr, __closure_base_t __self)
[[nodiscard]] _CCCL_API friend constexpr auto operator|(_Sndr&& __sndr, __closure_base_t __self)
{
static_assert(__is_sender<_Sndr>);

Expand All @@ -202,7 +211,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __bulk_t
{{}, static_cast<__closure_base_t&&>(__self), static_cast<_Sndr&&>(__sndr)}};
}

_CCCL_NO_UNIQUE_ADDRESS _Policy __policy_;
/*_CCCL_NO_UNIQUE_ADDRESS*/ _Policy __policy_;
_Shape __shape_;
_Fn __fn_;
};
Expand Down Expand Up @@ -250,7 +259,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __bulk_t
return {__state_.__shape_, __sndr_};
}

_CCCL_NO_UNIQUE_ADDRESS _BulkTag __tag_;
/*_CCCL_NO_UNIQUE_ADDRESS*/ _BulkTag __tag_;
__closure_base_t<_Policy, _Shape, _Fn> __state_;
_Sndr __sndr_;
};
Expand All @@ -267,7 +276,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __bulk_t
// This function call operator creates a sender adaptor closure object that can appear
// on the right-hand side of a pipe operator, like: sndr | bulk(par, shape, fn).
template <class _Policy, class _Shape, class _Fn>
[[nodiscard]] _CCCL_NODEBUG_API auto operator()(_Policy __policy, _Shape __shape, _Fn __fn) const
[[nodiscard]] _CCCL_API auto operator()(_Policy __policy, _Shape __shape, _Fn __fn) const
{
static_assert(::cuda::std::integral<_Shape>);
static_assert(::cuda::std::is_execution_policy_v<_Policy>);
Expand Down Expand Up @@ -389,7 +398,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT bulk_t : __bulk_t<bulk_t>
{
_CCCL_EXEC_CHECK_DISABLE
template <class... _Ts>
_CCCL_NODEBUG_API void operator()(_Shape __begin, _Shape __end, _Ts&&... __values) noexcept(
_CCCL_API void operator()(_Shape __begin, _Shape __end, _Ts&&... __values) noexcept(
__nothrow_callable<_Fn&, _Shape, decltype(__values)&...>)
{
for (; __begin != __end; ++__begin)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ struct get_completion_behavior_t
return __attrs.query(*this, __env);
}

[[nodiscard]] _CCCL_NODEBUG_API static constexpr auto query(forwarding_query_t) noexcept -> bool
[[nodiscard]] _CCCL_API static constexpr auto query(forwarding_query_t) noexcept -> bool
{
return true;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ using __make_completion_signatures_t _CCCL_NODEBUG_ALIAS =
decltype(execution::__make_unique(execution::__normalize(static_cast<_Sigs*>(nullptr))...));

template <class... _ExplicitSigs, class... _DeducedSigs>
[[nodiscard]] _CCCL_NODEBUG_API _CCCL_CONSTEVAL auto make_completion_signatures(_DeducedSigs*...) noexcept
[[nodiscard]] _CCCL_API _CCCL_CONSTEVAL auto make_completion_signatures(_DeducedSigs*...) noexcept
-> __make_completion_signatures_t<_ExplicitSigs..., _DeducedSigs...>
{
return {};
Expand All @@ -206,7 +206,7 @@ using __concat_completion_signatures_t _CCCL_NODEBUG_ALIAS =
struct __concat_completion_signatures_fn
{
template <class... _Sigs>
_CCCL_NODEBUG_API _CCCL_CONSTEVAL auto operator()(const _Sigs&...) const noexcept
_CCCL_API _CCCL_CONSTEVAL auto operator()(const _Sigs&...) const noexcept
-> __concat_completion_signatures_t<_Sigs...>
{
return {};
Expand All @@ -217,13 +217,13 @@ extern const completion_signatures<>& __empty_completion_signatures;

struct __concat_completion_signatures_impl
{
_CCCL_NODEBUG_API _CCCL_CONSTEVAL auto operator()() const noexcept -> completion_signatures<> (*)()
_CCCL_API _CCCL_CONSTEVAL auto operator()() const noexcept -> completion_signatures<> (*)()
{
return nullptr;
}

template <class... _Sigs>
_CCCL_NODEBUG_API _CCCL_CONSTEVAL auto operator()(const completion_signatures<_Sigs...>&) const noexcept
_CCCL_API _CCCL_CONSTEVAL auto operator()(const completion_signatures<_Sigs...>&) const noexcept
-> __make_completion_signatures_t<_Sigs...> (*)()
{
return nullptr;
Expand All @@ -235,7 +235,7 @@ struct __concat_completion_signatures_impl
class... _Cs,
class... _Ds,
class... _Rest>
_CCCL_NODEBUG_API _CCCL_CONSTEVAL auto operator()(
_CCCL_API _CCCL_CONSTEVAL auto operator()(
const completion_signatures<_As...>&,
const completion_signatures<_Bs...>&,
const completion_signatures<_Cs...>& = __empty_completion_signatures,
Expand All @@ -252,7 +252,7 @@ struct __concat_completion_signatures_impl
class _Cp = ::cuda::std::__ignore_t,
class _Dp = ::cuda::std::__ignore_t,
class... _Rest>
_CCCL_NODEBUG_API _CCCL_CONSTEVAL auto
_CCCL_API _CCCL_CONSTEVAL auto
operator()(const _Ap&, const _Bp& = {}, const _Cp& = {}, const _Dp& = {}, const _Rest&...) const noexcept
{
if constexpr (!__valid_completion_signatures<_Ap>)
Expand Down Expand Up @@ -645,7 +645,7 @@ template <class... _What, class... _Values>
#else // ^^^ constexpr exceptions ^^^ / vvv no constexpr exceptions vvv

template <class... _What, class... _Values>
[[nodiscard]] _CCCL_NODEBUG_API _CCCL_CONSTEVAL auto invalid_completion_signature(_Values...)
[[nodiscard]] _CCCL_API _CCCL_CONSTEVAL auto invalid_completion_signature(_Values...)
{
return _ERROR<_What...>{};
}
Expand Down
16 changes: 8 additions & 8 deletions cudax/include/cuda/experimental/__execution/conditional.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -206,17 +206,17 @@ public:
struct _CCCL_TYPE_VISIBILITY_DEFAULT __sndr_t;

template <class _Sndr, class _Pred, class _Then, class _Else>
_CCCL_NODEBUG_API constexpr auto operator()(_Sndr __sndr, _Pred __pred, _Then __then, _Else __else) const;
_CCCL_API constexpr auto operator()(_Sndr __sndr, _Pred __pred, _Then __then, _Else __else) const;

template <class _Pred, class _Then, class _Else>
_CCCL_NODEBUG_API constexpr auto operator()(_Pred __pred, _Then __then, _Else __else) const;
_CCCL_API constexpr auto operator()(_Pred __pred, _Then __then, _Else __else) const;
};

template <class _Pred, class _Then, class _Else, class _Sndr>
struct _CCCL_TYPE_VISIBILITY_DEFAULT conditional_t::__sndr_t<conditional_t::__closure_base_t<_Pred, _Then, _Else>, _Sndr>
{
using __params_t _CCCL_NODEBUG_ALIAS = conditional_t::__closure_base_t<_Pred, _Then, _Else>;
_CCCL_NO_UNIQUE_ADDRESS conditional_t __tag_;
/*_CCCL_NO_UNIQUE_ADDRESS*/ conditional_t __tag_;
__params_t __params_;
_Sndr __sndr_;

Expand Down Expand Up @@ -260,7 +260,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT conditional_t::__closure_base_t
_Else on_false;

template <class _Sndr>
_CCCL_NODEBUG_API auto __mk_sender(_Sndr&& __sndr) -> __sndr_t<__closure_base_t, _Sndr>
_CCCL_API auto __mk_sender(_Sndr&& __sndr) -> __sndr_t<__closure_base_t, _Sndr>
{
using __sndr_t = conditional_t::__sndr_t<__closure_base_t, _Sndr>;

Expand All @@ -275,28 +275,28 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT conditional_t::__closure_base_t
}

template <class _Sndr>
_CCCL_NODEBUG_API auto operator()(_Sndr __sndr) -> __sndr_t<__closure_base_t, _Sndr>
_CCCL_API auto operator()(_Sndr __sndr) -> __sndr_t<__closure_base_t, _Sndr>
{
return __mk_sender(static_cast<_Sndr&&>(__sndr));
}

template <class _Sndr>
_CCCL_NODEBUG_API friend auto operator|(_Sndr __sndr, __closure_base_t __self) -> __sndr_t<__closure_base_t, _Sndr>
_CCCL_API friend auto operator|(_Sndr __sndr, __closure_base_t __self) -> __sndr_t<__closure_base_t, _Sndr>
{
return __self.__mk_sender(static_cast<_Sndr&&>(__sndr));
}
};

template <class _Sndr, class _Pred, class _Then, class _Else>
_CCCL_NODEBUG_API constexpr auto conditional_t::operator()(_Sndr __sndr, _Pred __pred, _Then __then, _Else __else) const
_CCCL_API constexpr auto conditional_t::operator()(_Sndr __sndr, _Pred __pred, _Then __then, _Else __else) const
{
using __params_t _CCCL_NODEBUG_ALIAS = __closure_base_t<_Pred, _Then, _Else>;
__params_t __params{static_cast<_Pred&&>(__pred), static_cast<_Then&&>(__then), static_cast<_Else&&>(__else)};
return static_cast<__params_t&&>(__params).__mk_sender(static_cast<_Sndr&&>(__sndr));
}

template <class _Pred, class _Then, class _Else>
_CCCL_NODEBUG_API constexpr auto conditional_t::operator()(_Pred __pred, _Then __then, _Else __else) const
_CCCL_API constexpr auto conditional_t::operator()(_Pred __pred, _Then __then, _Else __else) const
{
return __closure_base_t<_Pred, _Then, _Else>{
static_cast<_Pred&&>(__pred), static_cast<_Then&&>(__then), static_cast<_Else&&>(__else)};
Expand Down
17 changes: 8 additions & 9 deletions cudax/include/cuda/experimental/__execution/continues_on.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -177,17 +177,14 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT continues_on_t
_CCCL_API void __set_result(_Tag, _As&&... __as) noexcept
{
using __tupl_t _CCCL_NODEBUG_ALIAS = ::cuda::std::__tuple<_Tag, decay_t<_As>...>;
if constexpr (__nothrow_decay_copyable<_As...>)
_CCCL_TRY
{
__state_->__result_.template __emplace<__tupl_t>(_Tag{}, static_cast<_As&&>(__as)...);
}
else
_CCCL_CATCH_ALL
{
_CCCL_TRY
{
__state_->__result_.template __emplace<__tupl_t>(_Tag{}, static_cast<_As&&>(__as)...);
}
_CCCL_CATCH_ALL
// Avoid ODR-using this completion operation if this code path is not taken.
if constexpr (!__nothrow_decay_copyable<_As...>)
{
execution::set_error(static_cast<_Rcvr&&>(__state_->__rcvr_), ::std::current_exception());
}
Expand Down Expand Up @@ -315,7 +312,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT continues_on_t
_CCCL_EXEC_CHECK_DISABLE
template <class... _Env>
[[nodiscard]] _CCCL_API constexpr auto query(get_domain_override_t, _Env&&...) const noexcept
-> __call_result_t<get_completion_domain_t<set_value_t>, env_of_t<_Sndr>, _Env...>
-> __completion_domain_of_t<set_value_t, _Sndr, __fwd_env_t<_Env>...>
{
return {};
}
Expand Down Expand Up @@ -382,6 +379,8 @@ public:
}
};

//////////////////////////////////////////////////////////////////////////////////////////
// continues_on sender
template <class _Sch, class _Sndr>
struct _CCCL_TYPE_VISIBILITY_DEFAULT continues_on_t::__sndr_t
{
Expand Down Expand Up @@ -424,7 +423,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT continues_on_t::__sndr_t
return __attrs_t<_Sch, _Sndr>{this};
}

_CCCL_NO_UNIQUE_ADDRESS continues_on_t __tag_;
/*_CCCL_NO_UNIQUE_ADDRESS*/ continues_on_t __tag_;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Eric explained that tests were failing without removing this macro. I'm treating this removal as a temporary phenomenon. There are other ways to get this effect, e.g., the "compressed pair" pattern that the reference implementation of mdspan classically used.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i need to track down the compiler bug, file it, and then selectively disable _CCCL_NO_UNIQUE_ADDRESS acrosss all of CCCL wherever the bug can potentially manifest. then i can start using _CCCL_NO_UNIQUE_ADDRESS again.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ericniebler The other option is to use a different programming technique that guarantees that empty members occupy zero bytes. A classic one is a "compressed pair" or "compressed tuple" for storing the members, that does not store empty members.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

that's true. it will be important when this code is no longer experimental to have all the space optimizations before we lock in the ABI.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I want to note that I am on the verge of completely dropping any use of _CCCL_NO_UNIQUE_ADDRESS

Its just a complete trainwreck in the waiting

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@miscco wrote:

Its just a complete trainwreck in the waiting

100% agree; as a language feature it means well, but in practice for us it's nothing but trouble.

_Sch __sch_;
_Sndr __sndr_;
};
Expand Down
Loading
Loading