diff --git a/cudax/examples/stdexec_stream.cu b/cudax/examples/stdexec_stream.cu index 49e55573589..a8ecc792f37 100644 --- a/cudax/examples/stdexec_stream.cu +++ b/cudax/examples/stdexec_stream.cu @@ -17,48 +17,54 @@ #include namespace cudax = cuda::experimental; -namespace task = cudax::execution; +namespace ex = cudax::execution; // This example demonstrates how to use the experimental CUDA implementation of // C++26's std::execution async tasking framework. -struct say_hello +int main() { - __device__ int operator()() const + try { - printf("Hello from lambda on device!\n"); - return value; - } + auto tctx = ex::thread_context{}; + auto sctx = ex::stream_context{cuda::device_ref{0}}; + auto gpu = sctx.get_scheduler(); - int value; -}; + const auto bulk_shape = 10; + const auto bulk_fn = [] __device__(const int index, int i) noexcept { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < bulk_shape) + { + printf("Hello from bulk task on device! index = %d, i = %d\n", index, i); + } + }; -__host__ void run() -{ - /* - try - { - task::thread_context tctx; - task::stream_context sctx{cuda::device_ref{0}}; - auto sch = sctx.get_scheduler(); + auto start = + // begin work on the GPU: + ex::schedule(gpu) - auto start = // - task::schedule(sch) // begin work on the GPU - | task::then(say_hello{42}) // enqueue a function object on the GPU - | task::then([] __device__(int i) noexcept -> int { // enqueue a lambda on the GPU - printf("Hello again from lambda on device! i = %d\n", i); - return i + 1; + // execute a device lambda on the GPU: + | ex::then([] __device__() noexcept -> int { + printf("Hello from lambda on device!\n"); + return 42; }) - | task::continues_on(tctx.get_scheduler()) // continue work on the CPU - | task::then([] __host__ __device__(int i) noexcept -> int { // run a lambda on the CPU + + // do some parallel work on the GPU: + | ex::bulk(ex::par, bulk_shape, bulk_fn) // + + // transfer execution back to the CPU: + | ex::continues_on(tctx.get_scheduler()) + + // execute a host/device lambda on the CPU: + | ex::then([] __host__ __device__(int i) noexcept -> int { NV_IF_TARGET(NV_IS_HOST, (printf("Hello from lambda on host! i = %d\n", i);), (printf("OOPS! still on the device! i = %d\n", i);)) - return i; + return i + 1; }); // run the task, wait for it to finish, and get the result - auto [i] = task::sync_wait(std::move(start)).value(); + auto [i] = ex::sync_wait(std::move(start)).value(); printf("All done on the host! result = %d\n", i); } catch (cuda::cuda_error const& e) @@ -73,10 +79,4 @@ __host__ void run() { std::printf("Unknown exception\n"); } - */ -} - -int main() -{ - run(); } diff --git a/cudax/include/cuda/experimental/__container/async_buffer.cuh b/cudax/include/cuda/experimental/__container/async_buffer.cuh index 0078af4a9c1..441cdb0bcf8 100644 --- a/cudax/include/cuda/experimental/__container/async_buffer.cuh +++ b/cudax/include/cuda/experimental/__container/async_buffer.cuh @@ -669,7 +669,11 @@ _CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT // Size and value make function template > async_buffer<_Tp, _Properties...> make_async_buffer( - stream_ref __stream, any_resource<_Properties...> __mr, size_t __size, const _Tp& __value, const _Env& __env = {}) + stream_ref __stream, + any_resource<_Properties...> __mr, + size_t __size, + const _Tp& __value, + [[maybe_unused]] const _Env& __env = {}) { auto __res = async_buffer<_Tp, _Properties...>{__stream, __mr, __size, no_init}; __fill_n<_Tp, !::cuda::mr::__is_device_accessible<_Properties...>>( diff --git a/cudax/include/cuda/experimental/__execution/apply_sender.cuh b/cudax/include/cuda/experimental/__execution/apply_sender.cuh index 948a514641a..b9c4f93ca8f 100644 --- a/cudax/include/cuda/experimental/__execution/apply_sender.cuh +++ b/cudax/include/cuda/experimental/__execution/apply_sender.cuh @@ -63,7 +63,7 @@ public: //! @throws Any exception thrown by the underlying domain's `apply_sender`. _CCCL_EXEC_CHECK_DISABLE template - _CCCL_NODEBUG_API constexpr auto operator()(_Domain, _Tag, _Sndr&& __sndr, _Args&&... __args) const + _CCCL_API constexpr auto operator()(_Domain, _Tag, _Sndr&& __sndr, _Args&&... __args) const noexcept(noexcept(__apply_domain_t<_Domain, _Tag, _Sndr, _Args...>{}.apply_sender( _Tag{}, static_cast<_Sndr&&>(__sndr), static_cast<_Args&&>(__args)...))) -> __apply_sender_result_t<__apply_domain_t<_Domain, _Tag, _Sndr, _Args...>, _Tag, _Sndr, _Args...> diff --git a/cudax/include/cuda/experimental/__execution/bulk.cuh b/cudax/include/cuda/experimental/__execution/bulk.cuh index a5289dd7a17..16c1cb3121d 100644 --- a/cudax/include/cuda/experimental/__execution/bulk.cuh +++ b/cudax/include/cuda/experimental/__execution/bulk.cuh @@ -68,13 +68,22 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __state_t template struct _CCCL_TYPE_VISIBILITY_DEFAULT __attrs_t { - [[nodiscard]] _CCCL_HOST_API constexpr auto query(get_launch_config_t) const noexcept + [[nodiscard]] _CCCL_HOST_API static constexpr auto __get_launch_config(_Shape __shape) noexcept { constexpr int __block_threads = 256; - const int __grid_blocks = ::cuda::ceil_div(static_cast(__shape_), __block_threads); + const int __grid_blocks = ::cuda::ceil_div(static_cast(__shape), __block_threads); return experimental::make_config(block_dims<__block_threads>(), grid_dims(__grid_blocks)); } + using __launch_config_t = decltype(__get_launch_config(_Shape())); + + [[nodiscard]] _CCCL_API constexpr auto query(get_launch_config_t) const noexcept -> __launch_config_t + { + NV_IF_TARGET(NV_IS_HOST, + (return __get_launch_config(__shape_);), + (_CCCL_ASSERT(false, "cannot get a launch configuration from device"); ::cuda::std::terminate();)) + } + _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _Query, class... _Args) _CCCL_REQUIRES(__forwarding_query<_Query> _CCCL_AND __queryable_with, _Query, _Args...>) @@ -152,7 +161,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __bulk_t execution::set_stopped(static_cast<_Rcvr&&>(__state_->__rcvr_)); } - [[nodiscard]] _CCCL_NODEBUG_API constexpr auto get_env() const noexcept -> __fwd_env_t> + [[nodiscard]] _CCCL_API constexpr auto get_env() const noexcept -> __fwd_env_t> { return __fwd_env(execution::get_env(__state_->__rcvr_)); } @@ -188,7 +197,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __bulk_t struct _CCCL_TYPE_VISIBILITY_DEFAULT __closure_base_t { template - [[nodiscard]] _CCCL_NODEBUG_API friend constexpr auto operator|(_Sndr&& __sndr, __closure_base_t __self) + [[nodiscard]] _CCCL_API friend constexpr auto operator|(_Sndr&& __sndr, __closure_base_t __self) { static_assert(__is_sender<_Sndr>); @@ -202,7 +211,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __bulk_t {{}, static_cast<__closure_base_t&&>(__self), static_cast<_Sndr&&>(__sndr)}}; } - _CCCL_NO_UNIQUE_ADDRESS _Policy __policy_; + /*_CCCL_NO_UNIQUE_ADDRESS*/ _Policy __policy_; _Shape __shape_; _Fn __fn_; }; @@ -250,7 +259,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __bulk_t return {__state_.__shape_, __sndr_}; } - _CCCL_NO_UNIQUE_ADDRESS _BulkTag __tag_; + /*_CCCL_NO_UNIQUE_ADDRESS*/ _BulkTag __tag_; __closure_base_t<_Policy, _Shape, _Fn> __state_; _Sndr __sndr_; }; @@ -267,7 +276,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __bulk_t // This function call operator creates a sender adaptor closure object that can appear // on the right-hand side of a pipe operator, like: sndr | bulk(par, shape, fn). template - [[nodiscard]] _CCCL_NODEBUG_API auto operator()(_Policy __policy, _Shape __shape, _Fn __fn) const + [[nodiscard]] _CCCL_API auto operator()(_Policy __policy, _Shape __shape, _Fn __fn) const { static_assert(::cuda::std::integral<_Shape>); static_assert(::cuda::std::is_execution_policy_v<_Policy>); @@ -389,7 +398,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT bulk_t : __bulk_t { _CCCL_EXEC_CHECK_DISABLE template - _CCCL_NODEBUG_API void operator()(_Shape __begin, _Shape __end, _Ts&&... __values) noexcept( + _CCCL_API void operator()(_Shape __begin, _Shape __end, _Ts&&... __values) noexcept( __nothrow_callable<_Fn&, _Shape, decltype(__values)&...>) { for (; __begin != __end; ++__begin) diff --git a/cudax/include/cuda/experimental/__execution/completion_behavior.cuh b/cudax/include/cuda/experimental/__execution/completion_behavior.cuh index 4c98e6c1a19..f4e34f696b0 100644 --- a/cudax/include/cuda/experimental/__execution/completion_behavior.cuh +++ b/cudax/include/cuda/experimental/__execution/completion_behavior.cuh @@ -139,7 +139,7 @@ struct get_completion_behavior_t return __attrs.query(*this, __env); } - [[nodiscard]] _CCCL_NODEBUG_API static constexpr auto query(forwarding_query_t) noexcept -> bool + [[nodiscard]] _CCCL_API static constexpr auto query(forwarding_query_t) noexcept -> bool { return true; } diff --git a/cudax/include/cuda/experimental/__execution/completion_signatures.cuh b/cudax/include/cuda/experimental/__execution/completion_signatures.cuh index 6a8108b344a..8fcd9a966e7 100644 --- a/cudax/include/cuda/experimental/__execution/completion_signatures.cuh +++ b/cudax/include/cuda/experimental/__execution/completion_signatures.cuh @@ -189,7 +189,7 @@ using __make_completion_signatures_t _CCCL_NODEBUG_ALIAS = decltype(execution::__make_unique(execution::__normalize(static_cast<_Sigs*>(nullptr))...)); template -[[nodiscard]] _CCCL_NODEBUG_API _CCCL_CONSTEVAL auto make_completion_signatures(_DeducedSigs*...) noexcept +[[nodiscard]] _CCCL_API _CCCL_CONSTEVAL auto make_completion_signatures(_DeducedSigs*...) noexcept -> __make_completion_signatures_t<_ExplicitSigs..., _DeducedSigs...> { return {}; @@ -206,7 +206,7 @@ using __concat_completion_signatures_t _CCCL_NODEBUG_ALIAS = struct __concat_completion_signatures_fn { template - _CCCL_NODEBUG_API _CCCL_CONSTEVAL auto operator()(const _Sigs&...) const noexcept + _CCCL_API _CCCL_CONSTEVAL auto operator()(const _Sigs&...) const noexcept -> __concat_completion_signatures_t<_Sigs...> { return {}; @@ -217,13 +217,13 @@ extern const completion_signatures<>& __empty_completion_signatures; struct __concat_completion_signatures_impl { - _CCCL_NODEBUG_API _CCCL_CONSTEVAL auto operator()() const noexcept -> completion_signatures<> (*)() + _CCCL_API _CCCL_CONSTEVAL auto operator()() const noexcept -> completion_signatures<> (*)() { return nullptr; } template - _CCCL_NODEBUG_API _CCCL_CONSTEVAL auto operator()(const completion_signatures<_Sigs...>&) const noexcept + _CCCL_API _CCCL_CONSTEVAL auto operator()(const completion_signatures<_Sigs...>&) const noexcept -> __make_completion_signatures_t<_Sigs...> (*)() { return nullptr; @@ -235,7 +235,7 @@ struct __concat_completion_signatures_impl class... _Cs, class... _Ds, class... _Rest> - _CCCL_NODEBUG_API _CCCL_CONSTEVAL auto operator()( + _CCCL_API _CCCL_CONSTEVAL auto operator()( const completion_signatures<_As...>&, const completion_signatures<_Bs...>&, const completion_signatures<_Cs...>& = __empty_completion_signatures, @@ -252,7 +252,7 @@ struct __concat_completion_signatures_impl class _Cp = ::cuda::std::__ignore_t, class _Dp = ::cuda::std::__ignore_t, class... _Rest> - _CCCL_NODEBUG_API _CCCL_CONSTEVAL auto + _CCCL_API _CCCL_CONSTEVAL auto operator()(const _Ap&, const _Bp& = {}, const _Cp& = {}, const _Dp& = {}, const _Rest&...) const noexcept { if constexpr (!__valid_completion_signatures<_Ap>) @@ -645,7 +645,7 @@ template #else // ^^^ constexpr exceptions ^^^ / vvv no constexpr exceptions vvv template -[[nodiscard]] _CCCL_NODEBUG_API _CCCL_CONSTEVAL auto invalid_completion_signature(_Values...) +[[nodiscard]] _CCCL_API _CCCL_CONSTEVAL auto invalid_completion_signature(_Values...) { return _ERROR<_What...>{}; } diff --git a/cudax/include/cuda/experimental/__execution/conditional.cuh b/cudax/include/cuda/experimental/__execution/conditional.cuh index a01ddec863f..d429937e52c 100644 --- a/cudax/include/cuda/experimental/__execution/conditional.cuh +++ b/cudax/include/cuda/experimental/__execution/conditional.cuh @@ -206,17 +206,17 @@ public: struct _CCCL_TYPE_VISIBILITY_DEFAULT __sndr_t; template - _CCCL_NODEBUG_API constexpr auto operator()(_Sndr __sndr, _Pred __pred, _Then __then, _Else __else) const; + _CCCL_API constexpr auto operator()(_Sndr __sndr, _Pred __pred, _Then __then, _Else __else) const; template - _CCCL_NODEBUG_API constexpr auto operator()(_Pred __pred, _Then __then, _Else __else) const; + _CCCL_API constexpr auto operator()(_Pred __pred, _Then __then, _Else __else) const; }; template struct _CCCL_TYPE_VISIBILITY_DEFAULT conditional_t::__sndr_t, _Sndr> { using __params_t _CCCL_NODEBUG_ALIAS = conditional_t::__closure_base_t<_Pred, _Then, _Else>; - _CCCL_NO_UNIQUE_ADDRESS conditional_t __tag_; + /*_CCCL_NO_UNIQUE_ADDRESS*/ conditional_t __tag_; __params_t __params_; _Sndr __sndr_; @@ -260,7 +260,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT conditional_t::__closure_base_t _Else on_false; template - _CCCL_NODEBUG_API auto __mk_sender(_Sndr&& __sndr) -> __sndr_t<__closure_base_t, _Sndr> + _CCCL_API auto __mk_sender(_Sndr&& __sndr) -> __sndr_t<__closure_base_t, _Sndr> { using __sndr_t = conditional_t::__sndr_t<__closure_base_t, _Sndr>; @@ -275,20 +275,20 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT conditional_t::__closure_base_t } template - _CCCL_NODEBUG_API auto operator()(_Sndr __sndr) -> __sndr_t<__closure_base_t, _Sndr> + _CCCL_API auto operator()(_Sndr __sndr) -> __sndr_t<__closure_base_t, _Sndr> { return __mk_sender(static_cast<_Sndr&&>(__sndr)); } template - _CCCL_NODEBUG_API friend auto operator|(_Sndr __sndr, __closure_base_t __self) -> __sndr_t<__closure_base_t, _Sndr> + _CCCL_API friend auto operator|(_Sndr __sndr, __closure_base_t __self) -> __sndr_t<__closure_base_t, _Sndr> { return __self.__mk_sender(static_cast<_Sndr&&>(__sndr)); } }; template -_CCCL_NODEBUG_API constexpr auto conditional_t::operator()(_Sndr __sndr, _Pred __pred, _Then __then, _Else __else) const +_CCCL_API constexpr auto conditional_t::operator()(_Sndr __sndr, _Pred __pred, _Then __then, _Else __else) const { using __params_t _CCCL_NODEBUG_ALIAS = __closure_base_t<_Pred, _Then, _Else>; __params_t __params{static_cast<_Pred&&>(__pred), static_cast<_Then&&>(__then), static_cast<_Else&&>(__else)}; @@ -296,7 +296,7 @@ _CCCL_NODEBUG_API constexpr auto conditional_t::operator()(_Sndr __sndr, _Pred _ } template -_CCCL_NODEBUG_API constexpr auto conditional_t::operator()(_Pred __pred, _Then __then, _Else __else) const +_CCCL_API constexpr auto conditional_t::operator()(_Pred __pred, _Then __then, _Else __else) const { return __closure_base_t<_Pred, _Then, _Else>{ static_cast<_Pred&&>(__pred), static_cast<_Then&&>(__then), static_cast<_Else&&>(__else)}; diff --git a/cudax/include/cuda/experimental/__execution/continues_on.cuh b/cudax/include/cuda/experimental/__execution/continues_on.cuh index dc9eae3145f..51fc6fc44e1 100644 --- a/cudax/include/cuda/experimental/__execution/continues_on.cuh +++ b/cudax/include/cuda/experimental/__execution/continues_on.cuh @@ -177,17 +177,14 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT continues_on_t _CCCL_API void __set_result(_Tag, _As&&... __as) noexcept { using __tupl_t _CCCL_NODEBUG_ALIAS = ::cuda::std::__tuple<_Tag, decay_t<_As>...>; - if constexpr (__nothrow_decay_copyable<_As...>) + _CCCL_TRY { __state_->__result_.template __emplace<__tupl_t>(_Tag{}, static_cast<_As&&>(__as)...); } - else + _CCCL_CATCH_ALL { - _CCCL_TRY - { - __state_->__result_.template __emplace<__tupl_t>(_Tag{}, static_cast<_As&&>(__as)...); - } - _CCCL_CATCH_ALL + // Avoid ODR-using this completion operation if this code path is not taken. + if constexpr (!__nothrow_decay_copyable<_As...>) { execution::set_error(static_cast<_Rcvr&&>(__state_->__rcvr_), ::std::current_exception()); } @@ -315,7 +312,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT continues_on_t _CCCL_EXEC_CHECK_DISABLE template [[nodiscard]] _CCCL_API constexpr auto query(get_domain_override_t, _Env&&...) const noexcept - -> __call_result_t, env_of_t<_Sndr>, _Env...> + -> __completion_domain_of_t...> { return {}; } @@ -382,6 +379,8 @@ public: } }; +////////////////////////////////////////////////////////////////////////////////////////// +// continues_on sender template struct _CCCL_TYPE_VISIBILITY_DEFAULT continues_on_t::__sndr_t { @@ -424,7 +423,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT continues_on_t::__sndr_t return __attrs_t<_Sch, _Sndr>{this}; } - _CCCL_NO_UNIQUE_ADDRESS continues_on_t __tag_; + /*_CCCL_NO_UNIQUE_ADDRESS*/ continues_on_t __tag_; _Sch __sch_; _Sndr __sndr_; }; diff --git a/cudax/include/cuda/experimental/__execution/cpos.cuh b/cudax/include/cuda/experimental/__execution/cpos.cuh index 2e679caa89c..2adedee35c1 100644 --- a/cudax/include/cuda/experimental/__execution/cpos.cuh +++ b/cudax/include/cuda/experimental/__execution/cpos.cuh @@ -37,13 +37,13 @@ template <__disposition _Disposition> struct __completion_tag { template <__disposition _OtherDisposition> - _CCCL_NODEBUG_API constexpr auto operator==(__completion_tag<_OtherDisposition>) const noexcept -> bool + _CCCL_TRIVIAL_API constexpr auto operator==(__completion_tag<_OtherDisposition>) const noexcept -> bool { return _Disposition == _OtherDisposition; } template <__disposition _OtherDisposition> - _CCCL_NODEBUG_API constexpr auto operator!=(__completion_tag<_OtherDisposition>) const noexcept -> bool + _CCCL_TRIVIAL_API constexpr auto operator!=(__completion_tag<_OtherDisposition>) const noexcept -> bool { return _Disposition != _OtherDisposition; } @@ -63,7 +63,7 @@ struct set_value_t : __completion_tag<__disposition::__value> _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _Rcvr, class... _Ts) _CCCL_REQUIRES(__has_set_value_mbr<_Rcvr, _Ts...>) - _CCCL_NODEBUG_API constexpr void operator()(_Rcvr&& __rcvr, _Ts&&... __ts) const noexcept + _CCCL_TRIVIAL_API constexpr void operator()(_Rcvr&& __rcvr, _Ts&&... __ts) const noexcept { static_assert(__same_as(__rcvr).set_value(static_cast<_Ts&&>(__ts)...)), void>); static_assert(noexcept(static_cast<_Rcvr&&>(__rcvr).set_value(static_cast<_Ts&&>(__ts)...))); @@ -83,7 +83,7 @@ struct set_error_t : __completion_tag<__disposition::__error> _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _Rcvr, class _Ey) _CCCL_REQUIRES(__has_set_error_mbr<_Rcvr, _Ey>) - _CCCL_NODEBUG_API constexpr void operator()(_Rcvr&& __rcvr, _Ey&& __e) const noexcept + _CCCL_TRIVIAL_API constexpr void operator()(_Rcvr&& __rcvr, _Ey&& __e) const noexcept { static_assert(__same_as(__rcvr).set_error(static_cast<_Ey&&>(__e))), void>); static_assert(noexcept(static_cast<_Rcvr&&>(__rcvr).set_error(static_cast<_Ey&&>(__e)))); @@ -103,7 +103,7 @@ struct set_stopped_t : __completion_tag<__disposition::__stopped> _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _Rcvr) _CCCL_REQUIRES(__has_set_stopped_mbr<_Rcvr>) - _CCCL_NODEBUG_API constexpr void operator()(_Rcvr&& __rcvr) const noexcept + _CCCL_TRIVIAL_API constexpr void operator()(_Rcvr&& __rcvr) const noexcept { static_assert(__same_as(__rcvr).set_stopped()), void>); static_assert(noexcept(static_cast<_Rcvr&&>(__rcvr).set_stopped())); @@ -123,7 +123,7 @@ struct start_t _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _OpState) _CCCL_REQUIRES(__has_start_mbr<_OpState>) - _CCCL_NODEBUG_API constexpr void operator()(_OpState& __opstate) const noexcept + _CCCL_TRIVIAL_API constexpr void operator()(_OpState& __opstate) const noexcept { static_assert(__same_as); static_assert(noexcept(__opstate.start())); @@ -134,8 +134,9 @@ struct start_t // connect struct connect_t { + _CCCL_EXEC_CHECK_DISABLE template - [[nodiscard]] _CCCL_API constexpr auto operator()(_Sndr&& __sndr, _Rcvr __rcvr) const + [[nodiscard]] _CCCL_TRIVIAL_API constexpr auto operator()(_Sndr&& __sndr, _Rcvr __rcvr) const noexcept(noexcept(transform_sender(declval<_Sndr>(), get_env(declval<_Rcvr>())).connect(declval<_Rcvr>()))) -> decltype(transform_sender(declval<_Sndr>(), get_env(declval<_Rcvr>())).connect(declval<_Rcvr>())) { @@ -147,7 +148,7 @@ struct schedule_t { _CCCL_EXEC_CHECK_DISABLE template - _CCCL_NODEBUG_API constexpr auto operator()(_Sch&& __sch) const noexcept + _CCCL_TRIVIAL_API constexpr auto operator()(_Sch&& __sch) const noexcept { static_assert(noexcept(static_cast<_Sch&&>(__sch).schedule())); return static_cast<_Sch&&>(__sch).schedule(); diff --git a/cudax/include/cuda/experimental/__execution/domain.cuh b/cudax/include/cuda/experimental/__execution/domain.cuh index 0c67ac7ab4c..db3e269bc94 100644 --- a/cudax/include/cuda/experimental/__execution/domain.cuh +++ b/cudax/include/cuda/experimental/__execution/domain.cuh @@ -89,7 +89,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT default_domain //! @return The result of applying the sender operation. _CCCL_EXEC_CHECK_DISABLE template - _CCCL_NODEBUG_API static constexpr auto apply_sender(_Tag, _Sndr&& __sndr, _Args&&... __args) noexcept( + _CCCL_API static constexpr auto apply_sender(_Tag, _Sndr&& __sndr, _Args&&... __args) noexcept( noexcept(_Tag{}.apply_sender(declval<_Sndr>(), declval<_Args>()...))) // -> __apply_sender_result_t<_Tag, _Sndr, _Args...> { @@ -107,7 +107,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT default_domain _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _OpTag, class _Sndr, class _Env) _CCCL_REQUIRES(__has_transform_sender, _OpTag, _Sndr, _Env>) - [[nodiscard]] _CCCL_NODEBUG_API static constexpr auto transform_sender(_OpTag, _Sndr&& __sndr, const _Env& __env) // + [[nodiscard]] _CCCL_API static constexpr auto transform_sender(_OpTag, _Sndr&& __sndr, const _Env& __env) // noexcept(__nothrow_transform_sender, _OpTag, _Sndr, _Env>) -> __transform_sender_result_t, _OpTag, _Sndr, _Env> { @@ -117,7 +117,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT default_domain //! @overload _CCCL_EXEC_CHECK_DISABLE template - [[nodiscard]] _CCCL_NODEBUG_API static constexpr auto + [[nodiscard]] _CCCL_API static constexpr auto transform_sender(::cuda::std::__ignore_t, _Sndr&& __sndr, ::cuda::std::__ignore_t = {}) // noexcept(__nothrow_movable<_Sndr>) -> _Sndr { @@ -149,10 +149,10 @@ private: }; template -struct __hide_scheduler : __hide_query<_Env, get_scheduler_t> +struct __hide_scheduler : __hide_query<_Env, get_scheduler_t, get_domain_t> { _CCCL_API explicit constexpr __hide_scheduler(_Env&& __env) noexcept - : __hide_query<_Env, get_scheduler_t>{static_cast<_Env&&>(__env), {}} + : __hide_query<_Env, get_scheduler_t, get_domain_t>{static_cast<_Env&&>(__env), {}, {}} {} }; @@ -186,10 +186,9 @@ using __scheduler_domain_t _CCCL_NODEBUG_ALIAS = decay_t) - [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(const _Env&) const noexcept + [[nodiscard]] _CCCL_API constexpr auto operator()(const _Env&) const noexcept -> decay_t<__query_result_t<_Env, get_domain_t>> { using __domain_t = decay_t<__query_result_t<_Env, get_domain_t>>; @@ -200,10 +199,9 @@ struct get_domain_t //! @brief If there is not a @c get_domain_t query in @c __env, but there is a //! scheduler, return the domain of the scheduler if it has one, and @c default_domain //! otherwise. - _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _Env) _CCCL_REQUIRES((!__queryable_with<_Env, get_domain_t>) _CCCL_AND __callable) - [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(const _Env&) const noexcept + [[nodiscard]] _CCCL_API constexpr auto operator()(const _Env&) const noexcept { using __sch_t = __scheduler_of_t; using __env_t = __hide_scheduler; // to prevent recursion @@ -213,7 +211,7 @@ struct get_domain_t return __domain_t{}; } - _CCCL_NODEBUG_API static constexpr auto query(forwarding_query_t) noexcept + _CCCL_API static constexpr auto query(forwarding_query_t) noexcept { return true; } @@ -342,10 +340,7 @@ _CCCL_GLOBAL_CONSTANT get_completion_domain_t get_completion_domain template <> _CCCL_GLOBAL_CONSTANT get_completion_domain_t get_completion_domain{}; -template -using __completion_domain_of_t = __call_result_t, env_of_t<_Sndr>, const _Env&...>; - -// Used by the schedule_from and continues_on senders +// Used by the continues_on sender struct get_domain_override_t { _CCCL_EXEC_CHECK_DISABLE diff --git a/cudax/include/cuda/experimental/__execution/env.cuh b/cudax/include/cuda/experimental/__execution/env.cuh index 06eb5fa4d80..97267644143 100644 --- a/cudax/include/cuda/experimental/__execution/env.cuh +++ b/cudax/include/cuda/experimental/__execution/env.cuh @@ -80,32 +80,32 @@ namespace __detail { struct _CCCL_TYPE_VISIBILITY_DEFAULT __env_ref_fn { - [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(env<>) const noexcept -> env<> + [[nodiscard]] _CCCL_API constexpr auto operator()(env<>) const noexcept -> env<> { return {}; } _CCCL_TEMPLATE(class _Env, class = _Env*) // not considered if _Env is a reference type _CCCL_REQUIRES((!::cuda::__is_specialization_of_v<_Env, __fwd_env_>) ) - [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(_Env&& __env) const noexcept -> _Env + [[nodiscard]] _CCCL_API constexpr auto operator()(_Env&& __env) const noexcept -> _Env { return static_cast<_Env&&>(__env); } template - [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(const _Env& __env) const noexcept -> __env_ref_<_Env> + [[nodiscard]] _CCCL_API constexpr auto operator()(const _Env& __env) const noexcept -> __env_ref_<_Env> { return __env_ref_<_Env>{__env}; } template - [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(__env_ref_<_Env> __env) const noexcept -> __env_ref_<_Env> + [[nodiscard]] _CCCL_API constexpr auto operator()(__env_ref_<_Env> __env) const noexcept -> __env_ref_<_Env> { return __env; } template - [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(const __fwd_env_<_Env>& __env) const noexcept + [[nodiscard]] _CCCL_API constexpr auto operator()(const __fwd_env_<_Env>& __env) const noexcept -> __fwd_env_<_Env const&> { return __fwd_env_<_Env const&>{__env.__env_}; @@ -142,20 +142,19 @@ namespace __detail { struct _CCCL_TYPE_VISIBILITY_DEFAULT __fwd_env_fn { - [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(env<>) const noexcept -> env<> + [[nodiscard]] _CCCL_API constexpr auto operator()(env<>) const noexcept -> env<> { return {}; } template - [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(__env_ref_<_Env> __env) const noexcept - -> __fwd_env_<_Env const&> + [[nodiscard]] _CCCL_API constexpr auto operator()(__env_ref_<_Env> __env) const noexcept -> __fwd_env_<_Env const&> { return __fwd_env_<_Env const&>{__env.__env_}; } template - [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(_Env&& __env) const noexcept + [[nodiscard]] _CCCL_API constexpr auto operator()(_Env&& __env) const noexcept { static_assert(__nothrow_movable<_Env>); // If the environment is already a forwarding environment, we can just return it. @@ -181,11 +180,8 @@ _CCCL_GLOBAL_CONSTANT __detail::__fwd_env_fn __fwd_env{}; //! @brief __sch_env_t is a utility that builds an environment from a scheduler. It //! defines the `get_scheduler` query and provides a default for the `get_domain` query. -template -struct _CCCL_TYPE_VISIBILITY_DEFAULT __sch_env_t; - template -struct _CCCL_TYPE_VISIBILITY_DEFAULT __sch_env_t<_Sch> +struct _CCCL_TYPE_VISIBILITY_DEFAULT __sch_env_t { [[nodiscard]] _CCCL_API constexpr auto query(get_scheduler_t) const noexcept -> _Sch { @@ -200,39 +196,18 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __sch_env_t<_Sch> _Sch __sch_; }; -template -struct _CCCL_TYPE_VISIBILITY_DEFAULT __sch_env_t<_Sch, _PrevSch> -{ - [[nodiscard]] _CCCL_API constexpr auto query(get_scheduler_t) const noexcept -> _Sch - { - return __sch_; - } - - [[nodiscard]] _CCCL_API constexpr auto query(get_domain_t) const noexcept - { - return __query_result_or_t<_Sch, get_completion_domain_t, default_domain>{}; - } - - [[nodiscard]] _CCCL_API constexpr auto query(get_previous_scheduler_t) const noexcept -> _PrevSch - { - return __prev_sch_; - } - - _Sch __sch_; - _PrevSch __prev_sch_; -}; - -template -_CCCL_HOST_DEVICE __sch_env_t(_Sch, _PrevSch...) -> __sch_env_t<_Sch, _PrevSch...>; +template +_CCCL_HOST_DEVICE __sch_env_t(_Sch) -> __sch_env_t<_Sch>; struct __mk_sch_env_t { template - [[nodiscard]] _CCCL_API constexpr auto operator()(_Sch __sch, const _Env&... __env) const noexcept + [[nodiscard]] _CCCL_API constexpr auto operator()([[maybe_unused]] _Sch __sch, const _Env&... __env) const noexcept { - if constexpr ((__callable || ...)) + if constexpr (__completes_inline>, _Env...> + && (__callable || ...)) { - return __sch_env_t{__sch, get_scheduler(__env)...}; + return __sch_env_t{get_scheduler(__env)...}; } else { @@ -272,17 +247,14 @@ _CCCL_HOST_DEVICE __sch_attrs_t(_Sch) -> __sch_attrs_t<_Sch>; // __inln_attrs //! @brief __inln_attrs_t is a utility that builds an attributes queryable for a sender -//! that completes inline. It delegates the @c get_completion_scheduler and @c get_completion_domain -//! queries to the receiver's environment. -//! -//! @tparam _Tags The completion tags for which @c get_completion_signatures should return -//! the current scheduler, and @c get_completion_domain should return the current domain. -template +//! that completes inline. It implements get_completion_behavior to return +//! completion_behavior::inline_completion, and relies on the logic of +//! get_completion_scheduler and get_completion_domain to provide the current scheduler +//! and domain based on the environment. struct _CCCL_TYPE_VISIBILITY_DEFAULT __inln_attrs_t { - _CCCL_TEMPLATE(class _Tag, class _Env, class _Sch = __scheduler_of_t<_Env>) - _CCCL_REQUIRES(__one_of<_Tag, _Tags...> _CCCL_AND __callable) - [[nodiscard]] _CCCL_API constexpr auto query(get_completion_scheduler_t<_Tag>, const _Env& __env) const noexcept + template > + [[nodiscard]] _CCCL_API constexpr auto query(get_completion_scheduler_t, const _Env& __env) const noexcept -> __call_result_or_t, _Sch, _Sch, __hide_scheduler> { _Sch __sch = get_scheduler(__env); @@ -291,9 +263,8 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __inln_attrs_t return __call_or(get_completion_scheduler, __sch, __sch, __hide_scheduler{__env}); } - _CCCL_TEMPLATE(class _Tag, class _Env) - _CCCL_REQUIRES(__one_of<_Tag, _Tags...> _CCCL_AND __callable) - [[nodiscard]] _CCCL_API constexpr auto query(get_completion_domain_t<_Tag>, const _Env& __env) const noexcept + template + [[nodiscard]] _CCCL_API constexpr auto query(get_completion_domain_t, const _Env& __env) const noexcept -> __call_result_t { return {}; diff --git a/cudax/include/cuda/experimental/__execution/fwd.cuh b/cudax/include/cuda/experimental/__execution/fwd.cuh index 2b53da29b15..09748559cb0 100644 --- a/cudax/include/cuda/experimental/__execution/fwd.cuh +++ b/cudax/include/cuda/experimental/__execution/fwd.cuh @@ -119,7 +119,7 @@ template struct _CCCL_TYPE_VISIBILITY_DEFAULT completion_signatures; template -_CCCL_NODEBUG_API _CCCL_CONSTEVAL auto get_completion_signatures(); +_CCCL_API _CCCL_CONSTEVAL auto get_completion_signatures(); template using completion_signatures_of_t _CCCL_NODEBUG_ALIAS = decltype(execution::get_completion_signatures<_Sndr, _Env...>()); @@ -193,7 +193,6 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT start_detached_t; struct _CCCL_TYPE_VISIBILITY_DEFAULT get_allocator_t; struct _CCCL_TYPE_VISIBILITY_DEFAULT get_stop_token_t; struct _CCCL_TYPE_VISIBILITY_DEFAULT get_scheduler_t; -struct _CCCL_TYPE_VISIBILITY_DEFAULT get_previous_scheduler_t; struct _CCCL_TYPE_VISIBILITY_DEFAULT get_delegation_scheduler_t; struct _CCCL_TYPE_VISIBILITY_DEFAULT get_forward_progress_guarantee_t; template @@ -211,7 +210,13 @@ template using __scheduler_of_t _CCCL_NODEBUG_ALIAS = decay_t<__call_result_t>; template -using __previous_scheduler_of_t _CCCL_NODEBUG_ALIAS = decay_t<__call_result_t>; +using __domain_of_t _CCCL_NODEBUG_ALIAS = decay_t<::cuda::std::__type_call< + ::cuda::std::__type_try_catch<::cuda::std::__type_bind_front_quote<__call_result_t, get_domain_t>, + ::cuda::std::__type_always>, + _Env>>; + +template +using __completion_domain_of_t = __call_result_t, env_of_t<_Sndr>, _Env...>; // get_forward_progress_guarantee: enum class forward_progress_guarantee @@ -226,7 +231,7 @@ namespace __detail struct __get_tag { template - _CCCL_NODEBUG_API constexpr auto operator()(int, _Tag, ::cuda::std::__ignore_t, _Child&&...) const -> _Tag + _CCCL_API constexpr auto operator()(int, _Tag, ::cuda::std::__ignore_t, _Child&&...) const -> _Tag { return _Tag{}; } @@ -236,7 +241,8 @@ template > extern __fn_ptr_t<_Tag> __tag_of_v; } // namespace __detail -template +_CCCL_TEMPLATE(class _Sndr) +_CCCL_REQUIRES(__is_sender<_Sndr>) using tag_of_t _CCCL_NODEBUG_ALIAS = decltype(__detail::__tag_of_v<_Sndr>()); template diff --git a/cudax/include/cuda/experimental/__execution/get_completion_signatures.cuh b/cudax/include/cuda/experimental/__execution/get_completion_signatures.cuh index 8cab2d7d058..3940b00aaa5 100644 --- a/cudax/include/cuda/experimental/__execution/get_completion_signatures.cuh +++ b/cudax/include/cuda/experimental/__execution/get_completion_signatures.cuh @@ -94,7 +94,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __sender_type_check_failure // private: friend struct __compile_time_error<__sender_type_check_failure>; - _CCCL_NODEBUG_API constexpr auto __what() const noexcept -> const char* + _CCCL_API constexpr auto __what() const noexcept -> const char* { return "This sender is not well-formed. It does not meet the requirements of a sender type."; } @@ -111,7 +111,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT dependent_sender_error : __compile_time_err private: friend struct __compile_time_error; - [[nodiscard]] _CCCL_NODEBUG_API constexpr auto __what() const noexcept -> char const* + [[nodiscard]] _CCCL_API constexpr auto __what() const noexcept -> char const* { return __what_; } @@ -122,7 +122,7 @@ private: template struct _CCCL_TYPE_VISIBILITY_DEFAULT __dependent_sender_error : dependent_sender_error { - _CCCL_NODEBUG_API constexpr __dependent_sender_error() noexcept + _CCCL_API constexpr __dependent_sender_error() noexcept : dependent_sender_error{"This sender needs to know its execution " // "environment before it can know how it will complete."} {} @@ -192,7 +192,7 @@ template else template -[[nodiscard]] _CCCL_NODEBUG_API _CCCL_CONSTEVAL auto __dependent_sender() -> __dependent_sender_error<_Sndr...> +[[nodiscard]] _CCCL_API _CCCL_CONSTEVAL auto __dependent_sender() -> __dependent_sender_error<_Sndr...> { return __dependent_sender_error<_Sndr...>{}; } @@ -216,7 +216,7 @@ struct _A_GET_COMPLETION_SIGNATURES_CUSTOMIZATION_RETURNED_A_TYPE_THAT_IS_NOT_A_ {}; template -_CCCL_NODEBUG_API _CCCL_CONSTEVAL auto __checked_complsigs() +_CCCL_API _CCCL_CONSTEVAL auto __checked_complsigs() { _CUDAX_LET_COMPLETIONS(auto(__cs) = _Completions()) { @@ -260,7 +260,7 @@ struct _COULD_NOT_DETERMINE_COMPLETION_SIGNATURES_FOR_THIS_SENDER _CCCL_EXEC_CHECK_DISABLE template -[[nodiscard]] _CCCL_NODEBUG_API _CCCL_CONSTEVAL auto __get_completion_signatures_helper() +[[nodiscard]] _CCCL_API _CCCL_CONSTEVAL auto __get_completion_signatures_helper() { if constexpr (__has_get_completion_signatures<_Sndr, _Env...>) { @@ -288,7 +288,7 @@ template } template -[[nodiscard]] _CCCL_NODEBUG_API _CCCL_CONSTEVAL auto get_completion_signatures() +[[nodiscard]] _CCCL_API _CCCL_CONSTEVAL auto get_completion_signatures() { static_assert(sizeof...(_Env) <= 1, "At most one environment is allowed."); if constexpr (0 == sizeof...(_Env)) @@ -304,7 +304,7 @@ template } template -[[nodiscard]] _CCCL_NODEBUG_API _CCCL_CONSTEVAL auto get_child_completion_signatures() +[[nodiscard]] _CCCL_API _CCCL_CONSTEVAL auto get_child_completion_signatures() { return get_completion_signatures<::cuda::std::__copy_cvref_t<_Parent, _Child>, __fwd_env_t<_Env>...>(); } diff --git a/cudax/include/cuda/experimental/__execution/inline_scheduler.cuh b/cudax/include/cuda/experimental/__execution/inline_scheduler.cuh index 24700597d4b..c0714f8cf53 100644 --- a/cudax/include/cuda/experimental/__execution/inline_scheduler.cuh +++ b/cudax/include/cuda/experimental/__execution/inline_scheduler.cuh @@ -36,10 +36,10 @@ namespace cuda::experimental::execution { //! Scheduler that returns a sender that always completes inline (successfully). -struct _CCCL_TYPE_VISIBILITY_DEFAULT inline_scheduler : __inln_attrs_t +struct _CCCL_TYPE_VISIBILITY_DEFAULT inline_scheduler : __inln_attrs_t { private: - struct _CCCL_TYPE_VISIBILITY_DEFAULT __attrs_t : __inln_attrs_t + struct _CCCL_TYPE_VISIBILITY_DEFAULT __attrs_t : __inln_attrs_t {}; template diff --git a/cudax/include/cuda/experimental/__execution/just.cuh b/cudax/include/cuda/experimental/__execution/just.cuh index e78f02d1702..0aa9193f379 100644 --- a/cudax/include/cuda/experimental/__execution/just.cuh +++ b/cudax/include/cuda/experimental/__execution/just.cuh @@ -82,7 +82,7 @@ private: public: _CCCL_EXEC_CHECK_DISABLE template - _CCCL_NODEBUG_API constexpr auto operator()(_Ts... __ts) const; + _CCCL_API constexpr auto operator()(_Ts... __ts) const; }; struct just_t : __just_t @@ -132,10 +132,10 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __just_t<_JustTag, _SetTag>::__sndr_base_t [[nodiscard]] _CCCL_API static constexpr auto get_env() noexcept { - return __inln_attrs_t<__set_tag_t>{}; + return __inln_attrs_t{}; } - _CCCL_NO_UNIQUE_ADDRESS __just_tag_t __tag_; + /*_CCCL_NO_UNIQUE_ADDRESS*/ __just_tag_t __tag_; ::cuda::std::__tuple<_Ts...> __values_; }; @@ -159,7 +159,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT just_stopped_t::__sndr_t _CCCL_EXEC_CHECK_DISABLE template template -_CCCL_NODEBUG_API constexpr auto __just_t<_JustTag, _SetTag>::operator()(_Ts... __ts) const +_CCCL_API constexpr auto __just_t<_JustTag, _SetTag>::operator()(_Ts... __ts) const { using __sndr_t = typename _JustTag::template __sndr_t<_Ts...>; return __sndr_t{{{}, {static_cast<_Ts&&>(__ts)...}}}; diff --git a/cudax/include/cuda/experimental/__execution/just_from.cuh b/cudax/include/cuda/experimental/__execution/just_from.cuh index 3b1a2e276db..1d0dcddbe01 100644 --- a/cudax/include/cuda/experimental/__execution/just_from.cuh +++ b/cudax/include/cuda/experimental/__execution/just_from.cuh @@ -70,13 +70,13 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __just_from_t template struct _CCCL_TYPE_VISIBILITY_DEFAULT __complete_fn { - _Rcvr& __rcvr_; - template - _CCCL_API auto operator()(_Ts&&... __ts) const noexcept + _CCCL_API void operator()(_Ts&&... __ts) const noexcept { _SetTag{}(static_cast<_Rcvr&&>(__rcvr_), static_cast<_Ts&&>(__ts)...); } + + _Rcvr& __rcvr_; }; template @@ -99,7 +99,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __just_from_t public: template - _CCCL_NODEBUG_API constexpr auto operator()(_Fn __fn) const noexcept; + _CCCL_API constexpr auto operator()(_Fn __fn) const noexcept; }; struct just_from_t : __just_from_t @@ -148,10 +148,10 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __just_from_t<_JustFromTag, _SetTag>::__snd [[nodiscard]] _CCCL_API constexpr auto get_env() const noexcept { - return __inln_attrs_t<_SetTag>{}; + return __inln_attrs_t{}; } - _CCCL_NO_UNIQUE_ADDRESS __just_from_tag_t __tag_; + /*_CCCL_NO_UNIQUE_ADDRESS*/ __just_from_tag_t __tag_; _Fn __fn_; }; @@ -171,7 +171,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT just_stopped_from_t::__sndr_t template template -_CCCL_NODEBUG_API constexpr auto __just_from_t<_JustFromTag, _SetTag>::operator()(_Fn __fn) const noexcept +_CCCL_API constexpr auto __just_from_t<_JustFromTag, _SetTag>::operator()(_Fn __fn) const noexcept { using __sndr_t = typename _JustFromTag::template __sndr_t<_Fn>; using __completions _CCCL_NODEBUG_ALIAS = __call_result_t<_Fn, __probe_fn>; diff --git a/cudax/include/cuda/experimental/__execution/lazy.cuh b/cudax/include/cuda/experimental/__execution/lazy.cuh index 4b44d8daf59..d32854b7fbc 100644 --- a/cudax/include/cuda/experimental/__execution/lazy.cuh +++ b/cudax/include/cuda/experimental/__execution/lazy.cuh @@ -91,7 +91,7 @@ template <> struct __lazy_tupl<::cuda::std::index_sequence<>> { template - _CCCL_NODEBUG_API static auto __apply(_Fn&& __fn, _Self&&, _Us&&... __us) // + _CCCL_API static auto __apply(_Fn&& __fn, _Self&&, _Us&&... __us) // noexcept(__nothrow_callable<_Fn, _Us...>) -> __call_result_t<_Fn, _Us...> { return static_cast<_Fn&&>(__fn)(static_cast<_Us&&>(__us)...); @@ -104,7 +104,7 @@ struct __lazy_tupl<::cuda::std::index_sequence<_Idx...>, _Ts...> : __detail::__l template using __at _CCCL_NODEBUG_ALIAS = ::cuda::std::__type_index_c<_Ny, _Ts...>; - _CCCL_NODEBUG_API __lazy_tupl() noexcept {} + _CCCL_API __lazy_tupl() noexcept {} _CCCL_API ~__lazy_tupl() { @@ -112,13 +112,13 @@ struct __lazy_tupl<::cuda::std::index_sequence<_Idx...>, _Ts...> : __detail::__l } template - _CCCL_NODEBUG_API _Ty* __get() noexcept + _CCCL_API _Ty* __get() noexcept { return reinterpret_cast<_Ty*>(this->__detail::__lazy_box<_Ny, _Ty>::__data_); } template - _CCCL_NODEBUG_API __at<_Ny>& __emplace(_Us&&... __us) // + _CCCL_API __at<_Ny>& __emplace(_Us&&... __us) // noexcept(__nothrow_constructible<__at<_Ny>, _Us...>) { using _Ty _CCCL_NODEBUG_ALIAS = __at<_Ny>; @@ -128,7 +128,7 @@ struct __lazy_tupl<::cuda::std::index_sequence<_Idx...>, _Ts...> : __detail::__l } template - _CCCL_NODEBUG_API static auto __apply(_Fn&& __fn, _Self&& __self, _Us&&... __us) // + _CCCL_API static auto __apply(_Fn&& __fn, _Self&& __self, _Us&&... __us) // noexcept(__nothrow_callable<_Fn, _Us..., ::cuda::std::__copy_cvref_t<_Self, _Ts>...>) -> __call_result_t<_Fn, _Us..., ::cuda::std::__copy_cvref_t<_Self, _Ts>...> { diff --git a/cudax/include/cuda/experimental/__execution/let_value.cuh b/cudax/include/cuda/experimental/__execution/let_value.cuh index 4420fa6d271..06285b46ab3 100644 --- a/cudax/include/cuda/experimental/__execution/let_value.cuh +++ b/cudax/include/cuda/experimental/__execution/let_value.cuh @@ -106,7 +106,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __let_t { using __base_t = __rcvr_ref_t<__rcvr_with_env_t<_Rcvr, _Env2>>; - _CCCL_NODEBUG_API explicit constexpr __sndr2_rcvr_t(__rcvr_with_env_t<_Rcvr, _Env2>& __rcvr) noexcept + _CCCL_API explicit constexpr __sndr2_rcvr_t(__rcvr_with_env_t<_Rcvr, _Env2>& __rcvr) noexcept : __base_t(__ref_rcvr(__rcvr)) {} }; @@ -341,13 +341,13 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __let_t // extended (host/device) lambda { template - [[nodiscard]] _CCCL_NODEBUG_API auto operator()(_Sndr __sndr) const -> __call_result_t<_LetTag, _Sndr, _Fn> + [[nodiscard]] _CCCL_API auto operator()(_Sndr __sndr) const -> __call_result_t<_LetTag, _Sndr, _Fn> { return _LetTag{}(static_cast<_Sndr&&>(__sndr), __fn_); } template - [[nodiscard]] _CCCL_NODEBUG_API friend auto operator|(_Sndr __sndr, const __closure_t& __self) + [[nodiscard]] _CCCL_API friend auto operator|(_Sndr __sndr, const __closure_t& __self) -> __call_result_t<_LetTag, _Sndr, _Fn> { return _LetTag{}(static_cast<_Sndr&&>(__sndr), __self.__fn_); @@ -365,10 +365,10 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __let_base_t : __let_t //! @tparam _Fn The function to be called when the predecessor sender //! completes. template - [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(_Sndr __sndr, _Fn __fn) const; + [[nodiscard]] _CCCL_API constexpr auto operator()(_Sndr __sndr, _Fn __fn) const; template - [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(_Fn __fn) const noexcept; + [[nodiscard]] _CCCL_API constexpr auto operator()(_Fn __fn) const noexcept; }; struct let_value_t : __let_base_t @@ -507,7 +507,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __let_t::__sndr_t return {__sndr_}; } - _CCCL_NO_UNIQUE_ADDRESS _LetTag __tag_; + /*_CCCL_NO_UNIQUE_ADDRESS*/ _LetTag __tag_; _Fn __fn_; _Sndr __sndr_; }; @@ -541,7 +541,7 @@ using __all_non_dependent_t = ::cuda::std::__fold_and<(!dependent_sender<_Sndr>) template template -[[nodiscard]] _CCCL_NODEBUG_API constexpr auto __let_base_t<_LetTag>::operator()(_Sndr __sndr, _Fn __fn) const +[[nodiscard]] _CCCL_API constexpr auto __let_base_t<_LetTag>::operator()(_Sndr __sndr, _Fn __fn) const { using __sndr_t = typename _LetTag::template __sndr_t<_Sndr, _Fn>; @@ -571,7 +571,7 @@ template template template -[[nodiscard]] _CCCL_NODEBUG_API constexpr auto __let_base_t<_LetTag>::operator()(_Fn __fn) const noexcept +[[nodiscard]] _CCCL_API constexpr auto __let_base_t<_LetTag>::operator()(_Fn __fn) const noexcept { using __closure_t = typename _LetTag::template __closure_t<_Fn>; return __closure_t{{static_cast<_Fn&&>(__fn)}}; diff --git a/cudax/include/cuda/experimental/__execution/on.cuh b/cudax/include/cuda/experimental/__execution/on.cuh index 33102d482e0..47243d0fd77 100644 --- a/cudax/include/cuda/experimental/__execution/on.cuh +++ b/cudax/include/cuda/experimental/__execution/on.cuh @@ -50,7 +50,8 @@ namespace cuda::experimental::execution //! resource where the `on` sender was started. //! //! @code -//! auto result = on(gpu_scheduler, some_computation) | sync_wait(); +//! auto sndr = on(gpu_scheduler, some_computation); +//! auto [result] = sync_wait(std::move(sndr)).value(); //! @endcode //! //! ## Form 2: `on(sender, scheduler, closure)` or `sender | on(scheduler, closure)` @@ -60,7 +61,8 @@ namespace cuda::experimental::execution //! execution back to where the original sender completed. //! //! @code -//! auto result = some_computation | on(gpu_scheduler, then([](auto value) { return process(value); })); +//! auto sndr = some_computation | on(gpu_scheduler, then([](auto value) { /*...*/ })); +//! auto [result] = sync_wait(std::move(sndr)).value(); //! @endcode //! //! ## Behavior @@ -138,17 +140,17 @@ struct on_t } }; - // Helper alias for the environment of the receiver used to connect the child sender - // in the on(sch, sndr) case. - template - using __env2_t = __join_env_t<__call_result_t<__mk_sch_env_t, _Sch, _Env>, _Env>; - template [[nodiscard]] _CCCL_API static constexpr auto __mk_env2(_Sch __sch, _Env&& __env) { return __join_env(__mk_sch_env(__sch, __env), static_cast<_Env&&>(__env)); } + // Helper alias for the environment of the receiver used to connect the child sender + // in the on(sch, sndr) case. + template + using __env2_t = decltype(__mk_env2(declval<_Sch>(), declval<_Env>())); + public: template struct _CCCL_TYPE_VISIBILITY_DEFAULT __sndr_t; @@ -157,19 +159,19 @@ public: struct _CCCL_TYPE_VISIBILITY_DEFAULT __closure_t { template - [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(_Sndr __sndr) && + [[nodiscard]] _CCCL_API constexpr auto operator()(_Sndr __sndr) && { return on_t{}(static_cast<_Sndr&&>(__sndr), __sch_, static_cast<_Closure&&>(__closure_)); } template - [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(_Sndr __sndr) const& + [[nodiscard]] _CCCL_API constexpr auto operator()(_Sndr __sndr) const& { return on_t{}(static_cast<_Sndr&&>(__sndr), __sch_, __closure_); } template - [[nodiscard]] _CCCL_NODEBUG_API friend constexpr auto operator|(_Sndr __sndr, __closure_t __self) + [[nodiscard]] _CCCL_API friend constexpr auto operator|(_Sndr __sndr, __closure_t __self) { return on_t{}(static_cast<_Sndr&&>(__sndr), __self.__sch_, static_cast<_Closure&&>(__self.__closure_)); } @@ -180,20 +182,20 @@ public: _CCCL_TEMPLATE(class _Sch, class _Sndr) _CCCL_REQUIRES(__is_sender<_Sndr>) - _CCCL_NODEBUG_API constexpr auto operator()(_Sch __sch, _Sndr __sndr) const + _CCCL_API constexpr auto operator()(_Sch __sch, _Sndr __sndr) const { return __sndr_t<_Sch, _Sndr>{{}, __sch, __sndr}; } _CCCL_TEMPLATE(class _Sch, class _Closure) _CCCL_REQUIRES((!__is_sender<_Closure>) ) - _CCCL_NODEBUG_API constexpr auto operator()(_Sch __sch, _Closure __closure) const + _CCCL_API constexpr auto operator()(_Sch __sch, _Closure __closure) const { return __closure_t<_Sch, _Closure>{__sch, static_cast<_Closure&&>(__closure)}; } template - [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(_Sndr __sndr, _Sch __sch, _Closure __closure) const + [[nodiscard]] _CCCL_API constexpr auto operator()(_Sndr __sndr, _Sch __sch, _Closure __closure) const { using __sndr_t = on_t::__sndr_t<_Sch, _Sndr, _Closure>; return __sndr_t{{}, {__sch, static_cast<_Closure&&>(__closure)}, static_cast<_Sndr&&>(__sndr)}; @@ -214,7 +216,7 @@ public: } template - [[nodiscard]] _CCCL_API static constexpr auto transform_sender(start_t, _Sndr&& __sndr, const _Env& __env) + [[nodiscard]] _CCCL_API static constexpr auto transform_sender(set_value_t, _Sndr&& __sndr, const _Env& __env) { auto&& [__ign, __data, __child] = __sndr; if constexpr (__is_scheduler) @@ -222,6 +224,7 @@ public: // The on(sch, sndr) case: auto __old_sch = __call_or(get_scheduler, __not_a_scheduler{}, __env); using __sndr_t = __lowered_sndr_t; + static_assert(sender_for<__sndr_t, continues_on_t>); return __sndr_t{::cuda::std::forward_like<_Sndr>(__child), __data, __old_sch}; } else @@ -263,7 +266,7 @@ private: { // When it completes successfully, the on(sch, sndr) sender completes where it // starts. - return get_completion_scheduler(__inln_attrs_t{}, __env); + return get_scheduler(__env); } else { @@ -312,7 +315,7 @@ private: { // When it completes successfully, the on(sch, sndr) sender completes where it // starts. - return __call_or(execution::get_completion_domain<_SetTag>, default_domain{}, execution::get_scheduler(__env)); + return __call_or(execution::get_domain, default_domain{}, __env); } else { @@ -369,7 +372,7 @@ private: template [[nodiscard]] _CCCL_API constexpr auto query(get_completion_domain_t<_SetTag>, _Env&&) const noexcept - -> __completion_domain_for_t, _Env> + -> __completion_domain_for_t<_SetTag, _Env> { return {}; } @@ -408,12 +411,12 @@ private: public: using sender_concept = sender_t; - _CCCL_NODEBUG_API constexpr auto get_env() const noexcept -> __attrs_t + _CCCL_API constexpr auto get_env() const noexcept -> __attrs_t { return {__sndr_}; } - _CCCL_NO_UNIQUE_ADDRESS on_t __tag_; + /*_CCCL_NO_UNIQUE_ADDRESS*/ on_t __tag_; _Sch __sch_; _Sndr __sndr_; }; @@ -424,12 +427,12 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT on_t::__sndr_t<_Sch, _Sndr, _Closure> { using sender_concept = sender_t; - _CCCL_NODEBUG_API constexpr auto get_env() const noexcept -> __fwd_env_t> + _CCCL_API constexpr auto get_env() const noexcept -> __fwd_env_t> { return __fwd_env(execution::get_env(__sndr_)); } - _CCCL_NO_UNIQUE_ADDRESS on_t __tag_; + /*_CCCL_NO_UNIQUE_ADDRESS*/ on_t __tag_; __closure_t<_Sch, _Closure> __sch_closure_; _Sndr __sndr_; }; @@ -442,8 +445,8 @@ inline constexpr size_t structured_binding_size> = 3 template inline constexpr size_t structured_binding_size> = 3; -template -inline constexpr size_t structured_binding_size> = 3; +template +inline constexpr size_t structured_binding_size> = 3; } // namespace cuda::experimental::execution diff --git a/cudax/include/cuda/experimental/__execution/prologue.cuh b/cudax/include/cuda/experimental/__execution/prologue.cuh index 437f0d26a43..482926303e9 100644 --- a/cudax/include/cuda/experimental/__execution/prologue.cuh +++ b/cudax/include/cuda/experimental/__execution/prologue.cuh @@ -24,6 +24,7 @@ _CCCL_DIAG_PUSH _CCCL_DIAG_SUPPRESS_GCC("-Wsubobject-linkage") _CCCL_DIAG_SUPPRESS_CLANG("-Wunused-value") _CCCL_DIAG_SUPPRESS_MSVC(4848) // [[no_unique_address]] prior to C++20 as a vendor extension +_CCCL_DIAG_SUPPRESS_MSVC(4714) // function 'foo' marked as __forceinline not inlined _CCCL_DIAG_SUPPRESS_GCC("-Wmissing-braces") _CCCL_DIAG_SUPPRESS_CLANG("-Wmissing-braces") diff --git a/cudax/include/cuda/experimental/__execution/queries.cuh b/cudax/include/cuda/experimental/__execution/queries.cuh index dba8db1a767..747ecfb8b85 100644 --- a/cudax/include/cuda/experimental/__execution/queries.cuh +++ b/cudax/include/cuda/experimental/__execution/queries.cuh @@ -59,7 +59,7 @@ _CCCL_GLOBAL_CONSTANT struct get_allocator_t return __query_or(__env, *this, ::cuda::std::allocator{}); } - [[nodiscard]] _CCCL_NODEBUG_API static constexpr auto query(forwarding_query_t) noexcept -> bool + [[nodiscard]] _CCCL_API static constexpr auto query(forwarding_query_t) noexcept -> bool { return true; } @@ -78,7 +78,7 @@ _CCCL_GLOBAL_CONSTANT struct get_stop_token_t return __query_or(__env, *this, never_stop_token{}); } - [[nodiscard]] _CCCL_NODEBUG_API static constexpr auto query(forwarding_query_t) noexcept -> bool + [[nodiscard]] _CCCL_API static constexpr auto query(forwarding_query_t) noexcept -> bool { return true; } @@ -99,33 +99,12 @@ _CCCL_GLOBAL_CONSTANT struct get_scheduler_t return __env.query(*this); } - [[nodiscard]] _CCCL_NODEBUG_API static constexpr auto query(forwarding_query_t) noexcept -> bool + [[nodiscard]] _CCCL_API static constexpr auto query(forwarding_query_t) noexcept -> bool { return true; } } get_scheduler{}; -////////////////////////////////////////////////////////////////////////////////////////// -// get_previous_scheduler -_CCCL_GLOBAL_CONSTANT struct get_previous_scheduler_t -{ - _CCCL_EXEC_CHECK_DISABLE - _CCCL_TEMPLATE(class _Env) - _CCCL_REQUIRES(__queryable_with<_Env, get_previous_scheduler_t>) - [[nodiscard]] _CCCL_API constexpr auto operator()(const _Env& __env) const noexcept - -> __query_result_t<_Env, get_previous_scheduler_t> - { - static_assert(noexcept(__env.query(*this))); - static_assert(__is_scheduler<__query_result_t<_Env, get_previous_scheduler_t>>); - return __env.query(*this); - } - - [[nodiscard]] _CCCL_TRIVIAL_API static constexpr auto query(forwarding_query_t) noexcept -> bool - { - return true; - } -} get_previous_scheduler{}; - ////////////////////////////////////////////////////////////////////////////////////////// // get_delegation_scheduler _CCCL_GLOBAL_CONSTANT struct get_delegation_scheduler_t @@ -141,7 +120,7 @@ _CCCL_GLOBAL_CONSTANT struct get_delegation_scheduler_t return __env.query(*this); } - [[nodiscard]] _CCCL_NODEBUG_API static constexpr auto query(forwarding_query_t) noexcept -> bool + [[nodiscard]] _CCCL_API static constexpr auto query(forwarding_query_t) noexcept -> bool { return true; } @@ -221,11 +200,14 @@ private: return _Self{}(__read_query_t{}(__sch, __env...), __env...); } } - else if constexpr (__callable<__read_query_t, env_of_t>, const _Env&...>) + else { - _CCCL_ASSERT(__sch == __read_query_t{}(get_env(__sch.schedule()), __env...), - "the scheduler's sender must have a completion scheduler attribute equal to the scheduler that " - "provided it."); + if constexpr (__callable<__read_query_t, env_of_t>, const _Env&...>) + { + _CCCL_ASSERT(__sch == __read_query_t{}(get_env(__sch.schedule()), __env...), + "the scheduler's sender must have a completion scheduler attribute equal to the scheduler that " + "provided it."); + } return __sch; } } @@ -286,7 +268,7 @@ public: } } - [[nodiscard]] _CCCL_NODEBUG_API static constexpr auto query(forwarding_query_t) noexcept -> bool + [[nodiscard]] _CCCL_API static constexpr auto query(forwarding_query_t) noexcept -> bool { return true; } @@ -317,7 +299,7 @@ _CCCL_GLOBAL_CONSTANT struct get_forward_progress_guarantee_t return __query_or(__sch, *this, forward_progress_guarantee::weakly_parallel); } - [[nodiscard]] _CCCL_NODEBUG_API static constexpr auto query(forwarding_query_t) noexcept -> bool + [[nodiscard]] _CCCL_API static constexpr auto query(forwarding_query_t) noexcept -> bool { return true; } @@ -335,21 +317,23 @@ struct __single_threaded_config_t : __single_threaded_config_base_t {} }; +_CCCL_GLOBAL_CONSTANT __single_threaded_config_t __single_threaded_config{}; + ////////////////////////////////////////////////////////////////////////////////////////// // get_launch_config: A sender can define this attribute to control the launch configuration // of the kernel it will launch when executed on a CUDA stream scheduler. _CCCL_GLOBAL_CONSTANT struct get_launch_config_t { template - [[nodiscard]] _CCCL_HOST_API constexpr auto operator()(const _Env& __env) const noexcept + [[nodiscard]] _CCCL_API constexpr auto operator()(const _Env& __env) const noexcept -> __query_result_or_t<_Env, get_launch_config_t, __single_threaded_config_t> { static_assert(__nothrow_queryable_with_or<_Env, get_launch_config_t, true>, "The get_launch_config query must be noexcept."); - return __query_or(__env, *this, __single_threaded_config_t{}); + return __query_or(__env, *this, __single_threaded_config); } - [[nodiscard]] _CCCL_NODEBUG_API static constexpr auto query(forwarding_query_t) noexcept -> bool + [[nodiscard]] _CCCL_API static constexpr auto query(forwarding_query_t) noexcept -> bool { return true; } diff --git a/cudax/include/cuda/experimental/__execution/rcvr_ref.cuh b/cudax/include/cuda/experimental/__execution/rcvr_ref.cuh index f31a0620e51..940e2130859 100644 --- a/cudax/include/cuda/experimental/__execution/rcvr_ref.cuh +++ b/cudax/include/cuda/experimental/__execution/rcvr_ref.cuh @@ -45,23 +45,23 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __rcvr_ref {} template - _CCCL_NODEBUG_API constexpr void set_value(_As&&... __as) noexcept + _CCCL_API constexpr void set_value(_As&&... __as) noexcept { execution::set_value(static_cast<_Rcvr&&>(*__rcvr_), static_cast<_As&&>(__as)...); } template - _CCCL_NODEBUG_API constexpr void set_error(_Error&& __err) noexcept + _CCCL_API constexpr void set_error(_Error&& __err) noexcept { execution::set_error(static_cast<_Rcvr&&>(*__rcvr_), static_cast<_Error&&>(__err)); } - _CCCL_NODEBUG_API constexpr void set_stopped() noexcept + _CCCL_API constexpr void set_stopped() noexcept { execution::set_stopped(static_cast<_Rcvr&&>(*__rcvr_)); } - [[nodiscard]] _CCCL_NODEBUG_API constexpr auto get_env() const noexcept -> env_of_t<_Rcvr> + [[nodiscard]] _CCCL_API constexpr auto get_env() const noexcept -> env_of_t<_Rcvr> { return execution::get_env(*__rcvr_); } @@ -77,7 +77,7 @@ private: // 2. If the receiver is nothrow copy constructible, return it. // 3. Otherwise, return a __rcvr_ref wrapping the receiver. template -[[nodiscard]] _CCCL_NODEBUG_API constexpr auto __ref_rcvr(_Rcvr& __rcvr) noexcept +[[nodiscard]] _CCCL_API constexpr auto __ref_rcvr(_Rcvr& __rcvr) noexcept { if constexpr (__is_specialization_of_v<_Rcvr, __rcvr_ref>) { diff --git a/cudax/include/cuda/experimental/__execution/rcvr_with_env.cuh b/cudax/include/cuda/experimental/__execution/rcvr_with_env.cuh index fb7a22191a6..6c80e6aa185 100644 --- a/cudax/include/cuda/experimental/__execution/rcvr_with_env.cuh +++ b/cudax/include/cuda/experimental/__execution/rcvr_with_env.cuh @@ -41,7 +41,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __rcvr_with_env_t : _Rcvr _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _Query, class... _Args) _CCCL_REQUIRES(__queryable_with<_Env, _Query, _Args...>) - [[nodiscard]] _CCCL_NODEBUG_API constexpr auto query(_Query, _Args&&... __args) const + [[nodiscard]] _CCCL_API constexpr auto query(_Query, _Args&&... __args) const noexcept(__nothrow_queryable_with<_Env, _Query, _Args...>) -> __query_result_t<_Env, _Query, _Args...> { return __rcvr_->__env_.query(_Query{}, static_cast<_Args&&>(__args)...); @@ -53,7 +53,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __rcvr_with_env_t : _Rcvr _CCCL_TEMPLATE(class _Query, class... _Args) _CCCL_REQUIRES((!__queryable_with<_Env, _Query, _Args...>) _CCCL_AND __forwarding_query<_Query> _CCCL_AND __queryable_with, _Query, _Args...>) - [[nodiscard]] _CCCL_NODEBUG_API constexpr auto query(_Query, _Args&&... __args) const + [[nodiscard]] _CCCL_API constexpr auto query(_Query, _Args&&... __args) const noexcept(__nothrow_queryable_with, _Query, _Args...>) -> __query_result_t, _Query, _Args...> { @@ -67,22 +67,22 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __rcvr_with_env_t : _Rcvr __rcvr_with_env_t const* __rcvr_; }; - [[nodiscard]] _CCCL_NODEBUG_API auto __base() && noexcept -> _Rcvr&& + [[nodiscard]] _CCCL_API auto __base() && noexcept -> _Rcvr&& { return static_cast<_Rcvr&&>(*this); } - [[nodiscard]] _CCCL_NODEBUG_API auto __base() & noexcept -> _Rcvr& + [[nodiscard]] _CCCL_API auto __base() & noexcept -> _Rcvr& { return *this; } - [[nodiscard]] _CCCL_NODEBUG_API auto __base() const& noexcept -> _Rcvr const& + [[nodiscard]] _CCCL_API auto __base() const& noexcept -> _Rcvr const& { return *this; } - [[nodiscard]] _CCCL_NODEBUG_API constexpr auto get_env() const noexcept -> __env_t + [[nodiscard]] _CCCL_API constexpr auto get_env() const noexcept -> __env_t { return __env_t{this}; } diff --git a/cudax/include/cuda/experimental/__execution/read_env.cuh b/cudax/include/cuda/experimental/__execution/read_env.cuh index 8f9a1151ecd..6d6c39df4e2 100644 --- a/cudax/include/cuda/experimental/__execution/read_env.cuh +++ b/cudax/include/cuda/experimental/__execution/read_env.cuh @@ -102,7 +102,7 @@ public: /// invokes the query with the receiver's environment and forwards the result /// to the receiver's `set_value` member. template - _CCCL_NODEBUG_API constexpr __sndr_t<_Query> operator()(_Query) const noexcept; + _CCCL_API constexpr __sndr_t<_Query> operator()(_Query) const noexcept; }; template @@ -147,12 +147,12 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT read_env_t::__sndr_t return __attrs_t{}; } - _CCCL_NO_UNIQUE_ADDRESS read_env_t __tag; - _CCCL_NO_UNIQUE_ADDRESS _Query __query; + /*_CCCL_NO_UNIQUE_ADDRESS*/ read_env_t __tag; + /*_CCCL_NO_UNIQUE_ADDRESS*/ _Query __query; }; template -_CCCL_NODEBUG_API constexpr read_env_t::__sndr_t<_Query> read_env_t::operator()(_Query __query) const noexcept +_CCCL_API constexpr read_env_t::__sndr_t<_Query> read_env_t::operator()(_Query __query) const noexcept { return __sndr_t<_Query>{{}, __query}; } diff --git a/cudax/include/cuda/experimental/__execution/run_loop.cuh b/cudax/include/cuda/experimental/__execution/run_loop.cuh index e640dcfdc51..8e02dcc76e0 100644 --- a/cudax/include/cuda/experimental/__execution/run_loop.cuh +++ b/cudax/include/cuda/experimental/__execution/run_loop.cuh @@ -70,7 +70,7 @@ public: using __execute_fn_t _CCCL_NODEBUG_ALIAS = void(__task*) noexcept; _CCCL_HIDE_FROM_ABI __task() = default; - _CCCL_NODEBUG_API explicit __task(__execute_fn_t* __execute_fn) noexcept + _CCCL_API explicit __task(__execute_fn_t* __execute_fn) noexcept : __execute_fn_(__execute_fn) {} @@ -250,14 +250,24 @@ public: } private: - _CCCL_NO_UNIQUE_ADDRESS _Env __env_; + /*_CCCL_NO_UNIQUE_ADDRESS*/ _Env __env_; }; +// A run_loop with an empty environment. This is a struct instead of a type alias to give +// it a simpler type name that is easier to read in diagnostics. struct _CCCL_TYPE_VISIBILITY_DEFAULT run_loop : basic_run_loop> { + struct _CCCL_TYPE_VISIBILITY_DEFAULT scheduler : basic_run_loop::scheduler + {}; + _CCCL_API constexpr run_loop() noexcept : basic_run_loop>{env{}} {} + + [[nodiscard]] _CCCL_API constexpr auto get_scheduler() noexcept -> scheduler + { + return scheduler{basic_run_loop::get_scheduler()}; + } }; template diff --git a/cudax/include/cuda/experimental/__execution/sequence.cuh b/cudax/include/cuda/experimental/__execution/sequence.cuh index a078fc0e09b..f8b5952f747 100644 --- a/cudax/include/cuda/experimental/__execution/sequence.cuh +++ b/cudax/include/cuda/experimental/__execution/sequence.cuh @@ -96,36 +96,28 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT sequence_t { _CUDAX_SEMI_PRIVATE : template - using __env2_t = __detail::__seq_env_next_t<_Attrs, _Env...>; + using __env2_t = __join_env_t<__detail::__seq_env_next_t<_Attrs, __fwd_env_t<_Env>...>, __fwd_env_t<_Env>...>; - template - struct _CCCL_TYPE_VISIBILITY_DEFAULT __state_base_t + template + [[nodiscard]] _CCCL_API static constexpr auto __mk_env2(const _Attrs& __attrs, const _Env&... __env) noexcept + -> __env2_t<_Attrs, _Env...> { - __rcvr_with_env_t<_Rcvr, _Env2> __rcvr2_; - void (*__start2_fn_)(__state_base_t*) noexcept; - }; + return __join_env(__detail::__mk_seq_env_next(__attrs, __fwd_env(__env)...), __fwd_env(__env)...); + } template - struct _CCCL_TYPE_VISIBILITY_DEFAULT __state_t : __state_base_t<_Rcvr, _Env2> + struct _CCCL_TYPE_VISIBILITY_DEFAULT __state_t { - // TODO: __rcvr2_'s env should be wrapped in __fwd_env_t before it is used to - // connect __sndr2. _CCCL_API constexpr explicit __state_t(_Rcvr&& __rcvr, _Env2 __env, _Sndr2&& __sndr2) - : __state_base_t<_Rcvr, _Env2>{{static_cast<_Rcvr&&>(__rcvr), __env}, &__start2_fn} - , __opstate2_(execution::connect(static_cast<_Sndr2&&>(__sndr2), __ref_rcvr(this->__rcvr2_))) + : __rcvr2_{static_cast<_Rcvr&&>(__rcvr), __env} + , __opstate2_(execution::connect(static_cast<_Sndr2&&>(__sndr2), __ref_rcvr(__rcvr2_))) {} - _CCCL_API static constexpr void __start2_fn(__state_base_t<_Rcvr, _Env2>* __base) noexcept - { - auto* __state = static_cast<__state_t*>(__base); - execution::start(__state->__opstate2_); - } - - private: + __rcvr_with_env_t<_Rcvr, _Env2> __rcvr2_; connect_result_t<_Sndr2, __rcvr_ref_t<__rcvr_with_env_t<_Rcvr, _Env2>>> __opstate2_; }; - template + template struct _CCCL_TYPE_VISIBILITY_DEFAULT __rcvr_t { using receiver_concept = receiver_t; @@ -133,7 +125,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT sequence_t template _CCCL_API constexpr void set_value(_Values&&...) noexcept { - __state_->__start2_fn_(__state_); + execution::start(__state_->__opstate2_); } template @@ -152,19 +144,19 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT sequence_t return __fwd_env(execution::get_env(__state_->__rcvr2_.__base())); } - __state_base_t<_Rcvr, _Env2>* __state_; + __state_t<_Rcvr, _Env2, _Sndr2>* __state_; }; template struct _CCCL_TYPE_VISIBILITY_DEFAULT __opstate_t { using operation_state_concept = operation_state_t; - using __env2_t _CCCL_NODEBUG_ALIAS = sequence_t::__env2_t, env_of_t<_Rcvr>>; + using __env2_t _CCCL_NODEBUG_ALIAS = __detail::__seq_env_next_t, env_of_t<_Rcvr>>; // The moves from lvalues here is intentional: _CCCL_API constexpr __opstate_t(_Sndr1& __sndr1, _Sndr2& __sndr2, _Rcvr& __rcvr, __env2_t __env2) : __state_(static_cast<_Rcvr&&>(__rcvr), static_cast<__env2_t&&>(__env2), static_cast<_Sndr2&&>(__sndr2)) - , __opstate1_(execution::connect(static_cast<_Sndr1&&>(__sndr1), __rcvr_t<_Rcvr, __env2_t>{&__state_})) + , __opstate1_(execution::connect(static_cast<_Sndr1&&>(__sndr1), __rcvr_t<_Rcvr, __env2_t, _Sndr2>{&__state_})) {} _CCCL_API constexpr __opstate_t(_Sndr1&& __sndr1, _Sndr2&& __sndr2, _Rcvr&& __rcvr) @@ -182,76 +174,30 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT sequence_t private: __state_t<_Rcvr, __env2_t, _Sndr2> __state_; - connect_result_t<_Sndr1, __rcvr_t<_Rcvr, __env2_t>> __opstate1_; + connect_result_t<_Sndr1, __rcvr_t<_Rcvr, __env2_t, _Sndr2>> __opstate1_; }; - template - [[nodiscard]] _CCCL_API constexpr auto - __get_completion_scheduler(const _Attrs1& __attrs1, const _Sndr2& __sndr2, _Env&&... __env) noexcept - { - using __env2_t _CCCL_NODEBUG_ALIAS = sequence_t::__env2_t<_Attrs1, _Env...>; - - if constexpr (__callable, env_of_t<_Sndr2>, __join_env_t<__env2_t, _Env...>>) - { - // If the second sender has a completion scheduler for the given tag, use it. - auto __env2 = __detail::__mk_seq_env_next(__attrs1, __env...); - return get_completion_scheduler<_SetTag>( - get_env(__sndr2), __join_env(static_cast<__env2_t&&>(__env2), static_cast<_Env&&>(__env)...)); - } - else if constexpr (get_completion_signatures<_Sndr2, __join_env_t<__env2_t, _Env...>>().count(_SetTag{}) == 0) - { - // If the second sender does not have any _SetTag completions, use the first sender's - // completion scheduler, if it has one: - if constexpr (__callable, _Attrs1, __fwd_env_t<_Env>...>) - { - return get_completion_scheduler<_SetTag>(__attrs1, __fwd_env(static_cast<_Env&&>(__env))...); - } - } - } - - template - [[nodiscard]] _CCCL_API constexpr auto - __get_completion_domain(const _Attrs1& __attrs1, const _Sndr2& __sndr2, _Env&&... __env) noexcept - { - using __env2_t _CCCL_NODEBUG_ALIAS = sequence_t::__env2_t<_Attrs1, _Env...>; - - if constexpr (__callable, env_of_t<_Sndr2>, __join_env_t<__env2_t, _Env...>>) - { - // If the second sender has a completion domain for the given tag, use it. - return __call_result_t, env_of_t<_Sndr2>, __join_env_t<__env2_t, _Env...>>{}; - } - else if constexpr (get_completion_signatures<_Sndr2, __join_env_t<__env2_t, _Env...>>().count(_SetTag{}) == 0) - { - // If the second sender does not have any _SetTag completions, use the first - // sender's completion domain, if it has one: - if constexpr (__callable, _Attrs1, __fwd_env_t<_Env>...>) - { - return __call_result_t, _Attrs1, __fwd_env_t<_Env>...>{}; - } - } - } - public: template struct _CCCL_TYPE_VISIBILITY_DEFAULT __sndr_t; template - _CCCL_NODEBUG_API constexpr auto operator()(_Sndr1 __sndr1, _Sndr2 __sndr2) const; + _CCCL_API constexpr auto operator()(_Sndr1 __sndr1, _Sndr2 __sndr2) const; }; template struct _CCCL_TYPE_VISIBILITY_DEFAULT sequence_t::__sndr_t { - using sender_concept = sender_t; - using __env2_t _CCCL_NODEBUG_ALIAS = sequence_t::__env2_t>; + using sender_concept = sender_t; + template + using __env2_t _CCCL_NODEBUG_ALIAS = sequence_t::__env2_t, _Env...>; template [[nodiscard]] _CCCL_API static _CCCL_CONSTEVAL auto get_completion_signatures() { _CUDAX_LET_COMPLETIONS(auto(__completions1) = get_child_completion_signatures<_Self, _Sndr1, _Env...>()) { - _CUDAX_LET_COMPLETIONS( - auto(__completions2) = get_child_completion_signatures<_Self, _Sndr2, __join_env_t<__env2_t, _Env...>>()) + _CUDAX_LET_COMPLETIONS(auto(__completions2) = get_child_completion_signatures<_Self, _Sndr2, __env2_t<_Env...>>()) { // __swallow_transform to ignore the first sender's value completions return __completions2 + transform_completion_signatures(__completions1, __swallow_transform{}); @@ -279,45 +225,40 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT sequence_t::__sndr_t struct __attrs_t { - // If the second sender does not have any _SetTag completions, we can look at the - // first sender for a completion scheduler. - _CCCL_TEMPLATE(class _SetTag, class... _Env, class _Env2 = sequence_t::__env2_t, _Env...>) - _CCCL_REQUIRES((!__has_completions_for<_SetTag, _Sndr2, __join_env_t<_Env2, _Env...>>) ) - [[nodiscard]] _CCCL_API constexpr auto query(get_completion_scheduler_t<_SetTag>, _Env&&... __env) const noexcept - -> __call_result_t, env_of_t<_Sndr1>, __fwd_env_t<_Env>...> - { - return get_completion_scheduler<_SetTag>( - execution::get_env(__self_->__sndr1_), __fwd_env(static_cast<_Env&&>(__env))...); - } - - // If the second sender does not have any _SetTag completions, we can look at the - // first sender for a completion domain. - _CCCL_TEMPLATE(class _SetTag, class... _Env, class _Env2 = sequence_t::__env2_t, _Env...>) - _CCCL_REQUIRES((!__has_completions_for<_SetTag, _Sndr2, __join_env_t<_Env2, _Env...>>) ) - [[nodiscard]] _CCCL_API constexpr auto query(get_completion_domain_t<_SetTag>, _Env&&... __env) const noexcept - -> decay_t<__call_result_t, env_of_t<_Sndr1>, __fwd_env_t<_Env>...>> - { - return {}; - } + // If _Sndr2 has _SetTag completions but does not know its _SetTag completion scheduler, + // then we cannot know it either. Delete the function to prevent its use. + _CCCL_TEMPLATE(class _SetTag, class... _Env) + _CCCL_REQUIRES(__has_completions_for<_SetTag, _Sndr2, __env2_t<_Env...>> _CCCL_AND( + !__callable, env_of_t<_Sndr2>, __env2_t<_Env...>>)) + _CCCL_API auto query(get_completion_scheduler_t<_SetTag>, const _Env&...) const = delete; + + // If _Sndr2 has _SetTag completions but does not know its _SetTag completion domain, + // then we cannot know it either. Delete the function to prevent its use. + _CCCL_TEMPLATE(class _SetTag, class... _Env) + _CCCL_REQUIRES(__has_completions_for<_SetTag, _Sndr2, __env2_t<_Env...>> _CCCL_AND( + !__callable, env_of_t<_Sndr2>, __env2_t<_Env...>>)) + _CCCL_API auto query(get_completion_domain_t<_SetTag>, const _Env&...) const = delete; template [[nodiscard]] _CCCL_API constexpr auto query(get_completion_behavior_t, const _Env&...) const noexcept { - using __env2_t _CCCL_NODEBUG_ALIAS = sequence_t::__env2_t, _Env...>; return (execution::min) (execution::get_completion_behavior<_Sndr1, __fwd_env_t<_Env>...>(), - execution::get_completion_behavior<_Sndr2, __join_env_t<__env2_t, _Env...>>()); + execution::get_completion_behavior<_Sndr2, __env2_t<_Env...>>()); } + using __child_attrs_t = __join_env_t, env_of_t<_Sndr1>>; + // The following overload will not be considered when _Query is get_domain_override_t // because get_domain_override_t is not a forwarding query. _CCCL_EXEC_CHECK_DISABLE _CCCL_TEMPLATE(class _Query, class... _Args) - _CCCL_REQUIRES(__forwarding_query<_Query> _CCCL_AND __queryable_with, _Query, _Args...>) + _CCCL_REQUIRES(__forwarding_query<_Query> _CCCL_AND __queryable_with<__child_attrs_t, _Query, _Args...>) [[nodiscard]] _CCCL_API constexpr auto query(_Query, _Args&&... __args) const - noexcept(__nothrow_queryable_with, _Query, _Args...>) - -> __query_result_t, _Query, _Args...> + noexcept(__nothrow_queryable_with<__child_attrs_t, _Query, _Args...>) + -> __query_result_t<__child_attrs_t, _Query, _Args...> { - return execution::get_env(__self_->__sndr2_).query(_Query{}, static_cast<_Args&&>(__args)...); + auto&& __env = __join_env(execution::get_env(__self_->__sndr2_), execution::get_env(__self_->__sndr1_)); + return __env.query(_Query{}, static_cast<_Args&&>(__args)...); } __sndr_t const* __self_; @@ -328,14 +269,14 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT sequence_t::__sndr_t return {this}; } - _CCCL_NO_UNIQUE_ADDRESS sequence_t __tag_; - _CCCL_NO_UNIQUE_ADDRESS ::cuda::std::__ignore_t __ign_; + /*_CCCL_NO_UNIQUE_ADDRESS*/ sequence_t __tag_; + /*_CCCL_NO_UNIQUE_ADDRESS*/ ::cuda::std::__ignore_t __ign_; _Sndr1 __sndr1_; _Sndr2 __sndr2_; }; template -_CCCL_NODEBUG_API constexpr auto sequence_t::operator()(_Sndr1 __sndr1, _Sndr2 __sndr2) const +_CCCL_API constexpr auto sequence_t::operator()(_Sndr1 __sndr1, _Sndr2 __sndr2) const { using __sndr_t _CCCL_NODEBUG_ALIAS = sequence_t::__sndr_t<_Sndr1, _Sndr2>; return __sndr_t{{}, {}, static_cast<_Sndr1&&>(__sndr1), static_cast<_Sndr2&&>(__sndr2)}; diff --git a/cudax/include/cuda/experimental/__execution/start_detached.cuh b/cudax/include/cuda/experimental/__execution/start_detached.cuh index 1d8b8ffd9fe..9797cc6e0f4 100644 --- a/cudax/include/cuda/experimental/__execution/start_detached.cuh +++ b/cudax/include/cuda/experimental/__execution/start_detached.cuh @@ -97,7 +97,7 @@ public: /// run detached. template - _CCCL_NODEBUG_API void operator()(_Sndr __sndr) const + _CCCL_API void operator()(_Sndr __sndr) const { using __domain_t _CCCL_NODEBUG_ALIAS = __completion_domain_of_t>; execution::apply_sender(__domain_t{}, *this, static_cast<_Sndr&&>(__sndr)); diff --git a/cudax/include/cuda/experimental/__execution/starts_on.cuh b/cudax/include/cuda/experimental/__execution/starts_on.cuh index 0a68918448d..6175a0bed2c 100644 --- a/cudax/include/cuda/experimental/__execution/starts_on.cuh +++ b/cudax/include/cuda/experimental/__execution/starts_on.cuh @@ -86,15 +86,15 @@ namespace cuda::experimental::execution struct starts_on_t { private: - template - using __env2_t = __join_env_t<__call_result_t<__mk_sch_env_t, _Sch, _Env...>, _Env...>; - template [[nodiscard]] _CCCL_API static constexpr auto __mk_env2(_Sch __sch, _Env&&... __env) { return __join_env(__mk_sch_env(__sch, __env...), static_cast<_Env&&>(__env)...); } + template + using __env2_t = decltype(__mk_env2(declval<_Sch>(), declval<_Env>()...)); + public: template struct _CCCL_TYPE_VISIBILITY_DEFAULT __sndr_t; @@ -107,7 +107,7 @@ public: } template - _CCCL_NODEBUG_API constexpr auto operator()(_Sch __sch, _Sndr __sndr) const; + _CCCL_API constexpr auto operator()(_Sch __sch, _Sndr __sndr) const; }; template @@ -129,7 +129,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT starts_on_t::__sndr_t struct _CCCL_TYPE_VISIBILITY_DEFAULT __attrs_t { // If the sender has a _SetTag completion, then the completion scheduler for _SetTag - // is the sender's if it has one. + // is the sender's. template [[nodiscard]] _CCCL_API constexpr auto query(get_completion_scheduler_t<_SetTag>, _Env&&... __env) const noexcept -> __call_result_t, env_of_t<_Sndr>, __env2_t<_Sch, _Env>...> @@ -138,19 +138,8 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT starts_on_t::__sndr_t execution::get_env(__self_->__sndr_), __mk_env2(__self_->__sch_, static_cast<_Env&&>(__env))...); } - // If the sender does not have a _SetTag completion (and _SetTag is not set_value_t), - // then the completion scheduler for _SetTag is the scheduler sender's if it has one. - _CCCL_TEMPLATE(class _SetTag, class... _Env) - _CCCL_REQUIRES((!__same_as<_SetTag, set_value_t>) _CCCL_AND( - execution::get_completion_signatures<_Sndr, __env2_t<_Sch, _Env>...>().count(_SetTag{}) == 0)) - [[nodiscard]] _CCCL_API constexpr auto query(get_completion_scheduler_t<_SetTag>, _Env&&... __env) const noexcept - -> __call_result_t, _Sch, __fwd_env_t<_Env>...> - { - return get_completion_scheduler<_SetTag>(__self_->__sch_, __fwd_env(static_cast<_Env&&>(__env))...); - } - // If the sender has a _SetTag completion, then the completion scheduler for _SetTag - // is the sender's if it has one. + // is the sender's. template [[nodiscard]] _CCCL_API constexpr auto query(get_completion_domain_t<_SetTag>, _Env&&... __env) const noexcept -> __call_result_t, env_of_t<_Sndr>, __env2_t<_Sch, _Env>...> @@ -158,17 +147,6 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT starts_on_t::__sndr_t return {}; } - // If the sender does not have a _SetTag completion (and _SetTag is not set_value_t), - // then the completion scheduler for _SetTag is the scheduler sender's if it has one. - _CCCL_TEMPLATE(class _SetTag, class... _Env) - _CCCL_REQUIRES((!__same_as<_SetTag, set_value_t>) _CCCL_AND( - execution::get_completion_signatures<_Sndr, __env2_t<_Sch, _Env>...>().count(_SetTag{}) == 0)) - [[nodiscard]] _CCCL_API constexpr auto query(get_completion_domain_t<_SetTag>, _Env&&...) const noexcept - -> __call_result_t, _Sch, __fwd_env_t<_Env>...> - { - return {}; - } - template [[nodiscard]] _CCCL_API constexpr auto query(get_completion_behavior_t, _Env&&...) const noexcept { @@ -212,13 +190,13 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT starts_on_t::__sndr_t return __attrs_t{this}; } - _CCCL_NO_UNIQUE_ADDRESS starts_on_t __tag_; + /*_CCCL_NO_UNIQUE_ADDRESS*/ starts_on_t __tag_; _Sch __sch_; _Sndr __sndr_; }; template -[[nodiscard]] _CCCL_NODEBUG_API constexpr auto starts_on_t::operator()(_Sch __sch, _Sndr __sndr) const +[[nodiscard]] _CCCL_API constexpr auto starts_on_t::operator()(_Sch __sch, _Sndr __sndr) const { return __sndr_t<_Sch, _Sndr>{{}, static_cast<_Sch&&>(__sch), static_cast<_Sndr&&>(__sndr)}; } diff --git a/cudax/include/cuda/experimental/__execution/stream/adaptor.cuh b/cudax/include/cuda/experimental/__execution/stream/adaptor.cuh index eeb3a86b7f4..6da0844fc06 100644 --- a/cudax/include/cuda/experimental/__execution/stream/adaptor.cuh +++ b/cudax/include/cuda/experimental/__execution/stream/adaptor.cuh @@ -68,28 +68,22 @@ namespace cuda::experimental::execution { namespace __stream { -template -struct __completion_fn +struct __complete_rcvr { - template - _CCCL_API void operator()(_Tag, _Args&&... __args) const noexcept + template + _CCCL_API void operator()(_Rcvr& __rcvr, _Tag, _Args&&... __args) const noexcept { - _Tag{}(static_cast<_Rcvr&&>(__rcvr_), static_cast<_Args&&>(__args)...); + _Tag{}(static_cast<_Rcvr&&>(__rcvr), static_cast<_Args&&>(__args)...); } - - _Rcvr& __rcvr_; }; -template -struct __results_visitor +struct __visit_results { - template - _CCCL_API void operator()(_Tuple&& __tuple) const noexcept + template + _CCCL_API void operator()(_Rcvr& __rcvr, _Tuple&& __tuple) const noexcept { - ::cuda::std::__apply(__completion_fn<_Rcvr>{__rcvr_}, static_cast<_Tuple&&>(__tuple)); + ::cuda::std::__apply(__complete_rcvr{}, static_cast<_Tuple&&>(__tuple), __rcvr); } - - _Rcvr& __rcvr_; }; // __state_t lives in managed memory. It stores everything the operation state needs, @@ -128,7 +122,7 @@ _CCCL_VISIBILITY_HIDDEN __launch_bounds__(_BlockThreads) __global__ void __completion_kernel(__state_base_t<_Rcvr, _Variant>* __state) { _CCCL_ASSERT(__state->__results_.__index() != __npos, "__completion_kernel called with empty results"); - _Variant::__visit(__results_visitor<_Rcvr>{__state->__rcvr_}, __state->__results_); + _Variant::__visit(__visit_results{}, __state->__results_, __state->__rcvr_); } // This is the environment of the inner receiver that is used to connect the child sender. @@ -156,7 +150,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __env_t } _Env __env_; - _CCCL_NO_UNIQUE_ADDRESS _Config __launch_config_; + /*_CCCL_NO_UNIQUE_ADDRESS*/ _Config __launch_config_; }; // This is the inner receiver that is used to connect the child sender. @@ -178,13 +172,13 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __rcvr_t } template - _CCCL_NODEBUG_API constexpr void set_value(_Args&&... __args) noexcept + _CCCL_API constexpr void set_value(_Args&&... __args) noexcept { __complete(execution::set_value, static_cast<_Args&&>(__args)...); } template - _CCCL_NODEBUG_API constexpr void set_error(_Error&& __err) noexcept + _CCCL_API constexpr void set_error(_Error&& __err) noexcept { // Map any exception_ptr error completions to cudaErrorUnknown: if constexpr (__same_as<::cuda::std::remove_cvref_t<_Error>, ::std::exception_ptr>) @@ -197,7 +191,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __rcvr_t } } - _CCCL_NODEBUG_API constexpr void set_stopped() noexcept + _CCCL_API constexpr void set_stopped() noexcept { __complete(execution::set_stopped); } @@ -227,14 +221,14 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __opstate_t _CCCL_API constexpr void start() noexcept { - NV_IF_TARGET(NV_IS_HOST, (__host_start();), (__device_start();)); + NV_IF_TARGET(NV_IS_HOST, ({ __host_start(); }), ({ __device_start(); })); } // This is called by the continues_on adaptor after it has sync'ed the stream. template - _CCCL_HOST_API auto __set_results(_Rcvr2& __rcvr) noexcept + _CCCL_API auto __set_results(_Rcvr2& __rcvr) noexcept { - __results_t::__visit(__results_visitor<_Rcvr2&>{__rcvr}, __get_state().__state_.__results_); + __results_t::__visit(__visit_results{}, __get_state().__state_.__results_, __rcvr); } private: @@ -283,11 +277,6 @@ private: // of this operation. execution::start(__state.__opstate_); - // printf("Launching completion kernel for %s with %d block threads and %d grid blocks\n", - // __name, - // __block_threads, - // __grid_blocks); - // launch a kernel to pass the results to the receiver. __completion_kernel<__block_threads><<<__grid_blocks, __block_threads, 0, __stream_.get()>>>(&__state.__state_); @@ -315,7 +304,7 @@ private: // This is the part of the operation state that is stored in managed memory. struct __state_t { - _CCCL_HOST_API constexpr explicit __state_t(_CvSndr&& __sndr, _Rcvr __rcvr) + _CCCL_API constexpr explicit __state_t(_CvSndr&& __sndr, _Rcvr __rcvr) : __state_{{static_cast<_Rcvr&&>(__rcvr), {}, false}, get_launch_config(execution::get_env(__sndr))} , __opstate_(execution::connect(static_cast<_CvSndr&&>(__sndr), __rcvr_t{&__state_})) {} @@ -360,12 +349,13 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __attrs_t // sender adaptor, like `then` or `let_value`. A stream sender adaptor is an // implementation detail that is not visible to the user. It should be as transparent as // possible. - _CCCL_TEMPLATE(class _Query) - _CCCL_REQUIRES(__queryable_with, _Query>) - [[nodiscard]] _CCCL_API constexpr auto query(_Query) const noexcept(__nothrow_queryable_with, _Query>) - -> __query_result_t, _Query> + _CCCL_TEMPLATE(class _Query, class... _Args) + _CCCL_REQUIRES(__queryable_with, _Query, _Args...>) + [[nodiscard]] _CCCL_API constexpr auto query(_Query, _Args&&... __args) const + noexcept(__nothrow_queryable_with, _Query, _Args...>) + -> __query_result_t, _Query, _Args...> { - return execution::get_env(__sndr_.__sndr_).query(_Query{}); + return execution::get_env(__sndr_.__sndr_).query(_Query{}, static_cast<_Args&&>(__args)...); } const __sndr_t<_Sndr, _GetStream>& __sndr_; @@ -407,8 +397,8 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __sndr_t return __attrs_t<_Sndr, _GetStream>{*this}; } - _CCCL_NO_UNIQUE_ADDRESS __tag_t<__stream::__tag_of_t<_Sndr>> __tag_; - _CCCL_NO_UNIQUE_ADDRESS _GetStream __get_stream_; + /*_CCCL_NO_UNIQUE_ADDRESS*/ __tag_t<__stream::__tag_of_t<_Sndr>> __tag_; + /*_CCCL_NO_UNIQUE_ADDRESS*/ _GetStream __get_stream_; _Sndr __sndr_; }; diff --git a/cudax/include/cuda/experimental/__execution/stream/bulk.cuh b/cudax/include/cuda/experimental/__execution/stream/bulk.cuh index 24beab3ab63..551b1226c33 100644 --- a/cudax/include/cuda/experimental/__execution/stream/bulk.cuh +++ b/cudax/include/cuda/experimental/__execution/stream/bulk.cuh @@ -89,7 +89,7 @@ struct __bulk_chunked_t : execution::__bulk_t<__bulk_chunked_t> // domain argument of stream_domain. It adapts a `bulk_chunked` sender to the stream // domain. template - _CCCL_API constexpr auto operator()(_Sndr&& __sndr, ::cuda::std::__ignore_t) const + _CCCL_API constexpr auto operator()(set_value_t, _Sndr&& __sndr, ::cuda::std::__ignore_t) const { // Decompose the bulk sender into its components: auto& [__tag, __state, __child] = __sndr; @@ -124,9 +124,10 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __bulk_unchunked_t : execution::__bulk_t<__ { const _Shape __tid = threadIdx.x + blockIdx.x * blockDim.x; - if (__tid < this->__shape_) + // Each thread processes exactly one element, if it is in range. + if (__tid < this->__state_->__shape_) { - this->__fn_(_Shape(__tid), __values...); + this->__state_->__fn_(_Shape(__tid), __values...); } __syncthreads(); @@ -135,7 +136,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __bulk_unchunked_t : execution::__bulk_t<__ // elements. if (__tid == 0) { - execution::set_value(static_cast<_Rcvr&&>(this->__rcvr_), static_cast<_Values&&>(__values)...); + execution::set_value(static_cast<_Rcvr&&>(this->__state_->__rcvr_), static_cast<_Values&&>(__values)...); } } }; @@ -152,7 +153,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __bulk_unchunked_t : execution::__bulk_t<__ // domain argument of stream_domain. It adapts a `bulk_unchunked` sender to the stream // domain. template - _CCCL_API constexpr auto operator()(_Sndr&& __sndr, ::cuda::std::__ignore_t) const + _CCCL_API constexpr auto operator()(set_value_t, _Sndr&& __sndr, ::cuda::std::__ignore_t) const { // Decompose the bulk sender into its components: auto& [__tag, __state, __child] = __sndr; @@ -186,11 +187,11 @@ struct __bulk_t : execution::__bulk_t<__bulk_t> {}; template - _CCCL_API constexpr auto operator()(_Sndr&& __sndr, ::cuda::std::__ignore_t) const -> decltype(auto) + _CCCL_API constexpr auto operator()(set_value_t, _Sndr&& __sndr, ::cuda::std::__ignore_t) const -> decltype(auto) { // This converts a bulk sender into a bulk_chunked sender, which will then be // further transformed by __bulk_chunked_t above. - return bulk.transform_sender(static_cast<_Sndr&&>(__sndr), env{}); + return bulk.transform_sender(set_value, static_cast<_Sndr&&>(__sndr), env{}); } }; diff --git a/cudax/include/cuda/experimental/__execution/stream/context.cuh b/cudax/include/cuda/experimental/__execution/stream/context.cuh index 35af625124e..04eb2e7ca33 100644 --- a/cudax/include/cuda/experimental/__execution/stream/context.cuh +++ b/cudax/include/cuda/experimental/__execution/stream/context.cuh @@ -45,12 +45,12 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT stream_context : private __immovable __stream_.sync(); } - [[nodiscard]] _CCCL_HOST_API constexpr auto query(get_stream_t) const noexcept -> stream_ref + [[nodiscard]] _CCCL_API constexpr auto query(get_stream_t) const noexcept -> stream_ref { return __stream_; } - [[nodiscard]] _CCCL_NODEBUG_HOST_API auto get_scheduler() noexcept -> stream_scheduler + [[nodiscard]] _CCCL_API auto get_scheduler() noexcept -> stream_scheduler { return stream_scheduler{__stream_}; } diff --git a/cudax/include/cuda/experimental/__execution/stream/continues_on.cuh b/cudax/include/cuda/experimental/__execution/stream/continues_on.cuh index 43671359328..06bf7eff32a 100644 --- a/cudax/include/cuda/experimental/__execution/stream/continues_on.cuh +++ b/cudax/include/cuda/experimental/__execution/stream/continues_on.cuh @@ -45,6 +45,25 @@ namespace cuda::experimental::execution { namespace __stream { +//! The customization of continues_on, when transferring back to the CPU, involves +//! adapting the sender and receiver types. +//! +//! A continues_on sender such as continues_on(sndr, sch), where sndr completes on the GPU, +//! needs to synchronize the CUDA stream to ensure that all queued GPU work is finished. +//! Only then can the schedule operation be safely invoked -- from the CPU. +//! +//! To effect this, continues_on(sndr, sch) is transformed into +//! continues_on(SYNC-STREAM-ADAPTOR(sndr), sch), where SYNC-STREAM-ADAPTOR(sndr) is a +//! sender that does the following: +//! +//! 1. In connect (called on host): Connects sndr with a sink receiver that ignores values +//! passed to it and simply returns. The sink receiver's completion operations are +//! executed on device when the child sender completes. +//! +//! 2. In start (called on host): Starts the child sender, which launches kernels for the +//! predecessor operations, and then synchronizes the CUDA stream to ensure all queued +//! GPU work is finished. Then, it pulls the results from sndr's operation state and +//! passes them to the receiver on the host. struct __continues_on_t { // Transition from the GPU to the CPU domain @@ -92,9 +111,18 @@ struct __continues_on_t _CCCL_IMMOVABLE(__opstate_t); - _CCCL_HOST_API void start() noexcept + _CCCL_API void start() noexcept { + NV_IF_TARGET(NV_IS_HOST, ({ __host_start(); }), ({ __device_start(); })); + } + + _CCCL_HOST_API void __host_start() noexcept + { + // This launches all predecessor kernels on the given stream execution::start(__opstate_); + + // Synchronize the CUDA stream to make sure all predecessor work has completed, and + // the results are available in __opstate_. if (auto __status = ::cudaStreamSynchronize(__stream_.get()); __status != ::cudaSuccess) { execution::set_error(static_cast<_Rcvr&&>(__rcvr_), cudaError_t(__status)); @@ -102,19 +130,31 @@ struct __continues_on_t else { // __opstate_ is an instance of __stream::__opstate_t, and it has a __set_results - // member function that will pass the results to the receiver on the host. + // member function that will pass the results to the receiver on the host. __rcvr_ + // is the receiver of the parent default continues_on operation. That receiver + // will then start the schedule operation on the host. __opstate_.__set_results(__rcvr_); } } + [[noreturn]] _CCCL_DEVICE_API void __device_start() noexcept + { + _CCCL_ASSERT(false, "internal error: stream::continues_on opstate started on device"); + ::cuda::std::terminate(); + + // We do not want the following to be called, but we need these code paths to be + // instantiated. Without this, the __device_start function in stream/adaptor.cuh + // will not be instantiated, and the kernel launch in the adaptor's __host_start + // function will fail. + execution::start(__opstate_); + __opstate_.__set_results(__rcvr_); + } + _Rcvr __rcvr_; stream_ref __stream_; connect_result_t<_Sndr, __rcvr_t<_Rcvr>> __opstate_; }; - struct _CCCL_TYPE_VISIBILITY_DEFAULT __thunk_t - {}; - template struct _CCCL_TYPE_VISIBILITY_DEFAULT __sndr_t { @@ -143,24 +183,36 @@ struct __continues_on_t return execution::get_env(__sndr_); } - _CCCL_NO_UNIQUE_ADDRESS __thunk_t __tag_; - _CCCL_NO_UNIQUE_ADDRESS ::cuda::std::__ignore_t __ignore_; + // The use of __tag_t here instructs the stream_domain not to apply any further + // transformations to this sender. See stream/domain.cuh. + /*_CCCL_NO_UNIQUE_ADDRESS*/ __tag_t __tag_; + /*_CCCL_NO_UNIQUE_ADDRESS*/ ::cuda::std::__ignore_t __ignore_; _Sndr __sndr_; }; - // If the child sender has not already been adapted to be a stream sender, we adapt it - // now. - _CCCL_TEMPLATE(class _Sndr) - _CCCL_REQUIRES((!::cuda::__is_specialization_of_v>, __stream::__sndr_t>) ) + template + _CCCL_API static constexpr auto __mk_sndr(_Sndr&& __sndr) + { + return __sndr_t<_Sndr>{{}, {}, static_cast<_Sndr&&>(__sndr)}; + } + + // This function is called when a continues_on sender, with a predecessor that completes + // on the stream scheduler, is being connected. It wraps the child sender so that it + // synchronizes the stream after launching the child. + template [[nodiscard]] _CCCL_API auto operator()(set_value_t, _Sndr&& __sndr, ::cuda::std::__ignore_t) const { - auto& [__tag, __sched, __child] = __sndr; - using __child_t = ::cuda::std::__copy_cvref_t<_Sndr, decltype(__child)>; + [[maybe_unused]] auto& [__tag, __sched, __child] = __sndr; + using __child_t = ::cuda::std::__copy_cvref_t<_Sndr, decltype(__child)>; - auto __adapted_sndr = __stream::__adapt(static_cast<__child_t&&>(__child)); - using __adapted_sndr_t = decltype(__adapted_sndr); - return execution::continues_on( - __sched, __sndr_t<__adapted_sndr_t>{{}, {}, static_cast<__adapted_sndr_t&&>(__adapted_sndr)}); + if constexpr (::cuda::__is_specialization_of_v) + { + return static_cast<_Sndr&&>(__sndr); + } + else + { + return execution::continues_on(__mk_sndr(static_cast<__child_t&&>(__child)), __sched); + } } }; } // namespace __stream diff --git a/cudax/include/cuda/experimental/__execution/stream/domain.cuh b/cudax/include/cuda/experimental/__execution/stream/domain.cuh index fbbae7ffd79..c67440b0bba 100644 --- a/cudax/include/cuda/experimental/__execution/stream/domain.cuh +++ b/cudax/include/cuda/experimental/__execution/stream/domain.cuh @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -105,11 +106,8 @@ struct stream_domain struct __apply_adapt_t { // This is the default apply function that adapts a sender to a stream sender. - // The constraint prevents this function from applying an adaptor to a sender - // that has already been adapted. The __stream::__adapted_t query is present - // only on receivers that come from an adapted sender. template - _CCCL_API constexpr auto operator()(_Sndr&& __sndr, ::cuda::std::__ignore_t) const + _CCCL_API constexpr auto operator()(::cuda::std::__ignore_t, _Sndr&& __sndr, ::cuda::std::__ignore_t) const noexcept(__nothrow_decay_copyable<_Sndr>) { return __stream::__adapt(static_cast<_Sndr&&>(__sndr)); @@ -119,7 +117,7 @@ struct stream_domain struct __apply_passthru_t { template - _CCCL_API constexpr auto operator()(_Sndr&& __sndr, ::cuda::std::__ignore_t) const + _CCCL_API constexpr auto operator()(::cuda::std::__ignore_t, _Sndr&& __sndr, ::cuda::std::__ignore_t) const noexcept(__nothrow_movable<_Sndr>) -> _Sndr { return static_cast<_Sndr&&>(__sndr); @@ -158,20 +156,19 @@ struct stream_domain public: _CCCL_TEMPLATE(class _Tag, class _Sndr, class... _Args) _CCCL_REQUIRES(__callable<__apply_t<_Tag>, _Sndr, _Args...>) - _CCCL_NODEBUG_HOST_API static constexpr auto + _CCCL_API static constexpr auto apply_sender(_Tag, _Sndr&& __sndr, _Args&&... __args) noexcept(__nothrow_callable<__apply_t<_Tag>, _Sndr, _Args...>) -> __call_result_t<__apply_t<_Tag>, _Sndr, _Args...> { - return __apply_t<_Tag>{}(static_cast<_Sndr&&>(__sndr), static_cast<_Args&&>(__args)...); + return __apply_t<_Tag>()(static_cast<_Sndr&&>(__sndr), static_cast<_Args&&>(__args)...); } - _CCCL_TEMPLATE(class _Sndr, class _Env) - _CCCL_REQUIRES(__callable<__transform_strategy_t<_Sndr, _Env>, _Sndr, const _Env&>) - _CCCL_NODEBUG_API static constexpr auto transform_sender(_Sndr&& __sndr, const _Env& __env) noexcept( - __nothrow_callable<__transform_strategy_t<_Sndr, _Env>, _Sndr, const _Env&>) - -> __call_result_t<__transform_strategy_t<_Sndr, _Env>, _Sndr, const _Env&> + _CCCL_TEMPLATE(class _OpTag, class _Sndr, class _Env, class _Apply = __transform_strategy_t<_Sndr, _Env>) + _CCCL_REQUIRES(__callable<_Apply, _OpTag, _Sndr, const _Env&>) + _CCCL_API static constexpr auto transform_sender(_OpTag, _Sndr&& __sndr, const _Env& __env) noexcept( + __nothrow_callable<_Apply, _OpTag, _Sndr, const _Env&>) -> __call_result_t<_Apply, _OpTag, _Sndr, const _Env&> { - return __transform_strategy_t<_Sndr, _Env>{}(static_cast<_Sndr&&>(__sndr), __env); + return _Apply()(_OpTag(), static_cast<_Sndr&&>(__sndr), __env); } }; diff --git a/cudax/include/cuda/experimental/__execution/stream/scheduler.cuh b/cudax/include/cuda/experimental/__execution/stream/scheduler.cuh index 2c0ab902948..1bb2b6553c6 100644 --- a/cudax/include/cuda/experimental/__execution/stream/scheduler.cuh +++ b/cudax/include/cuda/experimental/__execution/stream/scheduler.cuh @@ -45,11 +45,11 @@ namespace cuda::experimental { namespace execution { -template -_CCCL_VISIBILITY_HIDDEN __launch_bounds__(_BlockThreads) __global__ - void __stream_complete(_Tag, _Rcvr* __rcvr, _Args... __args) +template +//_CCCL_VISIBILITY_HIDDEN +__launch_bounds__(_BlockThreads) __global__ void __stream_complete(_Tag, _Rcvr* __rcvr) { - _Tag{}(static_cast<_Rcvr&&>(*__rcvr), static_cast<_Args&&>(__args)...); + _Tag{}(static_cast<_Rcvr&&>(*__rcvr)); } //////////////////////////////////////////////////////////////////////////////////////// @@ -91,7 +91,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT stream_scheduler [[nodiscard]] _CCCL_API constexpr auto query(get_completion_domain_t, _Env&& __env) const noexcept -> __call_result_t { - return execution::get_domain(__env); + return {}; } [[nodiscard]] _CCCL_TRIVIAL_API constexpr auto query(get_completion_behavior_t) const noexcept @@ -114,8 +114,9 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT stream_scheduler : __rcvr_{static_cast<_Rcvr&&>(__rcvr)} , __stream_{__stream_ref} { - _CCCL_ASSERT(execution::__get_pointer_attributes(this).type == cudaMemoryTypeManaged, - "stream scheduler's operation state must be allocated in managed memory"); + NV_IF_TARGET(NV_IS_HOST, + (_CCCL_ASSERT(execution::__get_pointer_attributes(this).type == cudaMemoryTypeManaged, + "stream scheduler's operation state must be allocated in managed memory");)) } _CCCL_IMMOVABLE(__opstate_t); @@ -136,13 +137,10 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT stream_scheduler int const __grid_blocks = __launch_dims.count(experimental::block, experimental::grid); static_assert(__block_threads != ::cuda::std::dynamic_extent); - // printf("Launching completion kernel for stream_scheduler with %d block threads and %d grid blocks\n", - // __block_threads, - // __grid_blocks); - // Launch the kernel that completes the receiver with the launch configuration from // the receiver. - __stream_complete<__block_threads><<<__grid_blocks, __block_threads, 0, __stream_.get()>>>(set_value, &__rcvr_); + __stream_complete<__block_threads, set_value_t, _Rcvr> + <<<__grid_blocks, __block_threads, 0, __stream_.get()>>>(set_value, &__rcvr_); if (auto __status = cudaGetLastError(); __status != cudaSuccess) { @@ -158,7 +156,8 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT stream_scheduler // without the following, the kernel in __host_start will fail to launch with // cudaErrorInvalidDeviceFunction. - ::__cccl_unused(&__stream_complete<__block_threads, set_value_t, _Rcvr>); + ::cuda::std::ignore = &__stream_complete<__block_threads, set_value_t, _Rcvr>; + execution::set_value(static_cast<_Rcvr&&>(__rcvr_)); } @@ -170,7 +169,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT stream_scheduler {}; public: - _CCCL_API explicit constexpr stream_scheduler(stream_ref __stream) noexcept + _CCCL_API constexpr stream_scheduler(stream_ref __stream) noexcept : __stream_{__stream} {} @@ -201,7 +200,7 @@ public: return __opstate_t<_Rcvr>{static_cast<_Rcvr&&>(__rcvr), __attrs_.__stream_}; } - _CCCL_NO_UNIQUE_ADDRESS __tag_t __tag_; + /*_CCCL_NO_UNIQUE_ADDRESS*/ __tag_t __tag_; __attrs_t __attrs_; }; @@ -256,17 +255,40 @@ struct stream_domain::__apply_t : stream_domain::__ap } // namespace execution -_CCCL_HOST_API inline auto stream_ref::schedule() const noexcept +_CCCL_API inline auto stream_ref::schedule() const noexcept { return execution::schedule(execution::stream_scheduler{*this}); } -[[nodiscard]] _CCCL_API constexpr auto stream_ref::query(const execution::get_domain_t&) const noexcept +[[nodiscard]] _CCCL_API constexpr auto +stream_ref::query(const execution::get_completion_scheduler_t&) const noexcept -> stream_ref +{ + return *this; +} + +template +[[nodiscard]] _CCCL_API constexpr auto stream_ref::query( + const execution::get_completion_scheduler_t&, const _Env& __env) const noexcept + -> execution::__scheduler_of_t +{ + return execution::get_scheduler(__env); +} + +[[nodiscard]] _CCCL_API constexpr auto +stream_ref::query(const execution::get_completion_domain_t&) const noexcept -> execution::stream_domain { return {}; } +template +[[nodiscard]] _CCCL_API constexpr auto +stream_ref::query(const execution::get_completion_domain_t&, const _Env& __env) const noexcept + -> __call_result_t +{ + return {}; +} + } // namespace cuda::experimental _CCCL_DIAG_POP diff --git a/cudax/include/cuda/experimental/__execution/stream/sequence.cuh b/cudax/include/cuda/experimental/__execution/stream/sequence.cuh index f8e186ad022..39f23c0db06 100644 --- a/cudax/include/cuda/experimental/__execution/stream/sequence.cuh +++ b/cudax/include/cuda/experimental/__execution/stream/sequence.cuh @@ -29,18 +29,18 @@ namespace cuda::experimental::execution { -///////////////////////////////////////////////////////////////////////////////// -// sequence: customization for the stream scheduler -template <> -struct stream_domain::__apply_t -{ - template - _CCCL_API auto operator()(_Sndr __sndr, const _Env& __env) const - { - static_assert(::cuda::std::__always_false_v<_Sndr>, - "The CUDA stream scheduler does not yet support the `sequence` algorithm."); - } -}; +// ///////////////////////////////////////////////////////////////////////////////// +// // sequence: customization for the stream scheduler +// template <> +// struct stream_domain::__apply_t +// { +// template +// _CCCL_API auto operator()(_Sndr __sndr, const _Env& __env) const +// { +// static_assert(::cuda::std::__always_false_v<_Sndr>, +// "The CUDA stream scheduler does not yet support the 'sequence' algorithm."); +// } +// }; } // namespace cuda::experimental::execution diff --git a/cudax/include/cuda/experimental/__execution/stream/starts_on.cuh b/cudax/include/cuda/experimental/__execution/stream/starts_on.cuh deleted file mode 100644 index d77d037e44b..00000000000 --- a/cudax/include/cuda/experimental/__execution/stream/starts_on.cuh +++ /dev/null @@ -1,107 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of CUDA Experimental in CUDA C++ Core Libraries, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#ifndef __CUDAX_EXECUTION_STREAM_STARTS_ON -#define __CUDAX_EXECUTION_STREAM_STARTS_ON - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include - -namespace cuda::experimental::execution -{ -namespace __stream -{ -struct __starts_on_t -{ - struct _CCCL_TYPE_VISIBILITY_DEFAULT __get_stream_fn - { - template - [[nodiscard]] _CCCL_API auto operator()(const _Sndr& __sndr, ::cuda::std::__ignore_t) const - { - // __sndr is a write_env sender (see __mk_sndr_base below), which contains an - // environment that contains the stream scheduler, from which we can obtain the - // stream. - auto& [__ign0, __env, __ign1] = __sndr; - return cuda::get_stream(get_scheduler(__env)); - } - }; - - template - [[nodiscard]] static _CCCL_API constexpr auto __mk_sndr_base(_Sch __sch, _Sndr&& __sndr) - { - // This is the implementation of the starts_on sender for the stream domain. _Sndr - // here is the child of the starts_on sender, and _Sch is the stream scheduler. We use - // write_env to let _Sndr and its children know that they are running on the stream - // scheduler. We construct the adaptor with a __get_stream_fn that knows how to obtain - // the stream from the write_env sender. - return __stream::__adapt(write_env(static_cast<_Sndr&&>(__sndr), __mk_sch_env(__sch)), __get_stream_fn{}); - } - - template - using __sndr_base_t = decltype(__starts_on_t::__mk_sndr_base(declval<_Sch>(), declval<_Sndr>())); - - template - using __with_sch_t = __call_result_t>; - - // Wrap the sender returned from __mk_sndr_base in a type that hides the complexity of - // the sender's type name. This results in more readable diagnostics. - template - struct _CCCL_TYPE_VISIBILITY_DEFAULT __sndr_t : __stream::__sndr_t<__with_sch_t<_Sch, _Sndr>, __get_stream_fn> - { - // BUGBUG NO this is a wrong use of __mk_sch_env. it needs to be passed an - // environment. turn this into a transform_sender. - _CCCL_API explicit constexpr __sndr_t(_Sch __sch, _Sndr __sndr) - : __stream::__sndr_t<__with_sch_t<_Sch, _Sndr>, __get_stream_fn>{ - {}, {}, write_env(static_cast<_Sndr&&>(__sndr), __mk_sch_env(__sch))} - {} - }; - - // The connect cpo calls transform_sender, which is directed here for starts_on senders. - // It returns a custom sender that knows how to start the child sender on the specified - // stream. - template - [[nodiscard]] _CCCL_API auto operator()(_Sndr&& __sndr, ::cuda::std::__ignore_t) const - { - auto& [__ign0, __sch, __child] = __sndr; - return __sndr_t{__sch, ::cuda::std::forward_like<_Sndr>(__child)}; - } -}; -} // namespace __stream - -// Start work on the GPU -template <> -struct stream_domain::__apply_t : __stream::__starts_on_t -{}; -} // namespace cuda::experimental::execution - -#include - -#endif // __CUDAX_EXECUTION_STREAM_STARTS_ON diff --git a/cudax/include/cuda/experimental/__execution/stream/sync_wait.cuh b/cudax/include/cuda/experimental/__execution/stream/sync_wait.cuh index e8919d94e71..e9c07f19bba 100644 --- a/cudax/include/cuda/experimental/__execution/stream/sync_wait.cuh +++ b/cudax/include/cuda/experimental/__execution/stream/sync_wait.cuh @@ -23,6 +23,7 @@ #include +#include #include #include #include @@ -45,7 +46,7 @@ struct __sync_wait_t : private sync_wait_t // The transformation would happen in due course in the connect cpo, so why transform // it here? This transformation shuffles the sender into one that can provide a // stream_ref, which is needed by __host_apply. - auto __new_sndr = execution::transform_sender(stream_domain{}, static_cast<_Sndr&&>(__sndr), __env); + auto __new_sndr = stream_domain{}.transform_sender(set_value, static_cast<_Sndr&&>(__sndr), __env); NV_IF_TARGET(NV_IS_HOST, (return __host_apply(::cuda::std::move(__new_sndr), static_cast<_Env&&>(__env));), @@ -61,7 +62,7 @@ struct __sync_wait_t : private sync_wait_t private: template - struct _CCCL_TYPE_VISIBILITY_DEFAULT __state_t + struct _CCCL_TYPE_VISIBILITY_DEFAULT __managed_state_t { using __partial_completions_t = completion_signatures_of_t<_Sndr, __env_t<_Env>>; using __all_nothrow_t = @@ -74,9 +75,9 @@ private: using __errors_t = __error_types<__completions_t, __decayed_variant>; using __rcvr_t = sync_wait_t::__rcvr_t<__values_t, __errors_t, _Env>; - _CCCL_HOST_API explicit __state_t(_Sndr&& __sndr, _Env&& __env) + _CCCL_HOST_API explicit __managed_state_t(_Sndr&& __sndr, _Env&& __env) : __result_{} - , __state_{{{}, static_cast<_Env&&>(__env)}, &__result_, {}} + , __state_{static_cast<_Env&&>(__env), &__result_} , __opstate_{execution::connect(static_cast<_Sndr&&>(__sndr), __rcvr_t{&__state_})} {} @@ -97,7 +98,7 @@ private: stream_ref __stream = __get_stream(__sndr, __env); // Launch the sender with a continuation that will fill in a variant - using __box_t = __managed_box<__state_t<_Sndr, _Env>>; + using __box_t = __managed_box<__managed_state_t<_Sndr, _Env>>; auto __box = __box_t::__make_unique(static_cast<_Sndr&&>(__sndr), static_cast<_Env&&>(__env)); execution::start(__box->__value.__opstate_); diff --git a/cudax/include/cuda/experimental/__execution/stream_context.cuh b/cudax/include/cuda/experimental/__execution/stream_context.cuh index e5bb2bf5602..ce60d0545fb 100644 --- a/cudax/include/cuda/experimental/__execution/stream_context.cuh +++ b/cudax/include/cuda/experimental/__execution/stream_context.cuh @@ -21,17 +21,16 @@ # pragma system_header #endif // no system header -// // IWYU pragma: begin_exports -// #include -// #include -// #include -// #include -// #include -// #include -// #include -// #include -// #include -// #include -// // IWYU pragma: end_exports +// IWYU pragma: begin_exports +#include +#include +#include +#include +#include +#include +#include +#include +#include +// IWYU pragma: end_exports #endif //__CUDAX_EXECUTION_STREAM_CONTEXT diff --git a/cudax/include/cuda/experimental/__execution/sync_wait.cuh b/cudax/include/cuda/experimental/__execution/sync_wait.cuh index 2dd857f735b..b90f8cd4d97 100644 --- a/cudax/include/cuda/experimental/__execution/sync_wait.cuh +++ b/cudax/include/cuda/experimental/__execution/sync_wait.cuh @@ -62,7 +62,8 @@ struct sync_wait_t basic_run_loop<_Env> __loop_; }; - template +public: + template > struct _CCCL_TYPE_VISIBILITY_DEFAULT __env_t { _CCCL_EXEC_CHECK_DISABLE @@ -108,6 +109,7 @@ struct sync_wait_t template using __decayed_tuple = ::cuda::std::tuple...>; + _CUDAX_SEMI_PRIVATE : template > struct _CCCL_TYPE_VISIBILITY_DEFAULT __state_t : __state_base_t<_Env> { @@ -177,7 +179,15 @@ struct sync_wait_t struct __throw_error_fn { template - _CCCL_HOST_API void operator()(_Error __err) const + [[noreturn]] + _CCCL_API void operator()(_Error __err) const + { + NV_IF_TARGET(NV_IS_HOST, (__do_throw(static_cast<_Error&&>(__err));), (::cuda::std::terminate();)) + } + + template + [[noreturn]] + _CCCL_HOST_API static void __do_throw(_Error __err) { if constexpr (__same_as<_Error, ::std::exception_ptr>) { @@ -195,6 +205,7 @@ struct sync_wait_t { throw static_cast<_Error&&>(__err); } + _CCCL_UNREACHABLE(); } }; diff --git a/cudax/include/cuda/experimental/__execution/then.cuh b/cudax/include/cuda/experimental/__execution/then.cuh index 67bb5f9e11e..90abee390d4 100644 --- a/cudax/include/cuda/experimental/__execution/then.cuh +++ b/cudax/include/cuda/experimental/__execution/then.cuh @@ -140,7 +140,7 @@ struct __upon_t } template - _CCCL_NODEBUG_API void __complete(_Tag, _Ts&&... __ts) noexcept + _CCCL_API void __complete(_Tag, _Ts&&... __ts) noexcept { if constexpr (_Tag{} == _SetTag{}) { @@ -227,13 +227,13 @@ struct __upon_t // extended (host/device) lambda { template - _CCCL_NODEBUG_API constexpr auto operator()(_Sndr __sndr) -> __call_result_t<__upon_tag_t, _Sndr, _Fn> + _CCCL_API constexpr auto operator()(_Sndr __sndr) -> __call_result_t<__upon_tag_t, _Sndr, _Fn> { return __upon_tag_t{}(static_cast<_Sndr&&>(__sndr), static_cast<_Fn&&>(__fn_)); } template - _CCCL_NODEBUG_API friend constexpr auto operator|(_Sndr __sndr, __closure_base_t __self) // + _CCCL_API friend constexpr auto operator|(_Sndr __sndr, __closure_base_t __self) // -> __call_result_t<__upon_tag_t, _Sndr, _Fn> { return __upon_tag_t{}(static_cast<_Sndr&&>(__sndr), static_cast<_Fn&&>(__self.__fn_)); @@ -244,10 +244,10 @@ struct __upon_t public: template - _CCCL_NODEBUG_API constexpr auto operator()(_Sndr __sndr, _Fn __fn) const; + _CCCL_API constexpr auto operator()(_Sndr __sndr, _Fn __fn) const; template - _CCCL_NODEBUG_API constexpr auto operator()(_Fn __fn) const; + _CCCL_API constexpr auto operator()(_Fn __fn) const; }; struct then_t : __upon_t @@ -330,7 +330,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __upon_t<_UponTag, _SetTag>::__sndr_base_t return __fwd_env(execution::get_env(__sndr_)); } - _CCCL_NO_UNIQUE_ADDRESS __upon_tag_t __tag_; + /*_CCCL_NO_UNIQUE_ADDRESS*/ __upon_tag_t __tag_; _Fn __fn_; _Sndr __sndr_; }; @@ -365,7 +365,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT upon_stopped_t::__closure_t template template -_CCCL_NODEBUG_API constexpr auto __upon_t<_UponTag, _SetTag>::operator()(_Sndr __sndr, _Fn __fn) const +_CCCL_API constexpr auto __upon_t<_UponTag, _SetTag>::operator()(_Sndr __sndr, _Fn __fn) const { using __sndr_t = typename _UponTag::template __sndr_t<_Sndr, _Fn>; @@ -381,7 +381,7 @@ _CCCL_NODEBUG_API constexpr auto __upon_t<_UponTag, _SetTag>::operator()(_Sndr _ template template -_CCCL_NODEBUG_API constexpr auto __upon_t<_UponTag, _SetTag>::operator()(_Fn __fn) const +_CCCL_API constexpr auto __upon_t<_UponTag, _SetTag>::operator()(_Fn __fn) const { using __closure_t = typename _UponTag::template __closure_t<_Fn>; return __closure_t{{static_cast<_Fn&&>(__fn)}}; diff --git a/cudax/include/cuda/experimental/__execution/thread_context.cuh b/cudax/include/cuda/experimental/__execution/thread_context.cuh index 219bee64ca0..47b001647fa 100644 --- a/cudax/include/cuda/experimental/__execution/thread_context.cuh +++ b/cudax/include/cuda/experimental/__execution/thread_context.cuh @@ -51,7 +51,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT thread_context } } - _CCCL_HOST_API auto get_scheduler() + _CCCL_API auto get_scheduler() { return __loop_.get_scheduler(); } diff --git a/cudax/include/cuda/experimental/__execution/transform_completion_signatures.cuh b/cudax/include/cuda/experimental/__execution/transform_completion_signatures.cuh index f18d65200cc..adb964ab7f4 100644 --- a/cudax/include/cuda/experimental/__execution/transform_completion_signatures.cuh +++ b/cudax/include/cuda/experimental/__execution/transform_completion_signatures.cuh @@ -40,7 +40,7 @@ template struct __default_transform_fn { template - _CCCL_NODEBUG_API _CCCL_CONSTEVAL auto operator()() const noexcept -> completion_signatures<_Tag(_Ts...)> + _CCCL_API _CCCL_CONSTEVAL auto operator()() const noexcept -> completion_signatures<_Tag(_Ts...)> { return {}; } @@ -49,7 +49,7 @@ struct __default_transform_fn struct __swallow_transform { template - _CCCL_NODEBUG_API _CCCL_CONSTEVAL auto operator()() const noexcept -> completion_signatures<> + _CCCL_API _CCCL_CONSTEVAL auto operator()() const noexcept -> completion_signatures<> { return {}; } @@ -59,7 +59,7 @@ template struct __decay_transform { template - _CCCL_NODEBUG_API _CCCL_CONSTEVAL auto operator()() const noexcept -> completion_signatures<_Tag(decay_t<_Ts>...)> + _CCCL_API _CCCL_CONSTEVAL auto operator()() const noexcept -> completion_signatures<_Tag(decay_t<_Ts>...)> { return {}; } @@ -70,7 +70,7 @@ using __meta_call_result_t _CCCL_NODEBUG_ALIAS = decltype(declval<_Fn>().templat _CCCL_EXEC_CHECK_DISABLE template -[[nodiscard]] _CCCL_NODEBUG_API _CCCL_CONSTEVAL auto __transform_expr(const _Fn& __fn) +[[nodiscard]] _CCCL_API _CCCL_CONSTEVAL auto __transform_expr(const _Fn& __fn) -> __meta_call_result_t { return __fn.template operator()<_Ay, _As...>(); @@ -78,7 +78,7 @@ template _CCCL_EXEC_CHECK_DISABLE template -[[nodiscard]] _CCCL_NODEBUG_API _CCCL_CONSTEVAL auto __transform_expr(const _Fn& __fn) -> __call_result_t +[[nodiscard]] _CCCL_API _CCCL_CONSTEVAL auto __transform_expr(const _Fn& __fn) -> __call_result_t { return __fn(); } diff --git a/cudax/include/cuda/experimental/__execution/transform_sender.cuh b/cudax/include/cuda/experimental/__execution/transform_sender.cuh index 5de2017450c..187cdfeab48 100644 --- a/cudax/include/cuda/experimental/__execution/transform_sender.cuh +++ b/cudax/include/cuda/experimental/__execution/transform_sender.cuh @@ -36,10 +36,18 @@ namespace cuda::experimental::execution { namespace __detail { +template +using __starting_domain = __call_result_or_t; + +template +using __completing_domain = + __call_result_t<__first_callable>, + env_of_t<_Sndr>, + const _Env&>; + template struct __transform_sender_t { -private: template using __domain_for_t = ::cuda::std::_If< // __has_transform_sender<_Domain, _OpTag, _Sndr, _Env>, @@ -58,13 +66,17 @@ private: { return __declfn<__result_t, __is_nothrow>; } - else + else if constexpr (__same_as<_OpTag, start_t>) { return __get_declfn<__result_t, const _Env&, (_Nothrow && __is_nothrow)>(); } + else + { + using __transform_recurse_t = __transform_sender_t<__completing_domain<__result_t, _Env>, set_value_t>; + return __transform_recurse_t::template __get_declfn<__result_t, set_value_t, (_Nothrow && __is_nothrow)>(); + } } -public: template ()> [[nodiscard]] _CCCL_API constexpr auto operator()(_Sndr&& __sndr, const _Env& __env) const noexcept(noexcept(_DeclFn())) -> decltype(_DeclFn()) @@ -76,10 +88,16 @@ public: { return __domain_t().transform_sender(_OpTag(), static_cast<_Sndr&&>(__sndr), __env); } - else + else if constexpr (__same_as<_OpTag, start_t>) { return (*this)(__domain_t().transform_sender(_OpTag(), static_cast<_Sndr&&>(__sndr), __env), __env); } + else + { + using __transform_recurse_t = __transform_sender_t<__completing_domain<__result_t, _Env>, set_value_t>; + return __transform_recurse_t()( + __domain_t().transform_sender(_OpTag(), static_cast<_Sndr&&>(__sndr), __env), __env); + } } }; } // namespace __detail @@ -92,25 +110,17 @@ private: { template _CCCL_API constexpr auto operator()(_Sndr&& __sndr, const _Env& __env) const - noexcept(noexcept(_Fn2()(_Fn1()(static_cast<_Sndr&&>(__sndr), __env), __env))) - -> decltype(_Fn2()(_Fn1()(static_cast<_Sndr&&>(__sndr), __env), __env)) + noexcept(noexcept(_Fn1()(_Fn2()(static_cast<_Sndr&&>(__sndr), __env), __env))) + -> decltype(_Fn1()(_Fn2()(static_cast<_Sndr&&>(__sndr), __env), __env)) { - return _Fn2()(_Fn1()(static_cast<_Sndr&&>(__sndr), __env), __env); + return _Fn1()(_Fn2()(static_cast<_Sndr&&>(__sndr), __env), __env); } }; - template - using __starting_domain = __call_result_or_t; - - template - using __completing_domain = - __call_result_t<__first_callable>, - env_of_t<_Sndr>, - const _Env&>; - template - using __impl_fn_t = __compose<__detail::__transform_sender_t<__starting_domain, start_t>, - __detail::__transform_sender_t<__completing_domain<_Sndr, const _Env&>, set_value_t>>; + using __impl_fn_t = + __compose<__detail::__transform_sender_t<__detail::__starting_domain<_Env>, start_t>, + __detail::__transform_sender_t<__detail::__completing_domain<_Sndr, _Env>, set_value_t>>; public: template > diff --git a/cudax/include/cuda/experimental/__execution/utility.cuh b/cudax/include/cuda/experimental/__execution/utility.cuh index 3d4205d2398..6e5e4d90c0a 100644 --- a/cudax/include/cuda/experimental/__execution/utility.cuh +++ b/cudax/include/cuda/experimental/__execution/utility.cuh @@ -134,6 +134,8 @@ template return __attrs; } +#define __debug_printf(...) (printf(__VA_ARGS__), [] NV_IF_TARGET(NV_IS_HOST, (fflush(stdout);), (void(0);))()) + // This function can only be called from a catch handler. [[nodiscard]] _CCCL_HOST_API inline auto __get_cuda_error_from_active_exception() -> ::cudaError_t { @@ -208,7 +210,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __first_callable private: //! @brief Returns the first function that is callable with a given set of arguments. template - [[nodiscard]] _CCCL_NODEBUG_API static constexpr auto __get_1st(_Self&& __self) noexcept -> decltype(auto) + [[nodiscard]] _CCCL_API static constexpr auto __get_1st(_Self&& __self) noexcept -> decltype(auto) { // NOLINTNEXTLINE (modernize-avoid-c-arrays) constexpr bool __flags[] = {__callable<::cuda::std::__copy_cvref_t<_Self, _Fns>, _Args...>..., false}; @@ -227,7 +229,7 @@ public: //! @brief Calls the first function that is callable with a given set of arguments. _CCCL_EXEC_CHECK_DISABLE template - _CCCL_NODEBUG_API constexpr auto + _CCCL_API constexpr auto operator()(_Args&&... __args) && noexcept(__nothrow_callable<__1st_fn_t<__first_callable, _Args...>, _Args...>) -> __call_result_t<__1st_fn_t<__first_callable, _Args...>, _Args...> { @@ -238,7 +240,7 @@ public: //! @overload _CCCL_EXEC_CHECK_DISABLE template - _CCCL_NODEBUG_API constexpr auto operator()(_Args&&... __args) const& noexcept( + _CCCL_API constexpr auto operator()(_Args&&... __args) const& noexcept( __nothrow_callable<__1st_fn_t<__first_callable const&, _Args...>, _Args...>) -> __call_result_t<__1st_fn_t<__first_callable const&, _Args...>, _Args...> { @@ -255,7 +257,8 @@ _CCCL_HOST_DEVICE __first_callable(_Fns...) -> __first_callable<_Fns...>; // __call_or namespace __detail { -// query an environment, or return a default value if the query is not supported +// call a function with a set of arguments or return a default value if the function is +// not callable. struct __call_or_t { _CCCL_EXEC_CHECK_DISABLE @@ -280,8 +283,7 @@ struct __call_or_t _CCCL_GLOBAL_CONSTANT __detail::__call_or_t __call_or{}; template -using __call_result_or_t _CCCL_NODEBUG_ALIAS = - decltype(__call_or(::cuda::std::declval<_Fn>(), ::cuda::std::declval<_Default>(), ::cuda::std::declval<_Args>()...)); +using __call_result_or_t _CCCL_NODEBUG_ALIAS = __call_result_t<__detail::__call_or_t, _Fn, _Default, _Args...>; //! @brief A callable that always return a value of type _Ty, regardless of the arguments //! passed to it. @@ -289,18 +291,18 @@ template struct _CCCL_TYPE_VISIBILITY_DEFAULT __always { template - [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(_Args&&...) && noexcept -> _Ty&& + [[nodiscard]] _CCCL_API constexpr auto operator()(_Args&&...) && noexcept -> _Ty&& { return static_cast<_Ty&&>(__value); } template - [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(_Args&&...) const& noexcept -> _Ty const& + [[nodiscard]] _CCCL_API constexpr auto operator()(_Args&&...) const& noexcept -> _Ty const& { return __value; } - _CCCL_NO_UNIQUE_ADDRESS _Ty __value{}; + /*_CCCL_NO_UNIQUE_ADDRESS*/ _Ty __value{}; }; template diff --git a/cudax/include/cuda/experimental/__execution/variant.cuh b/cudax/include/cuda/experimental/__execution/variant.cuh index 3dcf9627a2c..0d8eb226cff 100644 --- a/cudax/include/cuda/experimental/__execution/variant.cuh +++ b/cudax/include/cuda/experimental/__execution/variant.cuh @@ -62,7 +62,7 @@ public: _CCCL_ASSERT(false, "cannot visit a stateless variant"); } - [[nodiscard]] _CCCL_NODEBUG_API static constexpr size_t __index() noexcept + [[nodiscard]] _CCCL_API static constexpr size_t __index() noexcept { return __npos; } @@ -112,12 +112,12 @@ public: __destroy(); } - [[nodiscard]] _CCCL_NODEBUG_API void* __ptr() noexcept + [[nodiscard]] _CCCL_API void* __ptr() noexcept { return __storage_; } - [[nodiscard]] _CCCL_NODEBUG_API size_t __index() const noexcept + [[nodiscard]] _CCCL_API size_t __index() const noexcept { return __index_; } diff --git a/cudax/include/cuda/experimental/__execution/visit.cuh b/cudax/include/cuda/experimental/__execution/visit.cuh index 7bd5c669b6d..47dd356ba58 100644 --- a/cudax/include/cuda/experimental/__execution/visit.cuh +++ b/cudax/include/cuda/experimental/__execution/visit.cuh @@ -96,7 +96,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT visit_t _CCCL_EXEC_CHECK_DISABLE template requires(static_cast(structured_binding_size<_CvSndr>) >= 2) - _CCCL_NODEBUG_API constexpr auto operator()(_Visitor& __visitor, _CvSndr&& __sndr, _Context& __context) const + _CCCL_API constexpr auto operator()(_Visitor& __visitor, _CvSndr&& __sndr, _Context& __context) const -> decltype(auto) { auto&& [__tag, __data, ... __children] = static_cast<_CvSndr&&>(__sndr); @@ -146,8 +146,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT visit_t { _CCCL_TEMPLATE(class _Visitor, class _Sndr, class _Context) _CCCL_REQUIRES((static_cast(structured_binding_size<_Sndr>) >= 2)) - _CCCL_NODEBUG_API constexpr auto operator()(_Visitor& __visitor, _Sndr&& __sndr, _Context& __context) const - -> decltype(auto) + _CCCL_API constexpr auto operator()(_Visitor& __visitor, _Sndr&& __sndr, _Context& __context) const -> decltype(auto) { // This `if constexpr` shouldn't be needed given the `requires` clause above. It is // here because nvcc 12.0 has a bug where the full signature of the function template diff --git a/cudax/include/cuda/experimental/__execution/when_all.cuh b/cudax/include/cuda/experimental/__execution/when_all.cuh index 2ff9c80499e..2dc994b3e20 100644 --- a/cudax/include/cuda/experimental/__execution/when_all.cuh +++ b/cudax/include/cuda/experimental/__execution/when_all.cuh @@ -115,7 +115,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT when_all_t __state_t& __state_; template - _CCCL_NODEBUG_API constexpr void set_value(_Ts&&... __ts) noexcept + _CCCL_API constexpr void set_value(_Ts&&... __ts) noexcept { constexpr ::cuda::std::index_sequence_for<_Ts...>* idx = nullptr; __state_.template __set_value<_Index>(idx, static_cast<_Ts&&>(__ts)...); @@ -123,7 +123,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT when_all_t } template - _CCCL_NODEBUG_API constexpr void set_error(_Error&& __error) noexcept + _CCCL_API constexpr void set_error(_Error&& __error) noexcept { __state_.__set_error(static_cast<_Error&&>(__error)); __state_.__arrive(); @@ -293,7 +293,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT when_all_t struct __start_all { template - _CCCL_NODEBUG_API void operator()(_Ops&... __ops) const noexcept + _CCCL_API void operator()(_Ops&... __ops) const noexcept { (execution::start(__ops), ...); } @@ -390,7 +390,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT when_all_t public: template - _CCCL_NODEBUG_API constexpr auto operator()(_Sndrs... __sndrs) const; + _CCCL_API constexpr auto operator()(_Sndrs... __sndrs) const; }; template @@ -521,7 +521,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT when_all_t::__sndr_t }; template -_CCCL_NODEBUG_API constexpr auto when_all_t::operator()(_Sndrs... __sndrs) const +_CCCL_API constexpr auto when_all_t::operator()(_Sndrs... __sndrs) const { if constexpr (sizeof...(_Sndrs) == 0) { diff --git a/cudax/include/cuda/experimental/__execution/write_attrs.cuh b/cudax/include/cuda/experimental/__execution/write_attrs.cuh index 96ef9ad8c06..8359508f2e3 100644 --- a/cudax/include/cuda/experimental/__execution/write_attrs.cuh +++ b/cudax/include/cuda/experimental/__execution/write_attrs.cuh @@ -136,7 +136,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT write_attrs_t::__sndr_t return {{__env_ref(__attrs_), __fwd_env(execution::get_env(__sndr_))}}; } - _CCCL_NO_UNIQUE_ADDRESS write_attrs_t __tag_; + /*_CCCL_NO_UNIQUE_ADDRESS*/ write_attrs_t __tag_; _Attrs __attrs_; _Sndr __sndr_; }; diff --git a/cudax/include/cuda/experimental/__execution/write_env.cuh b/cudax/include/cuda/experimental/__execution/write_env.cuh index 7f289ffc2ca..9e176a153b9 100644 --- a/cudax/include/cuda/experimental/__execution/write_env.cuh +++ b/cudax/include/cuda/experimental/__execution/write_env.cuh @@ -141,7 +141,7 @@ public: /// @brief Wraps one sender in another that modifies the execution /// environment by merging in the environment specified. template - [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(_Sndr __sndr, _Env __env) const + [[nodiscard]] _CCCL_API constexpr auto operator()(_Sndr __sndr, _Env __env) const { return __sndr_t<_Sndr, _Env>{{}, static_cast<_Env&&>(__env), static_cast<_Sndr&&>(__sndr)}; } @@ -149,7 +149,7 @@ public: /// @brief Returns a closure that can be used with the pipe operator /// to modify the execution environment. template - [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(_Env __env) const + [[nodiscard]] _CCCL_API constexpr auto operator()(_Env __env) const { return __closure_t<_Env>{static_cast<_Env&&>(__env)}; } @@ -185,7 +185,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT write_env_t::__sndr_t return __fwd_env(execution::get_env(__sndr_)); } - _CCCL_NO_UNIQUE_ADDRESS write_env_t __tag_; + /*_CCCL_NO_UNIQUE_ADDRESS*/ write_env_t __tag_; _Env __env_; _Sndr __sndr_; }; @@ -194,14 +194,13 @@ template struct _CCCL_TYPE_VISIBILITY_DEFAULT write_env_t::__closure_t { template - [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(_Sndr __sndr) const -> __sndr_t<_Sndr, _Env> + [[nodiscard]] _CCCL_API constexpr auto operator()(_Sndr __sndr) const -> __sndr_t<_Sndr, _Env> { return __sndr_t<_Sndr, _Env>{{}, static_cast<_Env&&>(__env_), static_cast<_Sndr&&>(__sndr)}; } template - [[nodiscard]] _CCCL_NODEBUG_API friend constexpr auto operator|(_Sndr __sndr, __closure_t __self) - -> __sndr_t<_Sndr, _Env> + [[nodiscard]] _CCCL_API friend constexpr auto operator|(_Sndr __sndr, __closure_t __self) -> __sndr_t<_Sndr, _Env> { return __sndr_t<_Sndr, _Env>{{}, static_cast<_Env&&>(__self.__env_), static_cast<_Sndr&&>(__sndr)}; } diff --git a/cudax/include/cuda/experimental/__stream/stream_ref.cuh b/cudax/include/cuda/experimental/__stream/stream_ref.cuh index e9ec9215916..ae623c83b61 100644 --- a/cudax/include/cuda/experimental/__stream/stream_ref.cuh +++ b/cudax/include/cuda/experimental/__stream/stream_ref.cuh @@ -26,6 +26,7 @@ #include #include +#include #include #include @@ -48,7 +49,7 @@ struct stream_ref : ::cuda::stream_ref //! @brief Converting constructor from \c ::cuda::stream_ref //! //! @post `*this == __other` - _CCCL_HOST_API constexpr stream_ref(const ::cuda::stream_ref& __other) noexcept + _CCCL_API constexpr stream_ref(const ::cuda::stream_ref& __other) noexcept : ::cuda::stream_ref(__other) {} @@ -63,7 +64,7 @@ struct stream_ref : ::cuda::stream_ref //! @brief Returns a \c execution::sender that completes on this stream. //! //! @note Equivalent to `execution::schedule(execution::stream_scheduler{*this})`. - _CCCL_HOST_API auto schedule() const noexcept; + _CCCL_API auto schedule() const noexcept; //! @brief Get the logical device under which this stream was created. //! @@ -108,8 +109,26 @@ struct stream_ref : ::cuda::stream_ref return execution::forward_progress_guarantee::weakly_parallel; } - [[nodiscard]] _CCCL_API constexpr auto query(const execution::get_domain_t&) const noexcept - -> execution::stream_domain; + [[nodiscard]] _CCCL_API constexpr auto query(const execution::get_completion_behavior_t&) const noexcept + { + return execution::completion_behavior::asynchronous; + } + + [[nodiscard]] _CCCL_API constexpr auto + query(const execution::get_completion_scheduler_t&) const noexcept -> stream_ref; + + template + [[nodiscard]] _CCCL_API constexpr auto + query(const execution::get_completion_scheduler_t&, const _Env& __env) const noexcept + -> execution::__scheduler_of_t; + + [[nodiscard]] _CCCL_API constexpr auto + query(const execution::get_completion_domain_t&) const noexcept -> execution::stream_domain; + + template + [[nodiscard]] _CCCL_API constexpr auto + query(const execution::get_completion_domain_t&, const _Env& __env) const noexcept + -> __call_result_t; }; } // namespace cuda::experimental diff --git a/cudax/test/CMakeLists.txt b/cudax/test/CMakeLists.txt index d6e6c56fe5a..4888de26d87 100644 --- a/cudax/test/CMakeLists.txt +++ b/cudax/test/CMakeLists.txt @@ -25,6 +25,7 @@ function(cudax_add_catch2_test target_name_var test_name cn_target) # ARGN=test target_link_libraries(${test_target} PRIVATE ${cn_target} cccl.c2h.main + cudart ) target_compile_options(${test_target} PRIVATE "-DLIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE" @@ -78,18 +79,20 @@ foreach(cn_target IN LISTS cudax_TARGETS) execution/test_continues_on.cu execution/test_just.cu execution/test_let_value.cu - #execution/test_on.cu + execution/test_on.cu execution/test_sequence.cu execution/test_starts_on.cu - #execution/test_stream_context.cu + execution/test_stream_context.cu execution/test_then.cu execution/test_visit.cu execution/test_when_all.cu execution/test_write_attrs.cu execution/test_write_env.cu ) - target_compile_options(${test_target} PRIVATE $<$:--extended-lambda>) - target_compile_options(${test_target} PRIVATE $<$:--expt-relaxed-constexpr>) + + target_compile_options(${test_target} PRIVATE + $<$:-allow-unsupported-compiler> + ) # The stream context needs atomic wait/notify, which is only available on sm70 and # above. diff --git a/cudax/test/execution/test_on.cu b/cudax/test/execution/test_on.cu index 0baec6d7bc3..e896f3f3e78 100644 --- a/cudax/test/execution/test_on.cu +++ b/cudax/test/execution/test_on.cu @@ -8,14 +8,14 @@ // //===----------------------------------------------------------------------===// +// BUGBUG + #include #include "testing.cuh" namespace ex = cudax::execution; -namespace -{ __host__ __device__ bool _on_device() noexcept { NV_IF_ELSE_TARGET(NV_IS_HOST, // @@ -59,10 +59,10 @@ void simple_start_on_stream_test() { cudax::stream str{cuda::device_ref(0)}; auto sch = cudax::stream_ref{str}; - auto sndr = ex::on(sch, ex::just(42) | ex::then([] __host__ __device__(int i) -> int { + auto sndr = ex::on(sch, ex::just(42) | ex::then([] __host__ __device__(int i) noexcept -> int { return _on_device() ? i : -i; })) - | ex::then([] __host__ __device__(int i) -> int { + | ex::then([] __host__ __device__(int i) noexcept -> int { return _on_device() ? -1 : i; }); auto [result] = ex::sync_wait(std::move(sndr)).value(); @@ -73,16 +73,18 @@ void simple_continue_on_stream_test() { cudax::stream str{cuda::device_ref(0)}; auto sch = cudax::stream_ref{str}; - auto sndr = ex::just(42) | ex::on(sch, ex::then([] __host__ __device__(int i) -> int { + auto sndr = ex::just(42) | ex::on(sch, ex::then([] __host__ __device__(int i) noexcept -> int { return _on_device() ? i : -i; })) - | ex::then([] __host__ __device__(int i) -> int { + | ex::then([] __host__ __device__(int i) noexcept -> int { return _on_device() ? -1 : i; }); auto [result] = ex::sync_wait(std::move(sndr)).value(); CUDAX_CHECK(result == 42); } +namespace +{ C2H_TEST("simple on(sch, sndr) thread test", "[on]") { simple_start_on_thread_test(); diff --git a/cudax/test/execution/test_stream_context.cu b/cudax/test/execution/test_stream_context.cu index 101d8fc7df3..af1f25fd179 100644 --- a/cudax/test/execution/test_stream_context.cu +++ b/cudax/test/execution/test_stream_context.cu @@ -15,6 +15,7 @@ #include #include +#include #include @@ -24,8 +25,6 @@ _CCCL_BEGIN_NV_DIAG_SUPPRESS(177) // function "_is_on_device" was declared but n namespace ex = cuda::experimental::execution; -namespace -{ __host__ __device__ bool _is_on_device() noexcept { NV_IF_ELSE_TARGET(NV_IS_HOST, // @@ -135,8 +134,10 @@ void bulk_on_stream_scheduler() auto sch = sctx.get_scheduler(); using _env_t = cudax::env_t; - _env_t env{cudax::device_memory_resource{_dev}, cuda::get_stream(sch), ex::par_unseq}; - cudax::async_device_buffer buf{env, 10, 40}; // a device buffer of 10 integers, initialized to 40 + auto mr = cudax::device_memory_resource{_dev}; + auto mr2 = cudax::any_resource(mr); + _env_t env{mr, cuda::get_stream(sch), ex::par_unseq}; + auto buf = cudax::make_async_buffer(sctx, mr2, 10, 40, env); // a device buffer of 10 integers, initialized to 40 cuda::std::span data{buf}; auto start = // @@ -153,7 +154,8 @@ void bulk_on_stream_scheduler() data[i] += 2; }); - cudax::async_device_buffer expected{env, 10, 42}; // a device buffer of 10 integers, initialized to 42 + auto expected = cudax::make_async_buffer(sctx, mr2, 10, 42, env); // a device buffer of 10 integers, initialized + // to 42 // start the sender and wait for it to finish auto [span] = ex::sync_wait(std::move(start)).value(); @@ -206,6 +208,8 @@ void starts_on_with_stream_scheduler2() CHECK(i == 43); } +namespace +{ // Test code is placed in separate functions to avoid an nvc++ issue with // extended lambdas in functions with internal linkage (as is the case // with C2H tests). diff --git a/cudax/test/execution/test_visit.cu b/cudax/test/execution/test_visit.cu index b55fcaf820e..9d8ef1c6e1a 100644 --- a/cudax/test/execution/test_visit.cu +++ b/cudax/test/execution/test_visit.cu @@ -33,13 +33,13 @@ static_assert(cudax_async::structured_binding_size == 2); template struct recursive_lambda { - Fn fn; - template - __host__ __device__ auto operator()(Args&&... args) + auto operator()(Args&&... args) { return fn(*this, cuda::std::forward(args)...); } + + Fn fn; }; template