diff --git a/cudax/examples/stdexec_stream.cu b/cudax/examples/stdexec_stream.cu
index 49e55573589..a8ecc792f37 100644
--- a/cudax/examples/stdexec_stream.cu
+++ b/cudax/examples/stdexec_stream.cu
@@ -17,48 +17,54 @@
 #include <cuda_runtime_api.h>
 
 namespace cudax = cuda::experimental;
-namespace task  = cudax::execution;
+namespace ex    = cudax::execution;
 
 // This example demonstrates how to use the experimental CUDA implementation of
 // C++26's std::execution async tasking framework.
 
-struct say_hello
+int main()
 {
-  __device__ int operator()() const
+  try
   {
-    printf("Hello from lambda on device!\n");
-    return value;
-  }
+    auto tctx = ex::thread_context{};
+    auto sctx = ex::stream_context{cuda::device_ref{0}};
+    auto gpu  = sctx.get_scheduler();
 
-  int value;
-};
+    const auto bulk_shape = 10;
+    const auto bulk_fn    = [] __device__(const int index, int i) noexcept {
+      const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+      if (tid < bulk_shape)
+      {
+        printf("Hello from bulk task on device! index = %d, i = %d\n", index, i);
+      }
+    };
 
-__host__ void run()
-{
-  /*
-  try
-  {
-    task::thread_context tctx;
-    task::stream_context sctx{cuda::device_ref{0}};
-    auto sch = sctx.get_scheduler();
+    auto start =
+      // begin work on the GPU:
+      ex::schedule(gpu)
 
-    auto start = //
-      task::schedule(sch) // begin work on the GPU
-      | task::then(say_hello{42}) // enqueue a function object on the GPU
-      | task::then([] __device__(int i) noexcept -> int { // enqueue a lambda on the GPU
-          printf("Hello again from lambda on device! i = %d\n", i);
-          return i + 1;
+      // execute a device lambda on the GPU:
+      | ex::then([] __device__() noexcept -> int {
+          printf("Hello from lambda on device!\n");
+          return 42;
         })
-      | task::continues_on(tctx.get_scheduler()) // continue work on the CPU
-      | task::then([] __host__ __device__(int i) noexcept -> int { // run a lambda on the CPU
+
+      // do some parallel work on the GPU:
+      | ex::bulk(ex::par, bulk_shape, bulk_fn) //
+
+      // transfer execution back to the CPU:
+      | ex::continues_on(tctx.get_scheduler())
+
+      // execute a host/device lambda on the CPU:
+      | ex::then([] __host__ __device__(int i) noexcept -> int {
           NV_IF_TARGET(NV_IS_HOST,
                        (printf("Hello from lambda on host! i = %d\n", i);),
                        (printf("OOPS! still on the device! i = %d\n", i);))
-          return i;
+          return i + 1;
         });
 
     // run the task, wait for it to finish, and get the result
-    auto [i] = task::sync_wait(std::move(start)).value();
+    auto [i] = ex::sync_wait(std::move(start)).value();
     printf("All done on the host! result = %d\n", i);
   }
   catch (cuda::cuda_error const& e)
@@ -73,10 +79,4 @@ __host__ void run()
   {
     std::printf("Unknown exception\n");
   }
-  */
-}
-
-int main()
-{
-  run();
 }
diff --git a/cudax/include/cuda/experimental/__container/async_buffer.cuh b/cudax/include/cuda/experimental/__container/async_buffer.cuh
index 0078af4a9c1..441cdb0bcf8 100644
--- a/cudax/include/cuda/experimental/__container/async_buffer.cuh
+++ b/cudax/include/cuda/experimental/__container/async_buffer.cuh
@@ -669,7 +669,11 @@ _CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT
 // Size and value make function
 template <class _Tp, class... _Properties, class _Env = ::cuda::std::execution::env<>>
 async_buffer<_Tp, _Properties...> make_async_buffer(
-  stream_ref __stream, any_resource<_Properties...> __mr, size_t __size, const _Tp& __value, const _Env& __env = {})
+  stream_ref __stream,
+  any_resource<_Properties...> __mr,
+  size_t __size,
+  const _Tp& __value,
+  [[maybe_unused]] const _Env& __env = {})
 {
   auto __res = async_buffer<_Tp, _Properties...>{__stream, __mr, __size, no_init};
   __fill_n<_Tp, !::cuda::mr::__is_device_accessible<_Properties...>>(
diff --git a/cudax/include/cuda/experimental/__execution/apply_sender.cuh b/cudax/include/cuda/experimental/__execution/apply_sender.cuh
index 948a514641a..b9c4f93ca8f 100644
--- a/cudax/include/cuda/experimental/__execution/apply_sender.cuh
+++ b/cudax/include/cuda/experimental/__execution/apply_sender.cuh
@@ -63,7 +63,7 @@ public:
   //! @throws Any exception thrown by the underlying domain's `apply_sender`.
   _CCCL_EXEC_CHECK_DISABLE
   template <class _Domain, class _Tag, class _Sndr, class... _Args>
-  _CCCL_NODEBUG_API constexpr auto operator()(_Domain, _Tag, _Sndr&& __sndr, _Args&&... __args) const
+  _CCCL_API constexpr auto operator()(_Domain, _Tag, _Sndr&& __sndr, _Args&&... __args) const
     noexcept(noexcept(__apply_domain_t<_Domain, _Tag, _Sndr, _Args...>{}.apply_sender(
       _Tag{}, static_cast<_Sndr&&>(__sndr), static_cast<_Args&&>(__args)...)))
       -> __apply_sender_result_t<__apply_domain_t<_Domain, _Tag, _Sndr, _Args...>, _Tag, _Sndr, _Args...>
diff --git a/cudax/include/cuda/experimental/__execution/bulk.cuh b/cudax/include/cuda/experimental/__execution/bulk.cuh
index a5289dd7a17..16c1cb3121d 100644
--- a/cudax/include/cuda/experimental/__execution/bulk.cuh
+++ b/cudax/include/cuda/experimental/__execution/bulk.cuh
@@ -68,13 +68,22 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __state_t
 template <class _Sndr, class _Shape>
 struct _CCCL_TYPE_VISIBILITY_DEFAULT __attrs_t
 {
-  [[nodiscard]] _CCCL_HOST_API constexpr auto query(get_launch_config_t) const noexcept
+  [[nodiscard]] _CCCL_HOST_API static constexpr auto __get_launch_config(_Shape __shape) noexcept
   {
     constexpr int __block_threads = 256;
-    const int __grid_blocks       = ::cuda::ceil_div(static_cast<int>(__shape_), __block_threads);
+    const int __grid_blocks       = ::cuda::ceil_div(static_cast<int>(__shape), __block_threads);
     return experimental::make_config(block_dims<__block_threads>(), grid_dims(__grid_blocks));
   }
 
+  using __launch_config_t = decltype(__get_launch_config(_Shape()));
+
+  [[nodiscard]] _CCCL_API constexpr auto query(get_launch_config_t) const noexcept -> __launch_config_t
+  {
+    NV_IF_TARGET(NV_IS_HOST,
+                 (return __get_launch_config(__shape_);),
+                 (_CCCL_ASSERT(false, "cannot get a launch configuration from device"); ::cuda::std::terminate();))
+  }
+
   _CCCL_EXEC_CHECK_DISABLE
   _CCCL_TEMPLATE(class _Query, class... _Args)
   _CCCL_REQUIRES(__forwarding_query<_Query> _CCCL_AND __queryable_with<env_of_t<_Sndr>, _Query, _Args...>)
@@ -152,7 +161,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __bulk_t
       execution::set_stopped(static_cast<_Rcvr&&>(__state_->__rcvr_));
     }
 
-    [[nodiscard]] _CCCL_NODEBUG_API constexpr auto get_env() const noexcept -> __fwd_env_t<env_of_t<_Rcvr>>
+    [[nodiscard]] _CCCL_API constexpr auto get_env() const noexcept -> __fwd_env_t<env_of_t<_Rcvr>>
     {
       return __fwd_env(execution::get_env(__state_->__rcvr_));
     }
@@ -188,7 +197,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __bulk_t
   struct _CCCL_TYPE_VISIBILITY_DEFAULT __closure_base_t
   {
     template <class _Sndr>
-    [[nodiscard]] _CCCL_NODEBUG_API friend constexpr auto operator|(_Sndr&& __sndr, __closure_base_t __self)
+    [[nodiscard]] _CCCL_API friend constexpr auto operator|(_Sndr&& __sndr, __closure_base_t __self)
     {
       static_assert(__is_sender<_Sndr>);
 
@@ -202,7 +211,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __bulk_t
         {{}, static_cast<__closure_base_t&&>(__self), static_cast<_Sndr&&>(__sndr)}};
     }
 
-    _CCCL_NO_UNIQUE_ADDRESS _Policy __policy_;
+    /*_CCCL_NO_UNIQUE_ADDRESS*/ _Policy __policy_;
     _Shape __shape_;
     _Fn __fn_;
   };
@@ -250,7 +259,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __bulk_t
       return {__state_.__shape_, __sndr_};
     }
 
-    _CCCL_NO_UNIQUE_ADDRESS _BulkTag __tag_;
+    /*_CCCL_NO_UNIQUE_ADDRESS*/ _BulkTag __tag_;
     __closure_base_t<_Policy, _Shape, _Fn> __state_;
     _Sndr __sndr_;
   };
@@ -267,7 +276,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __bulk_t
   // This function call operator creates a sender adaptor closure object that can appear
   // on the right-hand side of a pipe operator, like: sndr | bulk(par, shape, fn).
   template <class _Policy, class _Shape, class _Fn>
-  [[nodiscard]] _CCCL_NODEBUG_API auto operator()(_Policy __policy, _Shape __shape, _Fn __fn) const
+  [[nodiscard]] _CCCL_API auto operator()(_Policy __policy, _Shape __shape, _Fn __fn) const
   {
     static_assert(::cuda::std::integral<_Shape>);
     static_assert(::cuda::std::is_execution_policy_v<_Policy>);
@@ -389,7 +398,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT bulk_t : __bulk_t<bulk_t>
   {
     _CCCL_EXEC_CHECK_DISABLE
     template <class... _Ts>
-    _CCCL_NODEBUG_API void operator()(_Shape __begin, _Shape __end, _Ts&&... __values) noexcept(
+    _CCCL_API void operator()(_Shape __begin, _Shape __end, _Ts&&... __values) noexcept(
       __nothrow_callable<_Fn&, _Shape, decltype(__values)&...>)
     {
       for (; __begin != __end; ++__begin)
diff --git a/cudax/include/cuda/experimental/__execution/completion_behavior.cuh b/cudax/include/cuda/experimental/__execution/completion_behavior.cuh
index 4c98e6c1a19..f4e34f696b0 100644
--- a/cudax/include/cuda/experimental/__execution/completion_behavior.cuh
+++ b/cudax/include/cuda/experimental/__execution/completion_behavior.cuh
@@ -139,7 +139,7 @@ struct get_completion_behavior_t
     return __attrs.query(*this, __env);
   }
 
-  [[nodiscard]] _CCCL_NODEBUG_API static constexpr auto query(forwarding_query_t) noexcept -> bool
+  [[nodiscard]] _CCCL_API static constexpr auto query(forwarding_query_t) noexcept -> bool
   {
     return true;
   }
diff --git a/cudax/include/cuda/experimental/__execution/completion_signatures.cuh b/cudax/include/cuda/experimental/__execution/completion_signatures.cuh
index 6a8108b344a..8fcd9a966e7 100644
--- a/cudax/include/cuda/experimental/__execution/completion_signatures.cuh
+++ b/cudax/include/cuda/experimental/__execution/completion_signatures.cuh
@@ -189,7 +189,7 @@ using __make_completion_signatures_t _CCCL_NODEBUG_ALIAS =
   decltype(execution::__make_unique(execution::__normalize(static_cast<_Sigs*>(nullptr))...));
 
 template <class... _ExplicitSigs, class... _DeducedSigs>
-[[nodiscard]] _CCCL_NODEBUG_API _CCCL_CONSTEVAL auto make_completion_signatures(_DeducedSigs*...) noexcept
+[[nodiscard]] _CCCL_API _CCCL_CONSTEVAL auto make_completion_signatures(_DeducedSigs*...) noexcept
   -> __make_completion_signatures_t<_ExplicitSigs..., _DeducedSigs...>
 {
   return {};
@@ -206,7 +206,7 @@ using __concat_completion_signatures_t _CCCL_NODEBUG_ALIAS =
 struct __concat_completion_signatures_fn
 {
   template <class... _Sigs>
-  _CCCL_NODEBUG_API _CCCL_CONSTEVAL auto operator()(const _Sigs&...) const noexcept
+  _CCCL_API _CCCL_CONSTEVAL auto operator()(const _Sigs&...) const noexcept
     -> __concat_completion_signatures_t<_Sigs...>
   {
     return {};
@@ -217,13 +217,13 @@ extern const completion_signatures<>& __empty_completion_signatures;
 
 struct __concat_completion_signatures_impl
 {
-  _CCCL_NODEBUG_API _CCCL_CONSTEVAL auto operator()() const noexcept -> completion_signatures<> (*)()
+  _CCCL_API _CCCL_CONSTEVAL auto operator()() const noexcept -> completion_signatures<> (*)()
   {
     return nullptr;
   }
 
   template <class... _Sigs>
-  _CCCL_NODEBUG_API _CCCL_CONSTEVAL auto operator()(const completion_signatures<_Sigs...>&) const noexcept
+  _CCCL_API _CCCL_CONSTEVAL auto operator()(const completion_signatures<_Sigs...>&) const noexcept
     -> __make_completion_signatures_t<_Sigs...> (*)()
   {
     return nullptr;
@@ -235,7 +235,7 @@ struct __concat_completion_signatures_impl
             class... _Cs,
             class... _Ds,
             class... _Rest>
-  _CCCL_NODEBUG_API _CCCL_CONSTEVAL auto operator()(
+  _CCCL_API _CCCL_CONSTEVAL auto operator()(
     const completion_signatures<_As...>&,
     const completion_signatures<_Bs...>&,
     const completion_signatures<_Cs...>& = __empty_completion_signatures,
@@ -252,7 +252,7 @@ struct __concat_completion_signatures_impl
             class _Cp = ::cuda::std::__ignore_t,
             class _Dp = ::cuda::std::__ignore_t,
             class... _Rest>
-  _CCCL_NODEBUG_API _CCCL_CONSTEVAL auto
+  _CCCL_API _CCCL_CONSTEVAL auto
   operator()(const _Ap&, const _Bp& = {}, const _Cp& = {}, const _Dp& = {}, const _Rest&...) const noexcept
   {
     if constexpr (!__valid_completion_signatures<_Ap>)
@@ -645,7 +645,7 @@ template <class... _What, class... _Values>
 #else // ^^^ constexpr exceptions ^^^ / vvv no constexpr exceptions vvv
 
 template <class... _What, class... _Values>
-[[nodiscard]] _CCCL_NODEBUG_API _CCCL_CONSTEVAL auto invalid_completion_signature(_Values...)
+[[nodiscard]] _CCCL_API _CCCL_CONSTEVAL auto invalid_completion_signature(_Values...)
 {
   return _ERROR<_What...>{};
 }
diff --git a/cudax/include/cuda/experimental/__execution/conditional.cuh b/cudax/include/cuda/experimental/__execution/conditional.cuh
index a01ddec863f..d429937e52c 100644
--- a/cudax/include/cuda/experimental/__execution/conditional.cuh
+++ b/cudax/include/cuda/experimental/__execution/conditional.cuh
@@ -206,17 +206,17 @@ public:
   struct _CCCL_TYPE_VISIBILITY_DEFAULT __sndr_t;
 
   template <class _Sndr, class _Pred, class _Then, class _Else>
-  _CCCL_NODEBUG_API constexpr auto operator()(_Sndr __sndr, _Pred __pred, _Then __then, _Else __else) const;
+  _CCCL_API constexpr auto operator()(_Sndr __sndr, _Pred __pred, _Then __then, _Else __else) const;
 
   template <class _Pred, class _Then, class _Else>
-  _CCCL_NODEBUG_API constexpr auto operator()(_Pred __pred, _Then __then, _Else __else) const;
+  _CCCL_API constexpr auto operator()(_Pred __pred, _Then __then, _Else __else) const;
 };
 
 template <class _Pred, class _Then, class _Else, class _Sndr>
 struct _CCCL_TYPE_VISIBILITY_DEFAULT conditional_t::__sndr_t<conditional_t::__closure_base_t<_Pred, _Then, _Else>, _Sndr>
 {
   using __params_t _CCCL_NODEBUG_ALIAS = conditional_t::__closure_base_t<_Pred, _Then, _Else>;
-  _CCCL_NO_UNIQUE_ADDRESS conditional_t __tag_;
+  /*_CCCL_NO_UNIQUE_ADDRESS*/ conditional_t __tag_;
   __params_t __params_;
   _Sndr __sndr_;
 
@@ -260,7 +260,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT conditional_t::__closure_base_t
   _Else on_false;
 
   template <class _Sndr>
-  _CCCL_NODEBUG_API auto __mk_sender(_Sndr&& __sndr) -> __sndr_t<__closure_base_t, _Sndr>
+  _CCCL_API auto __mk_sender(_Sndr&& __sndr) -> __sndr_t<__closure_base_t, _Sndr>
   {
     using __sndr_t = conditional_t::__sndr_t<__closure_base_t, _Sndr>;
 
@@ -275,20 +275,20 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT conditional_t::__closure_base_t
   }
 
   template <class _Sndr>
-  _CCCL_NODEBUG_API auto operator()(_Sndr __sndr) -> __sndr_t<__closure_base_t, _Sndr>
+  _CCCL_API auto operator()(_Sndr __sndr) -> __sndr_t<__closure_base_t, _Sndr>
   {
     return __mk_sender(static_cast<_Sndr&&>(__sndr));
   }
 
   template <class _Sndr>
-  _CCCL_NODEBUG_API friend auto operator|(_Sndr __sndr, __closure_base_t __self) -> __sndr_t<__closure_base_t, _Sndr>
+  _CCCL_API friend auto operator|(_Sndr __sndr, __closure_base_t __self) -> __sndr_t<__closure_base_t, _Sndr>
   {
     return __self.__mk_sender(static_cast<_Sndr&&>(__sndr));
   }
 };
 
 template <class _Sndr, class _Pred, class _Then, class _Else>
-_CCCL_NODEBUG_API constexpr auto conditional_t::operator()(_Sndr __sndr, _Pred __pred, _Then __then, _Else __else) const
+_CCCL_API constexpr auto conditional_t::operator()(_Sndr __sndr, _Pred __pred, _Then __then, _Else __else) const
 {
   using __params_t _CCCL_NODEBUG_ALIAS = __closure_base_t<_Pred, _Then, _Else>;
   __params_t __params{static_cast<_Pred&&>(__pred), static_cast<_Then&&>(__then), static_cast<_Else&&>(__else)};
@@ -296,7 +296,7 @@ _CCCL_NODEBUG_API constexpr auto conditional_t::operator()(_Sndr __sndr, _Pred _
 }
 
 template <class _Pred, class _Then, class _Else>
-_CCCL_NODEBUG_API constexpr auto conditional_t::operator()(_Pred __pred, _Then __then, _Else __else) const
+_CCCL_API constexpr auto conditional_t::operator()(_Pred __pred, _Then __then, _Else __else) const
 {
   return __closure_base_t<_Pred, _Then, _Else>{
     static_cast<_Pred&&>(__pred), static_cast<_Then&&>(__then), static_cast<_Else&&>(__else)};
diff --git a/cudax/include/cuda/experimental/__execution/continues_on.cuh b/cudax/include/cuda/experimental/__execution/continues_on.cuh
index dc9eae3145f..51fc6fc44e1 100644
--- a/cudax/include/cuda/experimental/__execution/continues_on.cuh
+++ b/cudax/include/cuda/experimental/__execution/continues_on.cuh
@@ -177,17 +177,14 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT continues_on_t
     _CCCL_API void __set_result(_Tag, _As&&... __as) noexcept
     {
       using __tupl_t _CCCL_NODEBUG_ALIAS = ::cuda::std::__tuple<_Tag, decay_t<_As>...>;
-      if constexpr (__nothrow_decay_copyable<_As...>)
+      _CCCL_TRY
       {
         __state_->__result_.template __emplace<__tupl_t>(_Tag{}, static_cast<_As&&>(__as)...);
       }
-      else
+      _CCCL_CATCH_ALL
       {
-        _CCCL_TRY
-        {
-          __state_->__result_.template __emplace<__tupl_t>(_Tag{}, static_cast<_As&&>(__as)...);
-        }
-        _CCCL_CATCH_ALL
+        // Avoid ODR-using this completion operation if this code path is not taken.
+        if constexpr (!__nothrow_decay_copyable<_As...>)
         {
           execution::set_error(static_cast<_Rcvr&&>(__state_->__rcvr_), ::std::current_exception());
         }
@@ -315,7 +312,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT continues_on_t
     _CCCL_EXEC_CHECK_DISABLE
     template <class... _Env>
     [[nodiscard]] _CCCL_API constexpr auto query(get_domain_override_t, _Env&&...) const noexcept
-      -> __call_result_t<get_completion_domain_t<set_value_t>, env_of_t<_Sndr>, _Env...>
+      -> __completion_domain_of_t<set_value_t, _Sndr, __fwd_env_t<_Env>...>
     {
       return {};
     }
@@ -382,6 +379,8 @@ public:
   }
 };
 
+//////////////////////////////////////////////////////////////////////////////////////////
+// continues_on sender
 template <class _Sch, class _Sndr>
 struct _CCCL_TYPE_VISIBILITY_DEFAULT continues_on_t::__sndr_t
 {
@@ -424,7 +423,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT continues_on_t::__sndr_t
     return __attrs_t<_Sch, _Sndr>{this};
   }
 
-  _CCCL_NO_UNIQUE_ADDRESS continues_on_t __tag_;
+  /*_CCCL_NO_UNIQUE_ADDRESS*/ continues_on_t __tag_;
   _Sch __sch_;
   _Sndr __sndr_;
 };
diff --git a/cudax/include/cuda/experimental/__execution/cpos.cuh b/cudax/include/cuda/experimental/__execution/cpos.cuh
index 2e679caa89c..2adedee35c1 100644
--- a/cudax/include/cuda/experimental/__execution/cpos.cuh
+++ b/cudax/include/cuda/experimental/__execution/cpos.cuh
@@ -37,13 +37,13 @@ template <__disposition _Disposition>
 struct __completion_tag
 {
   template <__disposition _OtherDisposition>
-  _CCCL_NODEBUG_API constexpr auto operator==(__completion_tag<_OtherDisposition>) const noexcept -> bool
+  _CCCL_TRIVIAL_API constexpr auto operator==(__completion_tag<_OtherDisposition>) const noexcept -> bool
   {
     return _Disposition == _OtherDisposition;
   }
 
   template <__disposition _OtherDisposition>
-  _CCCL_NODEBUG_API constexpr auto operator!=(__completion_tag<_OtherDisposition>) const noexcept -> bool
+  _CCCL_TRIVIAL_API constexpr auto operator!=(__completion_tag<_OtherDisposition>) const noexcept -> bool
   {
     return _Disposition != _OtherDisposition;
   }
@@ -63,7 +63,7 @@ struct set_value_t : __completion_tag<__disposition::__value>
   _CCCL_EXEC_CHECK_DISABLE
   _CCCL_TEMPLATE(class _Rcvr, class... _Ts)
   _CCCL_REQUIRES(__has_set_value_mbr<_Rcvr, _Ts...>)
-  _CCCL_NODEBUG_API constexpr void operator()(_Rcvr&& __rcvr, _Ts&&... __ts) const noexcept
+  _CCCL_TRIVIAL_API constexpr void operator()(_Rcvr&& __rcvr, _Ts&&... __ts) const noexcept
   {
     static_assert(__same_as<decltype(static_cast<_Rcvr&&>(__rcvr).set_value(static_cast<_Ts&&>(__ts)...)), void>);
     static_assert(noexcept(static_cast<_Rcvr&&>(__rcvr).set_value(static_cast<_Ts&&>(__ts)...)));
@@ -83,7 +83,7 @@ struct set_error_t : __completion_tag<__disposition::__error>
   _CCCL_EXEC_CHECK_DISABLE
   _CCCL_TEMPLATE(class _Rcvr, class _Ey)
   _CCCL_REQUIRES(__has_set_error_mbr<_Rcvr, _Ey>)
-  _CCCL_NODEBUG_API constexpr void operator()(_Rcvr&& __rcvr, _Ey&& __e) const noexcept
+  _CCCL_TRIVIAL_API constexpr void operator()(_Rcvr&& __rcvr, _Ey&& __e) const noexcept
   {
     static_assert(__same_as<decltype(static_cast<_Rcvr&&>(__rcvr).set_error(static_cast<_Ey&&>(__e))), void>);
     static_assert(noexcept(static_cast<_Rcvr&&>(__rcvr).set_error(static_cast<_Ey&&>(__e))));
@@ -103,7 +103,7 @@ struct set_stopped_t : __completion_tag<__disposition::__stopped>
   _CCCL_EXEC_CHECK_DISABLE
   _CCCL_TEMPLATE(class _Rcvr)
   _CCCL_REQUIRES(__has_set_stopped_mbr<_Rcvr>)
-  _CCCL_NODEBUG_API constexpr void operator()(_Rcvr&& __rcvr) const noexcept
+  _CCCL_TRIVIAL_API constexpr void operator()(_Rcvr&& __rcvr) const noexcept
   {
     static_assert(__same_as<decltype(static_cast<_Rcvr&&>(__rcvr).set_stopped()), void>);
     static_assert(noexcept(static_cast<_Rcvr&&>(__rcvr).set_stopped()));
@@ -123,7 +123,7 @@ struct start_t
   _CCCL_EXEC_CHECK_DISABLE
   _CCCL_TEMPLATE(class _OpState)
   _CCCL_REQUIRES(__has_start_mbr<_OpState>)
-  _CCCL_NODEBUG_API constexpr void operator()(_OpState& __opstate) const noexcept
+  _CCCL_TRIVIAL_API constexpr void operator()(_OpState& __opstate) const noexcept
   {
     static_assert(__same_as<decltype(__opstate.start()), void>);
     static_assert(noexcept(__opstate.start()));
@@ -134,8 +134,9 @@ struct start_t
 // connect
 struct connect_t
 {
+  _CCCL_EXEC_CHECK_DISABLE
   template <class _Sndr, class _Rcvr>
-  [[nodiscard]] _CCCL_API constexpr auto operator()(_Sndr&& __sndr, _Rcvr __rcvr) const
+  [[nodiscard]] _CCCL_TRIVIAL_API constexpr auto operator()(_Sndr&& __sndr, _Rcvr __rcvr) const
     noexcept(noexcept(transform_sender(declval<_Sndr>(), get_env(declval<_Rcvr>())).connect(declval<_Rcvr>())))
       -> decltype(transform_sender(declval<_Sndr>(), get_env(declval<_Rcvr>())).connect(declval<_Rcvr>()))
   {
@@ -147,7 +148,7 @@ struct schedule_t
 {
   _CCCL_EXEC_CHECK_DISABLE
   template <class _Sch>
-  _CCCL_NODEBUG_API constexpr auto operator()(_Sch&& __sch) const noexcept
+  _CCCL_TRIVIAL_API constexpr auto operator()(_Sch&& __sch) const noexcept
   {
     static_assert(noexcept(static_cast<_Sch&&>(__sch).schedule()));
     return static_cast<_Sch&&>(__sch).schedule();
diff --git a/cudax/include/cuda/experimental/__execution/domain.cuh b/cudax/include/cuda/experimental/__execution/domain.cuh
index 0c67ac7ab4c..db3e269bc94 100644
--- a/cudax/include/cuda/experimental/__execution/domain.cuh
+++ b/cudax/include/cuda/experimental/__execution/domain.cuh
@@ -89,7 +89,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT default_domain
   //! @return The result of applying the sender operation.
   _CCCL_EXEC_CHECK_DISABLE
   template <class _Tag, class _Sndr, class... _Args>
-  _CCCL_NODEBUG_API static constexpr auto apply_sender(_Tag, _Sndr&& __sndr, _Args&&... __args) noexcept(
+  _CCCL_API static constexpr auto apply_sender(_Tag, _Sndr&& __sndr, _Args&&... __args) noexcept(
     noexcept(_Tag{}.apply_sender(declval<_Sndr>(), declval<_Args>()...))) //
     -> __apply_sender_result_t<_Tag, _Sndr, _Args...>
   {
@@ -107,7 +107,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT default_domain
   _CCCL_EXEC_CHECK_DISABLE
   _CCCL_TEMPLATE(class _OpTag, class _Sndr, class _Env)
   _CCCL_REQUIRES(__has_transform_sender<tag_of_t<_Sndr>, _OpTag, _Sndr, _Env>)
-  [[nodiscard]] _CCCL_NODEBUG_API static constexpr auto transform_sender(_OpTag, _Sndr&& __sndr, const _Env& __env) //
+  [[nodiscard]] _CCCL_API static constexpr auto transform_sender(_OpTag, _Sndr&& __sndr, const _Env& __env) //
     noexcept(__nothrow_transform_sender<tag_of_t<_Sndr>, _OpTag, _Sndr, _Env>)
       -> __transform_sender_result_t<tag_of_t<_Sndr>, _OpTag, _Sndr, _Env>
   {
@@ -117,7 +117,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT default_domain
   //! @overload
   _CCCL_EXEC_CHECK_DISABLE
   template <class _Sndr>
-  [[nodiscard]] _CCCL_NODEBUG_API static constexpr auto
+  [[nodiscard]] _CCCL_API static constexpr auto
   transform_sender(::cuda::std::__ignore_t, _Sndr&& __sndr, ::cuda::std::__ignore_t = {}) //
     noexcept(__nothrow_movable<_Sndr>) -> _Sndr
   {
@@ -149,10 +149,10 @@ private:
 };
 
 template <class _Env>
-struct __hide_scheduler : __hide_query<_Env, get_scheduler_t>
+struct __hide_scheduler : __hide_query<_Env, get_scheduler_t, get_domain_t>
 {
   _CCCL_API explicit constexpr __hide_scheduler(_Env&& __env) noexcept
-      : __hide_query<_Env, get_scheduler_t>{static_cast<_Env&&>(__env), {}}
+      : __hide_query<_Env, get_scheduler_t, get_domain_t>{static_cast<_Env&&>(__env), {}, {}}
   {}
 };
 
@@ -186,10 +186,9 @@ using __scheduler_domain_t _CCCL_NODEBUG_ALIAS = decay_t<decltype(__detail::__ge
 struct get_domain_t
 {
   //! @brief If there is a @c get_domain_t query in @c __env, return it.
-  _CCCL_EXEC_CHECK_DISABLE
   _CCCL_TEMPLATE(class _Env)
   _CCCL_REQUIRES(__queryable_with<_Env, get_domain_t>)
-  [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(const _Env&) const noexcept
+  [[nodiscard]] _CCCL_API constexpr auto operator()(const _Env&) const noexcept
     -> decay_t<__query_result_t<_Env, get_domain_t>>
   {
     using __domain_t = decay_t<__query_result_t<_Env, get_domain_t>>;
@@ -200,10 +199,9 @@ struct get_domain_t
   //! @brief If there is not a @c get_domain_t query in @c __env, but there is a
   //! scheduler, return the domain of the scheduler if it has one, and @c default_domain
   //! otherwise.
-  _CCCL_EXEC_CHECK_DISABLE
   _CCCL_TEMPLATE(class _Env)
   _CCCL_REQUIRES((!__queryable_with<_Env, get_domain_t>) _CCCL_AND __callable<get_scheduler_t, const _Env&>)
-  [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(const _Env&) const noexcept
+  [[nodiscard]] _CCCL_API constexpr auto operator()(const _Env&) const noexcept
   {
     using __sch_t      = __scheduler_of_t<const _Env&>;
     using __env_t      = __hide_scheduler<const _Env&>; // to prevent recursion
@@ -213,7 +211,7 @@ struct get_domain_t
     return __domain_t{};
   }
 
-  _CCCL_NODEBUG_API static constexpr auto query(forwarding_query_t) noexcept
+  _CCCL_API static constexpr auto query(forwarding_query_t) noexcept
   {
     return true;
   }
@@ -342,10 +340,7 @@ _CCCL_GLOBAL_CONSTANT get_completion_domain_t<set_error_t> get_completion_domain
 template <>
 _CCCL_GLOBAL_CONSTANT get_completion_domain_t<set_stopped_t> get_completion_domain<set_stopped_t>{};
 
-template <class _Tag, class _Sndr, class... _Env>
-using __completion_domain_of_t = __call_result_t<get_completion_domain_t<_Tag>, env_of_t<_Sndr>, const _Env&...>;
-
-// Used by the schedule_from and continues_on senders
+// Used by the continues_on sender
 struct get_domain_override_t
 {
   _CCCL_EXEC_CHECK_DISABLE
diff --git a/cudax/include/cuda/experimental/__execution/env.cuh b/cudax/include/cuda/experimental/__execution/env.cuh
index 06eb5fa4d80..97267644143 100644
--- a/cudax/include/cuda/experimental/__execution/env.cuh
+++ b/cudax/include/cuda/experimental/__execution/env.cuh
@@ -80,32 +80,32 @@ namespace __detail
 {
 struct _CCCL_TYPE_VISIBILITY_DEFAULT __env_ref_fn
 {
-  [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(env<>) const noexcept -> env<>
+  [[nodiscard]] _CCCL_API constexpr auto operator()(env<>) const noexcept -> env<>
   {
     return {};
   }
 
   _CCCL_TEMPLATE(class _Env, class = _Env*) // not considered if _Env is a reference type
   _CCCL_REQUIRES((!::cuda::__is_specialization_of_v<_Env, __fwd_env_>) )
-  [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(_Env&& __env) const noexcept -> _Env
+  [[nodiscard]] _CCCL_API constexpr auto operator()(_Env&& __env) const noexcept -> _Env
   {
     return static_cast<_Env&&>(__env);
   }
 
   template <class _Env>
-  [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(const _Env& __env) const noexcept -> __env_ref_<_Env>
+  [[nodiscard]] _CCCL_API constexpr auto operator()(const _Env& __env) const noexcept -> __env_ref_<_Env>
   {
     return __env_ref_<_Env>{__env};
   }
 
   template <class _Env>
-  [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(__env_ref_<_Env> __env) const noexcept -> __env_ref_<_Env>
+  [[nodiscard]] _CCCL_API constexpr auto operator()(__env_ref_<_Env> __env) const noexcept -> __env_ref_<_Env>
   {
     return __env;
   }
 
   template <class _Env>
-  [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(const __fwd_env_<_Env>& __env) const noexcept
+  [[nodiscard]] _CCCL_API constexpr auto operator()(const __fwd_env_<_Env>& __env) const noexcept
     -> __fwd_env_<_Env const&>
   {
     return __fwd_env_<_Env const&>{__env.__env_};
@@ -142,20 +142,19 @@ namespace __detail
 {
 struct _CCCL_TYPE_VISIBILITY_DEFAULT __fwd_env_fn
 {
-  [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(env<>) const noexcept -> env<>
+  [[nodiscard]] _CCCL_API constexpr auto operator()(env<>) const noexcept -> env<>
   {
     return {};
   }
 
   template <class _Env>
-  [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(__env_ref_<_Env> __env) const noexcept
-    -> __fwd_env_<_Env const&>
+  [[nodiscard]] _CCCL_API constexpr auto operator()(__env_ref_<_Env> __env) const noexcept -> __fwd_env_<_Env const&>
   {
     return __fwd_env_<_Env const&>{__env.__env_};
   }
 
   template <class _Env>
-  [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(_Env&& __env) const noexcept
+  [[nodiscard]] _CCCL_API constexpr auto operator()(_Env&& __env) const noexcept
   {
     static_assert(__nothrow_movable<_Env>);
     // If the environment is already a forwarding environment, we can just return it.
@@ -181,11 +180,8 @@ _CCCL_GLOBAL_CONSTANT __detail::__fwd_env_fn __fwd_env{};
 
 //! @brief __sch_env_t is a utility that builds an environment from a scheduler. It
 //! defines the `get_scheduler` query and provides a default for the `get_domain` query.
-template <class _Sch, class... _PrevSch>
-struct _CCCL_TYPE_VISIBILITY_DEFAULT __sch_env_t;
-
 template <class _Sch>
-struct _CCCL_TYPE_VISIBILITY_DEFAULT __sch_env_t<_Sch>
+struct _CCCL_TYPE_VISIBILITY_DEFAULT __sch_env_t
 {
   [[nodiscard]] _CCCL_API constexpr auto query(get_scheduler_t) const noexcept -> _Sch
   {
@@ -200,39 +196,18 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __sch_env_t<_Sch>
   _Sch __sch_;
 };
 
-template <class _Sch, class _PrevSch>
-struct _CCCL_TYPE_VISIBILITY_DEFAULT __sch_env_t<_Sch, _PrevSch>
-{
-  [[nodiscard]] _CCCL_API constexpr auto query(get_scheduler_t) const noexcept -> _Sch
-  {
-    return __sch_;
-  }
-
-  [[nodiscard]] _CCCL_API constexpr auto query(get_domain_t) const noexcept
-  {
-    return __query_result_or_t<_Sch, get_completion_domain_t<set_value_t>, default_domain>{};
-  }
-
-  [[nodiscard]] _CCCL_API constexpr auto query(get_previous_scheduler_t) const noexcept -> _PrevSch
-  {
-    return __prev_sch_;
-  }
-
-  _Sch __sch_;
-  _PrevSch __prev_sch_;
-};
-
-template <class _Sch, class... _PrevSch>
-_CCCL_HOST_DEVICE __sch_env_t(_Sch, _PrevSch...) -> __sch_env_t<_Sch, _PrevSch...>;
+template <class _Sch>
+_CCCL_HOST_DEVICE __sch_env_t(_Sch) -> __sch_env_t<_Sch>;
 
 struct __mk_sch_env_t
 {
   template <class _Sch, class... _Env>
-  [[nodiscard]] _CCCL_API constexpr auto operator()(_Sch __sch, const _Env&... __env) const noexcept
+  [[nodiscard]] _CCCL_API constexpr auto operator()([[maybe_unused]] _Sch __sch, const _Env&... __env) const noexcept
   {
-    if constexpr ((__callable<get_scheduler_t, const _Env&> || ...))
+    if constexpr (__completes_inline<env_of_t<schedule_result_t<_Sch>>, _Env...>
+                  && (__callable<get_scheduler_t, const _Env&> || ...))
     {
-      return __sch_env_t{__sch, get_scheduler(__env)...};
+      return __sch_env_t{get_scheduler(__env)...};
     }
     else
     {
@@ -272,17 +247,14 @@ _CCCL_HOST_DEVICE __sch_attrs_t(_Sch) -> __sch_attrs_t<_Sch>;
 // __inln_attrs
 
 //! @brief __inln_attrs_t is a utility that builds an attributes queryable for a sender
-//! that completes inline. It delegates the @c get_completion_scheduler and @c get_completion_domain
-//! queries to the receiver's environment.
-//!
-//! @tparam _Tags The completion tags for which @c get_completion_signatures should return
-//! the current scheduler, and @c get_completion_domain should return the current domain.
-template <class... _Tags>
+//! that completes inline. It implements get_completion_behavior to return
+//! completion_behavior::inline_completion, and relies on the logic of
+//! get_completion_scheduler and get_completion_domain to provide the current scheduler
+//! and domain based on the environment.
 struct _CCCL_TYPE_VISIBILITY_DEFAULT __inln_attrs_t
 {
-  _CCCL_TEMPLATE(class _Tag, class _Env, class _Sch = __scheduler_of_t<_Env>)
-  _CCCL_REQUIRES(__one_of<_Tag, _Tags...> _CCCL_AND __callable<get_scheduler_t, const _Env&>)
-  [[nodiscard]] _CCCL_API constexpr auto query(get_completion_scheduler_t<_Tag>, const _Env& __env) const noexcept
+  template <class _Tag, class _Env, class _Sch = __scheduler_of_t<_Env>>
+  [[nodiscard]] _CCCL_API constexpr auto query(get_completion_scheduler_t<set_value_t>, const _Env& __env) const noexcept
     -> __call_result_or_t<get_completion_scheduler_t<set_value_t>, _Sch, _Sch, __hide_scheduler<const _Env&>>
   {
     _Sch __sch = get_scheduler(__env);
@@ -291,9 +263,8 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __inln_attrs_t
     return __call_or(get_completion_scheduler<set_value_t>, __sch, __sch, __hide_scheduler{__env});
   }
 
-  _CCCL_TEMPLATE(class _Tag, class _Env)
-  _CCCL_REQUIRES(__one_of<_Tag, _Tags...> _CCCL_AND __callable<get_domain_t, const _Env&>)
-  [[nodiscard]] _CCCL_API constexpr auto query(get_completion_domain_t<_Tag>, const _Env& __env) const noexcept
+  template <class _Env>
+  [[nodiscard]] _CCCL_API constexpr auto query(get_completion_domain_t<set_value_t>, const _Env& __env) const noexcept
     -> __call_result_t<get_domain_t, const _Env&>
   {
     return {};
diff --git a/cudax/include/cuda/experimental/__execution/fwd.cuh b/cudax/include/cuda/experimental/__execution/fwd.cuh
index 2b53da29b15..09748559cb0 100644
--- a/cudax/include/cuda/experimental/__execution/fwd.cuh
+++ b/cudax/include/cuda/experimental/__execution/fwd.cuh
@@ -119,7 +119,7 @@ template <class... _Sigs>
 struct _CCCL_TYPE_VISIBILITY_DEFAULT completion_signatures;
 
 template <class _Sndr, class... _Env>
-_CCCL_NODEBUG_API _CCCL_CONSTEVAL auto get_completion_signatures();
+_CCCL_API _CCCL_CONSTEVAL auto get_completion_signatures();
 
 template <class _Sndr, class... _Env>
 using completion_signatures_of_t _CCCL_NODEBUG_ALIAS = decltype(execution::get_completion_signatures<_Sndr, _Env...>());
@@ -193,7 +193,6 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT start_detached_t;
 struct _CCCL_TYPE_VISIBILITY_DEFAULT get_allocator_t;
 struct _CCCL_TYPE_VISIBILITY_DEFAULT get_stop_token_t;
 struct _CCCL_TYPE_VISIBILITY_DEFAULT get_scheduler_t;
-struct _CCCL_TYPE_VISIBILITY_DEFAULT get_previous_scheduler_t;
 struct _CCCL_TYPE_VISIBILITY_DEFAULT get_delegation_scheduler_t;
 struct _CCCL_TYPE_VISIBILITY_DEFAULT get_forward_progress_guarantee_t;
 template <class _Tag>
@@ -211,7 +210,13 @@ template <class _Env>
 using __scheduler_of_t _CCCL_NODEBUG_ALIAS = decay_t<__call_result_t<get_scheduler_t, _Env>>;
 
 template <class _Env>
-using __previous_scheduler_of_t _CCCL_NODEBUG_ALIAS = decay_t<__call_result_t<get_previous_scheduler_t, _Env>>;
+using __domain_of_t _CCCL_NODEBUG_ALIAS = decay_t<::cuda::std::__type_call<
+  ::cuda::std::__type_try_catch<::cuda::std::__type_bind_front_quote<__call_result_t, get_domain_t>,
+                                ::cuda::std::__type_always<default_domain>>,
+  _Env>>;
+
+template <class _Tag, class _Sndr, class... _Env>
+using __completion_domain_of_t = __call_result_t<get_completion_domain_t<_Tag>, env_of_t<_Sndr>, _Env...>;
 
 // get_forward_progress_guarantee:
 enum class forward_progress_guarantee
@@ -226,7 +231,7 @@ namespace __detail
 struct __get_tag
 {
   template <class _Tag, class... _Child>
-  _CCCL_NODEBUG_API constexpr auto operator()(int, _Tag, ::cuda::std::__ignore_t, _Child&&...) const -> _Tag
+  _CCCL_API constexpr auto operator()(int, _Tag, ::cuda::std::__ignore_t, _Child&&...) const -> _Tag
   {
     return _Tag{};
   }
@@ -236,7 +241,8 @@ template <class _Sndr, class _Tag = __visit_result_t<__get_tag&, _Sndr, int&>>
 extern __fn_ptr_t<_Tag> __tag_of_v;
 } // namespace __detail
 
-template <class _Sndr>
+_CCCL_TEMPLATE(class _Sndr)
+_CCCL_REQUIRES(__is_sender<_Sndr>)
 using tag_of_t _CCCL_NODEBUG_ALIAS = decltype(__detail::__tag_of_v<_Sndr>());
 
 template <class _Sndr, class... _Tag>
diff --git a/cudax/include/cuda/experimental/__execution/get_completion_signatures.cuh b/cudax/include/cuda/experimental/__execution/get_completion_signatures.cuh
index 8cab2d7d058..3940b00aaa5 100644
--- a/cudax/include/cuda/experimental/__execution/get_completion_signatures.cuh
+++ b/cudax/include/cuda/experimental/__execution/get_completion_signatures.cuh
@@ -94,7 +94,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __sender_type_check_failure //
 private:
   friend struct __compile_time_error<__sender_type_check_failure>;
 
-  _CCCL_NODEBUG_API constexpr auto __what() const noexcept -> const char*
+  _CCCL_API constexpr auto __what() const noexcept -> const char*
   {
     return "This sender is not well-formed. It does not meet the requirements of a sender type.";
   }
@@ -111,7 +111,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT dependent_sender_error : __compile_time_err
 private:
   friend struct __compile_time_error<dependent_sender_error>;
 
-  [[nodiscard]] _CCCL_NODEBUG_API constexpr auto __what() const noexcept -> char const*
+  [[nodiscard]] _CCCL_API constexpr auto __what() const noexcept -> char const*
   {
     return __what_;
   }
@@ -122,7 +122,7 @@ private:
 template <class _Sndr>
 struct _CCCL_TYPE_VISIBILITY_DEFAULT __dependent_sender_error : dependent_sender_error
 {
-  _CCCL_NODEBUG_API constexpr __dependent_sender_error() noexcept
+  _CCCL_API constexpr __dependent_sender_error() noexcept
       : dependent_sender_error{"This sender needs to know its execution " //
                                "environment before it can know how it will complete."}
   {}
@@ -192,7 +192,7 @@ template <class... _Sndr>
     else
 
 template <class... _Sndr>
-[[nodiscard]] _CCCL_NODEBUG_API _CCCL_CONSTEVAL auto __dependent_sender() -> __dependent_sender_error<_Sndr...>
+[[nodiscard]] _CCCL_API _CCCL_CONSTEVAL auto __dependent_sender() -> __dependent_sender_error<_Sndr...>
 {
   return __dependent_sender_error<_Sndr...>{};
 }
@@ -216,7 +216,7 @@ struct _A_GET_COMPLETION_SIGNATURES_CUSTOMIZATION_RETURNED_A_TYPE_THAT_IS_NOT_A_
 {};
 
 template <class _Completions>
-_CCCL_NODEBUG_API _CCCL_CONSTEVAL auto __checked_complsigs()
+_CCCL_API _CCCL_CONSTEVAL auto __checked_complsigs()
 {
   _CUDAX_LET_COMPLETIONS(auto(__cs) = _Completions())
   {
@@ -260,7 +260,7 @@ struct _COULD_NOT_DETERMINE_COMPLETION_SIGNATURES_FOR_THIS_SENDER
 
 _CCCL_EXEC_CHECK_DISABLE
 template <class _Sndr, class... _Env>
-[[nodiscard]] _CCCL_NODEBUG_API _CCCL_CONSTEVAL auto __get_completion_signatures_helper()
+[[nodiscard]] _CCCL_API _CCCL_CONSTEVAL auto __get_completion_signatures_helper()
 {
   if constexpr (__has_get_completion_signatures<_Sndr, _Env...>)
   {
@@ -288,7 +288,7 @@ template <class _Sndr, class... _Env>
 }
 
 template <class _Sndr, class... _Env>
-[[nodiscard]] _CCCL_NODEBUG_API _CCCL_CONSTEVAL auto get_completion_signatures()
+[[nodiscard]] _CCCL_API _CCCL_CONSTEVAL auto get_completion_signatures()
 {
   static_assert(sizeof...(_Env) <= 1, "At most one environment is allowed.");
   if constexpr (0 == sizeof...(_Env))
@@ -304,7 +304,7 @@ template <class _Sndr, class... _Env>
 }
 
 template <class _Parent, class _Child, class... _Env>
-[[nodiscard]] _CCCL_NODEBUG_API _CCCL_CONSTEVAL auto get_child_completion_signatures()
+[[nodiscard]] _CCCL_API _CCCL_CONSTEVAL auto get_child_completion_signatures()
 {
   return get_completion_signatures<::cuda::std::__copy_cvref_t<_Parent, _Child>, __fwd_env_t<_Env>...>();
 }
diff --git a/cudax/include/cuda/experimental/__execution/inline_scheduler.cuh b/cudax/include/cuda/experimental/__execution/inline_scheduler.cuh
index 24700597d4b..c0714f8cf53 100644
--- a/cudax/include/cuda/experimental/__execution/inline_scheduler.cuh
+++ b/cudax/include/cuda/experimental/__execution/inline_scheduler.cuh
@@ -36,10 +36,10 @@
 namespace cuda::experimental::execution
 {
 //! Scheduler that returns a sender that always completes inline (successfully).
-struct _CCCL_TYPE_VISIBILITY_DEFAULT inline_scheduler : __inln_attrs_t<set_value_t>
+struct _CCCL_TYPE_VISIBILITY_DEFAULT inline_scheduler : __inln_attrs_t
 {
 private:
-  struct _CCCL_TYPE_VISIBILITY_DEFAULT __attrs_t : __inln_attrs_t<set_value_t>
+  struct _CCCL_TYPE_VISIBILITY_DEFAULT __attrs_t : __inln_attrs_t
   {};
 
   template <class _Rcvr>
diff --git a/cudax/include/cuda/experimental/__execution/just.cuh b/cudax/include/cuda/experimental/__execution/just.cuh
index e78f02d1702..0aa9193f379 100644
--- a/cudax/include/cuda/experimental/__execution/just.cuh
+++ b/cudax/include/cuda/experimental/__execution/just.cuh
@@ -82,7 +82,7 @@ private:
 public:
   _CCCL_EXEC_CHECK_DISABLE
   template <class... _Ts>
-  _CCCL_NODEBUG_API constexpr auto operator()(_Ts... __ts) const;
+  _CCCL_API constexpr auto operator()(_Ts... __ts) const;
 };
 
 struct just_t : __just_t<just_t, set_value_t>
@@ -132,10 +132,10 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __just_t<_JustTag, _SetTag>::__sndr_base_t
 
   [[nodiscard]] _CCCL_API static constexpr auto get_env() noexcept
   {
-    return __inln_attrs_t<__set_tag_t>{};
+    return __inln_attrs_t{};
   }
 
-  _CCCL_NO_UNIQUE_ADDRESS __just_tag_t __tag_;
+  /*_CCCL_NO_UNIQUE_ADDRESS*/ __just_tag_t __tag_;
   ::cuda::std::__tuple<_Ts...> __values_;
 };
 
@@ -159,7 +159,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT just_stopped_t::__sndr_t
 _CCCL_EXEC_CHECK_DISABLE
 template <class _JustTag, class _SetTag>
 template <class... _Ts>
-_CCCL_NODEBUG_API constexpr auto __just_t<_JustTag, _SetTag>::operator()(_Ts... __ts) const
+_CCCL_API constexpr auto __just_t<_JustTag, _SetTag>::operator()(_Ts... __ts) const
 {
   using __sndr_t = typename _JustTag::template __sndr_t<_Ts...>;
   return __sndr_t{{{}, {static_cast<_Ts&&>(__ts)...}}};
diff --git a/cudax/include/cuda/experimental/__execution/just_from.cuh b/cudax/include/cuda/experimental/__execution/just_from.cuh
index 3b1a2e276db..1d0dcddbe01 100644
--- a/cudax/include/cuda/experimental/__execution/just_from.cuh
+++ b/cudax/include/cuda/experimental/__execution/just_from.cuh
@@ -70,13 +70,13 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __just_from_t
   template <class _Rcvr>
   struct _CCCL_TYPE_VISIBILITY_DEFAULT __complete_fn
   {
-    _Rcvr& __rcvr_;
-
     template <class... _Ts>
-    _CCCL_API auto operator()(_Ts&&... __ts) const noexcept
+    _CCCL_API void operator()(_Ts&&... __ts) const noexcept
     {
       _SetTag{}(static_cast<_Rcvr&&>(__rcvr_), static_cast<_Ts&&>(__ts)...);
     }
+
+    _Rcvr& __rcvr_;
   };
 
   template <class _Rcvr, class _Fn>
@@ -99,7 +99,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __just_from_t
 
 public:
   template <class _Fn>
-  _CCCL_NODEBUG_API constexpr auto operator()(_Fn __fn) const noexcept;
+  _CCCL_API constexpr auto operator()(_Fn __fn) const noexcept;
 };
 
 struct just_from_t : __just_from_t<just_from_t, set_value_t>
@@ -148,10 +148,10 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __just_from_t<_JustFromTag, _SetTag>::__snd
 
   [[nodiscard]] _CCCL_API constexpr auto get_env() const noexcept
   {
-    return __inln_attrs_t<_SetTag>{};
+    return __inln_attrs_t{};
   }
 
-  _CCCL_NO_UNIQUE_ADDRESS __just_from_tag_t __tag_;
+  /*_CCCL_NO_UNIQUE_ADDRESS*/ __just_from_tag_t __tag_;
   _Fn __fn_;
 };
 
@@ -171,7 +171,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT just_stopped_from_t::__sndr_t
 
 template <class _JustFromTag, class _SetTag>
 template <class _Fn>
-_CCCL_NODEBUG_API constexpr auto __just_from_t<_JustFromTag, _SetTag>::operator()(_Fn __fn) const noexcept
+_CCCL_API constexpr auto __just_from_t<_JustFromTag, _SetTag>::operator()(_Fn __fn) const noexcept
 {
   using __sndr_t                          = typename _JustFromTag::template __sndr_t<_Fn>;
   using __completions _CCCL_NODEBUG_ALIAS = __call_result_t<_Fn, __probe_fn>;
diff --git a/cudax/include/cuda/experimental/__execution/lazy.cuh b/cudax/include/cuda/experimental/__execution/lazy.cuh
index 4b44d8daf59..d32854b7fbc 100644
--- a/cudax/include/cuda/experimental/__execution/lazy.cuh
+++ b/cudax/include/cuda/experimental/__execution/lazy.cuh
@@ -91,7 +91,7 @@ template <>
 struct __lazy_tupl<::cuda::std::index_sequence<>>
 {
   template <class _Fn, class _Self, class... _Us>
-  _CCCL_NODEBUG_API static auto __apply(_Fn&& __fn, _Self&&, _Us&&... __us) //
+  _CCCL_API static auto __apply(_Fn&& __fn, _Self&&, _Us&&... __us) //
     noexcept(__nothrow_callable<_Fn, _Us...>) -> __call_result_t<_Fn, _Us...>
   {
     return static_cast<_Fn&&>(__fn)(static_cast<_Us&&>(__us)...);
@@ -104,7 +104,7 @@ struct __lazy_tupl<::cuda::std::index_sequence<_Idx...>, _Ts...> : __detail::__l
   template <size_t _Ny>
   using __at _CCCL_NODEBUG_ALIAS = ::cuda::std::__type_index_c<_Ny, _Ts...>;
 
-  _CCCL_NODEBUG_API __lazy_tupl() noexcept {}
+  _CCCL_API __lazy_tupl() noexcept {}
 
   _CCCL_API ~__lazy_tupl()
   {
@@ -112,13 +112,13 @@ struct __lazy_tupl<::cuda::std::index_sequence<_Idx...>, _Ts...> : __detail::__l
   }
 
   template <size_t _Ny, class _Ty>
-  _CCCL_NODEBUG_API _Ty* __get() noexcept
+  _CCCL_API _Ty* __get() noexcept
   {
     return reinterpret_cast<_Ty*>(this->__detail::__lazy_box<_Ny, _Ty>::__data_);
   }
 
   template <size_t _Ny, class... _Us>
-  _CCCL_NODEBUG_API __at<_Ny>& __emplace(_Us&&... __us) //
+  _CCCL_API __at<_Ny>& __emplace(_Us&&... __us) //
     noexcept(__nothrow_constructible<__at<_Ny>, _Us...>)
   {
     using _Ty _CCCL_NODEBUG_ALIAS = __at<_Ny>;
@@ -128,7 +128,7 @@ struct __lazy_tupl<::cuda::std::index_sequence<_Idx...>, _Ts...> : __detail::__l
   }
 
   template <class _Fn, class _Self, class... _Us>
-  _CCCL_NODEBUG_API static auto __apply(_Fn&& __fn, _Self&& __self, _Us&&... __us) //
+  _CCCL_API static auto __apply(_Fn&& __fn, _Self&& __self, _Us&&... __us) //
     noexcept(__nothrow_callable<_Fn, _Us..., ::cuda::std::__copy_cvref_t<_Self, _Ts>...>)
       -> __call_result_t<_Fn, _Us..., ::cuda::std::__copy_cvref_t<_Self, _Ts>...>
   {
diff --git a/cudax/include/cuda/experimental/__execution/let_value.cuh b/cudax/include/cuda/experimental/__execution/let_value.cuh
index 4420fa6d271..06285b46ab3 100644
--- a/cudax/include/cuda/experimental/__execution/let_value.cuh
+++ b/cudax/include/cuda/experimental/__execution/let_value.cuh
@@ -106,7 +106,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __let_t
   {
     using __base_t = __rcvr_ref_t<__rcvr_with_env_t<_Rcvr, _Env2>>;
 
-    _CCCL_NODEBUG_API explicit constexpr __sndr2_rcvr_t(__rcvr_with_env_t<_Rcvr, _Env2>& __rcvr) noexcept
+    _CCCL_API explicit constexpr __sndr2_rcvr_t(__rcvr_with_env_t<_Rcvr, _Env2>& __rcvr) noexcept
         : __base_t(__ref_rcvr(__rcvr))
     {}
   };
@@ -341,13 +341,13 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __let_t
                                                   // extended (host/device) lambda
   {
     template <class _Sndr>
-    [[nodiscard]] _CCCL_NODEBUG_API auto operator()(_Sndr __sndr) const -> __call_result_t<_LetTag, _Sndr, _Fn>
+    [[nodiscard]] _CCCL_API auto operator()(_Sndr __sndr) const -> __call_result_t<_LetTag, _Sndr, _Fn>
     {
       return _LetTag{}(static_cast<_Sndr&&>(__sndr), __fn_);
     }
 
     template <class _Sndr>
-    [[nodiscard]] _CCCL_NODEBUG_API friend auto operator|(_Sndr __sndr, const __closure_t& __self)
+    [[nodiscard]] _CCCL_API friend auto operator|(_Sndr __sndr, const __closure_t& __self)
       -> __call_result_t<_LetTag, _Sndr, _Fn>
     {
       return _LetTag{}(static_cast<_Sndr&&>(__sndr), __self.__fn_);
@@ -365,10 +365,10 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __let_base_t : __let_t
   //! @tparam _Fn The function to be called when the predecessor sender
   //! completes.
   template <class _Sndr, class _Fn>
-  [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(_Sndr __sndr, _Fn __fn) const;
+  [[nodiscard]] _CCCL_API constexpr auto operator()(_Sndr __sndr, _Fn __fn) const;
 
   template <class _Fn>
-  [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(_Fn __fn) const noexcept;
+  [[nodiscard]] _CCCL_API constexpr auto operator()(_Fn __fn) const noexcept;
 };
 
 struct let_value_t : __let_base_t<let_value_t>
@@ -507,7 +507,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __let_t::__sndr_t
     return {__sndr_};
   }
 
-  _CCCL_NO_UNIQUE_ADDRESS _LetTag __tag_;
+  /*_CCCL_NO_UNIQUE_ADDRESS*/ _LetTag __tag_;
   _Fn __fn_;
   _Sndr __sndr_;
 };
@@ -541,7 +541,7 @@ using __all_non_dependent_t = ::cuda::std::__fold_and<(!dependent_sender<_Sndr>)
 
 template <class _LetTag>
 template <class _Sndr, class _Fn>
-[[nodiscard]] _CCCL_NODEBUG_API constexpr auto __let_base_t<_LetTag>::operator()(_Sndr __sndr, _Fn __fn) const
+[[nodiscard]] _CCCL_API constexpr auto __let_base_t<_LetTag>::operator()(_Sndr __sndr, _Fn __fn) const
 {
   using __sndr_t = typename _LetTag::template __sndr_t<_Sndr, _Fn>;
 
@@ -571,7 +571,7 @@ template <class _Sndr, class _Fn>
 
 template <class _LetTag>
 template <class _Fn>
-[[nodiscard]] _CCCL_NODEBUG_API constexpr auto __let_base_t<_LetTag>::operator()(_Fn __fn) const noexcept
+[[nodiscard]] _CCCL_API constexpr auto __let_base_t<_LetTag>::operator()(_Fn __fn) const noexcept
 {
   using __closure_t = typename _LetTag::template __closure_t<_Fn>;
   return __closure_t{{static_cast<_Fn&&>(__fn)}};
diff --git a/cudax/include/cuda/experimental/__execution/on.cuh b/cudax/include/cuda/experimental/__execution/on.cuh
index 33102d482e0..47243d0fd77 100644
--- a/cudax/include/cuda/experimental/__execution/on.cuh
+++ b/cudax/include/cuda/experimental/__execution/on.cuh
@@ -50,7 +50,8 @@ namespace cuda::experimental::execution
 //! resource where the `on` sender was started.
 //!
 //! @code
-//! auto result = on(gpu_scheduler, some_computation) | sync_wait();
+//! auto sndr = on(gpu_scheduler, some_computation);
+//! auto [result] = sync_wait(std::move(sndr)).value();
 //! @endcode
 //!
 //! ## Form 2: `on(sender, scheduler, closure)` or `sender | on(scheduler, closure)`
@@ -60,7 +61,8 @@ namespace cuda::experimental::execution
 //! execution back to where the original sender completed.
 //!
 //! @code
-//! auto result = some_computation | on(gpu_scheduler, then([](auto value) { return process(value); }));
+//! auto sndr = some_computation | on(gpu_scheduler, then([](auto value) { /*...*/ }));
+//! auto [result] = sync_wait(std::move(sndr)).value();
 //! @endcode
 //!
 //! ## Behavior
@@ -138,17 +140,17 @@ struct on_t
     }
   };
 
-  // Helper alias for the environment of the receiver used to connect the child sender
-  // in the on(sch, sndr) case.
-  template <class _Sch, class _Env>
-  using __env2_t = __join_env_t<__call_result_t<__mk_sch_env_t, _Sch, _Env>, _Env>;
-
   template <class _Sch, class _Env>
   [[nodiscard]] _CCCL_API static constexpr auto __mk_env2(_Sch __sch, _Env&& __env)
   {
     return __join_env(__mk_sch_env(__sch, __env), static_cast<_Env&&>(__env));
   }
 
+  // Helper alias for the environment of the receiver used to connect the child sender
+  // in the on(sch, sndr) case.
+  template <class _Sch, class _Env>
+  using __env2_t = decltype(__mk_env2(declval<_Sch>(), declval<_Env>()));
+
 public:
   template <class _Sch, class _Sndr, class... _Closure>
   struct _CCCL_TYPE_VISIBILITY_DEFAULT __sndr_t;
@@ -157,19 +159,19 @@ public:
   struct _CCCL_TYPE_VISIBILITY_DEFAULT __closure_t
   {
     template <class _Sndr>
-    [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(_Sndr __sndr) &&
+    [[nodiscard]] _CCCL_API constexpr auto operator()(_Sndr __sndr) &&
     {
       return on_t{}(static_cast<_Sndr&&>(__sndr), __sch_, static_cast<_Closure&&>(__closure_));
     }
 
     template <class _Sndr>
-    [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(_Sndr __sndr) const&
+    [[nodiscard]] _CCCL_API constexpr auto operator()(_Sndr __sndr) const&
     {
       return on_t{}(static_cast<_Sndr&&>(__sndr), __sch_, __closure_);
     }
 
     template <class _Sndr>
-    [[nodiscard]] _CCCL_NODEBUG_API friend constexpr auto operator|(_Sndr __sndr, __closure_t __self)
+    [[nodiscard]] _CCCL_API friend constexpr auto operator|(_Sndr __sndr, __closure_t __self)
     {
       return on_t{}(static_cast<_Sndr&&>(__sndr), __self.__sch_, static_cast<_Closure&&>(__self.__closure_));
     }
@@ -180,20 +182,20 @@ public:
 
   _CCCL_TEMPLATE(class _Sch, class _Sndr)
   _CCCL_REQUIRES(__is_sender<_Sndr>)
-  _CCCL_NODEBUG_API constexpr auto operator()(_Sch __sch, _Sndr __sndr) const
+  _CCCL_API constexpr auto operator()(_Sch __sch, _Sndr __sndr) const
   {
     return __sndr_t<_Sch, _Sndr>{{}, __sch, __sndr};
   }
 
   _CCCL_TEMPLATE(class _Sch, class _Closure)
   _CCCL_REQUIRES((!__is_sender<_Closure>) )
-  _CCCL_NODEBUG_API constexpr auto operator()(_Sch __sch, _Closure __closure) const
+  _CCCL_API constexpr auto operator()(_Sch __sch, _Closure __closure) const
   {
     return __closure_t<_Sch, _Closure>{__sch, static_cast<_Closure&&>(__closure)};
   }
 
   template <class _Sndr, class _Sch, class _Closure>
-  [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(_Sndr __sndr, _Sch __sch, _Closure __closure) const
+  [[nodiscard]] _CCCL_API constexpr auto operator()(_Sndr __sndr, _Sch __sch, _Closure __closure) const
   {
     using __sndr_t = on_t::__sndr_t<_Sch, _Sndr, _Closure>;
     return __sndr_t{{}, {__sch, static_cast<_Closure&&>(__closure)}, static_cast<_Sndr&&>(__sndr)};
@@ -214,7 +216,7 @@ public:
   }
 
   template <class _Sndr, class _Env>
-  [[nodiscard]] _CCCL_API static constexpr auto transform_sender(start_t, _Sndr&& __sndr, const _Env& __env)
+  [[nodiscard]] _CCCL_API static constexpr auto transform_sender(set_value_t, _Sndr&& __sndr, const _Env& __env)
   {
     auto&& [__ign, __data, __child] = __sndr;
     if constexpr (__is_scheduler<decltype(__data)>)
@@ -222,6 +224,7 @@ public:
       // The on(sch, sndr) case:
       auto __old_sch = __call_or(get_scheduler, __not_a_scheduler{}, __env);
       using __sndr_t = __lowered_sndr_t<decltype(__child), decltype(__data), decltype(__old_sch)>;
+      static_assert(sender_for<__sndr_t, continues_on_t>);
       return __sndr_t{::cuda::std::forward_like<_Sndr>(__child), __data, __old_sch};
     }
     else
@@ -263,7 +266,7 @@ private:
     {
       // When it completes successfully, the on(sch, sndr) sender completes where it
       // starts.
-      return get_completion_scheduler<set_value_t>(__inln_attrs_t<set_value_t>{}, __env);
+      return get_scheduler(__env);
     }
     else
     {
@@ -312,7 +315,7 @@ private:
     {
       // When it completes successfully, the on(sch, sndr) sender completes where it
       // starts.
-      return __call_or(execution::get_completion_domain<_SetTag>, default_domain{}, execution::get_scheduler(__env));
+      return __call_or(execution::get_domain, default_domain{}, __env);
     }
     else
     {
@@ -369,7 +372,7 @@ private:
 
     template <class _SetTag, class _Env>
     [[nodiscard]] _CCCL_API constexpr auto query(get_completion_domain_t<_SetTag>, _Env&&) const noexcept
-      -> __completion_domain_for_t<get_completion_domain_t<_SetTag>, _Env>
+      -> __completion_domain_for_t<_SetTag, _Env>
     {
       return {};
     }
@@ -408,12 +411,12 @@ private:
 public:
   using sender_concept = sender_t;
 
-  _CCCL_NODEBUG_API constexpr auto get_env() const noexcept -> __attrs_t
+  _CCCL_API constexpr auto get_env() const noexcept -> __attrs_t
   {
     return {__sndr_};
   }
 
-  _CCCL_NO_UNIQUE_ADDRESS on_t __tag_;
+  /*_CCCL_NO_UNIQUE_ADDRESS*/ on_t __tag_;
   _Sch __sch_;
   _Sndr __sndr_;
 };
@@ -424,12 +427,12 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT on_t::__sndr_t<_Sch, _Sndr, _Closure>
 {
   using sender_concept = sender_t;
 
-  _CCCL_NODEBUG_API constexpr auto get_env() const noexcept -> __fwd_env_t<env_of_t<_Sndr>>
+  _CCCL_API constexpr auto get_env() const noexcept -> __fwd_env_t<env_of_t<_Sndr>>
   {
     return __fwd_env(execution::get_env(__sndr_));
   }
 
-  _CCCL_NO_UNIQUE_ADDRESS on_t __tag_;
+  /*_CCCL_NO_UNIQUE_ADDRESS*/ on_t __tag_;
   __closure_t<_Sch, _Closure> __sch_closure_;
   _Sndr __sndr_;
 };
@@ -442,8 +445,8 @@ inline constexpr size_t structured_binding_size<on_t::__sndr_t<_Sch, _Sndr>> = 3
 template <class _Sch, class _Sndr, class _Closure>
 inline constexpr size_t structured_binding_size<on_t::__sndr_t<_Sch, _Sndr, _Closure>> = 3;
 
-template <class _Sndr, class _NewSch, class _Env, class... _Closure>
-inline constexpr size_t structured_binding_size<on_t::__lowered_sndr_t<_Sndr, _NewSch, _Env, _Closure...>> = 3;
+template <class _Sndr, class _NewSch, class _OldSch, class... _Closure>
+inline constexpr size_t structured_binding_size<on_t::__lowered_sndr_t<_Sndr, _NewSch, _OldSch, _Closure...>> = 3;
 
 } // namespace cuda::experimental::execution
 
diff --git a/cudax/include/cuda/experimental/__execution/prologue.cuh b/cudax/include/cuda/experimental/__execution/prologue.cuh
index 437f0d26a43..482926303e9 100644
--- a/cudax/include/cuda/experimental/__execution/prologue.cuh
+++ b/cudax/include/cuda/experimental/__execution/prologue.cuh
@@ -24,6 +24,7 @@ _CCCL_DIAG_PUSH
 _CCCL_DIAG_SUPPRESS_GCC("-Wsubobject-linkage")
 _CCCL_DIAG_SUPPRESS_CLANG("-Wunused-value")
 _CCCL_DIAG_SUPPRESS_MSVC(4848) // [[no_unique_address]] prior to C++20 as a vendor extension
+_CCCL_DIAG_SUPPRESS_MSVC(4714) // function 'foo' marked as __forceinline not inlined
 
 _CCCL_DIAG_SUPPRESS_GCC("-Wmissing-braces")
 _CCCL_DIAG_SUPPRESS_CLANG("-Wmissing-braces")
diff --git a/cudax/include/cuda/experimental/__execution/queries.cuh b/cudax/include/cuda/experimental/__execution/queries.cuh
index dba8db1a767..747ecfb8b85 100644
--- a/cudax/include/cuda/experimental/__execution/queries.cuh
+++ b/cudax/include/cuda/experimental/__execution/queries.cuh
@@ -59,7 +59,7 @@ _CCCL_GLOBAL_CONSTANT struct get_allocator_t
     return __query_or(__env, *this, ::cuda::std::allocator<void>{});
   }
 
-  [[nodiscard]] _CCCL_NODEBUG_API static constexpr auto query(forwarding_query_t) noexcept -> bool
+  [[nodiscard]] _CCCL_API static constexpr auto query(forwarding_query_t) noexcept -> bool
   {
     return true;
   }
@@ -78,7 +78,7 @@ _CCCL_GLOBAL_CONSTANT struct get_stop_token_t
     return __query_or(__env, *this, never_stop_token{});
   }
 
-  [[nodiscard]] _CCCL_NODEBUG_API static constexpr auto query(forwarding_query_t) noexcept -> bool
+  [[nodiscard]] _CCCL_API static constexpr auto query(forwarding_query_t) noexcept -> bool
   {
     return true;
   }
@@ -99,33 +99,12 @@ _CCCL_GLOBAL_CONSTANT struct get_scheduler_t
     return __env.query(*this);
   }
 
-  [[nodiscard]] _CCCL_NODEBUG_API static constexpr auto query(forwarding_query_t) noexcept -> bool
+  [[nodiscard]] _CCCL_API static constexpr auto query(forwarding_query_t) noexcept -> bool
   {
     return true;
   }
 } get_scheduler{};
 
-//////////////////////////////////////////////////////////////////////////////////////////
-// get_previous_scheduler
-_CCCL_GLOBAL_CONSTANT struct get_previous_scheduler_t
-{
-  _CCCL_EXEC_CHECK_DISABLE
-  _CCCL_TEMPLATE(class _Env)
-  _CCCL_REQUIRES(__queryable_with<_Env, get_previous_scheduler_t>)
-  [[nodiscard]] _CCCL_API constexpr auto operator()(const _Env& __env) const noexcept
-    -> __query_result_t<_Env, get_previous_scheduler_t>
-  {
-    static_assert(noexcept(__env.query(*this)));
-    static_assert(__is_scheduler<__query_result_t<_Env, get_previous_scheduler_t>>);
-    return __env.query(*this);
-  }
-
-  [[nodiscard]] _CCCL_TRIVIAL_API static constexpr auto query(forwarding_query_t) noexcept -> bool
-  {
-    return true;
-  }
-} get_previous_scheduler{};
-
 //////////////////////////////////////////////////////////////////////////////////////////
 // get_delegation_scheduler
 _CCCL_GLOBAL_CONSTANT struct get_delegation_scheduler_t
@@ -141,7 +120,7 @@ _CCCL_GLOBAL_CONSTANT struct get_delegation_scheduler_t
     return __env.query(*this);
   }
 
-  [[nodiscard]] _CCCL_NODEBUG_API static constexpr auto query(forwarding_query_t) noexcept -> bool
+  [[nodiscard]] _CCCL_API static constexpr auto query(forwarding_query_t) noexcept -> bool
   {
     return true;
   }
@@ -221,11 +200,14 @@ private:
           return _Self{}(__read_query_t{}(__sch, __env...), __env...);
         }
       }
-      else if constexpr (__callable<__read_query_t, env_of_t<schedule_result_t<_Sch>>, const _Env&...>)
+      else
       {
-        _CCCL_ASSERT(__sch == __read_query_t{}(get_env(__sch.schedule()), __env...),
-                     "the scheduler's sender must have a completion scheduler attribute equal to the scheduler that "
-                     "provided it.");
+        if constexpr (__callable<__read_query_t, env_of_t<schedule_result_t<_Sch>>, const _Env&...>)
+        {
+          _CCCL_ASSERT(__sch == __read_query_t{}(get_env(__sch.schedule()), __env...),
+                       "the scheduler's sender must have a completion scheduler attribute equal to the scheduler that "
+                       "provided it.");
+        }
         return __sch;
       }
     }
@@ -286,7 +268,7 @@ public:
     }
   }
 
-  [[nodiscard]] _CCCL_NODEBUG_API static constexpr auto query(forwarding_query_t) noexcept -> bool
+  [[nodiscard]] _CCCL_API static constexpr auto query(forwarding_query_t) noexcept -> bool
   {
     return true;
   }
@@ -317,7 +299,7 @@ _CCCL_GLOBAL_CONSTANT struct get_forward_progress_guarantee_t
     return __query_or(__sch, *this, forward_progress_guarantee::weakly_parallel);
   }
 
-  [[nodiscard]] _CCCL_NODEBUG_API static constexpr auto query(forwarding_query_t) noexcept -> bool
+  [[nodiscard]] _CCCL_API static constexpr auto query(forwarding_query_t) noexcept -> bool
   {
     return true;
   }
@@ -335,21 +317,23 @@ struct __single_threaded_config_t : __single_threaded_config_base_t
   {}
 };
 
+_CCCL_GLOBAL_CONSTANT __single_threaded_config_t __single_threaded_config{};
+
 //////////////////////////////////////////////////////////////////////////////////////////
 // get_launch_config: A sender can define this attribute to control the launch configuration
 // of the kernel it will launch when executed on a CUDA stream scheduler.
 _CCCL_GLOBAL_CONSTANT struct get_launch_config_t
 {
   template <class _Env>
-  [[nodiscard]] _CCCL_HOST_API constexpr auto operator()(const _Env& __env) const noexcept
+  [[nodiscard]] _CCCL_API constexpr auto operator()(const _Env& __env) const noexcept
     -> __query_result_or_t<_Env, get_launch_config_t, __single_threaded_config_t>
   {
     static_assert(__nothrow_queryable_with_or<_Env, get_launch_config_t, true>,
                   "The get_launch_config query must be noexcept.");
-    return __query_or(__env, *this, __single_threaded_config_t{});
+    return __query_or(__env, *this, __single_threaded_config);
   }
 
-  [[nodiscard]] _CCCL_NODEBUG_API static constexpr auto query(forwarding_query_t) noexcept -> bool
+  [[nodiscard]] _CCCL_API static constexpr auto query(forwarding_query_t) noexcept -> bool
   {
     return true;
   }
diff --git a/cudax/include/cuda/experimental/__execution/rcvr_ref.cuh b/cudax/include/cuda/experimental/__execution/rcvr_ref.cuh
index f31a0620e51..940e2130859 100644
--- a/cudax/include/cuda/experimental/__execution/rcvr_ref.cuh
+++ b/cudax/include/cuda/experimental/__execution/rcvr_ref.cuh
@@ -45,23 +45,23 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __rcvr_ref
   {}
 
   template <class... _As>
-  _CCCL_NODEBUG_API constexpr void set_value(_As&&... __as) noexcept
+  _CCCL_API constexpr void set_value(_As&&... __as) noexcept
   {
     execution::set_value(static_cast<_Rcvr&&>(*__rcvr_), static_cast<_As&&>(__as)...);
   }
 
   template <class _Error>
-  _CCCL_NODEBUG_API constexpr void set_error(_Error&& __err) noexcept
+  _CCCL_API constexpr void set_error(_Error&& __err) noexcept
   {
     execution::set_error(static_cast<_Rcvr&&>(*__rcvr_), static_cast<_Error&&>(__err));
   }
 
-  _CCCL_NODEBUG_API constexpr void set_stopped() noexcept
+  _CCCL_API constexpr void set_stopped() noexcept
   {
     execution::set_stopped(static_cast<_Rcvr&&>(*__rcvr_));
   }
 
-  [[nodiscard]] _CCCL_NODEBUG_API constexpr auto get_env() const noexcept -> env_of_t<_Rcvr>
+  [[nodiscard]] _CCCL_API constexpr auto get_env() const noexcept -> env_of_t<_Rcvr>
   {
     return execution::get_env(*__rcvr_);
   }
@@ -77,7 +77,7 @@ private:
 // 2. If the receiver is nothrow copy constructible, return it.
 // 3. Otherwise, return a __rcvr_ref wrapping the receiver.
 template <class _Rcvr>
-[[nodiscard]] _CCCL_NODEBUG_API constexpr auto __ref_rcvr(_Rcvr& __rcvr) noexcept
+[[nodiscard]] _CCCL_API constexpr auto __ref_rcvr(_Rcvr& __rcvr) noexcept
 {
   if constexpr (__is_specialization_of_v<_Rcvr, __rcvr_ref>)
   {
diff --git a/cudax/include/cuda/experimental/__execution/rcvr_with_env.cuh b/cudax/include/cuda/experimental/__execution/rcvr_with_env.cuh
index fb7a22191a6..6c80e6aa185 100644
--- a/cudax/include/cuda/experimental/__execution/rcvr_with_env.cuh
+++ b/cudax/include/cuda/experimental/__execution/rcvr_with_env.cuh
@@ -41,7 +41,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __rcvr_with_env_t : _Rcvr
     _CCCL_EXEC_CHECK_DISABLE
     _CCCL_TEMPLATE(class _Query, class... _Args)
     _CCCL_REQUIRES(__queryable_with<_Env, _Query, _Args...>)
-    [[nodiscard]] _CCCL_NODEBUG_API constexpr auto query(_Query, _Args&&... __args) const
+    [[nodiscard]] _CCCL_API constexpr auto query(_Query, _Args&&... __args) const
       noexcept(__nothrow_queryable_with<_Env, _Query, _Args...>) -> __query_result_t<_Env, _Query, _Args...>
     {
       return __rcvr_->__env_.query(_Query{}, static_cast<_Args&&>(__args)...);
@@ -53,7 +53,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __rcvr_with_env_t : _Rcvr
     _CCCL_TEMPLATE(class _Query, class... _Args)
     _CCCL_REQUIRES((!__queryable_with<_Env, _Query, _Args...>)
                      _CCCL_AND __forwarding_query<_Query> _CCCL_AND __queryable_with<env_of_t<_Rcvr>, _Query, _Args...>)
-    [[nodiscard]] _CCCL_NODEBUG_API constexpr auto query(_Query, _Args&&... __args) const
+    [[nodiscard]] _CCCL_API constexpr auto query(_Query, _Args&&... __args) const
       noexcept(__nothrow_queryable_with<env_of_t<_Rcvr>, _Query, _Args...>)
         -> __query_result_t<env_of_t<_Rcvr>, _Query, _Args...>
     {
@@ -67,22 +67,22 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __rcvr_with_env_t : _Rcvr
     __rcvr_with_env_t const* __rcvr_;
   };
 
-  [[nodiscard]] _CCCL_NODEBUG_API auto __base() && noexcept -> _Rcvr&&
+  [[nodiscard]] _CCCL_API auto __base() && noexcept -> _Rcvr&&
   {
     return static_cast<_Rcvr&&>(*this);
   }
 
-  [[nodiscard]] _CCCL_NODEBUG_API auto __base() & noexcept -> _Rcvr&
+  [[nodiscard]] _CCCL_API auto __base() & noexcept -> _Rcvr&
   {
     return *this;
   }
 
-  [[nodiscard]] _CCCL_NODEBUG_API auto __base() const& noexcept -> _Rcvr const&
+  [[nodiscard]] _CCCL_API auto __base() const& noexcept -> _Rcvr const&
   {
     return *this;
   }
 
-  [[nodiscard]] _CCCL_NODEBUG_API constexpr auto get_env() const noexcept -> __env_t
+  [[nodiscard]] _CCCL_API constexpr auto get_env() const noexcept -> __env_t
   {
     return __env_t{this};
   }
diff --git a/cudax/include/cuda/experimental/__execution/read_env.cuh b/cudax/include/cuda/experimental/__execution/read_env.cuh
index 8f9a1151ecd..6d6c39df4e2 100644
--- a/cudax/include/cuda/experimental/__execution/read_env.cuh
+++ b/cudax/include/cuda/experimental/__execution/read_env.cuh
@@ -102,7 +102,7 @@ public:
   /// invokes the query with the receiver's environment and forwards the result
   /// to the receiver's `set_value` member.
   template <class _Query>
-  _CCCL_NODEBUG_API constexpr __sndr_t<_Query> operator()(_Query) const noexcept;
+  _CCCL_API constexpr __sndr_t<_Query> operator()(_Query) const noexcept;
 };
 
 template <class _Query>
@@ -147,12 +147,12 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT read_env_t::__sndr_t
     return __attrs_t{};
   }
 
-  _CCCL_NO_UNIQUE_ADDRESS read_env_t __tag;
-  _CCCL_NO_UNIQUE_ADDRESS _Query __query;
+  /*_CCCL_NO_UNIQUE_ADDRESS*/ read_env_t __tag;
+  /*_CCCL_NO_UNIQUE_ADDRESS*/ _Query __query;
 };
 
 template <class _Query>
-_CCCL_NODEBUG_API constexpr read_env_t::__sndr_t<_Query> read_env_t::operator()(_Query __query) const noexcept
+_CCCL_API constexpr read_env_t::__sndr_t<_Query> read_env_t::operator()(_Query __query) const noexcept
 {
   return __sndr_t<_Query>{{}, __query};
 }
diff --git a/cudax/include/cuda/experimental/__execution/run_loop.cuh b/cudax/include/cuda/experimental/__execution/run_loop.cuh
index e640dcfdc51..8e02dcc76e0 100644
--- a/cudax/include/cuda/experimental/__execution/run_loop.cuh
+++ b/cudax/include/cuda/experimental/__execution/run_loop.cuh
@@ -70,7 +70,7 @@ public:
     using __execute_fn_t _CCCL_NODEBUG_ALIAS = void(__task*) noexcept;
 
     _CCCL_HIDE_FROM_ABI __task() = default;
-    _CCCL_NODEBUG_API explicit __task(__execute_fn_t* __execute_fn) noexcept
+    _CCCL_API explicit __task(__execute_fn_t* __execute_fn) noexcept
         : __execute_fn_(__execute_fn)
     {}
 
@@ -250,14 +250,24 @@ public:
   }
 
 private:
-  _CCCL_NO_UNIQUE_ADDRESS _Env __env_;
+  /*_CCCL_NO_UNIQUE_ADDRESS*/ _Env __env_;
 };
 
+// A run_loop with an empty environment. This is a struct instead of a type alias to give
+// it a simpler type name that is easier to read in diagnostics.
 struct _CCCL_TYPE_VISIBILITY_DEFAULT run_loop : basic_run_loop<env<>>
 {
+  struct _CCCL_TYPE_VISIBILITY_DEFAULT scheduler : basic_run_loop::scheduler
+  {};
+
   _CCCL_API constexpr run_loop() noexcept
       : basic_run_loop<env<>>{env{}}
   {}
+
+  [[nodiscard]] _CCCL_API constexpr auto get_scheduler() noexcept -> scheduler
+  {
+    return scheduler{basic_run_loop::get_scheduler()};
+  }
 };
 
 template <class _Env>
diff --git a/cudax/include/cuda/experimental/__execution/sequence.cuh b/cudax/include/cuda/experimental/__execution/sequence.cuh
index a078fc0e09b..f8b5952f747 100644
--- a/cudax/include/cuda/experimental/__execution/sequence.cuh
+++ b/cudax/include/cuda/experimental/__execution/sequence.cuh
@@ -96,36 +96,28 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT sequence_t
 {
   _CUDAX_SEMI_PRIVATE :
   template <class _Attrs, class... _Env>
-  using __env2_t = __detail::__seq_env_next_t<_Attrs, _Env...>;
+  using __env2_t = __join_env_t<__detail::__seq_env_next_t<_Attrs, __fwd_env_t<_Env>...>, __fwd_env_t<_Env>...>;
 
-  template <class _Rcvr, class _Env2>
-  struct _CCCL_TYPE_VISIBILITY_DEFAULT __state_base_t
+  template <class _Attrs, class... _Env>
+  [[nodiscard]] _CCCL_API static constexpr auto __mk_env2(const _Attrs& __attrs, const _Env&... __env) noexcept
+    -> __env2_t<_Attrs, _Env...>
   {
-    __rcvr_with_env_t<_Rcvr, _Env2> __rcvr2_;
-    void (*__start2_fn_)(__state_base_t*) noexcept;
-  };
+    return __join_env(__detail::__mk_seq_env_next(__attrs, __fwd_env(__env)...), __fwd_env(__env)...);
+  }
 
   template <class _Rcvr, class _Env2, class _Sndr2>
-  struct _CCCL_TYPE_VISIBILITY_DEFAULT __state_t : __state_base_t<_Rcvr, _Env2>
+  struct _CCCL_TYPE_VISIBILITY_DEFAULT __state_t
   {
-    // TODO: __rcvr2_'s env should be wrapped in __fwd_env_t before it is used to
-    // connect __sndr2.
     _CCCL_API constexpr explicit __state_t(_Rcvr&& __rcvr, _Env2 __env, _Sndr2&& __sndr2)
-        : __state_base_t<_Rcvr, _Env2>{{static_cast<_Rcvr&&>(__rcvr), __env}, &__start2_fn}
-        , __opstate2_(execution::connect(static_cast<_Sndr2&&>(__sndr2), __ref_rcvr(this->__rcvr2_)))
+        : __rcvr2_{static_cast<_Rcvr&&>(__rcvr), __env}
+        , __opstate2_(execution::connect(static_cast<_Sndr2&&>(__sndr2), __ref_rcvr(__rcvr2_)))
     {}
 
-    _CCCL_API static constexpr void __start2_fn(__state_base_t<_Rcvr, _Env2>* __base) noexcept
-    {
-      auto* __state = static_cast<__state_t*>(__base);
-      execution::start(__state->__opstate2_);
-    }
-
-  private:
+    __rcvr_with_env_t<_Rcvr, _Env2> __rcvr2_;
     connect_result_t<_Sndr2, __rcvr_ref_t<__rcvr_with_env_t<_Rcvr, _Env2>>> __opstate2_;
   };
 
-  template <class _Rcvr, class _Env2>
+  template <class _Rcvr, class _Env2, class _Sndr2>
   struct _CCCL_TYPE_VISIBILITY_DEFAULT __rcvr_t
   {
     using receiver_concept = receiver_t;
@@ -133,7 +125,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT sequence_t
     template <class... _Values>
     _CCCL_API constexpr void set_value(_Values&&...) noexcept
     {
-      __state_->__start2_fn_(__state_);
+      execution::start(__state_->__opstate2_);
     }
 
     template <class _Error>
@@ -152,19 +144,19 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT sequence_t
       return __fwd_env(execution::get_env(__state_->__rcvr2_.__base()));
     }
 
-    __state_base_t<_Rcvr, _Env2>* __state_;
+    __state_t<_Rcvr, _Env2, _Sndr2>* __state_;
   };
 
   template <class _Rcvr, class _Sndr1, class _Sndr2>
   struct _CCCL_TYPE_VISIBILITY_DEFAULT __opstate_t
   {
     using operation_state_concept      = operation_state_t;
-    using __env2_t _CCCL_NODEBUG_ALIAS = sequence_t::__env2_t<env_of_t<_Sndr1>, env_of_t<_Rcvr>>;
+    using __env2_t _CCCL_NODEBUG_ALIAS = __detail::__seq_env_next_t<env_of_t<_Sndr1>, env_of_t<_Rcvr>>;
 
     // The moves from lvalues here is intentional:
     _CCCL_API constexpr __opstate_t(_Sndr1& __sndr1, _Sndr2& __sndr2, _Rcvr& __rcvr, __env2_t __env2)
         : __state_(static_cast<_Rcvr&&>(__rcvr), static_cast<__env2_t&&>(__env2), static_cast<_Sndr2&&>(__sndr2))
-        , __opstate1_(execution::connect(static_cast<_Sndr1&&>(__sndr1), __rcvr_t<_Rcvr, __env2_t>{&__state_}))
+        , __opstate1_(execution::connect(static_cast<_Sndr1&&>(__sndr1), __rcvr_t<_Rcvr, __env2_t, _Sndr2>{&__state_}))
     {}
 
     _CCCL_API constexpr __opstate_t(_Sndr1&& __sndr1, _Sndr2&& __sndr2, _Rcvr&& __rcvr)
@@ -182,76 +174,30 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT sequence_t
 
   private:
     __state_t<_Rcvr, __env2_t, _Sndr2> __state_;
-    connect_result_t<_Sndr1, __rcvr_t<_Rcvr, __env2_t>> __opstate1_;
+    connect_result_t<_Sndr1, __rcvr_t<_Rcvr, __env2_t, _Sndr2>> __opstate1_;
   };
 
-  template <class _SetTag, class _Attrs1, class _Sndr2, class... _Env>
-  [[nodiscard]] _CCCL_API constexpr auto
-  __get_completion_scheduler(const _Attrs1& __attrs1, const _Sndr2& __sndr2, _Env&&... __env) noexcept
-  {
-    using __env2_t _CCCL_NODEBUG_ALIAS = sequence_t::__env2_t<_Attrs1, _Env...>;
-
-    if constexpr (__callable<get_completion_scheduler_t<_SetTag>, env_of_t<_Sndr2>, __join_env_t<__env2_t, _Env...>>)
-    {
-      // If the second sender has a completion scheduler for the given tag, use it.
-      auto __env2 = __detail::__mk_seq_env_next(__attrs1, __env...);
-      return get_completion_scheduler<_SetTag>(
-        get_env(__sndr2), __join_env(static_cast<__env2_t&&>(__env2), static_cast<_Env&&>(__env)...));
-    }
-    else if constexpr (get_completion_signatures<_Sndr2, __join_env_t<__env2_t, _Env...>>().count(_SetTag{}) == 0)
-    {
-      // If the second sender does not have any _SetTag completions, use the first sender's
-      // completion scheduler, if it has one:
-      if constexpr (__callable<get_completion_scheduler_t<_SetTag>, _Attrs1, __fwd_env_t<_Env>...>)
-      {
-        return get_completion_scheduler<_SetTag>(__attrs1, __fwd_env(static_cast<_Env&&>(__env))...);
-      }
-    }
-  }
-
-  template <class _SetTag, class _Attrs1, class _Sndr2, class... _Env>
-  [[nodiscard]] _CCCL_API constexpr auto
-  __get_completion_domain(const _Attrs1& __attrs1, const _Sndr2& __sndr2, _Env&&... __env) noexcept
-  {
-    using __env2_t _CCCL_NODEBUG_ALIAS = sequence_t::__env2_t<_Attrs1, _Env...>;
-
-    if constexpr (__callable<get_completion_domain_t<_SetTag>, env_of_t<_Sndr2>, __join_env_t<__env2_t, _Env...>>)
-    {
-      // If the second sender has a completion domain for the given tag, use it.
-      return __call_result_t<get_completion_domain_t<_SetTag>, env_of_t<_Sndr2>, __join_env_t<__env2_t, _Env...>>{};
-    }
-    else if constexpr (get_completion_signatures<_Sndr2, __join_env_t<__env2_t, _Env...>>().count(_SetTag{}) == 0)
-    {
-      // If the second sender does not have any _SetTag completions, use the first
-      // sender's completion domain, if it has one:
-      if constexpr (__callable<get_completion_domain_t<_SetTag>, _Attrs1, __fwd_env_t<_Env>...>)
-      {
-        return __call_result_t<get_completion_domain_t<_SetTag>, _Attrs1, __fwd_env_t<_Env>...>{};
-      }
-    }
-  }
-
 public:
   template <class _Sndr1, class _Sndr2>
   struct _CCCL_TYPE_VISIBILITY_DEFAULT __sndr_t;
 
   template <class _Sndr1, class _Sndr2>
-  _CCCL_NODEBUG_API constexpr auto operator()(_Sndr1 __sndr1, _Sndr2 __sndr2) const;
+  _CCCL_API constexpr auto operator()(_Sndr1 __sndr1, _Sndr2 __sndr2) const;
 };
 
 template <class _Sndr1, class _Sndr2>
 struct _CCCL_TYPE_VISIBILITY_DEFAULT sequence_t::__sndr_t
 {
-  using sender_concept               = sender_t;
-  using __env2_t _CCCL_NODEBUG_ALIAS = sequence_t::__env2_t<env_of_t<_Sndr1>>;
+  using sender_concept = sender_t;
+  template <class... _Env>
+  using __env2_t _CCCL_NODEBUG_ALIAS = sequence_t::__env2_t<env_of_t<_Sndr1>, _Env...>;
 
   template <class _Self, class... _Env>
   [[nodiscard]] _CCCL_API static _CCCL_CONSTEVAL auto get_completion_signatures()
   {
     _CUDAX_LET_COMPLETIONS(auto(__completions1) = get_child_completion_signatures<_Self, _Sndr1, _Env...>())
     {
-      _CUDAX_LET_COMPLETIONS(
-        auto(__completions2) = get_child_completion_signatures<_Self, _Sndr2, __join_env_t<__env2_t, _Env...>>())
+      _CUDAX_LET_COMPLETIONS(auto(__completions2) = get_child_completion_signatures<_Self, _Sndr2, __env2_t<_Env...>>())
       {
         // __swallow_transform to ignore the first sender's value completions
         return __completions2 + transform_completion_signatures(__completions1, __swallow_transform{});
@@ -279,45 +225,40 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT sequence_t::__sndr_t
 
   struct __attrs_t
   {
-    // If the second sender does not have any _SetTag completions, we can look at the
-    // first sender for a completion scheduler.
-    _CCCL_TEMPLATE(class _SetTag, class... _Env, class _Env2 = sequence_t::__env2_t<env_of_t<_Sndr1>, _Env...>)
-    _CCCL_REQUIRES((!__has_completions_for<_SetTag, _Sndr2, __join_env_t<_Env2, _Env...>>) )
-    [[nodiscard]] _CCCL_API constexpr auto query(get_completion_scheduler_t<_SetTag>, _Env&&... __env) const noexcept
-      -> __call_result_t<get_completion_scheduler_t<_SetTag>, env_of_t<_Sndr1>, __fwd_env_t<_Env>...>
-    {
-      return get_completion_scheduler<_SetTag>(
-        execution::get_env(__self_->__sndr1_), __fwd_env(static_cast<_Env&&>(__env))...);
-    }
-
-    // If the second sender does not have any _SetTag completions, we can look at the
-    // first sender for a completion domain.
-    _CCCL_TEMPLATE(class _SetTag, class... _Env, class _Env2 = sequence_t::__env2_t<env_of_t<_Sndr1>, _Env...>)
-    _CCCL_REQUIRES((!__has_completions_for<_SetTag, _Sndr2, __join_env_t<_Env2, _Env...>>) )
-    [[nodiscard]] _CCCL_API constexpr auto query(get_completion_domain_t<_SetTag>, _Env&&... __env) const noexcept
-      -> decay_t<__call_result_t<get_completion_domain_t<_SetTag>, env_of_t<_Sndr1>, __fwd_env_t<_Env>...>>
-    {
-      return {};
-    }
+    // If _Sndr2 has _SetTag completions but does not know its _SetTag completion scheduler,
+    // then we cannot know it either. Delete the function to prevent its use.
+    _CCCL_TEMPLATE(class _SetTag, class... _Env)
+    _CCCL_REQUIRES(__has_completions_for<_SetTag, _Sndr2, __env2_t<_Env...>> _CCCL_AND(
+      !__callable<get_completion_scheduler_t<_SetTag>, env_of_t<_Sndr2>, __env2_t<_Env...>>))
+    _CCCL_API auto query(get_completion_scheduler_t<_SetTag>, const _Env&...) const = delete;
+
+    // If _Sndr2 has _SetTag completions but does not know its _SetTag completion domain,
+    // then we cannot know it either. Delete the function to prevent its use.
+    _CCCL_TEMPLATE(class _SetTag, class... _Env)
+    _CCCL_REQUIRES(__has_completions_for<_SetTag, _Sndr2, __env2_t<_Env...>> _CCCL_AND(
+      !__callable<get_completion_domain_t<_SetTag>, env_of_t<_Sndr2>, __env2_t<_Env...>>))
+    _CCCL_API auto query(get_completion_domain_t<_SetTag>, const _Env&...) const = delete;
 
     template <class... _Env>
     [[nodiscard]] _CCCL_API constexpr auto query(get_completion_behavior_t, const _Env&...) const noexcept
     {
-      using __env2_t _CCCL_NODEBUG_ALIAS = sequence_t::__env2_t<env_of_t<_Sndr1>, _Env...>;
       return (execution::min) (execution::get_completion_behavior<_Sndr1, __fwd_env_t<_Env>...>(),
-                               execution::get_completion_behavior<_Sndr2, __join_env_t<__env2_t, _Env...>>());
+                               execution::get_completion_behavior<_Sndr2, __env2_t<_Env...>>());
     }
 
+    using __child_attrs_t = __join_env_t<env_of_t<_Sndr2>, env_of_t<_Sndr1>>;
+
     // The following overload will not be considered when _Query is get_domain_override_t
     // because get_domain_override_t is not a forwarding query.
     _CCCL_EXEC_CHECK_DISABLE
     _CCCL_TEMPLATE(class _Query, class... _Args)
-    _CCCL_REQUIRES(__forwarding_query<_Query> _CCCL_AND __queryable_with<env_of_t<_Sndr2>, _Query, _Args...>)
+    _CCCL_REQUIRES(__forwarding_query<_Query> _CCCL_AND __queryable_with<__child_attrs_t, _Query, _Args...>)
     [[nodiscard]] _CCCL_API constexpr auto query(_Query, _Args&&... __args) const
-      noexcept(__nothrow_queryable_with<env_of_t<_Sndr2>, _Query, _Args...>)
-        -> __query_result_t<env_of_t<_Sndr2>, _Query, _Args...>
+      noexcept(__nothrow_queryable_with<__child_attrs_t, _Query, _Args...>)
+        -> __query_result_t<__child_attrs_t, _Query, _Args...>
     {
-      return execution::get_env(__self_->__sndr2_).query(_Query{}, static_cast<_Args&&>(__args)...);
+      auto&& __env = __join_env(execution::get_env(__self_->__sndr2_), execution::get_env(__self_->__sndr1_));
+      return __env.query(_Query{}, static_cast<_Args&&>(__args)...);
     }
 
     __sndr_t const* __self_;
@@ -328,14 +269,14 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT sequence_t::__sndr_t
     return {this};
   }
 
-  _CCCL_NO_UNIQUE_ADDRESS sequence_t __tag_;
-  _CCCL_NO_UNIQUE_ADDRESS ::cuda::std::__ignore_t __ign_;
+  /*_CCCL_NO_UNIQUE_ADDRESS*/ sequence_t __tag_;
+  /*_CCCL_NO_UNIQUE_ADDRESS*/ ::cuda::std::__ignore_t __ign_;
   _Sndr1 __sndr1_;
   _Sndr2 __sndr2_;
 };
 
 template <class _Sndr1, class _Sndr2>
-_CCCL_NODEBUG_API constexpr auto sequence_t::operator()(_Sndr1 __sndr1, _Sndr2 __sndr2) const
+_CCCL_API constexpr auto sequence_t::operator()(_Sndr1 __sndr1, _Sndr2 __sndr2) const
 {
   using __sndr_t _CCCL_NODEBUG_ALIAS = sequence_t::__sndr_t<_Sndr1, _Sndr2>;
   return __sndr_t{{}, {}, static_cast<_Sndr1&&>(__sndr1), static_cast<_Sndr2&&>(__sndr2)};
diff --git a/cudax/include/cuda/experimental/__execution/start_detached.cuh b/cudax/include/cuda/experimental/__execution/start_detached.cuh
index 1d8b8ffd9fe..9797cc6e0f4 100644
--- a/cudax/include/cuda/experimental/__execution/start_detached.cuh
+++ b/cudax/include/cuda/experimental/__execution/start_detached.cuh
@@ -97,7 +97,7 @@ public:
 
   /// run detached.
   template <class _Sndr>
-  _CCCL_NODEBUG_API void operator()(_Sndr __sndr) const
+  _CCCL_API void operator()(_Sndr __sndr) const
   {
     using __domain_t _CCCL_NODEBUG_ALIAS = __completion_domain_of_t<set_value_t, _Sndr, env<>>;
     execution::apply_sender(__domain_t{}, *this, static_cast<_Sndr&&>(__sndr));
diff --git a/cudax/include/cuda/experimental/__execution/starts_on.cuh b/cudax/include/cuda/experimental/__execution/starts_on.cuh
index 0a68918448d..6175a0bed2c 100644
--- a/cudax/include/cuda/experimental/__execution/starts_on.cuh
+++ b/cudax/include/cuda/experimental/__execution/starts_on.cuh
@@ -86,15 +86,15 @@ namespace cuda::experimental::execution
 struct starts_on_t
 {
 private:
-  template <class _Sch, class... _Env>
-  using __env2_t = __join_env_t<__call_result_t<__mk_sch_env_t, _Sch, _Env...>, _Env...>;
-
   template <class _Sch, class... _Env>
   [[nodiscard]] _CCCL_API static constexpr auto __mk_env2(_Sch __sch, _Env&&... __env)
   {
     return __join_env(__mk_sch_env(__sch, __env...), static_cast<_Env&&>(__env)...);
   }
 
+  template <class _Sch, class... _Env>
+  using __env2_t = decltype(__mk_env2(declval<_Sch>(), declval<_Env>()...));
+
 public:
   template <class _Sch, class _Sndr>
   struct _CCCL_TYPE_VISIBILITY_DEFAULT __sndr_t;
@@ -107,7 +107,7 @@ public:
   }
 
   template <class _Sch, class _Sndr>
-  _CCCL_NODEBUG_API constexpr auto operator()(_Sch __sch, _Sndr __sndr) const;
+  _CCCL_API constexpr auto operator()(_Sch __sch, _Sndr __sndr) const;
 };
 
 template <class _Query>
@@ -129,7 +129,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT starts_on_t::__sndr_t
   struct _CCCL_TYPE_VISIBILITY_DEFAULT __attrs_t
   {
     // If the sender has a _SetTag completion, then the completion scheduler for _SetTag
-    // is the sender's if it has one.
+    // is the sender's.
     template <class _SetTag, class... _Env>
     [[nodiscard]] _CCCL_API constexpr auto query(get_completion_scheduler_t<_SetTag>, _Env&&... __env) const noexcept
       -> __call_result_t<get_completion_scheduler_t<_SetTag>, env_of_t<_Sndr>, __env2_t<_Sch, _Env>...>
@@ -138,19 +138,8 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT starts_on_t::__sndr_t
         execution::get_env(__self_->__sndr_), __mk_env2(__self_->__sch_, static_cast<_Env&&>(__env))...);
     }
 
-    // If the sender does not have a _SetTag completion (and _SetTag is not set_value_t),
-    // then the completion scheduler for _SetTag is the scheduler sender's if it has one.
-    _CCCL_TEMPLATE(class _SetTag, class... _Env)
-    _CCCL_REQUIRES((!__same_as<_SetTag, set_value_t>) _CCCL_AND(
-      execution::get_completion_signatures<_Sndr, __env2_t<_Sch, _Env>...>().count(_SetTag{}) == 0))
-    [[nodiscard]] _CCCL_API constexpr auto query(get_completion_scheduler_t<_SetTag>, _Env&&... __env) const noexcept
-      -> __call_result_t<get_completion_scheduler_t<_SetTag>, _Sch, __fwd_env_t<_Env>...>
-    {
-      return get_completion_scheduler<_SetTag>(__self_->__sch_, __fwd_env(static_cast<_Env&&>(__env))...);
-    }
-
     // If the sender has a _SetTag completion, then the completion scheduler for _SetTag
-    // is the sender's if it has one.
+    // is the sender's.
     template <class _SetTag, class... _Env>
     [[nodiscard]] _CCCL_API constexpr auto query(get_completion_domain_t<_SetTag>, _Env&&... __env) const noexcept
       -> __call_result_t<get_completion_domain_t<_SetTag>, env_of_t<_Sndr>, __env2_t<_Sch, _Env>...>
@@ -158,17 +147,6 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT starts_on_t::__sndr_t
       return {};
     }
 
-    // If the sender does not have a _SetTag completion (and _SetTag is not set_value_t),
-    // then the completion scheduler for _SetTag is the scheduler sender's if it has one.
-    _CCCL_TEMPLATE(class _SetTag, class... _Env)
-    _CCCL_REQUIRES((!__same_as<_SetTag, set_value_t>) _CCCL_AND(
-      execution::get_completion_signatures<_Sndr, __env2_t<_Sch, _Env>...>().count(_SetTag{}) == 0))
-    [[nodiscard]] _CCCL_API constexpr auto query(get_completion_domain_t<_SetTag>, _Env&&...) const noexcept
-      -> __call_result_t<get_completion_domain_t<_SetTag>, _Sch, __fwd_env_t<_Env>...>
-    {
-      return {};
-    }
-
     template <class... _Env>
     [[nodiscard]] _CCCL_API constexpr auto query(get_completion_behavior_t, _Env&&...) const noexcept
     {
@@ -212,13 +190,13 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT starts_on_t::__sndr_t
     return __attrs_t{this};
   }
 
-  _CCCL_NO_UNIQUE_ADDRESS starts_on_t __tag_;
+  /*_CCCL_NO_UNIQUE_ADDRESS*/ starts_on_t __tag_;
   _Sch __sch_;
   _Sndr __sndr_;
 };
 
 template <class _Sch, class _Sndr>
-[[nodiscard]] _CCCL_NODEBUG_API constexpr auto starts_on_t::operator()(_Sch __sch, _Sndr __sndr) const
+[[nodiscard]] _CCCL_API constexpr auto starts_on_t::operator()(_Sch __sch, _Sndr __sndr) const
 {
   return __sndr_t<_Sch, _Sndr>{{}, static_cast<_Sch&&>(__sch), static_cast<_Sndr&&>(__sndr)};
 }
diff --git a/cudax/include/cuda/experimental/__execution/stream/adaptor.cuh b/cudax/include/cuda/experimental/__execution/stream/adaptor.cuh
index eeb3a86b7f4..6da0844fc06 100644
--- a/cudax/include/cuda/experimental/__execution/stream/adaptor.cuh
+++ b/cudax/include/cuda/experimental/__execution/stream/adaptor.cuh
@@ -68,28 +68,22 @@ namespace cuda::experimental::execution
 {
 namespace __stream
 {
-template <class _Rcvr>
-struct __completion_fn
+struct __complete_rcvr
 {
-  template <class _Tag, class... _Args>
-  _CCCL_API void operator()(_Tag, _Args&&... __args) const noexcept
+  template <class _Rcvr, class _Tag, class... _Args>
+  _CCCL_API void operator()(_Rcvr& __rcvr, _Tag, _Args&&... __args) const noexcept
   {
-    _Tag{}(static_cast<_Rcvr&&>(__rcvr_), static_cast<_Args&&>(__args)...);
+    _Tag{}(static_cast<_Rcvr&&>(__rcvr), static_cast<_Args&&>(__args)...);
   }
-
-  _Rcvr& __rcvr_;
 };
 
-template <class _Rcvr>
-struct __results_visitor
+struct __visit_results
 {
-  template <class _Tuple>
-  _CCCL_API void operator()(_Tuple&& __tuple) const noexcept
+  template <class _Rcvr, class _Tuple>
+  _CCCL_API void operator()(_Rcvr& __rcvr, _Tuple&& __tuple) const noexcept
   {
-    ::cuda::std::__apply(__completion_fn<_Rcvr>{__rcvr_}, static_cast<_Tuple&&>(__tuple));
+    ::cuda::std::__apply(__complete_rcvr{}, static_cast<_Tuple&&>(__tuple), __rcvr);
   }
-
-  _Rcvr& __rcvr_;
 };
 
 // __state_t lives in managed memory. It stores everything the operation state needs,
@@ -128,7 +122,7 @@ _CCCL_VISIBILITY_HIDDEN __launch_bounds__(_BlockThreads) __global__
   void __completion_kernel(__state_base_t<_Rcvr, _Variant>* __state)
 {
   _CCCL_ASSERT(__state->__results_.__index() != __npos, "__completion_kernel called with empty results");
-  _Variant::__visit(__results_visitor<_Rcvr>{__state->__rcvr_}, __state->__results_);
+  _Variant::__visit(__visit_results{}, __state->__results_, __state->__rcvr_);
 }
 
 // This is the environment of the inner receiver that is used to connect the child sender.
@@ -156,7 +150,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __env_t
   }
 
   _Env __env_;
-  _CCCL_NO_UNIQUE_ADDRESS _Config __launch_config_;
+  /*_CCCL_NO_UNIQUE_ADDRESS*/ _Config __launch_config_;
 };
 
 // This is the inner receiver that is used to connect the child sender.
@@ -178,13 +172,13 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __rcvr_t
   }
 
   template <class... _Args>
-  _CCCL_NODEBUG_API constexpr void set_value(_Args&&... __args) noexcept
+  _CCCL_API constexpr void set_value(_Args&&... __args) noexcept
   {
     __complete(execution::set_value, static_cast<_Args&&>(__args)...);
   }
 
   template <class _Error>
-  _CCCL_NODEBUG_API constexpr void set_error(_Error&& __err) noexcept
+  _CCCL_API constexpr void set_error(_Error&& __err) noexcept
   {
     // Map any exception_ptr error completions to cudaErrorUnknown:
     if constexpr (__same_as<::cuda::std::remove_cvref_t<_Error>, ::std::exception_ptr>)
@@ -197,7 +191,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __rcvr_t
     }
   }
 
-  _CCCL_NODEBUG_API constexpr void set_stopped() noexcept
+  _CCCL_API constexpr void set_stopped() noexcept
   {
     __complete(execution::set_stopped);
   }
@@ -227,14 +221,14 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __opstate_t
 
   _CCCL_API constexpr void start() noexcept
   {
-    NV_IF_TARGET(NV_IS_HOST, (__host_start();), (__device_start();));
+    NV_IF_TARGET(NV_IS_HOST, ({ __host_start(); }), ({ __device_start(); }));
   }
 
   // This is called by the continues_on adaptor after it has sync'ed the stream.
   template <class _Rcvr2>
-  _CCCL_HOST_API auto __set_results(_Rcvr2& __rcvr) noexcept
+  _CCCL_API auto __set_results(_Rcvr2& __rcvr) noexcept
   {
-    __results_t::__visit(__results_visitor<_Rcvr2&>{__rcvr}, __get_state().__state_.__results_);
+    __results_t::__visit(__visit_results{}, __get_state().__state_.__results_, __rcvr);
   }
 
 private:
@@ -283,11 +277,6 @@ private:
     // of this operation.
     execution::start(__state.__opstate_);
 
-    // printf("Launching completion kernel for %s with %d block threads and %d grid blocks\n",
-    //        __name,
-    //        __block_threads,
-    //        __grid_blocks);
-
     // launch a kernel to pass the results to the receiver.
     __completion_kernel<__block_threads><<<__grid_blocks, __block_threads, 0, __stream_.get()>>>(&__state.__state_);
 
@@ -315,7 +304,7 @@ private:
   // This is the part of the operation state that is stored in managed memory.
   struct __state_t
   {
-    _CCCL_HOST_API constexpr explicit __state_t(_CvSndr&& __sndr, _Rcvr __rcvr)
+    _CCCL_API constexpr explicit __state_t(_CvSndr&& __sndr, _Rcvr __rcvr)
         : __state_{{static_cast<_Rcvr&&>(__rcvr), {}, false}, get_launch_config(execution::get_env(__sndr))}
         , __opstate_(execution::connect(static_cast<_CvSndr&&>(__sndr), __rcvr_t{&__state_}))
     {}
@@ -360,12 +349,13 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __attrs_t
   // sender adaptor, like `then` or `let_value`. A stream sender adaptor is an
   // implementation detail that is not visible to the user. It should be as transparent as
   // possible.
-  _CCCL_TEMPLATE(class _Query)
-  _CCCL_REQUIRES(__queryable_with<env_of_t<_Sndr>, _Query>)
-  [[nodiscard]] _CCCL_API constexpr auto query(_Query) const noexcept(__nothrow_queryable_with<env_of_t<_Sndr>, _Query>)
-    -> __query_result_t<env_of_t<_Sndr>, _Query>
+  _CCCL_TEMPLATE(class _Query, class... _Args)
+  _CCCL_REQUIRES(__queryable_with<env_of_t<_Sndr>, _Query, _Args...>)
+  [[nodiscard]] _CCCL_API constexpr auto query(_Query, _Args&&... __args) const
+    noexcept(__nothrow_queryable_with<env_of_t<_Sndr>, _Query, _Args...>)
+      -> __query_result_t<env_of_t<_Sndr>, _Query, _Args...>
   {
-    return execution::get_env(__sndr_.__sndr_).query(_Query{});
+    return execution::get_env(__sndr_.__sndr_).query(_Query{}, static_cast<_Args&&>(__args)...);
   }
 
   const __sndr_t<_Sndr, _GetStream>& __sndr_;
@@ -407,8 +397,8 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __sndr_t
     return __attrs_t<_Sndr, _GetStream>{*this};
   }
 
-  _CCCL_NO_UNIQUE_ADDRESS __tag_t<__stream::__tag_of_t<_Sndr>> __tag_;
-  _CCCL_NO_UNIQUE_ADDRESS _GetStream __get_stream_;
+  /*_CCCL_NO_UNIQUE_ADDRESS*/ __tag_t<__stream::__tag_of_t<_Sndr>> __tag_;
+  /*_CCCL_NO_UNIQUE_ADDRESS*/ _GetStream __get_stream_;
   _Sndr __sndr_;
 };
 
diff --git a/cudax/include/cuda/experimental/__execution/stream/bulk.cuh b/cudax/include/cuda/experimental/__execution/stream/bulk.cuh
index 24beab3ab63..551b1226c33 100644
--- a/cudax/include/cuda/experimental/__execution/stream/bulk.cuh
+++ b/cudax/include/cuda/experimental/__execution/stream/bulk.cuh
@@ -89,7 +89,7 @@ struct __bulk_chunked_t : execution::__bulk_t<__bulk_chunked_t>
   // domain argument of stream_domain. It adapts a `bulk_chunked` sender to the stream
   // domain.
   template <class _Sndr>
-  _CCCL_API constexpr auto operator()(_Sndr&& __sndr, ::cuda::std::__ignore_t) const
+  _CCCL_API constexpr auto operator()(set_value_t, _Sndr&& __sndr, ::cuda::std::__ignore_t) const
   {
     // Decompose the bulk sender into its components:
     auto& [__tag, __state, __child] = __sndr;
@@ -124,9 +124,10 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __bulk_unchunked_t : execution::__bulk_t<__
     {
       const _Shape __tid = threadIdx.x + blockIdx.x * blockDim.x;
 
-      if (__tid < this->__shape_)
+      // Each thread processes exactly one element, if it is in range.
+      if (__tid < this->__state_->__shape_)
       {
-        this->__fn_(_Shape(__tid), __values...);
+        this->__state_->__fn_(_Shape(__tid), __values...);
       }
 
       __syncthreads();
@@ -135,7 +136,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __bulk_unchunked_t : execution::__bulk_t<__
       // elements.
       if (__tid == 0)
       {
-        execution::set_value(static_cast<_Rcvr&&>(this->__rcvr_), static_cast<_Values&&>(__values)...);
+        execution::set_value(static_cast<_Rcvr&&>(this->__state_->__rcvr_), static_cast<_Values&&>(__values)...);
       }
     }
   };
@@ -152,7 +153,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __bulk_unchunked_t : execution::__bulk_t<__
   // domain argument of stream_domain. It adapts a `bulk_unchunked` sender to the stream
   // domain.
   template <class _Sndr>
-  _CCCL_API constexpr auto operator()(_Sndr&& __sndr, ::cuda::std::__ignore_t) const
+  _CCCL_API constexpr auto operator()(set_value_t, _Sndr&& __sndr, ::cuda::std::__ignore_t) const
   {
     // Decompose the bulk sender into its components:
     auto& [__tag, __state, __child] = __sndr;
@@ -186,11 +187,11 @@ struct __bulk_t : execution::__bulk_t<__bulk_t>
   {};
 
   template <class _Sndr>
-  _CCCL_API constexpr auto operator()(_Sndr&& __sndr, ::cuda::std::__ignore_t) const -> decltype(auto)
+  _CCCL_API constexpr auto operator()(set_value_t, _Sndr&& __sndr, ::cuda::std::__ignore_t) const -> decltype(auto)
   {
     // This converts a bulk sender into a bulk_chunked sender, which will then be
     // further transformed by __bulk_chunked_t above.
-    return bulk.transform_sender(static_cast<_Sndr&&>(__sndr), env{});
+    return bulk.transform_sender(set_value, static_cast<_Sndr&&>(__sndr), env{});
   }
 };
 
diff --git a/cudax/include/cuda/experimental/__execution/stream/context.cuh b/cudax/include/cuda/experimental/__execution/stream/context.cuh
index 35af625124e..04eb2e7ca33 100644
--- a/cudax/include/cuda/experimental/__execution/stream/context.cuh
+++ b/cudax/include/cuda/experimental/__execution/stream/context.cuh
@@ -45,12 +45,12 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT stream_context : private __immovable
     __stream_.sync();
   }
 
-  [[nodiscard]] _CCCL_HOST_API constexpr auto query(get_stream_t) const noexcept -> stream_ref
+  [[nodiscard]] _CCCL_API constexpr auto query(get_stream_t) const noexcept -> stream_ref
   {
     return __stream_;
   }
 
-  [[nodiscard]] _CCCL_NODEBUG_HOST_API auto get_scheduler() noexcept -> stream_scheduler
+  [[nodiscard]] _CCCL_API auto get_scheduler() noexcept -> stream_scheduler
   {
     return stream_scheduler{__stream_};
   }
diff --git a/cudax/include/cuda/experimental/__execution/stream/continues_on.cuh b/cudax/include/cuda/experimental/__execution/stream/continues_on.cuh
index 43671359328..06bf7eff32a 100644
--- a/cudax/include/cuda/experimental/__execution/stream/continues_on.cuh
+++ b/cudax/include/cuda/experimental/__execution/stream/continues_on.cuh
@@ -45,6 +45,25 @@ namespace cuda::experimental::execution
 {
 namespace __stream
 {
+//! The customization of continues_on, when transferring back to the CPU, involves
+//! adapting the sender and receiver types.
+//!
+//! A continues_on sender such as continues_on(sndr, sch), where sndr completes on the GPU,
+//! needs to synchronize the CUDA stream to ensure that all queued GPU work is finished.
+//! Only then can the schedule operation be safely invoked -- from the CPU.
+//!
+//! To effect this, continues_on(sndr, sch) is transformed into
+//! continues_on(SYNC-STREAM-ADAPTOR(sndr), sch), where SYNC-STREAM-ADAPTOR(sndr) is a
+//! sender that does the following:
+//!
+//! 1. In connect (called on host): Connects sndr with a sink receiver that ignores values
+//!    passed to it and simply returns. The sink receiver's completion operations are
+//!    executed on device when the child sender completes.
+//!
+//! 2. In start (called on host): Starts the child sender, which launches kernels for the
+//!    predecessor operations, and then synchronizes the CUDA stream to ensure all queued
+//!    GPU work is finished. Then, it pulls the results from sndr's operation state and
+//!    passes them to the receiver on the host.
 struct __continues_on_t
 {
   // Transition from the GPU to the CPU domain
@@ -92,9 +111,18 @@ struct __continues_on_t
 
     _CCCL_IMMOVABLE(__opstate_t);
 
-    _CCCL_HOST_API void start() noexcept
+    _CCCL_API void start() noexcept
     {
+      NV_IF_TARGET(NV_IS_HOST, ({ __host_start(); }), ({ __device_start(); }));
+    }
+
+    _CCCL_HOST_API void __host_start() noexcept
+    {
+      // This launches all predecessor kernels on the given stream
       execution::start(__opstate_);
+
+      // Synchronize the CUDA stream to make sure all predecessor work has completed, and
+      // the results are available in __opstate_.
       if (auto __status = ::cudaStreamSynchronize(__stream_.get()); __status != ::cudaSuccess)
       {
         execution::set_error(static_cast<_Rcvr&&>(__rcvr_), cudaError_t(__status));
@@ -102,19 +130,31 @@ struct __continues_on_t
       else
       {
         // __opstate_ is an instance of __stream::__opstate_t, and it has a __set_results
-        // member function that will pass the results to the receiver on the host.
+        // member function that will pass the results to the receiver on the host. __rcvr_
+        // is the receiver of the parent default continues_on operation. That receiver
+        // will then start the schedule operation on the host.
         __opstate_.__set_results(__rcvr_);
       }
     }
 
+    [[noreturn]] _CCCL_DEVICE_API void __device_start() noexcept
+    {
+      _CCCL_ASSERT(false, "internal error: stream::continues_on opstate started on device");
+      ::cuda::std::terminate();
+
+      // We do not want the following to be called, but we need these code paths to be
+      // instantiated. Without this, the __device_start function in stream/adaptor.cuh
+      // will not be instantiated, and the kernel launch in the adaptor's __host_start
+      // function will fail.
+      execution::start(__opstate_);
+      __opstate_.__set_results(__rcvr_);
+    }
+
     _Rcvr __rcvr_;
     stream_ref __stream_;
     connect_result_t<_Sndr, __rcvr_t<_Rcvr>> __opstate_;
   };
 
-  struct _CCCL_TYPE_VISIBILITY_DEFAULT __thunk_t
-  {};
-
   template <class _Sndr>
   struct _CCCL_TYPE_VISIBILITY_DEFAULT __sndr_t
   {
@@ -143,24 +183,36 @@ struct __continues_on_t
       return execution::get_env(__sndr_);
     }
 
-    _CCCL_NO_UNIQUE_ADDRESS __thunk_t __tag_;
-    _CCCL_NO_UNIQUE_ADDRESS ::cuda::std::__ignore_t __ignore_;
+    // The use of __tag_t here instructs the stream_domain not to apply any further
+    // transformations to this sender. See stream/domain.cuh.
+    /*_CCCL_NO_UNIQUE_ADDRESS*/ __tag_t<continues_on_t> __tag_;
+    /*_CCCL_NO_UNIQUE_ADDRESS*/ ::cuda::std::__ignore_t __ignore_;
     _Sndr __sndr_;
   };
 
-  // If the child sender has not already been adapted to be a stream sender, we adapt it
-  // now.
-  _CCCL_TEMPLATE(class _Sndr)
-  _CCCL_REQUIRES((!::cuda::__is_specialization_of_v<decay_t<__child_of_t<_Sndr>>, __stream::__sndr_t>) )
+  template <class _Sndr>
+  _CCCL_API static constexpr auto __mk_sndr(_Sndr&& __sndr)
+  {
+    return __sndr_t<_Sndr>{{}, {}, static_cast<_Sndr&&>(__sndr)};
+  }
+
+  // This function is called when a continues_on sender, with a predecessor that completes
+  // on the stream scheduler, is being connected. It wraps the child sender so that it
+  // synchronizes the stream after launching the child.
+  template <class _Sndr>
   [[nodiscard]] _CCCL_API auto operator()(set_value_t, _Sndr&& __sndr, ::cuda::std::__ignore_t) const
   {
-    auto& [__tag, __sched, __child] = __sndr;
-    using __child_t                 = ::cuda::std::__copy_cvref_t<_Sndr, decltype(__child)>;
+    [[maybe_unused]] auto& [__tag, __sched, __child] = __sndr;
+    using __child_t                                  = ::cuda::std::__copy_cvref_t<_Sndr, decltype(__child)>;
 
-    auto __adapted_sndr    = __stream::__adapt(static_cast<__child_t&&>(__child));
-    using __adapted_sndr_t = decltype(__adapted_sndr);
-    return execution::continues_on(
-      __sched, __sndr_t<__adapted_sndr_t>{{}, {}, static_cast<__adapted_sndr_t&&>(__adapted_sndr)});
+    if constexpr (::cuda::__is_specialization_of_v<decltype(__child), __sndr_t>)
+    {
+      return static_cast<_Sndr&&>(__sndr);
+    }
+    else
+    {
+      return execution::continues_on(__mk_sndr(static_cast<__child_t&&>(__child)), __sched);
+    }
   }
 };
 } // namespace __stream
diff --git a/cudax/include/cuda/experimental/__execution/stream/domain.cuh b/cudax/include/cuda/experimental/__execution/stream/domain.cuh
index fbbae7ffd79..c67440b0bba 100644
--- a/cudax/include/cuda/experimental/__execution/stream/domain.cuh
+++ b/cudax/include/cuda/experimental/__execution/stream/domain.cuh
@@ -27,6 +27,7 @@
 #include <cuda/std/__type_traits/is_callable.h>
 
 #include <cuda/experimental/__detail/type_traits.cuh>
+#include <cuda/experimental/__execution/cpos.cuh>
 #include <cuda/experimental/__execution/domain.cuh>
 #include <cuda/experimental/__execution/queries.cuh>
 #include <cuda/experimental/__execution/type_traits.cuh>
@@ -105,11 +106,8 @@ struct stream_domain
   struct __apply_adapt_t
   {
     // This is the default apply function that adapts a sender to a stream sender.
-    // The constraint prevents this function from applying an adaptor to a sender
-    // that has already been adapted. The __stream::__adapted_t query is present
-    // only on receivers that come from an adapted sender.
     template <class _Sndr>
-    _CCCL_API constexpr auto operator()(_Sndr&& __sndr, ::cuda::std::__ignore_t) const
+    _CCCL_API constexpr auto operator()(::cuda::std::__ignore_t, _Sndr&& __sndr, ::cuda::std::__ignore_t) const
       noexcept(__nothrow_decay_copyable<_Sndr>)
     {
       return __stream::__adapt(static_cast<_Sndr&&>(__sndr));
@@ -119,7 +117,7 @@ struct stream_domain
   struct __apply_passthru_t
   {
     template <class _Sndr>
-    _CCCL_API constexpr auto operator()(_Sndr&& __sndr, ::cuda::std::__ignore_t) const
+    _CCCL_API constexpr auto operator()(::cuda::std::__ignore_t, _Sndr&& __sndr, ::cuda::std::__ignore_t) const
       noexcept(__nothrow_movable<_Sndr>) -> _Sndr
     {
       return static_cast<_Sndr&&>(__sndr);
@@ -158,20 +156,19 @@ struct stream_domain
 public:
   _CCCL_TEMPLATE(class _Tag, class _Sndr, class... _Args)
   _CCCL_REQUIRES(__callable<__apply_t<_Tag>, _Sndr, _Args...>)
-  _CCCL_NODEBUG_HOST_API static constexpr auto
+  _CCCL_API static constexpr auto
   apply_sender(_Tag, _Sndr&& __sndr, _Args&&... __args) noexcept(__nothrow_callable<__apply_t<_Tag>, _Sndr, _Args...>)
     -> __call_result_t<__apply_t<_Tag>, _Sndr, _Args...>
   {
-    return __apply_t<_Tag>{}(static_cast<_Sndr&&>(__sndr), static_cast<_Args&&>(__args)...);
+    return __apply_t<_Tag>()(static_cast<_Sndr&&>(__sndr), static_cast<_Args&&>(__args)...);
   }
 
-  _CCCL_TEMPLATE(class _Sndr, class _Env)
-  _CCCL_REQUIRES(__callable<__transform_strategy_t<_Sndr, _Env>, _Sndr, const _Env&>)
-  _CCCL_NODEBUG_API static constexpr auto transform_sender(_Sndr&& __sndr, const _Env& __env) noexcept(
-    __nothrow_callable<__transform_strategy_t<_Sndr, _Env>, _Sndr, const _Env&>)
-    -> __call_result_t<__transform_strategy_t<_Sndr, _Env>, _Sndr, const _Env&>
+  _CCCL_TEMPLATE(class _OpTag, class _Sndr, class _Env, class _Apply = __transform_strategy_t<_Sndr, _Env>)
+  _CCCL_REQUIRES(__callable<_Apply, _OpTag, _Sndr, const _Env&>)
+  _CCCL_API static constexpr auto transform_sender(_OpTag, _Sndr&& __sndr, const _Env& __env) noexcept(
+    __nothrow_callable<_Apply, _OpTag, _Sndr, const _Env&>) -> __call_result_t<_Apply, _OpTag, _Sndr, const _Env&>
   {
-    return __transform_strategy_t<_Sndr, _Env>{}(static_cast<_Sndr&&>(__sndr), __env);
+    return _Apply()(_OpTag(), static_cast<_Sndr&&>(__sndr), __env);
   }
 };
 
diff --git a/cudax/include/cuda/experimental/__execution/stream/scheduler.cuh b/cudax/include/cuda/experimental/__execution/stream/scheduler.cuh
index 2c0ab902948..1bb2b6553c6 100644
--- a/cudax/include/cuda/experimental/__execution/stream/scheduler.cuh
+++ b/cudax/include/cuda/experimental/__execution/stream/scheduler.cuh
@@ -45,11 +45,11 @@ namespace cuda::experimental
 {
 namespace execution
 {
-template <int _BlockThreads, class _Tag, class _Rcvr, class... _Args>
-_CCCL_VISIBILITY_HIDDEN __launch_bounds__(_BlockThreads) __global__
-  void __stream_complete(_Tag, _Rcvr* __rcvr, _Args... __args)
+template <int _BlockThreads, class _Tag, class _Rcvr>
+//_CCCL_VISIBILITY_HIDDEN
+__launch_bounds__(_BlockThreads) __global__ void __stream_complete(_Tag, _Rcvr* __rcvr)
 {
-  _Tag{}(static_cast<_Rcvr&&>(*__rcvr), static_cast<_Args&&>(__args)...);
+  _Tag{}(static_cast<_Rcvr&&>(*__rcvr));
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////
@@ -91,7 +91,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT stream_scheduler
     [[nodiscard]] _CCCL_API constexpr auto query(get_completion_domain_t<set_error_t>, _Env&& __env) const noexcept
       -> __call_result_t<get_domain_t, _Env&>
     {
-      return execution::get_domain(__env);
+      return {};
     }
 
     [[nodiscard]] _CCCL_TRIVIAL_API constexpr auto query(get_completion_behavior_t) const noexcept
@@ -114,8 +114,9 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT stream_scheduler
         : __rcvr_{static_cast<_Rcvr&&>(__rcvr)}
         , __stream_{__stream_ref}
     {
-      _CCCL_ASSERT(execution::__get_pointer_attributes(this).type == cudaMemoryTypeManaged,
-                   "stream scheduler's operation state must be allocated in managed memory");
+      NV_IF_TARGET(NV_IS_HOST,
+                   (_CCCL_ASSERT(execution::__get_pointer_attributes(this).type == cudaMemoryTypeManaged,
+                                 "stream scheduler's operation state must be allocated in managed memory");))
     }
 
     _CCCL_IMMOVABLE(__opstate_t);
@@ -136,13 +137,10 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT stream_scheduler
       int const __grid_blocks       = __launch_dims.count(experimental::block, experimental::grid);
       static_assert(__block_threads != ::cuda::std::dynamic_extent);
 
-      // printf("Launching completion kernel for stream_scheduler with %d block threads and %d grid blocks\n",
-      //        __block_threads,
-      //        __grid_blocks);
-
       // Launch the kernel that completes the receiver with the launch configuration from
       // the receiver.
-      __stream_complete<__block_threads><<<__grid_blocks, __block_threads, 0, __stream_.get()>>>(set_value, &__rcvr_);
+      __stream_complete<__block_threads, set_value_t, _Rcvr>
+        <<<__grid_blocks, __block_threads, 0, __stream_.get()>>>(set_value, &__rcvr_);
 
       if (auto __status = cudaGetLastError(); __status != cudaSuccess)
       {
@@ -158,7 +156,8 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT stream_scheduler
 
       // without the following, the kernel in __host_start will fail to launch with
       // cudaErrorInvalidDeviceFunction.
-      ::__cccl_unused(&__stream_complete<__block_threads, set_value_t, _Rcvr>);
+      ::cuda::std::ignore = &__stream_complete<__block_threads, set_value_t, _Rcvr>;
+
       execution::set_value(static_cast<_Rcvr&&>(__rcvr_));
     }
 
@@ -170,7 +169,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT stream_scheduler
   {};
 
 public:
-  _CCCL_API explicit constexpr stream_scheduler(stream_ref __stream) noexcept
+  _CCCL_API constexpr stream_scheduler(stream_ref __stream) noexcept
       : __stream_{__stream}
   {}
 
@@ -201,7 +200,7 @@ public:
       return __opstate_t<_Rcvr>{static_cast<_Rcvr&&>(__rcvr), __attrs_.__stream_};
     }
 
-    _CCCL_NO_UNIQUE_ADDRESS __tag_t __tag_;
+    /*_CCCL_NO_UNIQUE_ADDRESS*/ __tag_t __tag_;
     __attrs_t __attrs_;
   };
 
@@ -256,17 +255,40 @@ struct stream_domain::__apply_t<stream_scheduler::__tag_t> : stream_domain::__ap
 
 } // namespace execution
 
-_CCCL_HOST_API inline auto stream_ref::schedule() const noexcept
+_CCCL_API inline auto stream_ref::schedule() const noexcept
 {
   return execution::schedule(execution::stream_scheduler{*this});
 }
 
-[[nodiscard]] _CCCL_API constexpr auto stream_ref::query(const execution::get_domain_t&) const noexcept
+[[nodiscard]] _CCCL_API constexpr auto
+stream_ref::query(const execution::get_completion_scheduler_t<execution::set_value_t>&) const noexcept -> stream_ref
+{
+  return *this;
+}
+
+template <class _Env>
+[[nodiscard]] _CCCL_API constexpr auto stream_ref::query(
+  const execution::get_completion_scheduler_t<execution::set_error_t>&, const _Env& __env) const noexcept
+  -> execution::__scheduler_of_t<const _Env&>
+{
+  return execution::get_scheduler(__env);
+}
+
+[[nodiscard]] _CCCL_API constexpr auto
+stream_ref::query(const execution::get_completion_domain_t<execution::set_value_t>&) const noexcept
   -> execution::stream_domain
 {
   return {};
 }
 
+template <class _Env>
+[[nodiscard]] _CCCL_API constexpr auto
+stream_ref::query(const execution::get_completion_domain_t<execution::set_error_t>&, const _Env& __env) const noexcept
+  -> __call_result_t<execution::get_domain_t, const _Env&>
+{
+  return {};
+}
+
 } // namespace cuda::experimental
 
 _CCCL_DIAG_POP
diff --git a/cudax/include/cuda/experimental/__execution/stream/sequence.cuh b/cudax/include/cuda/experimental/__execution/stream/sequence.cuh
index f8e186ad022..39f23c0db06 100644
--- a/cudax/include/cuda/experimental/__execution/stream/sequence.cuh
+++ b/cudax/include/cuda/experimental/__execution/stream/sequence.cuh
@@ -29,18 +29,18 @@
 
 namespace cuda::experimental::execution
 {
-/////////////////////////////////////////////////////////////////////////////////
-// sequence: customization for the stream scheduler
-template <>
-struct stream_domain::__apply_t<sequence_t>
-{
-  template <class _Sndr, class _Env>
-  _CCCL_API auto operator()(_Sndr __sndr, const _Env& __env) const
-  {
-    static_assert(::cuda::std::__always_false_v<_Sndr>,
-                  "The CUDA stream scheduler does not yet support the `sequence` algorithm.");
-  }
-};
+// /////////////////////////////////////////////////////////////////////////////////
+// // sequence: customization for the stream scheduler
+// template <>
+// struct stream_domain::__apply_t<sequence_t>
+// {
+//   template <class _Sndr, class _Env>
+//   _CCCL_API auto operator()(_Sndr __sndr, const _Env& __env) const
+//   {
+//     static_assert(::cuda::std::__always_false_v<_Sndr>,
+//                   "The CUDA stream scheduler does not yet support the 'sequence' algorithm.");
+//   }
+// };
 
 } // namespace cuda::experimental::execution
 
diff --git a/cudax/include/cuda/experimental/__execution/stream/starts_on.cuh b/cudax/include/cuda/experimental/__execution/stream/starts_on.cuh
deleted file mode 100644
index d77d037e44b..00000000000
--- a/cudax/include/cuda/experimental/__execution/stream/starts_on.cuh
+++ /dev/null
@@ -1,107 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of CUDA Experimental in CUDA C++ Core Libraries,
-// under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __CUDAX_EXECUTION_STREAM_STARTS_ON
-#define __CUDAX_EXECUTION_STREAM_STARTS_ON
-
-#include <cuda/std/detail/__config>
-
-#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
-#  pragma GCC system_header
-#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
-#  pragma clang system_header
-#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
-#  pragma system_header
-#endif // no system header
-
-#include <cuda/__stream/get_stream.h>
-#include <cuda/std/__functional/compose.h>
-#include <cuda/std/__tuple_dir/ignore.h>
-#include <cuda/std/__utility/forward_like.h>
-
-#include <cuda/experimental/__detail/type_traits.cuh>
-#include <cuda/experimental/__execution/env.cuh>
-#include <cuda/experimental/__execution/starts_on.cuh>
-#include <cuda/experimental/__execution/stream/adaptor.cuh>
-#include <cuda/experimental/__execution/stream/domain.cuh>
-#include <cuda/experimental/__execution/write_env.cuh>
-
-#include <cuda/experimental/__execution/prologue.cuh>
-
-namespace cuda::experimental::execution
-{
-namespace __stream
-{
-struct __starts_on_t
-{
-  struct _CCCL_TYPE_VISIBILITY_DEFAULT __get_stream_fn
-  {
-    template <class _Sndr>
-    [[nodiscard]] _CCCL_API auto operator()(const _Sndr& __sndr, ::cuda::std::__ignore_t) const
-    {
-      // __sndr is a write_env sender (see __mk_sndr_base below), which contains an
-      // environment that contains the stream scheduler, from which we can obtain the
-      // stream.
-      auto& [__ign0, __env, __ign1] = __sndr;
-      return cuda::get_stream(get_scheduler(__env));
-    }
-  };
-
-  template <class _Sch, class _Sndr>
-  [[nodiscard]] static _CCCL_API constexpr auto __mk_sndr_base(_Sch __sch, _Sndr&& __sndr)
-  {
-    // This is the implementation of the starts_on sender for the stream domain. _Sndr
-    // here is the child of the starts_on sender, and _Sch is the stream scheduler. We use
-    // write_env to let _Sndr and its children know that they are running on the stream
-    // scheduler. We construct the adaptor with a __get_stream_fn that knows how to obtain
-    // the stream from the write_env sender.
-    return __stream::__adapt(write_env(static_cast<_Sndr&&>(__sndr), __mk_sch_env(__sch)), __get_stream_fn{});
-  }
-
-  template <class _Sch, class _Sndr>
-  using __sndr_base_t = decltype(__starts_on_t::__mk_sndr_base(declval<_Sch>(), declval<_Sndr>()));
-
-  template <class _Sch, class _Sndr>
-  using __with_sch_t = __call_result_t<write_env_t, _Sndr, __call_result_t<__mk_sch_env_t, _Sch>>;
-
-  // Wrap the sender returned from __mk_sndr_base in a type that hides the complexity of
-  // the sender's type name. This results in more readable diagnostics.
-  template <class _Sch, class _Sndr>
-  struct _CCCL_TYPE_VISIBILITY_DEFAULT __sndr_t : __stream::__sndr_t<__with_sch_t<_Sch, _Sndr>, __get_stream_fn>
-  {
-    // BUGBUG NO this is a wrong use of __mk_sch_env. it needs to be passed an
-    // environment. turn this into a transform_sender.
-    _CCCL_API explicit constexpr __sndr_t(_Sch __sch, _Sndr __sndr)
-        : __stream::__sndr_t<__with_sch_t<_Sch, _Sndr>, __get_stream_fn>{
-            {}, {}, write_env(static_cast<_Sndr&&>(__sndr), __mk_sch_env(__sch))}
-    {}
-  };
-
-  // The connect cpo calls transform_sender, which is directed here for starts_on senders.
-  // It returns a custom sender that knows how to start the child sender on the specified
-  // stream.
-  template <class _Sndr>
-  [[nodiscard]] _CCCL_API auto operator()(_Sndr&& __sndr, ::cuda::std::__ignore_t) const
-  {
-    auto& [__ign0, __sch, __child] = __sndr;
-    return __sndr_t{__sch, ::cuda::std::forward_like<_Sndr>(__child)};
-  }
-};
-} // namespace __stream
-
-// Start work on the GPU
-template <>
-struct stream_domain::__apply_t<starts_on_t> : __stream::__starts_on_t
-{};
-} // namespace cuda::experimental::execution
-
-#include <cuda/experimental/__execution/epilogue.cuh>
-
-#endif // __CUDAX_EXECUTION_STREAM_STARTS_ON
diff --git a/cudax/include/cuda/experimental/__execution/stream/sync_wait.cuh b/cudax/include/cuda/experimental/__execution/stream/sync_wait.cuh
index e8919d94e71..e9c07f19bba 100644
--- a/cudax/include/cuda/experimental/__execution/stream/sync_wait.cuh
+++ b/cudax/include/cuda/experimental/__execution/stream/sync_wait.cuh
@@ -23,6 +23,7 @@
 
 #include <cuda/std/__utility/move.h>
 
+#include <cuda/experimental/__execution/cpos.cuh>
 #include <cuda/experimental/__execution/stream/domain.cuh>
 #include <cuda/experimental/__execution/sync_wait.cuh>
 #include <cuda/experimental/__execution/utility.cuh>
@@ -45,7 +46,7 @@ struct __sync_wait_t : private sync_wait_t
     // The transformation would happen in due course in the connect cpo, so why transform
     // it here? This transformation shuffles the sender into one that can provide a
     // stream_ref, which is needed by __host_apply.
-    auto __new_sndr = execution::transform_sender(stream_domain{}, static_cast<_Sndr&&>(__sndr), __env);
+    auto __new_sndr = stream_domain{}.transform_sender(set_value, static_cast<_Sndr&&>(__sndr), __env);
 
     NV_IF_TARGET(NV_IS_HOST,
                  (return __host_apply(::cuda::std::move(__new_sndr), static_cast<_Env&&>(__env));),
@@ -61,7 +62,7 @@ struct __sync_wait_t : private sync_wait_t
 
 private:
   template <class _Sndr, class _Env>
-  struct _CCCL_TYPE_VISIBILITY_DEFAULT __state_t
+  struct _CCCL_TYPE_VISIBILITY_DEFAULT __managed_state_t
   {
     using __partial_completions_t = completion_signatures_of_t<_Sndr, __env_t<_Env>>;
     using __all_nothrow_t =
@@ -74,9 +75,9 @@ private:
     using __errors_t = __error_types<__completions_t, __decayed_variant>;
     using __rcvr_t   = sync_wait_t::__rcvr_t<__values_t, __errors_t, _Env>;
 
-    _CCCL_HOST_API explicit __state_t(_Sndr&& __sndr, _Env&& __env)
+    _CCCL_HOST_API explicit __managed_state_t(_Sndr&& __sndr, _Env&& __env)
         : __result_{}
-        , __state_{{{}, static_cast<_Env&&>(__env)}, &__result_, {}}
+        , __state_{static_cast<_Env&&>(__env), &__result_}
         , __opstate_{execution::connect(static_cast<_Sndr&&>(__sndr), __rcvr_t{&__state_})}
     {}
 
@@ -97,7 +98,7 @@ private:
     stream_ref __stream = __get_stream(__sndr, __env);
 
     // Launch the sender with a continuation that will fill in a variant
-    using __box_t = __managed_box<__state_t<_Sndr, _Env>>;
+    using __box_t = __managed_box<__managed_state_t<_Sndr, _Env>>;
     auto __box    = __box_t::__make_unique(static_cast<_Sndr&&>(__sndr), static_cast<_Env&&>(__env));
     execution::start(__box->__value.__opstate_);
 
diff --git a/cudax/include/cuda/experimental/__execution/stream_context.cuh b/cudax/include/cuda/experimental/__execution/stream_context.cuh
index e5bb2bf5602..ce60d0545fb 100644
--- a/cudax/include/cuda/experimental/__execution/stream_context.cuh
+++ b/cudax/include/cuda/experimental/__execution/stream_context.cuh
@@ -21,17 +21,16 @@
 #  pragma system_header
 #endif // no system header
 
-// // IWYU pragma: begin_exports
-// #include <cuda/experimental/__execution/stream/adaptor.cuh>
-// #include <cuda/experimental/__execution/stream/bulk.cuh>
-// #include <cuda/experimental/__execution/stream/context.cuh>
-// #include <cuda/experimental/__execution/stream/continues_on.cuh>
-// #include <cuda/experimental/__execution/stream/domain.cuh>
-// #include <cuda/experimental/__execution/stream/launch.cuh>
-// #include <cuda/experimental/__execution/stream/let_value.cuh>
-// #include <cuda/experimental/__execution/stream/sequence.cuh>
-// #include <cuda/experimental/__execution/stream/starts_on.cuh>
-// #include <cuda/experimental/__execution/stream/sync_wait.cuh>
-// // IWYU pragma: end_exports
+// IWYU pragma: begin_exports
+#include <cuda/experimental/__execution/stream/adaptor.cuh>
+#include <cuda/experimental/__execution/stream/bulk.cuh>
+#include <cuda/experimental/__execution/stream/context.cuh>
+#include <cuda/experimental/__execution/stream/continues_on.cuh>
+#include <cuda/experimental/__execution/stream/domain.cuh>
+#include <cuda/experimental/__execution/stream/launch.cuh>
+#include <cuda/experimental/__execution/stream/let_value.cuh>
+#include <cuda/experimental/__execution/stream/sequence.cuh>
+#include <cuda/experimental/__execution/stream/sync_wait.cuh>
+// IWYU pragma: end_exports
 
 #endif //__CUDAX_EXECUTION_STREAM_CONTEXT
diff --git a/cudax/include/cuda/experimental/__execution/sync_wait.cuh b/cudax/include/cuda/experimental/__execution/sync_wait.cuh
index 2dd857f735b..b90f8cd4d97 100644
--- a/cudax/include/cuda/experimental/__execution/sync_wait.cuh
+++ b/cudax/include/cuda/experimental/__execution/sync_wait.cuh
@@ -62,7 +62,8 @@ struct sync_wait_t
     basic_run_loop<_Env> __loop_;
   };
 
-  template <class _Env>
+public:
+  template <class _Env = env<>>
   struct _CCCL_TYPE_VISIBILITY_DEFAULT __env_t
   {
     _CCCL_EXEC_CHECK_DISABLE
@@ -108,6 +109,7 @@ struct sync_wait_t
   template <class... _Ts>
   using __decayed_tuple = ::cuda::std::tuple<decay_t<_Ts>...>;
 
+  _CUDAX_SEMI_PRIVATE :
   template <class _Values, class _Errors, class _Env = env<>>
   struct _CCCL_TYPE_VISIBILITY_DEFAULT __state_t : __state_base_t<_Env>
   {
@@ -177,7 +179,15 @@ struct sync_wait_t
   struct __throw_error_fn
   {
     template <class _Error>
-    _CCCL_HOST_API void operator()(_Error __err) const
+    [[noreturn]]
+    _CCCL_API void operator()(_Error __err) const
+    {
+      NV_IF_TARGET(NV_IS_HOST, (__do_throw(static_cast<_Error&&>(__err));), (::cuda::std::terminate();))
+    }
+
+    template <class _Error>
+    [[noreturn]]
+    _CCCL_HOST_API static void __do_throw(_Error __err)
     {
       if constexpr (__same_as<_Error, ::std::exception_ptr>)
       {
@@ -195,6 +205,7 @@ struct sync_wait_t
       {
         throw static_cast<_Error&&>(__err);
       }
+      _CCCL_UNREACHABLE();
     }
   };
 
diff --git a/cudax/include/cuda/experimental/__execution/then.cuh b/cudax/include/cuda/experimental/__execution/then.cuh
index 67bb5f9e11e..90abee390d4 100644
--- a/cudax/include/cuda/experimental/__execution/then.cuh
+++ b/cudax/include/cuda/experimental/__execution/then.cuh
@@ -140,7 +140,7 @@ struct __upon_t
     }
 
     template <class _Tag, class... _Ts>
-    _CCCL_NODEBUG_API void __complete(_Tag, _Ts&&... __ts) noexcept
+    _CCCL_API void __complete(_Tag, _Ts&&... __ts) noexcept
     {
       if constexpr (_Tag{} == _SetTag{})
       {
@@ -227,13 +227,13 @@ struct __upon_t
                                                        // extended (host/device) lambda
   {
     template <class _Sndr>
-    _CCCL_NODEBUG_API constexpr auto operator()(_Sndr __sndr) -> __call_result_t<__upon_tag_t, _Sndr, _Fn>
+    _CCCL_API constexpr auto operator()(_Sndr __sndr) -> __call_result_t<__upon_tag_t, _Sndr, _Fn>
     {
       return __upon_tag_t{}(static_cast<_Sndr&&>(__sndr), static_cast<_Fn&&>(__fn_));
     }
 
     template <class _Sndr>
-    _CCCL_NODEBUG_API friend constexpr auto operator|(_Sndr __sndr, __closure_base_t __self) //
+    _CCCL_API friend constexpr auto operator|(_Sndr __sndr, __closure_base_t __self) //
       -> __call_result_t<__upon_tag_t, _Sndr, _Fn>
     {
       return __upon_tag_t{}(static_cast<_Sndr&&>(__sndr), static_cast<_Fn&&>(__self.__fn_));
@@ -244,10 +244,10 @@ struct __upon_t
 
 public:
   template <class _Sndr, class _Fn>
-  _CCCL_NODEBUG_API constexpr auto operator()(_Sndr __sndr, _Fn __fn) const;
+  _CCCL_API constexpr auto operator()(_Sndr __sndr, _Fn __fn) const;
 
   template <class _Fn>
-  _CCCL_NODEBUG_API constexpr auto operator()(_Fn __fn) const;
+  _CCCL_API constexpr auto operator()(_Fn __fn) const;
 };
 
 struct then_t : __upon_t<then_t, set_value_t>
@@ -330,7 +330,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __upon_t<_UponTag, _SetTag>::__sndr_base_t
     return __fwd_env(execution::get_env(__sndr_));
   }
 
-  _CCCL_NO_UNIQUE_ADDRESS __upon_tag_t __tag_;
+  /*_CCCL_NO_UNIQUE_ADDRESS*/ __upon_tag_t __tag_;
   _Fn __fn_;
   _Sndr __sndr_;
 };
@@ -365,7 +365,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT upon_stopped_t::__closure_t
 
 template <class _UponTag, class _SetTag>
 template <class _Sndr, class _Fn>
-_CCCL_NODEBUG_API constexpr auto __upon_t<_UponTag, _SetTag>::operator()(_Sndr __sndr, _Fn __fn) const
+_CCCL_API constexpr auto __upon_t<_UponTag, _SetTag>::operator()(_Sndr __sndr, _Fn __fn) const
 {
   using __sndr_t = typename _UponTag::template __sndr_t<_Sndr, _Fn>;
 
@@ -381,7 +381,7 @@ _CCCL_NODEBUG_API constexpr auto __upon_t<_UponTag, _SetTag>::operator()(_Sndr _
 
 template <class _UponTag, class _SetTag>
 template <class _Fn>
-_CCCL_NODEBUG_API constexpr auto __upon_t<_UponTag, _SetTag>::operator()(_Fn __fn) const
+_CCCL_API constexpr auto __upon_t<_UponTag, _SetTag>::operator()(_Fn __fn) const
 {
   using __closure_t = typename _UponTag::template __closure_t<_Fn>;
   return __closure_t{{static_cast<_Fn&&>(__fn)}};
diff --git a/cudax/include/cuda/experimental/__execution/thread_context.cuh b/cudax/include/cuda/experimental/__execution/thread_context.cuh
index 219bee64ca0..47b001647fa 100644
--- a/cudax/include/cuda/experimental/__execution/thread_context.cuh
+++ b/cudax/include/cuda/experimental/__execution/thread_context.cuh
@@ -51,7 +51,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT thread_context
     }
   }
 
-  _CCCL_HOST_API auto get_scheduler()
+  _CCCL_API auto get_scheduler()
   {
     return __loop_.get_scheduler();
   }
diff --git a/cudax/include/cuda/experimental/__execution/transform_completion_signatures.cuh b/cudax/include/cuda/experimental/__execution/transform_completion_signatures.cuh
index f18d65200cc..adb964ab7f4 100644
--- a/cudax/include/cuda/experimental/__execution/transform_completion_signatures.cuh
+++ b/cudax/include/cuda/experimental/__execution/transform_completion_signatures.cuh
@@ -40,7 +40,7 @@ template <class _Tag>
 struct __default_transform_fn
 {
   template <class... _Ts>
-  _CCCL_NODEBUG_API _CCCL_CONSTEVAL auto operator()() const noexcept -> completion_signatures<_Tag(_Ts...)>
+  _CCCL_API _CCCL_CONSTEVAL auto operator()() const noexcept -> completion_signatures<_Tag(_Ts...)>
   {
     return {};
   }
@@ -49,7 +49,7 @@ struct __default_transform_fn
 struct __swallow_transform
 {
   template <class... _Ts>
-  _CCCL_NODEBUG_API _CCCL_CONSTEVAL auto operator()() const noexcept -> completion_signatures<>
+  _CCCL_API _CCCL_CONSTEVAL auto operator()() const noexcept -> completion_signatures<>
   {
     return {};
   }
@@ -59,7 +59,7 @@ template <class _Tag>
 struct __decay_transform
 {
   template <class... _Ts>
-  _CCCL_NODEBUG_API _CCCL_CONSTEVAL auto operator()() const noexcept -> completion_signatures<_Tag(decay_t<_Ts>...)>
+  _CCCL_API _CCCL_CONSTEVAL auto operator()() const noexcept -> completion_signatures<_Tag(decay_t<_Ts>...)>
   {
     return {};
   }
@@ -70,7 +70,7 @@ using __meta_call_result_t _CCCL_NODEBUG_ALIAS = decltype(declval<_Fn>().templat
 
 _CCCL_EXEC_CHECK_DISABLE
 template <class _Ay, class... _As, class _Fn>
-[[nodiscard]] _CCCL_NODEBUG_API _CCCL_CONSTEVAL auto __transform_expr(const _Fn& __fn)
+[[nodiscard]] _CCCL_API _CCCL_CONSTEVAL auto __transform_expr(const _Fn& __fn)
   -> __meta_call_result_t<const _Fn&, _Ay, _As...>
 {
   return __fn.template operator()<_Ay, _As...>();
@@ -78,7 +78,7 @@ template <class _Ay, class... _As, class _Fn>
 
 _CCCL_EXEC_CHECK_DISABLE
 template <class _Fn>
-[[nodiscard]] _CCCL_NODEBUG_API _CCCL_CONSTEVAL auto __transform_expr(const _Fn& __fn) -> __call_result_t<const _Fn&>
+[[nodiscard]] _CCCL_API _CCCL_CONSTEVAL auto __transform_expr(const _Fn& __fn) -> __call_result_t<const _Fn&>
 {
   return __fn();
 }
diff --git a/cudax/include/cuda/experimental/__execution/transform_sender.cuh b/cudax/include/cuda/experimental/__execution/transform_sender.cuh
index 5de2017450c..187cdfeab48 100644
--- a/cudax/include/cuda/experimental/__execution/transform_sender.cuh
+++ b/cudax/include/cuda/experimental/__execution/transform_sender.cuh
@@ -36,10 +36,18 @@ namespace cuda::experimental::execution
 {
 namespace __detail
 {
+template <class _Env>
+using __starting_domain = __call_result_or_t<get_domain_t, default_domain, const _Env&>;
+
+template <class _Sndr, class _Env>
+using __completing_domain =
+  __call_result_t<__first_callable<get_domain_override_t, get_completion_domain_t<set_value_t>>,
+                  env_of_t<_Sndr>,
+                  const _Env&>;
+
 template <class _Domain, class _OpTag>
 struct __transform_sender_t
 {
-private:
   template <class _Sndr, class _Env>
   using __domain_for_t = ::cuda::std::_If< //
     __has_transform_sender<_Domain, _OpTag, _Sndr, _Env>,
@@ -58,13 +66,17 @@ private:
     {
       return __declfn<__result_t, __is_nothrow>;
     }
-    else
+    else if constexpr (__same_as<_OpTag, start_t>)
     {
       return __get_declfn<__result_t, const _Env&, (_Nothrow && __is_nothrow)>();
     }
+    else
+    {
+      using __transform_recurse_t = __transform_sender_t<__completing_domain<__result_t, _Env>, set_value_t>;
+      return __transform_recurse_t::template __get_declfn<__result_t, set_value_t, (_Nothrow && __is_nothrow)>();
+    }
   }
 
-public:
   template <class _Sndr, class _Env, auto _DeclFn = __get_declfn<_Sndr, _Env>()>
   [[nodiscard]] _CCCL_API constexpr auto operator()(_Sndr&& __sndr, const _Env& __env) const
     noexcept(noexcept(_DeclFn())) -> decltype(_DeclFn())
@@ -76,10 +88,16 @@ public:
     {
       return __domain_t().transform_sender(_OpTag(), static_cast<_Sndr&&>(__sndr), __env);
     }
-    else
+    else if constexpr (__same_as<_OpTag, start_t>)
     {
       return (*this)(__domain_t().transform_sender(_OpTag(), static_cast<_Sndr&&>(__sndr), __env), __env);
     }
+    else
+    {
+      using __transform_recurse_t = __transform_sender_t<__completing_domain<__result_t, _Env>, set_value_t>;
+      return __transform_recurse_t()(
+        __domain_t().transform_sender(_OpTag(), static_cast<_Sndr&&>(__sndr), __env), __env);
+    }
   }
 };
 } // namespace __detail
@@ -92,25 +110,17 @@ private:
   {
     template <class _Sndr, class _Env>
     _CCCL_API constexpr auto operator()(_Sndr&& __sndr, const _Env& __env) const
-      noexcept(noexcept(_Fn2()(_Fn1()(static_cast<_Sndr&&>(__sndr), __env), __env)))
-        -> decltype(_Fn2()(_Fn1()(static_cast<_Sndr&&>(__sndr), __env), __env))
+      noexcept(noexcept(_Fn1()(_Fn2()(static_cast<_Sndr&&>(__sndr), __env), __env)))
+        -> decltype(_Fn1()(_Fn2()(static_cast<_Sndr&&>(__sndr), __env), __env))
     {
-      return _Fn2()(_Fn1()(static_cast<_Sndr&&>(__sndr), __env), __env);
+      return _Fn1()(_Fn2()(static_cast<_Sndr&&>(__sndr), __env), __env);
     }
   };
 
-  template <class _Env>
-  using __starting_domain = __call_result_or_t<get_domain_t, default_domain, const _Env&>;
-
-  template <class _Sndr, class _Env>
-  using __completing_domain =
-    __call_result_t<__first_callable<get_domain_override_t, get_completion_domain_t<set_value_t>>,
-                    env_of_t<_Sndr>,
-                    const _Env&>;
-
   template <class _Sndr, class _Env>
-  using __impl_fn_t = __compose<__detail::__transform_sender_t<__starting_domain<const _Env&>, start_t>,
-                                __detail::__transform_sender_t<__completing_domain<_Sndr, const _Env&>, set_value_t>>;
+  using __impl_fn_t =
+    __compose<__detail::__transform_sender_t<__detail::__starting_domain<_Env>, start_t>,
+              __detail::__transform_sender_t<__detail::__completing_domain<_Sndr, _Env>, set_value_t>>;
 
 public:
   template <class _Sndr, class _Env, class _ImplFn = __impl_fn_t<_Sndr, _Env>>
diff --git a/cudax/include/cuda/experimental/__execution/utility.cuh b/cudax/include/cuda/experimental/__execution/utility.cuh
index 3d4205d2398..6e5e4d90c0a 100644
--- a/cudax/include/cuda/experimental/__execution/utility.cuh
+++ b/cudax/include/cuda/experimental/__execution/utility.cuh
@@ -134,6 +134,8 @@ template <class _Ty>
   return __attrs;
 }
 
+#define __debug_printf(...) (printf(__VA_ARGS__), [] NV_IF_TARGET(NV_IS_HOST, (fflush(stdout);), (void(0);))())
+
 // This function can only be called from a catch handler.
 [[nodiscard]] _CCCL_HOST_API inline auto __get_cuda_error_from_active_exception() -> ::cudaError_t
 {
@@ -208,7 +210,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __first_callable
 private:
   //! @brief Returns the first function that is callable with a given set of arguments.
   template <class... _Args, class _Self>
-  [[nodiscard]] _CCCL_NODEBUG_API static constexpr auto __get_1st(_Self&& __self) noexcept -> decltype(auto)
+  [[nodiscard]] _CCCL_API static constexpr auto __get_1st(_Self&& __self) noexcept -> decltype(auto)
   {
     // NOLINTNEXTLINE (modernize-avoid-c-arrays)
     constexpr bool __flags[] = {__callable<::cuda::std::__copy_cvref_t<_Self, _Fns>, _Args...>..., false};
@@ -227,7 +229,7 @@ public:
   //! @brief Calls the first function that is callable with a given set of arguments.
   _CCCL_EXEC_CHECK_DISABLE
   template <class... _Args>
-  _CCCL_NODEBUG_API constexpr auto
+  _CCCL_API constexpr auto
   operator()(_Args&&... __args) && noexcept(__nothrow_callable<__1st_fn_t<__first_callable, _Args...>, _Args...>)
     -> __call_result_t<__1st_fn_t<__first_callable, _Args...>, _Args...>
   {
@@ -238,7 +240,7 @@ public:
   //! @overload
   _CCCL_EXEC_CHECK_DISABLE
   template <class... _Args>
-  _CCCL_NODEBUG_API constexpr auto operator()(_Args&&... __args) const& noexcept(
+  _CCCL_API constexpr auto operator()(_Args&&... __args) const& noexcept(
     __nothrow_callable<__1st_fn_t<__first_callable const&, _Args...>, _Args...>)
     -> __call_result_t<__1st_fn_t<__first_callable const&, _Args...>, _Args...>
   {
@@ -255,7 +257,8 @@ _CCCL_HOST_DEVICE __first_callable(_Fns...) -> __first_callable<_Fns...>;
 // __call_or
 namespace __detail
 {
-// query an environment, or return a default value if the query is not supported
+// call a function with a set of arguments or return a default value if the function is
+// not callable.
 struct __call_or_t
 {
   _CCCL_EXEC_CHECK_DISABLE
@@ -280,8 +283,7 @@ struct __call_or_t
 _CCCL_GLOBAL_CONSTANT __detail::__call_or_t __call_or{};
 
 template <class _Fn, class _Default, class... _Args>
-using __call_result_or_t _CCCL_NODEBUG_ALIAS =
-  decltype(__call_or(::cuda::std::declval<_Fn>(), ::cuda::std::declval<_Default>(), ::cuda::std::declval<_Args>()...));
+using __call_result_or_t _CCCL_NODEBUG_ALIAS = __call_result_t<__detail::__call_or_t, _Fn, _Default, _Args...>;
 
 //! @brief A callable that always return a value of type _Ty, regardless of the arguments
 //! passed to it.
@@ -289,18 +291,18 @@ template <class _Ty>
 struct _CCCL_TYPE_VISIBILITY_DEFAULT __always
 {
   template <class... _Args>
-  [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(_Args&&...) && noexcept -> _Ty&&
+  [[nodiscard]] _CCCL_API constexpr auto operator()(_Args&&...) && noexcept -> _Ty&&
   {
     return static_cast<_Ty&&>(__value);
   }
 
   template <class... _Args>
-  [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(_Args&&...) const& noexcept -> _Ty const&
+  [[nodiscard]] _CCCL_API constexpr auto operator()(_Args&&...) const& noexcept -> _Ty const&
   {
     return __value;
   }
 
-  _CCCL_NO_UNIQUE_ADDRESS _Ty __value{};
+  /*_CCCL_NO_UNIQUE_ADDRESS*/ _Ty __value{};
 };
 
 template <class _Ty>
diff --git a/cudax/include/cuda/experimental/__execution/variant.cuh b/cudax/include/cuda/experimental/__execution/variant.cuh
index 3dcf9627a2c..0d8eb226cff 100644
--- a/cudax/include/cuda/experimental/__execution/variant.cuh
+++ b/cudax/include/cuda/experimental/__execution/variant.cuh
@@ -62,7 +62,7 @@ public:
     _CCCL_ASSERT(false, "cannot visit a stateless variant");
   }
 
-  [[nodiscard]] _CCCL_NODEBUG_API static constexpr size_t __index() noexcept
+  [[nodiscard]] _CCCL_API static constexpr size_t __index() noexcept
   {
     return __npos;
   }
@@ -112,12 +112,12 @@ public:
     __destroy();
   }
 
-  [[nodiscard]] _CCCL_NODEBUG_API void* __ptr() noexcept
+  [[nodiscard]] _CCCL_API void* __ptr() noexcept
   {
     return __storage_;
   }
 
-  [[nodiscard]] _CCCL_NODEBUG_API size_t __index() const noexcept
+  [[nodiscard]] _CCCL_API size_t __index() const noexcept
   {
     return __index_;
   }
diff --git a/cudax/include/cuda/experimental/__execution/visit.cuh b/cudax/include/cuda/experimental/__execution/visit.cuh
index 7bd5c669b6d..47dd356ba58 100644
--- a/cudax/include/cuda/experimental/__execution/visit.cuh
+++ b/cudax/include/cuda/experimental/__execution/visit.cuh
@@ -96,7 +96,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT visit_t
   _CCCL_EXEC_CHECK_DISABLE
   template <class _Visitor, class _CvSndr, class _Context>
     requires(static_cast<int>(structured_binding_size<_CvSndr>) >= 2)
-  _CCCL_NODEBUG_API constexpr auto operator()(_Visitor& __visitor, _CvSndr&& __sndr, _Context& __context) const
+  _CCCL_API constexpr auto operator()(_Visitor& __visitor, _CvSndr&& __sndr, _Context& __context) const
     -> decltype(auto)
   {
     auto&& [__tag, __data, ... __children] = static_cast<_CvSndr&&>(__sndr);
@@ -146,8 +146,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT visit_t
 {
   _CCCL_TEMPLATE(class _Visitor, class _Sndr, class _Context)
   _CCCL_REQUIRES((static_cast<int>(structured_binding_size<_Sndr>) >= 2))
-  _CCCL_NODEBUG_API constexpr auto operator()(_Visitor& __visitor, _Sndr&& __sndr, _Context& __context) const
-    -> decltype(auto)
+  _CCCL_API constexpr auto operator()(_Visitor& __visitor, _Sndr&& __sndr, _Context& __context) const -> decltype(auto)
   {
     // This `if constexpr` shouldn't be needed given the `requires` clause above. It is
     // here because nvcc 12.0 has a bug where the full signature of the function template
diff --git a/cudax/include/cuda/experimental/__execution/when_all.cuh b/cudax/include/cuda/experimental/__execution/when_all.cuh
index 2ff9c80499e..2dc994b3e20 100644
--- a/cudax/include/cuda/experimental/__execution/when_all.cuh
+++ b/cudax/include/cuda/experimental/__execution/when_all.cuh
@@ -115,7 +115,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT when_all_t
     __state_t& __state_;
 
     template <class... _Ts>
-    _CCCL_NODEBUG_API constexpr void set_value(_Ts&&... __ts) noexcept
+    _CCCL_API constexpr void set_value(_Ts&&... __ts) noexcept
     {
       constexpr ::cuda::std::index_sequence_for<_Ts...>* idx = nullptr;
       __state_.template __set_value<_Index>(idx, static_cast<_Ts&&>(__ts)...);
@@ -123,7 +123,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT when_all_t
     }
 
     template <class _Error>
-    _CCCL_NODEBUG_API constexpr void set_error(_Error&& __error) noexcept
+    _CCCL_API constexpr void set_error(_Error&& __error) noexcept
     {
       __state_.__set_error(static_cast<_Error&&>(__error));
       __state_.__arrive();
@@ -293,7 +293,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT when_all_t
   struct __start_all
   {
     template <class... _Ops>
-    _CCCL_NODEBUG_API void operator()(_Ops&... __ops) const noexcept
+    _CCCL_API void operator()(_Ops&... __ops) const noexcept
     {
       (execution::start(__ops), ...);
     }
@@ -390,7 +390,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT when_all_t
 
 public:
   template <class... _Sndrs>
-  _CCCL_NODEBUG_API constexpr auto operator()(_Sndrs... __sndrs) const;
+  _CCCL_API constexpr auto operator()(_Sndrs... __sndrs) const;
 };
 
 template <class _Child, class... _Env>
@@ -521,7 +521,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT when_all_t::__sndr_t
 };
 
 template <class... _Sndrs>
-_CCCL_NODEBUG_API constexpr auto when_all_t::operator()(_Sndrs... __sndrs) const
+_CCCL_API constexpr auto when_all_t::operator()(_Sndrs... __sndrs) const
 {
   if constexpr (sizeof...(_Sndrs) == 0)
   {
diff --git a/cudax/include/cuda/experimental/__execution/write_attrs.cuh b/cudax/include/cuda/experimental/__execution/write_attrs.cuh
index 96ef9ad8c06..8359508f2e3 100644
--- a/cudax/include/cuda/experimental/__execution/write_attrs.cuh
+++ b/cudax/include/cuda/experimental/__execution/write_attrs.cuh
@@ -136,7 +136,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT write_attrs_t::__sndr_t
     return {{__env_ref(__attrs_), __fwd_env(execution::get_env(__sndr_))}};
   }
 
-  _CCCL_NO_UNIQUE_ADDRESS write_attrs_t __tag_;
+  /*_CCCL_NO_UNIQUE_ADDRESS*/ write_attrs_t __tag_;
   _Attrs __attrs_;
   _Sndr __sndr_;
 };
diff --git a/cudax/include/cuda/experimental/__execution/write_env.cuh b/cudax/include/cuda/experimental/__execution/write_env.cuh
index 7f289ffc2ca..9e176a153b9 100644
--- a/cudax/include/cuda/experimental/__execution/write_env.cuh
+++ b/cudax/include/cuda/experimental/__execution/write_env.cuh
@@ -141,7 +141,7 @@ public:
   /// @brief Wraps one sender in another that modifies the execution
   /// environment by merging in the environment specified.
   template <class _Sndr, class _Env>
-  [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(_Sndr __sndr, _Env __env) const
+  [[nodiscard]] _CCCL_API constexpr auto operator()(_Sndr __sndr, _Env __env) const
   {
     return __sndr_t<_Sndr, _Env>{{}, static_cast<_Env&&>(__env), static_cast<_Sndr&&>(__sndr)};
   }
@@ -149,7 +149,7 @@ public:
   /// @brief Returns a closure that can be used with the pipe operator
   /// to modify the execution environment.
   template <class _Env>
-  [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(_Env __env) const
+  [[nodiscard]] _CCCL_API constexpr auto operator()(_Env __env) const
   {
     return __closure_t<_Env>{static_cast<_Env&&>(__env)};
   }
@@ -185,7 +185,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT write_env_t::__sndr_t
     return __fwd_env(execution::get_env(__sndr_));
   }
 
-  _CCCL_NO_UNIQUE_ADDRESS write_env_t __tag_;
+  /*_CCCL_NO_UNIQUE_ADDRESS*/ write_env_t __tag_;
   _Env __env_;
   _Sndr __sndr_;
 };
@@ -194,14 +194,13 @@ template <class _Env>
 struct _CCCL_TYPE_VISIBILITY_DEFAULT write_env_t::__closure_t
 {
   template <class _Sndr>
-  [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(_Sndr __sndr) const -> __sndr_t<_Sndr, _Env>
+  [[nodiscard]] _CCCL_API constexpr auto operator()(_Sndr __sndr) const -> __sndr_t<_Sndr, _Env>
   {
     return __sndr_t<_Sndr, _Env>{{}, static_cast<_Env&&>(__env_), static_cast<_Sndr&&>(__sndr)};
   }
 
   template <class _Sndr>
-  [[nodiscard]] _CCCL_NODEBUG_API friend constexpr auto operator|(_Sndr __sndr, __closure_t __self)
-    -> __sndr_t<_Sndr, _Env>
+  [[nodiscard]] _CCCL_API friend constexpr auto operator|(_Sndr __sndr, __closure_t __self) -> __sndr_t<_Sndr, _Env>
   {
     return __sndr_t<_Sndr, _Env>{{}, static_cast<_Env&&>(__self.__env_), static_cast<_Sndr&&>(__sndr)};
   }
diff --git a/cudax/include/cuda/experimental/__stream/stream_ref.cuh b/cudax/include/cuda/experimental/__stream/stream_ref.cuh
index e9ec9215916..ae623c83b61 100644
--- a/cudax/include/cuda/experimental/__stream/stream_ref.cuh
+++ b/cudax/include/cuda/experimental/__stream/stream_ref.cuh
@@ -26,6 +26,7 @@
 #include <cuda/stream_ref>
 
 #include <cuda/experimental/__device/logical_device.cuh>
+#include <cuda/experimental/__execution/completion_behavior.cuh>
 #include <cuda/experimental/__execution/fwd.cuh>
 #include <cuda/experimental/__utility/ensure_current_device.cuh>
 
@@ -48,7 +49,7 @@ struct stream_ref : ::cuda::stream_ref
   //! @brief Converting constructor from \c ::cuda::stream_ref
   //!
   //! @post `*this == __other`
-  _CCCL_HOST_API constexpr stream_ref(const ::cuda::stream_ref& __other) noexcept
+  _CCCL_API constexpr stream_ref(const ::cuda::stream_ref& __other) noexcept
       : ::cuda::stream_ref(__other)
   {}
 
@@ -63,7 +64,7 @@ struct stream_ref : ::cuda::stream_ref
   //! @brief Returns a \c execution::sender that completes on this stream.
   //!
   //! @note Equivalent to `execution::schedule(execution::stream_scheduler{*this})`.
-  _CCCL_HOST_API auto schedule() const noexcept;
+  _CCCL_API auto schedule() const noexcept;
 
   //! @brief Get the logical device under which this stream was created.
   //!
@@ -108,8 +109,26 @@ struct stream_ref : ::cuda::stream_ref
     return execution::forward_progress_guarantee::weakly_parallel;
   }
 
-  [[nodiscard]] _CCCL_API constexpr auto query(const execution::get_domain_t&) const noexcept
-    -> execution::stream_domain;
+  [[nodiscard]] _CCCL_API constexpr auto query(const execution::get_completion_behavior_t&) const noexcept
+  {
+    return execution::completion_behavior::asynchronous;
+  }
+
+  [[nodiscard]] _CCCL_API constexpr auto
+  query(const execution::get_completion_scheduler_t<execution::set_value_t>&) const noexcept -> stream_ref;
+
+  template <class _Env>
+  [[nodiscard]] _CCCL_API constexpr auto
+  query(const execution::get_completion_scheduler_t<execution::set_error_t>&, const _Env& __env) const noexcept
+    -> execution::__scheduler_of_t<const _Env&>;
+
+  [[nodiscard]] _CCCL_API constexpr auto
+  query(const execution::get_completion_domain_t<execution::set_value_t>&) const noexcept -> execution::stream_domain;
+
+  template <class _Env>
+  [[nodiscard]] _CCCL_API constexpr auto
+  query(const execution::get_completion_domain_t<execution::set_error_t>&, const _Env& __env) const noexcept
+    -> __call_result_t<execution::get_domain_t, const _Env&>;
 };
 
 } // namespace cuda::experimental
diff --git a/cudax/test/CMakeLists.txt b/cudax/test/CMakeLists.txt
index d6e6c56fe5a..4888de26d87 100644
--- a/cudax/test/CMakeLists.txt
+++ b/cudax/test/CMakeLists.txt
@@ -25,6 +25,7 @@ function(cudax_add_catch2_test target_name_var test_name cn_target) # ARGN=test
   target_link_libraries(${test_target} PRIVATE
     ${cn_target}
     cccl.c2h.main
+    cudart
   )
   target_compile_options(${test_target} PRIVATE
     "-DLIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE"
@@ -78,18 +79,20 @@ foreach(cn_target IN LISTS cudax_TARGETS)
     execution/test_continues_on.cu
     execution/test_just.cu
     execution/test_let_value.cu
-    #execution/test_on.cu
+    execution/test_on.cu
     execution/test_sequence.cu
     execution/test_starts_on.cu
-    #execution/test_stream_context.cu
+    execution/test_stream_context.cu
     execution/test_then.cu
     execution/test_visit.cu
     execution/test_when_all.cu
     execution/test_write_attrs.cu
     execution/test_write_env.cu
   )
-  target_compile_options(${test_target} PRIVATE $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--extended-lambda>)
-  target_compile_options(${test_target} PRIVATE $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--expt-relaxed-constexpr>)
+
+  target_compile_options(${test_target} PRIVATE
+    $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:-allow-unsupported-compiler>
+  )
 
   # The stream context needs atomic wait/notify, which is only available on sm70 and
   # above.
diff --git a/cudax/test/execution/test_on.cu b/cudax/test/execution/test_on.cu
index 0baec6d7bc3..e896f3f3e78 100644
--- a/cudax/test/execution/test_on.cu
+++ b/cudax/test/execution/test_on.cu
@@ -8,14 +8,14 @@
 //
 //===----------------------------------------------------------------------===//
 
+// BUGBUG
+
 #include <cuda/experimental/execution.cuh>
 
 #include "testing.cuh"
 
 namespace ex = cudax::execution;
 
-namespace
-{
 __host__ __device__ bool _on_device() noexcept
 {
   NV_IF_ELSE_TARGET(NV_IS_HOST, //
@@ -59,10 +59,10 @@ void simple_start_on_stream_test()
 {
   cudax::stream str{cuda::device_ref(0)};
   auto sch  = cudax::stream_ref{str};
-  auto sndr = ex::on(sch, ex::just(42) | ex::then([] __host__ __device__(int i) -> int {
+  auto sndr = ex::on(sch, ex::just(42) | ex::then([] __host__ __device__(int i) noexcept -> int {
                             return _on_device() ? i : -i;
                           }))
-            | ex::then([] __host__ __device__(int i) -> int {
+            | ex::then([] __host__ __device__(int i) noexcept -> int {
                 return _on_device() ? -1 : i;
               });
   auto [result] = ex::sync_wait(std::move(sndr)).value();
@@ -73,16 +73,18 @@ void simple_continue_on_stream_test()
 {
   cudax::stream str{cuda::device_ref(0)};
   auto sch  = cudax::stream_ref{str};
-  auto sndr = ex::just(42) | ex::on(sch, ex::then([] __host__ __device__(int i) -> int {
+  auto sndr = ex::just(42) | ex::on(sch, ex::then([] __host__ __device__(int i) noexcept -> int {
                                       return _on_device() ? i : -i;
                                     }))
-            | ex::then([] __host__ __device__(int i) -> int {
+            | ex::then([] __host__ __device__(int i) noexcept -> int {
                 return _on_device() ? -1 : i;
               });
   auto [result] = ex::sync_wait(std::move(sndr)).value();
   CUDAX_CHECK(result == 42);
 }
 
+namespace
+{
 C2H_TEST("simple on(sch, sndr) thread test", "[on]")
 {
   simple_start_on_thread_test();
diff --git a/cudax/test/execution/test_stream_context.cu b/cudax/test/execution/test_stream_context.cu
index 101d8fc7df3..af1f25fd179 100644
--- a/cudax/test/execution/test_stream_context.cu
+++ b/cudax/test/execution/test_stream_context.cu
@@ -15,6 +15,7 @@
 #include <thrust/equal.h>
 
 #include <cuda/experimental/container.cuh>
+#include <cuda/experimental/memory_resource.cuh>
 
 #include <nv/target>
 
@@ -24,8 +25,6 @@ _CCCL_BEGIN_NV_DIAG_SUPPRESS(177) // function "_is_on_device" was declared but n
 
 namespace ex = cuda::experimental::execution;
 
-namespace
-{
 __host__ __device__ bool _is_on_device() noexcept
 {
   NV_IF_ELSE_TARGET(NV_IS_HOST, //
@@ -135,8 +134,10 @@ void bulk_on_stream_scheduler()
   auto sch = sctx.get_scheduler();
 
   using _env_t = cudax::env_t<cuda::mr::device_accessible>;
-  _env_t env{cudax::device_memory_resource{_dev}, cuda::get_stream(sch), ex::par_unseq};
-  cudax::async_device_buffer<int> buf{env, 10, 40}; // a device buffer of 10 integers, initialized to 40
+  auto mr      = cudax::device_memory_resource{_dev};
+  auto mr2     = cudax::any_resource<cudax::device_accessible>(mr);
+  _env_t env{mr, cuda::get_stream(sch), ex::par_unseq};
+  auto buf = cudax::make_async_buffer<int>(sctx, mr2, 10, 40, env); // a device buffer of 10 integers, initialized to 40
   cuda::std::span data{buf};
 
   auto start = //
@@ -153,7 +154,8 @@ void bulk_on_stream_scheduler()
         data[i] += 2;
       });
 
-  cudax::async_device_buffer<int> expected{env, 10, 42}; // a device buffer of 10 integers, initialized to 42
+  auto expected = cudax::make_async_buffer<int>(sctx, mr2, 10, 42, env); // a device buffer of 10 integers, initialized
+                                                                         // to 42
 
   // start the sender and wait for it to finish
   auto [span] = ex::sync_wait(std::move(start)).value();
@@ -206,6 +208,8 @@ void starts_on_with_stream_scheduler2()
   CHECK(i == 43);
 }
 
+namespace
+{
 // Test code is placed in separate functions to avoid an nvc++ issue with
 // extended lambdas in functions with internal linkage (as is the case
 // with C2H tests).
diff --git a/cudax/test/execution/test_visit.cu b/cudax/test/execution/test_visit.cu
index b55fcaf820e..9d8ef1c6e1a 100644
--- a/cudax/test/execution/test_visit.cu
+++ b/cudax/test/execution/test_visit.cu
@@ -33,13 +33,13 @@ static_assert(cudax_async::structured_binding_size<S2> == 2);
 template <class Fn>
 struct recursive_lambda
 {
-  Fn fn;
-
   template <class... Args>
-  __host__ __device__ auto operator()(Args&&... args)
+  auto operator()(Args&&... args)
   {
     return fn(*this, cuda::std::forward<Args>(args)...);
   }
+
+  Fn fn;
 };
 
 template <class Fn>