sycl/source/detail/graph_impl.cpp

//==--------- graph_impl.cpp - SYCL graph extension -----------------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#define __SYCL_GRAPH_IMPL_CPP

#include <stack>
#include <detail/graph_impl.hpp>
#include <detail/handler_impl.hpp>
#include <detail/kernel_arg_mask.hpp>
#include <detail/program_manager/program_manager.hpp>
#include <detail/queue_impl.hpp>
#include <detail/scheduler/commands.hpp>
#include <detail/sycl_mem_obj_t.hpp>
#include <sycl/detail/common.hpp>
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
#include <sycl/detail/string_view.hpp>
#endif
#include <sycl/feature_test.hpp>
#include <sycl/queue.hpp>

namespace sycl {
inline namespace _V1 {

namespace ext {
namespace oneapi {
namespace experimental {
namespace detail {

namespace {
/// Topologically sorts the graph in order to schedule nodes for execution.
/// This implementation is based on Kahn's algorithm which uses a Breadth-first
/// search approach.
/// For performance reasons, this function uses the MTotalVisitedEdges
/// member variable of the node_impl class. It's the caller responsibility to
/// make sure that MTotalVisitedEdges is set to 0 for all nodes in the graph
/// before calling this function.
/// @param[in] Roots List of root nodes.
/// @param[out] SortedNodes The graph nodes sorted in topological order.
/// @param[in] PartitionBounded If set to true, the topological sort is stopped
/// at partition borders. Hence, nodes belonging to a partition different from
/// the NodeImpl partition are not processed.
void sortTopological(std::set<std::weak_ptr<node_impl>,
                              std::owner_less<std::weak_ptr<node_impl>>> &Roots,
                     std::list<std::shared_ptr<node_impl>> &SortedNodes,
                     bool PartitionBounded) {
  std::stack<std::weak_ptr<node_impl>> Source;

  for (auto &Node : Roots) {
    Source.push(Node);
  }

  while (!Source.empty()) {
    auto Node = Source.top().lock();
    Source.pop();
    SortedNodes.push_back(Node);

    for (auto &SuccWP : Node->MSuccessors) {
      auto Succ = SuccWP.lock();

      if (PartitionBounded && (Succ->MPartitionNum != Node->MPartitionNum)) {
        continue;
      }

      auto &TotalVisitedEdges = Succ->MTotalVisitedEdges;
      ++TotalVisitedEdges;
      if (TotalVisitedEdges == Succ->MPredecessors.size()) {
        Source.push(Succ);
      }
    }
  }
}

/// Recursively assigns the partition number `PartitionNum` to `Node` and to
/// its predecessors.
/// The walk stops at a host task, when there are no more predecessors, or at a
/// node that already carries a partition number lower than or equal to the one
/// being propagated. Partition numbers reflect execution order, so a node's
/// number may be decreased but never increased; and since the predecessors of
/// a node are in the same or a lower-numbered partition, nothing above such a
/// node needs to be revisited.
/// @param Node Node to assign to the partition.
/// @param PartitionNum Number to propagate.
void propagatePartitionUp(std::shared_ptr<node_impl> Node, int PartitionNum) {
  // Host tasks are never renumbered by upward propagation.
  if (Node->MCGType == sycl::detail::CGType::CodeplayHostTask)
    return;

  // -1 marks a node that has not been assigned to any partition yet.
  const bool AlreadyNumbered = Node->MPartitionNum != -1;
  if (AlreadyNumbered && Node->MPartitionNum <= PartitionNum)
    return;

  Node->MPartitionNum = PartitionNum;
  for (auto &PredWP : Node->MPredecessors)
    propagatePartitionUp(PredWP.lock(), PartitionNum);
}

/// Recursively assigns the partition number `PartitionNum` to `Node` and to
/// its successors.
/// The walk stops when a host task is reached or when there are no more
/// successors.
/// @param Node Node to assign to the partition.
/// @param PartitionNum Number to propagate.
/// @param HostTaskList Work list of host tasks. A host task reached here that
/// already has a partition number is pushed to the front of this list.
void propagatePartitionDown(
    std::shared_ptr<node_impl> Node, int PartitionNum,
    std::list<std::shared_ptr<node_impl>> &HostTaskList) {
  const bool IsHostTask =
      Node->MCGType == sycl::detail::CGType::CodeplayHostTask;

  if (!IsHostTask) {
    Node->MPartitionNum = PartitionNum;
    for (auto &SuccWP : Node->MSuccessors)
      propagatePartitionDown(SuccWP.lock(), PartitionNum, HostTaskList);
    return;
  }

  // Host task: it is not renumbered here, only queued again if it had already
  // been given a partition number (-1 means "not numbered yet").
  if (Node->MPartitionNum != -1)
    HostTaskList.push_front(Node);
}

/// Checks whether a node is a root of its partition, i.e. none of its
/// predecessors carries the same partition number.
/// @param Node Node to test.
/// @return True if `Node` is a root of its partition.
bool isPartitionRoot(std::shared_ptr<node_impl> Node) {
  const auto &Preds = Node->MPredecessors;
  return std::none_of(Preds.begin(), Preds.end(), [&Node](const auto &PredWP) {
    return PredWP.lock()->MPartitionNum == Node->MPartitionNum;
  });
}

/// Builds a vector of node objects from a vector of weak_ptrs to node_impls.
/// Each weak_ptr is locked and wrapped; the output order matches the input.
std::vector<node> createNodesFromImpls(
    const std::vector<std::weak_ptr<detail::node_impl>> &Impls) {
  std::vector<node> Result;
  Result.reserve(Impls.size());

  for (const auto &ImplWP : Impls)
    Result.push_back(sycl::detail::createSyclObjFromImpl<node>(ImplWP.lock()));

  return Result;
}

/// Builds a vector of node objects from a vector of shared_ptrs to
/// node_impls. The output order matches the input.
std::vector<node> createNodesFromImpls(
    const std::vector<std::shared_ptr<detail::node_impl>> &Impls) {
  std::vector<node> Result;
  Result.reserve(Impls.size());

  for (const auto &ImplSP : Impls)
    Result.push_back(sycl::detail::createSyclObjFromImpl<node>(ImplSP));

  return Result;
}

} // anonymous namespace

/// Computes the execution schedule of this partition, unless one has already
/// been computed.
void partition::schedule() {
  // A non-empty schedule is kept as is.
  if (!MSchedule.empty())
    return;

  // MTotalVisitedEdges does not need to be reset before sorting because this
  // function is only called once per partition.
  sortTopological(MRoots, MSchedule, true);
}

/// Splits the nodes of the executable graph into partitions delimited by
/// host-task nodes.
/// The function works in several phases:
///  1. annotate every node with a temporary group number derived from the
///     host tasks (merging groups when a host task has to be reprocessed),
///  2. create one partition per non-empty group and schedule it,
///  3. guarantee that at least one (possibly empty) partition exists,
///  4. concatenate the partition schedules into the global schedule,
///  5. record, for every partition, the partitions its roots depend on,
///  6. reset the temporary group number of every node back to -1.
void exec_graph_impl::makePartitions() {
  int CurrentPartition = -1;
  std::list<std::shared_ptr<node_impl>> HostTaskList;
  // find all the host-tasks in the graph
  for (auto &Node : MNodeStorage) {
    if (Node->MCGType == sycl::detail::CGType::CodeplayHostTask) {
      HostTaskList.push_back(Node);
    }
  }

  MContainsHostTask = HostTaskList.size() > 0;
  // Annotate nodes
  // The first step in graph partitioning is to annotate all nodes of the graph
  // with a temporary partition or group number. This step allows us to group
  // the graph nodes into sets of nodes with kind of meta-dependencies that must
  // be enforced by the runtime. For example, Group 2 depends on Groups 0 and 1,
  // which means that we should not try to run Group 2 before Groups 0 and 1
  // have finished executing. Since host-tasks are currently the only tasks that
  // require runtime dependency handling, groups of nodes are created from
  // host-task nodes. We therefore loop over all the host-task nodes, and for
  // each node:
  //  - Its predecessors are assigned to group number `n-1`
  //  - The node itself constitutes a group, group number `n`
  //  - Its successors are assigned to group number `n+1`
  // Since running multiple partitions slows down the whole graph execution, we
  // then try to reduce the number of partitions by merging them when possible.
  // Typically, the grouping algorithm can create two successive partitions
  // of target nodes in the following case:
  // A host-task `A` is added to the graph. Later, another host task `B` is
  // added to the graph. Consequently, the node `A` is stored before the node
  // `B` in the node storage vector. Now, if `A` is placed as a successor of `B`
  // (using make_edge function to make node `A` dependent on node `B`.) In this
  // case, the host-task node `A` must be reprocessed after the node `B` and the
  // group that includes the predecessor of `B` can be merged with the group of
  // the predecessors of the node `A`.
  while (HostTaskList.size() > 0) {
    auto Node = HostTaskList.front();
    HostTaskList.pop_front();
    // Group `n-1`: everything upstream of the host task.
    CurrentPartition++;
    for (auto &Predecessor : Node->MPredecessors) {
      propagatePartitionUp(Predecessor.lock(), CurrentPartition);
    }
    // Group `n`: the host task itself.
    CurrentPartition++;
    Node->MPartitionNum = CurrentPartition;
    // Group `n+1`: everything downstream of the host task.
    CurrentPartition++;
    // Remember the list size so that host tasks re-queued by the downward
    // propagation can be detected.
    auto TmpSize = HostTaskList.size();
    for (auto &Successor : Node->MSuccessors) {
      propagatePartitionDown(Successor.lock(), CurrentPartition, HostTaskList);
    }
    if (HostTaskList.size() > TmpSize) {
      // At least one HostTask has been re-queued for renumbering, so there may
      // be group merge opportunities. Re-queued host tasks sit at the front of
      // the list and already have a partition number; the scan stops at the
      // first host task that has not been numbered yet.
      for (const auto &HT : HostTaskList) {
        auto HTPartitionNum = HT->MPartitionNum;
        if (HTPartitionNum != -1) {
          // can merge predecessors of node `Node` with predecessors of node
          // `HT` (HTPartitionNum-1) since HT must be reprocessed
          for (const auto &NodeImpl : MNodeStorage) {
            if (NodeImpl->MPartitionNum == Node->MPartitionNum - 1) {
              NodeImpl->MPartitionNum = HTPartitionNum - 1;
            }
          }
        } else {
          break;
        }
      }
    }
  }

  // Create partitions
  // Group numbers may be sparse (some groups end up empty), so the groups are
  // visited in increasing order and only the non-empty ones receive a final,
  // consecutive partition index. Group -1 holds the nodes that were never
  // numbered by the annotation step.
  int PartitionFinalNum = 0;
  for (int i = -1; i <= CurrentPartition; i++) {
    const std::shared_ptr<partition> &Partition = std::make_shared<partition>();
    for (auto &Node : MNodeStorage) {
      if (Node->MPartitionNum == i) {
        MPartitionNodes[Node] = PartitionFinalNum;
        if (isPartitionRoot(Node)) {
          Partition->MRoots.insert(Node);
        }
      }
    }
    // A group without roots is dropped and does not consume a final index.
    if (Partition->MRoots.size() > 0) {
      Partition->schedule();
      Partition->MIsInOrderGraph = Partition->checkIfGraphIsSinglePath();
      MPartitions.push_back(Partition);
      PartitionFinalNum++;
    }
  }

  // Add an empty partition if there is no partition, i.e. empty graph
  if (MPartitions.size() == 0) {
    MPartitions.push_back(std::make_shared<partition>());
  }

  // Make global schedule list
  for (const auto &Partition : MPartitions) {
    MSchedule.insert(MSchedule.end(), Partition->MSchedule.begin(),
                     Partition->MSchedule.end());
  }

  // Compute partition dependencies
  // Every predecessor of a partition root contributes the partition it was
  // assigned to; the same partition can be appended more than once.
  for (const auto &Partition : MPartitions) {
    for (auto const &Root : Partition->MRoots) {
      auto RootNode = Root.lock();
      for (const auto &Dep : RootNode->MPredecessors) {
        auto NodeDep = Dep.lock();
        Partition->MPredecessors.push_back(
            MPartitions[MPartitionNodes[NodeDep]]);
      }
    }
  }

  // Reset node groups (in case nodes have to be re-processed - e.g. subgraph)
  for (auto &Node : MNodeStorage) {
    Node->MPartitionNum = -1;
  }
}

/// Validates the property list passed at graph construction.
/// Two predicates are handed to PropertyValidator::checkPropsAndThrow: one
/// that accepts the data-less property kinds enumerated in
/// graph_properties.def, and one for properties with data that accepts
/// nothing.
/// NOTE(review): presumably checkPropsAndThrow throws on a property rejected
/// by the predicates - confirm against PropertyValidator.
/// @param Properties Property list to validate.
static void checkGraphPropertiesAndThrow(const property_list &Properties) {
  // The .def file is expanded into one `case` label per data-less property,
  // each returning true; any other kind falls through to `default`.
  auto CheckDataLessProperties = [](int PropertyKind) {
#define __SYCL_DATA_LESS_PROP(NS_QUALIFIER, PROP_NAME, ENUM_VAL)               \
  case NS_QUALIFIER::PROP_NAME::getKind():                                     \
    return true;
#define __SYCL_MANUALLY_DEFINED_PROP(NS_QUALIFIER, PROP_NAME)
    switch (PropertyKind) {
#include <sycl/ext/oneapi/experimental/detail/properties/graph_properties.def>

    default:
      return false;
    }
  };
  // No properties with data for graph now.
  auto NoAllowedPropertiesCheck = [](int) { return false; };
  sycl::detail::PropertyValidator::checkPropsAndThrow(
      Properties, CheckDataLessProperties, NoAllowedPropertiesCheck);
}

/// Constructs a modifiable graph implementation for the given context and
/// device.
/// The property list is validated first, then the no_cycle_check and
/// assume_buffer_outlives_graph properties are recorded. Finally the device is
/// checked for graph support.
/// @param SyclContext Context associated with the graph.
/// @param SyclDevice Device associated with the graph.
/// @param PropList Properties passed at graph construction.
/// @throws sycl::exception with errc::invalid if the device has neither the
/// ext_oneapi_limited_graph nor the ext_oneapi_graph aspect.
graph_impl::graph_impl(const sycl::context &SyclContext,
                       const sycl::device &SyclDevice,
                       const sycl::property_list &PropList)
    : MContext(SyclContext), MDevice(SyclDevice), MRecordingQueues(),
      MEventsMap(), MInorderQueueMap(),
      MID(NextAvailableID.fetch_add(1, std::memory_order_relaxed)) {
  checkGraphPropertiesAndThrow(PropList);

  if (PropList.has_property<property::graph::no_cycle_check>())
    MSkipCycleChecks = true;

  if (PropList.has_property<property::graph::assume_buffer_outlives_graph>())
    MAllowBuffers = true;

  // Either graph aspect is enough for the device to be usable.
  const bool DeviceSupportsGraphs =
      SyclDevice.has(aspect::ext_oneapi_limited_graph) ||
      SyclDevice.has(aspect::ext_oneapi_graph);
  if (DeviceSupportsGraphs)
    return;

  // Name the backend in the error message.
  std::stringstream BackendStream;
  BackendStream << SyclDevice.get_backend();
  const std::string BackendName = BackendStream.str();
  throw sycl::exception(
      sycl::make_error_code(errc::invalid),
      BackendName + " backend is not supported by SYCL Graph extension.");
}

/// Detaches the graph from the queues recording to it and marks every tracked
/// memory object as no longer being used in a graph. Exceptions derived from
/// std::exception are reported instead of escaping the destructor.
graph_impl::~graph_impl() {
  try {
    clearQueues();
    for (auto &TrackedMemObj : MMemObjs)
      TrackedMemObj->markNoLongerBeingUsedInGraph();
  } catch (std::exception &Ex) {
    __SYCL_REPORT_EXCEPTION_TO_STREAM("exception in ~graph_impl", Ex);
  }
}

std::shared_ptr<node_impl> graph_impl::addNodesToExits(
    const std::list<std::shared_ptr<node_impl>> &NodeList) {
  // Find all input and output nodes from the node list
  std::vector<std::shared_ptr<node_impl>> Inputs;
  std::vector<std::shared_ptr<node_impl>> Outputs;
  for (auto &NodeImpl : NodeList) {
    if (NodeImpl->MPredecessors.size() == 0) {
      Inputs.push_back(NodeImpl);
    }
    if (NodeImpl->MSuccessors.size() == 0) {
      Outputs.push_back(NodeImpl);
    }
  }

  // Find all exit nodes in the current graph and register the Inputs as
  // successors
  for (auto &NodeImpl : MNodeStorage) {
    if (NodeImpl->MSuccessors.size() == 0) {
      for (auto &Input : Inputs) {
        NodeImpl->registerSuccessor(Input);
      }
    }
  }

  // Add all the new nodes to the node storage
  for (auto &Node : NodeList) {
    MNodeStorage.push_back(Node);
    addEventForNode(std::make_shared<sycl::detail::event_impl>(), Node);
  }

  return this->add(Outputs);
}

/// Inserts a node into the set of root nodes of the graph.
/// @param Root Node to register as a root.
void graph_impl::addRoot(const std::shared_ptr<node_impl> &Root) {
  MRoots.insert(Root);
}

/// Erases a node from the set of root nodes of the graph.
/// @param Root Node to remove from the roots.
void graph_impl::removeRoot(const std::shared_ptr<node_impl> &Root) {
  MRoots.erase(Root);
}

/// Computes the set of graph nodes a command-group depends on.
/// Dependencies come from two sources: the events of the command-group, which
/// must all map to nodes of this graph, and its requirements, which are
/// matched against the nodes already stored in the graph.
/// @param CommandGroup Command-group to inspect.
/// @return Unique set of nodes the command-group depends on.
/// @throws sycl::exception with errc::invalid if the command-group has
/// requirements but buffers are not allowed in this graph, if it is a kernel
/// using streams, or if one of its events does not map to a node of the graph.
std::set<std::shared_ptr<node_impl>> graph_impl::getCGEdges(
    const std::shared_ptr<sycl::detail::CG> &CommandGroup) const {
  const auto &Requirements = CommandGroup->getRequirements();
  const bool HasRequirements = Requirements.size() != 0;
  if (HasRequirements && !MAllowBuffers) {
    throw sycl::exception(make_error_code(errc::invalid),
                          "Cannot use buffers in a graph without passing the "
                          "assume_buffer_outlives_graph property on "
                          "Graph construction.");
  }

  if (CommandGroup->getType() == sycl::detail::CGType::Kernel) {
    auto *KernelCG =
        static_cast<sycl::detail::CGExecKernel *>(CommandGroup.get());
    if (KernelCG->hasStreams()) {
      throw sycl::exception(
          make_error_code(errc::invalid),
          "Using sycl streams in a graph node is unsupported.");
    }
  }

  std::set<std::shared_ptr<node_impl>> UniqueDeps;

  // Event dependencies: every event must be known to this graph.
  for (auto &Dep : CommandGroup->getEvents()) {
    auto Found = MEventsMap.find(Dep);
    if (Found == MEventsMap.end()) {
      throw sycl::exception(sycl::make_error_code(errc::invalid),
                            "Event dependency from handler::depends_on does "
                            "not correspond to a node within the graph");
    }
    UniqueDeps.insert(Found->second);
  }

  // Requirement dependencies: look through the graph for nodes which share a
  // requirement with the command-group.
  for (auto &Req : Requirements) {
    for (auto &Node : MNodeStorage) {
      if (!Node->hasRequirementDependency(Req))
        continue;

      // A node is skipped when one of its successors also has the requirement;
      // that successor is then picked up as the dependency instead.
      const bool CoveredBySuccessor = std::any_of(
          Node->MSuccessors.begin(), Node->MSuccessors.end(),
          [&Req](const auto &SuccWP) {
            return SuccWP.lock()->hasRequirementDependency(Req);
          });
      if (!CoveredBySuccessor)
        UniqueDeps.insert(Node);
    }
  }

  return UniqueDeps;
}

/// Tracks the memory objects referenced by the requirements of a
/// command-group. A memory object is marked as being used in a graph only the
/// first time it is inserted into the tracking set.
/// @param CommandGroup Command-group whose requirements are inspected.
void graph_impl::markCGMemObjs(
    const std::shared_ptr<sycl::detail::CG> &CommandGroup) {
  for (auto &Req : CommandGroup->getRequirements()) {
    auto *MemObj = static_cast<sycl::detail::SYCLMemObjT *>(Req->MSYCLMemObj);
    const bool NewlyTracked = MMemObjs.insert(MemObj).second;
    if (!NewlyTracked)
      continue;
    MemObj->markBeingUsedInGraph();
  }
}

/// Creates an empty node depending on the given nodes and stores it in the
/// graph.
/// @param Deps Nodes the new node depends on.
/// @return The newly created node.
std::shared_ptr<node_impl>
graph_impl::add(std::vector<std::shared_ptr<node_impl>> &Deps) {
  auto EmptyNode = std::make_shared<node_impl>();
  MNodeStorage.push_back(EmptyNode);

  addDepsToNode(EmptyNode, Deps);

  // Associate an event with this explicit node so that it can also be used
  // with the event-based (mixed) usage.
  addEventForNode(std::make_shared<sycl::detail::event_impl>(), EmptyNode);
  return EmptyNode;
}

/// Creates a node from a command-group function and stores it in the graph.
/// The function is run against a handler bound to this graph, the handler is
/// finalized and the resulting command-group is turned into a node. Dynamic
/// parameters registered in the handler are then associated with the node.
/// @param CGF Command-group function describing the node.
/// @param Args Unused.
/// @param Deps Nodes the new node depends on.
/// @return The newly created node.
/// @throws sycl::exception with errc::invalid if the command-group is a
/// barrier, or if dynamic parameters were registered for a node that is not a
/// kernel node.
std::shared_ptr<node_impl>
graph_impl::add(std::function<void(handler &)> CGF,
                const std::vector<sycl::detail::ArgDesc> &Args,
                std::vector<std::shared_ptr<node_impl>> &Deps) {
  (void)Args;
  sycl::handler Handler{shared_from_this()};

  // Save the code location if one was set in TLS. Ideally the user's call
  // site would be captured through a parameter of graph.add, but that would
  // break the API; the user can set the location in TLS before calling
  // graph.add instead.
  sycl::detail::tls_code_loc_t Tls;
  Handler.saveCodeLoc(Tls.query(), Tls.isToplevel());

  CGF(Handler);

  if (Handler.getType() == sycl::detail::CGType::Barrier) {
    throw sycl::exception(
        make_error_code(errc::invalid),
        "The sycl_ext_oneapi_enqueue_barrier feature is not available with "
        "SYCL Graph Explicit API. Please use empty nodes instead.");
  }

  Handler.finalize();

  // A user-facing node type other than `empty` takes precedence; otherwise
  // the node type is derived from the command-group type.
  node_type NodeType = Handler.impl->MUserFacingNodeType;
  if (NodeType == ext::oneapi::experimental::node_type::empty) {
    NodeType = ext::oneapi::experimental::detail::getNodeTypeFromCG(
        Handler.getType());
  }

  auto NodeImpl =
      this->add(NodeType, std::move(Handler.impl->MGraphNodeCG), Deps);

  // Associate an event with this explicit node for mixed usage.
  addEventForNode(std::make_shared<sycl::detail::event_impl>(), NodeImpl);

  // Dynamic parameters registered in the CGF are only valid on kernel nodes.
  auto &DynamicParams = Handler.impl->MDynamicParameters;
  const bool HasDynamicParams = DynamicParams.size() > 0;
  if (HasDynamicParams && NodeType != node_type::kernel) {
    throw sycl::exception(sycl::make_error_code(errc::invalid),
                          "dynamic_parameters cannot be registered with graph "
                          "nodes which do not represent kernel executions");
  }

  // Register the actual node with each dynamic parameter.
  for (auto &[DynamicParam, ArgIndex] : DynamicParams)
    DynamicParam->registerNode(NodeImpl, ArgIndex);

  return NodeImpl;
}

/// Creates an empty node depending on the nodes associated with the given
/// events.
/// @param Events Events whose associated nodes become dependencies.
/// @return The newly created node.
/// @throws sycl::exception with errc::invalid if an event does not map to a
/// node of this graph.
std::shared_ptr<node_impl>
graph_impl::add(const std::vector<sycl::detail::EventImplPtr> Events) {
  std::vector<std::shared_ptr<node_impl>> Deps;

  // Translate every event into the node it is associated with.
  for (const auto &Dep : Events) {
    auto Found = MEventsMap.find(Dep);
    if (Found == MEventsMap.end()) {
      throw sycl::exception(sycl::make_error_code(errc::invalid),
                            "Event dependency from handler::depends_on does "
                            "not correspond to a node within the graph");
    }
    Deps.push_back(Found->second);
  }

  return this->add(Deps);
}

/// Creates a node of the given type from a command-group and stores it in the
/// graph.
/// The dependencies derived from the requirements and events of the
/// command-group are appended to `Deps` before the node is connected.
/// @param NodeType Type of the node to create.
/// @param CommandGroup Command-group executed by the node.
/// @param Deps Nodes the new node depends on; extended in place.
/// @return The newly created node.
std::shared_ptr<node_impl>
graph_impl::add(node_type NodeType,
                std::shared_ptr<sycl::detail::CG> CommandGroup,
                std::vector<std::shared_ptr<node_impl>> &Deps) {
  // Unique set of dependencies obtained from requirements and events.
  const std::set<std::shared_ptr<node_impl>> EdgeDeps =
      getCGEdges(CommandGroup);

  // Track and mark the memory objects being used by the graph.
  markCGMemObjs(CommandGroup);

  // Merge the derived dependencies into the caller-provided list.
  Deps.insert(Deps.end(), EdgeDeps.begin(), EdgeDeps.end());

  auto NewNode =
      std::make_shared<node_impl>(NodeType, std::move(CommandGroup));
  MNodeStorage.push_back(NewNode);

  addDepsToNode(NewNode, Deps);

  return NewNode;
}

/// Creates a node from a dynamic command-group and stores it in the graph.
/// All command-groups of the dynamic command-group must yield the same set of
/// dependencies on other graph nodes. The node is created from the active
/// command-group and is registered with the dynamic command-group.
/// @param DynCGImpl Dynamic command-group the node is created from.
/// @param Deps Nodes the new node depends on; extended in place with the
/// dependencies derived from the active command-group.
/// @return The newly created node.
/// @throws sycl::exception with errc::invalid if the command-groups do not
/// all have the same dependencies.
std::shared_ptr<node_impl>
graph_impl::add(std::shared_ptr<dynamic_command_group_impl> &DynCGImpl,
                std::vector<std::shared_ptr<detail::node_impl>> &Deps) {
  // Set of dependent nodes based on CG event and accessor dependencies. The
  // first command-group is the reference the others are compared against.
  std::set<std::shared_ptr<node_impl>> DynCGDeps =
      getCGEdges(DynCGImpl->MCommandGroups[0]);
  for (unsigned i = 1; i < DynCGImpl->getNumCGs(); i++) {
    auto &CG = DynCGImpl->MCommandGroups[i];
    auto CGEdges = getCGEdges(CG);
    if (CGEdges != DynCGDeps) {
      // Note the trailing space: adjacent literals are concatenated verbatim.
      throw sycl::exception(make_error_code(sycl::errc::invalid),
                            "Command-groups in dynamic command-group don't have "
                            "equivalent dependencies to other graph nodes.");
    }
  }

  // Track and mark the memory objects being used by the graph.
  for (auto &CG : DynCGImpl->MCommandGroups) {
    markCGMemObjs(CG);
  }

  // Get active dynamic command-group CG and use it to create a node object
  const auto &ActiveKernel = DynCGImpl->getActiveCG();
  node_type NodeType =
      ext::oneapi::experimental::detail::getNodeTypeFromCG(DynCGImpl->MCGType);
  std::shared_ptr<detail::node_impl> NodeImpl =
      add(NodeType, ActiveKernel, Deps);

  // Add an event associated with this explicit node for mixed usage
  addEventForNode(std::make_shared<sycl::detail::event_impl>(), NodeImpl);

  // Track the dynamic command-group used inside the node object
  DynCGImpl->MNodes.push_back(NodeImpl);

  return NodeImpl;
}

/// Detaches this graph from every queue currently recording to it and empties
/// the list of recording queues.
/// @return True if at least one still-alive queue was detached.
bool graph_impl::clearQueues() {
  bool AnyQueuesCleared = false;

  for (auto &QueueWP : MRecordingQueues) {
    auto LiveQueue = QueueWP.lock();
    // Queues that have already been destroyed are simply dropped.
    if (!LiveQueue)
      continue;
    LiveQueue->setCommandGraph(nullptr);
    AnyQueuesCleared = true;
  }

  MRecordingQueues.clear();
  return AnyQueuesCleared;
}

bool graph_impl::checkForCycles() {
  std::list<std::shared_ptr<node_impl>> SortedNodes;
  sortTopological(MRoots, SortedNodes, false);

  // If after a topological sort, not all the nodes in the graph are sorted,
  // then there must be at least one cycle in the graph. This is guaranteed
  // by Kahn's algorithm, which sortTopological() implements.
  bool CycleFound = SortedNodes.size() != MNodeStorage.size();

  // Reset the MTotalVisitedEdges variable to prepare for the next cycle check.
  for (auto &Node : MNodeStorage) {
    Node->MTotalVisitedEdges = 0;
  }

  return CycleFound;
}

/// Adds a dependency edge from `Src` to `Dest`, making `Dest` a successor of
/// `Src`.
/// The edge is registered first and a cycle check is run afterwards, unless it
/// can be skipped. If the check finds a cycle, the edge is removed again and
/// the root status of `Dest` is restored before the exception is thrown.
/// @param Src Node the edge starts from.
/// @param Dest Node the edge points to.
/// @throws sycl::exception with errc::invalid if `Src` and `Dest` are the same
/// node, if either node is not stored in this graph, or if the new edge
/// creates a cycle.
/// NOTE(review): throwIfGraphRecordingQueue presumably also throws while a
/// queue is recording to this graph - confirm against its definition.
void graph_impl::makeEdge(std::shared_ptr<node_impl> Src,
                          std::shared_ptr<node_impl> Dest) {
  throwIfGraphRecordingQueue("make_edge()");
  if (Src == Dest) {
    throw sycl::exception(
        make_error_code(sycl::errc::invalid),
        "make_edge() cannot be called when Src and Dest are the same.");
  }

  // Both endpoints must be nodes of this graph; one pass over the storage
  // looks for both and stops as soon as both have been seen.
  bool SrcFound = false;
  bool DestFound = false;
  for (const auto &Node : MNodeStorage) {

    SrcFound |= Node == Src;
    DestFound |= Node == Dest;

    if (SrcFound && DestFound) {
      break;
    }
  }

  if (!SrcFound) {
    throw sycl::exception(make_error_code(sycl::errc::invalid),
                          "Src must be a node inside the graph.");
  }
  if (!DestFound) {
    throw sycl::exception(make_error_code(sycl::errc::invalid),
                          "Dest must be a node inside the graph.");
  }

  bool DestWasGraphRoot = Dest->MPredecessors.size() == 0;

  // We need to add the edges first before checking for cycles
  Src->registerSuccessor(Dest);

  bool DestLostRootStatus = DestWasGraphRoot && Dest->MPredecessors.size() == 1;
  if (DestLostRootStatus) {
    // Dest is no longer a Root node, so we need to remove it from MRoots.
    MRoots.erase(Dest);
  }

  // The cycle check can be skipped if either Dest has no successors (a cycle
  // through the new edge needs a path from Dest back to Src, which is not
  // possible then) or cycle checks have been disabled with the no_cycle_check
  // property. It therefore only runs when Dest has successors AND checks are
  // enabled.
  const bool NeedsCycleCheck =
      !Dest->MSuccessors.empty() && !MSkipCycleChecks;
  if (NeedsCycleCheck) {
    bool CycleFound = checkForCycles();

    if (CycleFound) {
      // Remove the added successor and predecessor.
      Src->MSuccessors.pop_back();
      Dest->MPredecessors.pop_back();
      if (DestLostRootStatus) {
        // Add Dest back into MRoots.
        MRoots.insert(Dest);
      }

      throw sycl::exception(make_error_code(sycl::errc::invalid),
                            "Command graphs cannot contain cycles.");
    }
  }
  removeRoot(Dest); // remove receiver from root node list
}

/// Collects the events of the exit nodes (nodes without successors) that were
/// submitted to the given queue.
/// @param RecordedQueue Queue the events must have been submitted to.
/// @return Events associated with the matching exit nodes, in node storage
/// order.
std::vector<sycl::detail::EventImplPtr> graph_impl::getExitNodesEvents(
    std::weak_ptr<sycl::detail::queue_impl> RecordedQueue) {
  std::vector<sycl::detail::EventImplPtr> Events;

  auto RecordedQueueSP = RecordedQueue.lock();
  for (auto &Node : MNodeStorage) {
    if (!Node->MSuccessors.empty()) {
      continue;
    }
    auto EventForNode = getEventForNode(Node);
    if (EventForNode->getSubmittedQueue() == RecordedQueueSP) {
      // Reuse the event fetched above instead of looking it up a second time.
      Events.push_back(std::move(EventForNode));
    }
  }

  return Events;
}

/// Puts a queue into recording mode for this graph. The graph mutex is held
/// for the duration of the call. A queue that already has a command graph is
/// left untouched.
/// @param Queue Queue that should start recording to this graph.
void graph_impl::beginRecording(
    std::shared_ptr<sycl::detail::queue_impl> Queue) {
  graph_impl::WriteLock Lock(MMutex);

  if (Queue->hasCommandGraph())
    return;

  Queue->setCommandGraph(shared_from_this());
  addQueue(Queue);
}

// Check if nodes are empty and if so loop back through predecessors until we
// find the real dependency.
void exec_graph_impl::findRealDeps(
    std::vector<ur_exp_command_buffer_sync_point_t> &Deps,
    std::shared_ptr<node_impl> CurrentNode, int ReferencePartitionNum) {
  if (CurrentNode->isEmpty()) {
    for (auto &N : CurrentNode->MPredecessors) {
      auto NodeImpl = N.lock();
      findRealDeps(Deps, NodeImpl, ReferencePartitionNum);
    }
  } else {
    // Verify if CurrentNode belong the the same partition
    if (MPartitionNodes[CurrentNode] == ReferencePartitionNum) {
      // Verify that the sync point has actually been set for this node.
      auto SyncPoint = MSyncPoints.find(CurrentNode);
      assert(SyncPoint != MSyncPoints.end() &&
             "No sync point has been set for node dependency.");
      // Check if the dependency has already been added.
      if (std::find(Deps.begin(), Deps.end(), SyncPoint->second) ==
          Deps.end()) {
        Deps.push_back(SyncPoint->second);
      }
    }
  }
}

/// Appends a kernel node directly to a UR command-buffer through
/// enqueueImpCommandBufferKernel.
/// The command-group of the node is cast to CGExecKernel without a check, so
/// the node must hold a kernel command-group.
/// @param Ctx Context passed on to the enqueue call.
/// @param DeviceImpl Device passed on to the enqueue call.
/// @param CommandBuffer Command-buffer the kernel is appended to.
/// @param Node Kernel node to enqueue.
/// @return Sync point produced by the enqueue call for this node.
/// @throws sycl::exception with errc::invalid if the enqueue call does not
/// return UR_RESULT_SUCCESS.
ur_exp_command_buffer_sync_point_t
exec_graph_impl::enqueueNodeDirect(sycl::context Ctx,
                                   sycl::detail::DeviceImplPtr DeviceImpl,
                                   ur_exp_command_buffer_handle_t CommandBuffer,
                                   std::shared_ptr<node_impl> Node) {
  // Gather the sync points of the predecessors that belong to the same
  // partition as Node (empty nodes are looked through by findRealDeps).
  std::vector<ur_exp_command_buffer_sync_point_t> Deps;
  for (auto &N : Node->MPredecessors) {
    findRealDeps(Deps, N.lock(), MPartitionNodes[Node]);
  }
  ur_exp_command_buffer_sync_point_t NewSyncPoint;
  ur_exp_command_buffer_command_handle_t NewCommand = 0;

#ifdef XPTI_ENABLE_INSTRUMENTATION
  // Emit a task-begin trace event built from the code location and kernel
  // information stored in the command-group.
  int32_t StreamID = xptiRegisterStream(sycl::detail::SYCL_STREAM_NAME);
  sycl::detail::CGExecKernel *CGExec =
      static_cast<sycl::detail::CGExecKernel *>(Node->MCommandGroup.get());
  sycl::detail::code_location CodeLoc(CGExec->MFileName.c_str(),
                                      CGExec->MFunctionName.c_str(),
                                      CGExec->MLine, CGExec->MColumn);
  auto [CmdTraceEvent, InstanceID] = emitKernelInstrumentationData(
      StreamID, CGExec->MSyclKernel, CodeLoc, CGExec->MIsTopCodeLoc,
      CGExec->MKernelName.c_str(), nullptr, CGExec->MNDRDesc,
      CGExec->MKernelBundle, CGExec->MArgs);
  if (CmdTraceEvent)
    sycl::detail::emitInstrumentationGeneral(
        StreamID, InstanceID, CmdTraceEvent, xpti::trace_task_begin, nullptr);
#endif

  // A command handle is only requested when the graph is updatable.
  ur_result_t Res = sycl::detail::enqueueImpCommandBufferKernel(
      Ctx, DeviceImpl, CommandBuffer,
      *static_cast<sycl::detail::CGExecKernel *>((Node->MCommandGroup.get())),
      Deps, &NewSyncPoint, MIsUpdatable ? &NewCommand : nullptr, nullptr);

  // Note: the command handle is recorded before the result is checked.
  if (MIsUpdatable) {
    MCommandMap[Node] = NewCommand;
  }

  if (Res != UR_RESULT_SUCCESS) {
    throw sycl::exception(errc::invalid,
                          "Failed to add kernel to UR command-buffer");
  }

#ifdef XPTI_ENABLE_INSTRUMENTATION
  // Matching task-end trace event; not emitted when the enqueue failed.
  if (CmdTraceEvent)
    sycl::detail::emitInstrumentationGeneral(
        StreamID, InstanceID, CmdTraceEvent, xpti::trace_task_end, nullptr);
#endif

  return NewSyncPoint;
}

/// Appends a node to a UR command-buffer by submitting a copy of its
/// command-group through the scheduler.
/// @param Ctx Context used to create the internal queue.
/// @param DeviceImpl Device used to create the internal queue.
/// @param CommandBuffer Command-buffer the node is appended to.
/// @param Node Node to enqueue.
/// @return Sync point of the event returned by the scheduler.
ur_exp_command_buffer_sync_point_t exec_graph_impl::enqueueNode(
    sycl::context Ctx, std::shared_ptr<sycl::detail::device_impl> DeviceImpl,
    ur_exp_command_buffer_handle_t CommandBuffer,
    std::shared_ptr<node_impl> Node) {

  // Queue used for the allocation operations of accessors. It is also the
  // queue returned to the user by `interop_handler::get_native_queue()` in
  // native commands.
  auto AllocaQueue = std::make_shared<sycl::detail::queue_impl>(
      DeviceImpl, sycl::detail::getSyclObjImpl(Ctx), sycl::async_handler{},
      sycl::property_list{});

  // Sync points of the predecessors in the same partition as Node.
  std::vector<ur_exp_command_buffer_sync_point_t> SyncDeps;
  for (auto &PredWP : Node->MPredecessors)
    findRealDeps(SyncDeps, PredWP.lock(), MPartitionNodes[Node]);

  sycl::detail::EventImplPtr SubmitEvent =
      sycl::detail::Scheduler::getInstance().addCG(
          Node->getCGCopy(), AllocaQueue, /*EventNeeded=*/true, CommandBuffer,
          SyncDeps);

  // Remember the command handle when the graph is updatable.
  if (MIsUpdatable)
    MCommandMap[Node] = SubmitEvent->getCommandBufferCommand();

  return SubmitEvent->getSyncPoint();
}
/// Creates a UR command-buffer for \p Partition on \p Device, records every
/// non-empty node of the partition schedule into it and finalizes it.
/// The new handle is stored in Partition->MCommandBuffers[Device]. The
/// requirements and accessor storage of each recorded node are appended to
/// MRequirements and MAccessors.
/// @param Device Device the command-buffer is created for.
/// @param Partition Partition whose schedule is recorded.
/// @throws sycl::exception (errc::invalid) if the command-buffer cannot be
/// created or finalized.
void exec_graph_impl::createCommandBuffers(
    sycl::device Device, std::shared_ptr<partition> &Partition) {
  auto ContextImpl = sycl::detail::getSyclObjImpl(MContext);
  const sycl::detail::AdapterPtr &Adapter = ContextImpl->getAdapter();
  auto DeviceImpl = sycl::detail::getSyclObjImpl(Device);

  ur_exp_command_buffer_desc_t Desc{
      UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, nullptr, MIsUpdatable,
      Partition->MIsInOrderGraph && !MEnableProfiling, MEnableProfiling};

  ur_exp_command_buffer_handle_t OutCommandBuffer = nullptr;
  const ur_result_t CreateRes =
      Adapter->call_nocheck<sycl::detail::UrApiKind::urCommandBufferCreateExp>(
          ContextImpl->getHandleRef(), DeviceImpl->getHandleRef(), &Desc,
          &OutCommandBuffer);
  if (CreateRes != UR_RESULT_SUCCESS) {
    throw sycl::exception(errc::invalid, "Failed to create UR command-buffer");
  }

  Partition->MCommandBuffers[Device] = OutCommandBuffer;

  for (const auto &Node : Partition->MSchedule) {
    // Empty nodes are skipped here; only their dependencies are propagated,
    // which happens in findRealDeps.
    if (Node->isEmpty())
      continue;

    const auto &NodeCG = Node->MCommandGroup;

    // A kernel without accessor requirements and without streams can be
    // recorded directly; everything else is recorded via enqueueNode.
    const bool CanEnqueueDirectly =
        Node->MCGType == sycl::detail::CGType::Kernel &&
        NodeCG->getRequirements().empty() &&
        static_cast<sycl::detail::CGExecKernel *>(NodeCG.get())
            ->MStreams.empty();

    MSyncPoints[Node] =
        CanEnqueueDirectly
            ? enqueueNodeDirect(MContext, DeviceImpl, OutCommandBuffer, Node)
            : enqueueNode(MContext, DeviceImpl, OutCommandBuffer, Node);

    // Accumulate the node requirements into the graph-wide list.
    const auto &NodeRequirements = NodeCG->getRequirements();
    MRequirements.insert(MRequirements.end(), NodeRequirements.begin(),
                         NodeRequirements.end());

    // Keep the accessors themselves so they stay alive while commands are
    // submitted.
    const auto &NodeAccStorage = NodeCG->getAccStorage();
    MAccessors.insert(MAccessors.end(), NodeAccStorage.begin(),
                      NodeAccStorage.end());
  }

  const ur_result_t FinalizeRes =
      Adapter
          ->call_nocheck<sycl::detail::UrApiKind::urCommandBufferFinalizeExp>(
              OutCommandBuffer);
  if (FinalizeRes != UR_RESULT_SUCCESS) {
    throw sycl::exception(errc::invalid,
                          "Failed to finalize UR command-buffer");
  }
}

/// Builds an executable graph from the modifiable graph \p GraphImpl.
/// @param Context Context stored in MContext.
/// @param GraphImpl Modifiable graph whose nodes are duplicated.
/// @param PropList Properties; `updatable` and `enable_profiling` are read
/// here.
/// @throws sycl::exception (errc::feature_not_supported) if `updatable` is
/// requested and the graph device lacks aspect::ext_oneapi_graph.
exec_graph_impl::exec_graph_impl(sycl::context Context,
                                 const std::shared_ptr<graph_impl> &GraphImpl,
                                 const property_list &PropList)
    : MSchedule(), MGraphImpl(GraphImpl), MSyncPoints(),
      MDevice(GraphImpl->getDevice()), MContext(Context), MRequirements(),
      MExecutionEvents(),
      MIsUpdatable(PropList.has_property<property::graph::updatable>()),
      MEnableProfiling(
          PropList.has_property<property::graph::enable_profiling>()),
      MID(NextAvailableID.fetch_add(1, std::memory_order_relaxed)) {
  checkGraphPropertiesAndThrow(PropList);

  // An updatable graph needs backend support for update. Devices that report
  // aspect::ext_oneapi_graph must support graph update, so that aspect is the
  // check used here.
  const bool UpdateUnsupported =
      MIsUpdatable && !MGraphImpl->getDevice().has(aspect::ext_oneapi_graph);
  if (UpdateUnsupported) {
    throw sycl::exception(sycl::make_error_code(errc::feature_not_supported),
                          "Device does not support Command Graph update");
  }

  // Copy the nodes of GraphImpl, merging subgraph nodes into this graph.
  duplicateNodes();
}

/// Waits for all tracked executions and releases every command-buffer owned
/// by the partitions. Exceptions derived from std::exception are reported
/// through __SYCL_REPORT_EXCEPTION_TO_STREAM instead of escaping.
exec_graph_impl::~exec_graph_impl() {
  try {
    const sycl::detail::AdapterPtr &Adapter =
        sycl::detail::getSyclObjImpl(MContext)->getAdapter();
    MSchedule.clear();

    // Command-buffers may only be released once every execution that was
    // submitted has finished.
    for (auto &ExecutionEvent : MExecutionEvents)
      ExecutionEvent->wait(ExecutionEvent);

    for (const auto &Partition : MPartitions) {
      Partition->MSchedule.clear();
      for (const auto &DeviceAndBuffer : Partition->MCommandBuffers) {
        auto CmdBuf = DeviceAndBuffer.second;
        // Nothing to release for a null handle.
        if (!CmdBuf)
          continue;

        ur_result_t Res = Adapter->call_nocheck<
            sycl::detail::UrApiKind::urCommandBufferReleaseExp>(CmdBuf);
        (void)Res;
        assert(Res == UR_RESULT_SUCCESS);
      }
    }
  } catch (std::exception &e) {
    __SYCL_REPORT_EXCEPTION_TO_STREAM("exception in ~exec_graph_impl", e);
  }
}

/// Submits every partition of this executable graph for execution on
/// \p Queue, in the order the partitions are stored in MPartitions.
///
/// For each partition one of three paths is taken:
///  - the partition has a command-buffer for the queue device: the
///    command-buffer is enqueued, directly if there are no requirements or
///    dependent events, through the scheduler otherwise;
///  - the partition has no command-buffer and its first scheduled node is a
///    host task: the host task is submitted through the scheduler;
///  - otherwise every node of the partition is submitted individually.
///
/// The function holds a write lock on MMutex for its whole duration.
///
/// @param Queue Queue the graph is submitted to.
/// @param CGData Requirements, accessor storage and events coming from the
/// handler; graph requirements and dependencies are merged into it.
/// @return Event of the last processed partition, with the events of all
/// other partitions attached to it.
/// @throws sycl::exception if enqueuing a command-buffer fails.
sycl::event
exec_graph_impl::enqueue(const std::shared_ptr<sycl::detail::queue_impl> &Queue,
                         sycl::detail::CG::StorageInitHelper CGData) {
  WriteLock Lock(MMutex);

  // Map of the partitions to their execution events
  std::unordered_map<std::shared_ptr<partition>, sycl::detail::EventImplPtr>
      PartitionsExecutionEvents;

  // Creates an incomplete event bound to Queue and its context.
  auto CreateNewEvent([&]() {
    auto NewEvent = std::make_shared<sycl::detail::event_impl>(Queue);
    NewEvent->setContextImpl(Queue->getContextImplPtr());
    NewEvent->setStateIncomplete();
    return NewEvent;
  });

  sycl::detail::EventImplPtr NewEvent;
  // The handler events are saved so that each partition starts from the same
  // initial dependency list.
  std::vector<sycl::detail::EventImplPtr> BackupCGDataMEvents;
  if (MPartitions.size() > 1) {
    BackupCGDataMEvents = CGData.MEvents;
  }
  for (uint32_t currentPartitionsNum = 0;
       currentPartitionsNum < MPartitions.size(); currentPartitionsNum++) {
    auto CurrentPartition = MPartitions[currentPartitionsNum];
    // Restore the initial MEvents so that only the needed additional
    // dependencies are added.
    if (currentPartitionsNum > 0) {
      CGData.MEvents = BackupCGDataMEvents;
    }

    for (auto const &DepPartition : CurrentPartition->MPredecessors) {
      CGData.MEvents.push_back(PartitionsExecutionEvents[DepPartition]);
    }

    auto CommandBuffer = CurrentPartition->MCommandBuffers[Queue->get_device()];

    if (CommandBuffer) {
      // If previous submissions are incomplete, we automatically
      // add completion events of previous submissions as dependencies.
      // With Level-Zero backend we cannot resubmit a command-buffer until the
      // previous one has already completed.
      // Indeed, since a command-list does not accept a list of dependencies at
      // submission, we circumvent this lack by adding a barrier that waits on a
      // specific event and then define the conditions to signal this event in
      // another command-list. Consequently, if a second submission is
      // performed, the signal conditions of this single event are redefined by
      // this second submission. Thus, this can lead to an undefined behaviour
      // and potential hangs. We have therefore to explicitly wait in the host
      // for previous submission to complete before resubmitting the
      // command-buffer for level-zero backend.
      // TODO : add a check to release this constraint and allow multiple
      // concurrent submissions if the exec_graph has been updated since the
      // last submission.
      for (std::vector<sycl::detail::EventImplPtr>::iterator It =
               MExecutionEvents.begin();
           It != MExecutionEvents.end();) {
        auto Event = *It;
        if (!Event->isCompleted()) {
          if (Queue->get_device().get_backend() ==
              sycl::backend::ext_oneapi_level_zero) {
            Event->wait(Event);
          } else {
            auto &AttachedEventsList = Event->getPostCompleteEvents();
            CGData.MEvents.reserve(AttachedEventsList.size() + 1);
            CGData.MEvents.push_back(Event);
            // Add events of the previous execution of all graph partitions.
            for (auto &AttachedEvent : AttachedEventsList) {
              CGData.MEvents.push_back(AttachedEvent);
            }
          }
          ++It;
        } else {
          // Remove completed events
          It = MExecutionEvents.erase(It);
        }
      }

      NewEvent = CreateNewEvent();
      ur_event_handle_t UREvent = nullptr;
      // Merge requirements from the nodes into requirements (if any) from the
      // handler.
      CGData.MRequirements.insert(CGData.MRequirements.end(),
                                  MRequirements.begin(), MRequirements.end());
      CGData.MAccStorage.insert(CGData.MAccStorage.end(), MAccessors.begin(),
                                MAccessors.end());

      // If we have no requirements or dependent events for the command buffer,
      // enqueue it directly
      if (CGData.MRequirements.empty() && CGData.MEvents.empty()) {
        NewEvent->setSubmissionTime();
        NewEvent->setHostEnqueueTime();
        ur_result_t Res =
            Queue->getAdapter()
                ->call_nocheck<
                    sycl::detail::UrApiKind::urEnqueueCommandBufferExp>(
                    Queue->getHandleRef(), CommandBuffer, 0, nullptr, &UREvent);
        NewEvent->setHandle(UREvent);
        if (Res == UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES) {
          throw sycl::exception(
              make_error_code(errc::invalid),
              "Graphs cannot be submitted to a queue which uses "
              "immediate command lists. Use "
              "sycl::ext::intel::property::queue::no_immediate_"
              "command_list to disable them.");
        } else if (Res != UR_RESULT_SUCCESS) {
          throw sycl::exception(
              errc::event,
              "Failed to enqueue event for command buffer submission");
        }
      } else {
        std::unique_ptr<sycl::detail::CG> CommandGroup =
            std::make_unique<sycl::detail::CGExecCommandBuffer>(
                CommandBuffer, nullptr, std::move(CGData));

        NewEvent = sycl::detail::Scheduler::getInstance().addCG(
            std::move(CommandGroup), Queue, /*EventNeeded=*/true);
      }
      NewEvent->setEventFromSubmittedExecCommandBuffer(true);
    } else if ((CurrentPartition->MSchedule.size() > 0) &&
               (CurrentPartition->MSchedule.front()->MCGType ==
                sycl::detail::CGType::CodeplayHostTask)) {

      auto NodeImpl = CurrentPartition->MSchedule.front();
      // Schedule host task
      NodeImpl->MCommandGroup->getEvents().insert(
          NodeImpl->MCommandGroup->getEvents().end(), CGData.MEvents.begin(),
          CGData.MEvents.end());
      // HostTask CG stores the Queue on which the task was submitted.
      // In case of graph, this queue may differ from the actual execution
      // queue. We therefore overload this Queue before submitting the task.
      static_cast<sycl::detail::CGHostTask &>(*NodeImpl->MCommandGroup.get())
          .MQueue = Queue;

      NewEvent = sycl::detail::Scheduler::getInstance().addCG(
          NodeImpl->getCGCopy(), Queue, /*EventNeeded=*/true);
    } else {
      std::vector<std::shared_ptr<sycl::detail::event_impl>> ScheduledEvents;
      for (auto &NodeImpl : CurrentPartition->MSchedule) {
        std::vector<ur_event_handle_t> RawEvents;

        // If the node has no requirements for accessors etc. then we skip the
        // scheduler and enqueue directly.
        if (NodeImpl->MCGType == sycl::detail::CGType::Kernel &&
            NodeImpl->MCommandGroup->getRequirements().size() +
                    static_cast<sycl::detail::CGExecKernel *>(
                        NodeImpl->MCommandGroup.get())
                        ->MStreams.size() ==
                0) {
          sycl::detail::CGExecKernel *CG =
              static_cast<sycl::detail::CGExecKernel *>(
                  NodeImpl->MCommandGroup.get());
          auto OutEvent = CreateNewEvent();
          sycl::detail::enqueueImpKernel(
              Queue, CG->MNDRDesc, CG->MArgs, CG->MKernelBundle,
              CG->MSyclKernel, CG->MKernelName, RawEvents, OutEvent,
              // TODO: Pass accessor mem allocations
              nullptr,
              // TODO: Extract from handler
              UR_KERNEL_CACHE_CONFIG_DEFAULT, CG->MKernelIsCooperative,
              CG->MKernelUsesClusterLaunch, CG->MKernelWorkGroupMemorySize);
          // Track the event of the kernel that was just enqueued. NewEvent
          // must not be used here: at this point it is either null or the
          // event of a previously processed partition.
          ScheduledEvents.push_back(OutEvent);
        } else if (!NodeImpl->isEmpty()) {
          // Empty nodes are not processed as other nodes, but only their
          // dependencies are propagated in findRealDeps
          sycl::detail::EventImplPtr EventImpl =
              sycl::detail::Scheduler::getInstance().addCG(
                  NodeImpl->getCGCopy(), Queue, /*EventNeeded=*/true);

          ScheduledEvents.push_back(EventImpl);
        }
      }
      // Create an event which has all kernel events as dependencies
      NewEvent = std::make_shared<sycl::detail::event_impl>(Queue);
      NewEvent->setStateIncomplete();
      NewEvent->getPreparedDepsEvents() = ScheduledEvents;
    }
    PartitionsExecutionEvents[CurrentPartition] = NewEvent;
  }

  // Keep track of this execution event so we can make sure it's completed in
  // the destructor.
  MExecutionEvents.push_back(NewEvent);
  // Attach events of previous partitions to ensure that when the returned event
  // is complete all execution associated with the graph have been completed.
  for (auto const &Elem : PartitionsExecutionEvents) {
    if (Elem.second != NewEvent) {
      NewEvent->attachEventToComplete(Elem.second);
    }
  }
  NewEvent->setProfilingEnabled(MEnableProfiling);
  sycl::event QueueEvent =
      sycl::detail::createSyclObjFromImpl<sycl::event>(NewEvent);
  return QueueEvent;
}

/// Copies every node stored in MGraphImpl->MNodeStorage into this executable
/// graph, rebuilds the edges between the copies, and replaces each copied
/// subgraph node with copies of the nodes held by its MSubGraphImpl.
/// Every copy is registered in MIDCache under the ID of the node it was
/// copied from. The resulting node list is inserted into MNodeStorage.
void exec_graph_impl::duplicateNodes() {
  // Map of original modifiable nodes (keys) to new duplicated nodes (values)
  std::map<std::shared_ptr<node_impl>, std::shared_ptr<node_impl>> NodesMap;

  const std::vector<std::shared_ptr<node_impl>> &ModifiableNodes =
      MGraphImpl->MNodeStorage;
  std::deque<std::shared_ptr<node_impl>> NewNodes;

  for (size_t i = 0; i < ModifiableNodes.size(); i++) {
    auto OriginalNode = ModifiableNodes[i];
    std::shared_ptr<node_impl> NodeCopy =
        std::make_shared<node_impl>(*OriginalNode);

    // Associate the ID of the original node with the node copy for later quick
    // access
    MIDCache.insert(std::make_pair(OriginalNode->MID, NodeCopy));

    // Clear edges between nodes so that we can replace with new ones
    NodeCopy->MSuccessors.clear();
    NodeCopy->MPredecessors.clear();
    // Append the copy so that NewNodes[i] is the copy of ModifiableNodes[i];
    // the edge-rebuilding loop below relies on this index correspondence.
    NewNodes.push_back(NodeCopy);
    // Associate the new node with the old one for updating edges
    NodesMap.insert({OriginalNode, NodeCopy});
  }

  // Now that all nodes have been copied rebuild edges on new nodes. This must
  // be done as a separate step since successors may be out of order.
  for (size_t i = 0; i < ModifiableNodes.size(); i++) {
    auto OriginalNode = ModifiableNodes[i];
    auto NodeCopy = NewNodes[i];
    // Look through all the original node successors, find their copies and
    // register those as successors with the current copied node
    for (auto &NextNode : OriginalNode->MSuccessors) {
      auto Successor = NodesMap.at(NextNode.lock());
      NodeCopy->registerSuccessor(Successor);
    }
  }

  // Subgraph nodes need special handling, we extract all subgraph nodes and
  // merge them into the main node list

  // NewNodes is walked back to front. Inside the loop body the deque is
  // modified (one erase, one range insert), so the reverse iterator is
  // re-created from the insert result at the end of each such iteration.
  for (auto NewNodeIt = NewNodes.rbegin(); NewNodeIt != NewNodes.rend();
       ++NewNodeIt) {
    auto NewNode = *NewNodeIt;
    if (NewNode->MNodeType != node_type::subgraph) {
      continue;
    }
    const std::vector<std::shared_ptr<node_impl>> &SubgraphNodes =
        NewNode->MSubGraphImpl->MNodeStorage;
    std::deque<std::shared_ptr<node_impl>> NewSubgraphNodes{};

    // Map of original subgraph nodes (keys) to new duplicated nodes (values)
    std::map<std::shared_ptr<node_impl>, std::shared_ptr<node_impl>>
        SubgraphNodesMap;

    // Copy subgraph nodes
    for (size_t i = 0; i < SubgraphNodes.size(); i++) {
      auto SubgraphNode = SubgraphNodes[i];
      auto NodeCopy = std::make_shared<node_impl>(*SubgraphNode);
      // Associate the ID of the original subgraph node with all extracted node
      // copies for future quick access.
      MIDCache.insert(std::make_pair(SubgraphNode->MID, NodeCopy));

      NewSubgraphNodes.push_back(NodeCopy);
      SubgraphNodesMap.insert({SubgraphNode, NodeCopy});
      NodeCopy->MSuccessors.clear();
      NodeCopy->MPredecessors.clear();
    }

    // Rebuild edges for new subgraph nodes
    for (size_t i = 0; i < SubgraphNodes.size(); i++) {
      auto SubgraphNode = SubgraphNodes[i];
      auto NodeCopy = NewSubgraphNodes[i];

      for (auto &NextNode : SubgraphNode->MSuccessors) {
        auto Successor = SubgraphNodesMap.at(NextNode.lock());
        NodeCopy->registerSuccessor(Successor);
      }
    }

    // Collect input and output nodes for the subgraph: inputs are the copies
    // without predecessors, outputs are the copies without successors.
    std::vector<std::shared_ptr<node_impl>> Inputs;
    std::vector<std::shared_ptr<node_impl>> Outputs;
    for (auto &NodeImpl : NewSubgraphNodes) {
      if (NodeImpl->MPredecessors.size() == 0) {
        Inputs.push_back(NodeImpl);
      }
      if (NodeImpl->MSuccessors.size() == 0) {
        Outputs.push_back(NodeImpl);
      }
    }

    // Update the predecessors and successors of the nodes which reference the
    // original subgraph node

    // Predecessors
    for (auto &PredNodeWeak : NewNode->MPredecessors) {
      auto PredNode = PredNodeWeak.lock();
      auto &Successors = PredNode->MSuccessors;

      // Remove the subgraph node from this node's successors
      Successors.erase(std::remove_if(Successors.begin(), Successors.end(),
                                      [NewNode](auto WeakNode) {
                                        return WeakNode.lock() == NewNode;
                                      }),
                       Successors.end());

      // Add all input nodes from the subgraph as successors for this node
      // instead
      for (auto &Input : Inputs) {
        PredNode->registerSuccessor(Input);
      }
    }

    // Successors
    for (auto &SuccNodeWeak : NewNode->MSuccessors) {
      auto SuccNode = SuccNodeWeak.lock();
      auto &Predecessors = SuccNode->MPredecessors;

      // Remove the subgraph node from this node's predecessors
      Predecessors.erase(std::remove_if(Predecessors.begin(),
                                        Predecessors.end(),
                                        [NewNode](auto WeakNode) {
                                          return WeakNode.lock() == NewNode;
                                        }),
                         Predecessors.end());

      // Add all Output nodes from the subgraph as predecessors for this node
      // instead
      for (auto &Output : Outputs) {
        Output->registerSuccessor(SuccNode);
      }
    }

    // Remove single subgraph node and add all new individual subgraph nodes
    // to the node storage in its place
    auto OldPositionIt =
        NewNodes.erase(std::find(NewNodes.begin(), NewNodes.end(), NewNode));
    // Also set the iterator to the newly added nodes so we can continue
    // iterating over all remaining nodes
    auto InsertIt = NewNodes.insert(OldPositionIt, NewSubgraphNodes.begin(),
                                    NewSubgraphNodes.end());
    // Since the new reverse_iterator will be at i - 1 we need to advance it
    // when constructing
    NewNodeIt = std::make_reverse_iterator(std::next(InsertIt));
  }

  // Store all the new nodes locally
  MNodeStorage.insert(MNodeStorage.begin(), NewNodes.begin(), NewNodes.end());
}

/// Updates this executable graph from the nodes of the modifiable graph
/// \p GraphImpl ("whole graph update").
///
/// Before updating, the two graphs are checked node by node (same index in
/// their MNodeStorage) for matching device, context, node count, edge counts,
/// command-group type and, for kernel nodes, kernel name. Each node ID of
/// \p GraphImpl is then registered in MIDCache against the node of this graph
/// at the same index, and the node-list overload of update() is invoked.
///
/// @param GraphImpl Graph providing the new node values.
/// @throws sycl::exception (errc::invalid) if any of the checks fails.
void exec_graph_impl::update(std::shared_ptr<graph_impl> GraphImpl) {

  if (MDevice != GraphImpl->getDevice()) {
    throw sycl::exception(
        sycl::make_error_code(errc::invalid),
        "Cannot update using a graph created with a different device.");
  }
  if (MContext != GraphImpl->getContext()) {
    throw sycl::exception(
        sycl::make_error_code(errc::invalid),
        "Cannot update using a graph created with a different context.");
  }

  if (MNodeStorage.size() != GraphImpl->MNodeStorage.size()) {
    throw sycl::exception(sycl::make_error_code(errc::invalid),
                          "Cannot update using a graph with a different "
                          "topology. Mismatch found in the number of nodes.");
  }

  for (uint32_t i = 0; i < MNodeStorage.size(); ++i) {
    if (MNodeStorage[i]->MSuccessors.size() !=
            GraphImpl->MNodeStorage[i]->MSuccessors.size() ||
        MNodeStorage[i]->MPredecessors.size() !=
            GraphImpl->MNodeStorage[i]->MPredecessors.size()) {
      throw sycl::exception(
          sycl::make_error_code(errc::invalid),
          "Cannot update using a graph with a different topology. Mismatch "
          "found in the number of edges.");
    }
    if (MNodeStorage[i]->MCGType != GraphImpl->MNodeStorage[i]->MCGType) {
      throw sycl::exception(
          sycl::make_error_code(errc::invalid),
          "Cannot update using a graph with mismatched node types. Each pair "
          "of nodes being updated must have the same type");
    }

    if (MNodeStorage[i]->MCGType == sycl::detail::CGType::Kernel) {
      sycl::detail::CGExecKernel *TargetCGExec =
          static_cast<sycl::detail::CGExecKernel *>(
              MNodeStorage[i]->MCommandGroup.get());
      const std::string &TargetKernelName = TargetCGExec->getKernelName();

      sycl::detail::CGExecKernel *SourceCGExec =
          static_cast<sycl::detail::CGExecKernel *>(
              GraphImpl->MNodeStorage[i]->MCommandGroup.get());
      const std::string &SourceKernelName = SourceCGExec->getKernelName();

      if (TargetKernelName.compare(SourceKernelName) != 0) {
        // Build the message by appending to a std::string. A
        // std::stringstream constructed from an initial string starts
        // writing at position 0, so subsequent insertions would overwrite
        // the message prefix instead of extending it.
        std::string ErrorMessage =
            "Cannot update using a graph with mismatched kernel "
            "types. Source node type ";
        ErrorMessage += SourceKernelName;
        ErrorMessage += ", target node type ";
        ErrorMessage += TargetKernelName;
        throw sycl::exception(sycl::make_error_code(errc::invalid),
                              ErrorMessage);
      }
    }
  }

  // Make the nodes of GraphImpl resolvable through the ID cache so the
  // node-list update below can find their counterparts in this graph.
  for (uint32_t i = 0; i < MNodeStorage.size(); ++i) {
    MIDCache.insert(
        std::make_pair(GraphImpl->MNodeStorage[i]->MID, MNodeStorage[i]));
  }

  update(GraphImpl->MNodeStorage);
}

/// Updates a single node by forwarding a one-element list to the node-list
/// overload of update().
void exec_graph_impl::update(std::shared_ptr<node_impl> Node) {
  const std::vector<std::shared_ptr<node_impl>> SingleNodeList{Node};
  update(SingleNodeList);
}

void exec_graph_impl::update(
    const std::vector<std::shared_ptr<node_impl>> &Nodes) {
  if (!MIsUpdatable) {
    throw sycl::exception(sycl::make_error_code(errc::invalid),
                          "update() cannot be called on a executable graph "
                          "which was not created with property::updatable");
  }

  // If the graph contains host tasks we need special handling here because
  // their state lives in the graph object itself, so we must do the update
  // immediately here. Whereas all other command state lives in the backend so
  // it can be scheduled along with other commands.
  if (MContainsHostTask) {
    updateHostTasksImpl(Nodes);
  }

  // If there are any accessor requirements, we have to update through the
  // scheduler to ensure that any allocations have taken place before trying
  // to update.
  std::vector<sycl::detail::AccessorImplHost *> UpdateRequirements;
  bool NeedScheduledUpdate = needsScheduledUpdate(Nodes, UpdateRequirements);
  if (NeedScheduledUpdate) {
    // Clean up any execution events which have finished so we don't pass them
    // to the scheduler.
    for (auto It = MExecutionEvents.begin(); It != MExecutionEvents.end();) {
      if ((*It)->isCompleted()) {
        It = MExecutionEvents.erase(It);
        continue;
      }
      ++It;
    }

    auto AllocaQueue = std::make_shared<sycl::detail::queue_impl>(
        sycl::detail::getSyclObjImpl(MGraphImpl->getDevice()),
        sycl::detail::getSyclObjImpl(MGraphImpl->getContext()),
        sycl::async_handler{}, sycl::property_list{});

    // Track the event for the update command since execution may be blocked by
    // other scheduler commands
    auto UpdateEvent =
        sycl::detail::Scheduler::getInstance().addCommandGraphUpdate(
            this, Nodes, AllocaQueue, UpdateRequirements, MExecutionEvents);

    MExecutionEvents.push_back(UpdateEvent);

    if (MContainsHostTask) {
      // If the graph has HostTasks, the update has to be blocking. This is
      // needed because HostTask nodes (and all the nodes that depend on
      // HostTasks), are scheduled using a separate thread. This wait call
      // acts as a synchronization point for that thread.
      UpdateEvent->wait(UpdateEvent);
    }
  } else {
    // For each partition in the executable graph, call UR update on the
    // command-buffer with the nodes to update.
    auto PartitionedNodes = getURUpdatableNodes(Nodes);
    for (auto &[PartitionIndex, NodeImpl] : PartitionedNodes) {
      auto &Partition = MPartitions[PartitionIndex];
      auto CommandBuffer = Partition->MCommandBuffers[MDevice];
      updateURImpl(CommandBuffer, NodeImpl);
    }
  }

  // Rebuild cached requirements and accessor storage for this graph with
  // updated nodes
  MRequirements.clear();
  MAccessors.clear();
  for (auto &Node : MNodeStorage) {
    if (!Node->MCommandGroup)
      continue;
    MRequirements.insert(MRequirements.end(),
                         Node->MCommandGroup->getRequirements().begin(),
                         Node->MCommandGroup->getRequirements().end());
    MAccessors.insert(MAccessors.end(),
                      Node->MCommandGroup->getAccStorage().begin(),
                      Node->MCommandGroup->getAccStorage().end());
  }
}

/// Validates \p Nodes and decides whether the update has to go through the
/// scheduler.
/// @param[in] Nodes Nodes passed to update().
/// @param[out] UpdateRequirements Receives the accessor requirements of all
/// nodes in \p Nodes that have any.
/// @return true if any node has accessor requirements or if there are
/// tracked execution events; false otherwise.
/// @throws sycl::exception (errc::invalid) if a node is not known to this
/// graph or is not updatable.
bool exec_graph_impl::needsScheduledUpdate(
    const std::vector<std::shared_ptr<node_impl>> &Nodes,
    std::vector<sycl::detail::AccessorImplHost *> &UpdateRequirements) {
  // Accessor requirements mean the update must go through the scheduler so
  // that allocations have taken place before updating.
  bool HasAccessorRequirements = false;

  // Upper bound: the update cannot need more requirements than the whole
  // graph has.
  UpdateRequirements.reserve(MRequirements.size());

  for (auto &Node : Nodes) {
    // The node must have a counterpart in this graph.
    if (MIDCache.find(Node->getID()) == MIDCache.end()) {
      throw sycl::exception(
          sycl::make_error_code(errc::invalid),
          "Node passed to update() is not part of the graph.");
    }

    if (!Node->isUpdatable()) {
      throw sycl::exception(
          errc::invalid,
          "Unsupported node type for update. Only kernel, host_task, "
          "barrier and empty nodes are supported.");
    }

    const auto &CG = Node->MCommandGroup;
    if (!CG)
      continue;

    const auto &NodeRequirements = CG->getRequirements();
    if (NodeRequirements.empty())
      continue;

    HasAccessorRequirements = true;
    UpdateRequirements.insert(UpdateRequirements.end(),
                              NodeRequirements.begin(),
                              NodeRequirements.end());
  }

  // Pending execution events also force a scheduled update so that it is
  // ordered correctly with respect to them.
  return HasAccessorRequirements || !MExecutionEvents.empty();
}

/// Fills the UR structures describing a kernel-launch update for \p Node.
///
/// All pointers written into \p UpdateDesc refer to storage owned by the
/// caller-provided out-parameters (\p MemobjDescs, \p MemobjProps,
/// \p PtrDescs, \p ValueDescs, \p NDRDesc), so they stay valid for as long as
/// the caller keeps those objects alive.
///
/// @param[in] Node Node holding the new kernel arguments and ND-range.
/// @param[out] BundleObjs Receives the program/kernel pair when the kernel is
/// obtained through ProgramManager::getOrCreateKernel; untouched otherwise.
/// @param[out] MemobjDescs Update descriptors for accessor arguments.
/// @param[out] MemobjProps Memory-object properties referenced by
/// \p MemobjDescs.
/// @param[out] PtrDescs Update descriptors for pointer arguments.
/// @param[out] ValueDescs Update descriptors for standard-layout arguments.
/// @param[out] NDRDesc Copy of the node's ND-range with dimensions reversed;
/// also used as storage for a kernel-enforced work-group size.
/// @param[out] UpdateDesc Kernel-launch update descriptor to fill.
void exec_graph_impl::populateURKernelUpdateStructs(
    const std::shared_ptr<node_impl> &Node,
    std::pair<ur_program_handle_t, ur_kernel_handle_t> &BundleObjs,
    std::vector<ur_exp_command_buffer_update_memobj_arg_desc_t> &MemobjDescs,
    std::vector<ur_kernel_arg_mem_obj_properties_t> &MemobjProps,
    std::vector<ur_exp_command_buffer_update_pointer_arg_desc_t> &PtrDescs,
    std::vector<ur_exp_command_buffer_update_value_arg_desc_t> &ValueDescs,
    sycl::detail::NDRDescT &NDRDesc,
    ur_exp_command_buffer_update_kernel_launch_desc_t &UpdateDesc) const {
  auto ContextImpl = sycl::detail::getSyclObjImpl(MContext);
  const sycl::detail::AdapterPtr &Adapter = ContextImpl->getAdapter();
  auto DeviceImpl = sycl::detail::getSyclObjImpl(MGraphImpl->getDevice());

  // Gather arg information from Node
  auto &ExecCG =
      *(static_cast<sycl::detail::CGExecKernel *>(Node->MCommandGroup.get()));
  // Copy args because we may modify them
  std::vector<sycl::detail::ArgDesc> NodeArgs = ExecCG.getArguments();
  // Copy NDR desc since we need to modify it
  NDRDesc = ExecCG.MNDRDesc;

  ur_kernel_handle_t UrKernel = nullptr;
  auto Kernel = ExecCG.MSyclKernel;
  auto KernelBundleImplPtr = ExecCG.MKernelBundle;
  std::shared_ptr<sycl::detail::kernel_impl> SyclKernelImpl = nullptr;
  const sycl::detail::KernelArgMask *EliminatedArgMask = nullptr;

  // Use kernel_bundle if available unless it is interop.
  // Interop bundles can't be used in the first branch, because the kernels
  // in interop kernel bundles (if any) do not have kernel_id
  // and can therefore not be looked up, but since they are self-contained
  // they can simply be launched directly.
  if (KernelBundleImplPtr && !KernelBundleImplPtr->isInterop()) {
    auto KernelName = ExecCG.MKernelName;
    kernel_id KernelID =
        sycl::detail::ProgramManager::getInstance().getSYCLKernelID(KernelName);
    kernel SyclKernel =
        KernelBundleImplPtr->get_kernel(KernelID, KernelBundleImplPtr);
    SyclKernelImpl = sycl::detail::getSyclObjImpl(SyclKernel);
    UrKernel = SyclKernelImpl->getHandleRef();
    EliminatedArgMask = SyclKernelImpl->getKernelArgMask();
  } else if (Kernel != nullptr) {
    UrKernel = Kernel->getHandleRef();
    EliminatedArgMask = Kernel->getKernelArgMask();
  } else {
    ur_program_handle_t UrProgram = nullptr;
    std::tie(UrKernel, std::ignore, EliminatedArgMask, UrProgram) =
        sycl::detail::ProgramManager::getInstance().getOrCreateKernel(
            ContextImpl, DeviceImpl, ExecCG.MKernelName);
    BundleObjs = std::make_pair(UrProgram, UrKernel);
  }

  // Remove eliminated args
  std::vector<sycl::detail::ArgDesc> MaskedArgs;
  MaskedArgs.reserve(NodeArgs.size());

  sycl::detail::applyFuncOnFilteredArgs(
      EliminatedArgMask, NodeArgs,
      [&MaskedArgs](sycl::detail::ArgDesc &Arg, int NextTrueIndex) {
        MaskedArgs.emplace_back(Arg.MType, Arg.MPtr, Arg.MSize, NextTrueIndex);
      });

  // Reverse kernel dims
  sycl::detail::ReverseRangeDimensionsForKernel(NDRDesc);

  // Scratch buffer for the work-group size query below. It is local to this
  // function, so UpdateDesc must never point at it.
  size_t RequiredWGSize[3] = {0, 0, 0};
  size_t *LocalSize = nullptr;

  if (NDRDesc.LocalSize[0] != 0)
    LocalSize = &NDRDesc.LocalSize[0];
  else {
    Adapter->call<sycl::detail::UrApiKind::urKernelGetGroupInfo>(
        UrKernel, DeviceImpl->getHandleRef(),
        UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, sizeof(RequiredWGSize),
        RequiredWGSize,
        /* param_value_size_ret = */ nullptr);

    const bool EnforcedLocalSize =
        (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 ||
         RequiredWGSize[2] != 0);
    if (EnforcedLocalSize) {
      // Persist the enforced size in the caller-owned NDRDesc and point
      // there. Pointing at RequiredWGSize would leave
      // UpdateDesc.pNewLocalWorkSize dangling once this function returns.
      NDRDesc.LocalSize[0] = RequiredWGSize[0];
      NDRDesc.LocalSize[1] = RequiredWGSize[1];
      NDRDesc.LocalSize[2] = RequiredWGSize[2];
      LocalSize = &NDRDesc.LocalSize[0];
    }
  }

  // Storage for individual arg descriptors
  MemobjDescs.reserve(MaskedArgs.size());
  PtrDescs.reserve(MaskedArgs.size());
  ValueDescs.reserve(MaskedArgs.size());
  MemobjProps.resize(MaskedArgs.size()); // resize since we access by reference

  UpdateDesc.stype =
      UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC;
  UpdateDesc.pNext = nullptr;

  // Collect arg descriptors and fill kernel launch descriptor
  using sycl::detail::kernel_param_kind_t;
  for (size_t i = 0; i < MaskedArgs.size(); i++) {
    auto &NodeArg = MaskedArgs[i];
    switch (NodeArg.MType) {
    case kernel_param_kind_t::kind_pointer: {
      PtrDescs.push_back(
          {UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC,
           nullptr, static_cast<uint32_t>(NodeArg.MIndex), nullptr,
           NodeArg.MPtr});
    } break;
    case kernel_param_kind_t::kind_std_layout: {
      ValueDescs.push_back(
          {UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, nullptr,
           static_cast<uint32_t>(NodeArg.MIndex),
           static_cast<uint32_t>(NodeArg.MSize), nullptr, NodeArg.MPtr});
    } break;
    case kernel_param_kind_t::kind_accessor: {
      sycl::detail::Requirement *Req =
          static_cast<sycl::detail::Requirement *>(NodeArg.MPtr);

      // MemobjProps was resized above, so this reference (and the pointer
      // stored in the descriptor below) stays stable.
      ur_kernel_arg_mem_obj_properties_t &MemObjProp = MemobjProps[i];
      MemObjProp.stype = UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES;
      MemObjProp.pNext = nullptr;
      switch (Req->MAccessMode) {
      case access::mode::read: {
        MemObjProp.memoryAccess = UR_MEM_FLAG_READ_ONLY;
        break;
      }
      case access::mode::write:
      case access::mode::discard_write: {
        MemObjProp.memoryAccess = UR_MEM_FLAG_WRITE_ONLY;
        break;
      }
      default: {
        MemObjProp.memoryAccess = UR_MEM_FLAG_READ_WRITE;
        break;
      }
      }
      MemobjDescs.push_back(
          {UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_MEMOBJ_ARG_DESC, nullptr,
           static_cast<uint32_t>(NodeArg.MIndex), &MemObjProp,
           static_cast<ur_mem_handle_t>(Req->MData)});

    } break;

    default:
      break;
    }
  }

  UpdateDesc.hNewKernel = UrKernel;
  UpdateDesc.numNewMemObjArgs = MemobjDescs.size();
  UpdateDesc.pNewMemObjArgList = MemobjDescs.data();
  UpdateDesc.numNewPointerArgs = PtrDescs.size();
  UpdateDesc.pNewPointerArgList = PtrDescs.data();
  UpdateDesc.numNewValueArgs = ValueDescs.size();
  UpdateDesc.pNewValueArgList = ValueDescs.data();

  UpdateDesc.pNewGlobalWorkOffset = &NDRDesc.GlobalOffset[0];
  UpdateDesc.pNewGlobalWorkSize = &NDRDesc.GlobalSize[0];
  UpdateDesc.pNewLocalWorkSize = LocalSize;
  UpdateDesc.newWorkDim = NDRDesc.Dims;

  // Query the ID cache to find the equivalent exec node for the node passed to
  // this function.
  // TODO: Handle subgraphs or any other cases where multiple nodes may be
  // associated with a single key, once those node types are supported for
  // update.
  auto ExecNode = MIDCache.find(Node->MID);
  assert(ExecNode != MIDCache.end() && "Node ID was not found in ID cache");

  auto Command = MCommandMap.find(ExecNode->second);
  assert(Command != MCommandMap.end());
  UpdateDesc.hCommand = Command->second;

  // Update ExecNode with new values from Node, in case we ever need to
  // rebuild the command buffers
  ExecNode->second->updateFromOtherNode(Node);
}

std::map<int, std::vector<std::shared_ptr<node_impl>>>
exec_graph_impl::getURUpdatableNodes(
    const std::vector<std::shared_ptr<node_impl>> &Nodes) const {
  // Iterate over the list of nodes, and for every node that can
  // be updated through UR, add it to the list of nodes for
  // that can be updated for the UR command-buffer partition.
  std::map<int, std::vector<std::shared_ptr<node_impl>>> PartitionedNodes;

  // Initialize vector for each partition
  for (size_t i = 0; i < MPartitions.size(); i++) {
    PartitionedNodes[i] = {};
  }

  for (auto &Node : Nodes) {
    // Kernel node update is the only command type supported in UR for update.
    if (Node->MCGType != sycl::detail::CGType::Kernel) {
      continue;
    }

    auto ExecNode = MIDCache.find(Node->MID);
    assert(ExecNode != MIDCache.end() && "Node ID was not found in ID cache");
    auto PartitionIndex = MPartitionNodes.find(ExecNode->second);
    assert(PartitionIndex != MPartitionNodes.end());
    PartitionedNodes[PartitionIndex->second].push_back(Node);
  }

  return PartitionedNodes;
}

/// Copies updated state from every host-task node in \p Nodes onto the
/// matching node of the executable graph. Nodes of any other type are ignored.
/// @param[in] Nodes Nodes requested for update.
void exec_graph_impl::updateHostTasksImpl(
    const std::vector<std::shared_ptr<node_impl>> &Nodes) const {
  for (auto &Node : Nodes) {
    if (Node->MNodeType == node_type::host_task) {
      // The ID cache maps the ID of the node handed to us onto the equivalent
      // node owned by the executable graph.
      auto ExecNode = MIDCache.find(Node->MID);
      assert(ExecNode != MIDCache.end() && "Node ID was not found in ID cache");

      ExecNode->second->updateFromOtherNode(Node);
    }
  }
}

/// Pushes kernel updates for \p Nodes to a single UR command-buffer.
/// @param[in] CommandBuffer UR command-buffer whose commands are updated.
/// @param[in] Nodes Kernel nodes to update; expected to come from
/// getURUpdatableNodes(). Nothing is done when the list is empty.
void exec_graph_impl::updateURImpl(
    ur_exp_command_buffer_handle_t CommandBuffer,
    const std::vector<std::shared_ptr<node_impl>> &Nodes) const {
  if (Nodes.empty()) {
    return;
  }
  const size_t NumUpdatableNodes = Nodes.size();

  // urCommandBufferUpdateKernelLaunchExp consumes structs that point at other
  // structs. Every pointer, nested ones included, has to stay valid until the
  // call has been made, so the backing storage for all nodes is created up
  // front here, one slot per node.
  std::vector<std::vector<ur_exp_command_buffer_update_memobj_arg_desc_t>>
      MemobjDescsList(NumUpdatableNodes);
  std::vector<std::vector<ur_kernel_arg_mem_obj_properties_t>> MemobjPropsList(
      NumUpdatableNodes);
  std::vector<std::vector<ur_exp_command_buffer_update_pointer_arg_desc_t>>
      PtrDescsList(NumUpdatableNodes);
  std::vector<std::vector<ur_exp_command_buffer_update_value_arg_desc_t>>
      ValueDescsList(NumUpdatableNodes);
  std::vector<sycl::detail::NDRDescT> NDRDescList(NumUpdatableNodes);
  std::vector<ur_exp_command_buffer_update_kernel_launch_desc_t> UpdateDescList(
      NumUpdatableNodes);
  std::vector<std::pair<ur_program_handle_t, ur_kernel_handle_t>>
      KernelBundleObjList(NumUpdatableNodes);

  // Fill slot Slot of every list from node Slot.
  for (size_t Slot = 0; Slot < NumUpdatableNodes; ++Slot) {
    auto &Node = Nodes[Slot];
    // Holds whenever the node list was produced by getURUpdatableNodes().
    assert(Node->MCGType == sycl::detail::CGType::Kernel);

    populateURKernelUpdateStructs(
        Node, KernelBundleObjList[Slot], MemobjDescsList[Slot],
        MemobjPropsList[Slot], PtrDescsList[Slot], ValueDescsList[Slot],
        NDRDescList[Slot], UpdateDescList[Slot]);
  }

  auto ContextImpl = sycl::detail::getSyclObjImpl(MContext);
  const sycl::detail::AdapterPtr &Adapter = ContextImpl->getAdapter();
  Adapter->call<sycl::detail::UrApiKind::urCommandBufferUpdateKernelLaunchExp>(
      CommandBuffer, UpdateDescList.size(), UpdateDescList.data());

  // Drop the kernel/program references that were handed back through the
  // KernelBundleObjList slots by populateURKernelUpdateStructs(). Slots left
  // at nullptr have nothing to release.
  // NOTE(review): if the update call above throws, these handles are not
  // released - confirm whether that path needs a guard.
  for (auto &[UrProgram, UrKernel] : KernelBundleObjList) {
    if (UrKernel != nullptr) {
      Adapter->call<sycl::detail::UrApiKind::urKernelRelease>(UrKernel);
    }
    if (UrProgram != nullptr) {
      Adapter->call<sycl::detail::UrApiKind::urProgramRelease>(UrProgram);
    }
  }
}

/// Constructs a modifiable graph by creating its implementation object from
/// the given context, device and property list.
/// @param[in] SyclContext Context forwarded to the graph implementation.
/// @param[in] SyclDevice Device forwarded to the graph implementation.
/// @param[in] PropList Properties forwarded to the graph implementation.
modifiable_command_graph::modifiable_command_graph(
    const sycl::context &SyclContext, const sycl::device &SyclDevice,
    const sycl::property_list &PropList)
    : impl(std::make_shared<detail::graph_impl>(SyclContext, SyclDevice,
                                                PropList)) {}

/// Constructs a modifiable graph using the context and device of \p SyclQueue.
/// @param[in] SyclQueue Queue whose context and device are forwarded to the
/// graph implementation.
/// @param[in] PropList Properties forwarded to the graph implementation.
modifiable_command_graph::modifiable_command_graph(
    const sycl::queue &SyclQueue, const sycl::property_list &PropList)
    : impl(std::make_shared<detail::graph_impl>(
          SyclQueue.get_context(), SyclQueue.get_device(), PropList)) {}

/// Adds a node backed by a dynamic command-group to the graph.
/// @param[in] DynCGF Dynamic command-group to create the node from.
/// @param[in] Deps Nodes the new node depends on.
/// @return The newly created node.
/// @throws sycl::exception with errc::invalid if \p DynCGF was created for a
/// different graph.
node modifiable_command_graph::addImpl(dynamic_command_group &DynCGF,
                                       const std::vector<node> &Deps) {
  impl->throwIfGraphRecordingQueue("Explicit API \"Add()\" function");

  // A dynamic command-group is tied to the graph it was created with.
  auto DynCGFImpl = sycl::detail::getSyclObjImpl(DynCGF);
  if (DynCGFImpl->MGraph != impl) {
    throw sycl::exception(make_error_code(sycl::errc::invalid),
                          "Graph does not match the graph associated with "
                          "dynamic command-group.");
  }

  // Unwrap the user-facing dependency handles into implementation pointers.
  std::vector<std::shared_ptr<detail::node_impl>> DepImpls;
  DepImpls.reserve(Deps.size());
  for (const node &Dep : Deps) {
    DepImpls.push_back(sycl::detail::getSyclObjImpl(Dep));
  }

  // The graph is mutated from here on, so hold the write lock until return.
  graph_impl::WriteLock Lock(impl->MMutex);
  return sycl::detail::createSyclObjFromImpl<node>(
      impl->add(DynCGFImpl, DepImpls));
}

/// Adds a node without a command-group function to the graph.
/// @param[in] Deps Nodes the new node depends on.
/// @return The newly created node.
node modifiable_command_graph::addImpl(const std::vector<node> &Deps) {
  impl->throwIfGraphRecordingQueue("Explicit API \"Add()\" function");

  // Unwrap the user-facing dependency handles into implementation pointers.
  std::vector<std::shared_ptr<detail::node_impl>> DepImpls;
  DepImpls.reserve(Deps.size());
  for (const node &Dep : Deps) {
    DepImpls.push_back(sycl::detail::getSyclObjImpl(Dep));
  }

  // The graph is mutated from here on, so hold the write lock until return.
  graph_impl::WriteLock Lock(impl->MMutex);
  return sycl::detail::createSyclObjFromImpl<node>(impl->add(DepImpls));
}

/// Adds a node created from a command-group function to the graph.
/// @param[in] CGF Command-group function describing the node's work.
/// @param[in] Deps Nodes the new node depends on.
/// @return The newly created node.
node modifiable_command_graph::addImpl(std::function<void(handler &)> CGF,
                                       const std::vector<node> &Deps) {
  impl->throwIfGraphRecordingQueue("Explicit API \"Add()\" function");

  // Unwrap the user-facing dependency handles into implementation pointers.
  std::vector<std::shared_ptr<detail::node_impl>> DepImpls;
  DepImpls.reserve(Deps.size());
  for (const node &Dep : Deps) {
    DepImpls.push_back(sycl::detail::getSyclObjImpl(Dep));
  }

  // The graph is mutated from here on, so hold the write lock until return.
  graph_impl::WriteLock Lock(impl->MMutex);
  return sycl::detail::createSyclObjFromImpl<node>(
      impl->add(CGF, {}, DepImpls));
}

/// Makes \p Node depend on every node of the graph that currently has no
/// successors (other than \p Node itself).
/// @param[in] Node Node that receives the new incoming edges.
void modifiable_command_graph::addGraphLeafDependencies(node Node) {
  std::shared_ptr<detail::node_impl> DstImpl =
      sycl::detail::getSyclObjImpl(Node);

  graph_impl::WriteLock Lock(impl->MMutex);
  for (auto &Candidate : impl->MNodeStorage) {
    // Only exit nodes are of interest, and a node never depends on itself.
    if (!Candidate->MSuccessors.empty() || Candidate == DstImpl) {
      continue;
    }
    impl->makeEdge(Candidate, DstImpl);
  }
}

/// Adds an edge from \p Src to \p Dest in the graph.
/// @param[in] Src Node the edge starts at.
/// @param[in] Dest Node the edge ends at.
void modifiable_command_graph::make_edge(node &Src, node &Dest) {
  // Resolve both endpoints before taking the lock; only the edge insertion
  // itself touches graph state.
  std::shared_ptr<detail::node_impl> SrcImpl =
      sycl::detail::getSyclObjImpl(Src);
  std::shared_ptr<detail::node_impl> DestImpl =
      sycl::detail::getSyclObjImpl(Dest);

  graph_impl::WriteLock Lock(impl->MMutex);
  impl->makeEdge(SrcImpl, DestImpl);
}

/// Creates an executable graph from the current state of this graph.
/// @param[in] PropList Properties passed to the executable graph.
/// @return The executable command graph.
command_graph<graph_state::executable>
modifiable_command_graph::finalize(const sycl::property_list &PropList) const {
  // Finalization both reads and writes the graph, so take the lock with full
  // (write) privileges for the whole construction.
  graph_impl::WriteLock Lock(impl->MMutex);

  const auto &GraphImpl = this->impl;
  return command_graph<graph_state::executable>{
      GraphImpl, GraphImpl->getContext(), PropList};
}

/// Puts \p RecordingQueue into the recording state for this graph.
/// @param[in] RecordingQueue Queue to start recording from.
/// @param[in] PropList Properties; only validated here.
/// @throws sycl::exception with errc::invalid if the queue is already
/// recording, or if its context or device differs from the graph's.
void modifiable_command_graph::begin_recording(
    queue &RecordingQueue, const sycl::property_list &PropList) {
  // No property is acted upon here; this only verifies that the supplied
  // properties are graph properties at all.
  checkGraphPropertiesAndThrow(PropList);

  auto QueueImpl = sycl::detail::getSyclObjImpl(RecordingQueue);
  assert(QueueImpl);

  // All precondition failures are reported the same way.
  auto ThrowInvalid = [](const char *Message) {
    throw sycl::exception(sycl::make_error_code(errc::invalid), Message);
  };

  if (QueueImpl->hasCommandGraph()) {
    ThrowInvalid("begin_recording cannot be called for a queue which "
                 "is already in the recording state.");
  }
  if (QueueImpl->get_context() != impl->getContext()) {
    ThrowInvalid("begin_recording called for a queue whose context "
                 "differs from the graph context.");
  }
  if (QueueImpl->get_device() != impl->getDevice()) {
    ThrowInvalid("begin_recording called for a queue whose device "
                 "differs from the graph device.");
  }

  impl->beginRecording(QueueImpl);
}

/// Starts recording on each queue of \p RecordingQueues, in order.
/// @param[in] RecordingQueues Queues to start recording from.
/// @param[in] PropList Properties applied to every queue.
void modifiable_command_graph::begin_recording(
    const std::vector<queue> &RecordingQueues,
    const sycl::property_list &PropList) {
  // The single-queue overload takes a mutable reference, so each queue handle
  // is copied out of the const vector before being passed on.
  for (queue QueueCopy : RecordingQueues) {
    begin_recording(QueueCopy, PropList);
  }
}

/// Ends recording without naming a specific queue: takes the graph's write
/// lock and asks the graph implementation to clear its queues.
void modifiable_command_graph::end_recording() {
  graph_impl::WriteLock Lock(impl->MMutex);
  impl->clearQueues();
}

/// Ends recording of \p RecordingQueue to this graph.
/// @param[in] RecordingQueue Queue to stop recording from.
/// @throws sycl::exception with errc::invalid if the queue is recording to a
/// different graph.
void modifiable_command_graph::end_recording(queue &RecordingQueue) {
  auto QueueImpl = sycl::detail::getSyclObjImpl(RecordingQueue);
  if (QueueImpl) {
    const bool RecordsToThisGraph = QueueImpl->getCommandGraph() == impl;
    if (RecordsToThisGraph) {
      // Detach the queue from the graph first, then drop it from the graph's
      // own bookkeeping under the write lock.
      QueueImpl->setCommandGraph(nullptr);
      graph_impl::WriteLock Lock(impl->MMutex);
      impl->removeQueue(QueueImpl);
    }

    // A graph still attached at this point is not this one.
    if (QueueImpl->hasCommandGraph()) {
      throw sycl::exception(
          sycl::make_error_code(errc::invalid),
          "end_recording called for a queue which is recording "
          "to a different graph.");
    }
  }
}

/// Ends recording on each queue of \p RecordingQueues, in order.
/// @param[in] RecordingQueues Queues to stop recording from.
void modifiable_command_graph::end_recording(
    const std::vector<queue> &RecordingQueues) {
  // The single-queue overload takes a mutable reference, so each queue handle
  // is copied out of the const vector before being passed on.
  for (queue QueueCopy : RecordingQueues) {
    end_recording(QueueCopy);
  }
}

/// Writes the graph to \p pathstr in DOT format.
/// @param[in] pathstr Output file path; must end in a ".dot" extension.
/// @param[in] verbose Forwarded to the DOT printer.
/// @throws sycl::exception with errc::invalid if the path does not carry a
/// "dot" file extension.
void modifiable_command_graph::print_graph(sycl::detail::string_view pathstr,
                                           bool verbose) const {
  std::string path{pathstr.data()};

  // The extension is whatever follows the last '.'. A path without any '.'
  // has no extension at all; without the explicit npos check, npos + 1 wraps
  // to 0 and a bare file name of "dot" would be accepted by mistake.
  const std::string::size_type LastDot = path.rfind('.');
  const bool HasDotExtension =
      LastDot != std::string::npos &&
      path.compare(LastDot + 1, std::string::npos, "dot") == 0;
  if (!HasDotExtension) {
    throw sycl::exception(
        sycl::make_error_code(errc::invalid),
        "DOT graph is the only format supported at the moment.");
  }

  graph_impl::ReadLock Lock(impl->MMutex);
  impl->printGraphAsDot(std::move(path), verbose);
}

/// Returns node objects for every entry of the graph implementation's node
/// storage, read under the graph's read lock.
std::vector<node> modifiable_command_graph::get_nodes() const {
  graph_impl::ReadLock Lock(impl->MMutex);
  return createNodesFromImpls(impl->MNodeStorage);
}
std::vector<node> modifiable_command_graph::get_root_nodes() const {
  graph_impl::ReadLock Lock(impl->MMutex);
  auto &Roots = impl->MRoots;
  std::vector<std::weak_ptr<node_impl>> Impls{};

  std::copy(Roots.begin(), Roots.end(), std::back_inserter(Impls));
  return createNodesFromImpls(Impls);
}

/// Validates \p Properties against the node properties listed in
/// node_properties.def by handing two kind-predicates to
/// PropertyValidator::checkPropsAndThrow.
/// @param[in] Properties Property list to validate.
void modifiable_command_graph::checkNodePropertiesAndThrow(
    const property_list &Properties) {
  // Predicate for data-less properties: the .def file is expanded inside the
  // switch so that each __SYCL_DATA_LESS_PROP entry becomes a `case` returning
  // true, while __SYCL_MANUALLY_DEFINED_PROP entries expand to nothing.
  auto CheckDataLessProperties = [](int PropertyKind) {
#define __SYCL_DATA_LESS_PROP(NS_QUALIFIER, PROP_NAME, ENUM_VAL)               \
  case NS_QUALIFIER::PROP_NAME::getKind():                                     \
    return true;
#define __SYCL_MANUALLY_DEFINED_PROP(NS_QUALIFIER, PROP_NAME)
    switch (PropertyKind) {
#include <sycl/ext/oneapi/experimental/detail/properties/node_properties.def>
    default:
      return false;
    }
  };
  // Predicate for properties that carry data: same expansion with the roles
  // of the two macros swapped.
  // NOTE(review): the macros are defined again without an #undef in this
  // file; presumably node_properties.def undefines them after use - confirm.
  auto CheckPropertiesWithData = [](int PropertyKind) {
#define __SYCL_DATA_LESS_PROP(NS_QUALIFIER, PROP_NAME, ENUM_VAL)
#define __SYCL_MANUALLY_DEFINED_PROP(NS_QUALIFIER, PROP_NAME)                  \
  case NS_QUALIFIER::PROP_NAME::getKind():                                     \
    return true;
    switch (PropertyKind) {
#include <sycl/ext/oneapi/experimental/detail/properties/node_properties.def>
    default:
      return false;
    }
  };
  sycl::detail::PropertyValidator::checkPropsAndThrow(
      Properties, CheckDataLessProperties, CheckPropertiesWithData);
}

/// Constructs an executable graph: creates the executable implementation from
/// \p Ctx, \p Graph and \p PropList, then runs finalizeImpl() on it.
/// @param[in] Graph Modifiable graph implementation to build from.
/// @param[in] Ctx Context passed to the executable graph implementation.
/// @param[in] PropList Properties passed to the executable graph
/// implementation.
executable_command_graph::executable_command_graph(
    const std::shared_ptr<detail::graph_impl> &Graph, const sycl::context &Ctx,
    const property_list &PropList)
    : impl(std::make_shared<detail::exec_graph_impl>(Ctx, Graph, PropList)) {
  finalizeImpl(); // Create backend representation for executable graph
}

/// Splits the executable graph into partitions and creates command-buffers
/// for every partition that is not a host-task partition.
void executable_command_graph::finalizeImpl() {
  impl->makePartitions();

  auto Device = impl->getGraphImpl()->getDevice();
  for (auto Partition : impl->getPartitions()) {
    // Host-task partitions get no command-buffer.
    if (Partition->isHostTask()) {
      continue;
    }
    impl->createCommandBuffers(Device, Partition);
  }
}

/// Updates this executable graph from a modifiable graph by forwarding the
/// graph's implementation object to exec_graph_impl::update.
/// @param[in] Graph Modifiable graph to take the update from.
void executable_command_graph::update(
    const command_graph<graph_state::modifiable> &Graph) {
  impl->update(sycl::detail::getSyclObjImpl(Graph));
}

/// Updates this executable graph from a single node by forwarding the node's
/// implementation object to exec_graph_impl::update.
/// @param[in] Node Node to take the update from.
void executable_command_graph::update(const node &Node) {
  impl->update(sycl::detail::getSyclObjImpl(Node));
}

void executable_command_graph::update(const std::vector<node> &Nodes) {
  std::vector<std::shared_ptr<node_impl>> NodeImpls{};
  NodeImpls.reserve(Nodes.size());
  for (auto &Node : Nodes) {
    NodeImpls.push_back(sycl::detail::getSyclObjImpl(Node));
  }

  impl->update(NodeImpls);
}

/// Constructs a dynamic parameter associated with \p Graph; no initial value
/// is passed to the implementation object.
/// @param[in] Graph Graph whose implementation the parameter is created with.
dynamic_parameter_base::dynamic_parameter_base(
    command_graph<graph_state::modifiable> Graph)
    : impl(std::make_shared<dynamic_parameter_impl>(
          sycl::detail::getSyclObjImpl(Graph))) {}

/// Constructs a dynamic parameter associated with \p Graph, forwarding the
/// size and data pointer of its value to the implementation object.
/// @param[in] Graph Graph whose implementation the parameter is created with.
/// @param[in] ParamSize Size forwarded to the implementation.
/// @param[in] Data Pointer forwarded to the implementation.
dynamic_parameter_base::dynamic_parameter_base(
    command_graph<graph_state::modifiable> Graph, size_t ParamSize,
    const void *Data)
    : impl(std::make_shared<dynamic_parameter_impl>(
          sycl::detail::getSyclObjImpl(Graph), ParamSize, Data)) {}

/// Forwards a new value to the implementation object.
/// @param[in] NewValue Pointer to the new value.
/// @param[in] Size Size passed along with \p NewValue.
void dynamic_parameter_base::updateValue(const void *NewValue, size_t Size) {
  impl->updateValue(NewValue, Size);
}

/// Forwards a new raw kernel argument value to the implementation object.
/// @param[in] NewRawValue Raw kernel argument describing the new value.
/// @param[in] Size Size passed along with \p NewRawValue.
void dynamic_parameter_base::updateValue(const raw_kernel_arg *NewRawValue,
                                         size_t Size) {
  impl->updateValue(NewRawValue, Size);
}

/// Forwards a new accessor to the implementation object.
/// @param[in] Acc Accessor that replaces the current one.
void dynamic_parameter_base::updateAccessor(
    const sycl::detail::AccessorBaseHost *Acc) {
  impl->updateAccessor(Acc);
}

/// Forwards a new work-group memory buffer size to the implementation object.
/// @param[in] BufferSize New buffer size.
void dynamic_parameter_base::updateWorkGroupMem(size_t BufferSize) {
  impl->updateWorkGroupMem(BufferSize);
}

/// Updates the parameter from a raw kernel argument.
/// @param[in] NewRawValue Raw kernel argument holding the new data pointer
/// and byte count.
/// @param[in] Size Unused; it describes sizeof(raw_kernel_arg), not the
/// payload.
void dynamic_parameter_impl::updateValue(const raw_kernel_arg *NewRawValue,
                                         size_t Size) {
  // The payload size comes from the raw_kernel_arg itself, so the Size
  // parameter is deliberately discarded.
  std::ignore = Size;

  const void *RawArgData = NewRawValue->MArgData;
  updateValue(RawArgData, NewRawValue->MArgSize);
}

void dynamic_parameter_impl::updateValue(const void *NewValue, size_t Size) {
  for (auto &[NodeWeak, ArgIndex] : MNodes) {
    auto NodeShared = NodeWeak.lock();
    if (NodeShared) {
      dynamic_parameter_impl::updateCGArgValue(NodeShared->MCommandGroup,
                                               ArgIndex, NewValue, Size);
    }
  }

  for (auto &DynCGInfo : MDynCGs) {
    auto DynCG = DynCGInfo.DynCG.lock();
    if (DynCG) {
      auto &CG = DynCG->MCommandGroups[DynCGInfo.CGIndex];
      dynamic_parameter_impl::updateCGArgValue(CG, DynCGInfo.ArgIndex, NewValue,
                                               Size);
    }
  }

  std::memcpy(MValueStorage.data(), NewValue, Size);
}

/// Swaps the accessor used by every command-group argument registered with
/// this parameter and stores a copy of the accessor object in the parameter's
/// value storage.
/// @param[in] Acc Accessor that replaces the current one.
void dynamic_parameter_impl::updateAccessor(
    const sycl::detail::AccessorBaseHost *Acc) {
  // Nodes that use this parameter directly.
  for (auto &[NodeWeak, ArgIndex] : MNodes) {
    // Should we fail here if the node isn't alive anymore?
    if (auto NodeShared = NodeWeak.lock()) {
      updateCGAccessor(NodeShared->MCommandGroup, ArgIndex, Acc);
    }
  }

  // Command-groups owned by dynamic command-groups; expired ones are skipped.
  for (auto &DynCGInfo : MDynCGs) {
    auto DynCG = DynCGInfo.DynCG.lock();
    if (!DynCG) {
      continue;
    }
    updateCGAccessor(DynCG->MCommandGroups[DynCGInfo.CGIndex],
                     DynCGInfo.ArgIndex, Acc);
  }

  // Finally refresh the copy kept by the parameter itself.
  std::memcpy(MValueStorage.data(), Acc,
              sizeof(sycl::detail::AccessorBaseHost));
}

void dynamic_parameter_impl::updateWorkGroupMem(size_t BufferSize) {
  for (auto &[NodeWeak, ArgIndex] : MNodes) {
    auto NodeShared = NodeWeak.lock();
    if (NodeShared) {
      dynamic_parameter_impl::updateCGWorkGroupMem(NodeShared->MCommandGroup,
                                                   ArgIndex, BufferSize);
    }
  }

  for (auto &DynCGInfo : MDynCGs) {
    auto DynCG = DynCGInfo.DynCG.lock();
    if (DynCG) {
      auto &CG = DynCG->MCommandGroups[DynCGInfo.CGIndex];
      dynamic_parameter_impl::updateCGWorkGroupMem(CG, DynCGInfo.ArgIndex,
                                                   BufferSize);
    }
  }
}

/// Sets the size of the argument with index \p ArgIndex in a kernel
/// command-group. Only the first matching argument is touched; nothing
/// happens when no argument has that index.
/// @param[in] CG Command-group, treated as a kernel command-group.
/// @param[in] ArgIndex Index of the argument to update.
/// @param[in] BufferSize New size stored on the argument.
void dynamic_parameter_impl::updateCGWorkGroupMem(
    std::shared_ptr<sycl::detail::CG> CG, int ArgIndex, size_t BufferSize) {
  auto *KernelCG = static_cast<sycl::detail::CGExecKernel *>(CG.get());

  for (auto &Arg : KernelCG->MArgs) {
    if (Arg.MIndex == ArgIndex) {
      assert(Arg.MType == sycl::detail::kernel_param_kind_t::kind_std_layout);
      Arg.MSize = BufferSize;
      return;
    }
  }
}

/// Overwrites the value of the argument with index \p ArgIndex in a kernel
/// command-group. Only the first matching argument is touched; nothing
/// happens when no argument has that index.
/// @param[in] CG Command-group, treated as a kernel command-group.
/// @param[in] ArgIndex Index of the argument to update.
/// @param[in] NewValue Pointer to the new value.
/// @param[in] Size Number of bytes copied from \p NewValue.
void dynamic_parameter_impl::updateCGArgValue(
    std::shared_ptr<sycl::detail::CG> CG, int ArgIndex, const void *NewValue,
    size_t Size) {
  auto *KernelCG = static_cast<sycl::detail::CGExecKernel *>(CG.get());

  for (auto &Arg : KernelCG->MArgs) {
    if (Arg.MIndex == ArgIndex) {
      assert(Arg.MSize == static_cast<int>(Size));
      // MPtr may point into the command-group's argument storage, so the
      // bytes are copied into place instead of re-pointing MPtr at NewValue.
      std::memcpy(Arg.MPtr, NewValue, Size);
      return;
    }
  }
}

/// Replaces the accessor bound to the argument with index \p ArgIndex in a
/// kernel command-group. Only the first matching argument is touched.
/// @param[in] CG Command-group, treated as a kernel command-group.
/// @param[in] ArgIndex Index of the accessor argument to update.
/// @param[in] Acc Accessor that replaces the current one.
/// @throws sycl::exception with errc::invalid if the new accessor's memory
/// object reports that it needs write-back.
void dynamic_parameter_impl::updateCGAccessor(
    std::shared_ptr<sycl::detail::CG> CG, int ArgIndex,
    const sycl::detail::AccessorBaseHost *Acc) {
  auto NewAccImpl = sycl::detail::getSyclObjImpl(*Acc);
  auto &Args = static_cast<sycl::detail::CGExecKernel *>(CG.get())->MArgs;

  for (auto &Arg : Args) {
    if (Arg.MIndex != ArgIndex) {
      continue;
    }
    assert(Arg.MType == sycl::detail::kernel_param_kind_t::kind_accessor);

    sycl::detail::Requirement *NewReq = NewAccImpl.get();
    auto *NewMemObj =
        static_cast<sycl::detail::SYCLMemObjT *>(NewReq->MSYCLMemObj);

    // Reject the new accessor before anything is modified.
    if (NewMemObj->needsWriteBack()) {
      throw sycl::exception(
          make_error_code(errc::invalid),
          "Accessors to buffers which have write_back enabled "
          "are not allowed to be used in command graphs.");
    }

    // Accessors arriving here are placeholders, so the bookkeeping that
    // handler::require() would normally do is repeated for them.
    if (NewReq->MAccessMode != sycl::access_mode::read) {
      NewMemObj->handleWriteAccessorCreation();
    }

    // Swap the old accessor for the new one wherever the command-group
    // refers to it: accessor storage, requirements, and the argument itself.
    auto *OldAccImpl =
        static_cast<sycl::detail::AccessorImplHost *>(Arg.MPtr);
    for (auto &StoredAcc : CG->getAccStorage()) {
      if (StoredAcc.get() == OldAccImpl) {
        StoredAcc = NewAccImpl;
      }
    }
    for (auto &StoredReq : CG->getRequirements()) {
      if (StoredReq == OldAccImpl) {
        StoredReq = NewReq;
      }
    }
    Arg.MPtr = NewAccImpl.get();
    return;
  }
}

/// Constructs the implementation of a dynamic command-group for \p Graph.
/// The active command-group index starts at 0 and the object receives the
/// next value of the NextAvailableID counter as its ID.
/// @param[in] Graph Graph the dynamic command-group is associated with.
dynamic_command_group_impl::dynamic_command_group_impl(
    const command_graph<graph_state::modifiable> &Graph)
    : MGraph{sycl::detail::getSyclObjImpl(Graph)}, MActiveCGF(0),
      MID(NextAvailableID.fetch_add(1, std::memory_order_relaxed)) {}

/// Runs every command-group function of \p CGFList through its own handler,
/// stores the resulting command-groups, registers dynamic parameter usage and,
/// for kernels, links each command-group to its alternatives.
/// @param[in] CGFList Command-group functions of the dynamic command-group.
/// @throws sycl::exception with errc::invalid if a command-group is neither a
/// kernel nor a host-task, if the command-groups are not all of the same type,
/// or if a host-task uses dynamic parameters.
void dynamic_command_group_impl::finalizeCGFList(
    const std::vector<std::function<void(handler &)>> &CGFList) {
  for (size_t CGFIndex = 0; CGFIndex < CGFList.size(); ++CGFIndex) {
    // One handler per command-group function; sharing a handler would make
    // the runtime see a single command-group containing several commands.
    sycl::handler Handler{MGraph};
    CGFList[CGFIndex](Handler);

    const auto CGFType = Handler.getType();
    const bool IsKernel = CGFType == sycl::detail::CGType::Kernel;
    const bool IsHostTask = CGFType == sycl::detail::CGType::CodeplayHostTask;
    if (!IsKernel && !IsHostTask) {
      throw sycl::exception(
          make_error_code(errc::invalid),
          "The only types of command-groups that can be used in "
          "dynamic command-groups are kernels and host-tasks.");
    }

    // The first command-group fixes the type; all later ones must match it.
    if (CGFIndex == 0) {
      MCGType = CGFType;
    } else if (MCGType != CGFType) {
      throw sycl::exception(make_error_code(errc::invalid),
                            "Command-groups in a dynamic command-group must "
                            "all be the same type.");
    }

    Handler.finalize();

    // Move ownership of the command-group out of the handler's
    // unique_ptr<detail::CG> and into a shared_ptr<detail::CG> that we keep.
    MCommandGroups.push_back(std::shared_ptr<sycl::detail::CG>(
        Handler.impl->MGraphNodeCG.release()));

    // Record which dynamic parameters this command-group uses.
    auto &DynamicParams = Handler.impl->MDynamicParameters;
    if (!DynamicParams.empty() &&
        Handler.getType() == sycl::detail::CGType::CodeplayHostTask) {
      throw sycl::exception(make_error_code(errc::invalid),
                            "Cannot use dynamic parameters in a host_task");
    }
    for (auto &[DynamicParam, ArgIndex] : DynamicParams) {
      DynamicParam->registerDynCG(shared_from_this(), CGFIndex, ArgIndex);
    }
  }

  // Alternative kernels are only tracked for kernel command-groups.
  if (MCGType == sycl::detail::CGType::CodeplayHostTask) {
    return;
  }

  using CGExecKernelSP = std::shared_ptr<sycl::detail::CGExecKernel>;
  using CGExecKernelWP = std::weak_ptr<sycl::detail::CGExecKernel>;

  // View every stored command-group as a kernel command-group once.
  std::vector<CGExecKernelSP> KernelCGs;
  KernelCGs.reserve(MCommandGroups.size());
  for (const auto &CommandGroup : MCommandGroups) {
    KernelCGs.push_back(
        std::dynamic_pointer_cast<sycl::detail::CGExecKernel>(CommandGroup));
  }

  // Each kernel command-group gets the list of all the others, in storage
  // order, as its alternatives.
  for (const CGExecKernelSP &KernelCG : KernelCGs) {
    std::vector<CGExecKernelWP> Alternatives;
    for (const CGExecKernelSP &OtherKernelCG : KernelCGs) {
      if (KernelCG != OtherKernelCG) {
        Alternatives.push_back(OtherKernelCG);
      }
    }
    KernelCG->MAlternativeKernels = std::move(Alternatives);
  }
}

void dynamic_command_group_impl::setActiveIndex(size_t Index) {
  if (Index >= getNumCGs()) {
    throw sycl::exception(sycl::make_error_code(errc::invalid),
                          "Index is out of range.");
  }
  MActiveCGF = Index;

  // Update nodes using the dynamic command-group to use the new active CG
  for (auto &Node : MNodes) {
    if (auto NodeSP = Node.lock()) {
      NodeSP->MCommandGroup = getActiveCG();
    }
  }
}
} // namespace detail

/// Returns the node type stored on the node's implementation object.
node_type node::get_type() const { return impl->MNodeType; }

/// Returns node objects created from the implementation's MPredecessors list.
std::vector<node> node::get_predecessors() const {
  return detail::createNodesFromImpls(impl->MPredecessors);
}

/// Returns node objects created from the implementation's MSuccessors list.
std::vector<node> node::get_successors() const {
  return detail::createNodesFromImpls(impl->MSuccessors);
}

/// Finds the graph node associated with \p nodeEvent.
/// @param[in] nodeEvent Event to look the node up for.
/// @return The node the owning graph reports for this event.
/// @throws sycl::exception with errc::invalid if the event has no command
/// graph attached.
node node::get_node_from_event(event nodeEvent) {
  auto EventImpl = sycl::detail::getSyclObjImpl(nodeEvent);
  auto GraphImpl = EventImpl->getCommandGraph();

  // An event with no graph attached cannot map to a node. Report it as an
  // invalid argument instead of dereferencing a null graph pointer.
  if (!GraphImpl) {
    throw sycl::exception(sycl::make_error_code(errc::invalid),
                          "Event is not associated with a graph node.");
  }

  return sycl::detail::createSyclObjFromImpl<node>(
      GraphImpl->getNodeForEvent(EventImpl));
}

// Exported explicit specializations of node::update_nd_range for 1, 2 and 3
// dimensions. Each one forwards the new nd_range to the node implementation.
template <> __SYCL_EXPORT void node::update_nd_range<1>(nd_range<1> NDRange) {
  impl->updateNDRange(NDRange);
}
template <> __SYCL_EXPORT void node::update_nd_range<2>(nd_range<2> NDRange) {
  impl->updateNDRange(NDRange);
}
template <> __SYCL_EXPORT void node::update_nd_range<3>(nd_range<3> NDRange) {
  impl->updateNDRange(NDRange);
}
// Exported explicit specializations of node::update_range for 1, 2 and 3
// dimensions. Each one forwards the new range to the node implementation.
template <> __SYCL_EXPORT void node::update_range<1>(range<1> Range) {
  impl->updateRange(Range);
}
template <> __SYCL_EXPORT void node::update_range<2>(range<2> Range) {
  impl->updateRange(Range);
}
template <> __SYCL_EXPORT void node::update_range<3>(range<3> Range) {
  impl->updateRange(Range);
}

/// Constructs a dynamic command-group for \p Graph from a list of
/// command-group functions.
/// @param[in] Graph Graph the dynamic command-group is associated with.
/// @param[in] CGFList Command-group functions; must not be empty.
/// @throws sycl::exception with errc::invalid if \p CGFList is empty.
dynamic_command_group::dynamic_command_group(
    const command_graph<graph_state::modifiable> &Graph,
    const std::vector<std::function<void(handler &)>> &CGFList)
    : impl(std::make_shared<detail::dynamic_command_group_impl>(Graph)) {
  // Reject an empty list before handing it to the implementation.
  if (CGFList.empty()) {
    throw sycl::exception(sycl::make_error_code(errc::invalid),
                          "Dynamic command-group cannot be created with an "
                          "empty CGF list.");
  }
  impl->finalizeCGFList(CGFList);
}

/// Returns the active command-group index reported by the implementation.
size_t dynamic_command_group::get_active_index() const {
  return impl->getActiveIndex();
}
/// Selects the active command-group by forwarding \p Index to the
/// implementation.
/// @param[in] Index Index of the command-group to activate.
void dynamic_command_group::set_active_index(size_t Index) {
  impl->setActiveIndex(Index);
}
} // namespace experimental
} // namespace oneapi
} // namespace ext
} // namespace _V1
} // namespace sycl

/// Hashes a node by hashing the ID reported by its implementation object.
size_t std::hash<sycl::ext::oneapi::experimental::node>::operator()(
    const sycl::ext::oneapi::experimental::node &Node) const {
  const auto &NodeImpl = sycl::detail::getSyclObjImpl(Node);
  auto ID = NodeImpl->getID();

  std::hash<decltype(ID)> IDHasher;
  return IDHasher(ID);
}

/// Hashes a dynamic command-group by hashing the ID reported by its
/// implementation object.
size_t
std::hash<sycl::ext::oneapi::experimental::dynamic_command_group>::operator()(
    const sycl::ext::oneapi::experimental::dynamic_command_group &DynamicCG)
    const {
  const auto &DynamicCGImpl = sycl::detail::getSyclObjImpl(DynamicCG);
  auto ID = DynamicCGImpl->getID();

  std::hash<decltype(ID)> IDHasher;
  return IDHasher(ID);
}