From e13cbccd6454e1a92ad27432175acd8a6169f330 Mon Sep 17 00:00:00 2001 From: Dan Holmes Date: Mon, 22 May 2023 14:47:09 +0100 Subject: [PATCH 1/8] Remove schedule method in graph_impl.cpp --- sycl/source/detail/graph_impl.cpp | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 34c5e36a394cb..d6e12c27ffb12 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -73,13 +73,15 @@ bool check_for_arg(const sycl::detail::ArgDesc &Arg, } } // anonymous namespace -void exec_graph_impl::schedule() { - if (MSchedule.empty()) { - for (auto Node : MGraphImpl->MRoots) { - Node->topology_sort(Node, MSchedule); - } - } -} +/* +// void exec_graph_impl::schedule() { +// if (MSchedule.empty()) { +// for (auto Node : MGraphImpl->MRoots) { +// Node->topology_sort(Node, MSchedule); +// } +// } +// } +*/ std::shared_ptr graph_impl::add_subgraph_nodes( const std::list> &NodeList) { @@ -564,7 +566,9 @@ command_graph::command_graph( void command_graph::finalize_impl() { // Create PI command-buffers for each device in the finalized context - impl->schedule(); + /* + // impl->schedule(); + */ auto Context = impl->get_context(); for (auto Device : impl->get_context().get_devices()) { From ee9815ad3ec064557107717ea320eb0bf42f3b72 Mon Sep 17 00:00:00 2001 From: Dan Holmes Date: Mon, 22 May 2023 14:51:14 +0100 Subject: [PATCH 2/8] Remove topology_sort method from graph_impl.hpp --- sycl/source/detail/graph_impl.hpp | 32 ++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index d9cf3f0f69f0a..48457dc61eb73 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -73,21 +73,23 @@ class node_impl { std::unique_ptr &&CommandGroup) : MCGType(CGType), MCommandGroup(std::move(CommandGroup)) {} - /// Recursively add nodes to execution stack. - /// @param NodeImpl Node to schedule. - /// @param Schedule Execution ordering to add node to. - void topology_sort(std::shared_ptr NodeImpl, - std::list> &Schedule) { - for (auto Next : MSuccessors) { - // Check if we've already scheduled this node - if (std::find(Schedule.begin(), Schedule.end(), Next) == Schedule.end()) - Next->topology_sort(Next, Schedule); - } - // We don't need to schedule empty nodes as they are only used when - // calculating dependencies - if (!NodeImpl->is_empty()) - Schedule.push_front(NodeImpl); - } + /* + // /// Recursively add nodes to execution stack. + // /// @param NodeImpl Node to schedule. + // /// @param Schedule Execution ordering to add node to. + // void topology_sort(std::shared_ptr NodeImpl, + // std::list> &Schedule) { + // for (auto Next : MSuccessors) { + // // Check if we've already scheduled this node + // if (std::find(Schedule.begin(), Schedule.end(), Next) == Schedule.end()) + // Next->topology_sort(Next, Schedule); + // } + // // We don't need to schedule empty nodes as they are only used when + // // calculating dependencies + // if (!NodeImpl->is_empty()) + // Schedule.push_front(NodeImpl); + // } + */ /// Checks if this node has an argument. /// @param Arg Argument to lookup. From 72828c97466ffc870f51a90407eb5b0360b127a4 Mon Sep 17 00:00:00 2001 From: Dan Holmes Date: Mon, 22 May 2023 15:00:17 +0100 Subject: [PATCH 3/8] Add depth to node_impl in graph_impl.hpp --- sycl/source/detail/graph_impl.hpp | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 48457dc61eb73..9abe898d811a6 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -19,6 +19,7 @@ #include #include #include +#inlcude namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { @@ -91,6 +92,28 @@ class node_impl { // } */ +private: + /// Depth of this node in a containing graph + /// + /// The first call to graph.exec_order_recompute computes & caches the value + /// It will likely become stale whenever the containing graph is changed and + /// a single value will be inequate if this node is added to multiple graphs + /// Caching is dangerous but recomputing takes O(graph_size) worst-case time + std::optional MDepth; + +public: + int get_depth(node_impl &V) { return V.get_depth(); }; + int get_depth() { + if (!MDepth.has_value()) { + int max_depth_found = -1; + for (auto P : MPredecessors) { + max_depth_found = std::max(max_depth_found, P.lock()->get_depth()); + } + MDepth = max_depth_found + 1; + } + return MDepth.value(); + }; + /// Checks if this node has an argument. /// @param Arg Argument to lookup. /// @return True if \p Arg is used in node, false otherwise. From 667c715738929096723d2f997c6d5f6d557c669a Mon Sep 17 00:00:00 2001 From: Dan Holmes Date: Mon, 22 May 2023 15:15:30 +0100 Subject: [PATCH 4/8] Add compute schedule to graph_impl (modifiable) in graph_impl.hpp --- sycl/source/detail/graph_impl.hpp | 60 ++++++++++++++++++++++++++++++- 1 file changed, 59 insertions(+), 1 deletion(-) diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 9abe898d811a6..a31b7c9aba6c6 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -19,7 +19,8 @@ #include #include #include -#inlcude +#include +#include namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { @@ -222,6 +223,63 @@ class graph_impl { : MContext(SyclContext), MDevice(SyclDevice), MRecordingQueues(), MEventsMap() {} +private: + /// A cache of pointers to exit nodes + /// + /// This is not used (yet), but depth computation starts from exit nodes + /// Perhaps, it might be better to do the exec_order_recompute traversal + /// starting from each exit node and working upwards using MPredecessors + /// rather than from each root node and doing depth-first to exit nodes? + std::vector MExitNodes; + + /// A sorted multimap capturing the optimal execution/submission order + /// + /// The SortKey is the depth in the graph for the node_impl in the value + /// Depth is the length of the longest dependence chain to any root node + std::multimap> MExecOrder; + + /// + /// Depth-first recursion from V to build the optimal execution order + /// + /// Starting node for depth-first recursion + void exec_order_recompute(node_impl &V) { + // depth-first recursion to access all nodes that succeed this node + for (auto &S : V.MSuccessors) { + exec_order_recompute(*S.get()); + } + // insert this into execution order based on its depth in the graph + MExecOrder.insert(std::pair(V.get_depth(), &V)); + // cache all the exit nodes; no reason, just feels like a good idea + if (V.MSuccessors.empty()) { + MExitNodes.push_back(&V); + } + }; + + /// + /// Recomputes the optimal submission/execution order for this whole graph + /// + void exec_order_recompute() { + MExecOrder.clear(); + // for all root nodes ... + for (auto &root : MRoots) { + // ... recurse towards all exit nodes + exec_order_recompute(*root); + } + }; + +public: + /// + /// Recomputes the optimal submission/execution order then schedules all nodes + /// + std::list> compute_schedule() { + exec_order_recompute(); + std::list> sched; + for (auto &next : MExecOrder) { + sched.push_front(*next.second.get()); + } + return sched; + }; + /// Insert node into list of root nodes. /// @param Root Node to add to list of root nodes. void add_root(const std::shared_ptr &Root); From c511efb54a78ac2a711f060314100f26b2cd90db Mon Sep 17 00:00:00 2001 From: Dan Holmes Date: Mon, 22 May 2023 15:19:35 +0100 Subject: [PATCH 5/8] Adjust graph_impl (executable) in graph_impl.hpp --- sycl/source/detail/graph_impl.hpp | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index a31b7c9aba6c6..8056fe86b08d0 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -397,7 +397,9 @@ class exec_graph_impl { /// @param GraphImpl Modifiable graph implementation to create with. exec_graph_impl(sycl::context Context, const std::shared_ptr &GraphImpl) - : MSchedule(), MGraphImpl(GraphImpl), MPiCommandBuffers(), + : MSchedule(GraphImpl->compute_schedule()), + /* MGraphImpl(GraphImpl), */ + MPiCommandBuffers(), MPiSyncPoints(), MContext(Context) {} /// Destructor. @@ -405,8 +407,10 @@ class exec_graph_impl { /// Releases any PI command-buffers the object has created. ~exec_graph_impl(); - /// Add nodes to MSchedule. - void schedule(); + /* + // /// Add nodes to MSchedule. + // void schedule(); + */ /// Enqueues the backend objects for the graph to the parametrized queue. /// @param Queue Command-queue to submit backend objects to. @@ -467,9 +471,11 @@ class exec_graph_impl { /// Execution schedule of nodes in the graph. std::list> MSchedule; - /// Pointer to the modifiable graph impl associated with this executable - /// graph. - std::shared_ptr MGraphImpl; + /* + // /// Pointer to the modifiable graph impl associated with this executable + // /// graph. + // std::shared_ptr MGraphImpl; + */ /// Map of devices to command buffers. std::unordered_map MPiCommandBuffers; /// Map of nodes in the exec graph to the sync point representing their From b6c5fd923197a364fba56b6c89f9bcb3dbcd6f63 Mon Sep 17 00:00:00 2001 From: Dan Holmes Date: Mon, 22 May 2023 15:55:21 +0100 Subject: [PATCH 6/8] Removing commented code in graph_impl.cpp --- sycl/source/detail/graph_impl.cpp | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index d6e12c27ffb12..06d15e58638f6 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -73,16 +73,6 @@ bool check_for_arg(const sycl::detail::ArgDesc &Arg, } } // anonymous namespace -/* -// void exec_graph_impl::schedule() { -// if (MSchedule.empty()) { -// for (auto Node : MGraphImpl->MRoots) { -// Node->topology_sort(Node, MSchedule); -// } -// } -// } -*/ - std::shared_ptr graph_impl::add_subgraph_nodes( const std::list> &NodeList) { // Find all input and output nodes from the node list @@ -566,9 +556,6 @@ command_graph::command_graph( void command_graph::finalize_impl() { // Create PI command-buffers for each device in the finalized context - /* - // impl->schedule(); - */ auto Context = impl->get_context(); for (auto Device : impl->get_context().get_devices()) { From f859e416fb64f37f2f07dd7d81ba8f8da812ec81 Mon Sep 17 00:00:00 2001 From: Dan Holmes Date: Mon, 22 May 2023 15:56:49 +0100 Subject: [PATCH 7/8] Remove commented code in graph_impl.hpp --- sycl/source/detail/graph_impl.hpp | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 8056fe86b08d0..5ec35fecfc6ac 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -75,24 +75,6 @@ class node_impl { std::unique_ptr &&CommandGroup) : MCGType(CGType), MCommandGroup(std::move(CommandGroup)) {} - /* - // /// Recursively add nodes to execution stack. - // /// @param NodeImpl Node to schedule. - // /// @param Schedule Execution ordering to add node to. - // void topology_sort(std::shared_ptr NodeImpl, - // std::list> &Schedule) { - // for (auto Next : MSuccessors) { - // // Check if we've already scheduled this node - // if (std::find(Schedule.begin(), Schedule.end(), Next) == Schedule.end()) - // Next->topology_sort(Next, Schedule); - // } - // // We don't need to schedule empty nodes as they are only used when - // // calculating dependencies - // if (!NodeImpl->is_empty()) - // Schedule.push_front(NodeImpl); - // } - */ - private: /// Depth of this node in a containing graph /// @@ -398,7 +380,6 @@ class exec_graph_impl { exec_graph_impl(sycl::context Context, const std::shared_ptr &GraphImpl) : MSchedule(GraphImpl->compute_schedule()), - /* MGraphImpl(GraphImpl), */ MPiCommandBuffers(), MPiSyncPoints(), MContext(Context) {} @@ -407,11 +388,6 @@ class exec_graph_impl { /// Releases any PI command-buffers the object has created. ~exec_graph_impl(); - /* - // /// Add nodes to MSchedule. - // void schedule(); - */ - /// Enqueues the backend objects for the graph to the parametrized queue. /// @param Queue Command-queue to submit backend objects to. /// @return Event associated with enqueued object. @@ -471,11 +447,6 @@ class exec_graph_impl { /// Execution schedule of nodes in the graph. std::list> MSchedule; - /* - // /// Pointer to the modifiable graph impl associated with this executable - // /// graph. - // std::shared_ptr MGraphImpl; - */ /// Map of devices to command buffers. std::unordered_map MPiCommandBuffers; /// Map of nodes in the exec graph to the sync point representing their From 471a3451793515afd651303aed6f9637233eb4f1 Mon Sep 17 00:00:00 2001 From: Dan Holmes Date: Tue, 30 May 2023 10:49:34 +0100 Subject: [PATCH 8/8] Fixes from Ewan's code review --- sycl/source/detail/graph_impl.cpp | 9 +++++ sycl/source/detail/graph_impl.hpp | 67 ++++++++++--------------------- 2 files changed, 31 insertions(+), 45 deletions(-) diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 06d15e58638f6..5b35f82ea7754 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -106,6 +106,15 @@ exec_graph_impl::exec(const std::shared_ptr &Queue) { return RetEvent; } +std::list> graph_impl::compute_schedule() { + exec_order_recompute(); + std::list> Sched; + for (auto &Next : MExecOrder) { + Sched.push_back(Next.second); + } + return Sched; +}; + void graph_impl::add_root(const std::shared_ptr &Root) { MRoots.insert(Root); } diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 5ec35fecfc6ac..137bf1f5f09d8 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -76,23 +76,24 @@ class node_impl { : MCGType(CGType), MCommandGroup(std::move(CommandGroup)) {} private: - /// Depth of this node in a containing graph + /// Depth of this node in a containing graph. /// - /// The first call to graph.exec_order_recompute computes & caches the value + /// The first call to graph.exec_order_recompute computes & caches the value. /// It will likely become stale whenever the containing graph is changed and - /// a single value will be inequate if this node is added to multiple graphs - /// Caching is dangerous but recomputing takes O(graph_size) worst-case time + /// a single value will be inadequate if this node is added to multiple graphs. + /// Caching is dangerous but recomputing takes O(graph_size) worst-case time. std::optional MDepth; public: - int get_depth(node_impl &V) { return V.get_depth(); }; + /// Gets the depth of this node in its containing graph. + /// @return the depth of this node in its containing graph. int get_depth() { if (!MDepth.has_value()) { - int max_depth_found = -1; - for (auto P : MPredecessors) { - max_depth_found = std::max(max_depth_found, P.lock()->get_depth()); + int MaxDepthFound = -1; + for (auto &P : MPredecessors) { + MaxDepthFound = std::max(MaxDepthFound, P.lock()->get_depth()); } - MDepth = max_depth_found + 1; + MDepth = MaxDepthFound + 1; } return MDepth.value(); }; @@ -195,7 +196,7 @@ class node_impl { } }; -/// Class representing implementation details of command_graph. +/// Class representing implementation details of modifiable command_graph. class graph_impl { public: /// Constructor. @@ -206,24 +207,14 @@ class graph_impl { MEventsMap() {} private: - /// A cache of pointers to exit nodes + /// A sorted multimap capturing a breadth-first execution/submission order. /// - /// This is not used (yet), but depth computation starts from exit nodes - /// Perhaps, it might be better to do the exec_order_recompute traversal - /// starting from each exit node and working upwards using MPredecessors - /// rather than from each root node and doing depth-first to exit nodes? - std::vector MExitNodes; - - /// A sorted multimap capturing the optimal execution/submission order - /// - /// The SortKey is the depth in the graph for the node_impl in the value - /// Depth is the length of the longest dependence chain to any root node + /// The SortKey is the depth in the graph for the node_impl in the value. + /// Depth is the length of the longest dependence chain to any root node. std::multimap> MExecOrder; - /// - /// Depth-first recursion from V to build the optimal execution order - /// - /// Starting node for depth-first recursion + /// Depth-first recursion from V to build the execution order. + /// @param V Starting node for depth-first recursion. void exec_order_recompute(node_impl &V) { // depth-first recursion to access all nodes that succeed this node for (auto &S : V.MSuccessors) { @@ -231,36 +222,22 @@ class graph_impl { } // insert this into execution order based on its depth in the graph MExecOrder.insert(std::pair(V.get_depth(), &V)); - // cache all the exit nodes; no reason, just feels like a good idea - if (V.MSuccessors.empty()) { - MExitNodes.push_back(&V); - } }; - /// - /// Recomputes the optimal submission/execution order for this whole graph - /// + /// Recomputes the submission/execution order for this whole graph. void exec_order_recompute() { MExecOrder.clear(); // for all root nodes ... - for (auto &root : MRoots) { + for (auto &Root : MRoots) { // ... recurse towards all exit nodes - exec_order_recompute(*root); + exec_order_recompute(*Root); } }; public: - /// - /// Recomputes the optimal submission/execution order then schedules all nodes - /// - std::list> compute_schedule() { - exec_order_recompute(); - std::list> sched; - for (auto &next : MExecOrder) { - sched.push_front(*next.second.get()); - } - return sched; - }; + /// Recomputes the submission/execution order then schedules all nodes. + /// @return A list of shared pointers to nodes in linear scheduling order. + std::list> compute_schedule(); /// Insert node into list of root nodes. /// @param Root Node to add to list of root nodes.