Commit 452e7ae

[SYCL][Graphs] Remove blocking wait from graph enqueue.
The current approach uses the same L0 PI event for all submissions of the same graph, and it doesn't use the wait events to enforce dependencies on the command-list submission. By handling both of these in the L0 adapter, we can remove the blocking queue wait from our graph submission code in the runtime. Closes issue #139
1 parent 583fb91 commit 452e7ae
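
For context, the user-visible effect can be sketched as follows. This is a minimal illustration, assuming the experimental graph API used by the test added in this commit (`command_graph`, `finalize()`, `handler::ext_oneapi_graph`); it is not code from the commit itself. After this change, waiting on the event returned from a graph submission is sufficient, and the runtime no longer issues a blocking queue wait on the submitter's behalf.

// Sketch only: a host program submitting a finalized graph and waiting on
// the returned event rather than on the whole queue.
#include <sycl/sycl.hpp>

namespace exp_ext = sycl::ext::oneapi::experimental;

int main() {
  sycl::queue Queue{sycl::gpu_selector_v};
  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};

  Graph.add([&](sycl::handler &CGH) {
    CGH.single_task([]() { /* work recorded into the graph */ });
  });

  auto ExecGraph = Graph.finalize();

  // The returned event now tracks completion of this particular submission;
  // previously the runtime called Queue.wait() here as a workaround.
  sycl::event E = Queue.submit(
      [&](sycl::handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); });
  E.wait();
  return 0;
}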

7 files changed: +329 −50 lines


sycl/plugins/level_zero/pi_level_zero.cpp

+138-30
@@ -8829,7 +8829,64 @@ pi_result _pi_buffer::free() {
   return PI_SUCCESS;
 }
 
-/// command-buffer Extension
+/* Command-buffer Extension
+
+   The PI interface for submitting a PI command-buffer takes a list
+   of events to wait on, and an event representing the completion of
+   that particular submission of the command-buffer.
+
+   However, in `zeCommandQueueExecuteCommandLists` there are no parameters to
+   take a waitlist and also the only sync primitive returned is to block on
+   host.
+
+   In order to get the PI command-buffer enqueue semantics we want with L0
+   this adapter adds extra commands to the L0 command-list representing a
+   PI command-buffer.
+
+   Prefix - Commands added to the start of the L0 command-list by L0 adapter.
+   Suffix - Commands added to the end of the L0 command-list by L0 adapter.
+
+   These extra commands operate on L0 event synchronisation primitives used by
+   the command-list to interact with the external PI wait-list and PI return
+   event required for the enqueue interface.
+
+   The `pi_ext_command_buffer` class for this adapter contains a SignalEvent
+   which signals the completion of the command-list in the suffix, and
+   is reset in the prefix. This signal is detected by a new PI return
+   event created on PI command-buffer enqueue.
+
+   There is also a WaitEvent used by the `pi_ext_command_buffer` class
+   in the prefix to wait on any dependencies passed in the enqueue wait-list.
+
+   ┌──────────┬────────────────────────────────────────────────┬─────────┐
+   │ Prefix   │ Commands added to PI command-buffer by PI user │ Suffix  │
+   └──────────┴────────────────────────────────────────────────┴─────────┘
+
+            ┌───────────────────┬──────────────────────────────┐
+   Prefix   │Reset signal event │ Barrier waiting on wait event│
+            └───────────────────┴──────────────────────────────┘
+
+            ┌─────────────────────────────────────────┐
+   Suffix   │Signal the PI command-buffer signal event│
+            └─────────────────────────────────────────┘
+
+   For a call to `piextEnqueueCommandBuffer` with an event_list `EL`,
+   command-buffer `CB`, and return event `RE` our implementation has to create
+   and submit two new command-lists for the above approach to work. One before
+   the command-list with extra commands associated with `CB`, and the other
+   after `CB`.
+
+   Command-list created on `piextEnqueueCommandBuffer` to execute before `CB`:
+   ┌───────────────────────────────────────────────────────────┐
+   │Barrier on `EL` that signals `CB` WaitEvent when completed │
+   └───────────────────────────────────────────────────────────┘
+
+   Command-list created on `piextEnqueueCommandBuffer` to execute after `CB`:
+   ┌─────────────────────────────────────────────────────────────┐
+   │Barrier on `CB` SignalEvent that signals `RE` when completed │
+   └─────────────────────────────────────────────────────────────┘
+*/
 
 /// Helper function to take a list of pi_ext_sync_points and fill the provided
 /// vector with the associated ZeEvents
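
To make the approach described in the comment above concrete, the per-command-buffer prefix/suffix and the two per-enqueue helper command-lists can be sketched in raw Level Zero calls. This is illustrative only: the handle names and helper functions below are placeholders (the adapter creates and tracks these objects itself), error handling is omitted, and the real adapter submits each list through `Queue->executeCommandList` rather than a single `zeCommandQueueExecuteCommandLists` call.

// Sketch, not adapter code: assumes the command lists and events already
// exist. In the adapter this logic lives in piextCommandBufferCreate,
// piextCommandBufferFinalize and piextEnqueueCommandBuffer below.
#include <level_zero/ze_api.h>

// Recorded once, at command-buffer create (prefix) and finalize (suffix).
void recordPrefixSuffix(ze_command_list_handle_t CbCmdList,
                        ze_event_handle_t SignalEvent,
                        ze_event_handle_t WaitEvent) {
  // Prefix: reset the signal event, then block the list on the wait event.
  zeCommandListAppendEventReset(CbCmdList, SignalEvent);
  zeCommandListAppendBarrier(CbCmdList, nullptr, 1, &WaitEvent);
  // ... commands appended to the command-buffer by the PI user go here ...
  // Suffix: signal that this execution of the command-buffer has finished.
  zeCommandListAppendSignalEvent(CbCmdList, SignalEvent);
  zeCommandListClose(CbCmdList);
}

// Performed on every enqueue of the command-buffer.
void enqueueOnce(ze_command_queue_handle_t ZeQueue,
                 ze_command_list_handle_t WaitCmdList,
                 ze_command_list_handle_t CbCmdList,
                 ze_command_list_handle_t SignalCmdList,
                 ze_event_handle_t WaitEvent, ze_event_handle_t SignalEvent,
                 ze_event_handle_t RetEvent, uint32_t NumWaitEvents,
                 ze_event_handle_t *ZeEventWaitList) {
  // Before CB: barrier on the enqueue wait-list that signals WaitEvent.
  // (The adapter appends a plain signal instead when the wait-list is empty.)
  zeCommandListAppendBarrier(WaitCmdList, WaitEvent, NumWaitEvents,
                             ZeEventWaitList);
  zeCommandListClose(WaitCmdList);
  // After CB: barrier on SignalEvent that signals the PI return event.
  zeCommandListAppendBarrier(SignalCmdList, RetEvent, 1, &SignalEvent);
  zeCommandListClose(SignalCmdList);
  // Submit all three asynchronously; no host-side blocking wait is needed.
  ze_command_list_handle_t Lists[] = {WaitCmdList, CbCmdList, SignalCmdList};
  zeCommandQueueExecuteCommandLists(ZeQueue, 3, Lists, nullptr);
}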
@@ -8872,6 +8929,19 @@ pi_result piextCommandBufferCreate(pi_context Context, pi_device Device,
   } catch (...) {
     return PI_ERROR_UNKNOWN;
   }
+
+  // Create signal & wait events to be used in the command-list for sync
+  // on command-buffer enqueue.
+  auto CommandBuffer = *RetCommandBuffer;
+  PI_CALL(EventCreate(Context, nullptr, true, &CommandBuffer->SignalEvent));
+  PI_CALL(EventCreate(Context, nullptr, false, &CommandBuffer->WaitEvent));
+
+  // Add prefix commands
+  ZE_CALL(zeCommandListAppendEventReset,
+          (ZeCommandList, CommandBuffer->SignalEvent->ZeEvent));
+  ZE_CALL(zeCommandListAppendBarrier,
+          (ZeCommandList, nullptr, 1, &CommandBuffer->WaitEvent->ZeEvent));
+
   return PI_SUCCESS;
 }

@@ -8891,13 +8961,10 @@ pi_result piextCommandBufferRelease(pi_ext_command_buffer CommandBuffer) {
 }
 
 pi_result piextCommandBufferFinalize(pi_ext_command_buffer CommandBuffer) {
-  // We need to append some signal that will indicate that command-buffer has
+  // We need to append a signal that will indicate that the command-buffer has
   // finished executing.
-  EventCreate(CommandBuffer->Context, nullptr, true,
-              &CommandBuffer->ExecutionEvent);
-  ZE_CALL(
-      zeCommandListAppendSignalEvent,
-      (CommandBuffer->ZeCommandList, CommandBuffer->ExecutionEvent->ZeEvent));
+  ZE_CALL(zeCommandListAppendSignalEvent,
+          (CommandBuffer->ZeCommandList, CommandBuffer->SignalEvent->ZeEvent));
   // Close the command list and have it ready for dispatch.
   ZE_CALL(zeCommandListClose, (CommandBuffer->ZeCommandList));
   return PI_SUCCESS;
@@ -9026,17 +9093,11 @@ pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer,
                                     pi_uint32 NumEventsInWaitList,
                                     const pi_event *EventWaitList,
                                     pi_event *Event) {
-
-  // Execute command list asynchronously, as the event will be used
-  // to track down its completion.
-
-  uint32_t QueueGroupOrdinal;
-  // TODO: Revisit forcing compute engine
-  auto UseCopyEngine = false;
+  // Use compute engine rather than copy engine
+  const auto UseCopyEngine = false;
   auto &QGroup = Queue->getQueueGroup(UseCopyEngine);
-  auto &ZeCommandQueue =
-      // ForcedCmdQueue ? *ForcedCmdQueue :
-      QGroup.getZeQueue(&QueueGroupOrdinal);
+  uint32_t QueueGroupOrdinal;
+  auto &ZeCommandQueue = QGroup.getZeQueue(&QueueGroupOrdinal);
 
   ze_fence_handle_t ZeFence;
   ZeStruct<ze_fence_desc_t> ZeFenceDesc;
@@ -9050,25 +9111,69 @@ pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer,
       CommandBuffer->ZeCommandList,
       {ZeFence, false, false, ZeCommandQueue, QueueGroupOrdinal}));
 
-  Queue->insertActiveBarriers(CommandListPtr, UseCopyEngine);
-
+  // Previous execution will have closed the command list, we need to reopen
+  // it otherwise calling `executeCommandList` will return early.
+  CommandListPtr->second.IsClosed = false;
   CommandListPtr->second.ZeFenceInUse = true;
 
-  // Return the command-buffer's execution event as the user visible pi_event
-  *Event = CommandBuffer->ExecutionEvent;
-  (*Event)->Queue = Queue;
-  (*Event)->RefCount.increment();
-  Queue->RefCount.increment();
+  // Create a command-list to execute before `CommandListPtr` that will signal
+  // when the `EventWaitList` dependencies are complete.
+  pi_command_list_ptr_t WaitCommandList{};
+  if (NumEventsInWaitList) {
+    _pi_ze_event_list_t TmpWaitList;
+    if (auto Res = TmpWaitList.createAndRetainPiZeEventList(
+            NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine))
+      return Res;
 
-  PI_CALL(piEventRetain(*Event));
+    if (auto Res = Queue->Context->getAvailableCommandList(
+            Queue, WaitCommandList, false, false))
+      return Res;
 
-  // Previous execution will have closed the command list so we need to reopen
-  // it.
-  CommandListPtr->second.IsClosed = false;
+    ZE_CALL(zeCommandListAppendBarrier,
+            (WaitCommandList->first, CommandBuffer->WaitEvent->ZeEvent,
+             NumEventsInWaitList, TmpWaitList.ZeEventList));
+  } else {
+    if (auto Res = Queue->Context->getAvailableCommandList(
+            Queue, WaitCommandList, false, false))
+      return Res;
+
+    ZE_CALL(zeCommandListAppendSignalEvent,
+            (WaitCommandList->first, CommandBuffer->WaitEvent->ZeEvent));
+  }
+
+  // Execution event for this enqueue of the PI command-buffer
+  pi_event RetEvent{};
+  // Create a command-list to signal RetEvent on completion
+  pi_command_list_ptr_t SignalCommandList{};
+  if (Event) {
+    if (auto Res = Queue->Context->getAvailableCommandList(
+            Queue, SignalCommandList, false, false))
+      return Res;
+
+    if (auto Res = createEventAndAssociateQueue(
+            Queue, &RetEvent, PI_COMMAND_TYPE_EXT_COMMAND_BUFFER,
+            SignalCommandList, false))
+      return Res;
+
+    ZE_CALL(zeCommandListAppendBarrier,
+            (SignalCommandList->first, RetEvent->ZeEvent, 1,
+             &(CommandBuffer->SignalEvent->ZeEvent)));
+  }
+
+  // Execute our command-lists asynchronously
+  if (auto Res = Queue->executeCommandList(WaitCommandList, false, false))
+    return Res;
 
   if (auto Res = Queue->executeCommandList(CommandListPtr, false, false))
     return Res;
 
+  if (auto Res = Queue->executeCommandList(SignalCommandList, false, false))
+    return Res;
+
+  if (Event) {
+    *Event = RetEvent;
+  }
+
   return PI_SUCCESS;
 }

@@ -9088,8 +9193,11 @@ _pi_ext_command_buffer::~_pi_ext_command_buffer() {
   if (ZeCommandList) {
    ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandList));
  }
-  if (ExecutionEvent) {
-    ExecutionEvent->RefCount.decrementAndTest();
+  if (SignalEvent) {
+    SignalEvent->RefCount.decrementAndTest();
+  }
+  if (WaitEvent) {
+    WaitEvent->RefCount.decrementAndTest();
   }
   Context->RefCount.decrementAndTest();
 }

sycl/plugins/level_zero/pi_level_zero.hpp

+6-2
@@ -1374,8 +1374,12 @@ struct _pi_ext_command_buffer : _ur_object {
   // Command list map so we can use queue::executeCommandList, TODO: Remove in
   // future if possible
   pi_command_list_map_t CommandListMap;
-  // Event which will signal the execution of the command-buffer has finished
-  pi_event ExecutionEvent;
+  // Event which signals that the most recent execution of the command-buffer
+  // has finished
+  pi_event SignalEvent = nullptr;
+  // Event which a command-buffer waits on until the wait-list dependencies
+  // passed to a command-buffer enqueue have been satisfied.
+  pi_event WaitEvent = nullptr;
 };
 
 #endif // PI_LEVEL_ZERO_HPP

sycl/source/detail/graph_impl.cpp

-10
@@ -104,16 +104,6 @@ std::shared_ptr<node_impl> graph_impl::add_subgraph_nodes(
   return this->add(Outputs);
 }
 
-sycl::event
-exec_graph_impl::exec(const std::shared_ptr<sycl::detail::queue_impl> &Queue) {
-  sycl::event RetEvent = enqueue(Queue);
-  // TODO: Remove this queue wait. Currently waiting on the event returned from
-  // graph execution does not work.
-  Queue->wait();
-
-  return RetEvent;
-}
-
 void graph_impl::add_root(const std::shared_ptr<node_impl> &Root) {
   MRoots.insert(Root);
 }

sycl/source/detail/graph_impl.hpp

+2-7
@@ -325,16 +325,11 @@ class exec_graph_impl {
   /// Add nodes to MSchedule.
   void schedule();
 
-  /// Enqueues the backend objects for the graph to the parametrized queue.
-  /// @param Queue Command-queue to submit backend objects to.
-  /// @return Event associated with enqueued object.
-  sycl::event enqueue(const std::shared_ptr<sycl::detail::queue_impl> &Queue);
-
   /// Called by handler::ext_oneapi_command_graph() to schedule graph for
   /// execution.
   /// @param Queue Command-queue to schedule execution on.
-  /// @return Event associated with the execution of the graph
-  sycl::event exec(const std::shared_ptr<sycl::detail::queue_impl> &Queue);
+  /// @return Event associated with the execution of the graph.
+  sycl::event enqueue(const std::shared_ptr<sycl::detail::queue_impl> &Queue);
 
   /// Turns our internal graph representation into PI command-buffers for a
   /// device.

sycl/source/handler.cpp

+1-1
@@ -399,7 +399,7 @@ event handler::finalize() {
   // If we have a subgraph node we don't want to actually execute this command
   // graph submission.
   if (!MSubgraphNode) {
-    event GraphCompletionEvent = MExecGraph->exec(MQueue);
+    event GraphCompletionEvent = MExecGraph->enqueue(MQueue);
     MLastEvent = GraphCompletionEvent;
     return MLastEvent;
   }
@@ -0,0 +1,100 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+// Tests that buffer accessors exhibit the correct behaviour when:
+// * A node is added to the graph between two queue submissions which
+//   use the same buffer, but are not added to the graph.
+//
+// * A queue submission using the same buffer is made after finalization
+//   of the graph, but before graph execution.
+//
+// * The graph is submitted for execution twice separated by a queue
+//   submission using the same buffer, this should respect dependencies and
+//   create the correct ordering.
+
+#include "../graph_common.hpp"
+int main() {
+
+  queue Queue{gpu_selector_v};
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  const size_t N = 10;
+  std::vector<float> Arr(N, 0.0f);
+
+  buffer<float> Buf{N};
+  Buf.set_write_back(false);
+
+  // Buffer elements set to 0.5
+  Queue.submit([&](handler &CGH) {
+    auto Acc = Buf.get_access(CGH);
+    CGH.parallel_for(range<1>{N}, [=](id<1> idx) {
+      size_t i = idx;
+      Acc[i] = 0.5f;
+    });
+  });
+
+  Graph.add([&](handler &CGH) {
+    auto Acc = Buf.get_access(CGH);
+    CGH.parallel_for(range<1>{N}, [=](id<1> idx) {
+      size_t i = idx;
+      Acc[i] += 0.25f;
+    });
+  });
+
+  for (size_t i = 0; i < N; i++) {
+    assert(Arr[i] == 0.0f);
+  }
+
+  // Buffer elements set to 1.5
+  Queue.submit([&](handler &CGH) {
+    auto Acc = Buf.get_access(CGH);
+    CGH.parallel_for(range<1>{N}, [=](id<1> idx) {
+      size_t i = idx;
+      Acc[i] += 1.0f;
+    });
+  });
+
+  auto ExecGraph = Graph.finalize();
+
+  for (size_t i = 0; i < N; i++) {
+    assert(Arr[i] == 0.0f);
+  }
+
+  // Buffer elements set to 3.0
+  Queue.submit([&](handler &CGH) {
+    auto Acc = Buf.get_access(CGH);
+    CGH.parallel_for(range<1>{N}, [=](id<1> idx) {
+      size_t i = idx;
+      Acc[i] *= 2.0f;
+    });
+  });
+
+  // Buffer elements set to 3.25
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); });
+
+  // Buffer elements set to 6.5
+  Queue.submit([&](handler &CGH) {
+    auto Acc = Buf.get_access(CGH);
+    CGH.parallel_for(range<1>{N}, [=](id<1> idx) {
+      size_t i = idx;
+      Acc[i] *= 2.0f;
+    });
+  });
+
+  // Buffer elements set to 6.75
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); });
+
+  Queue.submit([&](handler &CGH) {
+    auto Acc = Buf.get_access(CGH);
+    CGH.copy(Acc, Arr.data());
+  });
+  Queue.wait();
+
+  for (size_t i = 0; i < N; i++) {
+    assert(Arr[i] == 6.75f);
+  }
+
+  return 0;
+}
