diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp
index e3e88f1aa581c..594a173862f40 100644
--- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp
+++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp
@@ -146,10 +146,8 @@ ur_result_t ur_queue_immediate_in_order_t::queueGetNativeHandle(
   return UR_RESULT_SUCCESS;
 }
 
-ur_result_t ur_queue_immediate_in_order_t::queueFinish() {
-  TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::queueFinish");
-
-  auto commandListLocked = commandListManager.lock();
+ur_result_t ur_queue_immediate_in_order_t::synchronize(
+    locked<ur_command_list_manager> &commandListLocked) {
   // TODO: use zeEventHostSynchronize instead?
   TRACK_SCOPE_LATENCY(
       "ur_queue_immediate_in_order_t::zeCommandListHostSynchronize");
@@ -165,8 +163,27 @@ ur_result_t ur_queue_immediate_in_order_t::queueFinish() {
   return UR_RESULT_SUCCESS;
 }
 
+ur_result_t ur_queue_immediate_in_order_t::queueFinish() {
+  TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::queueFinish");
+
+  auto commandListLocked = commandListManager.lock();
+  return synchronize(commandListLocked);
+}
+
+// In order to avoid tracking individual events for each kernel submission on
+// the queue, the adapter simply keeps a vector of the handles of all
+// submitted kernels, pruning it at queue synchronization (urQueueFinish()),
+// knowing that all previously enqueued kernels have finished by then.
+// However, some applications might never explicitly synchronize the queue,
+// in which case the vector could grow unbounded. To prevent that, we cap its
+// size and forcibly synchronize the queue once it exceeds the limit.
+#define MAX_QUEUE_SUBMITTED_KERNELS 1024
+
 void ur_queue_immediate_in_order_t::recordSubmittedKernel(
-    ur_kernel_handle_t hKernel) {
+    locked<ur_command_list_manager> &commandList, ur_kernel_handle_t hKernel) {
+  if (submittedKernels.size() > MAX_QUEUE_SUBMITTED_KERNELS) {
+    synchronize(commandList);
+  }
   submittedKernels.push_back(hKernel);
   hKernel->RefCount.increment();
 }
@@ -195,7 +212,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueKernelLaunch(
       hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
       numEventsInWaitList, phEventWaitList, phEvent));
 
-  recordSubmittedKernel(hKernel);
+  recordSubmittedKernel(commandListLocked, hKernel);
 
   return UR_RESULT_SUCCESS;
 }
@@ -847,7 +864,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueCooperativeKernelLaunchExp(
       &zeThreadGroupDimensions, zeSignalEvent, waitListView.num,
       waitListView.handles));
 
-  recordSubmittedKernel(hKernel);
+  recordSubmittedKernel(commandListLocked, hKernel);
 
   return UR_RESULT_SUCCESS;
 }
diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp
index fb7ed9a9b43e9..da0b217f1e1b1 100644
--- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp
+++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp
@@ -22,6 +22,7 @@
 
 #include "command_list_manager.hpp"
 #include "lockable.hpp"
+#include "ur_api.h"
 
 namespace v2 {
 
@@ -62,7 +63,10 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_t_ {
                                   const ur_event_handle_t *phEventWaitList,
                                   ur_event_handle_t *phEvent);
 
-  void recordSubmittedKernel(ur_kernel_handle_t hKernel);
+  void recordSubmittedKernel(locked<ur_command_list_manager> &commandList,
+                             ur_kernel_handle_t hKernel);
+
+  ur_result_t synchronize(locked<ur_command_list_manager> &commandList);
 
 public:
   ur_queue_immediate_in_order_t(ur_context_handle_t, ur_device_handle_t,
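
For readers skimming the patch, here is a minimal standalone sketch of the cap-and-synchronize pattern the diff implements. Kernel, Queue, retain()/release(), waitForAll(), and MaxSubmittedKernels are hypothetical stand-ins for the adapter's real pieces (ur_kernel_handle_t, the locked command-list manager, RefCount, zeCommandListHostSynchronize, and MAX_QUEUE_SUBMITTED_KERNELS), not actual unified-runtime API:

#include <cstddef>
#include <vector>

// Hypothetical stand-in for ur_kernel_handle_t with its reference count.
struct Kernel {
  int refCount = 1;
  void retain() { ++refCount; }
  void release() { --refCount; }
};

class Queue {
  static constexpr std::size_t MaxSubmittedKernels = 1024;
  std::vector<Kernel *> submittedKernels;

  // Stand-in for zeCommandListHostSynchronize: blocks until the device has
  // drained everything previously enqueued on this in-order queue.
  void waitForAll() {}

public:
  // Queue-wide synchronization: once the device is idle, every previously
  // submitted kernel has finished, so all retained handles can be dropped.
  void synchronize() {
    waitForAll();
    for (Kernel *k : submittedKernels)
      k->release();
    submittedKernels.clear();
  }

  // Called on every submission: retain the kernel so it stays alive while it
  // may still be executing, and forcibly drain the queue once the vector
  // exceeds the cap, so an application that never synchronizes the queue
  // cannot grow it without bound.
  void recordSubmittedKernel(Kernel *k) {
    if (submittedKernels.size() > MaxSubmittedKernels)
      synchronize();
    k->retain();
    submittedKernels.push_back(k);
  }
};

The cap trades an occasional full-queue stall for bounded memory: without it, every kernel handle submitted between explicit urQueueFinish() calls would stay retained indefinitely, which is exactly the unbounded growth the comment in the patch describes.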