Skip to content

Commit 4ac080d

Browse files
committed
Add async allocations to L0 adapter v2
1 parent 7d063d2 commit 4ac080d

File tree

11 files changed

+244
-48
lines changed

11 files changed

+244
-48
lines changed

unified-runtime/source/adapters/level_zero/CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@ if(UR_BUILD_ADAPTER_L0_V2)
134134
${CMAKE_CURRENT_SOURCE_DIR}/adapter.cpp
135135
${CMAKE_CURRENT_SOURCE_DIR}/common.cpp
136136
${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
137+
${CMAKE_CURRENT_SOURCE_DIR}/enqueued_pool.cpp
137138
${CMAKE_CURRENT_SOURCE_DIR}/ur_interface_loader.cpp
138139
${CMAKE_CURRENT_SOURCE_DIR}/platform.cpp
139140
${CMAKE_CURRENT_SOURCE_DIR}/physical_mem.cpp

unified-runtime/source/adapters/level_zero/enqueued_pool.cpp

+5-6
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,7 @@
1616
EnqueuedPool::~EnqueuedPool() { cleanup(); }
1717

1818
std::optional<EnqueuedPool::Allocation>
19-
EnqueuedPool::getBestFit(size_t Size, size_t Alignment,
20-
ur_queue_handle_t Queue) {
19+
EnqueuedPool::getBestFit(size_t Size, size_t Alignment, void *Queue) {
2120
auto Lock = std::lock_guard(Mutex);
2221

2322
Allocation Alloc = {nullptr, Size, nullptr, Queue, Alignment};
@@ -47,7 +46,7 @@ EnqueuedPool::getBestFit(size_t Size, size_t Alignment,
4746
}
4847

4948
void EnqueuedPool::insert(void *Ptr, size_t Size, ur_event_handle_t Event,
50-
ur_queue_handle_t Queue) {
49+
void *Queue) {
5150
auto Lock = std::lock_guard(Mutex);
5251

5352
uintptr_t Address = (uintptr_t)Ptr;
@@ -67,14 +66,14 @@ bool EnqueuedPool::cleanup() {
6766
auto umfRet [[maybe_unused]] = umfPoolFree(hPool, It.Ptr);
6867
assert(umfRet == UMF_RESULT_SUCCESS);
6968

70-
urEventReleaseInternal(It.Event);
69+
eventRelease(It.Event);
7170
}
7271
Freelist.clear();
7372

7473
return FreedAllocations;
7574
}
7675

77-
bool EnqueuedPool::cleanupForQueue(ur_queue_handle_t Queue) {
76+
bool EnqueuedPool::cleanupForQueue(void *Queue) {
7877
auto Lock = std::lock_guard(Mutex);
7978

8079
Allocation Alloc = {nullptr, 0, nullptr, Queue, 0};
@@ -90,7 +89,7 @@ bool EnqueuedPool::cleanupForQueue(ur_queue_handle_t Queue) {
9089
auto umfRet [[maybe_unused]] = umfPoolFree(hPool, It->Ptr);
9190
assert(umfRet == UMF_RESULT_SUCCESS);
9291

93-
urEventReleaseInternal(It->Event);
92+
eventRelease(It->Event);
9493

9594
// Erase the current allocation and move to the next one
9695
It = Freelist.erase(It);

unified-runtime/source/adapters/level_zero/enqueued_pool.hpp

+11-5
Original file line numberDiff line numberDiff line change
@@ -22,17 +22,23 @@ class EnqueuedPool {
2222
void *Ptr;
2323
size_t Size;
2424
ur_event_handle_t Event;
25-
ur_queue_handle_t Queue;
25+
// Queue handle, used as an identifier for the associated queue.
26+
// This can either be a `ur_queue_handle_t` or a pointer to a v2 queue
27+
// object.
28+
void *Queue;
2629
size_t Alignment;
2730
};
2831

32+
EnqueuedPool(ur_result_t (*eventRelease)(ur_event_handle_t))
33+
: eventRelease(eventRelease) {}
34+
2935
~EnqueuedPool();
3036
std::optional<Allocation> getBestFit(size_t Size, size_t Alignment,
31-
ur_queue_handle_t Queue);
32-
void insert(void *Ptr, size_t Size, ur_event_handle_t Event,
33-
ur_queue_handle_t Queue);
37+
void *Queue);
38+
void insert(void *Ptr, size_t Size, ur_event_handle_t Event, void *Queue);
3439
bool cleanup();
35-
bool cleanupForQueue(ur_queue_handle_t Queue);
40+
bool cleanupForQueue(void *Queue);
41+
ur_result_t (*eventRelease)(ur_event_handle_t);
3642

3743
private:
3844
struct Comparator {

unified-runtime/source/adapters/level_zero/usm.hpp

+5-1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include "common.hpp"
1313

1414
#include "enqueued_pool.hpp"
15+
#include "event.hpp"
1516
#include "ur_api.h"
1617
#include "ur_pool_manager.hpp"
1718
#include <set>
@@ -20,7 +21,10 @@
2021
usm::DisjointPoolAllConfigs InitializeDisjointPoolConfig();
2122

2223
// Pairs a UMF pool handle (UmfPool) with an EnqueuedPool (AsyncPool).
// In the added (+) constructor, AsyncPool is built with an event-release
// callback that forwards to urEventReleaseInternal; the removed (-) constructor
// only initialised UmfPool.
struct UsmPool {
23-
UsmPool(umf::pool_unique_handle_t Pool) : UmfPool(std::move(Pool)) {}
24+
UsmPool(umf::pool_unique_handle_t Pool)
25+
: UmfPool(std::move(Pool)), AsyncPool([](ur_event_handle_t Event) {
26+
return urEventReleaseInternal(Event);
27+
}) {}
2428
umf::pool_unique_handle_t UmfPool;
2529
EnqueuedPool AsyncPool;
2630
};

unified-runtime/source/adapters/level_zero/v2/context.cpp

+8-1
Original file line numberDiff line numberDiff line change
@@ -67,14 +67,17 @@ ur_context_handle_t_::ur_context_handle_t_(ze_context_handle_t hContext,
6767
v2::EVENT_FLAGS_PROFILING_ENABLED)),
6868
p2pAccessDevices(populateP2PDevices(
6969
phDevices[0]->Platform->getNumDevices(), this->hDevices)),
70-
defaultUSMPool(this, nullptr) {}
70+
defaultUSMPool(this, nullptr),
71+
asyncPool(this, nullptr) {}
7172

7273
// Increments the context's reference count and always reports success.
ur_result_t ur_context_handle_t_::retain() {
7374
RefCount.increment();
7475
return UR_RESULT_SUCCESS;
7576
}
7677

7778
ur_result_t ur_context_handle_t_::release() {
79+
asyncPool.cleanupPools();
80+
7881
if (!RefCount.decrementAndTest())
7982
return UR_RESULT_SUCCESS;
8083

@@ -104,6 +107,10 @@ ur_usm_pool_handle_t ur_context_handle_t_::getDefaultUSMPool() {
104107
return &defaultUSMPool;
105108
}
106109

110+
// Returns a pointer to the context-owned `asyncPool` member; the queue code
// uses it as the fallback pool when an enqueued USM allocation is requested
// without an explicit pool.
ur_usm_pool_handle_t ur_context_handle_t_::getAsyncPool() {
111+
return &asyncPool;
112+
}
113+
107114
const std::vector<ur_device_handle_t> &
108115
ur_context_handle_t_::getP2PDevices(ur_device_handle_t hDevice) const {
109116
return p2pAccessDevices[hDevice->Id.value()];

unified-runtime/source/adapters/level_zero/v2/context.hpp

+3
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ struct ur_context_handle_t_ : ur_object {
2929

3030
const std::vector<ur_device_handle_t> &getDevices() const;
3131
ur_usm_pool_handle_t getDefaultUSMPool();
32+
ur_usm_pool_handle_t getAsyncPool();
3233

3334
const std::vector<ur_device_handle_t> &
3435
getP2PDevices(ur_device_handle_t hDevice) const;
@@ -55,4 +56,6 @@ struct ur_context_handle_t_ : ur_object {
5556
const std::vector<std::vector<ur_device_handle_t>> p2pAccessDevices;
5657

5758
ur_usm_pool_handle_t_ defaultUSMPool;
59+
60+
ur_usm_pool_handle_t_ asyncPool;
5861
};

unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp

+129-16
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,8 @@ ur_result_t ur_queue_immediate_in_order_t::queueGetNativeHandle(
148148
ur_result_t ur_queue_immediate_in_order_t::queueFinish() {
149149
TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::queueFinish");
150150

151+
hContext->getAsyncPool()->cleanupPoolsForQueue(this);
152+
151153
auto commandListLocked = commandListManager.lock();
152154
// TODO: use zeEventHostSynchronize instead?
153155
TRACK_SCOPE_LATENCY(
@@ -701,31 +703,142 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueWriteHostPipe(
701703
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
702704
}
703705

706+
/// Shared implementation behind enqueueUSM{Device,Shared,Host}AllocExp.
///
/// @param pPool               Pool to allocate from; when null the context's
///                            async pool (hContext->getAsyncPool()) is used.
/// @param size                Number of bytes requested.
/// @param numEventsInWaitList Number of entries in phEventWaitList.
/// @param phEventWaitList     Events the enqueued allocation has to wait for.
/// @param ppMem               Receives the allocated pointer.
/// @param phEvent             Optional out-event, forwarded to getSignalEvent.
/// @param type                USM kind; selects the command type and whether a
///                            device is passed to the pool (none for host).
/// @return UR_RESULT_SUCCESS, the error reported by pPool->allocate(), or the
///         translated Level Zero error of a failed append.
/// @throws UR_RESULT_ERROR_UNKNOWN for a USM type other than host, device or
///         shared.
ur_result_t ur_queue_immediate_in_order_t::enqueueUSMAllocHelper(
    ur_usm_pool_handle_t pPool, const size_t size,
    const ur_exp_async_usm_alloc_properties_t *, uint32_t numEventsInWaitList,
    const ur_event_handle_t *phEventWaitList, void **ppMem,
    ur_event_handle_t *phEvent, ur_usm_type_t type) {
  // Resolve the command type up front: an unsupported USM type must be
  // rejected before any memory is taken from the pool, otherwise the
  // allocation made below would be leaked by the throw.
  ur_command_t commandType = UR_COMMAND_FORCE_UINT32;
  switch (type) {
  case UR_USM_TYPE_HOST:
    commandType = UR_COMMAND_ENQUEUE_USM_HOST_ALLOC_EXP;
    break;
  case UR_USM_TYPE_DEVICE:
    commandType = UR_COMMAND_ENQUEUE_USM_DEVICE_ALLOC_EXP;
    break;
  case UR_USM_TYPE_SHARED:
    commandType = UR_COMMAND_ENQUEUE_USM_SHARED_ALLOC_EXP;
    break;
  default:
    logger::error("enqueueUSMAllocHelper: unsupported USM type");
    throw UR_RESULT_ERROR_UNKNOWN;
  }

  auto commandListLocked = commandListManager.lock();

  if (!pPool) {
    pPool = hContext->getAsyncPool();
  }

  // Host allocations are not tied to a device.
  auto device = (type == UR_USM_TYPE_HOST) ? nullptr : hDevice;

  // Event attached to a block handed back by allocateEnqueued(); stays null
  // when the memory comes from a regular allocation instead.
  ur_event_handle_t originAllocEvent = nullptr;
  if (auto asyncAlloc = pPool->allocateEnqueued(hContext, this, device,
                                                nullptr, type, size)) {
    std::tie(*ppMem, originAllocEvent) = *asyncAlloc;
  } else {
    auto Ret = pPool->allocate(hContext, device, nullptr, type, size, ppMem);
    if (Ret != UR_RESULT_SUCCESS) {
      return Ret;
    }
  }

  // Build the wait list exactly once, adding the pooled block's event (if
  // any) as the additional dependency.
  auto [pWaitEvents, numWaitEvents] =
      getWaitListView(commandListLocked, phEventWaitList, numEventsInWaitList,
                      originAllocEvent);
  auto zeSignalEvent = getSignalEvent(commandListLocked, phEvent, commandType);

  // NOTE(review): if either append below fails, *ppMem is not handed back to
  // the pool — confirm whether callers are expected to free it.
  if (numWaitEvents > 0) {
    ZE2UR_CALL(
        zeCommandListAppendWaitOnEvents,
        (commandListLocked->getZeCommandList(), numWaitEvents, pWaitEvents));
  }
  if (zeSignalEvent) {
    ZE2UR_CALL(zeCommandListAppendSignalEvent,
               (commandListLocked->getZeCommandList(), zeSignalEvent));
  }

  return UR_RESULT_SUCCESS;
}
765+
704766
// Device-USM flavour of the enqueued allocation entry points. The added (+)
// body records scope latency and forwards every argument to
// enqueueUSMAllocHelper with UR_USM_TYPE_DEVICE; the removed (-) lines are the
// former stub that returned UR_RESULT_ERROR_UNSUPPORTED_FEATURE.
ur_result_t ur_queue_immediate_in_order_t::enqueueUSMDeviceAllocExp(
705-
ur_usm_pool_handle_t, const size_t,
706-
const ur_exp_async_usm_alloc_properties_t *, uint32_t,
707-
const ur_event_handle_t *, void **, ur_event_handle_t *) {
708-
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
767+
ur_usm_pool_handle_t pPool, const size_t size,
768+
const ur_exp_async_usm_alloc_properties_t *pProperties,
769+
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
770+
void **ppMem, ur_event_handle_t *phEvent) {
771+
TRACK_SCOPE_LATENCY(
772+
"ur_queue_immediate_in_order_t::enqueueUSMDeviceAllocExp");
773+
774+
return enqueueUSMAllocHelper(pPool, size, pProperties, numEventsInWaitList,
775+
phEventWaitList, ppMem, phEvent,
776+
UR_USM_TYPE_DEVICE);
709777
}
710778

711779
// Shared-USM flavour of the enqueued allocation entry points. The added (+)
// body records scope latency and forwards every argument to
// enqueueUSMAllocHelper with UR_USM_TYPE_SHARED; the removed (-) lines are the
// former stub that returned UR_RESULT_ERROR_UNSUPPORTED_FEATURE.
ur_result_t ur_queue_immediate_in_order_t::enqueueUSMSharedAllocExp(
712-
ur_usm_pool_handle_t, const size_t,
713-
const ur_exp_async_usm_alloc_properties_t *, uint32_t,
714-
const ur_event_handle_t *, void **, ur_event_handle_t *) {
715-
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
780+
ur_usm_pool_handle_t pPool, const size_t size,
781+
const ur_exp_async_usm_alloc_properties_t *pProperties,
782+
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
783+
void **ppMem, ur_event_handle_t *phEvent) {
784+
TRACK_SCOPE_LATENCY(
785+
"ur_queue_immediate_in_order_t::enqueueUSMSharedAllocExp");
786+
787+
return enqueueUSMAllocHelper(pPool, size, pProperties, numEventsInWaitList,
788+
phEventWaitList, ppMem, phEvent,
789+
UR_USM_TYPE_SHARED);
716790
}
717791

718792
// Host-USM flavour of the enqueued allocation entry points. The added (+)
// body records scope latency and forwards every argument to
// enqueueUSMAllocHelper with UR_USM_TYPE_HOST; the removed (-) lines are the
// former stub that returned UR_RESULT_ERROR_UNSUPPORTED_FEATURE.
ur_result_t ur_queue_immediate_in_order_t::enqueueUSMHostAllocExp(
719-
ur_usm_pool_handle_t, const size_t,
720-
const ur_exp_async_usm_alloc_properties_t *, uint32_t,
721-
const ur_event_handle_t *, void **, ur_event_handle_t *) {
722-
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
793+
ur_usm_pool_handle_t pPool, const size_t size,
794+
const ur_exp_async_usm_alloc_properties_t *pProperties,
795+
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
796+
void **ppMem, ur_event_handle_t *phEvent) {
797+
TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueUSMHostAllocExp");
798+
799+
return enqueueUSMAllocHelper(pPool, size, pProperties, numEventsInWaitList,
800+
phEventWaitList, ppMem, phEvent,
801+
UR_USM_TYPE_HOST);
723802
}
724803

725804
/// Enqueues an asynchronous free of `pMem` on this queue.
///
/// The allocation is not returned to UMF here. It is recorded in the owning
/// pool's `asyncPool` free list together with the event signalled by this
/// command and a pointer to this queue.
///
/// @param pMem                Pointer to release; must belong to a UMF pool
///                            that carries a UsmPool tag.
/// @param numEventsInWaitList Number of entries in phEventWaitList.
/// @param phEventWaitList     Events the free has to wait for.
/// @param phEvent             Optional out-event. May be null, in which case
///                            an internal event is used to track the free.
/// @return UR_RESULT_SUCCESS on success,
///         UR_RESULT_ERROR_INVALID_MEM_OBJECT if `pMem` is not owned by any
///         UMF pool, UR_RESULT_ERROR_UNKNOWN if the pool carries no UsmPool
///         tag, or the translated Level Zero error of a failed append.
ur_result_t ur_queue_immediate_in_order_t::enqueueUSMFreeExp(
    ur_usm_pool_handle_t /*pPool*/, void *pMem, uint32_t numEventsInWaitList,
    const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
  TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueUSMFreeExp");

  // Resolve the owning pool before creating any event: bailing out after
  // getSignalEvent() would hand the caller an event that is never signalled.
  umf_memory_pool_handle_t hPool = umfPoolByPtr(pMem);
  if (!hPool) {
    // A pointer unknown to UMF cannot be recycled; report it instead of
    // pretending the free succeeded.
    return UR_RESULT_ERROR_INVALID_MEM_OBJECT;
  }

  UsmPool *usmPool = nullptr;
  // umfPoolGetTag reports a umf_result_t, so compare against the UMF enum.
  auto ret = umfPoolGetTag(hPool, reinterpret_cast<void **>(&usmPool));
  if (ret != UMF_RESULT_SUCCESS || !usmPool) {
    // This should never happen
    return UR_RESULT_ERROR_UNKNOWN;
  }

  const size_t size = umfPoolMallocUsableSize(hPool, pMem);

  auto commandListLocked = commandListManager.lock();

  // phEvent is optional, but the free list always needs an event to know when
  // the memory may be reused; fall back to a local handle when none is given.
  ur_event_handle_t internalEvent = nullptr;
  if (!phEvent) {
    phEvent = &internalEvent;
  }

  auto zeSignalEvent = getSignalEvent(commandListLocked, phEvent,
                                      UR_COMMAND_ENQUEUE_USM_FREE_EXP);
  auto [pWaitEvents, numWaitEvents] =
      getWaitListView(commandListLocked, phEventWaitList, numEventsInWaitList);

  if (numWaitEvents > 0) {
    ZE2UR_CALL(
        zeCommandListAppendWaitOnEvents,
        (commandListLocked->getZeCommandList(), numWaitEvents, pWaitEvents));
  }

  if (zeSignalEvent) {
    ZE2UR_CALL(zeCommandListAppendSignalEvent,
               (commandListLocked->getZeCommandList(), zeSignalEvent));
  }

  // Publish the block for reuse only once the signal has actually been
  // enqueued; otherwise a later allocation could wait on an event that will
  // never fire.
  // NOTE(review): the free list releases the stored event during cleanup. When
  // the caller supplied phEvent they hold the same handle — confirm whether an
  // extra retain is required here.
  usmPool->asyncPool.insert(pMem, size, *phEvent, this);

  return UR_RESULT_SUCCESS;
}
730843

731844
ur_result_t ur_queue_immediate_in_order_t::bindlessImagesImageCopyExp(
@@ -855,9 +968,9 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericCommandListsExp(
855968
"ur_queue_immediate_in_order_t::enqueueGenericCommandListsExp");
856969

857970
auto commandListLocked = commandListManager.lock();
971+
858972
auto zeSignalEvent =
859973
getSignalEvent(commandListLocked, phEvent, callerCommand);
860-
861974
auto [pWaitEvents, numWaitEvents] =
862975
getWaitListView(commandListLocked, phEventWaitList, numEventsInWaitList,
863976
additionalWaitEvent);

unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp

+7
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,13 @@ struct ur_queue_immediate_in_order_t : ur_object, public ur_queue_t_ {
6464

6565
void recordSubmittedKernel(ur_kernel_handle_t hKernel);
6666

67+
ur_result_t
68+
enqueueUSMAllocHelper(ur_usm_pool_handle_t pPool, const size_t size,
69+
const ur_exp_async_usm_alloc_properties_t *pProperties,
70+
uint32_t numEventsInWaitList,
71+
const ur_event_handle_t *phEventWaitList, void **ppMem,
72+
ur_event_handle_t *phEvent, ur_usm_type_t Type);
73+
6774
public:
6875
ur_queue_immediate_in_order_t(ur_context_handle_t, ur_device_handle_t,
6976
const ur_queue_properties_t *);

0 commit comments

Comments
 (0)