From c462829bf2a44373b57a7feabef5166303b3e5a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Staniewski?= Date: Thu, 24 Apr 2025 15:09:22 +0000 Subject: [PATCH] Add async allocations to L0 adapter v2 --- .../source/adapters/level_zero/CMakeLists.txt | 1 + .../source/adapters/level_zero/v2/context.cpp | 9 +- .../source/adapters/level_zero/v2/context.hpp | 3 + .../adapters/level_zero/v2/queue_api.cpp | 14 ++- .../adapters/level_zero/v2/queue_api.hpp | 20 ++-- .../v2/queue_immediate_in_order.cpp | 109 ++++++++++++++++-- .../v2/queue_immediate_in_order.hpp | 21 ++-- .../source/adapters/level_zero/v2/usm.cpp | 52 +++++++-- .../source/adapters/level_zero/v2/usm.hpp | 19 ++- 9 files changed, 201 insertions(+), 47 deletions(-) diff --git a/unified-runtime/source/adapters/level_zero/CMakeLists.txt b/unified-runtime/source/adapters/level_zero/CMakeLists.txt index 6eb08a6d93dde..b08b8846442ba 100644 --- a/unified-runtime/source/adapters/level_zero/CMakeLists.txt +++ b/unified-runtime/source/adapters/level_zero/CMakeLists.txt @@ -134,6 +134,7 @@ if(UR_BUILD_ADAPTER_L0_V2) ${CMAKE_CURRENT_SOURCE_DIR}/adapter.cpp ${CMAKE_CURRENT_SOURCE_DIR}/common.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/enqueued_pool.cpp ${CMAKE_CURRENT_SOURCE_DIR}/ur_interface_loader.cpp ${CMAKE_CURRENT_SOURCE_DIR}/platform.cpp ${CMAKE_CURRENT_SOURCE_DIR}/physical_mem.cpp diff --git a/unified-runtime/source/adapters/level_zero/v2/context.cpp b/unified-runtime/source/adapters/level_zero/v2/context.cpp index 4cfd6b7c9de54..8120104b32f24 100644 --- a/unified-runtime/source/adapters/level_zero/v2/context.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/context.cpp @@ -67,7 +67,8 @@ ur_context_handle_t_::ur_context_handle_t_(ze_context_handle_t hContext, v2::EVENT_FLAGS_PROFILING_ENABLED)), p2pAccessDevices(populateP2PDevices( phDevices[0]->Platform->getNumDevices(), this->hDevices)), - defaultUSMPool(this, nullptr) {} + defaultUSMPool(this, nullptr), + asyncPool(this, 
nullptr) {} ur_result_t ur_context_handle_t_::retain() { RefCount.increment(); @@ -75,6 +76,8 @@ ur_result_t ur_context_handle_t_::retain() { } ur_result_t ur_context_handle_t_::release() { + asyncPool.cleanupPools(); + if (!RefCount.decrementAndTest()) return UR_RESULT_SUCCESS; @@ -104,6 +107,10 @@ ur_usm_pool_handle_t ur_context_handle_t_::getDefaultUSMPool() { return &defaultUSMPool; } +ur_usm_pool_handle_t ur_context_handle_t_::getAsyncPool() { + return &asyncPool; +} + const std::vector & ur_context_handle_t_::getP2PDevices(ur_device_handle_t hDevice) const { return p2pAccessDevices[hDevice->Id.value()]; diff --git a/unified-runtime/source/adapters/level_zero/v2/context.hpp b/unified-runtime/source/adapters/level_zero/v2/context.hpp index 61eb474eb26ce..d4093a88da885 100644 --- a/unified-runtime/source/adapters/level_zero/v2/context.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/context.hpp @@ -29,6 +29,7 @@ struct ur_context_handle_t_ : _ur_object { const std::vector &getDevices() const; ur_usm_pool_handle_t getDefaultUSMPool(); + ur_usm_pool_handle_t getAsyncPool(); const std::vector & getP2PDevices(ur_device_handle_t hDevice) const; @@ -55,4 +56,6 @@ struct ur_context_handle_t_ : _ur_object { const std::vector> p2pAccessDevices; ur_usm_pool_handle_t_ defaultUSMPool; + + ur_usm_pool_handle_t_ asyncPool; }; diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp index faa09eee3eaa5..2af40237f8b44 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp @@ -348,10 +348,12 @@ ur_result_t urEnqueueUSMDeviceAllocExp( uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, void **ppMem, ur_event_handle_t *phEvent) try { return hQueue->get().enqueueUSMDeviceAllocExp( - pPool, size, pProperties, numEventsInWaitList, phEventWaitList, ppMem, - phEvent); + hQueue, pPool, size, 
pProperties, numEventsInWaitList, phEventWaitList, + ppMem, phEvent); } catch (...) { - return exceptionToResult(std::current_exception()); + // Map the in-flight exception to a UR error code. Never re-issue the + // enqueue here: it already failed once and a second throw would escape. + return exceptionToResult(std::current_exception()); } ur_result_t urEnqueueUSMSharedAllocExp( ur_queue_handle_t hQueue, ur_usm_pool_handle_t pPool, const size_t size, @@ -359,8 +361,8 @@ ur_result_t urEnqueueUSMSharedAllocExp( uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, void **ppMem, ur_event_handle_t *phEvent) try { return hQueue->get().enqueueUSMSharedAllocExp( - pPool, size, pProperties, numEventsInWaitList, phEventWaitList, ppMem, - phEvent); + hQueue, pPool, size, pProperties, numEventsInWaitList, phEventWaitList, + ppMem, phEvent); } catch (...) { return exceptionToResult(std::current_exception()); } @@ -369,7 +371,7 @@ ur_result_t urEnqueueUSMHostAllocExp( const ur_exp_async_usm_alloc_properties_t *pProperties, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, void **ppMem, ur_event_handle_t *phEvent) try { - return hQueue->get().enqueueUSMHostAllocExp(pPool, size, pProperties, + return hQueue->get().enqueueUSMHostAllocExp(hQueue, pPool, size, pProperties, numEventsInWaitList, phEventWaitList, ppMem, phEvent); } catch (...)
{ diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_api.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_api.hpp index 1a551cfaedb21..ad8f267769724 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_api.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_api.hpp @@ -127,18 +127,16 @@ struct ur_queue_t_ { bool, void *, size_t, uint32_t, const ur_event_handle_t *, ur_event_handle_t *) = 0; + virtual ur_result_t enqueueUSMDeviceAllocExp( + ur_queue_handle_t, ur_usm_pool_handle_t, const size_t, + const ur_exp_async_usm_alloc_properties_t *, uint32_t, + const ur_event_handle_t *, void **, ur_event_handle_t *) = 0; + virtual ur_result_t enqueueUSMSharedAllocExp( + ur_queue_handle_t, ur_usm_pool_handle_t, const size_t, + const ur_exp_async_usm_alloc_properties_t *, uint32_t, + const ur_event_handle_t *, void **, ur_event_handle_t *) = 0; virtual ur_result_t - enqueueUSMDeviceAllocExp(ur_usm_pool_handle_t, const size_t, - const ur_exp_async_usm_alloc_properties_t *, - uint32_t, const ur_event_handle_t *, void **, - ur_event_handle_t *) = 0; - virtual ur_result_t - enqueueUSMSharedAllocExp(ur_usm_pool_handle_t, const size_t, - const ur_exp_async_usm_alloc_properties_t *, - uint32_t, const ur_event_handle_t *, void **, - ur_event_handle_t *) = 0; - virtual ur_result_t - enqueueUSMHostAllocExp(ur_usm_pool_handle_t, const size_t, + enqueueUSMHostAllocExp(ur_queue_handle_t, ur_usm_pool_handle_t, const size_t, const ur_exp_async_usm_alloc_properties_t *, uint32_t, const ur_event_handle_t *, void **, ur_event_handle_t *) = 0; diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index a782bd43c94c0..f1b8d049dad45 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -153,6 +153,10 @@ ur_result_t 
ur_queue_immediate_in_order_t::queueGetNativeHandle( ur_result_t ur_queue_immediate_in_order_t::queueFinish() { TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::queueFinish"); + // TODO: have ur_queue_handle_t here + // hContext->getAsyncPool()->cleanupPoolsForQueue(this); + hContext->getAsyncPool()->cleanupPools(); + auto commandListLocked = commandListManager.lock(); // TODO: use zeEventHostSynchronize instead? TRACK_SCOPE_LATENCY( @@ -712,25 +716,106 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueWriteHostPipe( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } +ur_result_t ur_queue_immediate_in_order_t::enqueueUSMAllocHelper( + ur_queue_handle_t hQueue, ur_usm_pool_handle_t pPool, const size_t size, + const ur_exp_async_usm_alloc_properties_t *, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, void **ppMem, + ur_event_handle_t *phEvent, ur_usm_type_t type) { + auto commandListLocked = commandListManager.lock(); + + if (!pPool) { + pPool = hContext->getAsyncPool(); + } + + auto device = (type == UR_USM_TYPE_HOST) ? 
nullptr : hDevice; + + std::vector<ur_event_handle_t> extendedWaitList; + ur_event_handle_t originAllocEvent = nullptr; + auto asyncAlloc = + pPool->allocateEnqueued(hContext, hQueue, device, nullptr, type, size); + if (!asyncAlloc) { + auto Ret = pPool->allocate(hContext, device, nullptr, type, size, ppMem); + if (Ret) { + return Ret; + } + } else { + *ppMem = std::get<0>(*asyncAlloc); + originAllocEvent = std::get<1>(*asyncAlloc); + if (originAllocEvent) { + for (size_t i = 0; i < numEventsInWaitList; i++) { + extendedWaitList.push_back(phEventWaitList[i]); + } + extendedWaitList.push_back(originAllocEvent); + } + } + + if (!extendedWaitList.empty()) { + numEventsInWaitList = static_cast<uint32_t>(extendedWaitList.size()); + phEventWaitList = extendedWaitList.data(); + } + + ur_command_t commandType = UR_COMMAND_FORCE_UINT32; + switch (type) { + case UR_USM_TYPE_HOST: + commandType = UR_COMMAND_ENQUEUE_USM_HOST_ALLOC_EXP; + break; + case UR_USM_TYPE_DEVICE: + commandType = UR_COMMAND_ENQUEUE_USM_DEVICE_ALLOC_EXP; + break; + case UR_USM_TYPE_SHARED: + commandType = UR_COMMAND_ENQUEUE_USM_SHARED_ALLOC_EXP; + break; + default: + logger::error("enqueueUSMAllocHelper: unsupported USM type"); + throw UR_RESULT_ERROR_UNKNOWN; + } + + auto zeSignalEvent = getSignalEvent(commandListLocked, phEvent, commandType); + auto [pWaitEvents, numWaitEvents] = + getWaitListView(commandListLocked, phEventWaitList, numEventsInWaitList); + + if (numWaitEvents > 0) { + ZE2UR_CALL( + zeCommandListAppendWaitOnEvents, + (commandListLocked->getZeCommandList(), numWaitEvents, pWaitEvents)); + } + + if (zeSignalEvent) { + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (commandListLocked->getZeCommandList(), zeSignalEvent)); + } + + return UR_RESULT_SUCCESS; +} + ur_result_t ur_queue_immediate_in_order_t::enqueueUSMDeviceAllocExp( - ur_usm_pool_handle_t, const size_t, - const ur_exp_async_usm_alloc_properties_t *, uint32_t, - const ur_event_handle_t *, void **, ur_event_handle_t *) { - return
UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + ur_queue_handle_t hQueue, ur_usm_pool_handle_t pPool, const size_t size, + const ur_exp_async_usm_alloc_properties_t *pProperties, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + void **ppMem, ur_event_handle_t *phEvent) { + return enqueueUSMAllocHelper(hQueue, pPool, size, pProperties, + numEventsInWaitList, phEventWaitList, ppMem, + phEvent, UR_USM_TYPE_DEVICE); } ur_result_t ur_queue_immediate_in_order_t::enqueueUSMSharedAllocExp( - ur_usm_pool_handle_t, const size_t, - const ur_exp_async_usm_alloc_properties_t *, uint32_t, - const ur_event_handle_t *, void **, ur_event_handle_t *) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + ur_queue_handle_t hQueue, ur_usm_pool_handle_t pPool, const size_t size, + const ur_exp_async_usm_alloc_properties_t *pProperties, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + void **ppMem, ur_event_handle_t *phEvent) { + return enqueueUSMAllocHelper(hQueue, pPool, size, pProperties, + numEventsInWaitList, phEventWaitList, ppMem, + phEvent, UR_USM_TYPE_SHARED); } ur_result_t ur_queue_immediate_in_order_t::enqueueUSMHostAllocExp( - ur_usm_pool_handle_t, const size_t, - const ur_exp_async_usm_alloc_properties_t *, uint32_t, - const ur_event_handle_t *, void **, ur_event_handle_t *) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + ur_queue_handle_t hQueue, ur_usm_pool_handle_t pPool, const size_t size, + const ur_exp_async_usm_alloc_properties_t *pProperties, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + void **ppMem, ur_event_handle_t *phEvent) { + return enqueueUSMAllocHelper(hQueue, pPool, size, pProperties, + numEventsInWaitList, phEventWaitList, ppMem, + phEvent, UR_USM_TYPE_HOST); } ur_result_t ur_queue_immediate_in_order_t::enqueueUSMFreeExp( diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp 
index 7ddd96fd9ff91..25ee102b63a46 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp @@ -67,6 +67,12 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_t_ { void recordSubmittedKernel(ur_kernel_handle_t hKernel); + ur_result_t enqueueUSMAllocHelper( + ur_queue_handle_t hQueue, ur_usm_pool_handle_t pPool, const size_t size, + const ur_exp_async_usm_alloc_properties_t *pProperties, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + void **ppMem, ur_event_handle_t *phEvent, ur_usm_type_t Type); + public: ur_queue_immediate_in_order_t(ur_context_handle_t, ur_device_handle_t, const ur_queue_properties_t *); @@ -221,21 +227,20 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_t_ { const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override; ur_result_t enqueueUSMDeviceAllocExp( - ur_usm_pool_handle_t pPool, const size_t size, + ur_queue_handle_t hQueue, ur_usm_pool_handle_t pPool, const size_t size, const ur_exp_async_usm_alloc_properties_t *pProperties, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, void **ppMem, ur_event_handle_t *phEvent) override; ur_result_t enqueueUSMSharedAllocExp( - ur_usm_pool_handle_t pPool, const size_t size, + ur_queue_handle_t hQueue, ur_usm_pool_handle_t pPool, const size_t size, + const ur_exp_async_usm_alloc_properties_t *pProperties, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + void **ppMem, ur_event_handle_t *phEvent) override; + ur_result_t enqueueUSMHostAllocExp( + ur_queue_handle_t hQueue, ur_usm_pool_handle_t pPool, const size_t size, const ur_exp_async_usm_alloc_properties_t *pProperties, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, void **ppMem, ur_event_handle_t *phEvent) override; - ur_result_t - enqueueUSMHostAllocExp(ur_usm_pool_handle_t pPool, 
const size_t size, - const ur_exp_async_usm_alloc_properties_t *pProperties, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, void **ppMem, - ur_event_handle_t *phEvent) override; ur_result_t enqueueUSMFreeExp(ur_usm_pool_handle_t pPool, void *pMem, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, diff --git a/unified-runtime/source/adapters/level_zero/v2/usm.cpp b/unified-runtime/source/adapters/level_zero/v2/usm.cpp index 18da0c37776eb..e3d1eccd21a84 100644 --- a/unified-runtime/source/adapters/level_zero/v2/usm.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/usm.cpp @@ -174,10 +174,11 @@ ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t hContext, if (disjointPoolConfigs.has_value()) { auto &poolConfig = disjointPoolConfigs.value().Configs[descToDisjoinPoolMemType(desc)]; - poolManager.addPool( - desc, usm::makeDisjointPool(makeProvider(desc), poolConfig)); + auto pool = usm::makeDisjointPool(makeProvider(desc), poolConfig); + poolManager.addPool(desc, std::make_unique<UsmPool>(std::move(pool))); } else { - poolManager.addPool(desc, usm::makeProxyPool(makeProvider(desc))); + auto pool = usm::makeProxyPool(makeProvider(desc)); + poolManager.addPool(desc, std::make_unique<UsmPool>(std::move(pool))); } } } @@ -186,8 +187,7 @@ ur_context_handle_t ur_usm_pool_handle_t_::getContextHandle() const { return hContext; } -umf_memory_pool_handle_t -ur_usm_pool_handle_t_::getPool(const usm::pool_descriptor &desc) { +UsmPool *ur_usm_pool_handle_t_::getPool(const usm::pool_descriptor &desc) { auto pool = poolManager.getPool(desc).value(); assert(pool); return pool; @@ -215,12 +215,13 @@ ur_result_t ur_usm_pool_handle_t_::allocate( auto deviceFlags = getDeviceFlags(pUSMDesc); - auto umfPool = getPool(usm::pool_descriptor{ + auto pool = getPool(usm::pool_descriptor{ this, hContext, hDevice, type, bool(deviceFlags & UR_USM_DEVICE_MEM_FLAG_DEVICE_READ_ONLY)}); - if (!umfPool) { + if (!pool) { return
UR_RESULT_ERROR_INVALID_ARGUMENT; } + auto umfPool = pool->umfPool.get(); *ppRetMem = umfPoolAlignedMalloc(umfPool, size, alignment); if (*ppRetMem == nullptr) { @@ -241,6 +242,43 @@ ur_result_t ur_usm_pool_handle_t_::free(void *ptr) { } } +std::optional<std::pair<void *, ur_event_handle_t>> +ur_usm_pool_handle_t_::allocateEnqueued(ur_context_handle_t hContext, + ur_queue_handle_t hQueue, + ur_device_handle_t hDevice, + const ur_usm_desc_t *pUSMDesc, + ur_usm_type_t type, size_t size) { + uint32_t alignment = pUSMDesc ? pUSMDesc->align : 0; + if ((alignment & (alignment - 1)) != 0) { + return std::nullopt; + } + + auto deviceFlags = getDeviceFlags(pUSMDesc); + + auto umfPool = getPool(usm::pool_descriptor{ + this, hContext, hDevice, type, + bool(deviceFlags & UR_USM_DEVICE_MEM_FLAG_DEVICE_READ_ONLY)}); + if (!umfPool) { + return std::nullopt; + } + + auto allocation = umfPool->asyncPool.getBestFit(size, alignment, hQueue); + if (!allocation) { + return std::nullopt; + } + + return std::make_pair(allocation->Ptr, allocation->Event); +} + +void ur_usm_pool_handle_t_::cleanupPools() { + poolManager.forEachPool([&](UsmPool *p) { return p->asyncPool.cleanup(); }); +} + +void ur_usm_pool_handle_t_::cleanupPoolsForQueue(ur_queue_handle_t hQueue) { + poolManager.forEachPool( + [&](UsmPool *p) { return p->asyncPool.cleanupForQueue(hQueue); }); +} + namespace ur::level_zero { ur_result_t urUSMPoolCreate( /// [in] handle of the context object diff --git a/unified-runtime/source/adapters/level_zero/v2/usm.hpp b/unified-runtime/source/adapters/level_zero/v2/usm.hpp index 4f85236c57e55..78f4b9132a216 100644 --- a/unified-runtime/source/adapters/level_zero/v2/usm.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/usm.hpp @@ -12,9 +12,16 @@ #include "ur_api.h" +#include "../enqueued_pool.hpp" #include "common.hpp" #include "ur_pool_manager.hpp" +struct UsmPool { + UsmPool(umf::pool_unique_handle_t pPool) : umfPool(std::move(pPool)) {} + umf::pool_unique_handle_t umfPool; + EnqueuedPool asyncPool; +}; + struct
ur_usm_pool_handle_t_ : _ur_object { ur_usm_pool_handle_t_(ur_context_handle_t hContext, ur_usm_pool_desc_t *pPoolDes); @@ -26,9 +33,17 @@ struct ur_usm_pool_handle_t_ : _ur_object { size_t size, void **ppRetMem); ur_result_t free(void *ptr); + std::optional<std::pair<void *, ur_event_handle_t>> + allocateEnqueued(ur_context_handle_t hContext, ur_queue_handle_t hQueue, + ur_device_handle_t hDevice, const ur_usm_desc_t *pUSMDesc, + ur_usm_type_t type, size_t size); + + void cleanupPools(); + void cleanupPoolsForQueue(ur_queue_handle_t hQueue); + private: ur_context_handle_t hContext; - usm::pool_manager poolManager; + usm::pool_manager poolManager; - umf_memory_pool_handle_t getPool(const usm::pool_descriptor &desc); + UsmPool *getPool(const usm::pool_descriptor &desc); };