Skip to content

[UR][L0 v2][draft] Port USM alloc to adapter v2 #18179

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: sycl
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ if(UR_BUILD_ADAPTER_L0_V2)
${CMAKE_CURRENT_SOURCE_DIR}/adapter.cpp
${CMAKE_CURRENT_SOURCE_DIR}/common.cpp
${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
${CMAKE_CURRENT_SOURCE_DIR}/enqueued_pool.cpp
${CMAKE_CURRENT_SOURCE_DIR}/ur_interface_loader.cpp
${CMAKE_CURRENT_SOURCE_DIR}/platform.cpp
${CMAKE_CURRENT_SOURCE_DIR}/physical_mem.cpp
Expand Down
9 changes: 8 additions & 1 deletion unified-runtime/source/adapters/level_zero/v2/context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,14 +67,17 @@ ur_context_handle_t_::ur_context_handle_t_(ze_context_handle_t hContext,
v2::EVENT_FLAGS_PROFILING_ENABLED)),
p2pAccessDevices(populateP2PDevices(
phDevices[0]->Platform->getNumDevices(), this->hDevices)),
defaultUSMPool(this, nullptr) {}
defaultUSMPool(this, nullptr),
asyncPool(this, nullptr) {}

ur_result_t ur_context_handle_t_::retain() {
  // Bump the context's reference count; this operation cannot fail.
  RefCount.increment();
  return UR_RESULT_SUCCESS;
}

ur_result_t ur_context_handle_t_::release() {
asyncPool.cleanupPools();

if (!RefCount.decrementAndTest())
return UR_RESULT_SUCCESS;

Expand Down Expand Up @@ -104,6 +107,10 @@ ur_usm_pool_handle_t ur_context_handle_t_::getDefaultUSMPool() {
return &defaultUSMPool;
}

// Returns the context-owned pool used to service asynchronous (enqueued) USM
// allocations when the caller does not supply an explicit pool handle.
ur_usm_pool_handle_t ur_context_handle_t_::getAsyncPool() {
  return &asyncPool;
}

const std::vector<ur_device_handle_t> &
ur_context_handle_t_::getP2PDevices(ur_device_handle_t hDevice) const {
return p2pAccessDevices[hDevice->Id.value()];
Expand Down
3 changes: 3 additions & 0 deletions unified-runtime/source/adapters/level_zero/v2/context.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ struct ur_context_handle_t_ : _ur_object {

const std::vector<ur_device_handle_t> &getDevices() const;
ur_usm_pool_handle_t getDefaultUSMPool();
ur_usm_pool_handle_t getAsyncPool();

const std::vector<ur_device_handle_t> &
getP2PDevices(ur_device_handle_t hDevice) const;
Expand All @@ -55,4 +56,6 @@ struct ur_context_handle_t_ : _ur_object {
const std::vector<std::vector<ur_device_handle_t>> p2pAccessDevices;

ur_usm_pool_handle_t_ defaultUSMPool;

ur_usm_pool_handle_t_ asyncPool;
};
14 changes: 8 additions & 6 deletions unified-runtime/source/adapters/level_zero/v2/queue_api.cpp

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 9 additions & 11 deletions unified-runtime/source/adapters/level_zero/v2/queue_api.hpp

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,10 @@ ur_result_t ur_queue_immediate_in_order_t::queueGetNativeHandle(
ur_result_t ur_queue_immediate_in_order_t::queueFinish() {
TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::queueFinish");

// TODO: have ur_queue_handle_t here
// hContext->getAsyncPool()->cleanupPoolsForQueue(this);
hContext->getAsyncPool()->cleanupPools();

auto commandListLocked = commandListManager.lock();
// TODO: use zeEventHostSynchronize instead?
TRACK_SCOPE_LATENCY(
Expand Down Expand Up @@ -712,25 +716,106 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueWriteHostPipe(
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
}

// Shared implementation for the enqueueUSM{Host,Device,Shared}AllocExp entry
// points. Tries to recycle a previously enqueued-freed allocation from the
// async pool; falls back to a regular pool allocation. The returned memory is
// made available to the queue ordered after the caller's wait list and, when
// the allocation is recycled, after the event guarding its previous free.
ur_result_t ur_queue_immediate_in_order_t::enqueueUSMAllocHelper(
    ur_queue_handle_t hQueue, ur_usm_pool_handle_t pPool, const size_t size,
    const ur_exp_async_usm_alloc_properties_t *, uint32_t numEventsInWaitList,
    const ur_event_handle_t *phEventWaitList, void **ppMem,
    ur_event_handle_t *phEvent, ur_usm_type_t type) {
  TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueUSMAllocHelper");

  auto commandListLocked = commandListManager.lock();

  // No explicit pool: use the context's default pool for async allocations.
  if (!pPool) {
    pPool = hContext->getAsyncPool();
  }

  // Host allocations are not tied to any device.
  auto device = (type == UR_USM_TYPE_HOST) ? nullptr : hDevice;

  // First try to reuse an allocation that was freed asynchronously. If one is
  // found, originAllocEvent (when non-null) must complete before the memory
  // may be reused, so it is folded into the wait list below.
  ur_event_handle_t originAllocEvent = nullptr;
  auto asyncAlloc =
      pPool->allocateEnqueued(hContext, hQueue, device, nullptr, type, size);
  if (asyncAlloc) {
    std::tie(*ppMem, originAllocEvent) = *asyncAlloc;
  } else {
    // Nothing suitable in the async pool: do a regular (synchronous)
    // allocation from the pool.
    auto Ret = pPool->allocate(hContext, device, nullptr, type, size, ppMem);
    if (Ret) {
      return Ret;
    }
  }

  ur_command_t commandType = UR_COMMAND_FORCE_UINT32;
  switch (type) {
  case UR_USM_TYPE_HOST:
    commandType = UR_COMMAND_ENQUEUE_USM_HOST_ALLOC_EXP;
    break;
  case UR_USM_TYPE_DEVICE:
    commandType = UR_COMMAND_ENQUEUE_USM_DEVICE_ALLOC_EXP;
    break;
  case UR_USM_TYPE_SHARED:
    commandType = UR_COMMAND_ENQUEUE_USM_SHARED_ALLOC_EXP;
    break;
  default:
    // Callers only pass the three types above; keep error handling consistent
    // with the rest of this function (return, don't throw).
    logger::error("enqueueUSMAllocHelper: unsupported USM type");
    return UR_RESULT_ERROR_INVALID_ENUMERATION;
  }

  auto zeSignalEvent = getSignalEvent(commandListLocked, phEvent, commandType);
  // getWaitListView merges the caller's wait list with the optional event
  // guarding reuse of the recycled allocation — no temporary vector needed.
  auto [pWaitEvents, numWaitEvents] = getWaitListView(
      commandListLocked, phEventWaitList, numEventsInWaitList,
      originAllocEvent);

  if (numWaitEvents > 0) {
    ZE2UR_CALL(
        zeCommandListAppendWaitOnEvents,
        (commandListLocked->getZeCommandList(), numWaitEvents, pWaitEvents));
  }

  if (zeSignalEvent) {
    ZE2UR_CALL(zeCommandListAppendSignalEvent,
               (commandListLocked->getZeCommandList(), zeSignalEvent));
  }

  return UR_RESULT_SUCCESS;
}

// Enqueues an asynchronous device USM allocation; thin wrapper that forwards
// to enqueueUSMAllocHelper with UR_USM_TYPE_DEVICE.
ur_result_t ur_queue_immediate_in_order_t::enqueueUSMDeviceAllocExp(
    ur_queue_handle_t hQueue, ur_usm_pool_handle_t pPool, const size_t size,
    const ur_exp_async_usm_alloc_properties_t *pProperties,
    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
    void **ppMem, ur_event_handle_t *phEvent) {
  TRACK_SCOPE_LATENCY(
      "ur_queue_immediate_in_order_t::enqueueUSMDeviceAllocExp");

  return enqueueUSMAllocHelper(hQueue, pPool, size, pProperties,
                               numEventsInWaitList, phEventWaitList, ppMem,
                               phEvent, UR_USM_TYPE_DEVICE);
}

// Enqueues an asynchronous shared USM allocation; thin wrapper that forwards
// to enqueueUSMAllocHelper with UR_USM_TYPE_SHARED.
ur_result_t ur_queue_immediate_in_order_t::enqueueUSMSharedAllocExp(
    ur_queue_handle_t hQueue, ur_usm_pool_handle_t pPool, const size_t size,
    const ur_exp_async_usm_alloc_properties_t *pProperties,
    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
    void **ppMem, ur_event_handle_t *phEvent) {
  TRACK_SCOPE_LATENCY(
      "ur_queue_immediate_in_order_t::enqueueUSMSharedAllocExp");

  return enqueueUSMAllocHelper(hQueue, pPool, size, pProperties,
                               numEventsInWaitList, phEventWaitList, ppMem,
                               phEvent, UR_USM_TYPE_SHARED);
}

// Enqueues an asynchronous host USM allocation; thin wrapper that forwards
// to enqueueUSMAllocHelper with UR_USM_TYPE_HOST.
ur_result_t ur_queue_immediate_in_order_t::enqueueUSMHostAllocExp(
    ur_queue_handle_t hQueue, ur_usm_pool_handle_t pPool, const size_t size,
    const ur_exp_async_usm_alloc_properties_t *pProperties,
    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
    void **ppMem, ur_event_handle_t *phEvent) {
  TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueUSMHostAllocExp");

  return enqueueUSMAllocHelper(hQueue, pPool, size, pProperties,
                               numEventsInWaitList, phEventWaitList, ppMem,
                               phEvent, UR_USM_TYPE_HOST);
}

ur_result_t ur_queue_immediate_in_order_t::enqueueUSMFreeExp(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,12 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_t_ {

void recordSubmittedKernel(ur_kernel_handle_t hKernel);

ur_result_t enqueueUSMAllocHelper(
ur_queue_handle_t hQueue, ur_usm_pool_handle_t pPool, const size_t size,
const ur_exp_async_usm_alloc_properties_t *pProperties,
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
void **ppMem, ur_event_handle_t *phEvent, ur_usm_type_t Type);

public:
ur_queue_immediate_in_order_t(ur_context_handle_t, ur_device_handle_t,
const ur_queue_properties_t *);
Expand Down Expand Up @@ -221,21 +227,20 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_t_ {
const ur_event_handle_t *phEventWaitList,
ur_event_handle_t *phEvent) override;
ur_result_t enqueueUSMDeviceAllocExp(
ur_usm_pool_handle_t pPool, const size_t size,
ur_queue_handle_t hQueue, ur_usm_pool_handle_t pPool, const size_t size,
const ur_exp_async_usm_alloc_properties_t *pProperties,
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
void **ppMem, ur_event_handle_t *phEvent) override;
ur_result_t enqueueUSMSharedAllocExp(
ur_usm_pool_handle_t pPool, const size_t size,
ur_queue_handle_t hQueue, ur_usm_pool_handle_t pPool, const size_t size,
const ur_exp_async_usm_alloc_properties_t *pProperties,
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
void **ppMem, ur_event_handle_t *phEvent) override;
ur_result_t enqueueUSMHostAllocExp(
ur_queue_handle_t hQueue, ur_usm_pool_handle_t pPool, const size_t size,
const ur_exp_async_usm_alloc_properties_t *pProperties,
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
void **ppMem, ur_event_handle_t *phEvent) override;
ur_result_t
enqueueUSMHostAllocExp(ur_usm_pool_handle_t pPool, const size_t size,
const ur_exp_async_usm_alloc_properties_t *pProperties,
uint32_t numEventsInWaitList,
const ur_event_handle_t *phEventWaitList, void **ppMem,
ur_event_handle_t *phEvent) override;
ur_result_t enqueueUSMFreeExp(ur_usm_pool_handle_t pPool, void *pMem,
uint32_t numEventsInWaitList,
const ur_event_handle_t *phEventWaitList,
Expand Down
52 changes: 45 additions & 7 deletions unified-runtime/source/adapters/level_zero/v2/usm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -174,10 +174,11 @@ ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t hContext,
if (disjointPoolConfigs.has_value()) {
auto &poolConfig =
disjointPoolConfigs.value().Configs[descToDisjoinPoolMemType(desc)];
poolManager.addPool(
desc, usm::makeDisjointPool(makeProvider(desc), poolConfig));
auto pool = usm::makeDisjointPool(makeProvider(desc), poolConfig);
poolManager.addPool(desc, std::make_unique<UsmPool>(std::move(pool)));
} else {
poolManager.addPool(desc, usm::makeProxyPool(makeProvider(desc)));
auto pool = usm::makeProxyPool(makeProvider(desc));
poolManager.addPool(desc, std::make_unique<UsmPool>(std::move(pool)));
}
}
}
Expand All @@ -186,8 +187,7 @@ ur_context_handle_t ur_usm_pool_handle_t_::getContextHandle() const {
return hContext;
}

umf_memory_pool_handle_t
ur_usm_pool_handle_t_::getPool(const usm::pool_descriptor &desc) {
UsmPool *ur_usm_pool_handle_t_::getPool(const usm::pool_descriptor &desc) {
auto pool = poolManager.getPool(desc).value();
assert(pool);
return pool;
Expand Down Expand Up @@ -215,12 +215,13 @@ ur_result_t ur_usm_pool_handle_t_::allocate(

auto deviceFlags = getDeviceFlags(pUSMDesc);

auto umfPool = getPool(usm::pool_descriptor{
auto pool = getPool(usm::pool_descriptor{
this, hContext, hDevice, type,
bool(deviceFlags & UR_USM_DEVICE_MEM_FLAG_DEVICE_READ_ONLY)});
if (!umfPool) {
if (!pool) {
return UR_RESULT_ERROR_INVALID_ARGUMENT;
}
auto umfPool = pool->umfPool.get();

*ppRetMem = umfPoolAlignedMalloc(umfPool, size, alignment);
if (*ppRetMem == nullptr) {
Expand All @@ -241,6 +242,43 @@ ur_result_t ur_usm_pool_handle_t_::free(void *ptr) {
}
}

// Attempts to satisfy an enqueued USM allocation from the async free-list of
// the matching pool. Returns the pointer together with the event that guards
// its previous free (which may be null), or std::nullopt when no suitable
// recycled allocation exists (or the request is malformed).
std::optional<std::pair<void *, ur_event_handle_t>>
ur_usm_pool_handle_t_::allocateEnqueued(ur_context_handle_t hContext,
                                        ur_queue_handle_t hQueue,
                                        ur_device_handle_t hDevice,
                                        const ur_usm_desc_t *pUSMDesc,
                                        ur_usm_type_t type, size_t size) {
  // Requested alignment must be a power of two (0 selects the default).
  const uint32_t alignment = pUSMDesc ? pUSMDesc->align : 0;
  const bool alignmentIsPow2 = (alignment & (alignment - 1)) == 0;
  if (!alignmentIsPow2) {
    return std::nullopt;
  }

  const auto deviceFlags = getDeviceFlags(pUSMDesc);
  const bool readOnly =
      bool(deviceFlags & UR_USM_DEVICE_MEM_FLAG_DEVICE_READ_ONLY);

  auto *pool =
      getPool(usm::pool_descriptor{this, hContext, hDevice, type, readOnly});
  if (!pool) {
    return std::nullopt;
  }

  // Pick a previously freed allocation that fits the size/alignment request.
  if (auto fit = pool->asyncPool.getBestFit(size, alignment, hQueue)) {
    return std::make_pair(fit->Ptr, fit->Event);
  }
  return std::nullopt;
}

void ur_usm_pool_handle_t_::cleanupPools() {
poolManager.forEachPool([&](UsmPool *p) { return p->asyncPool.cleanup(); });
}

// Drains only those async-pool entries that belong to the given queue.
void ur_usm_pool_handle_t_::cleanupPoolsForQueue(ur_queue_handle_t hQueue) {
  auto drainQueueEntries = [hQueue](UsmPool *pool) {
    return pool->asyncPool.cleanupForQueue(hQueue);
  };
  poolManager.forEachPool(drainQueueEntries);
}

namespace ur::level_zero {
ur_result_t urUSMPoolCreate(
/// [in] handle of the context object
Expand Down
Loading