diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_batched.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_batched.cpp index e127f13e5e6f5..1d410e16bdf50 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_batched.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_batched.cpp @@ -18,7 +18,7 @@ #include "kernel.hpp" #include "lockable.hpp" #include "memory.hpp" -#include "ur.hpp" +// #include "ur/ur.hpp" #include "../common/latency_tracker.hpp" #include "../helpers/kernel_helpers.hpp" @@ -34,6 +34,8 @@ namespace v2 { +// constexpr uint64_t initialSlotsForBatches = 10; + ur_queue_batched_t::ur_queue_batched_t( ur_context_handle_t hContext, ur_device_handle_t hDevice, uint32_t ordinal, ze_command_queue_priority_t priority, std::optional index, @@ -123,10 +125,10 @@ ur_result_t batch_manager::renewRegularUnlocked( ur_result_t ur_queue_batched_t::renewBatchUnlocked(locked &batchLocked) { if (batchLocked->isLimitOfUsedCommandListsReached()) { - UR_CALL(queueFinishUnlocked(batchLocked)); + return queueFinishUnlocked(batchLocked); + } else { + return batchLocked->renewRegularUnlocked(getNewRegularCmdList()); } - - return batchLocked->renewRegularUnlocked(getNewRegularCmdList()); } ur_result_t batch_manager::enqueueCurrentBatchUnlocked() { @@ -214,20 +216,26 @@ ur_result_t batch_manager::batchFinish() { UR_CALL(activeBatch.releaseSubmittedKernels()); - { + if (!isActiveBatchEmpty()) { TRACK_SCOPE_LATENCY("ur_queue_batched_t::resetRegCmdlist"); ZE2UR_CALL(zeCommandListReset, (activeBatch.getZeCommandList())); + + setBatchEmpty(); + regularGenerationNumber++; } runBatches.clear(); - setBatchEmpty(); + // regularGenerationNumber++; return UR_RESULT_SUCCESS; } ur_result_t ur_queue_batched_t::queueFinishUnlocked(locked &batchLocked) { - UR_CALL(batchLocked->enqueueCurrentBatchUnlocked()); + if (!batchLocked->isActiveBatchEmpty()) { + UR_CALL(batchLocked->enqueueCurrentBatchUnlocked()); + } + UR_CALL(batchLocked->hostSynchronize()); UR_CALL(queueFinishPoolsUnlocked()); @@ -1070,7 +1078,12 @@ ur_queue_batched_t::queueFlushUnlocked(locked &batchLocked) { ur_result_t ur_queue_batched_t::queueFlush() { auto batchLocked = currentCmdLists.lock(); - return queueFlushUnlocked(batchLocked); + + if (batchLocked->isActiveBatchEmpty()) { + return UR_RESULT_SUCCESS; + } else { + return queueFlushUnlocked(batchLocked); + } } } // namespace v2 diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_batched.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_batched.hpp index 882145cf22996..3b6c505caa582 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_batched.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_batched.hpp @@ -47,6 +47,14 @@ namespace v2 { +// The limit of regular command lists stored for execution; if exceeded, the +// vector is cleared as part of queueFinish and slots are renewed. +inline constexpr uint64_t initialSlotsForBatches = 10; + +// For the explanation of the purpose of generation numbers, see the comment for +// regularGenerationNumber below +inline constexpr ur_event_generation_t initialGenerationNumber = 0; + struct batch_manager { private: // The currently active regular command list, which may be replaced in the @@ -75,9 +83,6 @@ struct batch_manager { // associated with the event has already been submitted for execution and // additional submission of the current batch is not needed. ur_event_generation_t regularGenerationNumber; - // The limit of regular command lists stored for execution; if exceeded, the - // vector is cleared as part of queueFinish and slots are renewed. - static constexpr uint64_t initialSlotsForBatches = 10; // Whether any operation has been enqueued on the current batch bool isEmpty = true; @@ -91,7 +96,7 @@ struct batch_manager { immediateList(context, device, std::forward( commandListImmediate)), - regularGenerationNumber(0) { + regularGenerationNumber(initialGenerationNumber) { runBatches.reserve(initialSlotsForBatches); } diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_create.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_create.cpp index cc08532d2817f..80828deb12573 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_create.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_create.cpp @@ -76,6 +76,9 @@ ur_result_t urQueueCreate(ur_context_handle_t hContext, flags |= UR_QUEUE_FLAG_SUBMISSION_BATCHED; } + // For tests in CI + flags |= UR_QUEUE_FLAG_SUBMISSION_BATCHED; + auto zeIndex = v2::getZeIndex(pProperties); bool immediate = true; @@ -95,6 +98,8 @@ ur_result_t urQueueCreate(ur_context_handle_t hContext, "urQueueCreate called with both UR_QUEUE_FLAG_SUBMISSION_BATCHED " "and UR_QUEUE_FLAG_SUBMISSION_IMMEDIATE in ur_queue_flags_t. " "Defaulting to the immediate submission mode."); + + flags &= ~UR_QUEUE_FLAG_SUBMISSION_BATCHED; } immediate = true; diff --git a/unified-runtime/test/adapters/level_zero/enqueue_alloc.cpp b/unified-runtime/test/adapters/level_zero/enqueue_alloc.cpp index fa7c3ac55c3e8..838d117cee701 100644 --- a/unified-runtime/test/adapters/level_zero/enqueue_alloc.cpp +++ b/unified-runtime/test/adapters/level_zero/enqueue_alloc.cpp @@ -10,6 +10,7 @@ #include #include "ur_api.h" +#include "uur/utils.h" #include struct EnqueueAllocTestParam { @@ -81,6 +82,7 @@ struct urL0EnqueueAllocMultiQueueSameDeviceTest for (size_t i = 0; i < param.numQueues; i++) { ur_queue_handle_t queue = nullptr; ASSERT_SUCCESS(urQueueCreate(context, device, 0, &queue)); + SKIP_IF_BATCHED_QUEUE(queue); queues.push_back(queue); } } @@ -353,6 +355,10 @@ TEST_P(urL0EnqueueAllocMultiQueueSameDeviceTest, SuccessMt) { const auto checkUSMSupportFunc = std::get<1>(this->GetParam()).funcParams.checkUSMSupportFunc; + if (numQueues > 0) { + SKIP_IF_BATCHED_QUEUE(queues[0]);; + } + ur_device_usm_access_capability_flags_t USMSupport = 0; ASSERT_SUCCESS(checkUSMSupportFunc(device, USMSupport)); if (!(USMSupport & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) { diff --git a/unified-runtime/test/adapters/level_zero/v2/CMakeLists.txt b/unified-runtime/test/adapters/level_zero/v2/CMakeLists.txt index bc2d0b09665f2..b178cc862f56c 100644 --- a/unified-runtime/test/adapters/level_zero/v2/CMakeLists.txt +++ b/unified-runtime/test/adapters/level_zero/v2/CMakeLists.txt @@ -67,6 +67,22 @@ add_l0_v2_devices_test(memory_residency ${PROJECT_SOURCE_DIR}/source/adapters/level_zero/ur_level_zero.cpp ) +add_l0_v2_devices_test(batched_queue + batched_queue_test.cpp + ${PROJECT_SOURCE_DIR}/source/adapters/level_zero/adapter.cpp + ${PROJECT_SOURCE_DIR}/source/adapters/level_zero/common.cpp + ${PROJECT_SOURCE_DIR}/source/adapters/level_zero/device.cpp + ${PROJECT_SOURCE_DIR}/source/adapters/level_zero/platform.cpp + ${PROJECT_SOURCE_DIR}/source/adapters/level_zero/ur_level_zero.cpp + ${PROJECT_SOURCE_DIR}/source/adapters/level_zero/v2/event_pool_cache.cpp + ${PROJECT_SOURCE_DIR}/source/adapters/level_zero/v2/event_pool.cpp + ${PROJECT_SOURCE_DIR}/source/adapters/level_zero/v2/event_provider_counter.cpp + ${PROJECT_SOURCE_DIR}/source/adapters/level_zero/v2/event_provider_normal.cpp + ${PROJECT_SOURCE_DIR}/source/adapters/level_zero/v2/event.cpp + ${PROJECT_SOURCE_DIR}/source/ur/ur.cpp + ${PROJECT_SOURCE_DIR}/source/adapters/level_zero/v2/command_list_cache.cpp +) + if(NOT UR_FOUND_DPCXX) # Tests that require kernels can't be used if we aren't generating # device binaries diff --git a/unified-runtime/test/adapters/level_zero/v2/batched_queue_test.cpp b/unified-runtime/test/adapters/level_zero/v2/batched_queue_test.cpp new file mode 100644 index 0000000000000..e4477b3cf8d45 --- /dev/null +++ b/unified-runtime/test/adapters/level_zero/v2/batched_queue_test.cpp @@ -0,0 +1,456 @@ +// Copyright (C) 2025 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// RUN: %with-v2 ./batched_queue-test +// REQUIRES: v2 +#include "command_list_cache.hpp" + +#include "level_zero/common.hpp" +#include "level_zero/device.hpp" + +#include "../ze_helpers.hpp" +#include "context.hpp" +#include "event.hpp" +#include "event_pool.hpp" +#include "event_pool_cache.hpp" +#include "event_provider.hpp" +#include "event_provider_counter.hpp" +#include "event_provider_normal.hpp" +#include "queue_batched.hpp" +#include "queue_handle.hpp" +#include "ur_api.h" +#include "uur/checks.h" +#include "uur/fixtures.h" +#include "ze_api.h" + +#include "gtest/gtest.h" +#include +#include +#include +#include +#include + +const ur_dditable_t *ur::level_zero::ddi_getter::value() { + // Return a blank dditable + static ur_dditable_t table{}; + return &table; +}; + +// mock necessary functions from context, we can't pull in entire context +// implementation due to a lot of other dependencies +std::vector mockVec{}; +const std::vector & +ur_context_handle_t_::getDevices() const { + return mockVec; +} + +struct urBatchedQueueTest : uur::urContextTest { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(urContextTest::SetUp()); + + ASSERT_SUCCESS( + urQueueCreate(context, device, &batched_queue_properties, &queue1)); + ASSERT_NE(queue1, nullptr); + + ASSERT_SUCCESS( + urQueueCreate(context, device, &batched_queue_properties, &queue2)); + ASSERT_NE(queue2, nullptr); + + ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, + buffer_size, nullptr, &buffer)); + ASSERT_NE(buffer, nullptr); + } + + void TearDown() override { + if (buffer) { + ASSERT_SUCCESS(urMemRelease(buffer)); + } + + if (queue1) { + ASSERT_SUCCESS(urQueueRelease(queue1)); + } + + if (queue2) { + ASSERT_SUCCESS(urQueueRelease(queue2)); + } + + UUR_RETURN_ON_FATAL_FAILURE(urContextTest::TearDown()); + } + + void vectorOfSubmittedBatchesIsClearedHelper() { + std::vector data(buffer_size, 42); + // Initially, the vector of batches submitted for execution is empty. After every iteration, each queueFlush results in submitting the current batch for execution, then pushing the submitted batch to the vector of submitted batches. Therefore, after initialSlotsForBatches we reach the maximum arbitrary chosen capacity (initialSlotsForBatches) + for (uint64_t i = 0; i < v2::initialSlotsForBatches; i++) { + ASSERT_SUCCESS(urEnqueueMemBufferWrite( + queue1, buffer, /* isBlocking */ false, 0, buffer_size, data.data(), + 0, nullptr, nullptr)); + // A non-empty batch should be submitted for execution and renewed + ASSERT_SUCCESS(urQueueFlush(queue1)); + } + + // The maximum arbitrary set capacity is reached, but the vector is not cleared + ASSERT_EQ(context->getCommandListCache().getNumRegularCommandLists(), 0); + + std::vector output(buffer_size, 0); + ASSERT_SUCCESS(urEnqueueMemBufferRead(queue1, buffer, false, 0, buffer_size, + output.data(), 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFlush(queue1)); + + // When trying to push another submitted batch to the vector of submitted batches after reaching its arbitrary set capacity, the vector is cleared as a result of queueFinish. Submitted batches (regular command lists) are returned to the command list cache in their destructors. + ASSERT_EQ(context->getCommandListCache().getNumRegularCommandLists(), + v2::initialSlotsForBatches); + + for (size_t index = 0; index < buffer_size; index++) { + ASSERT_EQ(data[index], output[index]); + } + } + + ur_queue_properties_t batched_queue_properties = { + UR_STRUCTURE_TYPE_QUEUE_PROPERTIES, nullptr, + // 0}; + UR_QUEUE_FLAG_SUBMISSION_BATCHED}; + + ur_queue_handle_t queue1 = nullptr; + ur_queue_handle_t queue2 = nullptr; + + ur_mem_handle_t buffer = nullptr; + const size_t buffer_size = 1024; +}; + +UUR_INSTANTIATE_DEVICE_TEST_SUITE(urBatchedQueueTest); + +void isEventSubmitted(ur_event_handle_t &event) { + // submitted + ur_event_status_t status; + ASSERT_SUCCESS(urEventGetInfo(event, UR_EVENT_INFO_COMMAND_EXECUTION_STATUS, + sizeof(ur_event_status_t), &status, nullptr)); + ASSERT_EQ(status, UR_EVENT_STATUS_SUBMITTED); +} + +void isEventCompleted(ur_event_handle_t &event) { + ASSERT_EQ(zeEventHostSynchronize(event->getZeEvent(), UINT64_MAX), + ZE_RESULT_SUCCESS); +} +/* +event1 = enqueue kernel(q1, ...); +event2 = enqueue kernel(q2, waitlist = event1); // waits on event1 +urQueueFinish(q2); +urQueueFinish(q1); + +When q2 performs an operation O2 which must wait on events from q1, O2 would never be executed if the batch from q1 is not enqueued during wait_list_view construction. +*/ +TEST_P(urBatchedQueueTest, WaitForEventFromAnotherBatched) { + // Test for deadlocks + ur_event_handle_t event1 = nullptr; + std::vector data(buffer_size, 42); + ASSERT_SUCCESS(urEnqueueMemBufferWrite(queue1, buffer, /* blocking */ false, + 0, buffer_size, data.data(), 0, + nullptr, &event1)); + + std::vector output(buffer_size, 0); + ASSERT_SUCCESS(urEnqueueMemBufferRead(queue2, buffer, false, 0, buffer_size, + output.data(), 1, &event1, nullptr)); + + // Starting with queue2, which should wait for events from queue1 + ASSERT_SUCCESS(urQueueFinish(queue2)); + ASSERT_SUCCESS(urQueueFinish(queue1)); + + // Check if the operations have been actually performed + for (size_t index = 0; index < buffer_size; index++) { + ASSERT_EQ(data[index], output[index]); + } + + ASSERT_SUCCESS(urEventRelease(event1)); +} + +/* +event1 = enqueueSth(q1) +event2 = enqueueSth(q1) +enqueueSth(q2, event2) // submit the current batch from q1 +enqueueSth(q1, ..., getEvent) // access to the current batchNr from q1 +getEvent->getBatch > event2->getBatch + +enqueueSth(q2, event1) // already run in q1 +enqueueSth(q1, ..., getEvent2) // access to batchNr - check if the batch has been submitted for execution for the second time +getEvent2->getBatch == getEvent->getBatch + +event statuses in L0v2 are only UR_EVENT_STATUS_SUBMITTED and UR_EVENT_STATUS_COMPLETE +*/ +TEST_P(urBatchedQueueTest, RunBatchOnlyWhenNeededSimple) { + ur_event_handle_t event1 = nullptr; + std::vector data(buffer_size, 42); + ASSERT_SUCCESS(urEnqueueMemBufferWrite(queue1, buffer, /* blocking */ false, + 0, buffer_size, data.data(), 0, + nullptr, &event1)); + ASSERT_NE(event1->getBatch(), std::nullopt); + + ur_event_handle_t event2 = nullptr; + ASSERT_SUCCESS(urEnqueueMemBufferWrite(queue1, buffer, /* blocking */ false, + 0, buffer_size, data.data(), 0, + nullptr, &event2)); + ASSERT_NE(event2->getBatch(), std::nullopt); + // Events from the same batch + ASSERT_EQ(event1->getBatch(), event2->getBatch()); + + // Submit the current batch in queue1 for execution + std::vector output(buffer_size, 0); + ASSERT_SUCCESS(urEnqueueMemBufferRead(queue2, buffer, false, 0, buffer_size, + output.data(), 1, &event2, nullptr)); + + // Get access to the current batch number from q1 + ur_event_handle_t getEvent1 = nullptr; + ASSERT_SUCCESS(urEnqueueMemBufferWrite(queue1, buffer, /* blocking */ false, + 0, buffer_size, data.data(), 0, + nullptr, &getEvent1)); + ASSERT_NE(getEvent1->getBatch(), std::nullopt); + + ASSERT_EQ(getEvent1->getBatch().value(), event2->getBatch().value() + 1); + + // Event1 is from the batch from q1, which has been already submitted for execution + // The current batch from q1 should not have been submitted for execution + ASSERT_SUCCESS(urEnqueueMemBufferRead(queue2, buffer, false, 0, buffer_size, + output.data(), 1, &event1, nullptr)); + + // Get access to the batch number from q1 + ur_event_handle_t getEvent2 = nullptr; + ASSERT_SUCCESS(urEnqueueMemBufferWrite(queue1, buffer, /* blocking */ false, + 0, buffer_size, data.data(), 0, + nullptr, &getEvent2)); + ASSERT_NE(getEvent2->getBatch(), std::nullopt); + + // Events should be assigned to the same batch + ASSERT_EQ(getEvent1->getBatch(), getEvent2->getBatch()); + + ASSERT_SUCCESS(urQueueFinish(queue1)); + ASSERT_SUCCESS(urQueueFinish(queue2)); + + ASSERT_SUCCESS(urEventRelease(event1)); + ASSERT_SUCCESS(urEventRelease(event2)); + ASSERT_SUCCESS(urEventRelease(getEvent1)); + ASSERT_SUCCESS(urEventRelease(getEvent2)); +} + +TEST_P(urBatchedQueueTest, IncreaseGenerationNumberAfterQueueFinish) { + ur_event_handle_t event1 = nullptr; + std::vector data(buffer_size, 42); + ASSERT_SUCCESS(urEnqueueMemBufferWrite(queue1, buffer, /* isBlocking */ false, + 0, buffer_size, data.data(), 0, + nullptr, &event1)); + ASSERT_NE(event1->getBatch(), std::nullopt); + ASSERT_NO_FATAL_FAILURE(isEventSubmitted(event1)); + + ur_event_handle_t event2 = nullptr; + ASSERT_SUCCESS(urEnqueueMemBufferWrite( + queue1, buffer, /* blocking involves queueFinish */ true, 0, buffer_size, + data.data(), 0, nullptr, &event2)); + ASSERT_NE(event2->getBatch(), std::nullopt); + + ur_event_handle_t event3 = nullptr; + ASSERT_SUCCESS(urEnqueueMemBufferWrite(queue1, buffer, /* isBlocking */ false, + 0, buffer_size, data.data(), 0, + nullptr, &event3)); + ASSERT_NE(event3->getBatch(), std::nullopt); + + // Events from the same batch + ASSERT_EQ(event1->getBatch(), event2->getBatch()); + + ASSERT_EQ(event3->getBatch().value(), event2->getBatch().value() + 1); + + ASSERT_SUCCESS(urQueueFinish(queue1)); + + ASSERT_SUCCESS(urEventRelease(event1)); + ASSERT_SUCCESS(urEventRelease(event2)); + ASSERT_SUCCESS(urEventRelease(event3)); +} + +// enqueue cmdbuff (empty batch) + +// e1 = enqueue sth (q1) without finish +// e1 should be the initial generation number - the current batch is not run when empty + +// enqueue cmdbuf +// e2 = enqueue sth (q1) +// e2.generation_number should be e1.generation_number + 1 since cmdbuff was enqueued on non-empty batch + +// When a command buffer is enqueued, the current batch is closed and submitted for execution on an immediate command list, then the command buffer is submitted for execution on the same immediate command list as the current batch. After that, a new batch with an incremented generation number is opened for enqueueing operations. + +TEST_P(urBatchedQueueTest, RunBatchIfNeededCommandBuffer) { + ur_bool_t command_buffer_support = false; + ASSERT_SUCCESS(urDeviceGetInfo( + device, UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP, + sizeof(command_buffer_support), &command_buffer_support, nullptr)); + + if (!command_buffer_support) { + GTEST_SKIP() << "EXP command-buffer feature is not supported."; + } + + ur_exp_command_buffer_handle_t cmd_buf_handle = nullptr; + ur_exp_command_buffer_desc_t desc{UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, + nullptr, false, false, false}; + ASSERT_SUCCESS( + urCommandBufferCreateExp(context, device, &desc, &cmd_buf_handle)); + ASSERT_NE(cmd_buf_handle, nullptr); + + ur_mem_handle_t output = nullptr; + ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, buffer_size, + nullptr, &output)); + ASSERT_SUCCESS(urCommandBufferAppendMemBufferCopyExp( + cmd_buf_handle, buffer, output, 0, 0, buffer_size, 0, nullptr, 0, nullptr, + nullptr, nullptr, nullptr)); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(cmd_buf_handle)); + + // Command buffers are submitted to an immediate command list instead of the regular commnd list (the current batch) + ur_event_handle_t eventOnImmediate = nullptr; + ASSERT_SUCCESS(urEnqueueCommandBufferExp(queue1, cmd_buf_handle, 0, nullptr, + &eventOnImmediate)); + + // Operations enqueued on a regular command list must be submitted for execution before they are passed to the driver as events to wait on. Generation numbers assigned to events allow for determining whether the batch which includes an operation bound to the event has already been run: if the generation number of the event is lower than the generation number of the current batch, the batch assigned to the event has been submitted for execution. If the numbers are equal, the current batch should be submitted for execution - then, the operations enqueued on the current batch would be executed. Otherwise, the event would never be signalled. However, since command buffers are enqueued on immediate command lists, they are also submitted for execution immediately - in contrast to operations submitted on regular command lists: their execution would start only when a regular command list is enqueued directly on an immediate command list. Therefore, for events generated by submitting command buffers on batched queues, the generation number of the current batch is not tracked. + ASSERT_EQ(eventOnImmediate->getBatch(), std::nullopt); + + ur_event_handle_t eventAfterEnqueueCmdBuff = nullptr; + std::vector data(buffer_size, 42); + ASSERT_SUCCESS(urEnqueueMemBufferWrite( + queue1, buffer, /* blocking write involves queueFinish */ false, 0, + buffer_size, data.data(), 0, nullptr, &eventAfterEnqueueCmdBuff)); + ASSERT_NE(eventAfterEnqueueCmdBuff->getBatch(), std::nullopt); + + ASSERT_EQ(eventAfterEnqueueCmdBuff->getBatch(), v2::initialGenerationNumber); + + // Enqueue command buffer when the current batch is not empty + if (cmd_buf_handle) { + ASSERT_SUCCESS(urCommandBufferReleaseExp(cmd_buf_handle)); + } + + ASSERT_SUCCESS( + urCommandBufferCreateExp(context, device, &desc, &cmd_buf_handle)); + ASSERT_NE(cmd_buf_handle, nullptr); + + ASSERT_SUCCESS(urCommandBufferAppendMemBufferCopyExp( + cmd_buf_handle, buffer, output, 0, 0, buffer_size, 0, nullptr, 0, nullptr, + nullptr, nullptr, nullptr)); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(cmd_buf_handle)); + + ASSERT_SUCCESS( + urEnqueueCommandBufferExp(queue1, cmd_buf_handle, 0, nullptr, nullptr)); + + // ASSERT_NO_FATAL_FAILURE(isEventCompleted(eventAfterEnqueueCmdBuff)); + + ur_event_handle_t eventNonemptyBatch = nullptr; + std::vector output2(buffer_size, 0); + ASSERT_SUCCESS(urEnqueueMemBufferRead(queue1, output, false, 0, buffer_size, + output2.data(), 0, nullptr, + &eventNonemptyBatch)); + ASSERT_EQ(eventNonemptyBatch->getBatch(), v2::initialGenerationNumber + 1); + + urQueueFinish(queue1); + + if (cmd_buf_handle) { + ASSERT_SUCCESS(urCommandBufferReleaseExp(cmd_buf_handle)); + } + + if (output) { + ASSERT_SUCCESS(urMemRelease(output)); + } + + ASSERT_SUCCESS(urEventRelease(eventOnImmediate)); + ASSERT_SUCCESS(urEventRelease(eventAfterEnqueueCmdBuff)); + ASSERT_SUCCESS(urEventRelease(eventNonemptyBatch)); +} + +TEST_P(urBatchedQueueTest, RunBatchWhenNeededSameQueue) { + ur_event_handle_t event1 = nullptr; + std::vector data(buffer_size, 42); + ASSERT_SUCCESS(urEnqueueMemBufferWrite(queue1, buffer, /* isBlocking */ false, + 0, buffer_size, data.data(), 0, + nullptr, &event1)); + ASSERT_NE(event1->getBatch(), std::nullopt); + ASSERT_NO_FATAL_FAILURE(isEventSubmitted(event1)); + + ur_event_handle_t event2 = nullptr; + std::vector data2(buffer_size, 24); + ASSERT_SUCCESS(urEnqueueMemBufferWrite(queue1, buffer, /* isBlocking */ false, + 0, buffer_size, data2.data(), 0, + nullptr, &event2)); + ASSERT_NE(event2, nullptr); + ASSERT_NE(event2->getBatch(), std::nullopt); + ASSERT_NO_FATAL_FAILURE(isEventSubmitted(event2)); + // wait_list_view is constructed before passing arguments to command list manager functions. Therefore, if the batch from the current queue might have been submitted for execution, the generation number of the event passed as an argument for the enqueue command must have already been increased. However, there is no need to submit batches assigned to events from the same queue, since the operations are executed in-order: either from different consecutive batches or as part of the same batch. + ASSERT_EQ(event1->getBatch(), event2->getBatch()); + + ASSERT_SUCCESS(urQueueFinish(queue1)); + + ASSERT_SUCCESS(urEventRelease(event1)); + ASSERT_SUCCESS(urEventRelease(event2)); +} + +// Run batch only when it is non-empty +TEST_P(urBatchedQueueTest, RunBatchWhenNeededQueueFlush) { + // empty queue + ASSERT_SUCCESS(urQueueFlush(queue1)); + + ur_event_handle_t eventEmpty = nullptr; + std::vector data(buffer_size, 42); + ASSERT_SUCCESS(urEnqueueMemBufferWrite(queue1, buffer, /* isBlocking */ false, + 0, buffer_size, data.data(), 0, + nullptr, &eventEmpty)); + ASSERT_NE(eventEmpty->getBatch(), std::nullopt); + ASSERT_NO_FATAL_FAILURE(isEventSubmitted(eventEmpty)); + + // The batch should have not been run when empty + ASSERT_EQ(eventEmpty->getBatch().value(), v2::initialGenerationNumber); + + // A non-empty batch should have been sumitted for execution and renewed + // The generation number is increased + ASSERT_SUCCESS(urQueueFlush(queue1)); + + ur_event_handle_t eventNonEmpty = nullptr; + std::vector output(buffer_size, 0); + ASSERT_SUCCESS(urEnqueueMemBufferRead(queue1, buffer, false, 0, buffer_size, + output.data(), 0, nullptr, + &eventNonEmpty)); + + ASSERT_NE(eventNonEmpty->getBatch(), std::nullopt); + ASSERT_NO_FATAL_FAILURE(isEventSubmitted(eventNonEmpty)); + + // The batch should not have been run when empty + ASSERT_EQ(eventNonEmpty->getBatch().value(), v2::initialGenerationNumber + 1); + + ASSERT_SUCCESS(urQueueFinish(queue1)); + + for (size_t index = 0; index < buffer_size; index++) { + ASSERT_EQ(data[index], output[index]); + } + + ASSERT_SUCCESS(urEventRelease(eventEmpty)); + ASSERT_SUCCESS(urEventRelease(eventNonEmpty)); +} + +TEST_P(urBatchedQueueTest, VectorOfSubmittedBatchesIsClearedQueueFlush) { + ASSERT_NO_FATAL_FAILURE(vectorOfSubmittedBatchesIsClearedHelper()); +} + +TEST_P(urBatchedQueueTest, VectorOfSubmittedBatchesIsClearedQueueFinish) { + ASSERT_EQ(context->getCommandListCache().getNumRegularCommandLists(), 0); + + std::vector data(buffer_size, 42); + ASSERT_SUCCESS(urEnqueueMemBufferWrite(queue1, buffer, /* isBlocking */ false, + 0, buffer_size, data.data(), 0, + nullptr, nullptr)); + // A non-empty batch should be submitted for execution and renewed + ASSERT_SUCCESS(urQueueFlush(queue1)); + + // The vector of current batches is cleared + ASSERT_SUCCESS(urQueueFinish(queue1)); + + ASSERT_EQ(context->getCommandListCache().getNumRegularCommandLists(), 1); +} + +TEST_P(urBatchedQueueTest, ReuseCommandLists) { + int iterNum = 3; + for (int i = 0; i < iterNum; i++) { + ASSERT_NO_FATAL_FAILURE(vectorOfSubmittedBatchesIsClearedHelper()); + } +} \ No newline at end of file