diff --git a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp
index 3947c4866857f7..5a012f9a19719b 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp
@@ -411,8 +411,10 @@ void ov::npuw::IBaseInferRequest::unpack_closure(std::size_t idx, RqPtr request)
     std::vector<std::size_t> closure_unpack_required;
     std::vector<std::size_t> closure_copy_required;
 
-    for (std::size_t cidx = 0u; cidx < comp_model_desc.closure.size(); cidx++) {
-        auto& closure = comp_model_desc.closure[cidx];
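+    // NB: closure.get() blocks until the asynchronous weights evaluation started
+    // in finalize_weights_bank() has completed (see ov::npuw::util::Delayed).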
+    auto& desc_closure = comp_model_desc.closure.get().closure;
+
+    for (std::size_t cidx = 0u; cidx < desc_closure.size(); cidx++) {
+        auto& closure = desc_closure[cidx];
         const auto closure_param_id = comp_model_desc.param_base + cidx;
 
         if (m_npuw_model->is_gather_closure(idx, cidx)) {
@@ -440,7 +442,7 @@ void ov::npuw::IBaseInferRequest::unpack_closure(std::size_t idx, RqPtr request)
     // m_ms_unpack += ov::npuw::perf::ms_to_run([&](){
     ov::parallel_for(closure_copy_required.size(), [&](std::size_t j) {
         auto cidx = closure_copy_required[j];
-        auto& closure = comp_model_desc.closure[cidx];
+        auto& closure = desc_closure[cidx];
         const auto closure_param_id = comp_model_desc.param_base + cidx;
         auto& iport = func_desc.compiled_model->inputs()[closure_param_id];
         auto clparam = request->get_tensor(iport);
@@ -455,7 +457,7 @@ void ov::npuw::IBaseInferRequest::unpack_closure(std::size_t idx, RqPtr request)
         auto cidx = closure_unpack_required[j];
 
         // FIXME: zerops are stored with absolute indexing, this needs to be aligned
-        auto& closure = comp_model_desc.closure[cidx];
+        auto& closure = desc_closure[cidx];
 
         const auto closure_param_id = comp_model_desc.param_base + cidx;
         auto& iport = func_desc.compiled_model->inputs()[closure_param_id];
@@ -565,7 +567,8 @@ void ov::npuw::IBaseInferRequest::bind_global_params(std::size_t idx, RqPtr requ
         const auto& gport = comp_model_desc.compiled_model->inputs()[comp_model_desc.host_gather.dst_idx];
         const auto gather = request->get_tensor(gport);
 
-        const auto& vocab = comp_model_desc.closure[comp_model_desc.host_gather.src_idx - comp_model_desc.param_base];
+        const auto& vocab =
+            comp_model_desc.closure.get().closure[comp_model_desc.host_gather.src_idx - comp_model_desc.param_base];
 
         const auto& lport = comp_model_desc.compiled_model->inputs()[comp_model_desc.host_gather.idx_idx];
         const auto lookup = request->get_tensor(lport);
@@ -926,7 +929,7 @@ bool ov::npuw::IBaseInferRequest::needs_copy(std::size_t idx, std::size_t cidx)
         return false;
     }
     auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx];
-    if (comp_model_desc.is_remote[cidx]) {
+    if (comp_model_desc.closure.get().is_remote[cidx]) {
         // FIXME: Test if the tensor device and the request device are
         // the same or compatible!
         return false;
diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
index 08df11c19ba5ee..3e5e699eb9331b 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -330,6 +330,7 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr& model,
     // - dump the subgraphs, if necessary
     std::map compiledFunctions;
     m_compiled_submodels.resize(orderedSubgraphs.size());
+    const std::size_t end_sub_idx = orderedSubgraphs.size();
     const std::string dump_sub_opt = m_cfg.get<::intel_npu::NPUW_DUMP_SUBS>();
 
@@ -382,16 +383,18 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr& model,
             m_compiled_submodels[id].replaced_by = compiled_fcn_iter->second;
             LOG_INFO("Subgraph[" << id << "] is a function call to [" << compiled_fcn_iter->second << "]");
         }
+        auto& closure_desc = m_compiled_submodels[id].closure.get();
+
         m_compiled_submodels[id].host_gather = subgraph._host_gather;
         m_compiled_submodels[id].quant_unpack_gather = subgraph._quant_unpack_gather;
         m_compiled_submodels[id].param_base = fcn_template._param_offset;
-        m_compiled_submodels[id].closure = subgraph._closure;
+        closure_desc.closure = subgraph._closure;
         m_compiled_submodels[id].lazy_closure = subgraph._lazy_closure;
-        m_compiled_submodels[id].closure_uid.resize(m_compiled_submodels[id].closure.size(), -1);
+        closure_desc.closure_uid.resize(subgraph._closure.size(), -1);
         m_compiled_submodels[id].scales = subgraph._scales;
         m_compiled_submodels[id].zerops = subgraph._zerops;
         m_compiled_submodels[id].forced_to_fcall = subgraph._forced_to_fcall;
-        m_compiled_submodels[id].is_remote.resize(m_compiled_submodels[id].closure.size(), false);
+        closure_desc.is_remote.resize(subgraph._closure.size(), false);
     }  // if(!funcall)
 
     if (!m_compiled_submodels[id].model && !m_compiled_submodels[id].replaced_by) {
@@ -649,22 +652,24 @@ void ov::npuw::CompiledModel::CompiledModelDesc::serialize(std::ostream& stream,
     write(stream, spatial);
     write(stream, attention);
 
-    write(stream, is_remote);
-    write(stream, closure_uid);
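+    // get() blocks until the asynchronous weights evaluation has finished,
+    // so serialization never observes a partially populated closure.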
+    auto& closure_desc = closure.get();
+
+    write(stream, closure_desc.is_remote);
+    write(stream, closure_desc.closure_uid);
 
     if (ctx.is_weightless) {
         write_weightless(stream, scales, ctx);
         write_weightless(stream, zerops, ctx);
 
-        write(stream, closure.size());
+        write(stream, closure_desc.closure.size());
 
         std::vector<ov::Tensor> cpu_closures;
         std::vector<std::size_t> cpu_closure_ids;
         std::vector<ov::npuw::weights::LazyTensor> non_cpu_tensors;
         std::vector<std::size_t> non_cpu_tensors_ids;
-        for (std::size_t cidx = 0; cidx < closure.size(); ++cidx) {
-            if (closure_uid[cidx] == -1) {  // CPU closure
+        for (std::size_t cidx = 0; cidx < closure_desc.closure.size(); ++cidx) {
+            if (closure_desc.closure_uid[cidx] == -1) {  // CPU closure
                 cpu_closure_ids.push_back(cidx);
-                cpu_closures.push_back(closure[cidx]);
+                cpu_closures.push_back(closure_desc.closure[cidx]);
             } else {
                 non_cpu_tensors_ids.push_back(cidx);
                 non_cpu_tensors.push_back(lazy_closure[cidx]);  // must be there
@@ -679,13 +684,13 @@ void ov::npuw::CompiledModel::CompiledModelDesc::serialize(std::ostream& stream,
         write(stream, scales);
         write(stream, zerops);
 
-        write(stream, closure.size());
+        write(stream, closure_desc.closure.size());
 
         std::vector<ov::Tensor> cpu_closures;
         std::vector<std::size_t> cpu_closure_ids;
-        for (std::size_t cidx = 0; cidx < closure.size(); ++cidx) {
-            if (closure_uid[cidx] == -1) {  // CPU closure, not in the bank
+        for (std::size_t cidx = 0; cidx < closure_desc.closure.size(); ++cidx) {
+            if (closure_desc.closure_uid[cidx] == -1) {  // CPU closure, not in the bank
                 cpu_closure_ids.push_back(cidx);
-                cpu_closures.push_back(closure[cidx]);
+                cpu_closures.push_back(closure_desc.closure[cidx]);
             }
         }
@@ -724,8 +729,10 @@ void ov::npuw::CompiledModel::CompiledModelDesc::deserialize(std::istream& strea
     read(stream, spatial);
     read(stream, attention);
 
-    read(stream, is_remote);
-    read(stream, closure_uid);
+    auto& closure_desc = closure.get();
+
+    read(stream, closure_desc.is_remote);
+    read(stream, closure_desc.closure_uid);
 
     if (ctx.weights || !ctx.consts_cache.empty()) {
         read_weightless(stream, scales, ctx);
@@ -733,7 +740,7 @@ void ov::npuw::CompiledModel::CompiledModelDesc::deserialize(std::istream& strea
 
         std::size_t closure_size = 0;
         read(stream, closure_size);
-        closure.resize(closure_size);
+        closure_desc.closure.resize(closure_size);
         lazy_closure.resize(closure_size);
 
         std::vector<std::size_t> cpu_closure_ids;
@@ -743,7 +750,7 @@ void ov::npuw::CompiledModel::CompiledModelDesc::deserialize(std::istream& strea
         read_weightless(stream, cpu_closures, ctx);
         std::size_t tidx = 0;
         for (const auto& idx : cpu_closure_ids) {
-            closure[idx] = std::move(cpu_closures[tidx++]);
+            closure_desc.closure[idx] = std::move(cpu_closures[tidx++]);
         }
 
         std::vector<std::size_t> non_cpu_tensors_ids;
@@ -757,8 +764,9 @@ void ov::npuw::CompiledModel::CompiledModelDesc::deserialize(std::istream& strea
         }
 
         // Also read weights into LazyTensors
-        for (std::size_t cidx = 0; cidx < closure.size(); ++cidx) {
-            if (closure_uid[cidx] != -1 && lazy_closure[cidx]) {  // previously registered before serialization
+        for (std::size_t cidx = 0; cidx < closure_desc.closure.size(); ++cidx) {
+            if (closure_desc.closure_uid[cidx] != -1 &&
+                lazy_closure[cidx]) {  // previously registered before serialization
                 lazy_closure[cidx].read_weight(ctx);
             }
         }
@@ -770,15 +778,21 @@ void ov::npuw::CompiledModel::CompiledModelDesc::deserialize(std::istream& strea
         read(stream, closure_size);
         std::vector<std::size_t> cpu_closure_ids;
         read(stream, cpu_closure_ids);
-        closure.resize(closure_size);
+        closure_desc.closure.resize(closure_size);
         for (const auto& cidx : cpu_closure_ids) {
-            read(stream, closure[cidx]);
+            read(stream, closure_desc.closure[cidx]);
         }
     }
 
     LOG_DEBUG("DONE.");
 }
 
+ov::npuw::CompiledModel::~CompiledModel() {
+    if (m_eval_future.valid()) {
+        m_eval_future.wait();
+    }
+}
+
 void ov::npuw::CompiledModel::export_model(std::ostream& stream) const {
     using namespace ov::npuw::s11n;
 
@@ -912,7 +926,6 @@ std::shared_ptr ov::npuw::CompiledModel::import_model(
     if (is_weightless) {
         compiled->m_weights_bank = ov::npuw::weights::bank(bank_name, compiled->get_plugin()->get_core(), "");
         compiled->finalize_weights_bank();
-        compiled->m_import_weights_ctx.reset();
     } else {
         compiled->m_weights_bank =
             ov::npuw::weights::Bank::deserialize(model_stream, compiled->get_plugin()->get_core(), bank_name);
@@ -1013,10 +1026,13 @@ void ov::npuw::CompiledModel::serialize(std::ostream& stream, const ov::npuw::s1
     // Serialize compiled submodels
     write(model_stream, m_compiled_submodels.size());
 
-    for (const auto& subm : m_compiled_submodels) {
+    for (std::size_t i = 0; i < m_compiled_submodels.size(); ++i) {
+        auto& subm = m_compiled_submodels[i];
+        auto real_idx = subm.replaced_by.value_or(i);
         // Write device idx
-        std::size_t device_idx = subm.device_it - m_dev_list.begin();
-        write(model_stream, device_idx);
+        // FIXME: if there is no compiled submodel, device_it is not set.
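+        // Function-call entries may have no device_it of their own, so the real
+        // (function body) submodel is used for the lookup and a plain 0 is
+        // written for them as a placeholder.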
+        std::size_t device_idx = m_compiled_submodels[real_idx].device_it - m_dev_list.begin();
+        write(model_stream, real_idx == i ? device_idx : 0);
         // Write ICompiledModel if it's there
         if (subm.compiled_model) {
             write(model_stream, true);
@@ -1224,49 +1240,79 @@ void ov::npuw::CompiledModel::reconstruct_closure() {
         const auto real_idx = comp_model_desc.replaced_by.value_or(idx);
         auto& func_desc = m_compiled_submodels[real_idx];
+        auto& desc_closure = comp_model_desc.closure.get();
 
-        for (std::size_t cidx = 0; cidx < comp_model_desc.closure.size(); ++cidx) {
-            if (comp_model_desc.closure[cidx]) {
+        for (std::size_t cidx = 0; cidx < desc_closure.closure.size(); ++cidx) {
+            if (desc_closure.closure[cidx]) {
                 // host-side closure - already set, do nothing
-                NPUW_ASSERT(!comp_model_desc.is_remote[cidx]);
+                NPUW_ASSERT(!desc_closure.is_remote[cidx]);
                 continue;
             }
-            NPUW_ASSERT(comp_model_desc.closure_uid[cidx] != -1);
-            comp_model_desc.closure[cidx] =
-                m_weights_bank->get(comp_model_desc.closure_uid[cidx], *func_desc.device_it);
+            NPUW_ASSERT(desc_closure.closure_uid[cidx] != -1);
+            desc_closure.closure[cidx] = m_weights_bank->get(desc_closure.closure_uid[cidx], *func_desc.device_it);
         }
     }
 }
 
 void ov::npuw::CompiledModel::finalize_weights_bank() {
     LOG_INFO("Finalizing weights bank...");
-    // Register lazy tensors
-    for (std::size_t idx = 0; idx < m_compiled_submodels.size(); ++idx) {
-        auto& comp_model_desc = m_compiled_submodels[idx];
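+    // Evaluate weights on a separate thread. Inside this task the closures are
+    // filled via unsafe_get(); every other consumer goes through Delayed::get()
+    // and blocks on the shared future published below via set_future().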
+    std::shared_future<void> weights_bank_evaluation = std::async(std::launch::async, [&]() {
+        // Register lazy tensors
+        for (std::size_t idx = 0; idx < m_compiled_submodels.size(); ++idx) {
+            auto& comp_model_desc = m_compiled_submodels[idx];
 
-        // Skip optimized out and non-functions
-        if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) {
-            continue;
-        }
+            // Skip optimized out and non-functions
+            if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) {
+                continue;
+            }
 
-        const auto real_idx = comp_model_desc.replaced_by.value_or(idx);
-        auto& func_desc = m_compiled_submodels[real_idx];
+            const auto real_idx = comp_model_desc.replaced_by.value_or(idx);
+            auto& func_desc = m_compiled_submodels[real_idx];
 
-        for (std::size_t tidx = 0; tidx < comp_model_desc.lazy_closure.size(); ++tidx) {
-            if (comp_model_desc.closure[tidx]) {
-                continue;  // host-side closure
+            for (std::size_t tidx = 0; tidx < comp_model_desc.lazy_closure.size(); ++tidx) {
+                if (comp_model_desc.closure.unsafe_get().closure[tidx]) {
+                    continue;  // host-side closure
+                }
+                comp_model_desc.closure.unsafe_get().closure_uid[tidx] =
+                    m_weights_bank->registerLT(comp_model_desc.lazy_closure[tidx], *func_desc.device_it);
             }
-            comp_model_desc.closure_uid[tidx] =
-                m_weights_bank->registerLT(comp_model_desc.lazy_closure[tidx], *func_desc.device_it);
         }
-    }
 
-    // Evaluate and allocate all LazyTensors inside the bank
-    m_profile["weights bank"].record([&]() {
+        // Evaluate and allocate all LazyTensors inside the bank
         m_weights_bank->evaluate_and_allocate();
+
+        // Set evaluated and allocated ov::Tensors to closures
+        for (size_t idx = 0; idx < m_compiled_submodels.size(); ++idx) {
+            auto& comp_model_desc = m_compiled_submodels[idx];
+
+            // Skip optimized out and non-functions
+            if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) {
+                continue;
+            }
+
+            const auto real_idx = comp_model_desc.replaced_by.value_or(idx);
+            auto& func_desc = m_compiled_submodels[real_idx];
+            auto& desc_closure = comp_model_desc.closure.unsafe_get();
+
+            for (std::size_t tidx = 0; tidx < desc_closure.closure.size(); ++tidx) {
+                if (desc_closure.closure[tidx]) {
+                    // host-side closure - already set, do nothing
+                    desc_closure.is_remote[tidx] = false;
+                    continue;
+                }
+                const auto& uid = desc_closure.closure_uid[tidx];
+                NPUW_ASSERT(uid != -1);  // All tensors should be registered at this point
+                desc_closure.closure[tidx] = m_weights_bank->get(uid, *func_desc.device_it);
+                // FIXME: find a more reliable way to do so
+                desc_closure.is_remote[tidx] = m_weights_bank->is_remote(uid);
+            }
+        }
+
+        m_import_weights_ctx.reset();
     });
 
-    // Set evaluated and allocated ov::Tensors to closures
+    m_eval_future = weights_bank_evaluation;
+
     for (size_t idx = 0; idx < m_compiled_submodels.size(); ++idx) {
         auto& comp_model_desc = m_compiled_submodels[idx];
 
@@ -1275,21 +1321,7 @@ void ov::npuw::CompiledModel::finalize_weights_bank() {
             continue;
         }
 
-        const auto real_idx = comp_model_desc.replaced_by.value_or(idx);
-        auto& func_desc = m_compiled_submodels[real_idx];
-
-        for (std::size_t tidx = 0; tidx < comp_model_desc.closure.size(); ++tidx) {
-            if (comp_model_desc.closure[tidx]) {
-                // host-side closure - already set, do nothing
-                comp_model_desc.is_remote[tidx] = false;
-                continue;
-            }
-            const auto& uid = comp_model_desc.closure_uid[tidx];
-            NPUW_ASSERT(uid != -1);  // All tensors should be registered at this point
-            comp_model_desc.closure[tidx] = m_weights_bank->get(uid, *func_desc.device_it);
-            // FIXME: find a more reliable way to do so
-            comp_model_desc.is_remote[tidx] = m_weights_bank->is_remote(uid);
-        }
+        comp_model_desc.closure.set_future(weights_bank_evaluation);
     }
 
     LOG_INFO("Done.");
@@ -1654,7 +1686,7 @@ std::string ov::npuw::CompiledModel::submodel_device(const std::size_t idx) cons
 bool ov::npuw::CompiledModel::unpack_required(const std::size_t idx) const {
     auto& comp_model_desc = m_compiled_submodels.at(idx);
-    for (std::size_t cidx = 0u; cidx < comp_model_desc.closure.size(); cidx++) {
+    for (std::size_t cidx = 0u; cidx < comp_model_desc.closure.get().closure.size(); cidx++) {
         if (unpack_required(idx, cidx)) {
             return true;
         }
@@ -1671,7 +1703,7 @@ bool ov::npuw::CompiledModel::unpack_required(const std::size_t idx, const std::
     const auto real_idx = comp_model_desc.replaced_by.value();
     auto& func_desc = m_compiled_submodels.at(real_idx);
 
-    auto& closure = comp_model_desc.closure.at(cidx);
+    auto& closure = comp_model_desc.closure.get().closure.at(cidx);
     const auto closure_param_id = comp_model_desc.param_base + cidx;
     auto& iport = func_desc.compiled_model->inputs()[closure_param_id];
diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
index 81af4c7b276a1a..de81713b2f05c0 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
@@ -61,6 +61,9 @@ class CompiledModel : public ov::npuw::ICompiledModel {
 
     std::shared_ptr<ov::IAsyncInferRequest> create_infer_request() const override;
 
+    // Custom destructor to wait for the Delayed closure evaluation to finish
+    ~CompiledModel();
+
 private:
     // FIXME: This class has many friends..
     friend class IBaseInferRequest;
@@ -178,15 +181,26 @@ class CompiledModel : public ov::npuw::ICompiledModel {
         // FIXME: This is a 1:1 copy of the ov::npuw::Subgraph structure
         // w.r.t. function calls
         std::size_t param_base = 0;
+
+        struct Closure {
+            std::vector<ov::Tensor> closure;
+            std::vector<int64_t> closure_uid;  // Note: value -1 is considered uninitialized
+            std::vector<bool> is_remote;
+        };
+
+        // Need to wrap closure, since finalize_weights_bank() will
+        // asynchronously evaluate weights and put them in closure.
+        // Other functions of CompiledModel as well as InferRequest and
+        // other entities need to wait for the closure to be populated first
+        // (meaning to wait for the async weights processing to end).
+        ov::npuw::util::Delayed<Closure> closure;
+
         // NB: closure and lazy_closure are of the same size - to preserve proper indexing.
         // closure is responsible for host-side tensors (DCOFF, Gather, etc) while
         // lazy_closure is used for weights sharing and allocating device memory.
-        std::vector<ov::Tensor> closure;
         std::vector<weights::LazyTensor> lazy_closure;
-        std::vector<int64_t> closure_uid;  // Note: value -1 is considered uninitialized
         std::vector<ov::Tensor> scales;
         std::vector<ov::Tensor> zerops;
-        std::vector<bool> is_remote;
 
         bool forced_to_fcall = false;
 
@@ -212,6 +226,8 @@ class CompiledModel : public ov::npuw::ICompiledModel {
     std::unordered_map m_const_to_offset;
     ov::npuw::s11n::BF16Cache m_bf16_consts;
     ov::npuw::s11n::WeightsContext m_import_weights_ctx;
+
+    std::shared_future<void> m_eval_future;
 };
 
 }  // namespace npuw
 }  // namespace ov
diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
index 7f62c3ef0abd52..13de38da1ff8dc 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
@@ -1702,15 +1702,12 @@ std::shared_ptr ov::npuw::LLMCompiledModel::import_m
         compiled->m_prefill_compiled->m_weights_bank = bank;
 
         compiled->m_kvcache_compiled->finalize_weights_bank();
-        compiled->m_kvcache_compiled->m_import_weights_ctx.reset();
         compiled->m_prefill_compiled->finalize_weights_bank();
-        compiled->m_prefill_compiled->m_import_weights_ctx.reset();
 
         if (compiled->m_lm_head_compiled) {
             compiled->m_lm_head_compiled->m_weights_bank = bank;
             compiled->m_lm_head_compiled->finalize_weights_bank();
-            compiled->m_lm_head_compiled->m_import_weights_ctx.reset();
         }
     } else {
         auto bank =
diff --git a/src/plugins/intel_npu/src/plugin/npuw/serialization.cpp b/src/plugins/intel_npu/src/plugin/npuw/serialization.cpp
index 228abbfcb86988..7322ed67b3c526 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/serialization.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/serialization.cpp
@@ -265,7 +265,9 @@ void ov::npuw::s11n::read(std::istream& stream, std::shared_ptr
     var = std::make_shared<ov::op::v0::Parameter>(ov::element::Type(elem_type_str), ov::PartialShape(part_shape_str));
-    var->set_friendly_name(*names.begin());  // FIXME: any_name ?
+    if (!names.empty()) {
+        var->set_friendly_name(*names.begin());  // FIXME: any_name ?
+    }
     var->output(0).get_tensor().set_names(names);
 }
 
@@ -286,7 +288,9 @@ void ov::npuw::s11n::read(std::istream& stream, std::shared_ptr& var)
                       names);
     var = std::make_shared<ov::op::v0::Result>(res);
     var->output(0).set_tensor_ptr(tensor_dummy);
-    var->set_friendly_name(*names.begin());  // any_name ?
+    if (!names.empty()) {
+        var->set_friendly_name(*names.begin());  // any_name ?
+    }
 }
 
 void ov::npuw::s11n::read_any(std::istream& stream, ov::Any& var) {
diff --git a/src/plugins/intel_npu/src/plugin/npuw/util.hpp b/src/plugins/intel_npu/src/plugin/npuw/util.hpp
index edbacfbdbc8ba1..dac90f0aa20567 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/util.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/util.hpp
@@ -4,6 +4,7 @@
 #pragma once
 
+#include <future>
 #include
 #include
 
@@ -201,6 +202,47 @@ typename std::underlying_type::type _v(T&& t) {
     return static_cast<typename std::underlying_type<T>::type>(t);
 }
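+
+// Delayed<T> defers access to a value that is produced asynchronously on another
+// thread. A rough usage sketch (the names below are illustrative only):
+//
+//     Delayed<Closure> closure;
+//     std::shared_future<void> done = std::async(std::launch::async, [&] {
+//         populate(closure.unsafe_get());  // producer side, future not published yet
+//     });
+//     closure.set_future(done);            // publish the future to the readers
+//     closure.get();                       // blocks until the producer has finished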
+ {"CACHE_MODE", "OPTIMIZE_SPEED"} + }; + + // Run stress test to check for data race + for (size_t i = 0; i < 10; ++i) { + // Compile NPUW + auto compiled1 = ov_core.compile_model(model1, device, config); + auto compiled2 = ov_core.compile_model(model2, device, config); + auto compiled3 = ov_core.compile_model(model3, device, config); + auto compiled4 = ov_core.compile_model(model4, device, config); + + // Create infer request and infer + auto request1 = compiled1.create_infer_request(); + request1.infer(); + auto request2 = compiled2.create_infer_request(); + request2.infer(); + auto request3 = compiled3.create_infer_request(); + request3.infer(); + auto request4 = compiled4.create_infer_request(); + request4.infer(); + + std::vector ss(4); + compiled1.export_model(ss[0]); + compiled2.export_model(ss[1]); + compiled3.export_model(ss[2]); + compiled4.export_model(ss[3]); + + std::vector imported(4); + ov::parallel_for(4, [&](size_t idx) { + imported[idx] = ov_core.import_model(ss[idx], "NPU"); + }); + + for (auto& m : imported) { + auto r = m.create_infer_request(); + r.infer(); + } + } +} + // TODO: add tests on CompiledModel and LLMCompiledModel once tests have access to any model to test on