diff --git a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp
index 3947c4866857f7..5a012f9a19719b 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp
@@ -411,8 +411,10 @@ void ov::npuw::IBaseInferRequest::unpack_closure(std::size_t idx, RqPtr request)
     std::vector<std::size_t> closure_unpack_required;
     std::vector<std::size_t> closure_copy_required;
 
-    for (std::size_t cidx = 0u; cidx < comp_model_desc.closure.size(); cidx++) {
-        auto& closure = comp_model_desc.closure[cidx];
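+    // NB: closure.get() blocks until the asynchronous weights evaluation started
+    // in finalize_weights_bank() has completed (see ov::npuw::util::Delayed).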
+    auto& desc_closure = comp_model_desc.closure.get().closure;
+
+    for (std::size_t cidx = 0u; cidx < desc_closure.size(); cidx++) {
+        auto& closure = desc_closure[cidx];
         const auto closure_param_id = comp_model_desc.param_base + cidx;
 
         if (m_npuw_model->is_gather_closure(idx, cidx)) {
@@ -440,7 +442,7 @@ void ov::npuw::IBaseInferRequest::unpack_closure(std::size_t idx, RqPtr request)
     // m_ms_unpack += ov::npuw::perf::ms_to_run([&](){
     ov::parallel_for(closure_copy_required.size(), [&](std::size_t j) {
         auto cidx = closure_copy_required[j];
-        auto& closure = comp_model_desc.closure[cidx];
+        auto& closure = desc_closure[cidx];
         const auto closure_param_id = comp_model_desc.param_base + cidx;
         auto& iport = func_desc.compiled_model->inputs()[closure_param_id];
         auto clparam = request->get_tensor(iport);
@@ -455,7 +457,7 @@ void ov::npuw::IBaseInferRequest::unpack_closure(std::size_t idx, RqPtr request)
         auto cidx = closure_unpack_required[j];
 
         // FIXME: zerops are stored with absolute indexing, this needs to be aligned
-        auto& closure = comp_model_desc.closure[cidx];
+        auto& closure = desc_closure[cidx];
 
         const auto closure_param_id = comp_model_desc.param_base + cidx;
         auto& iport = func_desc.compiled_model->inputs()[closure_param_id];
@@ -565,7 +567,8 @@ void ov::npuw::IBaseInferRequest::bind_global_params(std::size_t idx, RqPtr requ
         const auto& gport = comp_model_desc.compiled_model->inputs()[comp_model_desc.host_gather.dst_idx];
         const auto gather = request->get_tensor(gport);
 
-        const auto& vocab = comp_model_desc.closure[comp_model_desc.host_gather.src_idx - comp_model_desc.param_base];
+        const auto& vocab =
+            comp_model_desc.closure.get().closure[comp_model_desc.host_gather.src_idx - comp_model_desc.param_base];
 
         const auto& lport = comp_model_desc.compiled_model->inputs()[comp_model_desc.host_gather.idx_idx];
         const auto lookup = request->get_tensor(lport);
@@ -926,7 +929,7 @@ bool ov::npuw::IBaseInferRequest::needs_copy(std::size_t idx, std::size_t cidx)
         return false;
     }
     auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx];
-    if (comp_model_desc.is_remote[cidx]) {
+    if (comp_model_desc.closure.get().is_remote[cidx]) {
         // FIXME: Test if the tensor device and the request device are
         // the same or compatible!
         return false;
diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
index 08df11c19ba5ee..3e5e699eb9331b 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -330,6 +330,7 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr& model,
     // - dump the subgraphs, if necessary
     std::map compiledFunctions;
     m_compiled_submodels.resize(orderedSubgraphs.size());
+    const std::size_t end_sub_idx = orderedSubgraphs.size();
     const std::string dump_sub_opt = m_cfg.get<::intel_npu::NPUW_DUMP_SUBS>();
 
@@ -382,16 +383,18 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr& model,
             m_compiled_submodels[id].replaced_by = compiled_fcn_iter->second;
             LOG_INFO("Subgraph[" << id << "] is a function call to [" << compiled_fcn_iter->second << "]");
         }
+        auto& closure_desc = m_compiled_submodels[id].closure.get();
+
         m_compiled_submodels[id].host_gather = subgraph._host_gather;
         m_compiled_submodels[id].quant_unpack_gather = subgraph._quant_unpack_gather;
         m_compiled_submodels[id].param_base = fcn_template._param_offset;
-        m_compiled_submodels[id].closure = subgraph._closure;
+        closure_desc.closure = subgraph._closure;
         m_compiled_submodels[id].lazy_closure = subgraph._lazy_closure;
-        m_compiled_submodels[id].closure_uid.resize(m_compiled_submodels[id].closure.size(), -1);
+        closure_desc.closure_uid.resize(subgraph._closure.size(), -1);
         m_compiled_submodels[id].scales = subgraph._scales;
         m_compiled_submodels[id].zerops = subgraph._zerops;
         m_compiled_submodels[id].forced_to_fcall = subgraph._forced_to_fcall;
-        m_compiled_submodels[id].is_remote.resize(m_compiled_submodels[id].closure.size(), false);
+        closure_desc.is_remote.resize(subgraph._closure.size(), false);
     }  // if(!funcall)
 
     if (!m_compiled_submodels[id].model && !m_compiled_submodels[id].replaced_by) {
@@ -649,22 +652,24 @@ void ov::npuw::CompiledModel::CompiledModelDesc::serialize(std::ostream& stream,
     write(stream, spatial);
     write(stream, attention);
 
-    write(stream, is_remote);
-    write(stream, closure_uid);
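+    // get() blocks until the asynchronous weights evaluation has finished,
+    // so serialization never observes a partially populated closure.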
+    auto& closure_desc = closure.get();
+
+    write(stream, closure_desc.is_remote);
+    write(stream, closure_desc.closure_uid);
 
     if (ctx.is_weightless) {
         write_weightless(stream, scales, ctx);
         write_weightless(stream, zerops, ctx);
 
-        write(stream, closure.size());
+        write(stream, closure_desc.closure.size());
 
         std::vector<ov::Tensor> cpu_closures;
         std::vector<std::size_t> cpu_closure_ids;
         std::vector<ov::npuw::weights::LazyTensor> non_cpu_tensors;
         std::vector<std::size_t> non_cpu_tensors_ids;
-        for (std::size_t cidx = 0; cidx < closure.size(); ++cidx) {
-            if (closure_uid[cidx] == -1) {  // CPU closure
+        for (std::size_t cidx = 0; cidx < closure_desc.closure.size(); ++cidx) {
+            if (closure_desc.closure_uid[cidx] == -1) {  // CPU closure
                 cpu_closure_ids.push_back(cidx);
-                cpu_closures.push_back(closure[cidx]);
+                cpu_closures.push_back(closure_desc.closure[cidx]);
             } else {
                 non_cpu_tensors_ids.push_back(cidx);
                 non_cpu_tensors.push_back(lazy_closure[cidx]);  // must be there
@@ -679,13 +684,13 @@ void ov::npuw::CompiledModel::CompiledModelDesc::serialize(std::ostream& stream,
         write(stream, scales);
         write(stream, zerops);
 
-        write(stream, closure.size());
+        write(stream, closure_desc.closure.size());
 
         std::vector<ov::Tensor> cpu_closures;
         std::vector<std::size_t> cpu_closure_ids;
-        for (std::size_t cidx = 0; cidx < closure.size(); ++cidx) {
-            if (closure_uid[cidx] == -1) {  // CPU closure, not in the bank
+        for (std::size_t cidx = 0; cidx < closure_desc.closure.size(); ++cidx) {
+            if (closure_desc.closure_uid[cidx] == -1) {  // CPU closure, not in the bank
                 cpu_closure_ids.push_back(cidx);
-                cpu_closures.push_back(closure[cidx]);
+                cpu_closures.push_back(closure_desc.closure[cidx]);
             }
         }
@@ -724,8 +729,10 @@ void ov::npuw::CompiledModel::CompiledModelDesc::deserialize(std::istream& strea
     read(stream, spatial);
     read(stream, attention);
 
-    read(stream, is_remote);
-    read(stream, closure_uid);
+    auto& closure_desc = closure.get();
+
+    read(stream, closure_desc.is_remote);
+    read(stream, closure_desc.closure_uid);
 
     if (ctx.weights || !ctx.consts_cache.empty()) {
         read_weightless(stream, scales, ctx);
@@ -733,7 +740,7 @@ void ov::npuw::CompiledModel::CompiledModelDesc::deserialize(std::istream& strea
 
         std::size_t closure_size = 0;
         read(stream, closure_size);
-        closure.resize(closure_size);
+        closure_desc.closure.resize(closure_size);
         lazy_closure.resize(closure_size);
 
         std::vector<std::size_t> cpu_closure_ids;
@@ -743,7 +750,7 @@ void ov::npuw::CompiledModel::CompiledModelDesc::deserialize(std::istream& strea
         read_weightless(stream, cpu_closures, ctx);
         std::size_t tidx = 0;
         for (const auto& idx : cpu_closure_ids) {
-            closure[idx] = std::move(cpu_closures[tidx++]);
+            closure_desc.closure[idx] = std::move(cpu_closures[tidx++]);
         }
 
         std::vector<std::size_t> non_cpu_tensors_ids;
@@ -757,8 +764,9 @@ void ov::npuw::CompiledModel::CompiledModelDesc::deserialize(std::istream& strea
         }
 
         // Also read weights into LazyTensors
-        for (std::size_t cidx = 0; cidx < closure.size(); ++cidx) {
-            if (closure_uid[cidx] != -1 && lazy_closure[cidx]) {  // previously registered before serialization
+        for (std::size_t cidx = 0; cidx < closure_desc.closure.size(); ++cidx) {
+            if (closure_desc.closure_uid[cidx] != -1 &&
+                lazy_closure[cidx]) {  // previously registered before serialization
                 lazy_closure[cidx].read_weight(ctx);
             }
         }
@@ -770,15 +778,21 @@ void ov::npuw::CompiledModel::CompiledModelDesc::deserialize(std::istream& strea
         read(stream, closure_size);
         std::vector<std::size_t> cpu_closure_ids;
         read(stream, cpu_closure_ids);
-        closure.resize(closure_size);
+        closure_desc.closure.resize(closure_size);
         for (const auto& cidx : cpu_closure_ids) {
-            read(stream, closure[cidx]);
+            read(stream, closure_desc.closure[cidx]);
         }
     }
 
     LOG_DEBUG("DONE.");
 }
 
+ov::npuw::CompiledModel::~CompiledModel() {
+    if (m_eval_future.valid()) {
+        m_eval_future.wait();
+    }
+}
+
 void ov::npuw::CompiledModel::export_model(std::ostream& stream) const {
     using namespace ov::npuw::s11n;
 
@@ -912,7 +926,6 @@ std::shared_ptr ov::npuw::CompiledModel::import_model(
     if (is_weightless) {
         compiled->m_weights_bank = ov::npuw::weights::bank(bank_name, compiled->get_plugin()->get_core(), "");
         compiled->finalize_weights_bank();
-        compiled->m_import_weights_ctx.reset();
     } else {
         compiled->m_weights_bank =
             ov::npuw::weights::Bank::deserialize(model_stream, compiled->get_plugin()->get_core(), bank_name);
@@ -1013,10 +1026,13 @@ void ov::npuw::CompiledModel::serialize(std::ostream& stream, const ov::npuw::s1
     // Serialize compiled submodels
     write(model_stream, m_compiled_submodels.size());
 
-    for (const auto& subm : m_compiled_submodels) {
+    for (std::size_t i = 0; i < m_compiled_submodels.size(); ++i) {
+        auto& subm = m_compiled_submodels[i];
+        auto real_idx = subm.replaced_by.value_or(i);
         // Write device idx
-        std::size_t device_idx = subm.device_it - m_dev_list.begin();
-        write(model_stream, device_idx);
+        // FIXME: if there is no compiled submodel, device_it is not set.
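+        // Function-call entries may have no device_it of their own, so the real
+        // (function body) submodel is used for the lookup and a plain 0 is
+        // written for them as a placeholder.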
+        std::size_t device_idx = m_compiled_submodels[real_idx].device_it - m_dev_list.begin();
+        write(model_stream, real_idx == i ? device_idx : 0);
         // Write ICompiledModel if it's there
         if (subm.compiled_model) {
             write(model_stream, true);
@@ -1224,49 +1240,79 @@ void ov::npuw::CompiledModel::reconstruct_closure() {
         const auto real_idx = comp_model_desc.replaced_by.value_or(idx);
         auto& func_desc = m_compiled_submodels[real_idx];
+        auto& desc_closure = comp_model_desc.closure.get();
 
-        for (std::size_t cidx = 0; cidx < comp_model_desc.closure.size(); ++cidx) {
-            if (comp_model_desc.closure[cidx]) {
+        for (std::size_t cidx = 0; cidx < desc_closure.closure.size(); ++cidx) {
+            if (desc_closure.closure[cidx]) {
                 // host-side closure - already set, do nothing
-                NPUW_ASSERT(!comp_model_desc.is_remote[cidx]);
+                NPUW_ASSERT(!desc_closure.is_remote[cidx]);
                 continue;
             }
-            NPUW_ASSERT(comp_model_desc.closure_uid[cidx] != -1);
-            comp_model_desc.closure[cidx] =
-                m_weights_bank->get(comp_model_desc.closure_uid[cidx], *func_desc.device_it);
+            NPUW_ASSERT(desc_closure.closure_uid[cidx] != -1);
+            desc_closure.closure[cidx] = m_weights_bank->get(desc_closure.closure_uid[cidx], *func_desc.device_it);
         }
     }
 }
 
 void ov::npuw::CompiledModel::finalize_weights_bank() {
     LOG_INFO("Finalizing weights bank...");
-    // Register lazy tensors
-    for (std::size_t idx = 0; idx < m_compiled_submodels.size(); ++idx) {
-        auto& comp_model_desc = m_compiled_submodels[idx];
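+    // Evaluate weights on a separate thread. Inside this task the closures are
+    // filled via unsafe_get(); every other consumer goes through Delayed::get()
+    // and blocks on the shared future published below via set_future().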
+    std::shared_future<void> weights_bank_evaluation = std::async(std::launch::async, [&]() {
+        // Register lazy tensors
+        for (std::size_t idx = 0; idx < m_compiled_submodels.size(); ++idx) {
+            auto& comp_model_desc = m_compiled_submodels[idx];
 
-        // Skip optimized out and non-functions
-        if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) {
-            continue;
-        }
+            // Skip optimized out and non-functions
+            if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) {
+                continue;
+            }
 
-        const auto real_idx = comp_model_desc.replaced_by.value_or(idx);
-        auto& func_desc = m_compiled_submodels[real_idx];
+            const auto real_idx = comp_model_desc.replaced_by.value_or(idx);
+            auto& func_desc = m_compiled_submodels[real_idx];
 
-        for (std::size_t tidx = 0; tidx < comp_model_desc.lazy_closure.size(); ++tidx) {
-            if (comp_model_desc.closure[tidx]) {
-                continue;  // host-side closure
+            for (std::size_t tidx = 0; tidx < comp_model_desc.lazy_closure.size(); ++tidx) {
+                if (comp_model_desc.closure.unsafe_get().closure[tidx]) {
+                    continue;  // host-side closure
+                }
+                comp_model_desc.closure.unsafe_get().closure_uid[tidx] =
+                    m_weights_bank->registerLT(comp_model_desc.lazy_closure[tidx], *func_desc.device_it);
             }
-            comp_model_desc.closure_uid[tidx] =
-                m_weights_bank->registerLT(comp_model_desc.lazy_closure[tidx], *func_desc.device_it);
         }
-    }
 
-    // Evaluate and allocate all LazyTensors inside the bank
-    m_profile["weights bank"].record([&]() {
+        // Evaluate and allocate all LazyTensors inside the bank
         m_weights_bank->evaluate_and_allocate();
+
+        // Set evaluated and allocated ov::Tensors to closures
+        for (size_t idx = 0; idx < m_compiled_submodels.size(); ++idx) {
+            auto& comp_model_desc = m_compiled_submodels[idx];
+
+            // Skip optimized out and non-functions
+            if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) {
+                continue;
+            }
+
+            const auto real_idx = comp_model_desc.replaced_by.value_or(idx);
+            auto& func_desc = m_compiled_submodels[real_idx];
+            auto& desc_closure = comp_model_desc.closure.unsafe_get();
+
+            for (std::size_t tidx = 0; tidx < desc_closure.closure.size(); ++tidx) {
+                if (desc_closure.closure[tidx]) {
+                    // host-side closure - already set, do nothing
+                    desc_closure.is_remote[tidx] = false;
+                    continue;
+                }
+                const auto& uid = desc_closure.closure_uid[tidx];
+                NPUW_ASSERT(uid != -1);  // All tensors should be registered at this point
+                desc_closure.closure[tidx] = m_weights_bank->get(uid, *func_desc.device_it);
+                // FIXME: find a more reliable way to do so
+                desc_closure.is_remote[tidx] = m_weights_bank->is_remote(uid);
+            }
+        }
+
+        m_import_weights_ctx.reset();
     });
 
-    // Set evaluated and allocated ov::Tensors to closures
+    m_eval_future = weights_bank_evaluation;
+
     for (size_t idx = 0; idx < m_compiled_submodels.size(); ++idx) {
         auto& comp_model_desc = m_compiled_submodels[idx];
 
@@ -1275,21 +1321,7 @@ void ov::npuw::CompiledModel::finalize_weights_bank() {
             continue;
         }
 
-        const auto real_idx = comp_model_desc.replaced_by.value_or(idx);
-        auto& func_desc = m_compiled_submodels[real_idx];
-
-        for (std::size_t tidx = 0; tidx < comp_model_desc.closure.size(); ++tidx) {
-            if (comp_model_desc.closure[tidx]) {
-                // host-side closure - already set, do nothing
-                comp_model_desc.is_remote[tidx] = false;
-                continue;
-            }
-            const auto& uid = comp_model_desc.closure_uid[tidx];
-            NPUW_ASSERT(uid != -1);  // All tensors should be registered at this point
-            comp_model_desc.closure[tidx] = m_weights_bank->get(uid, *func_desc.device_it);
-            // FIXME: find a more reliable way to do so
-            comp_model_desc.is_remote[tidx] = m_weights_bank->is_remote(uid);
-        }
+        comp_model_desc.closure.set_future(weights_bank_evaluation);
     }
 
     LOG_INFO("Done.");
@@ -1654,7 +1686,7 @@ std::string ov::npuw::CompiledModel::submodel_device(const std::size_t idx) cons
 bool ov::npuw::CompiledModel::unpack_required(const std::size_t idx) const {
     auto& comp_model_desc = m_compiled_submodels.at(idx);
-    for (std::size_t cidx = 0u; cidx < comp_model_desc.closure.size(); cidx++) {
+    for (std::size_t cidx = 0u; cidx < comp_model_desc.closure.get().closure.size(); cidx++) {
         if (unpack_required(idx, cidx)) {
             return true;
         }
@@ -1671,7 +1703,7 @@ bool ov::npuw::CompiledModel::unpack_required(const std::size_t idx, const std::
     const auto real_idx = comp_model_desc.replaced_by.value();
     auto& func_desc = m_compiled_submodels.at(real_idx);
 
-    auto& closure = comp_model_desc.closure.at(cidx);
+    auto& closure = comp_model_desc.closure.get().closure.at(cidx);
     const auto closure_param_id = comp_model_desc.param_base + cidx;
     auto& iport = func_desc.compiled_model->inputs()[closure_param_id];
diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
index 81af4c7b276a1a..de81713b2f05c0 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
@@ -61,6 +61,9 @@ class CompiledModel : public ov::npuw::ICompiledModel {
 
     std::shared_ptr<ov::IAsyncInferRequest> create_infer_request() const override;
 
+    // Custom destructor to wait for the Delayed closure evaluation to finish
+    ~CompiledModel();
+
 private:
     // FIXME: This class has many friends..
     friend class IBaseInferRequest;
@@ -178,15 +181,26 @@ class CompiledModel : public ov::npuw::ICompiledModel {
         // FIXME: This is a 1:1 copy of the ov::npuw::Subgraph structure
         // w.r.t. function calls
         std::size_t param_base = 0;
+
+        struct Closure {
+            std::vector<ov::Tensor> closure;
+            std::vector<int64_t> closure_uid;  // Note: value -1 is considered uninitialized
+            std::vector<bool> is_remote;
+        };
+
+        // Need to wrap closure, since finalize_weights_bank() will
+        // asynchronously evaluate weights and put them in closure.
+        // Other functions of CompiledModel as well as InferRequest and
+        // other entities need to wait for the closure to be populated first
+        // (meaning to wait for the async weights processing to end).
+        ov::npuw::util::Delayed<Closure> closure;
+
         // NB: closure and lazy_closure are of the same size - to preserve proper indexing.
         // closure is responsible for host-side tensors (DCOFF, Gather, etc) while
         // lazy_closure is used for weights sharing and allocating device memory.
-        std::vector<ov::Tensor> closure;
         std::vector<weights::LazyTensor> lazy_closure;
-        std::vector<int64_t> closure_uid;  // Note: value -1 is considered uninitialized
         std::vector<ov::Tensor> scales;
         std::vector<ov::Tensor> zerops;
-        std::vector<bool> is_remote;
 
         bool forced_to_fcall = false;
 
@@ -212,6 +226,8 @@ class CompiledModel : public ov::npuw::ICompiledModel {
     std::unordered_map m_const_to_offset;
     ov::npuw::s11n::BF16Cache m_bf16_consts;
     ov::npuw::s11n::WeightsContext m_import_weights_ctx;
+
+    std::shared_future<void> m_eval_future;
 };
 
 }  // namespace npuw
 }  // namespace ov
diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
index 7f62c3ef0abd52..13de38da1ff8dc 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
@@ -1702,15 +1702,12 @@ std::shared_ptr ov::npuw::LLMCompiledModel::import_m
         compiled->m_prefill_compiled->m_weights_bank = bank;
 
         compiled->m_kvcache_compiled->finalize_weights_bank();
-        compiled->m_kvcache_compiled->m_import_weights_ctx.reset();
         compiled->m_prefill_compiled->finalize_weights_bank();
-        compiled->m_prefill_compiled->m_import_weights_ctx.reset();
 
         if (compiled->m_lm_head_compiled) {
             compiled->m_lm_head_compiled->m_weights_bank = bank;
             compiled->m_lm_head_compiled->finalize_weights_bank();
-            compiled->m_lm_head_compiled->m_import_weights_ctx.reset();
         }
     } else {
         auto bank =
diff --git a/src/plugins/intel_npu/src/plugin/npuw/serialization.cpp b/src/plugins/intel_npu/src/plugin/npuw/serialization.cpp
index 228abbfcb86988..7322ed67b3c526 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/serialization.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/serialization.cpp
@@ -265,7 +265,9 @@ void ov::npuw::s11n::read(std::istream& stream, std::shared_ptr
     var = std::make_shared<ov::op::v0::Parameter>(ov::element::Type(elem_type_str), ov::PartialShape(part_shape_str));
-    var->set_friendly_name(*names.begin());  // FIXME: any_name ?
+    if (!names.empty()) {
+        var->set_friendly_name(*names.begin());  // FIXME: any_name ?
+    }
     var->output(0).get_tensor().set_names(names);
 }
 
@@ -286,7 +288,9 @@ void ov::npuw::s11n::read(std::istream& stream, std::shared_ptr& var)
                       names);
     var = std::make_shared<ov::op::v0::Result>(res);
     var->output(0).set_tensor_ptr(tensor_dummy);
-    var->set_friendly_name(*names.begin());  // any_name ?
+    if (!names.empty()) {
+        var->set_friendly_name(*names.begin());  // any_name ?
+    }
 }
 
 void ov::npuw::s11n::read_any(std::istream& stream, ov::Any& var) {
diff --git a/src/plugins/intel_npu/src/plugin/npuw/util.hpp b/src/plugins/intel_npu/src/plugin/npuw/util.hpp
index edbacfbdbc8ba1..dac90f0aa20567 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/util.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/util.hpp
@@ -4,6 +4,7 @@
 #pragma once
 
+#include <future>
 #include
 #include
 
@@ -201,6 +202,47 @@ typename std::underlying_type::type _v(T&& t) {
     return static_cast<typename std::underlying_type<T>::type>(t);
 }
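+
+// Delayed<T> defers access to a value that is produced asynchronously on another
+// thread. A rough usage sketch (the names below are illustrative only):
+//
+//     Delayed<Closure> closure;
+//     std::shared_future<void> done = std::async(std::launch::async, [&] {
+//         populate(closure.unsafe_get());  // producer side, future not published yet
+//     });
+//     closure.set_future(done);            // publish the future to the readers
+//     closure.get();                       // blocks until the producer has finished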
+ {"CACHE_MODE", "OPTIMIZE_SPEED"} + }; + + // Run stress test to check for data race + for (size_t i = 0; i < 10; ++i) { + // Compile NPUW + auto compiled1 = ov_core.compile_model(model1, device, config); + auto compiled2 = ov_core.compile_model(model2, device, config); + auto compiled3 = ov_core.compile_model(model3, device, config); + auto compiled4 = ov_core.compile_model(model4, device, config); + + // Create infer request and infer + auto request1 = compiled1.create_infer_request(); + request1.infer(); + auto request2 = compiled2.create_infer_request(); + request2.infer(); + auto request3 = compiled3.create_infer_request(); + request3.infer(); + auto request4 = compiled4.create_infer_request(); + request4.infer(); + + std::vector ss(4); + compiled1.export_model(ss[0]); + compiled2.export_model(ss[1]); + compiled3.export_model(ss[2]); + compiled4.export_model(ss[3]); + + std::vector imported(4); + ov::parallel_for(4, [&](size_t idx) { + imported[idx] = ov_core.import_model(ss[idx], "NPU"); + }); + + for (auto& m : imported) { + auto r = m.create_infer_request(); + r.infer(); + } + } +} + // TODO: add tests on CompiledModel and LLMCompiledModel once tests have access to any model to test on