diff --git a/cloud/blockstore/libs/storage/disk_registry/model/device_list.cpp b/cloud/blockstore/libs/storage/disk_registry/model/device_list.cpp
index 6e7bbfe1757..96eded59283 100644
--- a/cloud/blockstore/libs/storage/disk_registry/model/device_list.cpp
+++ b/cloud/blockstore/libs/storage/disk_registry/model/device_list.cpp
@@ -275,14 +275,11 @@ NProto::TDeviceConfig TDeviceList::AllocateDevice(
     const TDiskId& diskId,
     const TAllocationQuery& query)
 {
-    for (auto& kv: NodeDevices) {
-        if (!query.NodeIds.empty() && !query.NodeIds.contains(kv.first)) {
+    for (auto& [nodeId, nodeDevices]: NodeDevices) {
+        if (!query.NodeIds.empty() && !query.NodeIds.contains(nodeId)) {
             continue;
         }
 
-        const ui32 nodeId = kv.first;
-        auto& nodeDevices = kv.second;
-
         const auto& currentRack = nodeDevices.Rack;
         auto& devices = nodeDevices.FreeDevices;
 
@@ -428,10 +425,6 @@ void TDeviceList::MarkDeviceAllocated(const TDiskId& diskId, const TDeviceId& id
     AllocatedDevices.emplace(id, diskId);
 }
 
-// returns a list of racks sorted by preference and then by occupied space ASC
-// then by free space DESC
-// the nodes in each rack are sorted by occupied space ASC then by free space
-// DESC
 auto TDeviceList::SelectRacks(
     const TAllocationQuery& query,
     const TString& poolName) const -> TVector<TRack>
@@ -446,17 +439,16 @@ auto TDeviceList::SelectRacks(
         auto& rack = racks[currentRack];
         rack.Id = currentRack;
         rack.Nodes.push_back({nodeId, 0, 0});
-        rack.Preferred = query.PreferredRacks.contains(currentRack);
     };
 
     if (!query.NodeIds.empty()) {
         for (ui32 id: query.NodeIds) {
-            if (auto* nodeDevices = NodeDevices.FindPtr(id)) {
+            if (const auto* nodeDevices = NodeDevices.FindPtr(id)) {
                 appendNode(nodeDevices->Rack, id);
             }
         }
     } else {
-        for (auto& [nodeId, nodeDevices]: NodeDevices) {
+        for (const auto& [nodeId, nodeDevices]: NodeDevices) {
             appendNode(nodeDevices.Rack, nodeId);
         }
     }
@@ -481,49 +473,69 @@ auto TDeviceList::SelectRacks(
             }
         }
 
-        Sort(
-            rack.Nodes,
-            [] (const TNodeInfo& lhs, const TNodeInfo& rhs) {
-                if (lhs.OccupiedSpace != rhs.OccupiedSpace) {
-                    return lhs.OccupiedSpace < rhs.OccupiedSpace;
-                }
-
-                return lhs.FreeSpace > rhs.FreeSpace;
-            });
-
         rack.OccupiedSpace = rackTotalSpace - rack.FreeSpace;
     }
 
-    TVector<TRack*> bySpace;
-    for (auto& x: racks) {
-        if (x.second.FreeSpace) {
-            bySpace.push_back(&x.second);
+    TVector<TRack> result;
+    result.reserve(racks.size());
+    for (auto& [_, r]: racks) {
+        if (r.FreeSpace) {
+            r.Preferred = query.PreferredRacks.contains(r.Id);
+            result.push_back(std::move(r));
         }
     }
 
+    return result;
+}
+
+// Returns a ranked list of nodes.
+// Sorting is performed in stages:
+// - Sort racks by preference, occupied space, free space
+// - Sort nodes within each rack by occupied space, free space
+// - Optionally apply custom NodeRankingFunc to all nodes (if set in query)
+auto TDeviceList::RankNodes(
+    const TAllocationQuery& query,
+    TVector<TRack> racks) const -> TVector<ui32>
+{
+    // Sort racks by preference then by occupied space ASC then by free space
+    // DESC
     Sort(
-        bySpace,
-        [] (const TRack* lhs, const TRack* rhs) {
-            if (lhs->Preferred != rhs->Preferred) {
-                return lhs->Preferred > rhs->Preferred;
-            }
-            if (lhs->OccupiedSpace != rhs->OccupiedSpace) {
-                return lhs->OccupiedSpace < rhs->OccupiedSpace;
-            }
-            if (lhs->FreeSpace != rhs->FreeSpace) {
-                return lhs->FreeSpace > rhs->FreeSpace;
-            }
-            return lhs->Id < rhs->Id;
+        racks,
+        [](const TRack& l, const TRack& r)
+        {
+            return std::tie(r.Preferred, l.OccupiedSpace, r.FreeSpace, l.Id) <
+                   std::tie(l.Preferred, r.OccupiedSpace, l.FreeSpace, r.Id);
         });
 
-    TVector<TRack> result;
-    result.reserve(bySpace.size());
+    TVector<ui32> nodeIds;
+    {
+        size_t size = 0;
+        for (const TRack& rack: racks) {
+            size += rack.Nodes.size();
+        }
+        nodeIds.reserve(size);
+    }
 
-    for (auto* x: bySpace) {
-        result.push_back(*x);
+    for (TRack& rack: racks) {
+        // Sort rack's nodes by occupied space ASC then by free space DESC
+        Sort(
+            rack.Nodes,
+            [](const TNodeInfo& lhs, const TNodeInfo& rhs)
+            {
+                return std::tie(lhs.OccupiedSpace, rhs.FreeSpace) <
+                       std::tie(rhs.OccupiedSpace, lhs.FreeSpace);
+            });
+
+        for (const TNodeInfo& node: rack.Nodes) {
+            nodeIds.push_back(node.NodeId);
+        }
     }
 
-    return result;
+    // Get the final order of nodes
+    if (query.NodeRankingFunc) {
+        query.NodeRankingFunc(nodeIds);
+    }
+
+    return nodeIds;
 }
 
 TVector TDeviceList::CollectDevices(
@@ -537,85 +549,83 @@ TVector TDeviceList::CollectDevices(
     TVector ranges;
     ui64 totalSize = query.GetTotalByteCount();
 
-    for (const auto& rack: SelectRacks(query, poolName)) {
-        for (const auto& node: rack.Nodes) {
-            const auto* nodeDevices = NodeDevices.FindPtr(node.NodeId);
-            Y_ABORT_UNLESS(nodeDevices);
+    for (ui32 nodeId: RankNodes(query, SelectRacks(query, poolName))) {
+        const auto* nodeDevices = NodeDevices.FindPtr(nodeId);
+        Y_ABORT_UNLESS(nodeDevices);
 
-            // finding free devices belonging to this node that match our
-            // query
-            auto [begin, end] =
-                FindDeviceRange(query, poolName, nodeDevices->FreeDevices);
+        // finding free devices belonging to this node that match our
+        // query
+        auto [begin, end] =
+            FindDeviceRange(query, poolName, nodeDevices->FreeDevices);
 
-            using TDeviceIter = decltype(begin);
-            struct TDeviceInfo
-            {
-                TString DeviceName;
-                ui64 Size = 0;
-                std::pair<TDeviceIter, TDeviceIter> Range;
-            };
-
-            // grouping these matching devices by DeviceName and sorting
-            // these groups by size in descending order
-            TVector<TDeviceInfo> bySize;
-            for (auto it = begin; it != end; ++it) {
-                if (bySize.empty()
-                    || bySize.back().DeviceName != it->GetDeviceName())
-                {
-                    bySize.emplace_back(TDeviceInfo{
-                        it->GetDeviceName(),
-                        0,
-                        std::make_pair(it, it),
-                    });
-                }
+        using TDeviceIter = decltype(begin);
+        struct TDeviceInfo
+        {
+            TString DeviceName;
+            ui64 Size = 0;
+            std::pair<TDeviceIter, TDeviceIter> Range;
+        };
 
-                auto& current = bySize.back();
-                current.Size += it->GetBlockSize() * it->GetBlocksCount();
-                ++current.Range.second;
+        // grouping these matching devices by DeviceName and sorting
+        // these groups by size in descending order
+        TVector<TDeviceInfo> bySize;
+        for (auto it = begin; it != end; ++it) {
+            if (bySize.empty()
+                || bySize.back().DeviceName != it->GetDeviceName())
+            {
+                bySize.emplace_back(TDeviceInfo{
+                    .DeviceName = it->GetDeviceName(),
+                    .Size = 0,
+                    .Range = std::make_pair(it, it),
+                });
             }
 
-            SortBy(bySize, [] (const TDeviceInfo& d) {
-                return Max<ui64>() - d.Size;
-            });
-
-            // traversing device groups from the biggest to the smallest
-            // the goal is to greedily select as few groups as possible
-            // in most of the cases it will lead to an allocation which is placed
-            // on a single physical device, which is what we want
-            for (const auto& deviceInfo: bySize) {
-                auto it = deviceInfo.Range.first;
-                for (; it != deviceInfo.Range.second; ++it) {
-                    const auto& device = *it;
+            auto& current = bySize.back();
+            current.Size += it->GetBlockSize() * it->GetBlocksCount();
+            ++current.Range.second;
+        }
 
-                    Y_DEBUG_ABORT_UNLESS(device.GetRack() == nodeDevices->Rack);
+        SortBy(bySize, [] (const TDeviceInfo& d) {
+            return Max<ui64>() - d.Size;
+        });
 
-                    const ui64 size = device.GetBlockSize() * device.GetBlocksCount();
+        // traversing device groups from the biggest to the smallest
+        // the goal is to greedily select as few groups as possible
+        // in most of the cases it will lead to an allocation which is placed
+        // on a single physical device, which is what we want
+        for (const auto& deviceInfo: bySize) {
+            auto it = deviceInfo.Range.first;
+            for (; it != deviceInfo.Range.second; ++it) {
+                const auto& device = *it;
 
-                    if (totalSize <= size) {
-                        totalSize = 0;
-                        ++it;
-                        break;
-                    }
+                Y_DEBUG_ABORT_UNLESS(device.GetRack() == nodeDevices->Rack);
 
-                    totalSize -= size;
-                }
+                const ui64 size = device.GetBlockSize() * device.GetBlocksCount();
 
-                if (deviceInfo.Range.first != it) {
-                    ranges.emplace_back(node.NodeId, deviceInfo.Range.first, it);
+                if (totalSize <= size) {
+                    totalSize = 0;
+                    ++it;
+                    break;
                 }
 
-                if (totalSize == 0) {
-                    return ranges;
-                }
+                totalSize -= size;
             }
 
-            if (query.PoolKind == NProto::DEVICE_POOL_KIND_LOCAL) {
-                // here we go again
+            if (deviceInfo.Range.first != it) {
+                ranges.emplace_back(nodeId, deviceInfo.Range.first, it);
+            }
 
-                ranges.clear();
-                totalSize = query.GetTotalByteCount();
+            if (totalSize == 0) {
+                return ranges;
             }
         }
+
+        if (query.PoolKind == NProto::DEVICE_POOL_KIND_LOCAL) {
+            // a local disk cannot be shared between multiple nodes
+
+            ranges.clear();
+            totalSize = query.GetTotalByteCount();
+        }
     }
 
     return {};
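Editorial note, not part of the patch: both comparators in RankNodes get a mix of ascending and descending keys out of a single lexicographic comparison by swapping which operand supplies each field of the std::tie tuples; the same trick drives the per-rack node ordering (OccupiedSpace ASC, FreeSpace DESC). A self-contained illustration of the idiom with a hypothetical TRackLike struct (the TRack used by the patch is a nested type with more fields):

    #include <cstdint>
    #include <string>
    #include <tuple>

    struct TRackLike
    {
        bool Preferred = false;
        uint64_t OccupiedSpace = 0;
        uint64_t FreeSpace = 0;
        std::string Id;
    };

    // Orders racks by Preferred DESC (preferred racks first), OccupiedSpace
    // ASC, FreeSpace DESC, Id ASC. A field taken from `r` on the left-hand
    // tuple sorts in descending order, a field taken from `l` sorts in
    // ascending order.
    bool RackLess(const TRackLike& l, const TRackLike& r)
    {
        return std::tie(r.Preferred, l.OccupiedSpace, r.FreeSpace, l.Id) <
               std::tie(l.Preferred, r.OccupiedSpace, l.FreeSpace, r.Id);
    }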
diff --git a/cloud/blockstore/libs/storage/disk_registry/model/device_list.h b/cloud/blockstore/libs/storage/disk_registry/model/device_list.h
index b345f04263b..52af5824519 100644
--- a/cloud/blockstore/libs/storage/disk_registry/model/device_list.h
+++ b/cloud/blockstore/libs/storage/disk_registry/model/device_list.h
@@ -3,14 +3,18 @@
 #include "public.h"
 
 #include
+
 #include
 #include
-#include
 #include
+#include
 #include
 #include
+#include
+#include
+
 
 namespace NCloud::NBlockStore::NStorage {
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -67,10 +71,13 @@ class TDeviceList
     const bool AlwaysAllocateLocalDisks;
 
 public:
+    using TNodeRankingFunc = std::function<void(std::span<ui32> nodeIds)>;
+
     struct TAllocationQuery
     {
         THashSet<TString> ForbiddenRacks;
         THashSet<TString> PreferredRacks;
+        TNodeRankingFunc NodeRankingFunc;
 
         ui32 LogicalBlockSize = 0;
         ui64 BlockCount = 0;
@@ -175,9 +182,15 @@ class TDeviceList
         const TAllocationQuery& query,
         const TString& poolName);
 
     TVector CollectDevices(const TAllocationQuery& query);
+
     [[nodiscard]] TVector<TRack> SelectRacks(
         const TAllocationQuery& query,
         const TString& poolName) const;
+
+    [[nodiscard]] TVector<ui32> RankNodes(
+        const TAllocationQuery& query,
+        TVector<TRack> racks) const;
+
     void RemoveDeviceFromFreeList(const TDeviceId& id);
 
     void UpdateInAllDevices(
         const TDeviceId& id,
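Editorial note, not part of the patch: the new TNodeRankingFunc hook receives the node ids already ranked by RankNodes (preferred racks first, then by occupied/free space) and may reorder them in place; when the callback is empty the default order is used as is. A minimal caller-side sketch, modeled on the ShouldDownrankNodes test below — the deviceList instance, the disk id and the size constants are illustrative assumptions:

    // Assumes <algorithm> and <span> are available, as in the test file.
    THashSet<ui32> downrankedNodes{5, 6};   // hypothetical "unhealthy" nodes

    auto devices = deviceList.AllocateDevices(
        "disk-1",
        {
            .NodeRankingFunc =
                [downrankedNodes](std::span<ui32> nodeIds)
                {
                    // Keep the default order for the remaining nodes and move
                    // the downranked ones to the back, preserving their
                    // relative order as well.
                    std::stable_partition(
                        nodeIds.begin(),
                        nodeIds.end(),
                        [&](ui32 nodeId)
                        { return !downrankedNodes.contains(nodeId); });
                },
            .LogicalBlockSize = 4096,
            .BlockCount = 1024,
        });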
diff --git a/cloud/blockstore/libs/storage/disk_registry/model/device_list_ut.cpp b/cloud/blockstore/libs/storage/disk_registry/model/device_list_ut.cpp
index 23a348a3f8d..77aa306e8f4 100644
--- a/cloud/blockstore/libs/storage/disk_registry/model/device_list_ut.cpp
+++ b/cloud/blockstore/libs/storage/disk_registry/model/device_list_ut.cpp
@@ -428,13 +428,14 @@ Y_UNIT_TEST_SUITE(TDeviceListTest)
 
         TDeviceList deviceList;
 
-        auto allocate = [&] (ui32 n, THashSet<TString> preferredRacks = {}) {
+        auto allocate = [&](ui32 n, THashSet<ui32> nodeIds = {})
+        {
             return deviceList.AllocateDevices(
                 "disk",
                 {
-                    .PreferredRacks = std::move(preferredRacks),
                     .LogicalBlockSize = DefaultBlockSize,
                     .BlockCount = n * DefaultBlockCount,
+                    .NodeIds = std::move(nodeIds),
                 }
             );
         };
@@ -450,25 +451,228 @@ Y_UNIT_TEST_SUITE(TDeviceListTest)
 
         UNIT_ASSERT_VALUES_EQUAL(60, deviceList.Size());
 
-        // using up some space in rack1
-        auto devices0 = allocate(15, {"rack1"});
+        // using up some space in rack1 (on agent1)
+        auto devices0 = allocate(15, {agent1.GetNodeId()});
         UNIT_ASSERT_VALUES_EQUAL(15, devices0.size());
-        UNIT_ASSERT_VALUES_EQUAL("rack1", devices0[0].GetRack());
+        for (const auto& d: devices0) {
+            UNIT_ASSERT_VALUES_EQUAL("rack1", d.GetRack());
+            UNIT_ASSERT_VALUES_EQUAL(agent1.GetAgentId(), d.GetAgentId());
+            UNIT_ASSERT_VALUES_EQUAL(agent1.GetNodeId(), d.GetNodeId());
+        }
 
         // now rack2 has less occupied space
         auto devices1 = allocate(10);
         UNIT_ASSERT_VALUES_EQUAL(10, devices1.size());
-        UNIT_ASSERT_VALUES_EQUAL("rack2", devices1[0].GetRack());
+        for (const auto& d: devices1) {
+            UNIT_ASSERT_VALUES_EQUAL("rack2", d.GetRack());
+            UNIT_ASSERT_VALUES_EQUAL(devices1[0].GetNodeId(), d.GetNodeId());
+            UNIT_ASSERT_VALUES_EQUAL(devices1[0].GetAgentId(), d.GetAgentId());
+        }
 
         // rack2 still has less occupied space
         auto devices2 = allocate(10);
         UNIT_ASSERT_VALUES_EQUAL(10, devices2.size());
-        UNIT_ASSERT_VALUES_EQUAL("rack2", devices2[0].GetRack());
+
+        // it must be another node, the least occupied one
+        UNIT_ASSERT_VALUES_UNEQUAL(
+            devices1[0].GetNodeId(),
+            devices2[0].GetNodeId());
+        UNIT_ASSERT_VALUES_UNEQUAL(
+            devices1[0].GetAgentId(),
+            devices2[0].GetAgentId());
+
+        for (const auto& d: devices2) {
+            UNIT_ASSERT_VALUES_EQUAL("rack2", d.GetRack());
+            UNIT_ASSERT_VALUES_EQUAL(devices2[0].GetNodeId(), d.GetNodeId());
+            UNIT_ASSERT_VALUES_EQUAL(devices2[0].GetAgentId(), d.GetAgentId());
+        }
 
         // now rack1 has less occupied space
         auto devices3 = allocate(1);
         UNIT_ASSERT_VALUES_EQUAL(1, devices3.size());
         UNIT_ASSERT_VALUES_EQUAL("rack1", devices3[0].GetRack());
+        UNIT_ASSERT_VALUES_EQUAL(agent2.GetAgentId(), devices3[0].GetAgentId());
+        UNIT_ASSERT_VALUES_EQUAL(agent2.GetNodeId(), devices3[0].GetNodeId());
+    }
+
+    Y_UNIT_TEST(ShouldDownrankNodes)
+    {
+        const auto poolConfigs = CreateDevicePoolConfigs({});
+
+        TDeviceList deviceList;
+
+        auto allocate = [&](ui32 n,
+                            THashSet<ui32> nodeIds = {},
+                            THashSet<ui32> downrankedNodeIds = {})
+        {
+            return deviceList.AllocateDevices(
+                "disk",
+                {
+                    .NodeRankingFunc =
+                        [downrankedNodeIds](std::span<ui32> nodeIds)
+                    {
+                        auto it = std::stable_partition(
+                            nodeIds.begin(),
+                            nodeIds.end(),
+                            [&](ui32 nodeId)
+                            { return !downrankedNodeIds.contains(nodeId); });
+
+                        std::sort(it, nodeIds.end());
+                    },
+                    .LogicalBlockSize = DefaultBlockSize,
+                    .BlockCount = n * DefaultBlockCount,
+                    .NodeIds = std::move(nodeIds),
+                });
+        };
+
+        NProto::TAgentConfig agent1 = CreateAgentConfig("agent1", 1, "rack1");
+        NProto::TAgentConfig agent2 = CreateAgentConfig("agent2", 2, "rack1");
CreateAgentConfig("agent3", 3, "rack2"); + NProto::TAgentConfig agent4 = CreateAgentConfig("agent4", 4, "rack2"); + NProto::TAgentConfig agent5 = CreateAgentConfig("agent5", 5, "rack3"); + NProto::TAgentConfig agent6 = CreateAgentConfig("agent6", 6, "rack3"); + + for (auto& agent: {agent1, agent2, agent3, agent4, agent5, agent6}) { + deviceList.UpdateDevices(agent, poolConfigs); + } + + allocate(12, {agent2.GetNodeId()}); + + allocate(6, {agent3.GetNodeId()}); + allocate(12, {agent4.GetNodeId()}); + + allocate(9, {agent5.GetNodeId()}); + allocate(12, {agent6.GetNodeId()}); + + { + // rack1 *agent1: 15, agent2: 3 18 + // rack2 agent3: 9, agent4: 3 12 + // rack3 agent5: 6, agent6: 3 9 + + auto devices = allocate(1); + UNIT_ASSERT_VALUES_EQUAL(1, devices.size()); + UNIT_ASSERT_VALUES_EQUAL("rack1", devices[0].GetRack()); + UNIT_ASSERT_VALUES_EQUAL(agent1.GetAgentId(), devices[0].GetAgentId()); + UNIT_ASSERT_VALUES_EQUAL(agent1.GetNodeId(), devices[0].GetNodeId()); + } + + const THashSet downrankedNodes = { + agent1.GetNodeId(), + agent5.GetNodeId() + }; + + { + // rack1 [agent1: 14], *agent2: 3 17 + // rack2 agent3: 9, agent4: 3 12 + // rack3 [agent5: 6], agent6: 3 9 + + auto devices = allocate(3, {}, downrankedNodes); + UNIT_ASSERT_VALUES_EQUAL(3, devices.size()); + for (const auto& d: devices) { + UNIT_ASSERT_VALUES_EQUAL("rack1", d.GetRack()); + UNIT_ASSERT_VALUES_EQUAL(agent2.GetAgentId(), d.GetAgentId()); + UNIT_ASSERT_VALUES_EQUAL(agent2.GetNodeId(), d.GetNodeId()); + } + } + + { + // rack1 [agent1: 14] 14 + // rack2 *agent3: 9, agent4: 3 12 + // rack3 [agent5: 6], agent6: 3 9 + + auto devices = allocate(9, {}, downrankedNodes); + UNIT_ASSERT_VALUES_EQUAL(9, devices.size()); + for (const auto& d: devices) { + UNIT_ASSERT_VALUES_EQUAL_C("rack2", d.GetRack(), d); + UNIT_ASSERT_VALUES_EQUAL(agent3.GetAgentId(), d.GetAgentId()); + UNIT_ASSERT_VALUES_EQUAL(agent3.GetNodeId(), d.GetNodeId()); + } + } + + { + // rack1 [agent1: 14] 14 + // rack3 [agent5: 6], *agent6: 3 9 + // rack2 agent4: 3 3 + + auto devices = allocate(3, {}, downrankedNodes); + UNIT_ASSERT_VALUES_EQUAL(3, devices.size()); + for (const auto& d: devices) { + UNIT_ASSERT_VALUES_EQUAL_C("rack3", d.GetRack(), d); + UNIT_ASSERT_VALUES_EQUAL(agent6.GetAgentId(), d.GetAgentId()); + UNIT_ASSERT_VALUES_EQUAL(agent6.GetNodeId(), d.GetNodeId()); + } + } + + { + // rack1 [agent1: 14] 14 + // rack3 [agent5: 6] 6 + // rack2 *agent4: 3 3 + + auto devices = allocate(3, {}, downrankedNodes); + UNIT_ASSERT_VALUES_EQUAL(3, devices.size()); + for (const auto& d: devices) { + UNIT_ASSERT_VALUES_EQUAL_C("rack2", d.GetRack(), d); + UNIT_ASSERT_VALUES_EQUAL(agent4.GetAgentId(), d.GetAgentId()); + UNIT_ASSERT_VALUES_EQUAL(agent4.GetNodeId(), d.GetNodeId()); + } + } + + // It's time for the downranked nodes. 
+
+        {
+            // rack1 *[agent1: 14]              14
+            // rack3  [agent5: 6]                6
+
+            auto devices = allocate(6, {}, downrankedNodes);
+            UNIT_ASSERT_VALUES_EQUAL(6, devices.size());
+            for (const auto& d: devices) {
+                UNIT_ASSERT_VALUES_EQUAL_C("rack1", d.GetRack(), d);
+                UNIT_ASSERT_VALUES_EQUAL(agent1.GetAgentId(), d.GetAgentId());
+                UNIT_ASSERT_VALUES_EQUAL(agent1.GetNodeId(), d.GetNodeId());
+            }
+        }
+
+        {
+            // rack1 *[agent1: 8]                8
+            // rack3  [agent5: 6]                6
+
+            auto devices = allocate(6, {}, downrankedNodes);
+            UNIT_ASSERT_VALUES_EQUAL(6, devices.size());
+            for (const auto& d: devices) {
+                UNIT_ASSERT_VALUES_EQUAL_C("rack1", d.GetRack(), d);
+                UNIT_ASSERT_VALUES_EQUAL(agent1.GetAgentId(), d.GetAgentId());
+                UNIT_ASSERT_VALUES_EQUAL(agent1.GetNodeId(), d.GetNodeId());
+            }
+        }
+
+        {
+            // downranked nodes are sorted by nodeId, so agent1 is selected
+            // first, even though rack1 has less space than rack3.
+
+            // rack1 *[agent1: 2]                2
+            // rack3  [agent5: 6]                6
+
+            auto devices = allocate(2, {}, downrankedNodes);
+            UNIT_ASSERT_VALUES_EQUAL(2, devices.size());
+            for (const auto& d: devices) {
+                UNIT_ASSERT_VALUES_EQUAL_C("rack1", d.GetRack(), d);
+                UNIT_ASSERT_VALUES_EQUAL(agent1.GetAgentId(), d.GetAgentId());
+                UNIT_ASSERT_VALUES_EQUAL(agent1.GetNodeId(), d.GetNodeId());
+            }
+        }
+
+        {
+            // rack3 [agent5: 6]                 6
+
+            auto devices = allocate(6, {}, downrankedNodes);
+            UNIT_ASSERT_VALUES_EQUAL(6, devices.size());
+        }
+
+        // No space left
+        {
+            auto devices = allocate(1, {}, downrankedNodes);
+            UNIT_ASSERT_VALUES_EQUAL(0, devices.size());
+        }
     }
 
     Y_UNIT_TEST(ShouldSelectPhysicalDeviceWithTheMostSpaceUponDeviceAllocation)