Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
220 changes: 115 additions & 105 deletions cloud/blockstore/libs/storage/disk_registry/model/device_list.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -275,14 +275,11 @@ NProto::TDeviceConfig TDeviceList::AllocateDevice(
const TDiskId& diskId,
const TAllocationQuery& query)
{
for (auto& kv: NodeDevices) {
if (!query.NodeIds.empty() && !query.NodeIds.contains(kv.first)) {
for (auto& [nodeId, nodeDevices]: NodeDevices) {
if (!query.NodeIds.empty() && !query.NodeIds.contains(nodeId)) {
continue;
}

const ui32 nodeId = kv.first;
auto& nodeDevices = kv.second;

const auto& currentRack = nodeDevices.Rack;
auto& devices = nodeDevices.FreeDevices;

Expand Down Expand Up @@ -428,10 +425,6 @@ void TDeviceList::MarkDeviceAllocated(const TDiskId& diskId, const TDeviceId& id
AllocatedDevices.emplace(id, diskId);
}

// returns a list of racks sorted by preference and then by occupied space ASC
// then by free space DESC
// the nodes in each rack are sorted by occupied space ASC then by free space
// DESC
auto TDeviceList::SelectRacks(
const TAllocationQuery& query,
const TString& poolName) const -> TVector<TRack>
Expand All @@ -446,17 +439,16 @@ auto TDeviceList::SelectRacks(
auto& rack = racks[currentRack];
rack.Id = currentRack;
rack.Nodes.push_back({nodeId, 0, 0});
rack.Preferred = query.PreferredRacks.contains(currentRack);
};

if (!query.NodeIds.empty()) {
for (ui32 id: query.NodeIds) {
if (auto* nodeDevices = NodeDevices.FindPtr(id)) {
if (const auto* nodeDevices = NodeDevices.FindPtr(id)) {
appendNode(nodeDevices->Rack, id);
}
}
} else {
for (auto& [nodeId, nodeDevices]: NodeDevices) {
for (const auto& [nodeId, nodeDevices]: NodeDevices) {
appendNode(nodeDevices.Rack, nodeId);
}
}
Expand All @@ -481,49 +473,69 @@ auto TDeviceList::SelectRacks(
}
}

Sort(
rack.Nodes,
[] (const TNodeInfo& lhs, const TNodeInfo& rhs) {
if (lhs.OccupiedSpace != rhs.OccupiedSpace) {
return lhs.OccupiedSpace < rhs.OccupiedSpace;
}

return lhs.FreeSpace > rhs.FreeSpace;
});

rack.OccupiedSpace = rackTotalSpace - rack.FreeSpace;
}

TVector<TRack*> bySpace;
for (auto& x: racks) {
if (x.second.FreeSpace) {
bySpace.push_back(&x.second);
TVector<TRack> result;
result.reserve(racks.size());
for (auto& [_, r]: racks) {
if (r.FreeSpace) {
r.Preferred = query.PreferredRacks.contains(r.Id);
result.push_back(std::move(r));
}
}

return result;
}

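// Returns node ids ordered from the most to the least preferable for
// allocation. Ranking is performed in stages:
// - Sort racks: preferred racks first, then by occupied space ASC, then by
//   free space DESC, then by rack id ASC
// - Sort nodes within each rack by occupied space ASC, then by free space DESC
// - Flatten the nodes rack by rack and, if query.NodeRankingFunc is set, pass
//   the whole list to it so that it can adjust the order in place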
auto TDeviceList::RankNodes(
const TAllocationQuery& query,
TVector<TRack> racks) const -> TVector<TNodeId>
{
// Sort racks: preferred first, then by occupied space ASC, then by free
// space DESC; remaining ties are broken by rack id ASC
Sort(
bySpace,
[] (const TRack* lhs, const TRack* rhs) {
if (lhs->Preferred != rhs->Preferred) {
return lhs->Preferred > rhs->Preferred;
}
if (lhs->OccupiedSpace != rhs->OccupiedSpace) {
return lhs->OccupiedSpace < rhs->OccupiedSpace;
}
if (lhs->FreeSpace != rhs->FreeSpace) {
return lhs->FreeSpace > rhs->FreeSpace;
}
return lhs->Id < rhs->Id;
racks,
[](const TRack& l, const TRack& r)
{
return std::tie(r.Preferred, l.OccupiedSpace, r.FreeSpace, l.Id) <
std::tie(l.Preferred, r.OccupiedSpace, l.FreeSpace, r.Id);
});

TVector<TRack> result;
result.reserve(bySpace.size());
TVector<TNodeId> nodeIds;
{
size_t size = 0;
for (const TRack& rack: racks) {
size += rack.Nodes.size();
}
nodeIds.reserve(size);
}

for (auto* x: bySpace) {
result.push_back(*x);
for (TRack& rack: racks) {
// Sort rack's nodes by occupied space ASC then by free space DESC
Sort(
rack.Nodes,
[](const TNodeInfo& lhs, const TNodeInfo& rhs)
{
return std::tie(lhs.OccupiedSpace, rhs.FreeSpace) <
std::tie(rhs.OccupiedSpace, lhs.FreeSpace);
});

for (const TNodeInfo& node: rack.Nodes) {
nodeIds.push_back(node.NodeId);
}
}

return result;
// Let the custom ranking function (if any) rearrange the node ids in place
if (query.NodeRankingFunc) {
query.NodeRankingFunc(nodeIds);
}

return nodeIds;
}

TVector<TDeviceList::TDeviceRange> TDeviceList::CollectDevices(
Expand All @@ -537,85 +549,83 @@ TVector<TDeviceList::TDeviceRange> TDeviceList::CollectDevices(
TVector<TDeviceRange> ranges;
ui64 totalSize = query.GetTotalByteCount();

for (const auto& rack: SelectRacks(query, poolName)) {
for (const auto& node: rack.Nodes) {
const auto* nodeDevices = NodeDevices.FindPtr(node.NodeId);
Y_ABORT_UNLESS(nodeDevices);
for (ui32 nodeId: RankNodes(query, SelectRacks(query, poolName))) {
const auto* nodeDevices = NodeDevices.FindPtr(nodeId);
Y_ABORT_UNLESS(nodeDevices);

// finding free devices belonging to this node that match our
// query
auto [begin, end] =
FindDeviceRange(query, poolName, nodeDevices->FreeDevices);
// finding free devices belonging to this node that match our
// query
auto [begin, end] =
FindDeviceRange(query, poolName, nodeDevices->FreeDevices);

using TDeviceIter = decltype(begin);
struct TDeviceInfo
{
TString DeviceName;
ui64 Size = 0;
std::pair<TDeviceIter, TDeviceIter> Range;
};

// grouping these matching devices by DeviceName and sorting
// these groups by size in descending order
TVector<TDeviceInfo> bySize;
for (auto it = begin; it != end; ++it) {
if (bySize.empty()
|| bySize.back().DeviceName != it->GetDeviceName())
{
bySize.emplace_back(TDeviceInfo{
it->GetDeviceName(),
0,
std::make_pair(it, it),
});
}
using TDeviceIter = decltype(begin);
struct TDeviceInfo
{
TString DeviceName;
ui64 Size = 0;
std::pair<TDeviceIter, TDeviceIter> Range;
};

auto& current = bySize.back();
current.Size += it->GetBlockSize() * it->GetBlocksCount();
++current.Range.second;
// grouping these matching devices by DeviceName and sorting
// these groups by size in descending order
TVector<TDeviceInfo> bySize;
for (auto it = begin; it != end; ++it) {
if (bySize.empty()
|| bySize.back().DeviceName != it->GetDeviceName())
{
bySize.emplace_back(TDeviceInfo{
.DeviceName = it->GetDeviceName(),
.Size = 0,
.Range = std::make_pair(it, it),
});
}

SortBy(bySize, [] (const TDeviceInfo& d) {
return Max<ui64>() - d.Size;
});

// traversing device groups from the biggest to the smallest
// the goal is to greedily select as few groups as possible
// in most of the cases it will lead to an allocation which is placed
// on a single physical device, which is what we want
for (const auto& deviceInfo: bySize) {
auto it = deviceInfo.Range.first;
for (; it != deviceInfo.Range.second; ++it) {
const auto& device = *it;
auto& current = bySize.back();
current.Size += it->GetBlockSize() * it->GetBlocksCount();
++current.Range.second;
}

Y_DEBUG_ABORT_UNLESS(device.GetRack() == nodeDevices->Rack);
SortBy(bySize, [] (const TDeviceInfo& d) {
return Max<ui64>() - d.Size;
});

const ui64 size = device.GetBlockSize() * device.GetBlocksCount();
// Traverse the device groups from the biggest to the smallest.
// The goal is to greedily select as few groups as possible: in most
// cases this leads to an allocation that is placed on a single physical
// device, which is what we want.
for (const auto& deviceInfo: bySize) {
auto it = deviceInfo.Range.first;
for (; it != deviceInfo.Range.second; ++it) {
const auto& device = *it;

if (totalSize <= size) {
totalSize = 0;
++it;
break;
}
Y_DEBUG_ABORT_UNLESS(device.GetRack() == nodeDevices->Rack);

totalSize -= size;
}
const ui64 size = device.GetBlockSize() * device.GetBlocksCount();

if (deviceInfo.Range.first != it) {
ranges.emplace_back(node.NodeId, deviceInfo.Range.first, it);
if (totalSize <= size) {
totalSize = 0;
++it;
break;
}

if (totalSize == 0) {
return ranges;
}
totalSize -= size;
}

if (query.PoolKind == NProto::DEVICE_POOL_KIND_LOCAL) {
// here we go again
if (deviceInfo.Range.first != it) {
ranges.emplace_back(nodeId, deviceInfo.Range.first, it);
}

ranges.clear();
totalSize = query.GetTotalByteCount();
if (totalSize == 0) {
return ranges;
}
}

if (query.PoolKind == NProto::DEVICE_POOL_KIND_LOCAL) {
// a local disk cannot be shared between multiple nodes, so discard the
// partial selection and start from scratch on the next node

ranges.clear();
totalSize = query.GetTotalByteCount();
}
}

return {};
Expand Down
15 changes: 14 additions & 1 deletion cloud/blockstore/libs/storage/disk_registry/model/device_list.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,18 @@
#include "public.h"

#include <cloud/blockstore/libs/storage/protos/disk.pb.h>

#include <cloud/storage/core/libs/common/error.h>

#include <util/datetime/base.h>
#include <util/generic/hash_set.h>
#include <util/generic/hash.h>
#include <util/generic/hash_set.h>
#include <util/generic/string.h>
#include <util/generic/vector.h>

#include <functional>
#include <span>

namespace NCloud::NBlockStore::NStorage {

////////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -67,10 +71,13 @@ class TDeviceList
const bool AlwaysAllocateLocalDisks;

public:
using TNodeRankingFunc = std::function<void (std::span<ui32> nodeIds)>;

struct TAllocationQuery
{
THashSet<TString> ForbiddenRacks;
THashSet<TString> PreferredRacks;
TNodeRankingFunc NodeRankingFunc;

ui32 LogicalBlockSize = 0;
ui64 BlockCount = 0;
Expand Down Expand Up @@ -175,9 +182,15 @@ class TDeviceList
const TAllocationQuery& query,
const TString& poolName);
TVector<TDeviceRange> CollectDevices(const TAllocationQuery& query);

[[nodiscard]] TVector<TRack> SelectRacks(
const TAllocationQuery& query,
const TString& poolName) const;

[[nodiscard]] TVector<TNodeId> RankNodes(
const TAllocationQuery& query,
TVector<TRack> racks) const;

void RemoveDeviceFromFreeList(const TDeviceId& id);
void UpdateInAllDevices(
const TDeviceId& id,
Expand Down
Loading
Loading