diff --git a/cloud/blockstore/config/storage.proto b/cloud/blockstore/config/storage.proto index 90a5325ed4f..7afd643eec0 100644 --- a/cloud/blockstore/config/storage.proto +++ b/cloud/blockstore/config/storage.proto @@ -1363,4 +1363,13 @@ message TStorageServiceConfig // N = track every Nth operation // Higher values reduce monitoring overhead but provide less granular statistics. optional uint32 DeviceOperationTrackingFrequency = 467; + + // Enable to attach or detach hardware devices in runtime in Disk Agent. + optional bool AttachDetachPathsEnabled = 468; + + // Max inflight attach/detach path requests in progress. + optional uint64 MaxInflightAttachDetachPathRequestsProcessing = 469; + + // Timeout for attach/detach path requests (in milliseconds). + optional uint32 AttachDetachPathRequestTimeout = 470; } diff --git a/cloud/blockstore/libs/diagnostics/critical_events.h b/cloud/blockstore/libs/diagnostics/critical_events.h index d784754b33c..d307da38c25 100644 --- a/cloud/blockstore/libs/diagnostics/critical_events.h +++ b/cloud/blockstore/libs/diagnostics/critical_events.h @@ -127,6 +127,7 @@ using TCritEventParams = TVector>; xxx(ChecksumCalculationError) \ xxx(LogicalDiskIdMismatch) \ xxx(DeviceReplacementContractBroken) \ + xxx(DiskRegistryDetachPathWithDependentDisk) \ // BLOCKSTORE_IMPOSSIBLE_EVENTS //////////////////////////////////////////////////////////////////////////////// diff --git a/cloud/blockstore/libs/storage/api/disk_agent.h b/cloud/blockstore/libs/storage/api/disk_agent.h index e45c40820e6..a3c16f71730 100644 --- a/cloud/blockstore/libs/storage/api/disk_agent.h +++ b/cloud/blockstore/libs/storage/api/disk_agent.h @@ -29,6 +29,8 @@ namespace NCloud::NBlockStore::NStorage { xxx(EnableAgentDevice, __VA_ARGS__) \ xxx(PartiallySuspendAgent, __VA_ARGS__) \ xxx(DirectCopyBlocks, __VA_ARGS__) \ + xxx(AttachPath, __VA_ARGS__) \ + xxx(DetachPath, __VA_ARGS__) \ // BLOCKSTORE_DISK_AGENT_REQUESTS_PROTO @@ -97,6 +99,12 @@ struct TEvDiskAgent 
EvDirectCopyBlocksRequest = EvBegin + 23, EvDirectCopyBlocksResponse = EvBegin + 24, + EvAttachPathRequest = EvBegin + 25, + EvAttachPathResponse = EvBegin + 26, + + EvDetachPathRequest = EvBegin + 27, + EvDetachPathResponse = EvBegin + 28, + EvEnd }; diff --git a/cloud/blockstore/libs/storage/core/config.cpp b/cloud/blockstore/libs/storage/core/config.cpp index 02b54cc9486..4c25bdb5fa1 100644 --- a/cloud/blockstore/libs/storage/core/config.cpp +++ b/cloud/blockstore/libs/storage/core/config.cpp @@ -642,6 +642,11 @@ NProto::TLinkedDiskFillBandwidth GetBandwidth( xxx(HiveLocalServiceMemoryResourceLimit, ui64, 0 )\ xxx(HiveLocalServiceNetworkResourceLimit, ui64, 0 )\ xxx(DynamicNodeRegistrationTimeout, TDuration, Seconds(5) )\ + xxx(AttachDetachPathsEnabled, bool, false )\ + \ + xxx(MaxInflightAttachDetachPathRequestsProcessing, ui64, 100 )\ + \ + xxx(AttachDetachPathRequestTimeout, TDuration, Seconds(5) )\ // BLOCKSTORE_STORAGE_CONFIG_RW diff --git a/cloud/blockstore/libs/storage/core/config.h b/cloud/blockstore/libs/storage/core/config.h index 15d63a1d9c2..ac93152d802 100644 --- a/cloud/blockstore/libs/storage/core/config.h +++ b/cloud/blockstore/libs/storage/core/config.h @@ -737,6 +737,11 @@ class TStorageConfig [[nodiscard]] TDuration GetDynamicNodeRegistrationTimeout() const; [[nodiscard]] bool GetComputeDigestForEveryBlockOnCompaction() const; + + [[nodiscard]] bool GetAttachDetachPathsEnabled() const; + [[nodiscard]] ui64 GetMaxInflightAttachDetachPathRequestsProcessing() const; + + [[nodiscard]] TDuration GetAttachDetachPathRequestTimeout() const; }; ui64 GetAllocationUnit( diff --git a/cloud/blockstore/libs/storage/disk_agent/disk_agent_actor.cpp b/cloud/blockstore/libs/storage/disk_agent/disk_agent_actor.cpp index 6feb3a60375..39f3adb0a13 100644 --- a/cloud/blockstore/libs/storage/disk_agent/disk_agent_actor.cpp +++ b/cloud/blockstore/libs/storage/disk_agent/disk_agent_actor.cpp @@ -187,6 +187,20 @@ TDuration TDiskAgentActor::GetMaxRequestTimeout() const 
Config->GetNonReplicatedMaxRequestTimeoutHDD()); } +void TDiskAgentActor::HandleAttachPath( + const TEvDiskAgent::TEvAttachPathRequest::TPtr& ev, + const TActorContext& ctx) +{ + RejectAttachPath(ev, ctx); +} + +void TDiskAgentActor::HandleDetachPath( + const TEvDiskAgent::TEvDetachPathRequest::TPtr& ev, + const TActorContext& ctx) +{ + RejectDetachPath(ev, ctx); +} + //////////////////////////////////////////////////////////////////////////////// void TDiskAgentActor::HandleReportDelayedDiskAgentConfigMismatch( diff --git a/cloud/blockstore/libs/storage/disk_common/monitoring_utils.cpp b/cloud/blockstore/libs/storage/disk_common/monitoring_utils.cpp index 7173f3ff8ce..f1844a25461 100644 --- a/cloud/blockstore/libs/storage/disk_common/monitoring_utils.cpp +++ b/cloud/blockstore/libs/storage/disk_common/monitoring_utils.cpp @@ -94,6 +94,9 @@ IOutputStream& DumpDeviceState( if (flags & EDeviceStateFlags::LAGGING) { out << " [lagging]"; } + if (flags & EDeviceStateFlags::DETACHED) { + out << " [detached]"; + } return out; } diff --git a/cloud/blockstore/libs/storage/disk_common/monitoring_utils.h b/cloud/blockstore/libs/storage/disk_common/monitoring_utils.h index 0fdce95d2eb..c0f9b3bcb0b 100644 --- a/cloud/blockstore/libs/storage/disk_common/monitoring_utils.h +++ b/cloud/blockstore/libs/storage/disk_common/monitoring_utils.h @@ -16,6 +16,7 @@ enum EDeviceStateFlags : uint16_t DIRTY = 1 << 2, SUSPENDED = 1 << 3, LAGGING = 1 << 4, + DETACHED = 1 << 5, }; inline EDeviceStateFlags operator|(EDeviceStateFlags a, EDeviceStateFlags b) diff --git a/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor.cpp b/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor.cpp index 8da702da62e..01ad275f19e 100644 --- a/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor.cpp +++ b/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor.cpp @@ -273,6 +273,10 @@ void TDiskRegistryActor::KillActors(const TActorContext& ctx) for (auto& actor: Actors) 
{ NCloud::Send(ctx, actor); } + + for (const auto& [agentId, actorId]: AgentsWithAttachDetachRequestsInProgress) { + NCloud::Send(ctx, actorId); + } } void TDiskRegistryActor::UnregisterCounters(const TActorContext& ctx) @@ -721,6 +725,10 @@ STFUNC(TDiskRegistryActor::StateWork) TEvDiskRegistryPrivate::TEvDiskRegistryAgentListExpiredParamsCleanup, TDiskRegistryActor::HandleDiskRegistryAgentListExpiredParamsCleanup); + HFunc( + TEvDiskRegistryPrivate::TEvAttachDetachPathOperationCompleted, + HandleAttachDetachPathOperationCompleted); + default: if (!HandleRequests(ev) && !HandleDefaultEvents(ev, SelfId())) { HandleUnexpectedEvent( diff --git a/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor.h b/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor.h index 36b92074660..e1294dece72 100644 --- a/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor.h +++ b/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor.h @@ -129,6 +129,9 @@ class TDiskRegistryActor final TTransactionTimeTracker TransactionTimeTracker; + THashMap + AgentsWithAttachDetachRequestsInProgress; + public: TDiskRegistryActor( const NActors::TActorId& owner, @@ -291,6 +294,9 @@ class TDiskRegistryActor final void RenderDeviceHtmlInfo(IOutputStream& out, const TString& id) const; void RenderAgentHtmlInfo(IOutputStream& out, const TString& id) const; void RenderDiskHtmlInfo(IOutputStream& out, const TString& id) const; + void RenderPathAttachStates( + IOutputStream& out, + const NProto::TAgentConfig& agent) const; void SendHttpResponse( const NActors::TActorContext& ctx, @@ -325,6 +331,20 @@ class TDiskRegistryActor final void ProcessInitialAgentRejectionPhase(const NActors::TActorContext& ctx); + void ProcessPathsToAttachOnAgent( + const NActors::TActorContext& ctx, + const NProto::TAgentConfig* agent, + const THashSet& paths); + + void ProcessPathsToAttach(const NActors::TActorContext& ctx); + + void TryToDetachPaths( + const NActors::TActorContext& ctx, + 
const TString& agentId, + TVector paths, + TRequestInfoPtr requestInfo, + NProto::TAction_EType actionType); + private: STFUNC(StateBoot); STFUNC(StateInit); @@ -369,6 +389,11 @@ class TDiskRegistryActor final const TCgiParameters& params, TRequestInfoPtr requestInfo); + void HandleHttpInfo_UpdatePathAttachState( + const NActors::TActorContext& ctx, + const TCgiParameters& params, + TRequestInfoPtr requestInfo); + void HandleHttpInfo_RenderDisks( const NActors::TActorContext& ctx, const TCgiParameters& params, @@ -517,6 +542,11 @@ class TDiskRegistryActor final const TEvDiskRegistryPrivate::TEvSwitchAgentDisksToReadOnlyRequest::TPtr& ev, const NActors::TActorContext& ctx); + void HandleAttachDetachPathOperationCompleted( + const TEvDiskRegistryPrivate::TEvAttachDetachPathOperationCompleted:: + TPtr& ev, + const NActors::TActorContext& ctx); + BLOCKSTORE_DISK_REGISTRY_REQUESTS(BLOCKSTORE_IMPLEMENT_REQUEST, TEvDiskRegistry) BLOCKSTORE_DISK_REGISTRY_REQUESTS_FWD_SERVICE(BLOCKSTORE_IMPLEMENT_REQUEST, TEvService) BLOCKSTORE_DISK_REGISTRY_REQUESTS_PRIVATE(BLOCKSTORE_IMPLEMENT_REQUEST, TEvDiskRegistryPrivate) diff --git a/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_attach_detach_path.cpp b/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_attach_detach_path.cpp new file mode 100644 index 00000000000..c1e7e87da70 --- /dev/null +++ b/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_attach_detach_path.cpp @@ -0,0 +1,706 @@ +#include "disk_registry_actor.h" +#include "util/string/join.h" + +namespace NCloud::NBlockStore::NStorage { + +using namespace NActors; +using namespace NKikimr; +using namespace NKikimr::NTabletFlatExecutor; + +namespace { + +//////////////////////////////////////////////////////////////////////////////// + +IEventBasePtr CreateResponseByActionType( + NProto::TAction_EType actionType, + NProto::TError error) +{ + switch (actionType) { + case NProto::TAction_EType_REMOVE_DEVICE: + return std::make_unique< 
+ TEvDiskRegistryPrivate::TEvUpdateCmsHostDeviceStateResponse>( + std::move(error)); + case NProto::TAction_EType_REMOVE_HOST: + return std::make_unique< + TEvDiskRegistryPrivate::TEvUpdateCmsHostStateResponse>( + std::move(error)); + default: + Y_ABORT("Unsupported action type"); + } +} + +//////////////////////////////////////////////////////////////////////////////// + +struct TRequestContext { + TRequestInfoPtr RequestInfo; + NProto::TAction_EType ActionType; +}; + +class TAttachDetachPathActor: public TActorBootstrapped +{ +private: + const TActorId Owner; + const TString AgentId; + const ui64 NodeId; + const bool IsAttach; + const ui64 DrGeneration; + const ui64 DiskAgentGeneration; + const TDuration DiskAgentRequestTimeout; + TVector Paths; + + ui64 PendingRequests = 0; + + std::optional RequestContext; + +public: + TAttachDetachPathActor( + TActorId owner, + TString agentId, + ui64 nodeId, + bool isAttach, + ui64 drGeneration, + ui64 diskAgentGeneration, + TVector paths, + TDuration diskAgentRequestTimeout, + std::optional requestContext); + + void Bootstrap(const TActorContext& ctx); + +public: + template + IEventBasePtr CreateAttachDetachRequest(); + + void ReplyAndDie(const TActorContext& ctx, NProto::TError error); + + void UpdatePathAttachStates(const TActorContext& ctx); + + void UpdateDeviceStatesIfNeeded( + const TActorContext& ctx, + const NProto::TAttachPathResponse& response); + +public: + STFUNC(StateWaitAttachDetach); + STFUNC(StateMarkErrorDevices); + STFUNC(StateUpdatePathAttachState); + + template + void HandleAttachDetachPathRequestUndelivered( + const TEvRequest& ev, + const TActorContext& ctx); + + void HandleAttachDetachPathTimeout( + const TEvents::TEvWakeup::TPtr& ev, + const TActorContext& ctx); + + template + void HandleAttachDetachPathResult( + const TEvent& ev, + const NActors::TActorContext& ctx); + + void HandleChangeDeviceStateResponse( + const TEvDiskRegistry::TEvChangeDeviceStateResponse::TPtr& ev, + const 
NActors::TActorContext& ctx); + + void HandleUpdatePathAttachStateResult( + const TEvDiskRegistryPrivate::TEvUpdatePathAttachStateResponse::TPtr& + ev, + const TActorContext& ctx); + + void HandlePoisonPill( + const TEvents::TEvPoisonPill::TPtr& ev, + const TActorContext& ctx); +}; + +//////////////////////////////////////////////////////////////////////////////// + +TAttachDetachPathActor::TAttachDetachPathActor( + TActorId owner, + TString agentId, + ui64 nodeId, + bool isAttach, + ui64 drGeneration, + ui64 diskAgentGeneration, + TVector paths, + TDuration diskAgentRequestTimeout, + std::optional requestContext) + : Owner(owner) + , AgentId(std::move(agentId)) + , NodeId(nodeId) + , IsAttach(isAttach) + , DrGeneration(drGeneration) + , DiskAgentGeneration(diskAgentGeneration) + , DiskAgentRequestTimeout(diskAgentRequestTimeout) + , Paths(std::move(paths)) + , RequestContext(std::move(requestContext)) +{} + +void TAttachDetachPathActor::Bootstrap(const TActorContext& ctx) +{ + IEventBasePtr request = + IsAttach + ? CreateAttachDetachRequest() + : CreateAttachDetachRequest(); + + auto event = std::make_unique( + MakeDiskAgentServiceId(NodeId), + ctx.SelfID, + request.release(), + IEventHandle::FlagForwardOnNondelivery, + NodeId, + &ctx.SelfID // forwardOnNondelivery + ); + + ctx.Send(std::move(event)); + ctx.Schedule(DiskAgentRequestTimeout, new TEvents::TEvWakeup()); + + LOG_INFO( + ctx, + TBlockStoreComponents::DISK_REGISTRY_WORKER, + "Sending %s paths request to disk agent[DiskAgentGeneration=%lu] with " + "paths[%s]", + IsAttach ? 
"attach" : "detach", + DiskAgentGeneration, + JoinSeq(",", Paths).c_str()); + + Become(&TThis::StateWaitAttachDetach); +} + +template +IEventBasePtr TAttachDetachPathActor::CreateAttachDetachRequest() +{ + auto request = std::make_unique(); + request->Record.SetDiskRegistryGeneration(DrGeneration); + request->Record.SetDiskAgentGeneration(DiskAgentGeneration); + + auto* mutablePaths = [&]() + { + if constexpr ( + std::is_same_v) + { + return request->Record.MutablePathsToAttach(); + } else { + return request->Record.MutablePathsToDetach(); + } + }(); + + mutablePaths->Add(Paths.begin(), Paths.end()); + + return request; +} + +void TAttachDetachPathActor::ReplyAndDie( + const TActorContext& ctx, + NProto::TError error) +{ + if (RequestContext) { + auto response = + CreateResponseByActionType(RequestContext->ActionType, error); + NCloud::Reply(ctx, *RequestContext->RequestInfo, std::move(response)); + } + + auto request = std::make_unique< + TEvDiskRegistryPrivate::TEvAttachDetachPathOperationCompleted>( + std::move(error)); + request->AgentId = AgentId; + request->IsAttach = IsAttach; + NCloud::Send(ctx, Owner, std::move(request)); + Die(ctx); +} + +void TAttachDetachPathActor::UpdatePathAttachStates(const TActorContext& ctx) +{ + Y_ABORT_UNLESS(PendingRequests == 0); + Y_ABORT_UNLESS(IsAttach); + for (auto& path : Paths) { + auto request = std::make_unique< + TEvDiskRegistryPrivate::TEvUpdatePathAttachStateRequest>(); + request->KnownGeneration = DiskAgentGeneration; + request->Path = path; + request->AgentId = AgentId; + request->NewState = NProto::PATH_ATTACH_STATE_ATTACHED; + + NCloud::Send(ctx, Owner, std::move(request)); + ++PendingRequests; + } + + Become(&TThis::StateUpdatePathAttachState); +} + +void TAttachDetachPathActor::UpdateDeviceStatesIfNeeded( + const TActorContext& ctx, + const NProto::TAttachPathResponse& response) +{ + Y_ABORT_UNLESS(PendingRequests == 0); + Y_ABORT_UNLESS(IsAttach); + bool hasErrorDevices = false; + for (const auto& device: 
response.GetAttachedDevices()) { + if (device.GetState() == NProto::DEVICE_STATE_ERROR) { + hasErrorDevices = true; + ++PendingRequests; + + auto request = std::make_unique< + TEvDiskRegistry::TEvChangeDeviceStateRequest>(); + + request->Record.SetDeviceUUID(device.GetDeviceUUID()); + request->Record.SetDeviceState(NProto::DEVICE_STATE_ERROR); + request->Record.SetReason(device.GetStateMessage()); + + NCloud::Send(ctx, Owner, std::move(request)); + } + } + + if (hasErrorDevices) { + Become(&TThis::StateMarkErrorDevices); + return; + } + UpdatePathAttachStates(ctx); +} + +template +void TAttachDetachPathActor::HandleAttachDetachPathRequestUndelivered( + const TEvRequest& ev, + const TActorContext& ctx) +{ + Y_UNUSED(ev); + ReplyAndDie(ctx, MakeError(E_REJECTED, "request undelivered")); +} + +void TAttachDetachPathActor::HandleAttachDetachPathTimeout( + const TEvents::TEvWakeup::TPtr& ev, + const TActorContext& ctx) +{ + Y_UNUSED(ev); + ReplyAndDie(ctx, MakeError(E_REJECTED, "request timed out")); +} + +template +void TAttachDetachPathActor::HandleAttachDetachPathResult( + const TEvResponse& ev, + const TActorContext& ctx) +{ + using TEvAttachResponse = TEvDiskAgent::TEvAttachPathResponse::TPtr; + + if constexpr (std::is_same_v) { + Y_ABORT_UNLESS(IsAttach); + } else { + Y_ABORT_UNLESS(!IsAttach); + } + + auto* msg = ev->Get(); + auto& record = msg->Record; + const auto& error = record.GetError(); + + if (HasError(error) && error.GetCode() != E_PRECONDITION_FAILED) { + ReplyAndDie(ctx, error); + return; + } + if (error.GetCode() == E_PRECONDITION_FAILED) { + LOG_WARN( + ctx, + TBlockStoreComponents::DISK_REGISTRY_WORKER, + "Attach detach path on agent %s disabled, ignoring error", + AgentId.Quote().c_str()); + } + + if constexpr (std::is_same_v) { + UpdateDeviceStatesIfNeeded(ctx, record); + } else { + ReplyAndDie(ctx, {}); + } +} + +void TAttachDetachPathActor::HandleChangeDeviceStateResponse( + const TEvDiskRegistry::TEvChangeDeviceStateResponse::TPtr& ev, + const 
NActors::TActorContext& ctx) +{ + auto* msg = ev->Get(); + auto& error = *msg->Record.MutableError(); + + if (HasError(error)) { + ReplyAndDie(ctx, std::move(error)); + return; + } + + --PendingRequests; + if (PendingRequests == 0) { + UpdatePathAttachStates(ctx); + } +} + +void TAttachDetachPathActor::HandleUpdatePathAttachStateResult( + const TEvDiskRegistryPrivate::TEvUpdatePathAttachStateResponse::TPtr& ev, + const TActorContext& ctx) +{ + auto* msg = ev->Get(); + const auto& error = msg->Error; + + if (HasError(error)) { + ReplyAndDie(ctx, error); + return; + } + + --PendingRequests; + if (PendingRequests == 0) { + ReplyAndDie(ctx, {}); + } +} + +void TAttachDetachPathActor::HandlePoisonPill( + const TEvents::TEvPoisonPill::TPtr& ev, + const TActorContext& ctx) +{ + Y_UNUSED(ev); + ReplyAndDie(ctx, MakeTabletIsDeadError(E_REJECTED, __LOCATION__)); +} + +STFUNC(TAttachDetachPathActor::StateWaitAttachDetach) +{ + switch (ev->GetTypeRewrite()) { + HFunc(TEvents::TEvPoisonPill, HandlePoisonPill); + + HFunc( + TEvDiskAgent::TEvAttachPathRequest, + HandleAttachDetachPathRequestUndelivered); + HFunc( + TEvDiskAgent::TEvDetachPathRequest, + HandleAttachDetachPathRequestUndelivered); + + HFunc(TEvents::TEvWakeup, HandleAttachDetachPathTimeout) + + HFunc(TEvDiskAgent::TEvAttachPathResponse, HandleAttachDetachPathResult); + HFunc(TEvDiskAgent::TEvDetachPathResponse, HandleAttachDetachPathResult); + + + default: + HandleUnexpectedEvent( + ev, + TBlockStoreComponents::DISK_REGISTRY_WORKER, + __PRETTY_FUNCTION__); + break; + } +} + +STFUNC(TAttachDetachPathActor::StateMarkErrorDevices) +{ + switch (ev->GetTypeRewrite()) { + HFunc(TEvents::TEvPoisonPill, HandlePoisonPill); + + HFunc( + TEvDiskRegistry::TEvChangeDeviceStateResponse, + HandleChangeDeviceStateResponse); + + IgnoreFunc(TEvents::TEvWakeup); + + default: + HandleUnexpectedEvent( + ev, + TBlockStoreComponents::DISK_REGISTRY_WORKER, + __PRETTY_FUNCTION__); + break; + } +} + 
+STFUNC(TAttachDetachPathActor::StateUpdatePathAttachState) +{ + switch (ev->GetTypeRewrite()) { + HFunc(TEvents::TEvPoisonPill, HandlePoisonPill); + + HFunc( + TEvDiskRegistryPrivate::TEvUpdatePathAttachStateResponse, + HandleUpdatePathAttachStateResult); + + IgnoreFunc(TEvents::TEvWakeup); + + default: + HandleUnexpectedEvent( + ev, + TBlockStoreComponents::DISK_REGISTRY_WORKER, + __PRETTY_FUNCTION__); + break; + } +} + +} // namespace + +//////////////////////////////////////////////////////////////////////////////// + +void TDiskRegistryActor::HandleUpdatePathAttachState( + const TEvDiskRegistryPrivate::TEvUpdatePathAttachStateRequest::TPtr& ev, + const TActorContext& ctx) +{ + BLOCKSTORE_DISK_REGISTRY_COUNTER(UpdatePathAttachState); + + if (!Config->GetAttachDetachPathsEnabled()) { + auto response = std::make_unique< + TEvDiskRegistryPrivate::TEvUpdatePathAttachStateResponse>(MakeError( + E_PRECONDITION_FAILED, + "path attach detach feature disabled")); + NCloud::Reply(ctx, *ev, std::move(response)); + return; + } + + auto* msg = ev->Get(); + + auto requestInfo = + CreateRequestInfo(ev->Sender, ev->Cookie, msg->CallContext); + + LOG_INFO( + ctx, + TBlockStoreComponents::DISK_REGISTRY, + "[%lu] Received UpdatePathAttachState request: Agent=%s, Path=%s, " + "State=%d", + TabletID(), + msg->AgentId.Quote().c_str(), + msg->Path.Quote().c_str(), + static_cast(msg->NewState)); + + ExecuteTx( + ctx, + std::move(requestInfo), + std::move(msg->AgentId), + std::move(msg->Path), + msg->NewState, + msg->KnownGeneration); +} + +//////////////////////////////////////////////////////////////////////////////// + +bool TDiskRegistryActor::PrepareUpdatePathAttachState( + const TActorContext& ctx, + TTransactionContext& tx, + TTxDiskRegistry::TUpdatePathAttachState& args) +{ + Y_UNUSED(ctx); + Y_UNUSED(tx); + Y_UNUSED(args); + + return true; +} + +void TDiskRegistryActor::ExecuteUpdatePathAttachState( + const TActorContext& ctx, + TTransactionContext& tx, + 
TTxDiskRegistry::TUpdatePathAttachState& args) +{ + Y_UNUSED(ctx); + TDiskRegistryDatabase db(tx.DB); + args.Error = State->UpdatePathAttachState( + db, + args.AgentId, + args.Path, + args.NewState, + args.KnownGeneration); +} + +void TDiskRegistryActor::CompleteUpdatePathAttachState( + const TActorContext& ctx, + TTxDiskRegistry::TUpdatePathAttachState& args) +{ + LOG_LOG( + ctx, + HasError(args.Error) ? NLog::PRI_ERROR : NLog::PRI_INFO, + TBlockStoreComponents::DISK_REGISTRY, + "UpdatePathAttachState result: Agent=%s, Path=%s, Error=%s", + args.AgentId.c_str(), + args.Path.c_str(), + FormatError(args.Error).c_str()); + + auto response = std::make_unique< + TEvDiskRegistryPrivate::TEvUpdatePathAttachStateResponse>( + std::move(args.Error)); + NCloud::Reply(ctx, *args.RequestInfo, std::move(response)); + + SecureErase(ctx); + ProcessPathsToAttach(ctx); +} + +//////////////////////////////////////////////////////////////////////////////// + +void TDiskRegistryActor::ProcessPathsToAttachOnAgent( + const NActors::TActorContext& ctx, + const NProto::TAgentConfig* agent, + const THashSet& paths) +{ + const TString& agentId = agent->GetAgentId(); + TVector pathsToAttach; + for (const auto& path: paths) { + auto it = agent->GetPathAttachStates().find(path); + Y_DEBUG_ABORT_UNLESS(it != agent->GetPathAttachStates().end()); + if (it == agent->GetPathAttachStates().end()) { + continue; + } + + if (it->second != NProto::PATH_ATTACH_STATE_ATTACHING) { + continue; + } + + pathsToAttach.emplace_back(path); + } + + if (!pathsToAttach) { + return; + } + + auto actorId = NCloud::Register( + ctx, + ctx.SelfID, + agentId, + agent->GetNodeId(), + true, // isAttach + Executor()->Generation(), + State->GetDiskAgentGeneration(agentId), + std::move(pathsToAttach), + Config->GetAttachDetachPathRequestTimeout(), + std::nullopt // requestContext + ); + AgentsWithAttachDetachRequestsInProgress[agentId] = actorId; +} + +void TDiskRegistryActor::ProcessPathsToAttach(const TActorContext& ctx) +{ 
+ if (!Config->GetAttachDetachPathsEnabled()) { + return; + } + + if (AgentsWithAttachDetachRequestsInProgress.size() == + Config->GetMaxInflightAttachDetachPathRequestsProcessing()) + { + LOG_WARN( + ctx, + TBlockStoreComponents::DISK_REGISTRY, + "[%lu] Max inflight of attach detach requests reached, will try " + "later", + TabletID()); + return; + } + + auto pathsNeedToProcess = State->GetPathsToAttachDetach(); + if (!pathsNeedToProcess) { + LOG_DEBUG( + ctx, + TBlockStoreComponents::DISK_REGISTRY, + "[%lu] No attach detach path requests to process, will try later", + TabletID()); + return; + } + + for (const auto& [agentId, paths]: pathsNeedToProcess) { + if (AgentsWithAttachDetachRequestsInProgress.contains(agentId)) { + continue; + } + + const auto* agent = State->FindAgent(agentId); + if (!agent || agent->GetState() == NProto::AGENT_STATE_UNAVAILABLE || + agent->GetTemporaryAgent()) + { + continue; + } + + ProcessPathsToAttachOnAgent(ctx, agent, paths); + + if (AgentsWithAttachDetachRequestsInProgress.size() == + Config->GetMaxInflightAttachDetachPathRequestsProcessing()) + { + return; + } + } +} + +void TDiskRegistryActor::TryToDetachPaths( + const NActors::TActorContext& ctx, + const TString& agentId, + TVector paths, + TRequestInfoPtr requestInfo, + NProto::TAction_EType actionType) +{ + if (AgentsWithAttachDetachRequestsInProgress.contains(agentId)) { + auto response = CreateResponseByActionType( + actionType, + MakeError( + E_REJECTED, + Sprintf( + "already has pending cms action request for agent[%s]", + agentId.Quote().c_str()))); + NCloud::Reply(ctx, *requestInfo, std::move(response)); + return; + } + + const auto* agent = State->FindAgent(agentId); + if (!agent) { + auto response = CreateResponseByActionType( + actionType, + MakeError( + E_NOT_FOUND, + Sprintf("agent[%s] not found", agentId.Quote().c_str()))); + NCloud::Reply(ctx, *requestInfo, std::move(response)); + return; + } + + for (const auto& path: paths) { + auto it = 
agent->GetPathAttachStates().find(path); + if (it == agent->GetPathAttachStates().end() || + it->second == NProto::PATH_ATTACH_STATE_DETACHED) + { + continue; + } + + auto response = CreateResponseByActionType( + actionType, + MakeError( + E_ARGUMENT, + Sprintf( + "path[%s] is not detached on agent[%s]", + path.Quote().c_str(), + agentId.Quote().c_str()))); + + NCloud::Reply(ctx, *requestInfo, std::move(response)); + return; + } + + auto actorId = NCloud::Register( + ctx, + ctx.SelfID, + agentId, + agent->GetNodeId(), + false, // isAttach + Executor()->Generation(), + State->GetDiskAgentGeneration(agentId), + std::move(paths), + Config->GetAttachDetachPathRequestTimeout(), + TRequestContext{ + .RequestInfo = std::move(requestInfo), + .ActionType = actionType}); + AgentsWithAttachDetachRequestsInProgress[agentId] = actorId; +} + +void TDiskRegistryActor::HandleAttachDetachPathOperationCompleted( + const TEvDiskRegistryPrivate::TEvAttachDetachPathOperationCompleted::TPtr& + ev, + const NActors::TActorContext& ctx) +{ + auto* msg = ev->Get(); + if (HasError(msg->Error)) { + LOG_ERROR( + ctx, + TBlockStoreComponents::DISK_REGISTRY, + "[%lu] Failed to %s path on agent %s: %s", + TabletID(), + msg->IsAttach ? "attach" : "detach", + msg->AgentId.Quote().c_str(), + FormatError(msg->Error).c_str()); + } else { + LOG_INFO( + ctx, + TBlockStoreComponents::DISK_REGISTRY, + "[%lu] Successfully %s path on agent %s", + TabletID(), + msg->IsAttach ? 
"attached" : "detached", + msg->AgentId.Quote().c_str()); + } + + AgentsWithAttachDetachRequestsInProgress.erase(msg->AgentId); + + ProcessPathsToAttach(ctx); +} + +} // namespace NCloud::NBlockStore::NStorage diff --git a/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_attach_detach_path_monitoring.cpp b/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_attach_detach_path_monitoring.cpp new file mode 100644 index 00000000000..578f3183b36 --- /dev/null +++ b/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_attach_detach_path_monitoring.cpp @@ -0,0 +1,338 @@ +#include "disk_registry_actor.h" + +namespace NCloud::NBlockStore::NStorage { + +using namespace NActors; +using namespace NKikimr; +using namespace NMonitoringUtils; + +namespace { + +//////////////////////////////////////////////////////////////////////////////// + +class TUpdatePathAttachStateActor final + : public TActorBootstrapped +{ +private: + const TActorId Owner; + const ui64 TabletID; + const TRequestInfoPtr RequestInfo; + const TString AgentID; + const TString Path; + const NProto::EPathAttachState NewState; + const ui64 DiskAgentGeneration; + +public: + TUpdatePathAttachStateActor( + const TActorId& owner, + ui64 tabletID, + TRequestInfoPtr requestInfo, + TString agentId, + TString path, + NProto::EPathAttachState newState, + ui64 diskAgentGeneration); + + void Bootstrap(const TActorContext& ctx); + +private: + void Notify( + const TActorContext& ctx, + TString message, + const EAlertLevel alertLevel); + + void ReplyAndDie(const TActorContext& ctx, NProto::TError error); + +private: + STFUNC(StateWork); + + void HandleUpdatePathAttachStateResponse( + const TEvDiskRegistryPrivate::TEvUpdatePathAttachStateResponse::TPtr& + ev, + const TActorContext& ctx); + + void HandlePoisonPill( + const TEvents::TEvPoisonPill::TPtr& ev, + const TActorContext& ctx); +}; + +//////////////////////////////////////////////////////////////////////////////// + 
+TUpdatePathAttachStateActor::TUpdatePathAttachStateActor( + const TActorId& owner, + ui64 tabletID, + TRequestInfoPtr requestInfo, + TString agentId, + TString path, + NProto::EPathAttachState newState, + ui64 diskAgentGeneration) + : Owner(owner) + , TabletID(tabletID) + , RequestInfo(std::move(requestInfo)) + , AgentID(std::move(agentId)) + , Path(std::move(path)) + , NewState(newState) + , DiskAgentGeneration(diskAgentGeneration) +{} + +void TUpdatePathAttachStateActor::Bootstrap(const TActorContext& ctx) +{ + auto request = std::make_unique< + TEvDiskRegistryPrivate::TEvUpdatePathAttachStateRequest>(); + + request->AgentId = AgentID; + request->Path = Path; + request->NewState = NewState; + request->KnownGeneration = DiskAgentGeneration; + + NCloud::Send(ctx, Owner, std::move(request)); + + Become(&TThis::StateWork); +} + +void TUpdatePathAttachStateActor::Notify( + const TActorContext& ctx, + TString message, + const EAlertLevel alertLevel) +{ + TStringStream out; + + BuildNotifyPageWithRedirect( + out, + std::move(message), + TStringBuilder() << "./app?action=agent&TabletID=" << TabletID + << "&AgentID=" << AgentID, + alertLevel); + + auto response = std::make_unique(out.Str()); + NCloud::Reply(ctx, *RequestInfo, std::move(response)); +} + +void TUpdatePathAttachStateActor::ReplyAndDie( + const TActorContext& ctx, + NProto::TError error) +{ + if (!HasError(error)) { + Notify(ctx, "Operation successfully completed", EAlertLevel::SUCCESS); + } else { + Notify( + ctx, + TStringBuilder() + << "failed to change path[" << Path.Quote() + << "] state on agent[" << AgentID.Quote() << "] to " + << NProto::EPathAttachState_Name(NewState) << ": " + << FormatError(error), + EAlertLevel::DANGER); + } + + NCloud::Send( + ctx, + Owner, + std::make_unique()); + + Die(ctx); +} + +//////////////////////////////////////////////////////////////////////////////// + +void TUpdatePathAttachStateActor::HandlePoisonPill( + const TEvents::TEvPoisonPill::TPtr& ev, + const 
TActorContext& ctx) +{ + Y_UNUSED(ev); + ReplyAndDie(ctx, MakeTabletIsDeadError(E_REJECTED, __LOCATION__)); +} + +void TUpdatePathAttachStateActor::HandleUpdatePathAttachStateResponse( + const TEvDiskRegistryPrivate::TEvUpdatePathAttachStateResponse::TPtr& ev, + const TActorContext& ctx) +{ + const auto* response = ev->Get(); + + ReplyAndDie(ctx, response->GetError()); +} + +//////////////////////////////////////////////////////////////////////////////// + +STFUNC(TUpdatePathAttachStateActor::StateWork) +{ + switch (ev->GetTypeRewrite()) { + HFunc(TEvents::TEvPoisonPill, HandlePoisonPill); + + HFunc( + TEvDiskRegistryPrivate::TEvUpdatePathAttachStateResponse, + HandleUpdatePathAttachStateResponse); + + default: + HandleUnexpectedEvent( + ev, + TBlockStoreComponents::DISK_REGISTRY_WORKER, + __PRETTY_FUNCTION__); + break; + } +} + +} // namespace + +//////////////////////////////////////////////////////////////////////////////// + +void TDiskRegistryActor::HandleHttpInfo_UpdatePathAttachState( + const TActorContext& ctx, + const TCgiParameters& params, + TRequestInfoPtr requestInfo) +{ + if (!Config->GetAttachDetachPathsEnabled()) { + RejectHttpRequest( + ctx, + *requestInfo, + "Attach Detach path feature disabled"); + return; + } + + const auto& newStateRaw = params.Get("NewState"); + const auto& agentId = params.Get("AgentID"); + const auto& path = params.Get("Path"); + + if (!newStateRaw) { + RejectHttpRequest(ctx, *requestInfo, "No new state is given"); + return; + } + + if (!agentId) { + RejectHttpRequest(ctx, *requestInfo, "No agent id is given"); + return; + } + + if (!path) { + RejectHttpRequest(ctx, *requestInfo, "No path is given"); + return; + } + + NProto::EPathAttachState newState; + if (!EPathAttachState_Parse(newStateRaw, &newState)) { + RejectHttpRequest(ctx, *requestInfo, "Invalid new state"); + return; + } + + switch (newState) { + case NProto::PATH_ATTACH_STATE_ATTACHING: + case NProto::PATH_ATTACH_STATE_DETACHED: + break; + default: + 
RejectHttpRequest(ctx, *requestInfo, "Invalid new state"); + return; + } + + LOG_INFO( + ctx, + TBlockStoreComponents::DISK_REGISTRY, + "[%lu] Change state of path[%s] on agent[%s] on monitoring page to %s", + TabletID(), + path.Quote().c_str(), + agentId.Quote().c_str(), + newStateRaw.c_str()); + + const ui64 pathGeneration = State->GetDiskAgentGeneration(agentId); + + auto actor = NCloud::Register( + ctx, + SelfId(), + TabletID(), + std::move(requestInfo), + agentId, + path, + newState, + pathGeneration); + + Actors.insert(actor); +} + +void TDiskRegistryActor::RenderPathAttachStates( + IOutputStream& out, + const NProto::TAgentConfig& agent) const +{ + HTML (out) { + + TAG (TH3) { + out << "Disk Agent Generation: " + << State->GetDiskAgentGeneration(agent.GetAgentId()); + } + + TAG (TH3) { + out << "Path attach states"; + } + + TABLE_SORTABLE_CLASS("table table-bordered") + { + TABLEHEAD () { + TABLER () { + TABLEH () { + out << "Path"; + } + TABLEH () { + out << "Attach State"; + } + TABLEH () { + out << "Path generation"; + } + TABLEH () { + out << "Attach\\Detach Path"; + } + } + } + + for (const auto& [path, state]: agent.GetPathAttachStates()) { + TABLER () { + TABLED () { + out << path; + } + TABLED () { + switch (state) { + case NProto::PATH_ATTACH_STATE_ATTACHED: + out << "Attached"; + break; + case NProto::PATH_ATTACH_STATE_ATTACHING: + out << "Attaching"; + break; + case NProto::PATH_ATTACH_STATE_DETACHED: + out << "Detached"; + break; + default: + out << "unknown state"; + break; + } + } + TABLED () { + const auto renderAttachButton = + state == NProto::PATH_ATTACH_STATE_DETACHED; + const auto buttonText = + TString(renderAttachButton ? "Attach" : "Detach") + + " Path"; + const NProto::EPathAttachState desiriableState = + renderAttachButton + ? NProto::PATH_ATTACH_STATE_ATTACHING + : NProto::PATH_ATTACH_STATE_DETACHED; + + out << Sprintf( + R"(
+ + + + + + +
)", + agent.GetAgentId().c_str(), + path.c_str(), + NProto::EPathAttachState_Name(desiriableState) + .c_str(), + TabletID(), + buttonText.c_str()); + } + } + } + } + } +} + +} // namespace NCloud::NBlockStore::NStorage diff --git a/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_loadstate.cpp b/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_loadstate.cpp index 20c5ae147ad..c72afeb3829 100644 --- a/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_loadstate.cpp +++ b/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_loadstate.cpp @@ -209,6 +209,8 @@ void TDiskRegistryActor::CompleteLoadState( ScheduleDiskRegistryAgentListExpiredParamsCleanup(ctx); + ProcessPathsToAttach(ctx); + if (auto orphanDevices = State->FindOrphanDevices()) { LOG_INFO( ctx, diff --git a/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_monitoring.cpp b/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_monitoring.cpp index 542c1f3af2c..0cbbf16c103 100644 --- a/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_monitoring.cpp +++ b/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_monitoring.cpp @@ -33,6 +33,10 @@ EDeviceStateFlags GetDeviceStateFlags( if (state.IsSuspendedDevice(deviceUUID)) { deviceStateFlags |= EDeviceStateFlags::SUSPENDED; } + if (!state.IsAttachedDevice(deviceUUID)) { + deviceStateFlags |= EDeviceStateFlags::DETACHED; + } + return deviceStateFlags; } @@ -652,6 +656,10 @@ void TDiskRegistryActor::RenderAgentHtmlInfo( unknownDevices, "Unknown devices"); } + + if (Config->GetAttachDetachPathsEnabled()) { + RenderPathAttachStates(out, *agent); + } } } @@ -2436,16 +2444,17 @@ void TDiskRegistryActor::HandleHttpInfo( using THttpHandlers = THashMap; - static const THttpHandlers postActions{{ - {"volumeRealloc", &TDiskRegistryActor::HandleHttpInfo_VolumeRealloc}, - {"replaceDevice", &TDiskRegistryActor::HandleHttpInfo_ReplaceDevice}, - {"changeDeviceState", - 
&TDiskRegistryActor::HandleHttpInfo_ChangeDeviseState}, - {"changeAgentState", - &TDiskRegistryActor::HandleHttpInfo_ChangeAgentState}, - {"resetTransactionLatencyStats", - &TDiskRegistryActor::HandleHttpInfo_ResetTransactionLatencyStats}, - }}; + static const THttpHandlers postActions{ + {{"volumeRealloc", &TDiskRegistryActor::HandleHttpInfo_VolumeRealloc}, + {"replaceDevice", &TDiskRegistryActor::HandleHttpInfo_ReplaceDevice}, + {"changeDeviceState", + &TDiskRegistryActor::HandleHttpInfo_ChangeDeviseState}, + {"changeAgentState", + &TDiskRegistryActor::HandleHttpInfo_ChangeAgentState}, + {"resetTransactionLatencyStats", + &TDiskRegistryActor::HandleHttpInfo_ResetTransactionLatencyStats}, + {"updatePathAttachState", + &TDiskRegistryActor::HandleHttpInfo_UpdatePathAttachState}}}; static const THttpHandlers getActions{{ {"dev", &TDiskRegistryActor::HandleHttpInfo_RenderDeviceHtmlInfo}, diff --git a/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_purge_host_cms.cpp b/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_purge_host_cms.cpp index 8a464d7a4aa..b4e1f4984a6 100644 --- a/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_purge_host_cms.cpp +++ b/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_purge_host_cms.cpp @@ -93,6 +93,8 @@ void TDiskRegistryActor::CompletePurgeHostCms( SecureErase(ctx); StartMigration(ctx); + ProcessPathsToAttach(ctx); + auto response = std::make_unique( std::move(args.Error), diff --git a/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_register.cpp b/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_register.cpp index bdafc8bb57b..6b65c6bc550 100644 --- a/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_register.cpp +++ b/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_register.cpp @@ -185,6 +185,19 @@ void TDiskRegistryActor::CompleteAddAgent( args.DevicesToDisableIO.begin(), args.DevicesToDisableIO.end()); + if 
(Config->GetAttachDetachPathsEnabled()) { + response->Record.SetDiskRegistryGeneration(Executor()->Generation()); + response->Record.SetDiskAgentGeneration( + State->GetDiskAgentGeneration(args.Config.GetAgentId())); + + auto pathsToAttach = + State->GetPathsToAttachOnRegistration(args.Config.GetAgentId()); + + response->Record.MutablePathsToAttach()->Add( + std::make_move_iterator(pathsToAttach.begin()), + std::make_move_iterator(pathsToAttach.end())); + } + NCloud::Reply(ctx, *args.RequestInfo, std::move(response)); SendCachedAcquireRequestsToAgent(ctx, args.Config); diff --git a/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_update_cms_device_state.cpp b/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_update_cms_device_state.cpp index f84d725180c..bf034dce91c 100644 --- a/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_update_cms_device_state.cpp +++ b/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_update_cms_device_state.cpp @@ -86,6 +86,12 @@ void TDiskRegistryActor::CompleteUpdateCmsHostDeviceState( const TActorContext& ctx, TTxDiskRegistry::TUpdateCmsHostDeviceState& args) { + const auto* agent = State->FindAgent(args.Host); + const bool needToDetachPath = + Config->GetAttachDetachPathsEnabled() && + args.State == NProto::DEVICE_STATE_WARNING && !HasError(args.Error) && + agent && agent->GetState() != NProto::AGENT_STATE_UNAVAILABLE; + LOG_INFO(ctx, TBlockStoreComponents::DISK_REGISTRY, "UpdateCmsDeviceState result: %s %u", FormatError(args.Error).c_str(), @@ -98,14 +104,27 @@ void TDiskRegistryActor::CompleteUpdateCmsHostDeviceState( SecureErase(ctx); StartMigration(ctx); - using TResponse = TEvDiskRegistryPrivate::TEvUpdateCmsHostDeviceStateResponse; + ProcessPathsToAttach(ctx); + + if (!needToDetachPath) { + using TResponse = + TEvDiskRegistryPrivate::TEvUpdateCmsHostDeviceStateResponse; + + auto response = std::make_unique( + std::move(args.Error), + args.Timeout, + 
std::move(args.AffectedDisks)); - auto response = std::make_unique( - std::move(args.Error), - args.Timeout, - std::move(args.AffectedDisks)); + NCloud::Reply(ctx, *args.RequestInfo, std::move(response)); + return; + } - NCloud::Reply(ctx, *args.RequestInfo, std::move(response)); + TryToDetachPaths( + ctx, + args.Host, + {args.Path}, + args.RequestInfo, + NProto::TAction::REMOVE_DEVICE); } } // namespace NCloud::NBlockStore::NStorage diff --git a/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_update_cms_host_state.cpp b/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_update_cms_host_state.cpp index cfb341f3955..1d407ac7f66 100644 --- a/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_update_cms_host_state.cpp +++ b/cloud/blockstore/libs/storage/disk_registry/disk_registry_actor_update_cms_host_state.cpp @@ -79,6 +79,12 @@ void TDiskRegistryActor::CompleteUpdateCmsHostState( const TActorContext& ctx, TTxDiskRegistry::TUpdateCmsHostState& args) { + const auto* agent = State->FindAgent(args.Host); + const bool needToDetachPaths = + Config->GetAttachDetachPathsEnabled() && + args.State == NProto::AGENT_STATE_WARNING && !HasError(args.Error) && + agent && agent->GetState() != NProto::AGENT_STATE_UNAVAILABLE; + LOG_INFO(ctx, TBlockStoreComponents::DISK_REGISTRY, "UpdateCmsHost result: Host=%s Error=%s Timeout=%u AffectedDisks=%s", args.Host.c_str(), @@ -102,13 +108,38 @@ void TDiskRegistryActor::CompleteUpdateCmsHostState( SecureErase(ctx); StartMigration(ctx); - using TResponse = TEvDiskRegistryPrivate::TEvUpdateCmsHostStateResponse; + ProcessPathsToAttach(ctx); + + if (!needToDetachPaths) { + using TResponse = TEvDiskRegistryPrivate::TEvUpdateCmsHostStateResponse; + + auto response = std::make_unique(std::move(args.Error)); + response->Timeout = args.Timeout; + response->DependentDiskIds = std::move(args.AffectedDisks); - auto response = std::make_unique(std::move(args.Error)); - response->Timeout = args.Timeout; - 
response->DependentDiskIds = std::move(args.AffectedDisks); + NCloud::Reply(ctx, *args.RequestInfo, std::move(response)); + return; + } + + TVector paths; + for (const auto& [path, state]: agent->GetPathAttachStates()) { + if (state == NProto::PATH_ATTACH_STATE_DETACHED) { + paths.emplace_back(path); + } + } - NCloud::Reply(ctx, *args.RequestInfo, std::move(response)); + for (const auto& device: agent->GetUnknownDevices()) { + paths.emplace_back(device.GetDeviceName()); + } + + SortUnique(paths); + + TryToDetachPaths( + ctx, + args.Host, + std::move(paths), + args.RequestInfo, + NProto::TAction::REMOVE_HOST); } } // namespace NCloud::NBlockStore::NStorage diff --git a/cloud/blockstore/libs/storage/disk_registry/disk_registry_private.h b/cloud/blockstore/libs/storage/disk_registry/disk_registry_private.h index 2ddda5318f2..3b53c024599 100644 --- a/cloud/blockstore/libs/storage/disk_registry/disk_registry_private.h +++ b/cloud/blockstore/libs/storage/disk_registry/disk_registry_private.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -211,6 +212,7 @@ using TVolumeConfig = NKikimrBlockStore::TVolumeConfig; xxx(RestoreDiskRegistryPart, __VA_ARGS__) \ xxx(SwitchAgentDisksToReadOnly, __VA_ARGS__) \ xxx(PurgeHostCms, __VA_ARGS__) \ + xxx(UpdatePathAttachState, __VA_ARGS__) \ // BLOCKSTORE_DISK_REGISTRY_REQUESTS_PRIVATE //////////////////////////////////////////////////////////////////////////////// @@ -783,6 +785,32 @@ struct TEvDiskRegistryPrivate { }; + // + // UpdatePathAttachState + // + + struct TUpdatePathAttachStateRequest + { + TString AgentId; + TString Path; + NProto::EPathAttachState NewState; + ui64 KnownGeneration; + }; + + struct TUpdatePathAttachStateResponse + { + }; + + // + // AttachDetachPathOperationCompleted + // + + struct TAttachDetachPathOperationCompleted + { + TString AgentId; + bool IsAttach; + }; + // // Events declaration // @@ -801,6 +829,8 @@ struct TEvDiskRegistryPrivate 
EvDiskRegistryAgentListExpiredParamsCleanup, + EvAttachDetachPathOperationCompleted, + EvEnd }; @@ -823,6 +853,10 @@ struct TEvDiskRegistryPrivate using TEvDiskRegistryAgentListExpiredParamsCleanup = TRequestEvent< TDiskRegistryAgentListExpiredParamsCleanup, EvDiskRegistryAgentListExpiredParamsCleanup>; + + using TEvAttachDetachPathOperationCompleted = TResponseEvent< + TAttachDetachPathOperationCompleted, + EvAttachDetachPathOperationCompleted>; }; } // namespace NCloud::NBlockStore::NStorage diff --git a/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.cpp b/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.cpp index 834df623a7b..476c86148ca 100644 --- a/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.cpp +++ b/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.cpp @@ -295,6 +295,26 @@ bool HasNewSerialNumber( oldConfig.GetSerialNumber() != newConfig.GetSerialNumber(); } +bool StateTransitionAllowed( + NProto::EPathAttachState newState, + NProto::EPathAttachState oldState) +{ + static const THashMap< + NProto::EPathAttachState, + THashSet> + AllowedTransitions{ + {NProto::PATH_ATTACH_STATE_ATTACHED, + {NProto::PATH_ATTACH_STATE_ATTACHING}}, + {NProto::PATH_ATTACH_STATE_ATTACHING, + {NProto::PATH_ATTACH_STATE_DETACHED}}, + {NProto::PATH_ATTACH_STATE_DETACHED, + {NProto::PATH_ATTACH_STATE_ATTACHING, + NProto::PATH_ATTACH_STATE_ATTACHED}}}; + + const auto* allowedOldStates = AllowedTransitions.FindPtr(newState); + return allowedOldStates && allowedOldStates->contains(oldState); +} + } // namespace //////////////////////////////////////////////////////////////////////////////// @@ -387,7 +407,8 @@ TDiskRegistryState::TDiskRegistryState( CollectDirtyDeviceUUIDs(dirtyDevices), std::move(suspendedDevices), CollectAllocatedDevices(disks), - StorageConfig->GetDiskRegistryAlwaysAllocatesLocalDisks()) + StorageConfig->GetDiskRegistryAlwaysAllocatesLocalDisks(), + StorageConfig->GetAttachDetachPathsEnabled()) , 
BrokenDisks(std::move(brokenDisks)) , AutomaticallyReplacedDevices(std::move(automaticallyReplacedDevices)) , CurrentConfig(std::move(config)) @@ -1027,6 +1048,23 @@ auto TDiskRegistryState::RegisterAgent( } } + // Needed to initialize path states at first start, can be deleted after + // full feature enabling. + if (StorageConfig->GetAttachDetachPathsEnabled()) { + for (const auto& device: agent.GetDevices()) { + if (agent.GetPathAttachStates().contains( + device.GetDeviceName())) + { + continue; + } + + auto& pathAttachStates = *agent.MutablePathAttachStates(); + + pathAttachStates[device.GetDeviceName()] = + NProto::PATH_ATTACH_STATE_ATTACHED; + } + } + DeviceList.UpdateDevices(agent, DevicePoolConfigs, r.PrevNodeId); for (const auto& uuid: r.NewDeviceIds) { @@ -1199,6 +1237,11 @@ auto TDiskRegistryState::GetUnavailableDevicesForDisk( return unavailableDeviceIds; } +ui64 TDiskRegistryState::GetDiskAgentGeneration(const TString& agentId) const +{ + return AgentList.GetDiskAgentGeneration(agentId); +} + NProto::TError TDiskRegistryState::UnregisterAgent( TDiskRegistryDatabase& db, ui32 nodeId) @@ -3022,6 +3065,18 @@ bool TDiskRegistryState::CanSecureErase( return false; } + if (StorageConfig->GetAttachDetachPathsEnabled()) { + if (auto it = agent->GetPathAttachStates().find(device.GetDeviceName()); + it != agent->GetPathAttachStates().end() && + it->second != NProto::PATH_ATTACH_STATE_ATTACHED) + { + STORAGE_DEBUG( + "Skip SecureErase for device '%s'. Device is detached", + device.GetDeviceUUID().c_str()); + return false; + } + } + if (agent->GetState() == NProto::AGENT_STATE_UNAVAILABLE) { STORAGE_DEBUG( "Skip SecureErase for device '%s'. 
Agent is unavailable", @@ -5441,8 +5496,9 @@ NProto::TError TDiskRegistryState::UpdateCmsHostState( CountBrokenHddPlacementGroupPartitionsAfterAgentRemoval(*agent); const ui32 maxBrokenHddPartitions = StorageConfig->GetMaxBrokenHddPlacementGroupPartitionsAfterDeviceRemoval(); - if (!hasDependentDisks - && brokenPlacementGroupPartitions <= maxBrokenHddPartitions) + + if (!hasDependentDisks && + brokenPlacementGroupPartitions <= maxBrokenHddPartitions) { // no dependent disks => we can return this host immediately timeout = TDuration::Zero(); @@ -5508,6 +5564,28 @@ NProto::TError TDiskRegistryState::UpdateCmsHostState( } } + auto attachDetachPaths = [&](bool attach) + { + TVector paths; + for (const auto& device: agent->GetDevices()) { + paths.emplace_back(device.GetDeviceName()); + } + for (const auto& [path, _]: agent->GetPathAttachStates()) { + paths.emplace_back(path); + } + SortUnique(paths); + + for (auto& path: paths) { + AttachDetachPathIfNeeded(db, *agent, path, attach); + } + }; + + if (!HasError(result) && newState == NProto::AGENT_STATE_ONLINE) { + attachDetachPaths(true); // attach + } else if (!HasError(result) && newState == NProto::AGENT_STATE_WARNING) { + attachDetachPaths(false); // detach + } + return result; } @@ -5940,6 +6018,56 @@ auto TDiskRegistryState::ResolveDevices( return { agent, std::move(devices) }; } +bool TDiskRegistryState::HasDependentDisks( + const TAgentId& agentId, + const TString& path) +{ + auto* agent = AgentList.FindAgent(agentId); + if (!agent) { + return false; + } + + return AnyOf( + *agent->MutableDevices(), + [&](auto& device) + { + if (device.GetDeviceName() != path || + device.GetState() == NProto::DEVICE_STATE_ERROR) + { + return false; + } + + return !!FindDisk(device.GetDeviceUUID()); + }); +} + +TVector TDiskRegistryState::GetPathsToAttachOnRegistration( + const TAgentId& agentId) const +{ + if (!StorageConfig->GetAttachDetachPathsEnabled()) { + return {}; + } + + const auto* agent = AgentList.FindAgent(agentId); + 
if (!agent) { + return {}; + } + + std::optional> pathsWithDependentDisks; + + TVector pathsToAttach; + + for (const auto& [path, attachState]: agent->GetPathAttachStates()) { + if (attachState == NProto::PATH_ATTACH_STATE_ATTACHED || + attachState == NProto::PATH_ATTACH_STATE_ATTACHING) + { + pathsToAttach.emplace_back(path); + } + } + + return pathsToAttach; +} + auto TDiskRegistryState::AddNewDevices( TDiskRegistryDatabase& db, NProto::TAgentConfig& agent, @@ -5967,6 +6095,8 @@ auto TDiskRegistryState::AddNewDevices( return {}; } + AttachPathIfNeeded(db, agent, path); + TVector ids; for (auto* device: devices) { ids.push_back(device->GetDeviceUUID()); @@ -6214,6 +6344,17 @@ auto TDiskRegistryState::UpdateCmsDeviceState( } } + if (!dryRun && !HasError(result.Error) && + newState == NProto::DEVICE_STATE_ONLINE) + { + AttachPathIfNeeded(db, *agent, path); + } else if ( + !dryRun && !HasError(result.Error) && + newState == NProto::DEVICE_STATE_WARNING) + { + DetachPathIfNeeded(db, *agent, path); + } + if (processed != devices.size()) { STORAGE_WARN( "UpdateCmsDeviceState stopped after processing %u devices" @@ -6695,6 +6836,150 @@ NProto::TError TDiskRegistryState::AddOutdatedLaggingDevices( return {}; } +NProto::TError TDiskRegistryState::UpdatePathAttachState( + TDiskRegistryDatabase& db, + const TAgentId& agentId, + const TString& path, + NProto::EPathAttachState state, + ui64 knownGeneration) +{ + auto [agent, devices] = ResolveDevices(agentId, path); + if (!agent) { + return MakeError(E_NOT_FOUND, "agent not found"); + } + + auto& pathStates = *agent->MutablePathAttachStates(); + auto it = pathStates.find(path); + if (it == pathStates.end()) { + return MakeError(E_NOT_FOUND, "path not found"); + } + + auto oldState = it->second; + if (oldState == state) { + return MakeError(S_ALREADY); + } + + auto currentGeneration = AgentList.GetDiskAgentGeneration(agentId); + if (currentGeneration != knownGeneration) { + return MakeError(E_FAIL, "outdated transition"); + } + 
+ if (!StateTransitionAllowed(state, oldState)) { + return MakeError( + E_ARGUMENT, + Sprintf( + "Cant update path state from %s to %s", + NProto::EPathAttachState_Name(oldState).c_str(), + NProto::EPathAttachState_Name(state).c_str())); + } + + if (state == NProto::PATH_ATTACH_STATE_DETACHED) { + for (const auto& d: devices) { + auto diskId = DeviceList.FindDiskId(d->GetDeviceUUID()); + if (diskId) { + return MakeError( + E_INVALID_STATE, + "Can't detach path with disks on them"); + } + } + } + + it->second = state; + + DeviceList.UpdateDevices(*agent, DevicePoolConfigs); + UpdateAgent(db, *agent); + + if (state == NProto::PATH_ATTACH_STATE_ATTACHING) { + AgentList.AddPathToAttach(agentId, path); + AgentList.InrementAndGetDiskAgentGeneration(agentId); + } else { + if (state == NProto::PATH_ATTACH_STATE_DETACHED) { + AgentList.InrementAndGetDiskAgentGeneration(agentId); + } + AgentList.DeletePathToAttach(agentId, path); + } + + return {}; +} + +void TDiskRegistryState::AttachPathIfNeeded( + TDiskRegistryDatabase& db, + NProto::TAgentConfig& agent, + const TString& path) +{ + AttachDetachPathIfNeeded( + db, + agent, + path, + true // attach + ); +} + +void TDiskRegistryState::DetachPathIfNeeded( + TDiskRegistryDatabase& db, + NProto::TAgentConfig& agent, + const TString& path) +{ + AttachDetachPathIfNeeded( + db, + agent, + path, + false // attach + ); +} + +void TDiskRegistryState::AttachDetachPathIfNeeded( + TDiskRegistryDatabase& db, + NProto::TAgentConfig& agent, + const TString& path, + bool attach) +{ + if (!StorageConfig->GetAttachDetachPathsEnabled()) { + return; + } + + // We should not detach paths with dependent disks. + if (!attach && HasDependentDisks(agent.GetAgentId(), path)) { + ReportDiskRegistryDetachPathWithDependentDisk( + "Can't detach path with dependent disks", + {{"agent", agent.GetAgentId()}, {"path", path}}); + } + + const auto desirableState = attach ? 
NProto::PATH_ATTACH_STATE_ATTACHING + : NProto::PATH_ATTACH_STATE_DETACHED; + const auto companionState = attach ? NProto::PATH_ATTACH_STATE_ATTACHED + : NProto::PATH_ATTACH_STATE_DETACHED; + + auto& pathStates = *agent.MutablePathAttachStates(); + auto [it, inserted] = pathStates.insert({path, desirableState}); + auto& actualState = it->second; + if (!inserted && actualState != desirableState && + actualState != companionState) + { + actualState = desirableState; + inserted = true; + } + + if (inserted) { + STORAGE_INFO(Sprintf( + "added path[%s] for %s on agent[%s]", + path.Quote().c_str(), + attach ? "attach" : "detach", + agent.GetAgentId().Quote().c_str())); + + AgentList.AddPathToAttach(agent.GetAgentId(), path); + AgentList.InrementAndGetDiskAgentGeneration(agent.GetAgentId()); + UpdateAgent(db, agent); + DeviceList.UpdateDevices(agent, DevicePoolConfigs); + } +} + +const THashMap>& +TDiskRegistryState::GetPathsToAttachDetach() const +{ + return AgentList.GetPathsToAttach(); +} + auto TDiskRegistryState::FindReplicaByMigration( const TDiskId& masterDiskId, const TDeviceId& sourceDeviceId, @@ -7608,6 +7893,22 @@ bool TDiskRegistryState::IsDirtyDevice(const TDeviceId& id) const return DeviceList.IsDirtyDevice(id); } +bool TDiskRegistryState::IsAttachedDevice(const TDeviceId& id) const +{ + const auto* device = DeviceList.FindDevice(id); + if (!device) { + return false; + } + const auto* agent = AgentList.FindAgent(device->GetAgentId()); + + auto it = agent->GetPathAttachStates().find(device->GetDeviceName()); + if (it == agent->GetPathAttachStates().end()) { + return false; + } + + return it->second == NProto::PATH_ATTACH_STATE_ATTACHED; +} + TVector TDiskRegistryState::GetSuspendedDevices() const { return DeviceList.GetSuspendedDevices(); diff --git a/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.h b/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.h index cf18583d987..90ab713aadf 100644 --- 
a/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.h +++ b/cloud/blockstore/libs/storage/disk_registry/disk_registry_state.h @@ -787,7 +787,28 @@ class TDiskRegistryState const TDiskId& diskId, TVector outdatedDevices); - NProto::TError SuspendDevice(TDiskRegistryDatabase& db, const TDeviceId& id); + [[nodiscard]] NProto::TError UpdatePathAttachState( + TDiskRegistryDatabase& db, + const TAgentId& agentId, + const TString& path, + NProto::EPathAttachState state, + ui64 knownGeneration); + + void AttachPathIfNeeded( + TDiskRegistryDatabase& db, + NProto::TAgentConfig& agent, + const TString& path); + + void DetachPathIfNeeded( + TDiskRegistryDatabase& db, + NProto::TAgentConfig& agent, + const TString& path); + + const THashMap>& GetPathsToAttachDetach() const; + + NProto::TError SuspendDevice( + TDiskRegistryDatabase& db, + const TDeviceId& id); void SuspendDeviceIfNeeded( TDiskRegistryDatabase& db, @@ -803,6 +824,9 @@ class TDiskRegistryState const TVector& ids); bool IsSuspendedDevice(const TDeviceId& id) const; bool IsDirtyDevice(const TDeviceId& id) const; + + bool IsAttachedDevice(const TDeviceId& id) const; + TVector GetSuspendedDevices() const; const TDeque& GetAutomaticallyReplacedDevices() const @@ -905,6 +929,16 @@ class TDiskRegistryState THashSet GetUnavailableDevicesForDisk( const TString& diskId) const; + ui64 GetDiskAgentGeneration(const TString& agentId) const; + + auto ResolveDevices(const TAgentId& agentId, const TString& path) + -> std::pair>; + + bool HasDependentDisks(const TAgentId& agentId, const TString& path); + + TVector GetPathsToAttachOnRegistration( + const TAgentId& agentId) const; + private: void ProcessConfig(const NProto::TDiskRegistryConfig& config); void ProcessDisks(TVector disks); @@ -1306,9 +1340,6 @@ class TDiskRegistryState NProto::EVolumeIOMode GetIoMode(TDiskState& disk, TInstant now) const; - auto ResolveDevices(const TAgentId& agentId, const TString& path) - -> std::pair>; - NProto::TError CmsAddDevice( 
TDiskRegistryDatabase& db, NProto::TAgentConfig& agent, @@ -1377,6 +1408,12 @@ class TDiskRegistryState void ReallocateDisksWithLostOrReappearedDevices( const TAgentList::TAgentRegistrationResult& r, THashSet& disksToReallocate); + + void AttachDetachPathIfNeeded( + TDiskRegistryDatabase& db, + NProto::TAgentConfig& agent, + const TString& path, + bool attach); }; } // namespace NCloud::NBlockStore::NStorage diff --git a/cloud/blockstore/libs/storage/disk_registry/disk_registry_tx.h b/cloud/blockstore/libs/storage/disk_registry/disk_registry_tx.h index 30515c6596b..a46e351a5ec 100644 --- a/cloud/blockstore/libs/storage/disk_registry/disk_registry_tx.h +++ b/cloud/blockstore/libs/storage/disk_registry/disk_registry_tx.h @@ -63,6 +63,7 @@ namespace NCloud::NBlockStore::NStorage { xxx(PurgeHostCms, __VA_ARGS__) \ xxx(RemoveOrphanDevices, __VA_ARGS__) \ xxx(AddOutdatedLaggingDevices, __VA_ARGS__) \ + xxx(UpdatePathAttachState, __VA_ARGS__) \ // BLOCKSTORE_DISK_REGISTRY_TRANSACTIONS //////////////////////////////////////////////////////////////////////////////// @@ -1479,6 +1480,39 @@ struct TTxDiskRegistry Error.Clear(); } }; + + // + // UpdatePathAttachState + // + + struct TUpdatePathAttachState + { + const TRequestInfoPtr RequestInfo; + const TString AgentId; + const TString Path; + const NProto::EPathAttachState NewState; + const ui64 KnownGeneration; + + NProto::TError Error; + + TUpdatePathAttachState( + TRequestInfoPtr requestInfo, + TString agentId, + TString path, + NProto::EPathAttachState newState, + ui64 knownGeneration) + : RequestInfo(std::move(requestInfo)) + , AgentId(std::move(agentId)) + , Path(std::move(path)) + , NewState(newState) + , KnownGeneration(knownGeneration) + {} + + void Clear() + { + Error.Clear(); + } + }; }; } // namespace NCloud::NBlockStore::NStorage diff --git a/cloud/blockstore/libs/storage/disk_registry/disk_registry_ut_cms.cpp b/cloud/blockstore/libs/storage/disk_registry/disk_registry_ut_cms.cpp index 31ebe3d456b..a0a7b995da3 
100644 --- a/cloud/blockstore/libs/storage/disk_registry/disk_registry_ut_cms.cpp +++ b/cloud/blockstore/libs/storage/disk_registry/disk_registry_ut_cms.cpp @@ -1475,6 +1475,377 @@ Y_UNIT_TEST_SUITE(TDiskRegistryTest) auto response = DiskRegistry->RecvDisableAgentResponse(); UNIT_ASSERT_VALUES_EQUAL(E_NOT_FOUND, response->GetError().GetCode()); } + + Y_UNIT_TEST_F( + ShouldAttachPathOnAddDevice, + TFixture) + { + const auto agent = CreateAgentConfig( + "agent-1", + {Device("dev-1", "uuid-1", "rack-1", 10_GB), + Device("dev-2", "uuid-2", "rack-1", 10_GB)}); + + auto config = CreateDefaultStorageConfig(); + + config.SetAttachDetachPathsEnabled(true); + + SetUpRuntime(TTestRuntimeBuilder().WithAgents({agent}).With(config).Build()); + + DiskRegistry->SetWritableState(true); + + DiskRegistry->UpdateConfig(CreateRegistryConfig(0, {})); + + RegisterAgents(*Runtime, 1); + + ui64 attachRequests = 0; + ui64 detachRequests = 0; + + ui64 diskAgentGeneration = 0; + TVector pathsToAttach; + TVector pathsToDetach; + + Runtime->SetObserverFunc( + [&](TAutoPtr& event) + { + if (event->GetTypeRewrite() == + TEvDiskAgent::EvAttachPathRequest) { + attachRequests += 1; + auto* baseEvent = + event->Get(); + diskAgentGeneration = baseEvent->Record.GetDiskAgentGeneration(); + for (const auto& path: baseEvent->Record.GetPathsToAttach()) + { + pathsToAttach.emplace_back(path); + } + } else if ( + event->GetTypeRewrite() == + TEvDiskAgent::EvDetachPathRequest) + { + detachRequests += 1; + auto* baseEvent = + event->Get(); + diskAgentGeneration = + baseEvent->Record.GetDiskAgentGeneration(); + for (const auto& path: baseEvent->Record.GetPathsToDetach()) + { + pathsToDetach.emplace_back(path); + } + } + + return TTestActorRuntimeBase::DefaultObserverFunc(event); + }); + + AddHost("agent-1"); + + Runtime->DispatchEvents({}, TDuration::MilliSeconds(10)); + + UNIT_ASSERT_VALUES_EQUAL(1, attachRequests); + UNIT_ASSERT_VALUES_EQUAL(0, detachRequests); + UNIT_ASSERT_VALUES_EQUAL(2, 
pathsToAttach.size()); + UNIT_ASSERT_VALUES_EQUAL(0, pathsToDetach.size()); + + + UNIT_ASSERT_VALUES_EQUAL(3, diskAgentGeneration); + UNIT_ASSERT(FindPtr(pathsToAttach, "dev-1")); + UNIT_ASSERT(FindPtr(pathsToAttach, "dev-2")); + + pathsToAttach.clear(); + pathsToDetach.clear(); + + for (;;) { + auto error = RemoveHost("agent-1").first; + if (!HasError(error)) { + break; + } + UNIT_ASSERT_VALUES_EQUAL(E_TRY_AGAIN, error.GetCode()); + Runtime->DispatchEvents({}, TDuration::MilliSeconds(10)); + } + + UNIT_ASSERT_VALUES_EQUAL(1, attachRequests); + UNIT_ASSERT_VALUES_EQUAL(1, detachRequests); + UNIT_ASSERT_VALUES_EQUAL(0, pathsToAttach.size()); + UNIT_ASSERT_VALUES_EQUAL(2, pathsToDetach.size()); + + + UNIT_ASSERT_VALUES_EQUAL(5, diskAgentGeneration); + UNIT_ASSERT(FindPtr(pathsToDetach, "dev-1")); + UNIT_ASSERT(FindPtr(pathsToDetach, "dev-2")); + + pathsToAttach.clear(); + pathsToDetach.clear(); + + AddDevice("agent-1", "dev-1"); + + Runtime->DispatchEvents({}, TDuration::MilliSeconds(10)); + + UNIT_ASSERT_VALUES_EQUAL(2, attachRequests); + UNIT_ASSERT_VALUES_EQUAL(1, detachRequests); + UNIT_ASSERT_VALUES_EQUAL(1, pathsToAttach.size()); + UNIT_ASSERT_VALUES_EQUAL(0, pathsToDetach.size()); + + UNIT_ASSERT_VALUES_EQUAL(6, diskAgentGeneration); + UNIT_ASSERT(FindPtr(pathsToAttach, "dev-1")); + + pathsToAttach.clear(); + pathsToDetach.clear(); + + + for (;;) { + auto error = RemoveDevice("agent-1", "dev-1").first; + if (!HasError(error)) { + break; + } + UNIT_ASSERT_VALUES_EQUAL(E_TRY_AGAIN, error.GetCode()); + Runtime->DispatchEvents({}, TDuration::MilliSeconds(10)); + } + + + UNIT_ASSERT_VALUES_EQUAL(2, attachRequests); + UNIT_ASSERT_VALUES_EQUAL(2, detachRequests); + UNIT_ASSERT_VALUES_EQUAL(0, pathsToAttach.size()); + UNIT_ASSERT_VALUES_EQUAL(1, pathsToDetach.size()); + + UNIT_ASSERT_VALUES_EQUAL(7, diskAgentGeneration); + UNIT_ASSERT(FindPtr(pathsToDetach, "dev-1")); + } + + Y_UNIT_TEST_F( + ShouldPassDrGeneration, + TFixture) + { + const auto agent = 
CreateAgentConfig( + "agent-1", + {Device("dev-1", "uuid-1", "rack-1", 10_GB), + Device("dev-2", "uuid-2", "rack-1", 10_GB)}); + + auto config = CreateDefaultStorageConfig(); + + config.SetAttachDetachPathsEnabled(true); + + SetUpRuntime(TTestRuntimeBuilder().WithAgents({agent}).With(config).Build()); + + DiskRegistry->SetWritableState(true); + + DiskRegistry->UpdateConfig(CreateRegistryConfig(0, {})); + + RegisterAgents(*Runtime, 1); + + ui32 drGeneration = 0; + + Runtime->SetObserverFunc( + [&](TAutoPtr& event) + { + if (event->GetTypeRewrite() == + TEvDiskAgent::EvAttachPathRequest) { + auto* baseEvent = + event->Get(); + drGeneration = baseEvent->Record.GetDiskRegistryGeneration(); + } else if ( + event->GetTypeRewrite() == + TEvDiskAgent::EvDetachPathRequest) + { + auto* baseEvent = + event->Get(); + drGeneration = baseEvent->Record.GetDiskRegistryGeneration(); + } + + return TTestActorRuntimeBase::DefaultObserverFunc(event); + }); + + AddHost("agent-1"); + UNIT_ASSERT_VALUES_EQUAL(2, drGeneration); + + drGeneration = 0; + + auto [res, timeout] = RemoveHost("agent-1"); + UNIT_ASSERT_VALUES_EQUAL(S_OK, res.GetCode()); + Runtime->DispatchEvents({}, TDuration::MilliSeconds(100)); + UNIT_ASSERT_VALUES_EQUAL(2, drGeneration); + + DiskRegistry->RebootTablet(); + DiskRegistry->WaitReady(); + + AddHost("agent-1"); + UNIT_ASSERT_VALUES_EQUAL(3, drGeneration); + + drGeneration = 0; + + RemoveHost("agent-1"); + Runtime->DispatchEvents({}, TDuration::MilliSeconds(100)); + UNIT_ASSERT_VALUES_EQUAL(3, drGeneration); + } + + Y_UNIT_TEST_F(ShouldNotAllocateDisksOnNotAttachedDevices, TFixture) + { + const auto agent = CreateAgentConfig( + "agent-1", + {Device("dev-1", "uuid-1", "rack-1", 10_GB), + Device("dev-2", "uuid-2", "rack-1", 10_GB)}); + + auto config = CreateDefaultStorageConfig(); + + config.SetAttachDetachPathsEnabled(true); + + SetUpRuntime(TTestRuntimeBuilder().WithAgents({agent}).With(config).Build()); + + DiskRegistry->SetWritableState(true); + + 
DiskRegistry->UpdateConfig(CreateRegistryConfig(0, {})); + + RegisterAgents(*Runtime, 1); + + Runtime->SetObserverFunc( + [&](TAutoPtr& event) + { + if (event->GetTypeRewrite() == + TEvDiskAgent::EvAttachPathResponse) { + return TTestActorRuntimeBase::EEventAction::DROP; + } + + return TTestActorRuntimeBase::DefaultObserverFunc(event); + }); + + auto [res, timeout] = AddHost("agent-1"); + UNIT_ASSERT_VALUES_EQUAL(S_OK, res.GetCode()); + Runtime->AdvanceCurrentTime(TDuration::Minutes(2)); + Runtime->DispatchEvents({}, TDuration::MilliSeconds(10)); + + // check that no devices available for allocate + DiskRegistry->SendAllocateDiskRequest("vol1", 10_GB); + auto response = DiskRegistry->RecvAllocateDiskResponse(); + UNIT_ASSERT_VALUES_EQUAL( + E_BS_DISK_ALLOCATION_FAILED, + response->GetStatus()); + + Runtime->SetObserverFunc(&TTestActorRuntimeBase::DefaultObserverFunc); + + Runtime->AdvanceCurrentTime(TDuration::Minutes(2)); + Runtime->DispatchEvents({}, TDuration::MilliSeconds(10)); + + DiskRegistry->AllocateDisk("vol1", 10_GB); + } + + Y_UNIT_TEST_F(ShouldNotDetachPathsWithDisksOnThem, TFixture) + { + const auto agent = CreateAgentConfig( + "agent-1", + {Device("dev-1", "uuid-1", "rack-1", 10_GB), + Device("dev-2", "uuid-2", "rack-1", 10_GB)}); + + auto config = CreateDefaultStorageConfig(); + + config.SetAttachDetachPathsEnabled(true); + + SetUpRuntime( + TTestRuntimeBuilder().WithAgents({agent}).With(config).Build()); + + DiskRegistry->SetWritableState(true); + + DiskRegistry->UpdateConfig(CreateRegistryConfig(0, {})); + + RegisterAgents(*Runtime, 1); + + AddDevice("agent-1", "dev-1"); + WaitForSecureErase(*Runtime, 1); + + auto response = DiskRegistry->AllocateDisk("vol0", 10_GB); + const auto& msg = response->Record; + UNIT_ASSERT_VALUES_EQUAL(1, msg.DevicesSize()); + UNIT_ASSERT_VALUES_EQUAL(0, msg.MigrationsSize()); + + AddDevice("agent-1", "dev-2"); + WaitForSecureErase(*Runtime, 1); + + ui64 detachRequests = 0; + + Runtime->SetObserverFunc( + [&](TAutoPtr& 
event) + { + if (event->GetTypeRewrite() == + TEvDiskAgent::EvDetachPathRequest) { + detachRequests += 1; + } + + return TTestActorRuntimeBase::DefaultObserverFunc(event); + }); + + auto [error, timeout] = RemoveDevice(agent.GetAgentId(), "dev-1"); + + UNIT_ASSERT_VALUES_EQUAL(E_TRY_AGAIN, error.GetCode()); + UNIT_ASSERT_VALUES_EQUAL( + config.GetNonReplicatedInfraTimeout(), + timeout * 1000); + + Runtime->DispatchEvents({}, TDuration::MilliSeconds(10)); + + UNIT_ASSERT_VALUES_EQUAL(0, detachRequests); + + DiskRegistry->FinishMigration("vol0", "uuid-1", "uuid-2"); + + for (;;) { + auto error = RemoveDevice(agent.GetAgentId(), "dev-1").first; + if (!HasError(error)) { + break; + } + UNIT_ASSERT_VALUES_EQUAL(E_TRY_AGAIN, error.GetCode()); + Runtime->DispatchEvents({}, TDuration::MilliSeconds(10)); + } + + UNIT_ASSERT_VALUES_EQUAL(1, detachRequests); + } + + Y_UNIT_TEST_F(ShouldBrokeDeviceOnAttachError, TFixture) + { + const auto agent = CreateAgentConfig( + "agent-1", + {CreateDeviceConfig( + {.Name = "dev-1", + .Id = "uuid-1", + .Rack = "rack-1", + .TotalSize = 10_GB})}); + + auto config = CreateDefaultStorageConfig(); + + config.SetAttachDetachPathsEnabled(true); + + SetUpRuntime( + TTestRuntimeBuilder().WithAgents({agent}).With(config).Build()); + + DiskRegistry->SetWritableState(true); + + DiskRegistry->UpdateConfig(CreateRegistryConfig(0, {})); + + RegisterAgents(*Runtime, 1); + + auto response = DiskRegistry->BackupDiskRegistryState(false); + UNIT_ASSERT( + NProto::DEVICE_STATE_ONLINE == response->Record.GetBackup() + .GetAgents(0) + .GetUnknownDevices(0) + .GetState()); + + // Simulate device breaking during attach.
+ { + auto sender = Runtime->AllocateEdgeActor(); + auto nodeId = Runtime->GetNodeId(0); + + auto request = std::make_unique(); + request->DeviceUUID = "uuid-1"; + + Runtime->Send(new NActors::IEventHandle( + MakeDiskAgentServiceId(nodeId), + sender, + request.release())); + + Runtime->DispatchEvents({}, TDuration::Seconds(1)); + } + + AddDevice("agent-1", "dev-1"); + response = DiskRegistry->BackupDiskRegistryState(false); + UNIT_ASSERT( + NProto::DEVICE_STATE_ERROR == + response->Record.GetBackup().GetAgents(0).GetDevices(0).GetState()); + } } } // namespace NCloud::NBlockStore::NStorage diff --git a/cloud/blockstore/libs/storage/disk_registry/model/agent_list.cpp b/cloud/blockstore/libs/storage/disk_registry/model/agent_list.cpp index 0527039e2ed..b0d11559c39 100644 --- a/cloud/blockstore/libs/storage/disk_registry/model/agent_list.cpp +++ b/cloud/blockstore/libs/storage/disk_registry/model/agent_list.cpp @@ -88,7 +88,8 @@ TAgentList::TAgentList( const TAgentListConfig& config, NMonitoring::TDynamicCountersPtr counters, TVector configs, - THashMap diskRegistryAgentListParams, + THashMap + diskRegistryAgentListParams, TLog log) : Config(config) , ComponentGroup(std::move(counters)) @@ -104,6 +105,14 @@ TAgentList::TAgentList( if (ComponentGroup) { RejectAgentTimeoutCounter.Register(ComponentGroup, "RejectAgentTimeout"); } + + for (const auto& agents: Agents) { + for (const auto& [path, state]: agents.GetPathAttachStates()) { + if (state == NProto::PATH_ATTACH_STATE_ATTACHING) { + AddPathToAttach(agents.GetAgentId(), path); + } + } + } } NProto::TAgentConfig& TAgentList::AddAgent(NProto::TAgentConfig config) @@ -647,6 +656,8 @@ void TAgentList::RemoveAgentByIdx(size_t index) auto& agent = Agents.back(); + DiskAgentGenerations.erase(agent.GetAgentId()); + PathsToAttach.erase(agent.GetAgentId()); AgentIdToIdx.erase(agent.GetAgentId()); NodeIdToIdx.erase(agent.GetNodeId()); @@ -692,4 +703,53 @@ TVector TAgentList::GetAgentIdsWithOverriddenListParams() const return 
agentIds; } +ui64 TAgentList::GetDiskAgentGeneration(const TString& agentId) const +{ + const auto* agentGeneration = DiskAgentGenerations.FindPtr(agentId); + + return agentGeneration ? *agentGeneration : 1; +} + +ui64 TAgentList::InrementAndGetDiskAgentGeneration(const TString& agentId) +{ + auto& agentGeneration = DiskAgentGenerations[agentId]; + + if (!agentGeneration) { + agentGeneration = 1; + } + + return ++agentGeneration; +} + +auto TAgentList::GetPathsToAttach() const + -> const THashMap>& +{ + return PathsToAttach; +} + +void TAgentList::AddPathToAttach( + const TString& agentId, + const TString& path) +{ + auto& pathsForAgent = PathsToAttach[agentId]; + pathsForAgent.insert(path); +} + +void TAgentList::DeletePathToAttach( + const TString& agentId, + const TString& path) +{ + auto it = PathsToAttach.find(agentId); + if (it == PathsToAttach.end()) { + return; + } + + auto& pathsForAgent = it->second; + pathsForAgent.erase(path); + + if (!pathsForAgent) { + PathsToAttach.erase(it); + } +} + } // namespace NCloud::NBlockStore::NStorage diff --git a/cloud/blockstore/libs/storage/disk_registry/model/agent_list.h b/cloud/blockstore/libs/storage/disk_registry/model/agent_list.h index 2e28d709baa..eb46818d133 100644 --- a/cloud/blockstore/libs/storage/disk_registry/model/agent_list.h +++ b/cloud/blockstore/libs/storage/disk_registry/model/agent_list.h @@ -43,6 +43,8 @@ class TAgentList using TDeviceId = TString; using TNodeId = ui32; + using TPathGenerations = THashMap; + private: const TAgentListConfig Config; const NMonitoring::TDynamicCountersPtr ComponentGroup; @@ -60,6 +62,10 @@ class TAgentList THashMap DiskRegistryAgentListParams; + THashMap DiskAgentGenerations; + + THashMap> PathsToAttach; + TLog Log; public: @@ -134,6 +140,13 @@ class TAgentList TVector CleanupExpiredAgentListParams(TInstant now); TVector GetAgentIdsWithOverriddenListParams() const; + ui64 GetDiskAgentGeneration(const TString& agentId) const; + ui64 
InrementAndGetDiskAgentGeneration(const TString& agentId); + + const THashMap>& GetPathsToAttach() const; + void AddPathToAttach(const TString& agentId, const TString& path); + void DeletePathToAttach(const TString& agentId, const TString& path); + private: NProto::TAgentConfig& AddAgent(NProto::TAgentConfig config); diff --git a/cloud/blockstore/libs/storage/disk_registry/model/device_list.cpp b/cloud/blockstore/libs/storage/disk_registry/model/device_list.cpp index 6e7bbfe1757..6f0a387a592 100644 --- a/cloud/blockstore/libs/storage/disk_registry/model/device_list.cpp +++ b/cloud/blockstore/libs/storage/disk_registry/model/device_list.cpp @@ -83,13 +83,15 @@ auto FindDeviceRange( TDeviceList::TDeviceList() : AlwaysAllocateLocalDisks(false) + , AttachDetachPathsEnabled(false) {} TDeviceList::TDeviceList( TVector dirtyDevices, TVector suspendedDevices, TVector> allocatedDevices, - bool alwaysAllocateLocalDisks) + bool alwaysAllocateLocalDisks, + bool attachDetachPathsEnabled) : AllocatedDevices( std::make_move_iterator(allocatedDevices.begin()), std::make_move_iterator(allocatedDevices.end())) @@ -97,6 +99,7 @@ TDeviceList::TDeviceList( std::make_move_iterator(dirtyDevices.begin()), std::make_move_iterator(dirtyDevices.end())) , AlwaysAllocateLocalDisks(alwaysAllocateLocalDisks) + , AttachDetachPathsEnabled(attachDetachPathsEnabled) { for (auto& device: suspendedDevices) { auto id = device.GetId(); @@ -162,11 +165,20 @@ void TDeviceList::UpdateDevices( const ui64 deviceSize = device.GetBlockSize() * device.GetBlocksCount(); + bool isDeviceAttached = true; + if (AttachDetachPathsEnabled) { + auto it = agent.GetPathAttachStates().find(device.GetDeviceName()); + if (it != agent.GetPathAttachStates().end()) { + isDeviceAttached = + it->second == NProto::PATH_ATTACH_STATE_ATTACHED; + } + } + const bool isFree = DevicesAllocationAllowed(device.GetPoolKind(), agent.GetState()) && device.GetState() == NProto::DEVICE_STATE_ONLINE && !AllocatedDevices.contains(uuid) && 
!DirtyDevices.contains(uuid) && - !SuspendedDevices.contains(uuid); + !SuspendedDevices.contains(uuid) && isDeviceAttached; const auto* poolConfig = poolConfigs.FindPtr(device.GetPoolName()); if (!poolConfig || poolConfig->GetKind() != device.GetPoolKind() || diff --git a/cloud/blockstore/libs/storage/disk_registry/model/device_list.h b/cloud/blockstore/libs/storage/disk_registry/model/device_list.h index b345f04263b..79d7fdd72fe 100644 --- a/cloud/blockstore/libs/storage/disk_registry/model/device_list.h +++ b/cloud/blockstore/libs/storage/disk_registry/model/device_list.h @@ -65,6 +65,7 @@ class TDeviceList THashMap> PoolKind2PoolNames; THashMap PoolName2DeviceCount; const bool AlwaysAllocateLocalDisks; + const bool AttachDetachPathsEnabled; public: struct TAllocationQuery @@ -91,7 +92,8 @@ class TDeviceList TVector dirtyDevices, TVector suspendedDevices, TVector> allocatedDevices, - bool alwaysAllocateLocalDisks); + bool alwaysAllocateLocalDisks, + bool attachDetachPathsEnabled); [[nodiscard]] size_t Size() const { diff --git a/cloud/blockstore/libs/storage/disk_registry/model/device_list_ut.cpp b/cloud/blockstore/libs/storage/disk_registry/model/device_list_ut.cpp index 23a348a3f8d..95adfff1474 100644 --- a/cloud/blockstore/libs/storage/disk_registry/model/device_list_ut.cpp +++ b/cloud/blockstore/libs/storage/disk_registry/model/device_list_ut.cpp @@ -1258,7 +1258,8 @@ Y_UNIT_TEST_SUITE(TDeviceListTest) {}, // dirtyDevices {}, // suspendedDevices {{device.GetDeviceUUID(), diskId}}, - false // alwaysAllocateLocalDisks + false, // alwaysAllocateLocalDisks, + false // attachDetachPathsEnabled }; deviceList.UpdateDevices(agent, poolConfigs); @@ -1268,10 +1269,11 @@ Y_UNIT_TEST_SUITE(TDeviceListTest) { TDeviceList deviceList{ - {}, // dirtyDevices - {}, // suspendedDevices - {}, // allocatedDevices - false // alwaysAllocateLocalDisks + {}, // dirtyDevices + {}, // suspendedDevices + {}, // allocatedDevices + false, // alwaysAllocateLocalDisks + false // 
attachDetachPathsEnabled }; deviceList.UpdateDevices(agent, poolConfigs); diff --git a/cloud/blockstore/libs/storage/disk_registry/testlib/test_env.h b/cloud/blockstore/libs/storage/disk_registry/testlib/test_env.h index 1a6d2d5edd1..0fd2d2f91d6 100644 --- a/cloud/blockstore/libs/storage/disk_registry/testlib/test_env.h +++ b/cloud/blockstore/libs/storage/disk_registry/testlib/test_env.h @@ -47,6 +47,7 @@ constexpr ui32 DefaultLogicalBlockSize = 4_KB; enum ETestEvents { EvRegisterAgent = EventSpaceBegin(NKikimr::TEvents::ES_USERSPACE + 1), + EvBrokeDevice = EvRegisterAgent + 1, }; struct TEvRegisterAgent @@ -55,6 +56,13 @@ struct TEvRegisterAgent DEFINE_SIMPLE_NONLOCAL_EVENT(TEvRegisterAgent, "TEvRegisterAgent"); }; +struct TEvBrokeDevice + : public NActors::TEventBase +{ + TString DeviceUUID; + DEFINE_SIMPLE_NONLOCAL_EVENT(TEvBrokeDevice, "TEvBrokeDevice"); +}; + //////////////////////////////////////////////////////////////////////////////// struct TByUUID @@ -168,7 +176,12 @@ class TTestDiskAgent final HFunc(TEvDiskAgent::TEvSecureEraseDeviceRequest, HandleSecureEraseDevice); HFunc(TEvDiskAgent::TEvEnableAgentDeviceRequest, HandleEnableAgentDevice); + HFunc(TEvDiskAgent::TEvAttachPathRequest, HandleAttachPath); + HFunc(TEvDiskAgent::TEvDetachPathRequest, HandleDetachPath); + + HFunc(TEvRegisterAgent, HandleRegisterAgent) + HFunc(TEvBrokeDevice, HandleBrokeDevice) default: //HandleUnexpectedEvent(ev, TBlockStoreComponents::DISK_REGISTRY); @@ -193,6 +206,21 @@ class TTestDiskAgent final RegisterAgent(ctx); } + void HandleBrokeDevice( + const TEvBrokeDevice::TPtr& ev, + const NActors::TActorContext& ctx) + { + Y_UNUSED(ctx); + auto& uuid = ev->Get()->DeviceUUID; + auto* device = FindIfPtr( + *Config.MutableDevices(), + [&](auto& device) { return device.GetDeviceUUID() == uuid; }); + + if (device) { + device->SetState(NProto::DEVICE_STATE_ERROR); + } + } + void HandleRegisterAgent( const TEvDiskRegistry::TEvRegisterAgentResponse::TPtr& ev, const 
NActors::TActorContext& ctx) @@ -268,6 +296,34 @@ class TTestDiskAgent final *ev, std::make_unique()); } + + void HandleAttachPath( + const TEvDiskAgent::TEvAttachPathRequest::TPtr& ev, + const NActors::TActorContext& ctx) + { + auto& record = ev->Get()->Record; + THashSet paths; + for (const auto& path: record.GetPathsToAttach()) { + paths.emplace(path); + } + + auto response = std::make_unique(); + for (auto& device : Config.GetDevices()) { + if (paths.contains(device.GetDeviceName())) { + *response->Record.AddAttachedDevices() = device; + } + } + NCloud::Reply(ctx, *ev, std::move(response)); + } + + void HandleDetachPath( + const TEvDiskAgent::TEvDetachPathRequest::TPtr& ev, + const NActors::TActorContext& ctx) + { + auto response = std::make_unique(); + + NCloud::Reply(ctx, *ev, std::move(response)); + } }; inline TTestDiskAgent* CreateTestDiskAgent(NProto::TAgentConfig config) @@ -318,6 +374,7 @@ struct TCreateDeviceConfigParams TString Rack = "the-rack"; ui64 TotalSize = 10_GB; ui32 BlockSize = DefaultBlockSize; + NProto::EDeviceState DeviceState = NProto::DEVICE_STATE_ONLINE; }; inline NProto::TDeviceConfig CreateDeviceConfig( @@ -334,6 +391,7 @@ inline NProto::TDeviceConfig CreateDeviceConfig( device.SetTransportId(params.Id); device.MutableRdmaEndpoint()->SetHost(std::move(params.Id)); device.MutableRdmaEndpoint()->SetPort(10020); + device.SetState(params.DeviceState); return device; } @@ -343,14 +401,16 @@ inline NProto::TDeviceConfig Device( TString uuid, TString rack = "the-rack", ui64 totalSize = 10_GB, - ui32 blockSize = DefaultBlockSize) + ui32 blockSize = DefaultBlockSize, + NProto::EDeviceState deviceState = NProto::DEVICE_STATE_ONLINE) { return CreateDeviceConfig({ .Name = std::move(name), .Id = std::move(uuid), .Rack = std::move(rack), .TotalSize = totalSize, - .BlockSize = blockSize + .BlockSize = blockSize, + .DeviceState = deviceState }); } diff --git a/cloud/blockstore/libs/storage/disk_registry/ya.make 
b/cloud/blockstore/libs/storage/disk_registry/ya.make index 0773550a270..61b96c572ad 100644 --- a/cloud/blockstore/libs/storage/disk_registry/ya.make +++ b/cloud/blockstore/libs/storage/disk_registry/ya.make @@ -4,6 +4,8 @@ SRCS( disk_registry_actor_acquire.cpp disk_registry_actor_add_lagging_devices.cpp disk_registry_actor_allocate.cpp + disk_registry_actor_attach_detach_path_monitoring.cpp + disk_registry_actor_attach_detach_path.cpp disk_registry_actor_backup_state.cpp disk_registry_actor_change_agent_state.cpp disk_registry_actor_change_device_state.cpp diff --git a/cloud/blockstore/libs/storage/protos/disk.proto b/cloud/blockstore/libs/storage/protos/disk.proto index 3661260dd50..1260cf93e21 100644 --- a/cloud/blockstore/libs/storage/protos/disk.proto +++ b/cloud/blockstore/libs/storage/protos/disk.proto @@ -33,6 +33,47 @@ enum ECheckpointState //////////////////////////////////////////////////////////////////////////////// +// passive - send attach/detach path request only on registration +// active - send attach/detach path request on registration and keep trying to send these requests to DA until success + +/////////////////////////////////////////////////////////////////////////////////////////////////////////// +// // // // +// State // Can allocate disks? // Should try attach or detach path?
// +// // // // +/////////////////////////////////////////////////////////////////////////////////////////////////////////// +// // // // +// PATH_ATTACH_STATE_ATTACHED // Yes // passive attach // +// // // // +/////////////////////////////////////////////////////////////////////////////////////////////////////////// +// // // // +// PATH_ATTACH_STATE_ATTACHING // No // active attach // +// // // // +/////////////////////////////////////////////////////////////////////////////////////////////////////////// +// // // // +// PATH_ATTACH_STATE_DETACHED // No // passive detach // +// // // // +/////////////////////////////////////////////////////////////////////////////////////////////////////////// + + +enum EPathAttachState +{ + PATH_ATTACH_STATE_ATTACHED = 0; + PATH_ATTACH_STATE_ATTACHING = 1; + PATH_ATTACH_STATE_DETACHED = 2; +} + +//////////////////////////////////////////////////////////////////////////////// + +message TPathToGeneration +{ + // Path to HW device. + string Path = 1; + + // Monotonically increasing counter, needed to order attach and detach requests. + uint64 Generation = 2; +} + +//////////////////////////////////////////////////////////////////////////////// // Mean time between failures. message TMeanTimeBetweenFailures { @@ -86,6 +127,9 @@ message TAgentConfig // Flag which indicates that the agent was launched as a temporary one // during blue-green deployment. bool TemporaryAgent = 13; + + // Attach state for each path. + map PathAttachStates = 14; } //////////////////////////////////////////////////////////////////////////////// @@ -592,6 +636,17 @@ message TRegisterAgentResponse // List of device IDs to which Disk Agent should disable IO operations. repeated string DevicesToDisableIO = 2; + + // Monotonically increasing counter needed to order attach/detach + // path requests from one generation of DR tablet.
+ uint64 DiskAgentGeneration = 3; + + // Disk Registry Tablet generation needed to order attach/detach path + // requests sent from two different DR generations. + uint32 DiskRegistryGeneration = 4; + + // Devices that should be attached to the agent. + repeated string PathsToAttach = 5; } //////////////////////////////////////////////////////////////////////////////// @@ -1847,3 +1902,58 @@ message TGetClusterCapacityResponse // List of capacities. repeated TClusterCapacityInfo Capacity = 2; } + +//////////////////////////////////////////////////////////////////////////////// +// AttachPath + +message TAttachPathRequest +{ + // Optional request headers. + THeaders Headers = 1; + + // List of paths we want to attach. + repeated string PathsToAttach = 2; + + // Monotonically increasing counter needed to order attach/detach + // path requests from one generation of DR tablet. + uint64 DiskAgentGeneration = 3; + + // Disk Registry Tablet generation needed to order attach/detach path + // requests sent from two different DR generations. + uint32 DiskRegistryGeneration = 4; +} + +message TAttachPathResponse +{ + // Optional error, set only if error happened. + NCloud.NProto.TError Error = 1; + + // List of attached devices. + repeated TDeviceConfig AttachedDevices = 2; +} + +//////////////////////////////////////////////////////////////////////////////// +// DetachPath + +message TDetachPathRequest +{ + // Optional request headers. + THeaders Headers = 1; + + // List of paths we want to detach. + repeated string PathsToDetach = 2; + + // Monotonically increasing counter needed to order attach/detach + // path requests from one generation of DR tablet. + uint64 DiskAgentGeneration = 3; + + // Disk Registry Tablet generation needed to order attach/detach path + // requests sent from two different DR generations. + uint32 DiskRegistryGeneration = 4; +} + +message TDetachPathResponse +{ + // Optional error, set only if error happened. + NCloud.NProto.TError Error = 1; +}