Skip to content

Commit feec364

Browse files
[NBS-6247]: Extend TDurableClient retry logic with optional whitelist
1 parent b5b6972 commit feec364

File tree

5 files changed

+619
-19
lines changed

5 files changed

+619
-19
lines changed

cloud/blockstore/config/client.proto

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,91 @@ package NCloud.NBlockStore.NProto;
1010

1111
option go_package = "github.com/ydb-platform/nbs/cloud/blockstore/config";
1212

13+
////////////////////////////////////////////////////////////////////////////////
14+
// Well-known error codes.
15+
16+
enum EWellKnownErrorCode
17+
{
18+
S_OK = 0x00000000;
19+
S_FALSE = 0x00000001;
20+
S_ALREADY = 0x00000002;
21+
22+
E_FAIL = -0x80000000; // 0x80000000
23+
E_ARGUMENT = -0x7FFFFFFF; // 0x80000001
24+
E_REJECTED = -0x7FFFFFFE; // 0x80000002
25+
26+
reserved -0x7FFFFFFD; // 0x80000003
27+
28+
E_INVALID_STATE = -0x7FFFFFFC; // 0x80000004
29+
E_TIMEOUT = -0x7FFFFFFB; // 0x80000005
30+
E_NOT_FOUND = -0x7FFFFFFA; // 0x80000006
31+
E_UNAUTHORIZED = -0x7FFFFFF9; // 0x80000007
32+
E_NOT_IMPLEMENTED = -0x7FFFFFF8; // 0x80000008
33+
E_ABORTED = -0x7FFFFFF7; // 0x80000009
34+
E_TRY_AGAIN = -0x7FFFFFF6; // 0x8000000A
35+
E_IO = -0x7FFFFFF5; // 0x8000000B
36+
E_CANCELLED = -0x7FFFFFF4; // 0x8000000C
37+
E_IO_SILENT = -0x7FFFFFF3; // 0x8000000D
38+
E_RETRY_TIMEOUT = -0x7FFFFFF2; // 0x8000000E
39+
E_PRECONDITION_FAILED = -0x7FFFFFF1; // 0x8000000F
40+
41+
E_GRPC_CANCELLED = -0x7FFDFFFF; // 0x80020001
42+
E_GRPC_UNKNOWN = -0x7FFDFFFE; // 0x80020002
43+
E_GRPC_INVALID_ARGUMENT = -0x7FFDFFFD; // 0x80020003
44+
E_GRPC_DEADLINE_EXCEEDED = -0x7FFDFFFC; // 0x80020004
45+
E_GRPC_NOT_FOUND = -0x7FFDFFFB; // 0x80020005
46+
E_GRPC_ALREADY_EXISTS = -0x7FFDFFFA; // 0x80020006
47+
E_GRPC_PERMISSION_DENIED = -0x7FFDFFF9; // 0x80020007
48+
E_GRPC_RESOURCE_EXHAUSTED = -0x7FFDFFF8; // 0x80020008
49+
E_GRPC_FAILED_PRECONDITION = -0x7FFDFFF7; // 0x80020009
50+
E_GRPC_ABORTED = -0x7FFDFFF6; // 0x8002000A
51+
E_GRPC_OUT_OF_RANGE = -0x7FFDFFF5; // 0x8002000B
52+
E_GRPC_UNIMPLEMENTED = -0x7FFDFFF4; // 0x8002000C
53+
E_GRPC_INTERNAL = -0x7FFDFFF3; // 0x8002000D
54+
E_GRPC_UNAVAILABLE = -0x7FFDFFF2; // 0x8002000E
55+
E_GRPC_DATA_LOSS = -0x7FFDFFF1; // 0x8002000F
56+
E_GRPC_UNAUTHENTICATED = -0x7FFDFFF0; // 0x80020010
57+
58+
E_BS_INVALID_SESSION = -0x7FFAFFFF; // 0x80050001
59+
E_BS_OUT_OF_SPACE = -0x7FFAFFFE; // 0x80050002
60+
E_BS_THROTTLED = -0x7FFAFFFD; // 0x80050003
61+
E_BS_RESOURCE_EXHAUSTED = -0x7FFAFFFC; // 0x80050004
62+
E_BS_DISK_ALLOCATION_FAILED = -0x7FFAFFFB; // 0x80050005
63+
E_BS_MOUNT_CONFLICT = -0x7FFAFFFA; // 0x80050006
64+
65+
E_FS_IO = -0x7FF90000; // 0x80070000
66+
E_FS_PERM = -0x7FF8FFFF; // 0x80070001
67+
E_FS_NOENT = -0x7FF8FFFE; // 0x80070002
68+
E_FS_NXIO = -0x7FF8FFFD; // 0x80070003
69+
E_FS_ACCESS = -0x7FF8FFFC; // 0x80070004
70+
E_FS_EXIST = -0x7FF8FFFB; // 0x80070005
71+
E_FS_XDEV = -0x7FF8FFFA; // 0x80070006
72+
E_FS_NODEV = -0x7FF8FFF9; // 0x80070007
73+
E_FS_NOTDIR = -0x7FF8FFF8; // 0x80070008
74+
E_FS_ISDIR = -0x7FF8FFF7; // 0x80070009
75+
E_FS_INVAL = -0x7FF8FFF6; // 0x8007000A
76+
E_FS_FBIG = -0x7FF8FFF5; // 0x8007000B
77+
E_FS_NOSPC = -0x7FF8FFF4; // 0x8007000C
78+
E_FS_ROFS = -0x7FF8FFF3; // 0x8007000D
79+
E_FS_MLINK = -0x7FF8FFF2; // 0x8007000E
80+
E_FS_NAMETOOLONG = -0x7FF8FFF1; // 0x8007000F
81+
E_FS_NOTEMPTY = -0x7FF8FFF0; // 0x80070010
82+
E_FS_DQUOT = -0x7FF8FFEF; // 0x80070011
83+
E_FS_STALE = -0x7FF8FFEE; // 0x80070012
84+
E_FS_REMOTE = -0x7FF8FFED; // 0x80070013
85+
E_FS_BADHANDLE = -0x7FF8FFEC; // 0x80070014
86+
E_FS_NOTSUPP = -0x7FF8FFEB; // 0x80070015
87+
88+
E_FS_WOULDBLOCK = -0x7FF8FFE2; // 0x8007001E
89+
E_FS_NOLCK = -0x7FF8FFE1; // 0x8007001F
90+
91+
E_FS_INVALID_SESSION = -0x7FF8FF9C; // 0x80070064
92+
E_FS_OUT_OF_SPACE = -0x7FF8FF9B; // 0x80070065
93+
E_FS_THROTTLED = -0x7FF8FF9A; // 0x80070066
94+
95+
E_RDMA_UNAVAILABLE = -0x7FF7FFFF; // 0x80080001
96+
}
97+
1398
////////////////////////////////////////////////////////////////////////////////
1499

15100
message TClientMediaKindThrottlingConfig
@@ -191,6 +276,18 @@ message TClientConfig
191276

192277
// First retry timeout for YDB based disks (in milliseconds).
193278
optional uint32 YDBBasedDiskInitialRetryTimeout = 44;
279+
280+
// Optional whitelist of error codes that DurableClient passes up to the caller without retrying.
281+
// If empty, the whitelist mechanism is disabled and the default
282+
// "EErrorKind::ErrorRetriable"-based predicate is used.
283+
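// Applies to disks with IsReliableMediaKind == true. If non-empty, any error code not listed is treated as retriable (E_BS_INVALID_SESSION is never retried).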
284+
repeated EWellKnownErrorCode NonRetriableErrorsForReliableMedia = 45;
285+
286+
// Optional whitelist of error codes that DurableClient passes up to the caller without retrying.
287+
// If empty, the whitelist mechanism is disabled and the default
288+
// "EErrorKind::ErrorRetriable"-based predicate is used.
289+
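// Applies to disks with IsReliableMediaKind == false. If non-empty, any error code not listed is treated as retriable (E_BS_INVALID_SESSION is never retried).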
290+
repeated EWellKnownErrorCode NonRetriableErrorsForUnreliableMedia = 46;
194291
}
195292

196293
////////////////////////////////////////////////////////////////////////////////

cloud/blockstore/libs/client/config.cpp

Lines changed: 82 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,10 @@ TDuration MSeconds(ui64 x)
4343
xxx(ConnectionErrorMaxRetryTimeout, TDuration, MSeconds(100) )\
4444
xxx(GrpcReconnectBackoff, TDuration, MSeconds(100) )\
4545
xxx(DiskRegistryBasedDiskInitialRetryTimeout, TDuration, MSeconds(500) )\
46-
xxx(YDBBasedDiskInitialRetryTimeout, TDuration, MSeconds(500) )\
47-
\
46+
xxx(YDBBasedDiskInitialRetryTimeout, TDuration, MSeconds(500) ) \
47+
xxx(NonRetriableErrorsForReliableMedia, TVector<ui32>, {}) \
48+
xxx(NonRetriableErrorsForUnreliableMedia, TVector<ui32>, {}) \
49+
\
4850
xxx(MemoryQuotaBytes, ui32, 0 )\
4951
xxx(SecurePort, ui32, 0 )\
5052
xxx(RootCertsFile, TString, {} )\
@@ -89,7 +91,7 @@ TTarget ConvertValue(TSource value)
8991
}
9092

9193
template <>
92-
TDuration ConvertValue<TDuration, ui32>(ui32 value)
94+
TDuration ConvertValue<TDuration, const ui32&>(const ui32& value)
9395
{
9496
return TDuration::MilliSeconds(value);
9597
}
@@ -102,9 +104,27 @@ ConvertValue<TRequestThresholds, TProtoRequestThresholds>(
102104
return ConvertRequestThresholds(value);
103105
}
104106

107+
template <>
108+
TVector<ui32>
109+
ConvertValue<TVector<ui32>, const google::protobuf::RepeatedField<int>&>(
110+
const google::protobuf::RepeatedField<int>& value)
111+
{
112+
TVector<ui32> v;
113+
for (const auto& x: value) {
114+
v.push_back(static_cast<ui32>(x));
115+
}
116+
return v;
117+
}
118+
119+
template <typename T>
120+
bool IsEmpty(const google::protobuf::RepeatedField<T>& value)
121+
{
122+
return value.empty();
123+
}
124+
105125
////////////////////////////////////////////////////////////////////////////////
106126

107-
IOutputStream& operator <<(IOutputStream& out, NProto::EClientIpcType ipcType)
127+
IOutputStream& operator <<(IOutputStream& out, const NProto::EClientIpcType& ipcType)
108128
{
109129
switch (ipcType) {
110130
case NProto::IPC_GRPC:
@@ -124,6 +144,22 @@ IOutputStream& operator <<(IOutputStream& out, NProto::EClientIpcType ipcType)
124144
}
125145
}
126146

147+
template <typename T>
148+
void DumpImpl(const T& t, IOutputStream& os)
149+
{
150+
os << t;
151+
}
152+
153+
void DumpImpl(const TVector<ui32>& value, IOutputStream& os)
154+
{
155+
for (size_t i = 0; i < value.size(); ++i) {
156+
if (i) {
157+
os << ",";
158+
}
159+
os << value[i];
160+
}
161+
}
162+
127163
} // namespace
128164

129165
////////////////////////////////////////////////////////////////////////////////
@@ -136,12 +172,42 @@ TClientAppConfig::TClientAppConfig(NProto::TClientAppConfig appConfig)
136172
, IamConfig(AppConfig.GetIamConfig())
137173
{}
138174

139-
#define BLOCKSTORE_CONFIG_GETTER(name, type, ...) \
140-
type TClientAppConfig::Get##name() const \
141-
{ \
142-
auto has = ClientConfig.Has##name(); \
143-
return has ? ConvertValue<type>(ClientConfig.Get##name()) : Default##name; \
144-
} \
175+
#define DECLARE_FIELD_CHECKER(name, type, ...) \
176+
template <typename TProtoConfig, typename = void> \
177+
struct Has##name##Method: std::false_type \
178+
{ \
179+
}; \
180+
\
181+
template <typename TProtoConfig> \
182+
struct Has##name##Method< \
183+
TProtoConfig, \
184+
std::void_t<decltype(std::declval<TProtoConfig>().Has##name())>> \
185+
: std::true_type \
186+
{ \
187+
}; \
188+
\
189+
template <typename TProtoConfig, typename TProtoValue> \
190+
bool IsEmpty##name(const TProtoConfig& config, const TProtoValue& value) \
191+
{ \
192+
if constexpr (Has##name##Method<TProtoConfig>::value) { \
193+
return !config.Has##name(); \
194+
} else { \
195+
return IsEmpty(value); \
196+
} \
197+
}
198+
199+
BLOCKSTORE_CLIENT_CONFIG(DECLARE_FIELD_CHECKER)
200+
201+
#undef DECLARE_FIELD_CHECKER
202+
203+
#define BLOCKSTORE_CONFIG_GETTER(name, type, ...) \
204+
type TClientAppConfig::Get##name() const \
205+
{ \
206+
const auto& value = ClientConfig.Get##name(); \
207+
return IsEmpty##name(ClientConfig, value) \
208+
? Default##name \
209+
: ConvertValue<type, decltype(value)>(value); \
210+
}
145211
// BLOCKSTORE_CONFIG_GETTER
146212

147213
BLOCKSTORE_CLIENT_CONFIG(BLOCKSTORE_CONFIG_GETTER)
@@ -154,9 +220,11 @@ TRequestThresholds TClientAppConfig::GetRequestThresholds() const {
154220

155221
void TClientAppConfig::Dump(IOutputStream& out) const
156222
{
157-
#define BLOCKSTORE_CONFIG_DUMP(name, ...) \
158-
out << #name << ": " << Get##name() << Endl; \
159-
// BLOCKSTORE_CONFIG_DUMP
223+
#define BLOCKSTORE_CONFIG_DUMP(name, ...) \
224+
out << #name << ": "; \
225+
DumpImpl(Get##name(), out); \
226+
out << Endl; \
227+
// BLOCKSTORE_CONFIG_DUMP
160228

161229
BLOCKSTORE_CLIENT_CONFIG(BLOCKSTORE_CONFIG_DUMP);
162230

@@ -168,7 +236,7 @@ void TClientAppConfig::DumpHtml(IOutputStream& out) const
168236
#define BLOCKSTORE_CONFIG_DUMP(name, ...) \
169237
TABLER() { \
170238
TABLED() { out << #name; } \
171-
TABLED() { out << Get##name(); } \
239+
TABLED() { DumpImpl(Get##name(), out); } \
172240
} \
173241
// BLOCKSTORE_CONFIG_DUMP
174242

cloud/blockstore/libs/client/config.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,14 @@
33
#include "public.h"
44

55
#include <cloud/blockstore/config/client.pb.h>
6-
76
#include <cloud/blockstore/libs/diagnostics/dumpable.h>
87

98
#include <cloud/storage/core/libs/diagnostics/trace_reader.h>
109
#include <cloud/storage/core/libs/iam/iface/config.h>
1110

1211
#include <util/datetime/base.h>
1312
#include <util/generic/string.h>
13+
#include <util/generic/vector.h>
1414
#include <util/stream/output.h>
1515

1616
namespace NCloud::NBlockStore::NClient {
@@ -60,6 +60,8 @@ class TClientAppConfig
6060
TDuration GetGrpcReconnectBackoff() const;
6161
TDuration GetDiskRegistryBasedDiskInitialRetryTimeout() const;
6262
TDuration GetYDBBasedDiskInitialRetryTimeout() const;
63+
TVector<ui32> GetNonRetriableErrorsForReliableMedia() const;
64+
TVector<ui32> GetNonRetriableErrorsForUnreliableMedia() const;
6365
ui32 GetMemoryQuotaBytes() const;
6466
ui32 GetSecurePort() const;
6567
bool GetSkipCertVerification() const;

cloud/blockstore/libs/client/durable.cpp

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -478,20 +478,42 @@ class TRetryPolicy final: public IRetryPolicy
478478
private:
479479
TClientAppConfigPtr Config;
480480
TDuration InitialRetryTimeout;
481+
bool MediaIsReliable;
482+
TVector<ui32> NeverRetriableWithErrorListPolicy;
481483

482484
public:
483-
TRetryPolicy(TClientAppConfigPtr config, TDuration initialRetryTimeout)
485+
TRetryPolicy(
486+
TClientAppConfigPtr config,
487+
TDuration initialRetryTimeout,
488+
bool mediaIsReliable)
484489
: Config(std::move(config))
485490
, InitialRetryTimeout(initialRetryTimeout)
491+
, MediaIsReliable(mediaIsReliable)
492+
, NeverRetriableWithErrorListPolicy({E_BS_INVALID_SESSION})
486493
{}
487494

488495
TRetrySpec ShouldRetry(
489496
TRetryState& state,
490497
const NProto::TError& error) override
491498
{
492499
TRetrySpec spec;
493-
spec.IsRetriableError =
494-
GetErrorKind(error) == EErrorKind::ErrorRetriable;
500+
501+
auto nonRetriableErrorsList =
502+
MediaIsReliable ? Config->GetNonRetriableErrorsForReliableMedia()
503+
: Config->GetNonRetriableErrorsForUnreliableMedia();
504+
505+
if (nonRetriableErrorsList.empty()) {
506+
spec.IsRetriableError =
507+
GetErrorKind(error) == EErrorKind::ErrorRetriable;
508+
} else {
509+
spec.IsRetriableError =
510+
std::ranges::find(
511+
NeverRetriableWithErrorListPolicy,
512+
error.GetCode()) ==
513+
NeverRetriableWithErrorListPolicy.end() &&
514+
std::ranges::find(nonRetriableErrorsList, error.GetCode()) ==
515+
nonRetriableErrorsList.end();
516+
}
495517

496518
if (!spec.IsRetriableError ||
497519
TInstant::Now() - state.Started >= Config->GetRetryTimeout())
@@ -546,7 +568,8 @@ IRetryPolicyPtr CreateRetryPolicy(
546568

547569
return std::make_shared<TRetryPolicy>(
548570
std::move(config),
549-
initialRetryTimeout);
571+
initialRetryTimeout,
572+
IsReliableMediaKind(mediaKind.value_or(NProto::STORAGE_MEDIA_DEFAULT)));
550573
}
551574

552575
IBlockStorePtr CreateDurableClient(

0 commit comments

Comments
 (0)