Skip to content

Commit 11bf7fd

Browse files
[NBS-6247]: Extend TDurableClient retry logic with optional whitelist
1 parent 453906b commit 11bf7fd

File tree

6 files changed

+684
-15
lines changed

6 files changed

+684
-15
lines changed

cloud/blockstore/config/client.proto

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,99 @@ package NCloud.NBlockStore.NProto;
1010

1111
option go_package = "github.com/ydb-platform/nbs/cloud/blockstore/config";
1212

13+
////////////////////////////////////////////////////////////////////////////////
14+
// Well-known error codes.
15+
// Warning: This enum is a mirror of
16+
// enum EWellKnownResultCodes from cloud/storage/core/libs/common/error.h
17+
//
18+
// Note: Since proto2 enum values are strictly limited to int32_t values,
19+
// which are [-2^31, 2^31-1], and error codes with SEVERITY_ERROR
20+
// have the highest bit on, such values are represented in the enum as
21+
// NAME = <signed representation>; // <actual unsigned value>,
22+
// and this style must be followed by any change within the enum
23+
24+
enum EWellKnownErrorCode
25+
{
26+
S_OK = 0x00000000;
27+
S_FALSE = 0x00000001;
28+
S_ALREADY = 0x00000002;
29+
30+
E_FAIL = -0x80000000; // 0x80000000
31+
E_ARGUMENT = -0x7FFFFFFF; // 0x80000001
32+
E_REJECTED = -0x7FFFFFFE; // 0x80000002
33+
34+
reserved -0x7FFFFFFD; // 0x80000003
35+
36+
E_INVALID_STATE = -0x7FFFFFFC; // 0x80000004
37+
E_TIMEOUT = -0x7FFFFFFB; // 0x80000005
38+
E_NOT_FOUND = -0x7FFFFFFA; // 0x80000006
39+
E_UNAUTHORIZED = -0x7FFFFFF9; // 0x80000007
40+
E_NOT_IMPLEMENTED = -0x7FFFFFF8; // 0x80000008
41+
E_ABORTED = -0x7FFFFFF7; // 0x80000009
42+
E_TRY_AGAIN = -0x7FFFFFF6; // 0x8000000A
43+
E_IO = -0x7FFFFFF5; // 0x8000000B
44+
E_CANCELLED = -0x7FFFFFF4; // 0x8000000C
45+
E_IO_SILENT = -0x7FFFFFF3; // 0x8000000D
46+
E_RETRY_TIMEOUT = -0x7FFFFFF2; // 0x8000000E
47+
E_PRECONDITION_FAILED = -0x7FFFFFF1; // 0x8000000F
48+
49+
E_GRPC_CANCELLED = -0x7FFDFFFF; // 0x80020001
50+
E_GRPC_UNKNOWN = -0x7FFDFFFE; // 0x80020002
51+
E_GRPC_INVALID_ARGUMENT = -0x7FFDFFFD; // 0x80020003
52+
E_GRPC_DEADLINE_EXCEEDED = -0x7FFDFFFC; // 0x80020004
53+
E_GRPC_NOT_FOUND = -0x7FFDFFFB; // 0x80020005
54+
E_GRPC_ALREADY_EXISTS = -0x7FFDFFFA; // 0x80020006
55+
E_GRPC_PERMISSION_DENIED = -0x7FFDFFF9; // 0x80020007
56+
E_GRPC_RESOURCE_EXHAUSTED = -0x7FFDFFF8; // 0x80020008
57+
E_GRPC_FAILED_PRECONDITION = -0x7FFDFFF7; // 0x80020009
58+
E_GRPC_ABORTED = -0x7FFDFFF6; // 0x8002000A
59+
E_GRPC_OUT_OF_RANGE = -0x7FFDFFF5; // 0x8002000B
60+
E_GRPC_UNIMPLEMENTED = -0x7FFDFFF4; // 0x8002000C
61+
E_GRPC_INTERNAL = -0x7FFDFFF3; // 0x8002000D
62+
E_GRPC_UNAVAILABLE = -0x7FFDFFF2; // 0x8002000E
63+
E_GRPC_DATA_LOSS = -0x7FFDFFF1; // 0x8002000F
64+
E_GRPC_UNAUTHENTICATED = -0x7FFDFFF0; // 0x80020010
65+
66+
E_BS_INVALID_SESSION = -0x7FFAFFFF; // 0x80050001
67+
E_BS_OUT_OF_SPACE = -0x7FFAFFFE; // 0x80050002
68+
E_BS_THROTTLED = -0x7FFAFFFD; // 0x80050003
69+
E_BS_RESOURCE_EXHAUSTED = -0x7FFAFFFC; // 0x80050004
70+
E_BS_DISK_ALLOCATION_FAILED = -0x7FFAFFFB; // 0x80050005
71+
E_BS_MOUNT_CONFLICT = -0x7FFAFFFA; // 0x80050006
72+
73+
E_FS_IO = -0x7FF90000; // 0x80070000
74+
E_FS_PERM = -0x7FF8FFFF; // 0x80070001
75+
E_FS_NOENT = -0x7FF8FFFE; // 0x80070002
76+
E_FS_NXIO = -0x7FF8FFFD; // 0x80070003
77+
E_FS_ACCESS = -0x7FF8FFFC; // 0x80070004
78+
E_FS_EXIST = -0x7FF8FFFB; // 0x80070005
79+
E_FS_XDEV = -0x7FF8FFFA; // 0x80070006
80+
E_FS_NODEV = -0x7FF8FFF9; // 0x80070007
81+
E_FS_NOTDIR = -0x7FF8FFF8; // 0x80070008
82+
E_FS_ISDIR = -0x7FF8FFF7; // 0x80070009
83+
E_FS_INVAL = -0x7FF8FFF6; // 0x8007000A
84+
E_FS_FBIG = -0x7FF8FFF5; // 0x8007000B
85+
E_FS_NOSPC = -0x7FF8FFF4; // 0x8007000C
86+
E_FS_ROFS = -0x7FF8FFF3; // 0x8007000D
87+
E_FS_MLINK = -0x7FF8FFF2; // 0x8007000E
88+
E_FS_NAMETOOLONG = -0x7FF8FFF1; // 0x8007000F
89+
E_FS_NOTEMPTY = -0x7FF8FFF0; // 0x80070010
90+
E_FS_DQUOT = -0x7FF8FFEF; // 0x80070011
91+
E_FS_STALE = -0x7FF8FFEE; // 0x80070012
92+
E_FS_REMOTE = -0x7FF8FFED; // 0x80070013
93+
E_FS_BADHANDLE = -0x7FF8FFEC; // 0x80070014
94+
E_FS_NOTSUPP = -0x7FF8FFEB; // 0x80070015
95+
96+
E_FS_WOULDBLOCK = -0x7FF8FFE2; // 0x8007001E
97+
E_FS_NOLCK = -0x7FF8FFE1; // 0x8007001F
98+
99+
E_FS_INVALID_SESSION = -0x7FF8FF9C; // 0x80070064
100+
E_FS_OUT_OF_SPACE = -0x7FF8FF9B; // 0x80070065
101+
E_FS_THROTTLED = -0x7FF8FF9A; // 0x80070066
102+
103+
E_RDMA_UNAVAILABLE = -0x7FF7FFFF; // 0x80080001
104+
}
105+
13106
////////////////////////////////////////////////////////////////////////////////
14107

15108
message TClientMediaKindThrottlingConfig
@@ -191,6 +284,28 @@ message TClientConfig
191284

192285
// First retry timeout for YDB based disks (in milliseconds).
193286
optional uint32 YDBBasedDiskInitialRetryTimeout = 44;
287+
288+
// Enable list-based retry rules in TDurableClient.
289+
// If disabled (default),
290+
// "EErrorKind::ErrorRetriable"-based predicate is used.
291+
// If enabled, TDurableClient will additionally retry all errors except:
292+
// For all media:
293+
// - cloud/blockstore/libs/client/durable.cpp::NeverRetriableErrors[]
294+
// For reliable media:
295+
// - NonRetriableErrorsForReliableMedia[]
296+
// For non-reliable media:
297+
// - NonRetriableErrorsForUnreliableMedia[].
298+
optional bool EnableListBasedRetryRules = 45;
299+
300+
// List of non-retriable errors that will not be retried
301+
// even if EnableListBasedRetryRules is set.
302+
// Applied for disks with IsReliableMediaKind == true.
303+
repeated EWellKnownErrorCode NonRetriableErrorsForReliableMedia = 46;
304+
305+
// List of non-retriable errors that will not be retried
306+
// even if EnableListBasedRetryRules is set.
307+
// Applied for disks with IsReliableMediaKind == false.
308+
repeated EWellKnownErrorCode NonRetriableErrorsForUnreliableMedia = 47;
194309
}
195310

196311
////////////////////////////////////////////////////////////////////////////////

cloud/blockstore/libs/client/config.cpp

Lines changed: 86 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,13 @@ TDuration MSeconds(ui64 x)
4444
xxx(GrpcReconnectBackoff, TDuration, MSeconds(100) )\
4545
xxx(DiskRegistryBasedDiskInitialRetryTimeout, TDuration, MSeconds(500) )\
4646
xxx(YDBBasedDiskInitialRetryTimeout, TDuration, MSeconds(500) )\
47+
xxx(EnableListBasedRetryRules, bool, false )\
48+
xxx(NonRetriableErrorsForReliableMedia, \
49+
TVector<EWellKnownResultCodes>, \
50+
{}) \
51+
xxx(NonRetriableErrorsForUnreliableMedia, \
52+
TVector<EWellKnownResultCodes>, \
53+
{}) \
4754
\
4855
xxx(MemoryQuotaBytes, ui32, 0 )\
4956
xxx(SecurePort, ui32, 0 )\
@@ -89,7 +96,7 @@ TTarget ConvertValue(TSource value)
8996
}
9097

9198
template <>
92-
TDuration ConvertValue<TDuration, ui32>(ui32 value)
99+
TDuration ConvertValue<TDuration, const ui32&>(const ui32& value)
93100
{
94101
return TDuration::MilliSeconds(value);
95102
}
@@ -102,9 +109,28 @@ ConvertValue<TRequestThresholds, TProtoRequestThresholds>(
102109
return ConvertRequestThresholds(value);
103110
}
104111

112+
template <>
113+
TVector<EWellKnownResultCodes> ConvertValue<
114+
TVector<EWellKnownResultCodes>,
115+
const google::protobuf::RepeatedField<int>&>(
116+
const google::protobuf::RepeatedField<int>& value)
117+
{
118+
TVector<EWellKnownResultCodes> v;
119+
for (const auto& x: value) {
120+
v.push_back(static_cast<EWellKnownResultCodes>(x));
121+
}
122+
return v;
123+
}
124+
125+
template <typename T>
126+
bool IsEmpty(const google::protobuf::RepeatedField<T>& value)
127+
{
128+
return value.empty();
129+
}
130+
105131
////////////////////////////////////////////////////////////////////////////////
106132

107-
IOutputStream& operator <<(IOutputStream& out, NProto::EClientIpcType ipcType)
133+
IOutputStream& operator <<(IOutputStream& out, const NProto::EClientIpcType& ipcType)
108134
{
109135
switch (ipcType) {
110136
case NProto::IPC_GRPC:
@@ -124,6 +150,22 @@ IOutputStream& operator <<(IOutputStream& out, NProto::EClientIpcType ipcType)
124150
}
125151
}
126152

153+
template <typename T>
154+
void DumpImpl(const T& t, IOutputStream& os)
155+
{
156+
os << t;
157+
}
158+
159+
void DumpImpl(const TVector<EWellKnownResultCodes>& value, IOutputStream& os)
160+
{
161+
for (size_t i = 0; i < value.size(); ++i) {
162+
if (i) {
163+
os << ",";
164+
}
165+
os << value[i];
166+
}
167+
}
168+
127169
} // namespace
128170

129171
////////////////////////////////////////////////////////////////////////////////
@@ -136,12 +178,42 @@ TClientAppConfig::TClientAppConfig(NProto::TClientAppConfig appConfig)
136178
, IamConfig(AppConfig.GetIamConfig())
137179
{}
138180

139-
#define BLOCKSTORE_CONFIG_GETTER(name, type, ...) \
140-
type TClientAppConfig::Get##name() const \
141-
{ \
142-
auto has = ClientConfig.Has##name(); \
143-
return has ? ConvertValue<type>(ClientConfig.Get##name()) : Default##name; \
144-
} \
181+
#define DECLARE_FIELD_CHECKER(name, type, ...) \
182+
template <typename TProtoConfig, typename = void> \
183+
struct Has##name##Method: std::false_type \
184+
{ \
185+
}; \
186+
\
187+
template <typename TProtoConfig> \
188+
struct Has##name##Method< \
189+
TProtoConfig, \
190+
std::void_t<decltype(std::declval<TProtoConfig>().Has##name())>> \
191+
: std::true_type \
192+
{ \
193+
}; \
194+
\
195+
template <typename TProtoConfig, typename TProtoValue> \
196+
bool IsEmpty##name(const TProtoConfig& config, const TProtoValue& value) \
197+
{ \
198+
if constexpr (Has##name##Method<TProtoConfig>::value) { \
199+
return !config.Has##name(); \
200+
} else { \
201+
return IsEmpty(value); \
202+
} \
203+
}
204+
205+
BLOCKSTORE_CLIENT_CONFIG(DECLARE_FIELD_CHECKER)
206+
207+
#undef DECLARE_FIELD_CHECKER
208+
209+
#define BLOCKSTORE_CONFIG_GETTER(name, type, ...) \
210+
type TClientAppConfig::Get##name() const \
211+
{ \
212+
const auto& value = ClientConfig.Get##name(); \
213+
return IsEmpty##name(ClientConfig, value) \
214+
? Default##name \
215+
: ConvertValue<type, decltype(value)>(value); \
216+
}
145217
// BLOCKSTORE_CONFIG_GETTER
146218

147219
BLOCKSTORE_CLIENT_CONFIG(BLOCKSTORE_CONFIG_GETTER)
@@ -154,9 +226,11 @@ TRequestThresholds TClientAppConfig::GetRequestThresholds() const {
154226

155227
void TClientAppConfig::Dump(IOutputStream& out) const
156228
{
157-
#define BLOCKSTORE_CONFIG_DUMP(name, ...) \
158-
out << #name << ": " << Get##name() << Endl; \
159-
// BLOCKSTORE_CONFIG_DUMP
229+
#define BLOCKSTORE_CONFIG_DUMP(name, ...) \
230+
out << #name << ": "; \
231+
DumpImpl(Get##name(), out); \
232+
out << Endl; \
233+
// BLOCKSTORE_CONFIG_DUMP
160234

161235
BLOCKSTORE_CLIENT_CONFIG(BLOCKSTORE_CONFIG_DUMP);
162236

@@ -168,7 +242,7 @@ void TClientAppConfig::DumpHtml(IOutputStream& out) const
168242
#define BLOCKSTORE_CONFIG_DUMP(name, ...) \
169243
TABLER() { \
170244
TABLED() { out << #name; } \
171-
TABLED() { out << Get##name(); } \
245+
TABLED() { DumpImpl(Get##name(), out); } \
172246
} \
173247
// BLOCKSTORE_CONFIG_DUMP
174248

cloud/blockstore/libs/client/config.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,15 @@
33
#include "public.h"
44

55
#include <cloud/blockstore/config/client.pb.h>
6-
76
#include <cloud/blockstore/libs/diagnostics/dumpable.h>
87

8+
#include <cloud/storage/core/libs/common/error.h>
99
#include <cloud/storage/core/libs/diagnostics/trace_reader.h>
1010
#include <cloud/storage/core/libs/iam/iface/config.h>
1111

1212
#include <util/datetime/base.h>
1313
#include <util/generic/string.h>
14+
#include <util/generic/vector.h>
1415
#include <util/stream/output.h>
1516

1617
namespace NCloud::NBlockStore::NClient {
@@ -60,6 +61,11 @@ class TClientAppConfig
6061
TDuration GetGrpcReconnectBackoff() const;
6162
TDuration GetDiskRegistryBasedDiskInitialRetryTimeout() const;
6263
TDuration GetYDBBasedDiskInitialRetryTimeout() const;
64+
bool GetEnableListBasedRetryRules() const;
65+
TVector<EWellKnownResultCodes>
66+
GetNonRetriableErrorsForReliableMedia() const;
67+
TVector<EWellKnownResultCodes>
68+
GetNonRetriableErrorsForUnreliableMedia() const;
6369
ui32 GetMemoryQuotaBytes() const;
6470
ui32 GetSecurePort() const;
6571
bool GetSkipCertVerification() const;

cloud/blockstore/libs/client/durable.cpp

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,27 @@ namespace {
2929

3030
////////////////////////////////////////////////////////////////////////////////
3131

32+
// List of errors that must never be retried and semantically have
33+
// no point in retrying
34+
// Note: changes in this list must be reflected in
35+
// cloud/blockstore/libs/client/durable_ut.cpp:
36+
// ShouldUseNonRetriableListWhenEnabledAndNotRetryNeverRetriable
37+
const TVector<EWellKnownResultCodes> NeverRetriableErrors = {
38+
E_BS_INVALID_SESSION, // This error must never be retried as
39+
// not passing it up may break
40+
// the remounting logic
41+
E_CANCELLED, // Request is canceled,
42+
// no point in retrying
43+
E_ARGUMENT, // Request is ill-formed,
44+
// no point in retrying
45+
E_IO_SILENT, // Unrecoverable error, that must be
46+
// passed up to the client
47+
E_IO, // Unrecoverable error, that must be
48+
// passed up to the client
49+
};
50+
51+
////////////////////////////////////////////////////////////////////////////////
52+
3253
ELogPriority GetDetailsLogPriority(const NProto::TError& error)
3354
{
3455
const auto kind = GetDiagnosticsErrorKind(error);
@@ -478,20 +499,30 @@ class TRetryPolicy final: public IRetryPolicy
478499
private:
479500
TClientAppConfigPtr Config;
480501
TDuration InitialRetryTimeout;
502+
bool MediaIsReliable;
481503

482504
public:
483-
TRetryPolicy(TClientAppConfigPtr config, TDuration initialRetryTimeout)
505+
TRetryPolicy(
506+
TClientAppConfigPtr config,
507+
TDuration initialRetryTimeout,
508+
bool mediaIsReliable)
484509
: Config(std::move(config))
485510
, InitialRetryTimeout(initialRetryTimeout)
511+
, MediaIsReliable(mediaIsReliable)
486512
{}
487513

488514
TRetrySpec ShouldRetry(
489515
TRetryState& state,
490516
const NProto::TError& error) override
491517
{
492518
TRetrySpec spec;
519+
493520
spec.IsRetriableError =
494521
GetErrorKind(error) == EErrorKind::ErrorRetriable;
522+
if (Config->GetEnableListBasedRetryRules()) {
523+
spec.IsRetriableError =
524+
spec.IsRetriableError || !IsInNonRetriableList(error);
525+
}
495526

496527
if (!spec.IsRetriableError ||
497528
TInstant::Now() - state.Started >= Config->GetRetryTimeout())
@@ -524,6 +555,18 @@ class TRetryPolicy final: public IRetryPolicy
524555
state.RetryTimeout = newRetryTimeout;
525556
return spec;
526557
}
558+
559+
private:
560+
bool IsInNonRetriableList(const NProto::TError& error) const
561+
{
562+
if (FindPtr(NeverRetriableErrors, error.GetCode())) {
563+
return true;
564+
}
565+
auto nonRetriableErrorsList =
566+
MediaIsReliable ? Config->GetNonRetriableErrorsForReliableMedia()
567+
: Config->GetNonRetriableErrorsForUnreliableMedia();
568+
return FindPtr(nonRetriableErrorsList, error.GetCode()) != nullptr;
569+
}
527570
};
528571

529572
} // namespace
@@ -546,7 +589,8 @@ IRetryPolicyPtr CreateRetryPolicy(
546589

547590
return std::make_shared<TRetryPolicy>(
548591
std::move(config),
549-
initialRetryTimeout);
592+
initialRetryTimeout,
593+
IsReliableMediaKind(mediaKind.value_or(NProto::STORAGE_MEDIA_DEFAULT)));
550594
}
551595

552596
IBlockStorePtr CreateDurableClient(

0 commit comments

Comments
 (0)