Skip to content

Commit 75d9745

Browse files
authored
Chaos for DiskAgent (#4511)
The disk agent has learned to respond to some requests with errors and, at the same time, corrupt the written data.
1 parent 86e8e6e commit 75d9745

File tree

15 files changed

+766
-26
lines changed

15 files changed

+766
-26
lines changed

cloud/blockstore/config/disk.proto

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,15 @@ enum EDiskAgentBackendType
3131
DISK_AGENT_BACKEND_IO_URING_NULL = 4;
3232
}
3333

34+
////////////////////////////////////////////////////////////////////////////////
35+
// The policy of critical events reporting when chaos generates an error.
36+
37+
enum EChaosCritEventReportingPolicy
38+
{
39+
CCERP_FIRST_ERROR = 0;
40+
CCERP_ALL_ERRORS = 1;
41+
}
42+
3443
////////////////////////////////////////////////////////////////////////////////
3544

3645
message TMemoryDeviceArgs
@@ -168,6 +177,26 @@ message TDiskAgentThrottlingConfig
168177

169178
////////////////////////////////////////////////////////////////////////////////
170179

180+
message TChaosConfig
181+
{
182+
// The policy of critical events reporting when chaos generates an error.
183+
optional EChaosCritEventReportingPolicy CritEventReportingPolicy = 1;
184+
185+
// The probability of getting an error on the request start and finish.
186+
optional double ChaosProbability = 2;
187+
188+
// Error codes that the chaos will generate. When empty E_REJECTED is used.
189+
repeated uint32 ErrorCodes = 3;
190+
191+
// The probability of data corruption when responding with an error.
192+
optional double DataDamageProbability = 4;
193+
194+
// The probability of a response immediately upon receipt of the request.
195+
optional double ImmediateReplyProbability = 5;
196+
}
197+
198+
////////////////////////////////////////////////////////////////////////////////
199+
171200
message TDiskAgentConfig
172201
{
173202
optional bool Enabled = 1;
@@ -319,6 +348,9 @@ message TDiskAgentConfig
319348
// Enables data integrity validation for the DR-based disks.
320349
// See "EnableDataIntegrityClient" in server.proto for more details.
321350
optional bool EnableDataIntegrityValidationForDrBasedDisks = 44;
351+
352+
// DANGER! It is used to generate errors during chaos testing.
353+
optional TChaosConfig ChaosConfig = 45;
322354
}
323355

324356
////////////////////////////////////////////////////////////////////////////////

cloud/blockstore/libs/diagnostics/critical_events.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ using TCritEventParams = TVector<std::pair<TStringBuf, TValue>>;
8787
xxx(DiskAgentSessionCacheRestoreError) \
8888
xxx(DiskAgentSessionCacheUpdateError) \
8989
xxx(UnexpectedIdentifierRepetition) \
90+
xxx(ChaosGeneratedError) \
9091
// BLOCKSTORE_DISK_AGENT_CRITICAL_EVENTS
9192

9293
#define BLOCKSTORE_IMPOSSIBLE_EVENTS(xxx) \

cloud/blockstore/libs/service/request.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,21 @@ const TString& GetBlockStoreRequestName(EBlockStoreRequest request)
2626
return Unknown;
2727
}
2828

29+
////////////////////////////////////////////////////////////////////////////////
30+
31+
#define BLOCKSTORE_DECLARE_METHOD(name, ...) \
32+
template <> \
33+
TString GetBlockStoreRequestName<NProto::T##name##Request>() \
34+
{ \
35+
return #name; \
36+
}
37+
38+
BLOCKSTORE_SERVICE(BLOCKSTORE_DECLARE_METHOD)
39+
40+
#undef BLOCKSTORE_DECLARE_METHOD
41+
42+
////////////////////////////////////////////////////////////////////////////////
43+
2944
TString GetSysRequestName(ESysRequestType requestType)
3045
{
3146
return ToString(requestType);

cloud/blockstore/libs/service/request.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,9 @@ constexpr size_t BlockStoreRequestsCount = (size_t)EBlockStoreRequest::MAX;
174174

175175
const TString& GetBlockStoreRequestName(EBlockStoreRequest requestType);
176176

177+
template<typename TRequest>
178+
TString GetBlockStoreRequestName();
179+
177180
enum class ESysRequestType
178181
{
179182
Compaction = 10000,

cloud/blockstore/libs/storage/disk_agent/disk_agent_actor_io.cpp

Lines changed: 14 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,14 @@ void TDiskAgentActor::PerformIO(
177177
auto* actorSystem = ctx.ActorSystem();
178178
auto replyFrom = ctx.SelfID;
179179

180-
auto replySuccess = [=] (auto result) {
180+
auto reply = [actorSystem,
181+
replyFrom,
182+
requestInfo,
183+
range,
184+
volumeRequestId,
185+
deviceUUID,
186+
started](auto result)
187+
{
181188
Reply<TMethod>(
182189
*actorSystem,
183190
replyFrom,
@@ -189,18 +196,6 @@ void TDiskAgentActor::PerformIO(
189196
started);
190197
};
191198

192-
auto replyError = [=] (auto code, auto message) {
193-
Reply<TMethod>(
194-
*actorSystem,
195-
replyFrom,
196-
*requestInfo,
197-
MakeError(code, std::move(message)),
198-
range,
199-
volumeRequestId,
200-
deviceUUID,
201-
started);
202-
};
203-
204199
LOG_TRACE(ctx, TBlockStoreComponents::DISK_AGENT,
205200
"%s [%s / %s]",
206201
TMethod::Name,
@@ -221,7 +216,7 @@ void TDiskAgentActor::PerformIO(
221216
{"isWrite", IsWriteDeviceMethod<TMethod>},
222217
{"isRdma", 0}});
223218
}
224-
replyError(E_REJECTED, "Secure erase in progress");
219+
reply(MakeError(E_REJECTED, "Secure erase in progress"));
225220
return;
226221
}
227222

@@ -232,9 +227,10 @@ void TDiskAgentActor::PerformIO(
232227
std::invoke(operation, *State, ctx.Now(), std::move(record));
233228

234229
result.Subscribe(
235-
[=] (auto future) {
230+
[reply, deviceUUID, clientId, actorSystem](auto future)
231+
{
236232
try {
237-
replySuccess(future.ExtractValue());
233+
reply(future.ExtractValue());
238234
} catch (...) {
239235
auto [code, message] = HandleException(
240236
*actorSystem,
@@ -243,7 +239,7 @@ void TDiskAgentActor::PerformIO(
243239
deviceUUID,
244240
clientId);
245241

246-
replyError(code, message);
242+
reply(MakeError(code, message));
247243
}
248244
});
249245
} catch (...) {
@@ -254,7 +250,7 @@ void TDiskAgentActor::PerformIO(
254250
deviceUUID,
255251
clientId);
256252

257-
replyError(code, message);
253+
reply(MakeError(code, message));
258254
}
259255
}
260256

cloud/blockstore/libs/storage/disk_agent/model/bootstrap.cpp

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#include "bootstrap.h"
22

3+
#include "chaos_storage_provider.h"
34
#include "config.h"
45

56
#include <cloud/blockstore/libs/nvme/nvme.h>
@@ -108,11 +109,14 @@ IStorageProviderPtr CreateStorageProvider(
108109
NServer::IFileIOServiceProviderPtr provider,
109110
NNvme::INvmeManagerPtr nvmeManager)
110111
{
112+
IStorageProviderPtr result;
113+
111114
switch (config.GetBackend()) {
112-
case NProto::DISK_AGENT_BACKEND_SPDK:
115+
case NProto::DISK_AGENT_BACKEND_SPDK: {
113116
break;
114-
case NProto::DISK_AGENT_BACKEND_AIO:
115-
return CreateLocalStorageProvider(
117+
}
118+
case NProto::DISK_AGENT_BACKEND_AIO: {
119+
result = CreateLocalStorageProvider(
116120
std::move(provider),
117121
std::move(nvmeManager),
118122
{
@@ -122,11 +126,15 @@ IStorageProviderPtr CreateStorageProvider(
122126
.EnableDataIntegrityValidation =
123127
config.GetEnableDataIntegrityValidationForDrBasedDisks(),
124128
});
125-
case NProto::DISK_AGENT_BACKEND_NULL:
126-
return NServer::CreateNullStorageProvider();
129+
break;
130+
}
131+
case NProto::DISK_AGENT_BACKEND_NULL: {
132+
result = NServer::CreateNullStorageProvider();
133+
break;
134+
}
127135
case NProto::DISK_AGENT_BACKEND_IO_URING:
128-
case NProto::DISK_AGENT_BACKEND_IO_URING_NULL:
129-
return CreateLocalStorageProvider(
136+
case NProto::DISK_AGENT_BACKEND_IO_URING_NULL: {
137+
result = CreateLocalStorageProvider(
130138
std::move(provider),
131139
std::move(nvmeManager),
132140
{
@@ -137,9 +145,17 @@ IStorageProviderPtr CreateStorageProvider(
137145
.EnableDataIntegrityValidation =
138146
config.GetEnableDataIntegrityValidationForDrBasedDisks(),
139147
});
148+
break;
149+
}
140150
}
141151

142-
return nullptr;
152+
if (result && config.HasChaosConfig()) {
153+
result = NServer::CreateChaosStorageProvider(
154+
std::move(result),
155+
config.GetChaosConfig());
156+
}
157+
158+
return result;
143159
}
144160

145161
NNvme::INvmeManagerPtr CreateNvmeManager(const TDiskAgentConfig& config)

0 commit comments

Comments
 (0)