-
Notifications
You must be signed in to change notification settings - Fork 926
Only call emptyData after RDB validation #2600
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: unstable
Are you sure you want to change the base?
Changes from all commits
c9897ea
f937c88
011ca4d
d81b62c
58b2ae2
1504749
5d7a369
e0f95e1
744e03b
cd70684
2d5a96c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -1986,8 +1986,8 @@ void replicationSendNewlineToPrimary(void) { | |||||||||||
/* Callback used by emptyData() while flushing away old data to load | ||||||||||||
* the new dataset received by the primary and by discardTempDb() | ||||||||||||
* after loading succeeded or failed. */ | ||||||||||||
void replicationEmptyDbCallback(hashtable *d) { | ||||||||||||
UNUSED(d); | ||||||||||||
void replicationEmptyDbCallback(hashtable *ht) { | ||||||||||||
UNUSED(ht); | ||||||||||||
if (server.repl_state == REPL_STATE_TRANSFER) replicationSendNewlineToPrimary(); | ||||||||||||
} | ||||||||||||
|
||||||||||||
|
@@ -2280,11 +2280,6 @@ int replicaLoadPrimaryRDBFromSocket(connection *conn, char *buf, char *eofmark, | |||||||||||
/* We will soon start loading the RDB from socket, the replication history is changed, | ||||||||||||
* we must discard the cached primary structure and force resync of sub-replicas. */ | ||||||||||||
replicationAttachToNewPrimary(); | ||||||||||||
|
||||||||||||
/* Even though we are on-empty-db and the database is empty, we still call emptyData. */ | ||||||||||||
serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Flushing old data"); | ||||||||||||
emptyData(-1, empty_db_flags, replicationEmptyDbCallback); | ||||||||||||
|
||||||||||||
ChiliPaneer marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||||
dbarray = server.db; | ||||||||||||
functions_lib_ctx = functionsLibCtxGetCurrent(); | ||||||||||||
} | ||||||||||||
|
@@ -2301,7 +2296,10 @@ int replicaLoadPrimaryRDBFromSocket(connection *conn, char *buf, char *eofmark, | |||||||||||
if (replicationSupportSkipRDBChecksum(conn, 1, *usemark)) rdb.flags |= RIO_FLAG_SKIP_RDB_CHECKSUM; | ||||||||||||
int loadingFailed = 0; | ||||||||||||
rdbLoadingCtx loadingCtx = {.dbarray = dbarray, .functions_lib_ctx = functions_lib_ctx}; | ||||||||||||
if (rdbLoadRioWithLoadingCtxScopedRdb(&rdb, RDBFLAGS_REPLICATION, rsi, &loadingCtx) != C_OK) { | ||||||||||||
/* If we aren't using the swapdb method, then we want to empty the data before loading the rdb */ | ||||||||||||
int empty_data_flag = server.repl_diskless_load != REPL_DISKLESS_LOAD_SWAPDB ? RDBFLAGS_EMPTY_DATA : RDBFLAGS_NONE; | ||||||||||||
int retval = rdbLoadRioWithLoadingCtxScopedRdb(&rdb, RDBFLAGS_REPLICATION | empty_data_flag, rsi, &loadingCtx); | ||||||||||||
Comment on lines
+2300
to
+2301
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does this logic look simpler if we use a flags variable and conditionally add the EMPTY_DATA flag to it? I think so...
Suggested change
|
||||||||||||
if (retval != RDB_OK) { | ||||||||||||
/* RDB loading failed. */ | ||||||||||||
serverLog(LL_WARNING, "Failed trying to load the PRIMARY synchronization DB " | ||||||||||||
"from socket, check server logs."); | ||||||||||||
|
@@ -2327,9 +2325,14 @@ int replicaLoadPrimaryRDBFromSocket(connection *conn, char *buf, char *eofmark, | |||||||||||
disklessLoadDiscardFunctionsLibCtx(temp_functions_lib_ctx); | ||||||||||||
serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Discarding temporary DB in background"); | ||||||||||||
} else { | ||||||||||||
/* Remove the half-loaded data in case we started with an empty replica. */ | ||||||||||||
serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Discarding the half-loaded data"); | ||||||||||||
emptyData(-1, empty_db_flags, replicationEmptyDbCallback); | ||||||||||||
/* If we received RDB_INCOMPATIBLE, the old data was preserved */ | ||||||||||||
if (retval == RDB_INCOMPATIBLE) { | ||||||||||||
serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: RDB version or signature incompatible, old data preserved"); | ||||||||||||
} else { | ||||||||||||
/* Remove the half-loaded data in case the load failed for other reasons. */ | ||||||||||||
serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Discarding the half-loaded data"); | ||||||||||||
emptyData(-1, empty_db_flags, replicationEmptyDbCallback); | ||||||||||||
} | ||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This bracket is mis-indented. |
||||||||||||
} | ||||||||||||
|
||||||||||||
/* Note that there's no point in restarting the AOF on SYNC | ||||||||||||
|
@@ -2410,14 +2413,12 @@ int replicaLoadPrimaryRDBFromDisk(rdbSaveInfo *rsi) { | |||||||||||
* we must discard the cached primary structure and force resync of sub-replicas. */ | ||||||||||||
replicationAttachToNewPrimary(); | ||||||||||||
|
||||||||||||
/* Empty the databases only after the RDB file is ok, that is, before the RDB file | ||||||||||||
* is actually loaded, in case we encounter an error and drop the replication stream | ||||||||||||
* and leave an empty database. */ | ||||||||||||
serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Flushing old data"); | ||||||||||||
emptyData(-1, empty_db_flags, replicationEmptyDbCallback); | ||||||||||||
|
||||||||||||
/* We pass RDBFLAGS_EMPTY_DATA to call emptyData() after validating rdb compatability | ||||||||||||
* and before loading the data from the RDB */ | ||||||||||||
serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Loading DB in memory"); | ||||||||||||
if (rdbLoad(server.rdb_filename, rsi, RDBFLAGS_REPLICATION) != RDB_OK) { | ||||||||||||
int retval = rdbLoad(server.rdb_filename, rsi, RDBFLAGS_REPLICATION | RDBFLAGS_EMPTY_DATA); | ||||||||||||
ChiliPaneer marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||||
|
||||||||||||
if (retval != RDB_OK) { | ||||||||||||
serverLog(LL_WARNING, "Failed trying to load the PRIMARY synchronization " | ||||||||||||
"DB from disk, check server logs."); | ||||||||||||
if (server.rdb_del_sync_files && allPersistenceDisabled()) { | ||||||||||||
|
@@ -2427,9 +2428,15 @@ int replicaLoadPrimaryRDBFromDisk(rdbSaveInfo *rsi) { | |||||||||||
bg_unlink(server.rdb_filename); | ||||||||||||
} | ||||||||||||
|
||||||||||||
/* If disk-based RDB loading fails, remove the half-loaded dataset. */ | ||||||||||||
serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Discarding the half-loaded data"); | ||||||||||||
emptyData(-1, empty_db_flags, replicationEmptyDbCallback); | ||||||||||||
/* If RDB failed compatibility check, we did not load the new data set or flush our old data. */ | ||||||||||||
if (retval == RDB_INCOMPATIBLE) { | ||||||||||||
serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Skipping flush, no new data was loaded."); | ||||||||||||
} else { | ||||||||||||
/* If disk-based RDB loading fails, remove the half-loaded dataset. */ | ||||||||||||
serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Discarding the half-loaded data"); | ||||||||||||
emptyData(-1, empty_db_flags, replicationEmptyDbCallback); | ||||||||||||
} | ||||||||||||
|
||||||||||||
|
||||||||||||
/* Note that there's no point in restarting the AOF on sync failure, | ||||||||||||
* it'll be restarted when sync succeeds or replica promoted. */ | ||||||||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You can look at rdb.tcl / replication.tcl to add integration tests
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Discussed above, but testing this is challenging with the TCL framework. I've reproducible demo of the fix, and there exist tests that cover the code paths already. If anyone has any ideas for covering an incompatible RDB load during a full sync, I'd be open to suggesitons.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Will look into adding a change to debug.c that allows us to hit this code path.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We hit this code path also when loading an RDB from a file? Then we can test this instead of a full sync. We can copy a file into the right place and then call DEBUG RELOAD NOSAVE to load it. If the load fails because of bad signature or future version, we shouldn't flush the databases. Is this right?
There are some RDB files here, for example one with the hyptetical future RDB version 987: https://github.com/valkey-io/valkey/tree/unstable/tests/assets