Skip to content

Commit d45ce3a

Browse files
authored
Merge pull request #5589 from cloudflare/milan/STOR-4521-repro
Don't schedule a move-later alarm if an interleaving commit setAlarm()
2 parents a080069 + 5ab21e0 commit d45ce3a

File tree

3 files changed

+47
-27
lines changed

3 files changed

+47
-27
lines changed

src/workerd/io/actor-sqlite-test.c++

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1045,7 +1045,9 @@ KJ_TEST("setting later alarm times does scheduling after db commit") {
10451045
auto gateWait2Ms = test.gate.wait();
10461046
KJ_ASSERT(!gateWait2Ms.poll(test.ws));
10471047

1048-
// Set alarm to 3ms. Expect 3ms db commit to start.
1048+
// Set alarm to 3ms. Expect 3ms db commit to start. The 2ms scheduleRun will never happen now
1049+
// that we've overwritten it while it was persisting to SQLite (before we send an update to the
1050+
// alarm manager).
10491051
test.setAlarm(threeMs);
10501052
auto commit3MsFulfiller = kj::mv(test.pollAndExpectCalls({"commit"})[0]);
10511053
test.pollAndExpectCalls({});
@@ -1054,11 +1056,10 @@ KJ_TEST("setting later alarm times does scheduling after db commit") {
10541056
auto gateWait3Ms = test.gate.wait();
10551057
KJ_ASSERT(!gateWait3Ms.poll(test.ws));
10561058

1057-
// Fulfill 2ms db commit. Expect 2ms alarm to be scheduled and 2ms gate to be unblocked.
1059+
// Expect 2ms gate to be unblocked once the commit finishes, but don't expect a scheduleRun(2ms).
10581060
KJ_ASSERT(!gateWait2Ms.poll(test.ws));
10591061
commit2MsFulfiller->fulfill();
10601062
KJ_ASSERT(gateWait2Ms.poll(test.ws));
1061-
auto fulfiller2Ms = kj::mv(test.pollAndExpectCalls({"scheduleRun(2ms)"})[0]);
10621063
test.pollAndExpectCalls({});
10631064

10641065
// Fulfill 3ms db commit. Expect 3ms alarm to be scheduled and 3ms gate to be unblocked.
@@ -1067,10 +1068,6 @@ KJ_TEST("setting later alarm times does scheduling after db commit") {
10671068
commit3MsFulfiller->fulfill();
10681069
KJ_ASSERT(gateWait3Ms.poll(test.ws));
10691070

1070-
// The 3ms scheduleRun waits for the 2ms scheduleRun to complete first, since we chain
1071-
// "move later" operations to ensure they execute in order at the alarm manager.
1072-
fulfiller2Ms->fulfill();
1073-
10741071
auto fulfiller3Ms = kj::mv(test.pollAndExpectCalls({"scheduleRun(3ms)"})[0]);
10751072
test.pollAndExpectCalls({});
10761073

src/workerd/io/actor-sqlite.c++

Lines changed: 38 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -433,6 +433,12 @@ kj::Promise<void> ActorSqlite::commitImpl(ActorSqlite::PrecommitAlarmState preco
433433
// pending commit so that the next commitImpl() invocation starts its own set of precommit alarm
434434
// updates and db commit.
435435
auto alarmStateForCommit = metadata.getAlarm();
436+
437+
// Capture the alarm version before going async to detect concurrent alarm changes. If the
438+
// alarmVersion changes while we are in-flight, we should skip attempting any move-later alarm
439+
// update.
440+
auto alarmVersionBeforeAsync = alarmVersion;
441+
436442
auto commitCallbackPromise = commitCallback();
437443
pendingCommit = kj::none;
438444

@@ -449,27 +455,36 @@ kj::Promise<void> ActorSqlite::commitImpl(ActorSqlite::PrecommitAlarmState preco
449455
// Notify any merged commitImpl() requests that the db persistence completed.
450456
fulfiller->fulfill();
451457

452-
// If the db state is now later than the in-flight scheduled alarms, issue a request to update
453-
// it to match the db state.
454-
if (willFireEarlier(alarmScheduledNoLaterThan, alarmStateForCommit)) {
455-
if (debugAlarmSync) {
456-
KJ_LOG(WARNING, "NOSENTRY DEBUG_ALARM: Moving alarm later", "sqlite_has",
457-
logDate(alarmStateForCommit), "alarmScheduledNoLaterThan",
458-
logDate(alarmScheduledNoLaterThan));
458+
// If another commit modified the alarm while we were async, skip post-commit alarm sync.
459+
//
460+
// We do this for a few reasons:
461+
// 1. The other commit will handle its own alarm sync
462+
// 2. Post-commit syncs are inherently optional (the alarm will self-correct)
463+
// 3. This coalesces redundant alarm updates for better performance
464+
// 4. This avoids race conditions where a later commit moved the alarm earlier, requiring a
465+
// pre-commit alarm update, and this update may have already been made before we get here.
466+
if (alarmVersion == alarmVersionBeforeAsync) {
467+
// No intervening alarm changes, it is safe to schedule a move-later alarm update if needed.
468+
if (willFireEarlier(alarmScheduledNoLaterThan, alarmStateForCommit)) {
469+
if (debugAlarmSync) {
470+
KJ_LOG(WARNING, "NOSENTRY DEBUG_ALARM: Moving alarm later", "sqlite_has",
471+
logDate(alarmStateForCommit), "alarmScheduledNoLaterThan",
472+
logDate(alarmScheduledNoLaterThan));
473+
}
474+
// We need to extend our alarmLaterChain now that we're adding a new "move-later" alarm task.
475+
//
476+
// Technically, we don't need serialize our "move-later" alarms since SQLite has the later
477+
// time committed locally. We could just set the `alarmLaterChain` and pass a `kj::READY_NOW`
478+
// to requestScheduledAlarm, and so if we have a partial failure we would just recover when
479+
// the alarm runs early. That said, it doesn't hurt to serialize on the client-side.
480+
alarmLaterChain = requestScheduledAlarm(alarmStateForCommit, alarmLaterChain.addBranch())
481+
.catch_([](kj::Exception&& e) {
482+
// If an exception occurs when scheduling the alarm later, it's OK -- the alarm will
483+
// eventually fire at the earlier time, and the rescheduling will be retried.
484+
// We catch here to prevent the chain from breaking on errors.
485+
LOG_WARNING_PERIODICALLY("NOSENTRY SQLite reschedule later alarm failed", e);
486+
}).fork();
459487
}
460-
// We need to extend our alarmLaterChain now that we're adding a new "move-later" alarm task.
461-
//
462-
// Technically, we don't need serialize our "move-later" alarms since SQLite has the later
463-
// time committed locally. We could just set the `alarmLaterChain` and pass a `kj::READY_NOW`
464-
// to requestScheduledAlarm, and so if we have a partial failure we would just recover when
465-
// the alarm runs early. That said, it doesn't hurt to serialize on the client-side.
466-
alarmLaterChain = requestScheduledAlarm(alarmStateForCommit, alarmLaterChain.addBranch())
467-
.catch_([](kj::Exception&& e) {
468-
// If an exception occurs when scheduling the alarm later, it's OK -- the alarm will
469-
// eventually fire at the earlier time, and the rescheduling will be retried.
470-
// We catch here to prevent the chain from breaking on errors.
471-
LOG_WARNING_PERIODICALLY("NOSENTRY SQLite reschedule later alarm failed", e);
472-
}).fork();
473488
}
474489
}
475490

@@ -642,6 +657,9 @@ kj::Maybe<kj::Promise<void>> ActorSqlite::setAlarm(
642657
kj::Maybe<kj::Date> newAlarmTime, WriteOptions options) {
643658
requireNotBroken();
644659

660+
// Increment version counter on any alarm modification
661+
++alarmVersion;
662+
645663
// TODO(someday): When deleting alarm data in an otherwise empty database, clear the database to
646664
// free up resources?
647665

src/workerd/io/actor-sqlite.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,11 @@ class ActorSqlite final: public ActorCacheInterface, private kj::TaskSet::ErrorH
239239
// at the alarm manager. Each update waits for the previous one to complete.
240240
kj::ForkedPromise<void> alarmLaterChain = kj::Promise<void>(kj::READY_NOW).fork();
241241

242+
// Version counter that increments on every alarm change. Used to detect if another commit
243+
// modified the alarm while we were async, allowing us to skip redundant post-commit alarm
244+
// syncs. This provides automatic coalescing of rapid alarm changes.
245+
uint64_t alarmVersion = 0;
246+
242247
// Debug flag for tracing alarm synchronization issues for specific namespaces
243248
bool debugAlarmSync = false;
244249

0 commit comments

Comments
 (0)