@@ -383,19 +383,51 @@ kj::Promise<void> ActorSqlite::requestScheduledAlarm(
383383 });
384384}
385385
386+ void ActorSqlite::scheduleLaterAlarm (kj::Maybe<kj::Date> newAlarmTime, SpanParent parentSpan) {
387+ if (alarmLaterIsInFlight) {
388+ // There's already a move-later request in-flight. Just store the desired time; the in-flight
389+ // request's completion handler will pick it up and start a new request. This overwrites any
390+ // previously pending time, which is fine -- only the latest value matters.
391+ pendingLaterAlarmTime = newAlarmTime;
392+ return ;
393+ }
394+
395+ alarmLaterIsInFlight = true ;
396+ alarmLaterInFlight = requestScheduledAlarm (newAlarmTime, alarmLaterInFlight.addBranch ())
397+ .attach (parentSpan.newChild (" actor_sqlite_alarm_sync" _kjc))
398+ .catch_ ([](kj::Exception&& e) {
399+ // If an exception occurs when scheduling the alarm later, it's OK -- the alarm will
400+ // eventually fire at the earlier time, and the rescheduling will be retried.
401+ // We catch here to prevent the chain from breaking on errors.
402+ LOG_WARNING_PERIODICALLY (" NOSENTRY SQLite reschedule later alarm failed" , e);
403+ }).fork ();
404+
405+ commitTasks.add (alarmLaterInFlight.addBranch ()
406+ .then ([this ]() {
407+ alarmLaterIsInFlight = false ;
408+ KJ_IF_SOME (nextTime, kj::mv (pendingLaterAlarmTime)) {
409+ scheduleLaterAlarm (nextTime, nullptr );
410+ }
411+ }).catch_ ([](kj::Exception&& e) {
412+ // Move-later alarm failures are non-fatal; catch here to prevent taskFailed() from
413+ // breaking the output gate.
414+ LOG_WARNING_PERIODICALLY (" NOSENTRY SQLite reschedule later alarm drain failed" , e);
415+ }));
416+ }
417+
386418ActorSqlite::PrecommitAlarmState ActorSqlite::startPrecommitAlarmScheduling () {
387419 PrecommitAlarmState state;
388420 if (pendingCommit == kj::none &&
389421 willFireEarlier (metadata.getAlarm (), alarmScheduledNoLaterThan)) {
390- // We must wait on the `alarmLaterChain` here, otherwise, if there is a pending "move later"
391- // alarm task and it fails, our "move earlier" alarm might interleave, succeed, and be followed
392- // by a retry of the "move later" alarm. This happens because "move later" alarms complete after
393- // we commit to local SQLite.
422+ // We must wait on the `alarmLaterInFlight` promise here, otherwise, if there is a pending
423+ // "move later" alarm task and it fails, our "move earlier" alarm might interleave, succeed,
424+ // and be followed by a retry of the "move later" alarm. This happens because "move later"
425+ // alarms complete after we commit to local SQLite.
394426 //
395- // By waiting on any pending "move later" alarm, we correctly serialize our `scheduleRun()`
427+ // By waiting on any in-flight "move later" alarm, we correctly serialize our `scheduleRun()`
396428 // calls to the alarm manager.
397429 state.schedulingPromise =
398- requestScheduledAlarm (metadata.getAlarm (), alarmLaterChain .addBranch ());
430+ requestScheduledAlarm (metadata.getAlarm (), alarmLaterInFlight .addBranch ());
399431 }
400432 return kj::mv (state);
401433}
@@ -466,17 +498,18 @@ kj::Promise<void> ActorSqlite::commitImpl(
466498 KJ_LOG (WARNING, " NOSENTRY DEBUG_ALARM: Move earlier loop iteration" , syncIterations,
467499 logDate (currentAlarmState), logDate (alarmScheduledNoLaterThan), alarmVersion);
468500 }
469- // Note that we do not pass alarmLaterChain here. We don't need to for the following reasons:
501+ // Note that we do not pass alarmLaterInFlight here. We don't need to for the following
502+ // reasons:
470503 //
471- // 1. We already waited for the chain in the precommitAlarmState promise above.
504+ // 1. We already waited for it in the precommitAlarmState promise above.
472505 // 2. We set the `pendingCommit` prior to yielding to the event loop earlier, so any subsequent
473506 // commits have to wait for us to fulfill the pendingCommit promise. In short, no one could
474- // have added another "move-later" alarm to the chain , not until we finish.
507+ // have started another "move-later" alarm, not until we finish.
475508 //
476- // While we *could* pass the alarmLaterChain promise (it wouldn't be incorrect), when calling
477- // addBranch() on a resolved ForkedPromise, the continuation would be evaluated on a future turn
478- // of the event loop. That means we're going to suspend, even if the promise is ready, which
479- // means we'd take a performance hit.
509+ // While we *could* pass the alarmLaterInFlight promise (it wouldn't be incorrect), when
510+ // calling addBranch() on a resolved ForkedPromise, the continuation would be evaluated on a
511+ // future turn of the event loop. That means we're going to suspend, even if the promise is
512+ // ready, which means we'd take a performance hit.
480513 co_await requestScheduledAlarm (metadata.getAlarm (), kj::READY_NOW);
481514 syncIterations++;
482515 }
@@ -535,20 +568,7 @@ kj::Promise<void> ActorSqlite::commitImpl(
535568 KJ_LOG (WARNING, " NOSENTRY DEBUG_ALARM: Moving alarm later" , " sqlite_has" ,
536569 logDate (alarmStateForCommit), logDate (alarmScheduledNoLaterThan), alarmVersion);
537570 }
538- // We need to extend our alarmLaterChain now that we're adding a new "move-later" alarm task.
539- //
540- // Technically, we don't need serialize our "move-later" alarms since SQLite has the later
541- // time committed locally. We could just set the `alarmLaterChain` and pass a `kj::READY_NOW`
542- // to requestScheduledAlarm, and so if we have a partial failure we would just recover when
543- // the alarm runs early. That said, it doesn't hurt to serialize on the client-side.
544- alarmLaterChain = requestScheduledAlarm (alarmStateForCommit, alarmLaterChain.addBranch ())
545- .attach (commitSpan.newChild (" actor_sqlite_alarm_sync" _kjc))
546- .catch_ ([](kj::Exception&& e) {
547- // If an exception occurs when scheduling the alarm later, it's OK -- the alarm will
548- // eventually fire at the earlier time, and the rescheduling will be retried.
549- // We catch here to prevent the chain from breaking on errors.
550- LOG_WARNING_PERIODICALLY (" NOSENTRY SQLite reschedule later alarm failed" , e);
551- }).fork ();
571+ scheduleLaterAlarm (alarmStateForCommit, SpanParent (commitSpan));
552572 }
553573 }
554574}
@@ -945,14 +965,20 @@ kj::OneOf<ActorSqlite::CancelAlarmHandler, ActorSqlite::RunAlarmHandler> ActorSq
945965 " NOSENTRY SQLite alarm handler canceled with requestScheduledAlarm." , scheduledTime,
946966 localAlarmState.orDefault (kj::UNIX_EPOCH), actorId);
947967
948- // Since we're requesting to move the alarm time to later, we need to add to our
949- // `alarmLaterChain`. Note that for the chain, we want to make sure any scheduling failure
950- // does not break us, but for the `CancelAlarmHandler`, we want the caller to receive the
951- // exception normally, so we do not consume the exception.
968+ // Since we're requesting to move the alarm time to later, we need to update the
969+ // alarmLaterInFlight promise. We issue a single requestScheduledAlarm call, fork it,
970+ // and use one branch for tracking (with error catching) and the other for the caller
971+ // (which propagates errors). Note that we directly update alarmLaterInFlight here
972+ // rather than using scheduleLaterAlarm(), because we need a separate un-caught branch
973+ // of the promise for the CancelAlarmHandler return value.
952974 auto schedulingPromise =
953- requestScheduledAlarm (localAlarmState, alarmLaterChain.addBranch ()).fork ();
954- alarmLaterChain = schedulingPromise.addBranch ()
955- .catch_ ([](kj::Exception&& e) {
975+ requestScheduledAlarm (localAlarmState, alarmLaterInFlight.addBranch ()).fork ();
976+ // Clear any stale pending time so that when the existing completion handler
977+ // fires it does not start a redundant scheduleLaterAlarm for the same time that
978+ // armAlarmHandler is already scheduling.
979+ pendingLaterAlarmTime = kj::none;
980+ alarmLaterInFlight = schedulingPromise.addBranch ()
981+ .catch_ ([](kj::Exception&& e) {
956982 // If an exception occurs when scheduling the alarm later, it's OK -- the alarm will
957983 // eventually fire at the earlier time, and the rescheduling will be retried.
958984 // We catch here to prevent the chain from breaking on errors.
@@ -978,7 +1004,7 @@ kj::OneOf<ActorSqlite::CancelAlarmHandler, ActorSqlite::RunAlarmHandler> ActorSq
9781004 // handler invocation.
9791005 //
9801006 // We pass kj::READY_NOW because being in this branch (SQLite is ahead of the alarm manager)
981- // means there's no recent move-later operation to wait for, so no need for alarmLaterChain .
1007+ // means there's no recent move-later operation to wait for, so no need for alarmLaterInFlight .
9821008 return CancelAlarmHandler{
9831009 .waitBeforeCancel = requestScheduledAlarm (localAlarmState, kj::READY_NOW)};
9841010 }
0 commit comments