diff --git a/docs/release-notes/release-notes-0.20.0.md b/docs/release-notes/release-notes-0.20.0.md
index 26e45114be..d9ff44480a 100644
--- a/docs/release-notes/release-notes-0.20.0.md
+++ b/docs/release-notes/release-notes-0.20.0.md
@@ -46,6 +46,13 @@
   sweeper where some outputs would not be resolved due to an error string
   mismatch.
 
+- [Fixed](https://github.com/lightningnetwork/lnd/pull/10277) a race condition
+  in the status manager where a channel would be marked as disabled even
+  though it was already active again in the switch. When a channel reconnects,
+  its enable timer may only fire after the disable timer has already fired,
+  which previously caused the channel to be unnecessarily announced as
+  disabled despite being active.
+
 # New Features
 
 * Use persisted [nodeannouncement](https://github.com/lightningnetwork/lnd/pull/8825)
diff --git a/netann/chan_status_manager.go b/netann/chan_status_manager.go
index feb3a5dd19..6fd2893f63 100644
--- a/netann/chan_status_manager.go
+++ b/netann/chan_status_manager.go
@@ -568,6 +568,24 @@ func (m *ChanStatusManager) disableInactiveChannels() {
 			continue
 		}
 
+		// Re-verify that the channel is still inactive before
+		// disabling it. A channel may become active again after
+		// being marked as pending-disabled but before the disable
+		// timeout expires. In that case the pending disable must
+		// be canceled instead of announced, otherwise an active
+		// channel would be disabled.
+		chanID := lnwire.NewChanIDFromOutPoint(outpoint)
+		if m.cfg.IsChannelActive(chanID) {
+			// Channel became active again, cancel the pending
+			// disable.
+			log.Debugf("Channel(%v) became active, canceling "+
+				"scheduled disable", outpoint)
+
+			m.chanStates.markEnabled(outpoint)
+
+			continue
+		}
+
 		log.Infof("Announcing channel(%v) disabled "+
 			"[detected]", outpoint)
 
diff --git a/netann/chan_status_manager_test.go b/netann/chan_status_manager_test.go
index 024b779322..3bdf109095 100644
--- a/netann/chan_status_manager_test.go
+++ b/netann/chan_status_manager_test.go
@@ -909,6 +909,58 @@ var stateMachineTests = []stateMachineTest{
 			)
 		},
 	},
+	{
+		name:         "channel reconnects before disable",
+		startActive:  true,
+		startEnabled: true,
+		fn: func(h testHarness) {
+			// This test exercises the race condition fix where a
+			// channel becomes inactive, gets marked as pending
+			// disabled, and then becomes active again before the
+			// disable timeout expires. The channel should NOT be
+			// disabled in this case.
+
+			// Step 1: Simulate a disconnection - the channel
+			// becomes inactive.
+			h.markInactive(h.graph.chans())
+
+			// Step 2: Wait for the channel to be marked as
+			// pending disabled. The manager's sample interval is
+			// 50ms, so we wait 50ms plus a 50ms buffer.
+			time.Sleep(50*time.Millisecond + 50*time.Millisecond)
+
+			// Step 3: Simulate a reconnection - the channel
+			// becomes active again.
+			//
+			// NOTE: This does not reflect the actual behavior of
+			// lnd, because as soon as the channel becomes active
+			// it will start an enable timer and send an enable
+			// update. However, we want to avoid testing these
+			// timings here. What matters is that the channel does
+			// not get disabled if it reconnects before the disable
+			// timeout expires.
+			h.markActive(h.graph.chans())
+
+			// Step 4: Wait for the disable timeout (1 second) to
+			// expire. The disable should not be sent because the
+			// channel is active again by the time the timeout
+			// fires.
+			time.Sleep(time.Second + 200*time.Millisecond)
+
+			// Step 5: Verify that NO disable update was sent. The
+			// channel should remain enabled because it became
+			// active again before the disable timeout expired and
+			// its status was re-checked before disabling.
+			h.assertNoUpdates(500 * time.Millisecond)
+
+			// Step 6: Verify that the channel is still enabled by
+			// requesting an enable and checking that no update is
+			// sent, which is only the case if the channel is
+			// already enabled.
+			h.assertEnables(h.graph.chans(), nil, false)
+			h.assertNoUpdates(500 * time.Millisecond)
+		},
+	},
 }
 
 // TestChanStatusManagerStateMachine tests the possible state transitions that
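
The pattern the patch relies on can be sketched in isolation: a pending disable is only announced if the channel is still inactive when the disable timeout fires, otherwise the pending disable is canceled. The sketch below is a minimal illustration only; the `manager` and `outPoint` types and their map fields are hypothetical stand-ins and do not mirror lnd's real `ChanStatusManager`, its state tracking, or the `IsChannelActive` callback signature.

```go
package main

import "fmt"

// outPoint is a hypothetical stand-in for a channel's funding outpoint.
type outPoint string

// manager is a hypothetical stand-in for the status manager.
type manager struct {
	// isActive stands in for the IsChannelActive callback.
	isActive map[outPoint]bool

	// pendingDisable holds channels whose disable timeout has expired.
	pendingDisable map[outPoint]bool
}

// disableInactiveChannels announces a disable only for channels that are
// still inactive, and cancels the pending disable for channels that became
// active again in the meantime.
func (m *manager) disableInactiveChannels() {
	for op := range m.pendingDisable {
		// Re-verify liveness at timeout expiry, mirroring the re-check
		// introduced by the patch.
		if m.isActive[op] {
			fmt.Printf("Channel(%v) became active, canceling scheduled disable\n", op)
			delete(m.pendingDisable, op)
			continue
		}

		fmt.Printf("Announcing channel(%v) disabled [detected]\n", op)
		delete(m.pendingDisable, op)
	}
}

func main() {
	m := &manager{
		isActive:       map[outPoint]bool{"chan-1": true, "chan-2": false},
		pendingDisable: map[outPoint]bool{"chan-1": true, "chan-2": true},
	}
	m.disableInactiveChannels()
}
```

One way to read the design choice: re-checking liveness when the timeout fires, rather than trying to cancel the timer at reconnect time, avoids the spurious disable without adding more timer bookkeeping to the reconnect path.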