From ede51542f3cf8aab8fc075ae9188d1a12be039cf Mon Sep 17 00:00:00 2001
From: Wilmer Paulino
Date: Tue, 8 Apr 2025 14:04:30 -0700
Subject: [PATCH] Avoid disconnect on message timeout while waiting on monitor/signer

When sending/receiving `commitment_signed`/`revoke_and_ack`, we may
expect the counterparty to follow up with one of their own in response.
In some cases, they are not allowed to send it because they are
actually waiting for one from us. Async monitor updates and signing
requests may result in the message we need to send to the counterparty
being delayed, and our disconnect logic previously did not consider
that. It doesn't make sense to disconnect our counterparty when we're
the ones seemingly blocking progress.

This commit ensures we no longer disconnect when we're waiting on an
async monitor update or signing request, unless we're negotiating
quiescence.

Note that our counterparty may still enforce similar disconnect logic
on their side, since they have no insight into why we're unable to make
progress; this commit at least helps prevent reconnection cycles with
counterparties that don't enforce such a timeout.
---
 lightning/src/ln/async_signer_tests.rs | 128 +++++++++++++++++++++++++
 lightning/src/ln/channel.rs            |  44 ++++++---
 lightning/src/ln/quiescence_tests.rs   |  43 +++------
 3 files changed, 174 insertions(+), 41 deletions(-)

diff --git a/lightning/src/ln/async_signer_tests.rs b/lightning/src/ln/async_signer_tests.rs
index 9cd319c044c..1f6f508d837 100644
--- a/lightning/src/ln/async_signer_tests.rs
+++ b/lightning/src/ln/async_signer_tests.rs
@@ -21,6 +21,7 @@ use crate::chain::ChannelMonitorUpdateStatus;
 use crate::events::bump_transaction::WalletSource;
 use crate::events::{ClosureReason, Event};
 use crate::ln::chan_utils::ClosingTransaction;
+use crate::ln::channel::DISCONNECT_PEER_AWAITING_RESPONSE_TICKS;
 use crate::ln::channel_state::{ChannelDetails, ChannelShutdownState};
 use crate::ln::channelmanager::{PaymentId, RAACommitmentOrder, RecipientOnionFields};
 use crate::ln::msgs::{BaseMessageHandler, ChannelMessageHandler, MessageSendEvent};
@@ -1091,3 +1092,130 @@ fn do_test_closing_signed(extra_closing_signed: bool, reconnect: bool) {
 	check_closed_event!(nodes[0], 1, ClosureReason::LocallyInitiatedCooperativeClosure, [nodes[1].node.get_our_node_id()], 100000);
 	check_closed_event!(nodes[1], 1, ClosureReason::CounterpartyInitiatedCooperativeClosure, [nodes[0].node.get_our_node_id()], 100000);
 }
+
+#[test]
+fn test_no_disconnect_while_async_revoke_and_ack_expecting_remote_commitment_signed() {
+	// Nodes with async signers may be expecting to receive a `commitment_signed` from the
+	// counterparty even if a `revoke_and_ack` has yet to be sent due to an async signer. Test that
+	// we don't disconnect the async signer node due to not receiving the `commitment_signed` within
+	// the timeout while the `revoke_and_ack` is not ready.
+	let chanmon_cfgs = create_chanmon_cfgs(2);
+	let node_cfgs = create_node_cfgs(2, &chanmon_cfgs);
+	let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]);
+	let nodes = create_network(2, &node_cfgs, &node_chanmgrs);
+	let chan_id = create_announced_chan_between_nodes(&nodes, 0, 1).2;
+
+	let node_id_0 = nodes[0].node.get_our_node_id();
+	let node_id_1 = nodes[1].node.get_our_node_id();
+
+	let payment_amount = 1_000_000;
+	send_payment(&nodes[0], &[&nodes[1]], payment_amount * 4);
+
+	nodes[1].disable_channel_signer_op(&node_id_0, &chan_id, SignerOp::ReleaseCommitmentSecret);
+
+	// We'll send a payment from both nodes to each other.
+ let (route1, payment_hash1, _, payment_secret1) = + get_route_and_payment_hash!(&nodes[0], &nodes[1], payment_amount); + let onion1 = RecipientOnionFields::secret_only(payment_secret1); + let payment_id1 = PaymentId(payment_hash1.0); + nodes[0].node.send_payment_with_route(route1, payment_hash1, onion1, payment_id1).unwrap(); + check_added_monitors(&nodes[0], 1); + + let (route2, payment_hash2, _, payment_secret2) = + get_route_and_payment_hash!(&nodes[1], &nodes[0], payment_amount); + let onion2 = RecipientOnionFields::secret_only(payment_secret2); + let payment_id2 = PaymentId(payment_hash2.0); + nodes[1].node.send_payment_with_route(route2, payment_hash2, onion2, payment_id2).unwrap(); + check_added_monitors(&nodes[1], 1); + + let update = get_htlc_update_msgs!(&nodes[0], node_id_1); + nodes[1].node.handle_update_add_htlc(node_id_0, &update.update_add_htlcs[0]); + nodes[1].node.handle_commitment_signed_batch_test(node_id_0, &update.commitment_signed); + check_added_monitors(&nodes[1], 1); + + let update = get_htlc_update_msgs!(&nodes[1], node_id_0); + nodes[0].node.handle_update_add_htlc(node_id_1, &update.update_add_htlcs[0]); + nodes[0].node.handle_commitment_signed_batch_test(node_id_1, &update.commitment_signed); + check_added_monitors(&nodes[0], 1); + + // nodes[0] can only respond with a `revoke_and_ack`. The `commitment_signed` that would follow + // is blocked on receiving a counterparty `revoke_and_ack`, which nodes[1] is still pending on. + let revoke_and_ack = get_event_msg!(&nodes[0], MessageSendEvent::SendRevokeAndACK, node_id_1); + nodes[1].node.handle_revoke_and_ack(node_id_0, &revoke_and_ack); + check_added_monitors(&nodes[1], 1); + + assert!(nodes[0].node.get_and_clear_pending_msg_events().is_empty()); + assert!(nodes[1].node.get_and_clear_pending_msg_events().is_empty()); + + // nodes[0] will disconnect the counterparty as it's waiting on a `revoke_and_ack`. + // nodes[1] is waiting on a `commitment_signed`, but since it hasn't yet sent its own + // `revoke_and_ack`, it shouldn't disconnect yet. + for _ in 0..DISCONNECT_PEER_AWAITING_RESPONSE_TICKS { + nodes[0].node.timer_tick_occurred(); + nodes[1].node.timer_tick_occurred(); + } + let has_disconnect_event = |event| { + matches!( + event, MessageSendEvent::HandleError { action , .. } + if matches!(action, msgs::ErrorAction::DisconnectPeerWithWarning { .. }) + ) + }; + assert!(nodes[0].node.get_and_clear_pending_msg_events().into_iter().any(has_disconnect_event)); + assert!(nodes[1].node.get_and_clear_pending_msg_events().is_empty()); +} + +#[test] +fn test_no_disconnect_while_async_commitment_signed_expecting_remote_revoke_and_ack() { + // Nodes with async signers may be expecting to receive a `revoke_and_ack` from the + // counterparty even if a `commitment_signed` has yet to be sent due to an async signer. Test + // that we don't disconnect the async signer node due to not receiving the `revoke_and_ack` + // within the timeout while the `commitment_signed` is not ready. + let chanmon_cfgs = create_chanmon_cfgs(2); + let node_cfgs = create_node_cfgs(2, &chanmon_cfgs); + let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]); + let nodes = create_network(2, &node_cfgs, &node_chanmgrs); + let chan_id = create_announced_chan_between_nodes(&nodes, 0, 1).2; + + let node_id_0 = nodes[0].node.get_our_node_id(); + let node_id_1 = nodes[1].node.get_our_node_id(); + + // Route a payment and attempt to claim it. + let payment_amount = 1_000_000; + let (preimage, payment_hash, ..) 
= route_payment(&nodes[0], &[&nodes[1]], payment_amount); + nodes[1].node.claim_funds(preimage); + check_added_monitors(&nodes[1], 1); + + // We'll disable signing counterparty commitments on the payment sender. + nodes[0].disable_channel_signer_op(&node_id_1, &chan_id, SignerOp::SignCounterpartyCommitment); + + // After processing the `update_fulfill`, they'll only be able to send `revoke_and_ack` until + // the `commitment_signed` is no longer pending. + let update = get_htlc_update_msgs!(&nodes[1], node_id_0); + nodes[0].node.handle_update_fulfill_htlc(node_id_1, &update.update_fulfill_htlcs[0]); + nodes[0].node.handle_commitment_signed_batch_test(node_id_1, &update.commitment_signed); + check_added_monitors(&nodes[0], 1); + + let revoke_and_ack = get_event_msg!(&nodes[0], MessageSendEvent::SendRevokeAndACK, node_id_1); + nodes[1].node.handle_revoke_and_ack(node_id_0, &revoke_and_ack); + check_added_monitors(&nodes[1], 1); + + // The payment sender shouldn't disconnect the counterparty due to a missing `revoke_and_ack` + // because the `commitment_signed` isn't ready yet. The payment recipient may disconnect the + // sender because it doesn't have an async signer and it's expecting a timely + // `commitment_signed` response. + for _ in 0..DISCONNECT_PEER_AWAITING_RESPONSE_TICKS { + nodes[0].node.timer_tick_occurred(); + nodes[1].node.timer_tick_occurred(); + } + let has_disconnect_event = |event| { + matches!( + event, MessageSendEvent::HandleError { action , .. } + if matches!(action, msgs::ErrorAction::DisconnectPeerWithWarning { .. }) + ) + }; + assert!(nodes[0].node.get_and_clear_pending_msg_events().is_empty()); + assert!(nodes[1].node.get_and_clear_pending_msg_events().into_iter().any(has_disconnect_event)); + + expect_payment_sent(&nodes[0], preimage, None, false, false); + expect_payment_claimed!(nodes[1], payment_hash, payment_amount); +} diff --git a/lightning/src/ln/channel.rs b/lightning/src/ln/channel.rs index 005ec566f0e..1a450d6ebb2 100644 --- a/lightning/src/ln/channel.rs +++ b/lightning/src/ln/channel.rs @@ -3205,6 +3205,15 @@ impl ChannelContext where SP::Target: SignerProvider { } } + /// Checks whether the channel has any HTLC additions, HTLC removals, or fee updates that have + /// been sent by either side but not yet irrevocably committed on both commitments because we're + /// waiting on a pending monitor update or signer request. + pub fn is_monitor_or_signer_pending_channel_update(&self) -> bool { + self.channel_state.is_monitor_update_in_progress() + || self.signer_pending_revoke_and_ack + || self.signer_pending_commitment_update + } + /// Checks whether the channel has any HTLC additions, HTLC removals, or fee updates that have /// been sent by either side but not yet irrevocably committed on both commitments. Holding cell /// updates are not considered because they haven't been sent to the peer yet. @@ -3212,16 +3221,17 @@ impl ChannelContext where SP::Target: SignerProvider { /// This can be used to satisfy quiescence's requirement when sending `stfu`: /// - MUST NOT send `stfu` if any of the sender's htlc additions, htlc removals /// or fee updates are pending for either peer. - fn has_pending_channel_update(&self) -> bool { + /// + /// Note that it is still possible for an update to be pending that's not captured here due to a + /// pending monitor update or signer request. `is_monitor_or_signer_pending_channel_update` + /// should also be checked in such cases. 
+	fn is_waiting_on_peer_pending_channel_update(&self) -> bool {
 		// An update from the local/remote node may be pending on the remote/local commitment since
 		// they are not tracked within our state, so we rely on whether any `commitment_signed` or
 		// `revoke_and_ack` messages are owed.
 		//
 		// We check these flags first as they are more likely to be set.
-		if self.channel_state.is_awaiting_remote_revoke() || self.expecting_peer_commitment_signed
-			|| self.monitor_pending_revoke_and_ack || self.signer_pending_revoke_and_ack
-			|| self.monitor_pending_commitment_signed || self.signer_pending_commitment_update
-		{
+		if self.channel_state.is_awaiting_remote_revoke() || self.expecting_peer_commitment_signed {
 			return true;
 		}
 
@@ -7417,8 +7427,11 @@ impl FundedChannel where
 			|| self.context.channel_state.is_local_stfu_sent()
 			// Cleared upon receiving a message that triggers the end of quiescence.
 			|| self.context.channel_state.is_quiescent()
-			// Cleared upon receiving `revoke_and_ack`.
-			|| self.context.has_pending_channel_update()
+			// Cleared upon receiving `revoke_and_ack`. Since we're not quiescent, as we just
+			// checked above, we intentionally don't disconnect our counterparty if we're waiting on
+			// a monitor update or signer request.
+			|| (self.context.is_waiting_on_peer_pending_channel_update()
+				&& !self.context.is_monitor_or_signer_pending_channel_update())
 		{
 			// This is the first tick we've seen after expecting to make forward progress.
 			self.context.sent_message_awaiting_response = Some(1);
@@ -9306,7 +9319,9 @@ impl FundedChannel where
 		);
 		debug_assert!(self.context.is_live());
 
-		if self.context.has_pending_channel_update() {
+		if self.context.is_waiting_on_peer_pending_channel_update()
+			|| self.context.is_monitor_or_signer_pending_channel_update()
+		{
 			return Err(ChannelError::Ignore(
 				"We cannot send `stfu` while state machine is pending".to_owned()
 			));
@@ -9367,10 +9382,11 @@ impl FundedChannel where
 			));
 		}
 
-		// We don't check `has_pending_channel_update` prior to setting the flag because it
-		// considers pending updates from either node. This means we may accept a counterparty
-		// `stfu` while they had pending updates, but that's fine as we won't send ours until
-		// _all_ pending updates complete, allowing the channel to become quiescent then.
+		// We don't check `is_waiting_on_peer_pending_channel_update` prior to setting the flag
+		// because it considers pending updates from either node. This means we may accept a
+		// counterparty `stfu` while they had pending updates, but that's fine as we won't send
+		// ours until _all_ pending updates complete, allowing the channel to become quiescent
+		// then.
 		self.context.channel_state.set_remote_stfu_sent();
 
 		let is_holder_initiator = if self.context.channel_state.is_awaiting_quiescence() {
@@ -9394,7 +9410,9 @@ impl FundedChannel where
 		// We were expecting to receive `stfu` because we already sent ours.
 		self.mark_response_received();
 
-		if self.context.has_pending_channel_update() {
+		if self.context.is_waiting_on_peer_pending_channel_update()
+			|| self.context.is_monitor_or_signer_pending_channel_update()
+		{
 			// Since we've already sent `stfu`, it should not be possible for one of our updates to
 			// be pending, so anything pending currently must be from a counterparty update.
return Err(ChannelError::WarnAndDisconnect( diff --git a/lightning/src/ln/quiescence_tests.rs b/lightning/src/ln/quiescence_tests.rs index d35fe5a33be..c2d44942b04 100644 --- a/lightning/src/ln/quiescence_tests.rs +++ b/lightning/src/ln/quiescence_tests.rs @@ -180,10 +180,10 @@ fn allow_shutdown_while_awaiting_quiescence(local_shutdown: bool) { } #[test] -fn test_quiescence_tracks_monitor_update_in_progress_and_waits_for_async_signer() { +fn test_quiescence_waits_for_async_signer_and_monitor_update() { // Test that quiescence: // a) considers an async signer when determining whether a pending channel update exists - // b) tracks in-progress monitor updates until no longer quiescent + // b) waits until pending monitor updates complete to send `stfu`/become quiescent let chanmon_cfgs = create_chanmon_cfgs(2); let node_cfgs = create_node_cfgs(2, &chanmon_cfgs); let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]); @@ -244,29 +244,12 @@ fn test_quiescence_tracks_monitor_update_in_progress_and_waits_for_async_signer( let revoke_and_ack = find_msg!(msg_events, SendRevokeAndACK); let stfu = find_msg!(msg_events, SendStfu); - // While handling the last `revoke_and_ack` on nodes[0], we'll hold the monitor update and - // become quiescent. + // While handling the last `revoke_and_ack` on nodes[0], we'll hold the monitor update. We + // cannot become quiescent until it completes. chanmon_cfgs[0].persister.set_update_ret(ChannelMonitorUpdateStatus::InProgress); nodes[0].node.handle_revoke_and_ack(node_id_1, &revoke_and_ack); nodes[0].node.handle_stfu(node_id_1, &stfu); - let stfu = get_event_msg!(&nodes[0], MessageSendEvent::SendStfu, node_id_1); - nodes[1].node.handle_stfu(node_id_0, &stfu); - - nodes[0].node.exit_quiescence(&node_id_1, &chan_id).unwrap(); - nodes[1].node.exit_quiescence(&node_id_0, &chan_id).unwrap(); - - // After exiting quiescence, we should be able to resume payments from nodes[0], but the monitor - // update has yet to complete. Attempting to send a payment now will be delayed until the - // monitor update completes. - { - let (route, payment_hash, _, payment_secret) = - get_route_and_payment_hash!(&nodes[0], &nodes[1], payment_amount); - let onion = RecipientOnionFields::secret_only(payment_secret); - let payment_id = PaymentId(payment_hash.0); - nodes[0].node.send_payment_with_route(route, payment_hash, onion, payment_id).unwrap(); - } - check_added_monitors(&nodes[0], 0); assert!(nodes[0].node.get_and_clear_pending_msg_events().is_empty()); // We have two updates pending: @@ -276,17 +259,21 @@ fn test_quiescence_tracks_monitor_update_in_progress_and_waits_for_async_signer( chain_monitor.latest_monitor_update_id.lock().unwrap().get(&chan_id).unwrap().clone(); let chain_monitor = &nodes[0].chain_monitor.chain_monitor; // One for the latest commitment transaction update from the last `revoke_and_ack` - chain_monitor.channel_monitor_updated(chan_id, latest_update - 1).unwrap(); + chain_monitor.channel_monitor_updated(chan_id, latest_update).unwrap(); expect_payment_sent(&nodes[0], preimage, None, true, true); // One for the commitment secret update from the last `revoke_and_ack` - chain_monitor.channel_monitor_updated(chan_id, latest_update).unwrap(); + chain_monitor.channel_monitor_updated(chan_id, latest_update + 1).unwrap(); } - // With the pending monitor updates complete, we'll see a new monitor update go out when freeing - // the holding cells to send out the new HTLC. 
- nodes[0].chain_monitor.complete_sole_pending_chan_update(&chan_id); - let _ = get_htlc_update_msgs!(&nodes[0], node_id_1); - check_added_monitors(&nodes[0], 1); + // With the updates completed, we can now become quiescent. + let stfu = get_event_msg!(&nodes[0], MessageSendEvent::SendStfu, node_id_1); + nodes[1].node.handle_stfu(node_id_0, &stfu); + + nodes[0].node.exit_quiescence(&node_id_1, &chan_id).unwrap(); + nodes[1].node.exit_quiescence(&node_id_0, &chan_id).unwrap(); + + // After exiting quiescence, we should be able to resume payments from nodes[0]. + send_payment(&nodes[0], &[&nodes[1]], payment_amount); } #[test]
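
For illustration, the disconnect decision this patch introduces can be summarized with a small
standalone sketch. This is not LDK's actual API: the function name and boolean parameters below
are hypothetical stand-ins for the channel-state checks in `FundedChannel`
(`is_waiting_on_peer_pending_channel_update`, `is_monitor_or_signer_pending_channel_update`, and
the quiescence flags).

// Hypothetical, simplified model of the timer-tick check this patch changes.
fn should_start_disconnect_timer(
	awaiting_peer_message: bool,     // peer owes us a `commitment_signed`/`revoke_and_ack`
	monitor_or_signer_pending: bool, // our response is blocked on an async monitor update or signer
	negotiating_quiescence: bool,    // `stfu` sent/received, or the channel is already quiescent
) -> bool {
	// Quiescence negotiation still enforces the timeout.
	if negotiating_quiescence {
		return true;
	}
	// Otherwise, only start the countdown when the peer owes us a message and we aren't the
	// ones blocking progress on a pending monitor update or signer request.
	awaiting_peer_message && !monitor_or_signer_pending
}

fn main() {
	// Blocked on our own async signer/monitor update: don't disconnect the peer.
	assert!(!should_start_disconnect_timer(true, true, false));
	// Peer owes us a message and nothing on our side is pending: start the countdown.
	assert!(should_start_disconnect_timer(true, false, false));
}

As in the patch, quiescence takes precedence: even if a monitor update or signer request is
pending on our side, a stalled `stfu` exchange still starts the warning/disconnect countdown.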