Skip to content
This repository was archived by the owner on Nov 15, 2023. It is now read-only.

Commit b8854b5

Browse files
authored
sc-consensus-beefy: restart voter on pallet reset (#14821)
When detecting pallet-beefy consensus reset, just reinitialize the worker and continue without bringing down the task (and possibly the node). Signed-off-by: Adrian Catangiu <[email protected]>
1 parent 8dac0ab commit b8854b5

File tree

7 files changed

+130
-102
lines changed

7 files changed

+130
-102
lines changed

Diff for: client/consensus/beefy/src/communication/peers.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ use std::collections::{HashMap, VecDeque};
2424

2525
/// Report specifying a reputation change for a given peer.
2626
#[derive(Debug, PartialEq)]
27-
pub(crate) struct PeerReport {
27+
pub struct PeerReport {
2828
pub who: PeerId,
2929
pub cost_benefit: ReputationChange,
3030
}

Diff for: client/consensus/beefy/src/communication/request_response/incoming_requests_handler.rs

+5-6
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
1919
use codec::DecodeAll;
2020
use futures::{channel::oneshot, StreamExt};
21-
use log::{debug, error, trace};
21+
use log::{debug, trace};
2222
use sc_client_api::BlockBackend;
2323
use sc_network::{
2424
config as netconfig, config::RequestResponseConfig, types::ProtocolName, PeerId,
@@ -182,7 +182,9 @@ where
182182
}
183183

184184
/// Run [`BeefyJustifsRequestHandler`].
185-
pub async fn run(mut self) {
185+
///
186+
/// Should never end, returns `Error` otherwise.
187+
pub async fn run(&mut self) -> Error {
186188
trace!(target: BEEFY_SYNC_LOG_TARGET, "🥩 Running BeefyJustifsRequestHandler");
187189

188190
while let Ok(request) = self
@@ -215,9 +217,6 @@ where
215217
},
216218
}
217219
}
218-
error!(
219-
target: crate::LOG_TARGET,
220-
"🥩 On-demand requests receiver stream terminated, closing worker."
221-
);
220+
Error::RequestsReceiverStreamClosed
222221
}
223222
}

Diff for: client/consensus/beefy/src/communication/request_response/mod.rs

+4-1
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ pub struct JustificationRequest<B: Block> {
7575
}
7676

7777
#[derive(Debug, thiserror::Error)]
78-
pub(crate) enum Error {
78+
pub enum Error {
7979
#[error(transparent)]
8080
Client(#[from] sp_blockchain::Error),
8181

@@ -102,4 +102,7 @@ pub(crate) enum Error {
102102

103103
#[error("Internal error while getting response.")]
104104
ResponseError,
105+
106+
#[error("On-demand requests receiver stream terminated.")]
107+
RequestsReceiverStreamClosed,
105108
}

Diff for: client/consensus/beefy/src/error.rs

+11-1
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,18 @@ pub enum Error {
3434
Signature(String),
3535
#[error("Session uninitialized")]
3636
UninitSession,
37-
#[error("pallet-beefy was reset, please restart voter")]
37+
#[error("pallet-beefy was reset")]
3838
ConsensusReset,
39+
#[error("Block import stream terminated")]
40+
BlockImportStreamTerminated,
41+
#[error("Gossip Engine terminated")]
42+
GossipEngineTerminated,
43+
#[error("Finality proofs gossiping stream terminated")]
44+
FinalityProofGossipStreamTerminated,
45+
#[error("Finality stream terminated")]
46+
FinalityStreamTerminated,
47+
#[error("Votes gossiping stream terminated")]
48+
VotesGossipStreamTerminated,
3949
}
4050

4151
#[cfg(test)]

Diff for: client/consensus/beefy/src/lib.rs

+88-66
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,7 @@ pub async fn start_beefy_gadget<B, BE, C, N, P, R, S>(
221221
B: Block,
222222
BE: Backend<B>,
223223
C: Client<B, BE> + BlockBackend<B>,
224-
P: PayloadProvider<B>,
224+
P: PayloadProvider<B> + Clone,
225225
R: ProvideRuntimeApi<B>,
226226
R::Api: BeefyApi<B, AuthorityId> + MmrApi<B, MmrRootHash, NumberFor<B>>,
227227
N: GossipNetwork<B> + NetworkRequest + Send + Sync + 'static,
@@ -237,7 +237,7 @@ pub async fn start_beefy_gadget<B, BE, C, N, P, R, S>(
237237
min_block_delta,
238238
prometheus_registry,
239239
links,
240-
on_demand_justifications_handler,
240+
mut on_demand_justifications_handler,
241241
} = beefy_params;
242242

243243
let BeefyNetworkParams {
@@ -248,83 +248,105 @@ pub async fn start_beefy_gadget<B, BE, C, N, P, R, S>(
248248
..
249249
} = network_params;
250250

251-
let known_peers = Arc::new(Mutex::new(KnownPeers::new()));
252-
// Default votes filter is to discard everything.
253-
// Validator is updated later with correct starting round and set id.
254-
let (gossip_validator, gossip_report_stream) =
255-
communication::gossip::GossipValidator::new(known_peers.clone());
256-
let gossip_validator = Arc::new(gossip_validator);
257-
let mut gossip_engine = GossipEngine::new(
258-
network.clone(),
259-
sync.clone(),
260-
gossip_protocol_name,
261-
gossip_validator.clone(),
262-
None,
263-
);
264251
let metrics = register_metrics(prometheus_registry.clone());
265252

266-
// The `GossipValidator` adds and removes known peers based on valid votes and network events.
267-
let on_demand_justifications = OnDemandJustificationsEngine::new(
268-
network.clone(),
269-
justifications_protocol_name,
270-
known_peers,
271-
prometheus_registry.clone(),
272-
);
273-
274253
// Subscribe to finality notifications and justifications before waiting for runtime pallet and
275254
// reuse the streams, so we don't miss notifications while waiting for pallet to be available.
276255
let mut finality_notifications = client.finality_notification_stream().fuse();
277-
let block_import_justif = links.from_block_import_justif_stream.subscribe(100_000).fuse();
278-
279-
// Wait for BEEFY pallet to be active before starting voter.
280-
let persisted_state =
281-
match wait_for_runtime_pallet(&*runtime, &mut gossip_engine, &mut finality_notifications)
282-
.await
283-
.and_then(|(beefy_genesis, best_grandpa)| {
284-
load_or_init_voter_state(
285-
&*backend,
286-
&*runtime,
287-
beefy_genesis,
288-
best_grandpa,
289-
min_block_delta,
290-
)
291-
}) {
256+
let mut block_import_justif = links.from_block_import_justif_stream.subscribe(100_000).fuse();
257+
258+
// We re-create and re-run the worker in this loop in order to quickly reinit and resume after
259+
// select recoverable errors.
260+
loop {
261+
let known_peers = Arc::new(Mutex::new(KnownPeers::new()));
262+
// Default votes filter is to discard everything.
263+
// Validator is updated later with correct starting round and set id.
264+
let (gossip_validator, gossip_report_stream) =
265+
communication::gossip::GossipValidator::new(known_peers.clone());
266+
let gossip_validator = Arc::new(gossip_validator);
267+
let mut gossip_engine = GossipEngine::new(
268+
network.clone(),
269+
sync.clone(),
270+
gossip_protocol_name.clone(),
271+
gossip_validator.clone(),
272+
None,
273+
);
274+
275+
// The `GossipValidator` adds and removes known peers based on valid votes and network
276+
// events.
277+
let on_demand_justifications = OnDemandJustificationsEngine::new(
278+
network.clone(),
279+
justifications_protocol_name.clone(),
280+
known_peers,
281+
prometheus_registry.clone(),
282+
);
283+
284+
// Wait for BEEFY pallet to be active before starting voter.
285+
let persisted_state = match wait_for_runtime_pallet(
286+
&*runtime,
287+
&mut gossip_engine,
288+
&mut finality_notifications,
289+
)
290+
.await
291+
.and_then(|(beefy_genesis, best_grandpa)| {
292+
load_or_init_voter_state(
293+
&*backend,
294+
&*runtime,
295+
beefy_genesis,
296+
best_grandpa,
297+
min_block_delta,
298+
)
299+
}) {
292300
Ok(state) => state,
293301
Err(e) => {
294302
error!(target: LOG_TARGET, "Error: {:?}. Terminating.", e);
295303
return
296304
},
297305
};
298-
// Update the gossip validator with the right starting round and set id.
299-
if let Err(e) = persisted_state
300-
.gossip_filter_config()
301-
.map(|f| gossip_validator.update_filter(f))
302-
{
303-
error!(target: LOG_TARGET, "Error: {:?}. Terminating.", e);
304-
return
305-
}
306+
// Update the gossip validator with the right starting round and set id.
307+
if let Err(e) = persisted_state
308+
.gossip_filter_config()
309+
.map(|f| gossip_validator.update_filter(f))
310+
{
311+
error!(target: LOG_TARGET, "Error: {:?}. Terminating.", e);
312+
return
313+
}
306314

307-
let worker = worker::BeefyWorker {
308-
backend,
309-
payload_provider,
310-
runtime,
311-
sync,
312-
key_store: key_store.into(),
313-
gossip_engine,
314-
gossip_validator,
315-
gossip_report_stream,
316-
on_demand_justifications,
317-
links,
318-
metrics,
319-
pending_justifications: BTreeMap::new(),
320-
persisted_state,
321-
};
315+
let worker = worker::BeefyWorker {
316+
backend: backend.clone(),
317+
payload_provider: payload_provider.clone(),
318+
runtime: runtime.clone(),
319+
sync: sync.clone(),
320+
key_store: key_store.clone().into(),
321+
gossip_engine,
322+
gossip_validator,
323+
gossip_report_stream,
324+
on_demand_justifications,
325+
links: links.clone(),
326+
metrics: metrics.clone(),
327+
pending_justifications: BTreeMap::new(),
328+
persisted_state,
329+
};
322330

323-
futures::future::select(
324-
Box::pin(worker.run(block_import_justif, finality_notifications)),
325-
Box::pin(on_demand_justifications_handler.run()),
326-
)
327-
.await;
331+
match futures::future::select(
332+
Box::pin(worker.run(&mut block_import_justif, &mut finality_notifications)),
333+
Box::pin(on_demand_justifications_handler.run()),
334+
)
335+
.await
336+
{
337+
// On `ConsensusReset` error, just reinit and restart voter.
338+
futures::future::Either::Left((error::Error::ConsensusReset, _)) => {
339+
error!(target: LOG_TARGET, "🥩 Error: {:?}. Restarting voter.", error::Error::ConsensusReset);
340+
continue
341+
},
342+
// On other errors, bring down / finish the task.
343+
futures::future::Either::Left((worker_err, _)) =>
344+
error!(target: LOG_TARGET, "🥩 Error: {:?}. Terminating.", worker_err),
345+
futures::future::Either::Right((odj_handler_err, _)) =>
346+
error!(target: LOG_TARGET, "🥩 Error: {:?}. Terminating.", odj_handler_err),
347+
};
348+
return
349+
}
328350
}
329351

330352
fn load_or_init_voter_state<B, BE, R>(

Diff for: client/consensus/beefy/src/worker.rs

+15-27
Original file line numberDiff line numberDiff line change
@@ -447,11 +447,7 @@ where
447447
.ok()
448448
.flatten()
449449
.filter(|genesis| *genesis == self.persisted_state.pallet_genesis)
450-
.ok_or_else(|| {
451-
let err = Error::ConsensusReset;
452-
error!(target: LOG_TARGET, "🥩 Error: {}", err);
453-
err
454-
})?;
450+
.ok_or(Error::ConsensusReset)?;
455451

456452
if *header.number() > self.best_grandpa_block() {
457453
// update best GRANDPA finalized block we have seen
@@ -795,11 +791,12 @@ where
795791
/// Main loop for BEEFY worker.
796792
///
797793
/// Run the main async loop which is driven by finality notifications and gossiped votes.
794+
/// Should never end, returns `Error` otherwise.
798795
pub(crate) async fn run(
799796
mut self,
800-
mut block_import_justif: Fuse<NotificationReceiver<BeefyVersionedFinalityProof<B>>>,
801-
mut finality_notifications: Fuse<FinalityNotifications<B>>,
802-
) {
797+
block_import_justif: &mut Fuse<NotificationReceiver<BeefyVersionedFinalityProof<B>>>,
798+
finality_notifications: &mut Fuse<FinalityNotifications<B>>,
799+
) -> Error {
803800
info!(
804801
target: LOG_TARGET,
805802
"🥩 run BEEFY worker, best grandpa: #{:?}.",
@@ -848,17 +845,17 @@ where
848845
// Use `select_biased!` to prioritize order below.
849846
// Process finality notifications first since these drive the voter.
850847
notification = finality_notifications.next() => {
851-
if notification.and_then(|notif| {
852-
self.handle_finality_notification(&notif).ok()
853-
}).is_none() {
854-
error!(target: LOG_TARGET, "🥩 Finality stream terminated, closing worker.");
855-
return;
848+
if let Some(notif) = notification {
849+
if let Err(err) = self.handle_finality_notification(&notif) {
850+
return err;
851+
}
852+
} else {
853+
return Error::FinalityStreamTerminated;
856854
}
857855
},
858856
// Make sure to pump gossip engine.
859857
_ = gossip_engine => {
860-
error!(target: LOG_TARGET, "🥩 Gossip engine has terminated, closing worker.");
861-
return;
858+
return Error::GossipEngineTerminated;
862859
},
863860
// Process incoming justifications as these can make some in-flight votes obsolete.
864861
response_info = self.on_demand_justifications.next().fuse() => {
@@ -881,8 +878,7 @@ where
881878
debug!(target: LOG_TARGET, "🥩 {}", err);
882879
}
883880
} else {
884-
error!(target: LOG_TARGET, "🥩 Block import stream terminated, closing worker.");
885-
return;
881+
return Error::BlockImportStreamTerminated;
886882
}
887883
},
888884
justif = gossip_proofs.next() => {
@@ -892,11 +888,7 @@ where
892888
debug!(target: LOG_TARGET, "🥩 {}", err);
893889
}
894890
} else {
895-
error!(
896-
target: LOG_TARGET,
897-
"🥩 Finality proofs gossiping stream terminated, closing worker."
898-
);
899-
return;
891+
return Error::FinalityProofGossipStreamTerminated;
900892
}
901893
},
902894
// Finally process incoming votes.
@@ -907,11 +899,7 @@ where
907899
debug!(target: LOG_TARGET, "🥩 {}", err);
908900
}
909901
} else {
910-
error!(
911-
target: LOG_TARGET,
912-
"🥩 Votes gossiping stream terminated, closing worker."
913-
);
914-
return;
902+
return Error::VotesGossipStreamTerminated;
915903
}
916904
},
917905
// Process peer reports.

Diff for: primitives/consensus/beefy/src/mmr.rs

+6
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,12 @@ mod mmr_root_provider {
162162
_phantom: PhantomData<B>,
163163
}
164164

165+
impl<B, R> Clone for MmrRootProvider<B, R> {
166+
fn clone(&self) -> Self {
167+
Self { runtime: self.runtime.clone(), _phantom: PhantomData }
168+
}
169+
}
170+
165171
impl<B, R> MmrRootProvider<B, R>
166172
where
167173
B: Block,

0 commit comments

Comments
 (0)