From 874797d54fcf319813586ba7edb7861c21b98aa8 Mon Sep 17 00:00:00 2001 From: Aaro Altonen Date: Wed, 5 Apr 2023 16:02:35 +0300 Subject: [PATCH 1/5] Evict inactive peers from `SyncingEngine` If both halves of the block announce notification stream have been inactive for 2 minutes, report the peer and disconnect it, allowing `SyncingEngine` to free up a slot for some other peer that hopefully is more active. This needs to be done because the node may falsely believe it has open connections to peers because the inbound substream can be closed without any notification and closed outbound substream is noticed only when node attempts to write to it which may not happen if the node has nothing to send. --- client/network/sync/src/engine.rs | 48 +++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/client/network/sync/src/engine.rs b/client/network/sync/src/engine.rs index e3d45a980a0b4..cd2b03457c29e 100644 --- a/client/network/sync/src/engine.rs +++ b/client/network/sync/src/engine.rs @@ -67,6 +67,7 @@ use std::{ Arc, }, task::Poll, + time::{Duration, Instant}, }; /// Interval at which we perform time based maintenance @@ -75,12 +76,19 @@ const TICK_TIMEOUT: std::time::Duration = std::time::Duration::from_millis(1100) /// Maximum number of known block hashes to keep for a peer. const MAX_KNOWN_BLOCKS: usize = 1024; // ~32kb per peer + LruHashSet overhead +/// If the block announces stream to peer has been inactive for two minutes meaning local node +/// has not sent or received block announcements to/from the peer, report the node for inactivity, +/// disconnect it and attempt to establish connection to some other peer. +const INACTIVITY_EVICT_THRESHOLD: Duration = Duration::from_secs(2 * 60); + mod rep { use sc_peerset::ReputationChange as Rep; /// Peer has different genesis. pub const GENESIS_MISMATCH: Rep = Rep::new_fatal("Genesis mismatch"); /// Peer send us a block announcement that failed at validation. pub const BAD_BLOCK_ANNOUNCEMENT: Rep = Rep::new(-(1 << 12), "Bad block announcement"); + /// Block announce substream with the peer has been inactive too long + pub const INACTIVE_SUBSTREAM: Rep = Rep::new(-(1 << 10), "Inactive block announce substream"); } struct Metrics { @@ -160,6 +168,10 @@ pub struct Peer { pub known_blocks: LruHashSet, /// Notification sink. sink: NotificationsSink, + /// Instant when the last notification was sent to peer. + last_notification_sent: Instant, + /// Instant when the last notification was received from peer. + last_notification_received: Instant, } pub struct SyncingEngine { @@ -200,6 +212,9 @@ pub struct SyncingEngine { /// All connected peers. Contains both full and light node peers. peers: HashMap>, + /// Evicted peers + evicted: HashSet, + /// List of nodes for which we perform additional logging because they are important for the /// user. important_peers: HashSet, @@ -344,6 +359,7 @@ where chain_sync, network_service, peers: HashMap::new(), + evicted: HashSet::new(), block_announce_data_cache: LruCache::new(cache_capacity), block_announce_protocol_name, num_connected: num_connected.clone(), @@ -507,6 +523,7 @@ where }, }; peer.known_blocks.insert(hash); + peer.last_notification_received = Instant::now(); if peer.info.roles.is_full() { let is_best = match announce.state.unwrap_or(BlockState::Best) { @@ -557,6 +574,7 @@ where data: Some(data.clone()), }; + peer.last_notification_sent = Instant::now(); peer.sink.send_sync_notification(message.encode()); } } @@ -590,6 +608,33 @@ where self.tick_timeout.reset(TICK_TIMEOUT); } + // go over all connected peers and check if any of them have been idle for a while. Idle in + // this case means that we haven't sent or received block announcements to/from this peer. + // If that is the case, because of #5685, it could be that the block announces substream is + // not actually open and and this peer is just wasting a slot and is should be replaced with + // some other node that is willing to send us block announcements. + for (id, peer) in self.peers.iter() { + // because of a delay between disconnecting a peer in `SyncingEngine` and getting the + // response back from `Protocol`, a peer might be reported and disconnect multiple + // times. To prevent this from happening (until the underlying issue is fixed), keep + // track of evicted peers and report and disconnect them only once. + if self.evicted.contains(id) { + continue + } + + let last_received_late = + peer.last_notification_received.elapsed() > INACTIVITY_EVICT_THRESHOLD; + let last_sent_late = peer.last_notification_sent.elapsed() > INACTIVITY_EVICT_THRESHOLD; + + if last_received_late && last_sent_late { + log::debug!(target: "sync", "evict peer {id} since it has been idling for too long"); + self.network_service.report_peer(*id, rep::INACTIVE_SUBSTREAM); + self.network_service + .disconnect_peer(*id, self.block_announce_protocol_name.clone()); + self.evicted.insert(*id); + } + } + while let Poll::Ready(Some(event)) = self.service_rx.poll_next_unpin(cx) { match event { ToServiceCommand::SetSyncForkRequest(peers, hash, number) => { @@ -683,6 +728,7 @@ where }, }, sc_network::SyncEvent::NotificationStreamClosed { remote } => { + self.evicted.remove(remote); if self.on_sync_peer_disconnected(remote).is_err() { log::trace!( target: "sync", @@ -835,6 +881,8 @@ where NonZeroUsize::new(MAX_KNOWN_BLOCKS).expect("Constant is nonzero"), ), sink, + last_notification_sent: Instant::now(), + last_notification_received: Instant::now(), }; let req = if peer.info.roles.is_full() { From f0c6c72e6883397d70d5a3e65d395293cdc6909a Mon Sep 17 00:00:00 2001 From: Aaro Altonen Date: Wed, 5 Apr 2023 17:20:09 +0300 Subject: [PATCH 2/5] zzz --- client/network/sync/src/engine.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/client/network/sync/src/engine.rs b/client/network/sync/src/engine.rs index cd2b03457c29e..6bd5d862de849 100644 --- a/client/network/sync/src/engine.rs +++ b/client/network/sync/src/engine.rs @@ -728,7 +728,7 @@ where }, }, sc_network::SyncEvent::NotificationStreamClosed { remote } => { - self.evicted.remove(remote); + self.evicted.remove(&remote); if self.on_sync_peer_disconnected(remote).is_err() { log::trace!( target: "sync", From ed79b5b227157b9a994a0bbc396892cc9cb96581 Mon Sep 17 00:00:00 2001 From: Aaro Altonen Date: Tue, 11 Apr 2023 11:45:26 +0300 Subject: [PATCH 3/5] wip --- client/network/sync/src/engine.rs | 4 ++-- client/network/sync/src/lib.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/client/network/sync/src/engine.rs b/client/network/sync/src/engine.rs index 6bd5d862de849..029ef2c9b186c 100644 --- a/client/network/sync/src/engine.rs +++ b/client/network/sync/src/engine.rs @@ -79,7 +79,7 @@ const MAX_KNOWN_BLOCKS: usize = 1024; // ~32kb per peer + LruHashSet overhead /// If the block announces stream to peer has been inactive for two minutes meaning local node /// has not sent or received block announcements to/from the peer, report the node for inactivity, /// disconnect it and attempt to establish connection to some other peer. -const INACTIVITY_EVICT_THRESHOLD: Duration = Duration::from_secs(2 * 60); +const INACTIVITY_EVICT_THRESHOLD: Duration = Duration::from_secs(30); mod rep { use sc_peerset::ReputationChange as Rep; @@ -627,7 +627,7 @@ where let last_sent_late = peer.last_notification_sent.elapsed() > INACTIVITY_EVICT_THRESHOLD; if last_received_late && last_sent_late { - log::debug!(target: "sync", "evict peer {id} since it has been idling for too long"); + log::error!(target: "sync", "evict peer {id} since it has been idling for too long"); self.network_service.report_peer(*id, rep::INACTIVE_SUBSTREAM); self.network_service .disconnect_peer(*id, self.block_announce_protocol_name.clone()); diff --git a/client/network/sync/src/lib.rs b/client/network/sync/src/lib.rs index 28959e7f9c886..3566d4493a187 100644 --- a/client/network/sync/src/lib.rs +++ b/client/network/sync/src/lib.rs @@ -108,7 +108,7 @@ pub mod warp; pub mod warp_request_handler; /// Maximum blocks to request in a single packet. -const MAX_BLOCKS_TO_REQUEST: usize = 64; +const MAX_BLOCKS_TO_REQUEST: usize = 8; /// Maximum blocks to store in the import queue. const MAX_IMPORTING_BLOCKS: usize = 2048; From 9ddb96b9cabf56737ab38ff86fec916f1511173a Mon Sep 17 00:00:00 2001 From: Aaro Altonen Date: Fri, 14 Apr 2023 11:01:45 +0300 Subject: [PATCH 4/5] Evict peers only when timeout expires --- client/network/sync/src/engine.rs | 50 ++++++++++++++++--------------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/client/network/sync/src/engine.rs b/client/network/sync/src/engine.rs index 029ef2c9b186c..d82e94472f540 100644 --- a/client/network/sync/src/engine.rs +++ b/client/network/sync/src/engine.rs @@ -605,34 +605,36 @@ where while let Poll::Ready(()) = self.tick_timeout.poll_unpin(cx) { self.report_metrics(); - self.tick_timeout.reset(TICK_TIMEOUT); - } - // go over all connected peers and check if any of them have been idle for a while. Idle in - // this case means that we haven't sent or received block announcements to/from this peer. - // If that is the case, because of #5685, it could be that the block announces substream is - // not actually open and and this peer is just wasting a slot and is should be replaced with - // some other node that is willing to send us block announcements. - for (id, peer) in self.peers.iter() { - // because of a delay between disconnecting a peer in `SyncingEngine` and getting the - // response back from `Protocol`, a peer might be reported and disconnect multiple - // times. To prevent this from happening (until the underlying issue is fixed), keep - // track of evicted peers and report and disconnect them only once. - if self.evicted.contains(id) { - continue - } + // go over all connected peers and check if any of them have been idle for a while. Idle + // in this case means that we haven't sent or received block announcements to/from this + // peer. If that is the case, because of #5685, it could be that the block announces + // substream is not actually open and and this peer is just wasting a slot and is should + // be replaced with some other node that is willing to send us block announcements. + for (id, peer) in self.peers.iter() { + // because of a delay between disconnecting a peer in `SyncingEngine` and getting + // the response back from `Protocol`, a peer might be reported and disconnect + // multiple times. To prevent this from happening (until the underlying issue is + // fixed), keep track of evicted peers and report and disconnect them only once. + if self.evicted.contains(id) { + continue + } - let last_received_late = - peer.last_notification_received.elapsed() > INACTIVITY_EVICT_THRESHOLD; - let last_sent_late = peer.last_notification_sent.elapsed() > INACTIVITY_EVICT_THRESHOLD; + let last_received_late = + peer.last_notification_received.elapsed() > INACTIVITY_EVICT_THRESHOLD; + let last_sent_late = + peer.last_notification_sent.elapsed() > INACTIVITY_EVICT_THRESHOLD; - if last_received_late && last_sent_late { - log::error!(target: "sync", "evict peer {id} since it has been idling for too long"); - self.network_service.report_peer(*id, rep::INACTIVE_SUBSTREAM); - self.network_service - .disconnect_peer(*id, self.block_announce_protocol_name.clone()); - self.evicted.insert(*id); + if last_received_late && last_sent_late { + log::error!(target: "sync", "evict peer {id} since it has been idling for too long"); + self.network_service.report_peer(*id, rep::INACTIVE_SUBSTREAM); + self.network_service + .disconnect_peer(*id, self.block_announce_protocol_name.clone()); + self.evicted.insert(*id); + } } + + self.tick_timeout.reset(TICK_TIMEOUT); } while let Poll::Ready(Some(event)) = self.service_rx.poll_next_unpin(cx) { From 734c4f1d9ae5960df5468ed852ad43a3155d4198 Mon Sep 17 00:00:00 2001 From: Aaro Altonen Date: Fri, 14 Apr 2023 11:13:47 +0300 Subject: [PATCH 5/5] Use `debug!()` --- client/network/sync/src/engine.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/client/network/sync/src/engine.rs b/client/network/sync/src/engine.rs index a6e341f30d8f4..eca0ebfe41a9f 100644 --- a/client/network/sync/src/engine.rs +++ b/client/network/sync/src/engine.rs @@ -635,7 +635,7 @@ where peer.last_notification_sent.elapsed() > INACTIVITY_EVICT_THRESHOLD; if last_received_late && last_sent_late { - log::error!(target: "sync", "evict peer {id} since it has been idling for too long"); + log::debug!(target: "sync", "evict peer {id} since it has been idling for too long"); self.network_service.report_peer(*id, rep::INACTIVE_SUBSTREAM); self.network_service .disconnect_peer(*id, self.block_announce_protocol_name.clone());