diff --git a/client/network/sync/src/engine.rs b/client/network/sync/src/engine.rs index 6fb618a571c25..eca0ebfe41a9f 100644 --- a/client/network/sync/src/engine.rs +++ b/client/network/sync/src/engine.rs @@ -67,6 +67,7 @@ use std::{ Arc, }, task::Poll, + time::{Duration, Instant}, }; /// Interval at which we perform time based maintenance @@ -75,12 +76,19 @@ const TICK_TIMEOUT: std::time::Duration = std::time::Duration::from_millis(1100) /// Maximum number of known block hashes to keep for a peer. const MAX_KNOWN_BLOCKS: usize = 1024; // ~32kb per peer + LruHashSet overhead +/// If the block announces stream to peer has been inactive for two minutes meaning local node +/// has not sent or received block announcements to/from the peer, report the node for inactivity, +/// disconnect it and attempt to establish connection to some other peer. +const INACTIVITY_EVICT_THRESHOLD: Duration = Duration::from_secs(30); + mod rep { use sc_peerset::ReputationChange as Rep; /// Peer has different genesis. pub const GENESIS_MISMATCH: Rep = Rep::new_fatal("Genesis mismatch"); /// Peer send us a block announcement that failed at validation. pub const BAD_BLOCK_ANNOUNCEMENT: Rep = Rep::new(-(1 << 12), "Bad block announcement"); + /// Block announce substream with the peer has been inactive too long + pub const INACTIVE_SUBSTREAM: Rep = Rep::new(-(1 << 10), "Inactive block announce substream"); } struct Metrics { @@ -160,6 +168,10 @@ pub struct Peer { pub known_blocks: LruHashSet, /// Notification sink. sink: NotificationsSink, + /// Instant when the last notification was sent to peer. + last_notification_sent: Instant, + /// Instant when the last notification was received from peer. + last_notification_received: Instant, } pub struct SyncingEngine { @@ -200,6 +212,9 @@ pub struct SyncingEngine { /// All connected peers. Contains both full and light node peers. peers: HashMap>, + /// Evicted peers + evicted: HashSet, + /// List of nodes for which we perform additional logging because they are important for the /// user. important_peers: HashSet, @@ -353,6 +368,7 @@ where chain_sync, network_service, peers: HashMap::new(), + evicted: HashSet::new(), block_announce_data_cache: LruCache::new(cache_capacity), block_announce_protocol_name, num_connected: num_connected.clone(), @@ -516,6 +532,7 @@ where }, }; peer.known_blocks.insert(hash); + peer.last_notification_received = Instant::now(); if peer.info.roles.is_full() { let is_best = match announce.state.unwrap_or(BlockState::Best) { @@ -566,6 +583,7 @@ where data: Some(data.clone()), }; + peer.last_notification_sent = Instant::now(); peer.sink.send_sync_notification(message.encode()); } } @@ -596,6 +614,35 @@ where while let Poll::Ready(()) = self.tick_timeout.poll_unpin(cx) { self.report_metrics(); + + // go over all connected peers and check if any of them have been idle for a while. Idle + // in this case means that we haven't sent or received block announcements to/from this + // peer. If that is the case, because of #5685, it could be that the block announces + // substream is not actually open and and this peer is just wasting a slot and is should + // be replaced with some other node that is willing to send us block announcements. + for (id, peer) in self.peers.iter() { + // because of a delay between disconnecting a peer in `SyncingEngine` and getting + // the response back from `Protocol`, a peer might be reported and disconnect + // multiple times. To prevent this from happening (until the underlying issue is + // fixed), keep track of evicted peers and report and disconnect them only once. + if self.evicted.contains(id) { + continue + } + + let last_received_late = + peer.last_notification_received.elapsed() > INACTIVITY_EVICT_THRESHOLD; + let last_sent_late = + peer.last_notification_sent.elapsed() > INACTIVITY_EVICT_THRESHOLD; + + if last_received_late && last_sent_late { + log::debug!(target: "sync", "evict peer {id} since it has been idling for too long"); + self.network_service.report_peer(*id, rep::INACTIVE_SUBSTREAM); + self.network_service + .disconnect_peer(*id, self.block_announce_protocol_name.clone()); + self.evicted.insert(*id); + } + } + self.tick_timeout.reset(TICK_TIMEOUT); } @@ -692,6 +739,7 @@ where }, }, sc_network::SyncEvent::NotificationStreamClosed { remote } => { + self.evicted.remove(&remote); if self.on_sync_peer_disconnected(remote).is_err() { log::trace!( target: "sync", @@ -844,6 +892,8 @@ where NonZeroUsize::new(MAX_KNOWN_BLOCKS).expect("Constant is nonzero"), ), sink, + last_notification_sent: Instant::now(), + last_notification_received: Instant::now(), }; let req = if peer.info.roles.is_full() {