From 51490844a22a5eb00203c2d280a22498c377d1e6 Mon Sep 17 00:00:00 2001 From: Pablo Deymonnaz Date: Fri, 6 Mar 2026 13:13:58 -0300 Subject: [PATCH 1/2] Exclude failed peers from block fetch retry selection Track which peers returned empty BlocksByRoot responses per block root in a HashSet instead of the write-only last_peer field. On retry, filter them out of the candidate pool so we pick a different peer. Falls back to the full peer set (with cleared tracking) if all peers have been exhausted. Fixes retry storms observed in devnet where the same lagging peer was randomly selected on every attempt, wasting all 10 retries. --- crates/net/p2p/src/lib.rs | 2 +- crates/net/p2p/src/req_resp/handlers.rs | 42 +++++++++++++++++++------ 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/crates/net/p2p/src/lib.rs b/crates/net/p2p/src/lib.rs index 2ebec1d3..ffaec47e 100644 --- a/crates/net/p2p/src/lib.rs +++ b/crates/net/p2p/src/lib.rs @@ -53,7 +53,7 @@ enum RetryMessage { pub(crate) struct PendingRequest { pub(crate) attempts: u32, - pub(crate) last_peer: Option<PeerId>, + pub(crate) failed_peers: HashSet<PeerId>, } #[allow(clippy::too_many_arguments)] diff --git a/crates/net/p2p/src/req_resp/handlers.rs b/crates/net/p2p/src/req_resp/handlers.rs index 9e50ea66..bf1ab056 100644 --- a/crates/net/p2p/src/req_resp/handlers.rs +++ b/crates/net/p2p/src/req_resp/handlers.rs @@ -1,3 +1,5 @@ +use std::collections::HashSet; + use ethlambda_storage::Store; use libp2p::{PeerId, request_response}; use rand::seq::SliceRandom; @@ -198,9 +200,31 @@ pub async fn fetch_block_from_peer(server: &mut P2PServer, root: H256) -> bool { return false; } - // Select random peer - let peers: Vec<_> = server.connected_peers.iter().copied().collect(); - let peer = match peers.choose(&mut rand::thread_rng()) { + // Exclude peers that already returned empty responses for this root + let pool: Vec<_> = { + let failed = server.pending_requests.get(&root).map(|p| &p.failed_peers); + server + .connected_peers +
.iter() + .copied() + .filter(|p| failed.is_none_or(|f| !f.contains(p))) + .collect() + }; + + // Fall back to full set if all peers have failed (new peers may have connected, + // or previously-failing peers may have caught up). Clear failed_peers so subsequent + // retries start a fresh round of elimination. + let pool = if pool.is_empty() { + warn!(%root, "All peers failed for this block, retrying with full peer set"); + if let Some(pending) = server.pending_requests.get_mut(&root) { + pending.failed_peers.clear(); + } + server.connected_peers.iter().copied().collect() + } else { + pool + }; + + let peer = match pool.choose(&mut rand::thread_rng()) { Some(&p) => p, None => { warn!(%root, "Failed to select random peer"); @@ -216,7 +240,8 @@ pub async fn fetch_block_from_peer(server: &mut P2PServer, root: H256) -> bool { } let request = BlocksByRootRequest { roots }; - info!(%peer, %root, "Sending BlocksByRoot request for missing block"); + let excluded = server.connected_peers.len() - pool.len(); + info!(%peer, %root, excluded, "Sending BlocksByRoot request for missing block"); let request_id = server .swarm .behaviour_mut() @@ -228,17 +253,14 @@ pub async fn fetch_block_from_peer(server: &mut P2PServer, root: H256) -> bool { ); // Track the request if not already tracked (new request) - let pending = server + server .pending_requests .entry(root) .or_insert(PendingRequest { attempts: 1, - last_peer: None, + failed_peers: HashSet::new(), }); - // Update last_peer - pending.last_peer = Some(peer); - // Map request_id to root for failure handling server.request_id_map.insert(request_id, root); @@ -250,6 +272,8 @@ async fn handle_fetch_failure(server: &mut P2PServer, root: H256, peer: PeerId) return; }; + pending.failed_peers.insert(peer); + if pending.attempts >= MAX_FETCH_RETRIES { error!(%root, %peer, attempts=%pending.attempts, "Block fetch failed after max retries, giving up"); From 0844aef9d079d923146e7b0e9bc31352bd346dd2 Mon Sep 17 00:00:00 2001 From: Pablo 
Deymonnaz Date: Fri, 6 Mar 2026 15:33:35 -0300 Subject: [PATCH 2/2] Skip failed-peers filter when there are no failed peers --- crates/net/p2p/src/req_resp/handlers.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/crates/net/p2p/src/req_resp/handlers.rs b/crates/net/p2p/src/req_resp/handlers.rs index bf1ab056..3deba876 100644 --- a/crates/net/p2p/src/req_resp/handlers.rs +++ b/crates/net/p2p/src/req_resp/handlers.rs @@ -201,13 +201,16 @@ pub async fn fetch_block_from_peer(server: &mut P2PServer, root: H256) -> bool { } // Exclude peers that already returned empty responses for this root - let pool: Vec<_> = { - let failed = server.pending_requests.get(&root).map(|p| &p.failed_peers); + let failed = server.pending_requests.get(&root).map(|p| &p.failed_peers); + let pool: Vec<_> = if failed.is_none_or(|f| f.is_empty()) { + server.connected_peers.iter().copied().collect() + } else { + let failed = failed.unwrap(); server .connected_peers .iter() .copied() - .filter(|p| failed.is_none_or(|f| !f.contains(p))) + .filter(|p| !failed.contains(p)) .collect() };