From 606d5bcb04b65485aa539c88e340137357a8bfac Mon Sep 17 00:00:00 2001 From: Quang Le Date: Thu, 14 May 2026 11:16:25 +0700 Subject: [PATCH 01/28] test: update qbft test --- crates/core/src/qbft/fake_clock.rs | 58 +- crates/core/src/qbft/internal_test.rs | 1196 ++++++++++++++++++++++--- 2 files changed, 1131 insertions(+), 123 deletions(-) diff --git a/crates/core/src/qbft/fake_clock.rs b/crates/core/src/qbft/fake_clock.rs index 04e143f5..3b7baeea 100644 --- a/crates/core/src/qbft/fake_clock.rs +++ b/crates/core/src/qbft/fake_clock.rs @@ -1,6 +1,6 @@ use crossbeam::channel as mpmc; use std::{ - collections::HashMap, + collections::BTreeMap, sync::{Arc, Mutex}, thread, time::{Duration, Instant}, @@ -15,7 +15,7 @@ struct FakeClockInner { start: Instant, now: Instant, last_id: usize, - clients: HashMap, Instant)>, + clients: BTreeMap, Instant)>, } impl FakeClock { @@ -98,56 +98,82 @@ impl FakeClock { } } -impl Drop for FakeClock { - fn drop(&mut self) { - self.cancel(); - } -} - #[test] fn multiple_threads_timers() { let clock = FakeClock::new(Instant::now()); + let (done_tx, done_rx) = mpmc::bounded(2); - let start = Instant::now(); thread::scope(|s| { let c1 = clock.clone(); let (ch_1, _) = c1.new_timer(Duration::from_secs(5)); + let done_tx_1 = done_tx.clone(); s.spawn(move || { - let _ = ch_1.recv(); + done_tx_1.send(ch_1.recv().is_ok()).unwrap(); }); let c2 = clock.clone(); let (ch_2, _) = c2.new_timer(Duration::from_secs(5)); + let done_tx_2 = done_tx.clone(); s.spawn(move || { - let _ = ch_2.recv(); + done_tx_2.send(ch_2.recv().is_ok()).unwrap(); }); + clock.advance(Duration::from_secs(4)); + assert!(done_rx.try_recv().is_err()); clock.advance(Duration::from_secs(6)); }); - println!("start={:?}, clock={:?}", start.elapsed(), clock.elapsed()); + assert_eq!(vec![true, true], done_rx.try_iter().collect::>()); + assert_eq!(Duration::from_secs(10), clock.elapsed()); } #[test] fn multiple_threads_cancellation() { let clock = FakeClock::new(Instant::now()); + let (done_tx, done_rx) = mpmc::bounded(2); - let start = Instant::now(); thread::scope(|s| { let c1 = clock.clone(); let (ch_1, _) = c1.new_timer(Duration::from_secs(5)); + let done_tx_1 = done_tx.clone(); s.spawn(move || { - let _ = ch_1.recv(); + done_tx_1.send(ch_1.recv().is_err()).unwrap(); }); let c2 = clock.clone(); let (ch_2, _) = c2.new_timer(Duration::from_secs(5)); + let done_tx_2 = done_tx.clone(); s.spawn(move || { - let _ = ch_2.recv(); + done_tx_2.send(ch_2.recv().is_err()).unwrap(); }); clock.cancel(); }); - println!("start={:?}, clock={:?}", start.elapsed(), clock.elapsed()); + assert_eq!(vec![true, true], done_rx.try_iter().collect::>()); + assert_eq!(Duration::ZERO, clock.elapsed()); +} + +#[test] +fn cancel_one_timer_only() { + let clock = FakeClock::new(Instant::now()); + let (ch_1, cancel_1) = clock.new_timer(Duration::from_secs(5)); + let (ch_2, _) = clock.new_timer(Duration::from_secs(5)); + + cancel_1(); + clock.advance(Duration::from_secs(5)); + + assert!(ch_1.try_recv().is_err()); + assert!(ch_2.try_recv().is_ok()); +} + +#[test] +fn expired_timer_delivers_once() { + let clock = FakeClock::new(Instant::now()); + let (ch, _) = clock.new_timer(Duration::from_secs(5)); + + clock.advance(Duration::from_secs(5)); + assert!(ch.try_recv().is_ok()); + clock.advance(Duration::from_secs(5)); + assert!(ch.try_recv().is_err()); } diff --git a/crates/core/src/qbft/internal_test.rs b/crates/core/src/qbft/internal_test.rs index b7770251..5c5659f2 100644 --- a/crates/core/src/qbft/internal_test.rs +++ b/crates/core/src/qbft/internal_test.rs @@ -1,10 +1,32 @@ use crate::qbft::{self, fake_clock::FakeClock, *}; use cancellation::CancellationTokenSource; use crossbeam::channel as mpmc; -use std::{collections::HashMap, sync::Arc, thread, time::Duration}; +use std::{ + collections::{BTreeMap, HashMap}, + fmt::Write as _, + panic::{self, AssertUnwindSafe}, + sync::{Arc, Mutex}, + thread, + time::Duration, +}; const WRITE_CHAN_ERR: &str = "Failed to write to channel"; const READ_CHAN_ERR: &str = "Failed to read from channel"; +const CHAIN_SPLIT_SEED: u64 = 0x4348_4149_4e53_504c; + +type RunOutcome = std::thread::Result>; +type TestMsgRef = Msg; + +struct PendingBroadcast { + deliver_at: Duration, + key: u64, + msg: TestMsgRef, +} + +enum BroadcastEvent { + Immediate(TestMsgRef), + Delayed(PendingBroadcast), +} #[derive(Default, Debug)] struct Test { @@ -28,6 +50,8 @@ struct Test { pub prepared_val: i32, /// Non-deterministic consensus at random round. pub random_round: bool, + /// Enables fuzzing by node 1. + pub fuzz: bool, } fn test_qbft(test: Test) { @@ -35,20 +59,24 @@ fn test_qbft(test: Test) { const MAX_ROUND: usize = 50; const FIFO_LIMIT: usize = 100; + let seed = test_seed(&test); + let trace = Trace::new(); let start_time = time::Instant::now(); + let real_start = time::Instant::now(); let clock = FakeClock::new(start_time); let cts = CancellationTokenSource::new(); - let mut receives = HashMap::< + let mut receives = BTreeMap::< i64, ( mpmc::Sender>, mpmc::Receiver>, ), >::new(); - let (broadcast_tx, broadcast_rx) = mpmc::unbounded::>(); + let (broadcast_tx, broadcast_rx) = mpmc::unbounded::(); + let (unjust_tx, unjust_rx) = mpmc::unbounded::(); let (result_chan_tx, result_chan_rx) = mpmc::bounded::>>(N); - let (run_chan_tx, run_chan_rx) = mpmc::bounded::>(N); + let (run_chan_tx, run_chan_rx) = mpmc::bounded::<(i64, RunOutcome)>(N); let is_leader = Box::new(make_is_leader(N as i64)); @@ -74,32 +102,46 @@ fn test_qbft(test: Test) { result_chan_tx.send(q_commit.clone()).expect(WRITE_CHAN_ERR); }) }, - compare: Box::new(|_, _, _, _, return_err, _| { - return_err.send(Ok(())).expect(WRITE_CHAN_ERR); + compare: Box::new(|_, _, _, input_value_source, return_err, return_value| { + return_value + .send(*input_value_source) + .expect(WRITE_CHAN_ERR); + send_compare_result(return_err, return_value, Ok(())); }), nodes: N as i64, fifo_limit: FIFO_LIMIT as i64, log_round_change: { let clock = clock.clone(); + let trace = trace.clone(); Box::new(move |_, process, round, new_round, upon_rule, _| { - println!( + trace.push(format!( "{:?} - {}@{} change to {} ~= {}", clock.elapsed(), process, round, new_round, upon_rule, - ); + )); + }) + }, + log_unjust: { + let trace = trace.clone(); + let unjust_tx = unjust_tx.clone(); + let fuzz = test.fuzz; + Box::new(move |_, process, msg| { + let line = format!("Unjust: process={} msg={:?}", process, msg); + trace.push(line.clone()); + if !fuzz { + unjust_tx.send(line).expect(WRITE_CHAN_ERR); + } }) }, - log_unjust: Box::new(|_, _, msg| { - println!("Unjust: {:?}", msg); - }), log_upon_rule: { let clock = clock.clone(); + let trace = trace.clone(); Box::new(move |_, process, round, msg, upon_rule| { - println!( + trace.push(format!( "{:?} {} => {}@{} -> {}@{} ~= {}", clock.elapsed(), msg.source(), @@ -108,7 +150,7 @@ fn test_qbft(test: Test) { process, round, upon_rule, - ); + )); }) }, }); @@ -122,6 +164,7 @@ fn test_qbft(test: Test) { let trans = Transport { broadcast: { let clock = clock.clone(); + let trace = trace.clone(); Box::new( move |_, type_, instance, source, round, value, pr, pv, justification| { @@ -130,16 +173,22 @@ fn test_qbft(test: Test) { } if type_ == MSG_COMMIT && round <= test.commits_after.into() { - println!( + trace.push(format!( "{:?} {} dropping commit for round {}", clock.elapsed(), source, round - ); + )); return Ok(()); } - println!("{:?} {} => {}@{}", clock.elapsed(), source, type_, round); + trace.push(format!( + "{:?} {} => {}@{}", + clock.elapsed(), + source, + type_, + round + )); let msg = new_msg( type_, @@ -159,7 +208,9 @@ fn test_qbft(test: Test) { msg.clone(), test.bcast_jitter_ms, clock.clone(), - ); // TODO: Add clock + trace.clone(), + seed, + ); Ok(()) }, @@ -168,7 +219,7 @@ fn test_qbft(test: Test) { receive: receiver.clone(), }; - let token = cts.token(); + let token = cts.token().clone(); let clock = clock.clone(); let receiver = receiver.clone(); let start_delay = test.start_delay.get(&i).copied(); @@ -177,99 +228,229 @@ fn test_qbft(test: Test) { let run_chan_tx = run_chan_tx.clone(); let defs = defs.clone(); let is_leader = is_leader.clone(); + let trace = trace.clone(); s.spawn(move || { if let Some(delay) = start_delay { - println!("{:?} Node {} start delay {:?}", clock.elapsed(), i, delay); + trace.push(format!( + "{:?} Node {} start delay {:?}", + clock.elapsed(), + i, + delay + )); let (delay_ch, _) = clock.new_timer(delay); _ = delay_ch.recv(); - println!("{:?} Node {} starting", clock.elapsed(), i); + trace.push(format!( + "{:?} Node {} starting {:?}", + clock.elapsed(), + i, + delay + )); } - // Drain any buffered messages - while !receiver.is_empty() { - _ = receiver.recv().expect(READ_CHAN_ERR); + if start_delay.is_some() { + // Drain any buffered messages + while !receiver.is_empty() { + _ = receiver.recv().expect(READ_CHAN_ERR); + } } let (v_chan_tx, v_chan_rx) = mpmc::bounded::(1); - let (_, vs_chan_rx) = mpmc::bounded::(1); + let (vs_chan_tx, vs_chan_rx) = mpmc::bounded::(1); + let mut keep_value_sender = Some(v_chan_tx); + let mut input_value_rx = v_chan_rx; if let Some(delay) = value_delay { + let v_chan_tx_send = keep_value_sender + .as_ref() + .expect("value sender kept until run returns") + .clone(); s.spawn(move || { let (delay_ch, cancel) = clock.new_timer(delay); _ = delay_ch.recv(); - _ = v_chan_tx.send(i); + _ = v_chan_tx_send.send(i); cancel(); }); } else if decide_round != 1 { + let v_chan_tx_send = keep_value_sender + .as_ref() + .expect("value sender kept until run returns") + .clone(); s.spawn(move || { - _ = v_chan_tx.send(i); + _ = v_chan_tx_send.send(i); }); } else if is_leader(&test.instance, 1, i) { + let v_chan_tx_send = keep_value_sender + .as_ref() + .expect("value sender kept until run returns") + .clone(); s.spawn(move || { - _ = v_chan_tx.send(i); + _ = v_chan_tx_send.send(i); }); + } else { + keep_value_sender = None; + input_value_rx = mpmc::never(); } - run_chan_tx - .send(qbft::run( - token, + let keepalive = (keep_value_sender, vs_chan_tx); + let run_result = panic::catch_unwind(AssertUnwindSafe(|| { + qbft::run( + &token, &defs, &trans, &test.instance, i, - v_chan_rx, + input_value_rx, vs_chan_rx, - )) - .expect(WRITE_CHAN_ERR); + ) + })); + drop(keepalive); + run_chan_tx.send((i, run_result)).expect(WRITE_CHAN_ERR); }); } - let mut results = HashMap::>::new(); + let mut results = BTreeMap::>::new(); let mut count = 0; let mut decided = false; let mut done = 0; + let mut broadcasts = 0usize; + let mut pending = Vec::::new(); + let mut next_fuzz_at = test.fuzz.then_some(Duration::from_millis(100)); + let mut fuzz_counter = 0_u64; loop { - mpmc::select! { - recv(broadcast_rx) -> msg => { - let msg = msg.expect(READ_CHAN_ERR); - for (target, (out_tx, _)) in receives.iter() { - if *target == msg.source() { - continue; // Do not broadcast to self, we sent to self already. - } - - if let Some(p) = test.drop_prob.get(&msg.source()) { - if rand::random::() < *p { - println!("{:?} {} => {}@{} => {} (dropped)", clock.elapsed(), msg.source(), msg.type_(), msg.round(), target); - continue; // Drop - } - } + broadcasts += deliver_ready_broadcasts( + &mut pending, + &receives, + &test.drop_prob, + seed, + &trace, + &clock, + ); + + while let Some(next) = next_fuzz_at { + if clock.elapsed() < next { + break; + } - out_tx.send(msg.clone()).expect(WRITE_CHAN_ERR); + let msg = random_msg(test.instance, 1, seed, fuzz_counter); + fuzz_counter = fuzz_counter.wrapping_add(1); + trace.push(format!( + "{:?} fuzz {} => {}@{}", + clock.elapsed(), + msg.source(), + msg.type_(), + msg.round() + )); + broadcasts += + fanout_broadcast(&receives, &test.drop_prob, seed, &trace, &clock, msg); + next_fuzz_at = Some(next + Duration::from_millis(100)); + } - if rand::random::() < 0.1 { // Send 10% messages twice - out_tx.send(msg.clone()).expect(WRITE_CHAN_ERR); + mpmc::select! { + recv(broadcast_rx) -> event => { + match event.expect(READ_CHAN_ERR) { + BroadcastEvent::Immediate(msg) => { + broadcasts += fanout_broadcast( + &receives, + &test.drop_prob, + seed, + &trace, + &clock, + msg, + ); } + BroadcastEvent::Delayed(delayed) => pending.push(delayed), + } + clock.advance(Duration::from_millis(1)); + if clock.elapsed() > Duration::from_secs(180) || real_start.elapsed() > Duration::from_secs(20) { + cts.cancel(); + clock.cancel(); + panic!( + "qbft test hang: decided={} done={} count={} elapsed={:?} real_elapsed={:?} broadcasts={} seed={}\n{}", + decided, + done, + count, + clock.elapsed(), + real_start.elapsed(), + broadcasts, + seed, + trace.dump() + ); } } + recv(unjust_rx) -> unjust => { + let unjust = unjust.expect(READ_CHAN_ERR); + cts.cancel(); + clock.cancel(); + panic!("unjust message: {unjust} elapsed={:?} seed={}\n{}", clock.elapsed(), seed, trace.dump()); + } + recv(result_chan_rx) -> res => { let q_commit = res.expect(READ_CHAN_ERR); for commit in q_commit.clone() { for (_, previous) in results.iter() { - assert_eq!(previous.value(), commit.value(), "commit values"); + if previous.value() != commit.value() { + cts.cancel(); + clock.cancel(); + panic!( + "commit values differ: previous={:?} commit={:?} elapsed={:?} seed={}\n{}", + previous, + commit, + clock.elapsed(), + seed, + trace.dump() + ); + } } if !test.random_round { - assert_eq!(i64::from(test.decide_round), commit.round(), "wrong decide round"); + if i64::from(test.decide_round) != commit.round() { + cts.cancel(); + clock.cancel(); + panic!( + "wrong decide round: want={} got={} commit={:?} elapsed={:?} seed={}\n{}", + test.decide_round, + commit.round(), + commit, + clock.elapsed(), + seed, + trace.dump() + ); + } if test.prepared_val != 0 { // Check prepared value if set - assert_eq!(i64::from(test.prepared_val), commit.value(), "wrong prepared value"); + if i64::from(test.prepared_val) != commit.value() { + cts.cancel(); + clock.cancel(); + panic!( + "wrong prepared value: want={} got={} commit={:?} elapsed={:?} seed={}\n{}", + test.prepared_val, + commit.value(), + commit, + clock.elapsed(), + seed, + trace.dump() + ); + } } else { // Otherwise check that leader value was used. - assert!(is_leader(&test.instance, commit.round(), commit.value()), "not leader"); + if !is_leader(&test.instance, commit.round(), commit.value()) { + cts.cancel(); + clock.cancel(); + panic!( + "not leader value: instance={} round={} value={} commit={:?} elapsed={:?} seed={}\n{}", + test.instance, + commit.round(), + commit.value(), + commit, + clock.elapsed(), + seed, + trace.dump() + ); + } } } @@ -282,7 +463,7 @@ fn test_qbft(test: Test) { } let round = q_commit[0].round(); - println!("Got all results in round {} after {:?}: {:?}", round, clock.elapsed(), results); + trace.push(format!("Got all results in round {} after {:?}: {:?}", round, clock.elapsed(), results)); // Trigger shutdown decided = true; @@ -292,11 +473,24 @@ fn test_qbft(test: Test) { } recv(run_chan_rx) -> res => { - let err = res.expect(READ_CHAN_ERR); + let (node, outcome) = res.expect(READ_CHAN_ERR); - if err.is_err() { + if !matches!(outcome, Ok(Ok(()))) { if !decided { - panic!("unexpected run error"); + cts.cancel(); + clock.cancel(); + panic!( + "unexpected run error: node={} outcome={} decided={} done={} count={} elapsed={:?} broadcasts={} seed={}\n{}", + node, + format_run_outcome(&outcome), + decided, + done, + count, + clock.elapsed(), + broadcasts, + seed, + trace.dump() + ); } } @@ -309,12 +503,98 @@ fn test_qbft(test: Test) { default => { thread::sleep(time::Duration::from_micros(1)); clock.advance(Duration::from_millis(1)); + if clock.elapsed() > Duration::from_secs(180) || real_start.elapsed() > Duration::from_secs(20) { + cts.cancel(); + clock.cancel(); + panic!( + "qbft test hang: decided={} done={} count={} elapsed={:?} real_elapsed={:?} broadcasts={} seed={}\n{}", + decided, + done, + count, + clock.elapsed(), + real_start.elapsed(), + broadcasts, + seed, + trace.dump() + ); + } } } } }); } +#[derive(Clone, Default)] +struct Trace(Arc>>); + +impl Trace { + fn new() -> Self { + Self::default() + } + + fn push(&self, line: String) { + self.0.lock().unwrap().push(line); + } + + fn dump(&self) -> String { + let lines = self.0.lock().unwrap(); + let start = lines.len().saturating_sub(200); + let mut out = String::new(); + for line in &lines[start..] { + let _ = writeln!(out, "{line}"); + } + out + } +} + +fn format_run_outcome(outcome: &RunOutcome) -> String { + match outcome { + Ok(Ok(())) => "ok".to_string(), + Ok(Err(err)) => format!("error {err:?}"), + Err(payload) => { + if let Some(msg) = payload.downcast_ref::<&str>() { + format!("panic {msg}") + } else if let Some(msg) = payload.downcast_ref::() { + format!("panic {msg}") + } else { + "panic ".to_string() + } + } + } +} + +fn outcome_is_error(outcome: &RunOutcome, expected: fn(&QbftError) -> bool) -> bool { + matches!(outcome, Ok(Err(err)) if expected(err)) +} + +fn send_compare_result( + return_err: &mpmc::Sender>, + return_value: &mpmc::Sender, + result: Result<()>, +) { + // Keep the auxiliary compare channels alive until the result is consumed. + // Go channels do not close when the callback returns; Rust senders do. + let keepalive = return_value.clone(); + thread::spawn(move || { + thread::sleep(Duration::from_millis(50)); + drop(keepalive); + }); + return_err.send(result).expect(WRITE_CHAN_ERR); +} + +fn test_seed(test: &Test) -> u64 { + let mut seed = 0x5142_4654_5445_5354_u64; + seed ^= test.instance as u64; + seed ^= u64::from(test.const_period) << 8; + seed ^= (test.bcast_jitter_ms as u64) << 16; + seed ^= (test.commits_after as u64) << 32; + seed ^= (test.decide_round as u64) << 40; + seed ^= (test.prepared_val as u64) << 48; + seed ^= u64::from(test.random_round) << 56; + seed ^= u64::from(test.fuzz) << 57; + seed +} + /// Construct a leader election function. fn make_is_leader(n: i64) -> impl Fn(&i64, i64, i64) -> bool + Clone { move |instance: &i64, round: i64, process: i64| -> bool { (instance + round) % n == process } @@ -365,32 +645,167 @@ fn new_msg( // Delays the message broadcast by between 1x and 2x jitter_ms and drops // messages. fn bcast( - broadcast: mpmc::Sender>, + broadcast: mpmc::Sender, msg: Msg, jitter_ms: i32, clock: FakeClock, + trace: Trace, + seed: u64, ) { if jitter_ms == 0 { - broadcast.send(msg.clone()).expect(WRITE_CHAN_ERR); + broadcast + .send(BroadcastEvent::Immediate(msg.clone())) + .expect(WRITE_CHAN_ERR); return; } - thread::spawn(move || { - let delta_ms = (f64::from(jitter_ms) * rand::random::()) as i32; - let delay = Duration::from_millis((jitter_ms + delta_ms) as u64); - println!( - "{:?} {} => {}@{} (bcast delay {:?})", - clock.elapsed(), - msg.source(), - msg.type_(), - msg.round(), - delay - ); - let (delay_ch, _) = clock.new_timer(delay); - _ = delay_ch.recv(); + let delta_ms = (f64::from(jitter_ms) * deterministic_unit(seed, &msg, 0, 3)) as i32; + let delay = Duration::from_millis((jitter_ms + delta_ms) as u64); + trace.push(format!( + "{:?} {} => {}@{} (bcast delay {:?})", + clock.elapsed(), + msg.source(), + msg.type_(), + msg.round(), + delay + )); + let key = deterministic_msg_u64(seed, &msg, 0, 4); + broadcast + .send(BroadcastEvent::Delayed(PendingBroadcast { + deliver_at: clock.elapsed() + delay, + key, + msg, + })) + .expect(WRITE_CHAN_ERR); +} - _ = broadcast.send(msg); - }); +fn deliver_ready_broadcasts( + pending: &mut Vec, + receives: &BTreeMap, mpmc::Receiver)>, + drop_prob: &HashMap, + seed: u64, + trace: &Trace, + clock: &FakeClock, +) -> usize { + pending.sort_by_key(|delayed| (delayed.deliver_at, delayed.key)); + let ready_count = pending + .iter() + .take_while(|delayed| delayed.deliver_at <= clock.elapsed()) + .count(); + let ready = pending.drain(..ready_count).collect::>(); + + ready + .into_iter() + .map(|delayed| fanout_broadcast(receives, drop_prob, seed, trace, clock, delayed.msg)) + .sum() +} + +fn fanout_broadcast( + receives: &BTreeMap, mpmc::Receiver)>, + drop_prob: &HashMap, + seed: u64, + trace: &Trace, + clock: &FakeClock, + msg: TestMsgRef, +) -> usize { + let mut broadcasts = 0; + for (target, (out_tx, _)) in receives.iter() { + if *target == msg.source() { + continue; // Do not broadcast to self, we sent to self already. + } + + if let Some(p) = drop_prob.get(&msg.source()) { + if deterministic_unit(seed, &msg, *target, 1) < *p { + trace.push(format!( + "{:?} {} => {}@{} => {} (dropped)", + clock.elapsed(), + msg.source(), + msg.type_(), + msg.round(), + target + )); + continue; + } + } + + out_tx.send(msg.clone()).expect(WRITE_CHAN_ERR); + broadcasts += 1; + + if deterministic_unit(seed, &msg, *target, 2) < 0.1 { + out_tx.send(msg.clone()).expect(WRITE_CHAN_ERR); + broadcasts += 1; + trace.push(format!( + "{:?} {} => {}@{} => {} (duplicate)", + clock.elapsed(), + msg.source(), + msg.type_(), + msg.round(), + target + )); + } + } + + broadcasts +} + +fn random_msg(instance: i64, peer_idx: i64, seed: u64, counter: u64) -> Msg { + let message_types = [ + MSG_PRE_PREPARE, + MSG_PREPARE, + MSG_COMMIT, + MSG_ROUND_CHANGE, + MSG_DECIDED, + ]; + new_msg( + message_types[deterministic_range(seed, counter, 10, message_types.len())], + instance, + peer_idx, + deterministic_i64(seed, counter, 11, 10), + deterministic_i64(seed, counter, 12, 10), + 0, + deterministic_i64(seed, counter, 13, 10), + deterministic_i64(seed, counter, 14, 10), + None, + ) +} + +fn deterministic_unit(seed: u64, msg: &Msg, target: i64, salt: u64) -> f64 { + let value = deterministic_msg_u64(seed, msg, target, salt) >> 11; + value as f64 / ((1_u64 << 53) as f64) +} + +fn deterministic_msg_u64(seed: u64, msg: &Msg, target: i64, salt: u64) -> u64 { + let mut value = splitmix64(seed ^ salt); + value = splitmix64(value ^ i64_to_u64(msg.type_().0)); + value = splitmix64(value ^ i64_to_u64(msg.instance())); + value = splitmix64(value ^ i64_to_u64(msg.source())); + value = splitmix64(value ^ i64_to_u64(msg.round())); + value = splitmix64(value ^ i64_to_u64(msg.value())); + value = splitmix64(value ^ i64_to_u64(msg.value_source().unwrap_or_default())); + value = splitmix64(value ^ i64_to_u64(msg.prepared_round())); + value = splitmix64(value ^ i64_to_u64(msg.prepared_value())); + splitmix64(value ^ i64_to_u64(target)) +} + +fn deterministic_range(seed: u64, counter: u64, salt: u64, upper: usize) -> usize { + let upper = u64::try_from(upper).expect("upper fits in u64"); + usize::try_from(splitmix64(seed ^ counter ^ salt) % upper).expect("range fits in usize") +} + +fn deterministic_i64(seed: u64, counter: u64, salt: u64, upper: i64) -> i64 { + let upper = u64::try_from(upper).expect("upper is positive"); + i64::try_from(splitmix64(seed ^ counter ^ salt) % upper).expect("range fits in i64") +} + +fn splitmix64(mut value: u64) -> u64 { + value = value.wrapping_add(0x9e37_79b9_7f4a_7c15); + value = (value ^ (value >> 30)).wrapping_mul(0xbf58_476d_1ce4_e5b9); + value = (value ^ (value >> 27)).wrapping_mul(0x94d0_49bb_1331_11eb); + value ^ (value >> 31) +} + +fn i64_to_u64(value: i64) -> u64 { + u64::from_le_bytes(value.to_le_bytes()) } #[derive(Clone, Debug)] @@ -455,7 +870,6 @@ impl SomeMsg for TestMsg { } #[test] -#[ignore = "flaky"] fn happy_0() { test_qbft(Test { instance: 0, @@ -465,7 +879,6 @@ fn happy_0() { } #[test] -#[ignore = "flaky"] fn happy_1() { test_qbft(Test { instance: 1, @@ -475,7 +888,6 @@ fn happy_1() { } #[test] -#[ignore = "flaky"] fn prepare_round_1_decide_round_2() { test_qbft(Test { instance: 0, @@ -487,12 +899,11 @@ fn prepare_round_1_decide_round_2() { } #[test] -#[ignore = "wrong prepared value"] fn prepare_round_2_decide_round_3() { test_qbft(Test { instance: 0, commits_after: 2, - value_delay: HashMap::from([(1, Duration::from_millis(200))]), + value_delay: HashMap::from([(1, Duration::from_secs(2))]), decide_round: 3, prepared_val: 2, const_period: true, @@ -501,32 +912,27 @@ fn prepare_round_2_decide_round_3() { } #[test] -#[ignore = "wrong decide round"] -fn leader_late_xp() { +fn leader_late_exp() { test_qbft(Test { instance: 0, - start_delay: HashMap::from([(1, Duration::from_millis(200))]), + start_delay: HashMap::from([(1, Duration::from_secs(2))]), decide_round: 2, ..Default::default() }); } #[test] -#[ignore = "wrong decide round"] fn leader_down_const() { test_qbft(Test { - instance: 3, - start_delay: HashMap::from([ - (1, Duration::from_millis(50)), - (2, Duration::from_millis(100)), - ]), - decide_round: 4, + instance: 0, + start_delay: HashMap::from([(1, Duration::from_secs(2))]), + const_period: true, + decide_round: 2, ..Default::default() }); } #[test] -#[ignore = "flaky"] fn very_late_exp() { test_qbft(Test { instance: 3, @@ -537,7 +943,6 @@ fn very_late_exp() { } #[test] -#[ignore = "flaky"] fn very_late_const() { test_qbft(Test { instance: 1, @@ -549,7 +954,6 @@ fn very_late_const() { } #[test] -#[ignore = "flaky"] fn stagger_start_exp() { test_qbft(Test { instance: 0, @@ -565,7 +969,6 @@ fn stagger_start_exp() { } #[test] -#[ignore = "flaky"] fn stagger_start_const() { test_qbft(Test { instance: 0, @@ -582,7 +985,6 @@ fn stagger_start_const() { } #[test] -#[ignore = "flaky"] fn very_delayed_value_exp() { test_qbft(Test { instance: 3, @@ -593,7 +995,6 @@ fn very_delayed_value_exp() { } #[test] -#[ignore = "flaky"] fn very_delayed_value_const() { test_qbft(Test { instance: 1, @@ -605,7 +1006,6 @@ fn very_delayed_value_const() { } #[test] -#[ignore = "flaky"] fn stagger_delayed_value_exp() { test_qbft(Test { instance: 0, @@ -621,7 +1021,6 @@ fn stagger_delayed_value_exp() { } #[test] -#[ignore = "flaky"] fn stagger_delayed_value_const() { test_qbft(Test { instance: 0, @@ -638,7 +1037,6 @@ fn stagger_delayed_value_const() { } #[test] -#[ignore = "flaky"] fn round1_leader_no_value_round2_leader_offline() { test_qbft(Test { instance: 0, @@ -651,7 +1049,6 @@ fn round1_leader_no_value_round2_leader_offline() { } #[test] -#[ignore = "flaky"] fn jitter_500ms_exp() { test_qbft(Test { instance: 3, @@ -662,7 +1059,6 @@ fn jitter_500ms_exp() { } #[test] -#[ignore = "flaky"] fn jitter_200ms_const() { test_qbft(Test { instance: 3, @@ -674,7 +1070,6 @@ fn jitter_200ms_const() { } #[test] -#[ignore = "flaky"] fn drop_10_percent_const() { test_qbft(Test { instance: 1, @@ -686,7 +1081,6 @@ fn drop_10_percent_const() { } #[test] -#[ignore = "flaky"] fn drop_30_percent_const() { test_qbft(Test { instance: 1, @@ -697,6 +1091,41 @@ fn drop_30_percent_const() { }); } +#[test] +fn fuzz() { + test_qbft(Test { + instance: 1, + fuzz: true, + const_period: true, + decide_round: 1, + ..Default::default() + }); +} + +#[test] +fn fuzz_with_late_leader() { + test_qbft(Test { + instance: 1, + fuzz: true, + start_delay: HashMap::from([(1, Duration::from_secs(2)), (2, Duration::from_secs(2))]), + const_period: true, + random_round: true, + ..Default::default() + }); +} + +#[test] +fn fuzz_with_very_late_leader() { + test_qbft(Test { + instance: 1, + fuzz: true, + start_delay: HashMap::from([(1, Duration::from_secs(10)), (2, Duration::from_secs(10))]), + const_period: true, + random_round: true, + ..Default::default() + }); +} + fn noop_definition() -> Definition { Definition { is_leader: Box::new(|_, _, _| false), @@ -719,7 +1148,71 @@ fn noop_transport() -> Transport { } #[test] -#[ignore = "flaky"] +fn formulas() { + let expected = [ + (1, 1, 0), + (2, 2, 0), + (3, 2, 0), + (4, 3, 1), + (5, 4, 1), + (6, 4, 1), + (7, 5, 2), + (8, 6, 2), + (9, 6, 2), + (10, 7, 3), + (11, 8, 3), + (12, 8, 3), + (13, 9, 4), + (14, 10, 4), + (15, 10, 4), + (16, 11, 5), + (17, 12, 5), + (18, 12, 5), + (19, 13, 6), + (20, 14, 6), + (21, 14, 6), + (22, 15, 7), + ]; + + for (n, q, f) in expected { + let d = Definition:: { + nodes: n, + ..noop_definition() + }; + assert_eq!(q, d.quorum(), "Quorum given N={n}"); + assert_eq!(f, d.faulty(), "Faulty given N={n}"); + } +} + +#[test] +fn is_justified_pre_prepare_mixed_round_change_prepare_fixture() { + let preprepare = new_msg( + MSG_PRE_PREPARE, + 1, + 3, + 6, + 2, + 0, + 0, + 0, + Some(&vec![ + new_msg(MSG_ROUND_CHANGE, 1, 2, 6, 0, 0, 2, 3, None), + new_msg(MSG_ROUND_CHANGE, 1, 3, 6, 0, 0, 2, 3, None), + new_msg(MSG_ROUND_CHANGE, 1, 1, 6, 0, 0, 2, 2, None), + new_msg(MSG_PREPARE, 1, 3, 2, 2, 0, 0, 0, None), + new_msg(MSG_PREPARE, 1, 4, 2, 2, 0, 0, 0, None), + new_msg(MSG_PREPARE, 1, 1, 2, 2, 0, 0, 0, None), + new_msg(MSG_PREPARE, 1, 2, 2, 2, 0, 0, 0, None), + ]), + ); + let mut def = noop_definition(); + def.nodes = 4; + def.is_leader = Box::new(make_is_leader(4)); + + assert!(is_justified_pre_prepare(&def, &1, &preprepare, 0)); +} + +#[test] fn duplicate_pre_prepare_rules() { let cts = CancellationTokenSource::new(); let ct = &cts.token().clone(); @@ -760,8 +1253,11 @@ fn duplicate_pre_prepare_rules() { panic!("unexpected round {}", round); }); - def.compare = Box::new(|_, _, _, _, return_err, _| { - _ = return_err.send(Ok(())); + def.compare = Box::new(|_, _, _, input_value_source, return_err, return_value| { + return_value + .send(*input_value_source) + .expect(WRITE_CHAN_ERR); + send_compare_result(return_err, return_value, Ok(())); }); let (r_chan_tx, r_chan_rx) = mpmc::bounded::>(2); @@ -788,3 +1284,489 @@ fn duplicate_pre_prepare_rules() { assert!(res.is_ok()); } + +#[test] +fn classify_rules() { + let mut def = noop_definition(); + def.nodes = 4; + def.is_leader = Box::new(make_is_leader(4)); + + let preprepare = new_msg(MSG_PRE_PREPARE, 0, 1, 1, 1, 0, 0, 0, None); + assert!(classify(&def, &0, 1, 2, &HashMap::new(), &preprepare).0 == UPON_JUSTIFIED_PRE_PREPARE); + + let prepares = vec![ + new_msg(MSG_PREPARE, 0, 1, 1, 2, 0, 0, 0, None), + new_msg(MSG_PREPARE, 0, 2, 1, 2, 0, 0, 0, None), + new_msg(MSG_PREPARE, 0, 3, 1, 2, 0, 0, 0, None), + ]; + let buffer = buffer_by_source(&prepares); + assert!(classify(&def, &0, 1, 2, &buffer, &prepares[2]).0 == UPON_QUORUM_PREPARES); + + let commits = vec![ + new_msg(MSG_COMMIT, 0, 1, 1, 2, 0, 0, 0, None), + new_msg(MSG_COMMIT, 0, 2, 1, 2, 0, 0, 0, None), + new_msg(MSG_COMMIT, 0, 3, 1, 2, 0, 0, 0, None), + ]; + let buffer = buffer_by_source(&commits); + assert!(classify(&def, &0, 1, 2, &buffer, &commits[2]).0 == UPON_QUORUM_COMMITS); + + let future_round_changes = vec![ + new_msg(MSG_ROUND_CHANGE, 0, 1, 3, 0, 0, 0, 0, None), + new_msg(MSG_ROUND_CHANGE, 0, 2, 3, 0, 0, 0, 0, None), + ]; + let buffer = buffer_by_source(&future_round_changes); + assert!( + classify(&def, &0, 1, 2, &buffer, &future_round_changes[1]).0 == UPON_F_PLUS1_ROUND_CHANGES + ); + + let unjust_round_changes = vec![ + new_msg(MSG_ROUND_CHANGE, 0, 1, 1, 0, 0, 2, 9, None), + new_msg(MSG_ROUND_CHANGE, 0, 2, 1, 0, 0, 2, 9, None), + new_msg(MSG_ROUND_CHANGE, 0, 3, 1, 0, 0, 2, 9, None), + ]; + let buffer = buffer_by_source(&unjust_round_changes); + assert!( + classify(&def, &0, 1, 2, &buffer, &unjust_round_changes[2]).0 + == UPON_UNJUST_QUORUM_ROUND_CHANGES + ); +} + +#[test] +fn justified_qrc_j1_and_j2() { + let mut def = noop_definition(); + def.nodes = 4; + let j1 = vec![ + new_msg(MSG_ROUND_CHANGE, 0, 1, 2, 0, 0, 0, 0, None), + new_msg(MSG_ROUND_CHANGE, 0, 2, 2, 0, 0, 0, 0, None), + new_msg(MSG_ROUND_CHANGE, 0, 3, 2, 0, 0, 0, 0, None), + ]; + assert_eq!(Some(0), contains_justified_qrc(&def, &j1, 2)); + assert_eq!(3, get_justified_qrc(&def, &j1, 2).unwrap().len()); + + let j2 = vec![ + new_msg(MSG_ROUND_CHANGE, 0, 1, 2, 0, 0, 1, 7, None), + new_msg(MSG_ROUND_CHANGE, 0, 2, 2, 0, 0, 1, 7, None), + new_msg(MSG_ROUND_CHANGE, 0, 3, 2, 0, 0, 0, 0, None), + new_msg(MSG_PREPARE, 0, 1, 1, 7, 0, 0, 0, None), + new_msg(MSG_PREPARE, 0, 2, 1, 7, 0, 0, 0, None), + new_msg(MSG_PREPARE, 0, 3, 1, 7, 0, 0, 0, None), + ]; + assert_eq!(Some(7), contains_justified_qrc(&def, &j2, 2)); + assert!(get_justified_qrc(&def, &j2, 2).unwrap().len() >= 6); +} + +#[test] +fn filter_msgs_keeps_one_per_source() { + let msgs = vec![ + new_msg(MSG_PREPARE, 0, 1, 1, 7, 0, 0, 0, None), + new_msg(MSG_PREPARE, 0, 1, 1, 7, 0, 0, 0, None), + new_msg(MSG_PREPARE, 0, 2, 1, 7, 0, 0, 0, None), + ]; + + let filtered = filter_msgs(&msgs, MSG_PREPARE, 1, Some(&7), None, None); + + assert_eq!(2, filtered.len()); + assert_eq!( + vec![1, 2], + filtered.iter().map(|msg| msg.source()).collect::>() + ); +} + +#[test] +fn compare_success_error_cached_value_source_and_timeout() { + let cts = CancellationTokenSource::new(); + let msg = new_msg(MSG_PRE_PREPARE, 0, 1, 1, 7, 11, 0, 0, None); + let (_vs_tx, vs_rx) = mpmc::bounded::(1); + let timer = mpmc::never(); + let mut def = noop_definition(); + def.compare = Box::new(|_, _, _, input_value_source, return_err, return_value| { + return_value + .send(*input_value_source) + .expect(WRITE_CHAN_ERR); + send_compare_result(return_err, return_value, Ok(())); + }); + assert!(matches!( + compare(cts.token(), &def, &msg, &vs_rx, 0, &timer), + Ok(0) + )); + + let mut def = noop_definition(); + def.compare = Box::new(|_, _, _, input_value_source, return_err, return_value| { + return_value + .send(*input_value_source) + .expect(WRITE_CHAN_ERR); + send_compare_result(return_err, return_value, Err(QbftError::CompareError)); + }); + assert!(matches!( + compare(cts.token(), &def, &msg, &vs_rx, 0, &timer), + Err(QbftError::CompareError) + )); + + let (vs_tx, vs_rx) = mpmc::bounded::(1); + vs_tx.send(42).expect(WRITE_CHAN_ERR); + let mut def = noop_definition(); + def.compare = Box::new( + |_, _, input_value_source_ch, input_value_source, return_err, return_value| { + let cached = if *input_value_source == 0 { + let value = input_value_source_ch.recv().expect(READ_CHAN_ERR); + return_value.send(value).expect(WRITE_CHAN_ERR); + value + } else { + *input_value_source + }; + assert_eq!(42, cached); + send_compare_result(return_err, return_value, Ok(())); + }, + ); + assert!(matches!( + compare(cts.token(), &def, &msg, &vs_rx, 0, &timer), + Ok(42) + )); + + let (timer_tx, timer_rx) = mpmc::bounded(1); + timer_tx.send(time::Instant::now()).expect(WRITE_CHAN_ERR); + let mut def = noop_definition(); + def.compare = Box::new(|_, _, _, _, return_err, _| { + thread::sleep(Duration::from_millis(20)); + let _ = return_err.send(Ok(())); + }); + assert!(matches!( + compare(cts.token(), &def, &msg, &vs_rx, 0, &timer_rx), + Err(QbftError::TimeoutError) + )); +} + +fn buffer_by_source(msgs: &[Msg]) -> HashMap>> { + let mut buffer = HashMap::new(); + for msg in msgs { + buffer + .entry(msg.source()) + .or_insert_with(Vec::new) + .push(msg.clone()); + } + buffer +} + +#[derive(Debug)] +struct ChainSplitTest { + value_source: HashMap, + decide_round: i32, + prepared_val: i32, + should_halt: bool, +} + +#[test] +fn chain_split_same_value() { + test_qbft_chain_split(ChainSplitTest { + decide_round: 1, + value_source: HashMap::from([(1, 1), (2, 1), (3, 1), (4, 1)]), + prepared_val: 1, + should_halt: false, + }); +} + +#[test] +fn chain_split_non_leader_peer_has_different_value() { + test_qbft_chain_split(ChainSplitTest { + decide_round: 1, + value_source: HashMap::from([(1, 1), (2, 3), (3, 1), (4, 1)]), + prepared_val: 1, + should_halt: false, + }); +} + +#[test] +fn chain_split_first_leader_has_different_value_second_leader_succeeds() { + test_qbft_chain_split(ChainSplitTest { + decide_round: 2, + value_source: HashMap::from([(1, 3), (2, 1), (3, 1), (4, 1)]), + prepared_val: 1, + should_halt: false, + }); +} + +#[test] +fn zz_chain_split_no_consensus_halt() { + test_qbft_chain_split(ChainSplitTest { + decide_round: 0, + value_source: HashMap::from([(1, 1), (2, 1), (3, 3), (4, 3)]), + prepared_val: 0, + should_halt: true, + }); +} + +fn test_qbft_chain_split(test: ChainSplitTest) { + const N: usize = 4; + const MAX_ROUND: i64 = 10; + const FIFO_LIMIT: i64 = 100; + + let clock = FakeClock::new(time::Instant::now()); + let cts = CancellationTokenSource::new(); + let trace = Trace::new(); + let mut receives = BTreeMap::< + i64, + ( + mpmc::Sender>, + mpmc::Receiver>, + ), + >::new(); + let (broadcast_tx, broadcast_rx) = mpmc::unbounded::>(); + let (result_chan_tx, result_chan_rx) = mpmc::bounded::>>(N); + let (run_chan_tx, run_chan_rx) = mpmc::bounded::<(i64, RunOutcome)>(N); + let instance = 0; + + let defs = Arc::new(Definition { + is_leader: Box::new(make_is_leader(N as i64)), + new_timer: { + let clock = clock.clone(); + Box::new(move |round| { + clock.new_timer(Duration::from_secs(u64::pow(2, (round as u32) - 1))) + }) + }, + decide: { + let result_chan_tx = result_chan_tx.clone(); + Box::new(move |_, _, _, q_commit| { + result_chan_tx.send(q_commit.clone()).expect(WRITE_CHAN_ERR); + }) + }, + compare: Box::new( + |_, qcommit, input_value_source_ch, input_value_source, return_err, return_value| { + let leader_value_source = qcommit.value_source().expect("value source"); + let local = if *input_value_source == 0 { + let value = input_value_source_ch.recv().expect(READ_CHAN_ERR); + return_value.send(value).expect(WRITE_CHAN_ERR); + value + } else { + *input_value_source + }; + + if leader_value_source != local { + return_value.send(local).expect(WRITE_CHAN_ERR); + send_compare_result(return_err, return_value, Err(QbftError::CompareError)); + return; + } + + return_value.send(local).expect(WRITE_CHAN_ERR); + send_compare_result(return_err, return_value, Ok(())); + }, + ), + nodes: N as i64, + fifo_limit: FIFO_LIMIT, + log_round_change: { + let clock = clock.clone(); + let trace = trace.clone(); + Box::new(move |_, process, round, new_round, upon_rule, _| { + trace.push(format!( + "{:?} - {}@{} change to {} ~= {}", + clock.elapsed(), + process, + round, + new_round, + upon_rule + )); + }) + }, + log_unjust: { + let trace = trace.clone(); + Box::new(move |_, process, msg| { + trace.push(format!("Unjust: process={} msg={:?}", process, msg)) + }) + }, + log_upon_rule: { + let clock = clock.clone(); + let trace = trace.clone(); + Box::new(move |_, process, round, msg, upon_rule| { + trace.push(format!( + "{:?} {} => {}@{} -> {}@{} ~= {}", + clock.elapsed(), + msg.source(), + msg.type_(), + msg.round(), + process, + round, + upon_rule + )); + }) + }, + }); + + thread::scope(|s| { + for i in 1..=N as i64 { + let (sender, receiver) = mpmc::bounded::>(1000); + receives.insert(i, (sender.clone(), receiver.clone())); + let broadcast_tx = broadcast_tx.clone(); + let trace = trace.clone(); + let clock = clock.clone(); + + let transport = Transport { + broadcast: Box::new( + move |_, type_, instance, source, round, value, pr, pv, justification| { + if round > MAX_ROUND { + return Err(QbftError::MaxRoundReached); + } + + trace.push(format!( + "{:?} {} => {}@{}", + clock.elapsed(), + source, + type_, + round + )); + let msg = new_msg( + type_, + *instance, + source, + round, + *value, + *value, + pr, + *pv, + justification, + ); + sender.send(msg.clone()).expect(WRITE_CHAN_ERR); + broadcast_tx.send(msg).expect(WRITE_CHAN_ERR); + Ok(()) + }, + ), + receive: receiver, + }; + + let token = cts.token().clone(); + let defs = defs.clone(); + let run_chan_tx = run_chan_tx.clone(); + let value_source = test.value_source[&i]; + s.spawn(move || { + let (v_tx, v_rx) = mpmc::bounded::(1); + let (vs_tx, vs_rx) = mpmc::unbounded::(); + v_tx.send(value_source).expect(WRITE_CHAN_ERR); + for _ in 0..=MAX_ROUND { + vs_tx.send(value_source).expect(WRITE_CHAN_ERR); + } + let run_result = panic::catch_unwind(AssertUnwindSafe(|| { + qbft::run(&token, &defs, &transport, &instance, i, v_rx, vs_rx) + })); + drop(v_tx); + drop(vs_tx); + run_chan_tx.send((i, run_result)).expect(WRITE_CHAN_ERR); + }); + } + + let mut results = BTreeMap::>::new(); + let mut count = 0; + let mut decided = false; + let mut done = 0; + + loop { + mpmc::select! { + recv(broadcast_rx) -> msg => { + let msg = msg.expect(READ_CHAN_ERR); + for (target, (out_tx, _)) in receives.iter() { + if *target == msg.source() { + continue; + } + out_tx.send(msg.clone()).expect(WRITE_CHAN_ERR); + if deterministic_unit(CHAIN_SPLIT_SEED, &msg, *target, 1) < 0.1 { + out_tx.send(msg.clone()).expect(WRITE_CHAN_ERR); + } + } + } + recv(result_chan_rx) -> res => { + let q_commit = res.expect(READ_CHAN_ERR); + for commit in q_commit.clone() { + for previous in results.values() { + if previous.value() != commit.value() { + cts.cancel(); + clock.cancel(); + panic!( + "chain split commit values differ: previous={:?} commit={:?} elapsed={:?}\n{}", + previous, + commit, + clock.elapsed(), + trace.dump() + ); + } + } + if i64::from(test.decide_round) != commit.round() { + cts.cancel(); + clock.cancel(); + panic!( + "chain split wrong decide round: want={} got={} commit={:?} elapsed={:?}\n{}", + test.decide_round, + commit.round(), + commit, + clock.elapsed(), + trace.dump() + ); + } + if test.prepared_val != 0 { + if i64::from(test.prepared_val) != commit.value() { + cts.cancel(); + clock.cancel(); + panic!( + "chain split wrong prepared value: want={} got={} commit={:?} elapsed={:?}\n{}", + test.prepared_val, + commit.value(), + commit, + clock.elapsed(), + trace.dump() + ); + } + } + results.insert(commit.source(), commit); + } + count += 1; + if count == N { + decided = true; + clock.cancel(); + cts.cancel(); + } + } + recv(run_chan_rx) -> res => { + let (node, outcome) = res.expect(READ_CHAN_ERR); + let expected_halt = test.should_halt + && outcome_is_error(&outcome, |err| matches!(err, QbftError::MaxRoundReached)); + if !(decided || expected_halt) { + cts.cancel(); + clock.cancel(); + panic!( + "unexpected chain split run error: node={} outcome={} decided={} done={} count={} elapsed={:?}\n{}", + node, + format_run_outcome(&outcome), + decided, + done, + count, + clock.elapsed(), + trace.dump() + ); + } + done += 1; + if done == N { + if test.should_halt { + assert!(!decided, "halt case unexpectedly decided"); + } + return; + } + } + default => { + thread::sleep(Duration::from_micros(1)); + let tick = if test.should_halt { + Duration::from_millis(100) + } else { + Duration::from_millis(1) + }; + clock.advance(tick); + let limit = if test.should_halt { + Duration::from_secs(600) + } else { + Duration::from_secs(60) + }; + if clock.elapsed() > limit { + cts.cancel(); + clock.cancel(); + panic!("chain split hang: decided={decided} done={done} count={count} elapsed={:?}\n{}", clock.elapsed(), trace.dump()); + } + } + } + } + }); +} From 0ec38dede4cac52ff4ba21a1f51d160c89efb4a9 Mon Sep 17 00:00:00 2001 From: Quang Le Date: Mon, 18 May 2026 10:22:23 +0700 Subject: [PATCH 02/28] fix: compare run on retached thread --- crates/core/src/qbft/fake_clock.rs | 4 +- crates/core/src/qbft/internal_test.rs | 231 ++++++++++++++++++++------ crates/core/src/qbft/mod.rs | 169 +++++++++++-------- 3 files changed, 283 insertions(+), 121 deletions(-) diff --git a/crates/core/src/qbft/fake_clock.rs b/crates/core/src/qbft/fake_clock.rs index 3b7baeea..0a3ab477 100644 --- a/crates/core/src/qbft/fake_clock.rs +++ b/crates/core/src/qbft/fake_clock.rs @@ -1,6 +1,6 @@ use crossbeam::channel as mpmc; use std::{ - collections::BTreeMap, + collections::HashMap, sync::{Arc, Mutex}, thread, time::{Duration, Instant}, @@ -15,7 +15,7 @@ struct FakeClockInner { start: Instant, now: Instant, last_id: usize, - clients: BTreeMap, Instant)>, + clients: HashMap, Instant)>, } impl FakeClock { diff --git a/crates/core/src/qbft/internal_test.rs b/crates/core/src/qbft/internal_test.rs index 5c5659f2..ea072f94 100644 --- a/crates/core/src/qbft/internal_test.rs +++ b/crates/core/src/qbft/internal_test.rs @@ -12,6 +12,7 @@ use std::{ const WRITE_CHAN_ERR: &str = "Failed to write to channel"; const READ_CHAN_ERR: &str = "Failed to read from channel"; +// Fixed seed for deterministic chain-split duplicate-message simulation. const CHAIN_SPLIT_SEED: u64 = 0x4348_4149_4e53_504c; type RunOutcome = std::thread::Result>; @@ -66,6 +67,9 @@ fn test_qbft(test: Test) { let clock = FakeClock::new(start_time); let cts = CancellationTokenSource::new(); + // Keep peer iteration deterministic. These fake-clock tests assert exact + // rounds, and broadcast fanout order affects which node observes quorums + // first when tests run in parallel. let mut receives = BTreeMap::< i64, ( @@ -102,11 +106,8 @@ fn test_qbft(test: Test) { result_chan_tx.send(q_commit.clone()).expect(WRITE_CHAN_ERR); }) }, - compare: Box::new(|_, _, _, input_value_source, return_err, return_value| { - return_value - .send(*input_value_source) - .expect(WRITE_CHAN_ERR); - send_compare_result(return_err, return_value, Ok(())); + compare: Arc::new(|_, _, _, _, return_err, _| { + return_err.send(Ok(())).expect(WRITE_CHAN_ERR); }), nodes: N as i64, fifo_limit: FIFO_LIMIT as i64, @@ -329,6 +330,10 @@ fn test_qbft(test: Test) { &clock, ); + if decided { + next_fuzz_at = None; + } + while let Some(next) = next_fuzz_at { if clock.elapsed() < next { break; @@ -467,6 +472,7 @@ fn test_qbft(test: Test) { // Trigger shutdown decided = true; + next_fuzz_at = None; clock.cancel(); cts.cancel(); @@ -567,21 +573,6 @@ fn outcome_is_error(outcome: &RunOutcome, expected: fn(&QbftError) -> bool) -> b matches!(outcome, Ok(Err(err)) if expected(err)) } -fn send_compare_result( - return_err: &mpmc::Sender>, - return_value: &mpmc::Sender, - result: Result<()>, -) { - // Keep the auxiliary compare channels alive until the result is consumed. - // Go channels do not close when the callback returns; Rust senders do. - let keepalive = return_value.clone(); - thread::spawn(move || { - thread::sleep(Duration::from_millis(50)); - drop(keepalive); - }); - return_err.send(result).expect(WRITE_CHAN_ERR); -} - fn test_seed(test: &Test) -> u64 { let mut seed = 0x5142_4654_5445_5354_u64; seed ^= test.instance as u64; @@ -1131,7 +1122,7 @@ fn noop_definition() -> Definition { is_leader: Box::new(|_, _, _| false), new_timer: Box::new(|_| (mpmc::never(), Box::new(|| {}))), decide: Box::new(|_, _, _, _| {}), - compare: Box::new(|_, _, _, _, _, _| {}), + compare: Arc::new(|_, _, _, _, _, _| {}), nodes: 0, fifo_limit: 0, log_round_change: Box::new(|_, _, _, _, _, _| {}), @@ -1253,11 +1244,8 @@ fn duplicate_pre_prepare_rules() { panic!("unexpected round {}", round); }); - def.compare = Box::new(|_, _, _, input_value_source, return_err, return_value| { - return_value - .send(*input_value_source) - .expect(WRITE_CHAN_ERR); - send_compare_result(return_err, return_value, Ok(())); + def.compare = Arc::new(|_, _, _, _, return_err, _| { + return_err.send(Ok(())).expect(WRITE_CHAN_ERR); }); let (r_chan_tx, r_chan_rx) = mpmc::bounded::>(2); @@ -1285,6 +1273,35 @@ fn duplicate_pre_prepare_rules() { assert!(res.is_ok()); } +#[test] +fn idle_run_returns_when_cancelled() { + let cts = CancellationTokenSource::new(); + let token = cts.token().clone(); + let def = noop_definition(); + let transport = noop_transport(); + let (_input_tx, input_rx) = mpmc::bounded::(1); + let (_source_tx, source_rx) = mpmc::bounded::(1); + let (done_tx, done_rx) = mpmc::bounded(1); + + thread::spawn(move || { + done_tx + .send(qbft::run( + &token, &def, &transport, &0, 1, input_rx, source_rx, + )) + .expect(WRITE_CHAN_ERR); + }); + + thread::sleep(Duration::from_millis(10)); + cts.cancel(); + + assert!(matches!( + done_rx + .recv_timeout(Duration::from_millis(100)) + .expect("idle run must unblock on cancellation"), + Ok(()) + )); +} + #[test] fn classify_rules() { let mut def = noop_definition(); @@ -1379,33 +1396,42 @@ fn compare_success_error_cached_value_source_and_timeout() { let (_vs_tx, vs_rx) = mpmc::bounded::(1); let timer = mpmc::never(); let mut def = noop_definition(); - def.compare = Box::new(|_, _, _, input_value_source, return_err, return_value| { - return_value - .send(*input_value_source) - .expect(WRITE_CHAN_ERR); - send_compare_result(return_err, return_value, Ok(())); + def.compare = Arc::new(|_, _, _, _, return_err, _| { + return_err.send(Ok(())).expect(WRITE_CHAN_ERR); }); assert!(matches!( compare(cts.token(), &def, &msg, &vs_rx, 0, &timer), - Ok(0) + (0, Ok(())) + )); + + let mut def = noop_definition(); + def.compare = Arc::new(|_, _, _, _, return_err, _| { + let return_err = return_err.clone(); + thread::spawn(move || { + thread::sleep(Duration::from_millis(10)); + return_err.send(Ok(())).expect(WRITE_CHAN_ERR); + }); + }); + assert!(matches!( + compare(cts.token(), &def, &msg, &vs_rx, 41, &timer), + (41, Ok(())) )); let mut def = noop_definition(); - def.compare = Box::new(|_, _, _, input_value_source, return_err, return_value| { - return_value - .send(*input_value_source) + def.compare = Arc::new(|_, _, _, _, return_err, _| { + return_err + .send(Err(QbftError::CompareError)) .expect(WRITE_CHAN_ERR); - send_compare_result(return_err, return_value, Err(QbftError::CompareError)); }); assert!(matches!( compare(cts.token(), &def, &msg, &vs_rx, 0, &timer), - Err(QbftError::CompareError) + (0, Err(QbftError::CompareError)) )); let (vs_tx, vs_rx) = mpmc::bounded::(1); vs_tx.send(42).expect(WRITE_CHAN_ERR); let mut def = noop_definition(); - def.compare = Box::new( + def.compare = Arc::new( |_, _, input_value_source_ch, input_value_source, return_err, return_value| { let cached = if *input_value_source == 0 { let value = input_value_source_ch.recv().expect(READ_CHAN_ERR); @@ -1415,27 +1441,121 @@ fn compare_success_error_cached_value_source_and_timeout() { *input_value_source }; assert_eq!(42, cached); - send_compare_result(return_err, return_value, Ok(())); + return_err.send(Ok(())).expect(WRITE_CHAN_ERR); + }, + ); + assert!(matches!( + compare(cts.token(), &def, &msg, &vs_rx, 0, &timer), + (42, Ok(())) + )); + + let (vs_tx, vs_rx) = mpmc::bounded::(1); + vs_tx.send(43).expect(WRITE_CHAN_ERR); + let mut def = noop_definition(); + def.compare = Arc::new( + |_, _, input_value_source_ch, input_value_source, return_err, return_value| { + let cached = if *input_value_source == 0 { + let value = input_value_source_ch.recv().expect(READ_CHAN_ERR); + return_value.send(value).expect(WRITE_CHAN_ERR); + value + } else { + *input_value_source + }; + assert_eq!(43, cached); + return_err + .send(Err(QbftError::CompareError)) + .expect(WRITE_CHAN_ERR); }, ); assert!(matches!( compare(cts.token(), &def, &msg, &vs_rx, 0, &timer), - Ok(42) + (43, Err(QbftError::CompareError)) )); let (timer_tx, timer_rx) = mpmc::bounded(1); timer_tx.send(time::Instant::now()).expect(WRITE_CHAN_ERR); let mut def = noop_definition(); - def.compare = Box::new(|_, _, _, _, return_err, _| { + def.compare = Arc::new(|_, _, _, _, return_err, _| { thread::sleep(Duration::from_millis(20)); let _ = return_err.send(Ok(())); }); assert!(matches!( - compare(cts.token(), &def, &msg, &vs_rx, 0, &timer_rx), - Err(QbftError::TimeoutError) + compare(cts.token(), &def, &msg, &vs_rx, 44, &timer_rx), + (44, Err(QbftError::TimeoutError)) + )); +} + +#[test] +fn compare_timeout_does_not_wait_for_blocked_callback() { + let cts = CancellationTokenSource::new(); + let msg = new_msg(MSG_PRE_PREPARE, 0, 1, 1, 7, 11, 0, 0, None); + let (_vs_tx, vs_rx) = mpmc::bounded::(1); + let (timer_tx, timer_rx) = mpmc::bounded(1); + timer_tx.send(time::Instant::now()).expect(WRITE_CHAN_ERR); + + let mut def = noop_definition(); + def.compare = Arc::new(|ct, _, _, _, return_err, _| { + while !ct.is_canceled() { + thread::sleep(Duration::from_millis(1)); + } + let _ = return_err.send(Ok(())); + }); + + let (result_tx, result_rx) = mpmc::bounded(1); + thread::spawn(move || { + result_tx + .send(compare(cts.token(), &def, &msg, &vs_rx, 0, &timer_rx)) + .expect(WRITE_CHAN_ERR); + }); + + assert!(matches!( + result_rx + .recv_timeout(Duration::from_millis(100)) + .expect("compare must return on timer without waiting for blocked callback"), + (0, Err(QbftError::TimeoutError)) )); } +#[test] +fn compare_parent_cancel_cancels_callback_token() { + let cts = CancellationTokenSource::new(); + let token = cts.token().clone(); + let msg = new_msg(MSG_PRE_PREPARE, 0, 1, 1, 7, 11, 0, 0, None); + let (_vs_tx, vs_rx) = mpmc::bounded::(1); + let (timer_tx, timer_rx) = mpmc::bounded(1); + let (token_cancelled_tx, token_cancelled_rx) = mpmc::bounded(1); + + let mut def = noop_definition(); + def.compare = Arc::new(move |ct, _, _, _, return_err, _| { + while !ct.is_canceled() { + thread::sleep(Duration::from_millis(1)); + } + token_cancelled_tx.send(()).expect(WRITE_CHAN_ERR); + return_err.send(Ok(())).expect(WRITE_CHAN_ERR); + }); + + let (result_tx, result_rx) = mpmc::bounded(1); + thread::spawn(move || { + result_tx + .send(compare(&token, &def, &msg, &vs_rx, 0, &timer_rx)) + .expect(WRITE_CHAN_ERR); + }); + + thread::sleep(Duration::from_millis(10)); + cts.cancel(); + + match result_rx.recv_timeout(Duration::from_millis(100)) { + Ok(result) => assert!(matches!(result, (0, Ok(())))), + Err(err) => { + let _ = timer_tx.send(time::Instant::now()); + panic!("compare callback token must be canceled by parent token: {err}"); + } + } + token_cancelled_rx + .recv_timeout(Duration::from_millis(100)) + .expect("callback token must be canceled by parent token"); +} + fn buffer_by_source(msgs: &[Msg]) -> HashMap>> { let mut buffer = HashMap::new(); for msg in msgs { @@ -1503,6 +1623,9 @@ fn test_qbft_chain_split(test: ChainSplitTest) { let clock = FakeClock::new(time::Instant::now()); let cts = CancellationTokenSource::new(); let trace = Trace::new(); + // Keep peer iteration deterministic. These fake-clock tests assert exact + // rounds, and broadcast fanout order affects which node observes quorums + // first when tests run in parallel. let mut receives = BTreeMap::< i64, ( @@ -1529,7 +1652,7 @@ fn test_qbft_chain_split(test: ChainSplitTest) { result_chan_tx.send(q_commit.clone()).expect(WRITE_CHAN_ERR); }) }, - compare: Box::new( + compare: Arc::new( |_, qcommit, input_value_source_ch, input_value_source, return_err, return_value| { let leader_value_source = qcommit.value_source().expect("value source"); let local = if *input_value_source == 0 { @@ -1541,13 +1664,13 @@ fn test_qbft_chain_split(test: ChainSplitTest) { }; if leader_value_source != local { - return_value.send(local).expect(WRITE_CHAN_ERR); - send_compare_result(return_err, return_value, Err(QbftError::CompareError)); + return_err + .send(Err(QbftError::CompareError)) + .expect(WRITE_CHAN_ERR); return; } - return_value.send(local).expect(WRITE_CHAN_ERR); - send_compare_result(return_err, return_value, Ok(())); + return_err.send(Ok(())).expect(WRITE_CHAN_ERR); }, ), nodes: N as i64, @@ -1637,11 +1760,9 @@ fn test_qbft_chain_split(test: ChainSplitTest) { let value_source = test.value_source[&i]; s.spawn(move || { let (v_tx, v_rx) = mpmc::bounded::(1); - let (vs_tx, vs_rx) = mpmc::unbounded::(); + let (vs_tx, vs_rx) = mpmc::bounded::(1); v_tx.send(value_source).expect(WRITE_CHAN_ERR); - for _ in 0..=MAX_ROUND { - vs_tx.send(value_source).expect(WRITE_CHAN_ERR); - } + vs_tx.send(value_source).expect(WRITE_CHAN_ERR); let run_result = panic::catch_unwind(AssertUnwindSafe(|| { qbft::run(&token, &defs, &transport, &instance, i, v_rx, vs_rx) })); @@ -1756,7 +1877,11 @@ fn test_qbft_chain_split(test: ChainSplitTest) { }; clock.advance(tick); let limit = if test.should_halt { - Duration::from_secs(600) + let max_round = u32::try_from(MAX_ROUND).expect("MAX_ROUND fits u32"); + let seconds = 1_u64 + .checked_shl(max_round.checked_add(1).expect("MAX_ROUND permits timeout limit")) + .expect("MAX_ROUND permits timeout limit"); + Duration::from_secs(seconds) } else { Duration::from_secs(60) }; diff --git a/crates/core/src/qbft/mod.rs b/crates/core/src/qbft/mod.rs index 41d0a2d5..e444373b 100644 --- a/crates/core/src/qbft/mod.rs +++ b/crates/core/src/qbft/mod.rs @@ -20,7 +20,7 @@ #![allow(clippy::cast_possible_truncation)] #![allow(clippy::arithmetic_side_effects)] -use cancellation::CancellationToken; +use cancellation::{CancellationToken, CancellationTokenSource}; use crossbeam::channel as mpmc; use std::{ any, @@ -33,6 +33,19 @@ use std::{ type Result = std::result::Result; +const RUN_CANCELLATION_POLL_INTERVAL: time::Duration = time::Duration::from_millis(1); + +type CompareFn = dyn Fn( + /* ct */ &CancellationToken, + /* qcommit */ &Msg, + /* input_value_source_ch */ &mpmc::Receiver, + /* input_value_source */ &C, + /* return_err */ &mpmc::Sender>, + /* return_value */ &mpmc::Sender, + ) + Send + + Sync + + 'static; + #[derive(Debug, thiserror::Error)] pub enum QbftError { #[error("Timeout")] @@ -41,6 +54,9 @@ pub enum QbftError { #[error("Compare leader value with local value failed")] CompareError, + #[error("bug: expected only comparison or timeout error")] + UnexpectedCompareError, + #[error("Maximum round reached")] MaxRoundReached, @@ -99,20 +115,10 @@ where + Sync, >, - /// Called when leader proposes value and we compare it with our local - /// value. It's an opt-in feature that should instantly return `None` on - /// `return_err` channel if it is not turned on. - pub compare: Box< - dyn Fn( - /* ct */ &CancellationToken, - /* qcommit */ &Msg, - /* input_value_source_ch */ &mpmc::Receiver, - /* input_value_source */ &C, - /* return_err */ &mpmc::Sender>, - /* return_value */ &mpmc::Sender, - ) + Send - + Sync, - >, + /// Charon parity hook called when the leader proposes a value. The core + /// algorithm only runs this callback and reacts to its result; any + /// value-source comparison policy belongs to the caller. + pub compare: sync::Arc>, /// Called when consensus has been reached on a value. pub decide: Box< @@ -194,12 +200,14 @@ pub const MSG_DECIDED: MessageType = MessageType(5); const MSG_SENTINEL: MessageType = MessageType(6); // intentionally not public impl MessageType { + /// Returns true when the message type is one of the known QBFT wire types. pub fn valid(&self) -> bool { self.0 > MSG_UNKNOWN.0 && self.0 < MSG_SENTINEL.0 } } impl Display for MessageType { + /// Formats the message type using the stable wire/debug label. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let s = match self.0 { 0 => "unknown", @@ -260,6 +268,7 @@ pub const UPON_JUSTIFIED_DECIDED: UponRule = UponRule(7); pub const UPON_ROUND_TIMEOUT: UponRule = UponRule(8); // This is not triggered by a message, but by a timer. impl Display for UponRule { + /// Formats the upon-rule using the stable debug label. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let s = match self.0 { 0 => "nothing", @@ -284,11 +293,12 @@ struct DedupKey { round: i64, } -/// Executes the consensus algorithm until the context is closed. -/// The generic type `I` is the instance of consensus and can be anything. -/// The generic type `V` is the arbitrary data value being proposed; it only -/// requires an Equal method. The generic type `C` is the compare value, used to -/// compare leader's proposed value with local value and can be anything. +/// Executes one QBFT consensus instance until it decides, errors, or is +/// cancelled. +/// +/// `I` identifies the consensus instance, `V` is the comparable proposed value, +/// and `C` is the application value used by `Definition::compare` to compare a +/// leader proposal with the local input source. pub fn run( ct: &CancellationToken, d: &Definition, @@ -299,8 +309,9 @@ pub fn run( input_value_source_ch: mpmc::Receiver, ) -> Result<()> where - V: PartialEq + Eq + Hash + Default, - C: Clone + Send + Sync + Default, + I: Send + Sync + 'static, + V: PartialEq + Eq + Hash + Default + 'static, + C: Clone + Send + Sync + Default + 'static, { // === State === let round: Cell = Cell::new(1); @@ -414,7 +425,7 @@ where (timer_chan, stop_timer) = (d.new_timer)(round.get()); } - while !ct.is_canceled() { + loop { mpmc::select! { recv(input_value_ch) -> result => { let iv = result?; @@ -472,7 +483,7 @@ where stop_timer(); (timer_chan, stop_timer) = (d.new_timer)(round.get()); - let compare_result = compare( + let (new_input_value_source, compare_result) = compare( ct, d, &msg, @@ -480,12 +491,10 @@ where input_value_source.clone(), &timer_chan, ); + input_value_source = new_input_value_source; match compare_result { - Ok(v) => { - input_value_source = v; - broadcast_msg(MSG_PREPARE, &msg.value(), None)?; - } + Ok(()) => broadcast_msg(MSG_PREPARE, &msg.value(), None)?, Err(qbft_err) => { match qbft_err { QbftError::CompareError => { @@ -503,7 +512,7 @@ where broadcast_round_change()?; } - _ => panic!("bug: expected only {} or {} error", QbftError::CompareError, QbftError::TimeoutError) + _ => return Err(QbftError::UnexpectedCompareError), } } } @@ -579,7 +588,7 @@ where broadcast_round_change()?; } - default => { + default(RUN_CANCELLATION_POLL_INTERVAL) => { if ct.is_canceled() { break; } @@ -590,6 +599,9 @@ where Ok(()) } +/// The callback may cache the local input source and return success/failure. +/// This helper only preserves that callback result and lets the round timer win +/// if the callback blocks. fn compare( ct: &CancellationToken, d: &Definition, @@ -597,13 +609,14 @@ fn compare( input_value_source_ch: &mpmc::Receiver, input_value_source: C, timer_chan: &mpmc::Receiver, -) -> Result +) -> (C, Result<()>) where - V: PartialEq, - C: Clone + Send + Sync, + I: Send + Sync + 'static, + V: PartialEq + 'static, + C: Clone + Send + Sync + 'static, { let (compare_err_tx, compare_err_rx) = mpmc::bounded::>(1); - let (compare_value_tx, compare_value_rx) = mpmc::bounded::(1); + let (compare_value_tx, mut compare_value_rx) = mpmc::bounded::(1); // d.Compare has 2 roles: // 1. Read from the `input_value_source_ch` (if `input_value_source` is empty). @@ -615,46 +628,69 @@ where // If comparison or any other unexpected error occurs, the error is returned on // `compare_err` channel. - thread::scope(|s| { - let mut result = input_value_source.clone(); - let compare = &d.compare; + let mut result = input_value_source.clone(); + let compare = d.compare.clone(); + let compare_cts = sync::Arc::new(CancellationTokenSource::new()); + let compare_ct = compare_cts.token().clone(); + let msg = msg.clone(); + let input_value_source_ch = input_value_source_ch.clone(); + + thread::spawn(move || { + (compare)( + &compare_ct, + &msg, + &input_value_source_ch, + &input_value_source, + &compare_err_tx, + &compare_value_tx, + ); + }); - s.spawn(move || { - (compare)( - ct, - msg, - input_value_source_ch, - &input_value_source, - &compare_err_tx, - &compare_value_tx, - ); - }); - - loop { - mpmc::select! { - recv(compare_err_rx) -> msg => { - let err = msg?; - - return match err { - Ok(_) => Ok(result), - Err(_) => Err(QbftError::CompareError), - }; - }, - - recv(compare_value_rx) -> msg => { - let value = msg?; + loop { + mpmc::select! { + recv(compare_err_rx) -> msg => { + let err = match msg { + Ok(err) => err, + Err(err) => { + compare_cts.cancel(); + return (result, Err(QbftError::ChannelError(err))); + } + }; + while let Ok(value) = compare_value_rx.try_recv() { result = value; - }, + } - recv(timer_chan) -> msg => { - msg?; + compare_cts.cancel(); + return match err { + Ok(()) => (result, Ok(())), + Err(_) => (result, Err(QbftError::CompareError)), + }; + }, - return Err(QbftError::TimeoutError); + recv(compare_value_rx) -> msg => { + match msg { + Ok(value) => result = value, + Err(_) => compare_value_rx = mpmc::never(), + } + }, + + recv(timer_chan) -> msg => { + compare_cts.cancel(); + if let Err(err) = msg { + return (result, Err(QbftError::ChannelError(err))); + } + + return (result, Err(QbftError::TimeoutError)); + } + + default(RUN_CANCELLATION_POLL_INTERVAL) => { + if ct.is_canceled() { + compare_cts.cancel(); } } } - }) + } } /// Returns all messages from the provided round. @@ -1122,6 +1158,7 @@ where value: V, } +/// Returns all unique-source PREPARE quorums grouped by identical round and value. fn get_prepare_quorums( d: &Definition, all: &Vec>, From b752caaf8b9d09a8847c44c27141832f14dba1f8 Mon Sep 17 00:00:00 2001 From: Quang Le Date: Mon, 18 May 2026 10:26:17 +0700 Subject: [PATCH 03/28] fix: removed hard coded salt in tests --- crates/core/src/qbft/internal_test.rs | 49 +++++++++++++++++---------- crates/core/src/qbft/mod.rs | 3 +- 2 files changed, 33 insertions(+), 19 deletions(-) diff --git a/crates/core/src/qbft/internal_test.rs b/crates/core/src/qbft/internal_test.rs index ea072f94..5f32d92f 100644 --- a/crates/core/src/qbft/internal_test.rs +++ b/crates/core/src/qbft/internal_test.rs @@ -14,6 +14,15 @@ const WRITE_CHAN_ERR: &str = "Failed to write to channel"; const READ_CHAN_ERR: &str = "Failed to read from channel"; // Fixed seed for deterministic chain-split duplicate-message simulation. const CHAIN_SPLIT_SEED: u64 = 0x4348_4149_4e53_504c; +const TEST_STREAM_DROP: u64 = 1; +const TEST_STREAM_DUPLICATE: u64 = 2; +const TEST_STREAM_JITTER: u64 = 3; +const TEST_STREAM_DELAY_ORDER: u64 = 4; +const TEST_STREAM_MSG_TYPE: u64 = 10; +const TEST_STREAM_MSG_ROUND: u64 = 11; +const TEST_STREAM_MSG_VALUE: u64 = 12; +const TEST_STREAM_MSG_PREPARED_ROUND: u64 = 13; +const TEST_STREAM_MSG_PREPARED_VALUE: u64 = 14; type RunOutcome = std::thread::Result>; type TestMsgRef = Msg; @@ -650,7 +659,8 @@ fn bcast( return; } - let delta_ms = (f64::from(jitter_ms) * deterministic_unit(seed, &msg, 0, 3)) as i32; + let delta_ms = + (f64::from(jitter_ms) * deterministic_unit(seed, &msg, 0, TEST_STREAM_JITTER)) as i32; let delay = Duration::from_millis((jitter_ms + delta_ms) as u64); trace.push(format!( "{:?} {} => {}@{} (bcast delay {:?})", @@ -660,7 +670,7 @@ fn bcast( msg.round(), delay )); - let key = deterministic_msg_u64(seed, &msg, 0, 4); + let key = deterministic_msg_u64(seed, &msg, 0, TEST_STREAM_DELAY_ORDER); broadcast .send(BroadcastEvent::Delayed(PendingBroadcast { deliver_at: clock.elapsed() + delay, @@ -706,7 +716,7 @@ fn fanout_broadcast( } if let Some(p) = drop_prob.get(&msg.source()) { - if deterministic_unit(seed, &msg, *target, 1) < *p { + if deterministic_unit(seed, &msg, *target, TEST_STREAM_DROP) < *p { trace.push(format!( "{:?} {} => {}@{} => {} (dropped)", clock.elapsed(), @@ -722,7 +732,7 @@ fn fanout_broadcast( out_tx.send(msg.clone()).expect(WRITE_CHAN_ERR); broadcasts += 1; - if deterministic_unit(seed, &msg, *target, 2) < 0.1 { + if deterministic_unit(seed, &msg, *target, TEST_STREAM_DUPLICATE) < 0.1 { out_tx.send(msg.clone()).expect(WRITE_CHAN_ERR); broadcasts += 1; trace.push(format!( @@ -748,25 +758,26 @@ fn random_msg(instance: i64, peer_idx: i64, seed: u64, counter: u64) -> Msg, target: i64, salt: u64) -> f64 { - let value = deterministic_msg_u64(seed, msg, target, salt) >> 11; +fn deterministic_unit(seed: u64, msg: &Msg, target: i64, stream_id: u64) -> f64 { + let value = deterministic_msg_u64(seed, msg, target, stream_id) >> 11; value as f64 / ((1_u64 << 53) as f64) } -fn deterministic_msg_u64(seed: u64, msg: &Msg, target: i64, salt: u64) -> u64 { - let mut value = splitmix64(seed ^ salt); +fn deterministic_msg_u64(seed: u64, msg: &Msg, target: i64, stream_id: u64) -> u64 { + let mut value = splitmix64(seed ^ stream_id); value = splitmix64(value ^ i64_to_u64(msg.type_().0)); value = splitmix64(value ^ i64_to_u64(msg.instance())); value = splitmix64(value ^ i64_to_u64(msg.source())); @@ -778,14 +789,14 @@ fn deterministic_msg_u64(seed: u64, msg: &Msg, target: i64, salt: splitmix64(value ^ i64_to_u64(target)) } -fn deterministic_range(seed: u64, counter: u64, salt: u64, upper: usize) -> usize { +fn deterministic_range(seed: u64, counter: u64, stream_id: u64, upper: usize) -> usize { let upper = u64::try_from(upper).expect("upper fits in u64"); - usize::try_from(splitmix64(seed ^ counter ^ salt) % upper).expect("range fits in usize") + usize::try_from(splitmix64(seed ^ counter ^ stream_id) % upper).expect("range fits in usize") } -fn deterministic_i64(seed: u64, counter: u64, salt: u64, upper: i64) -> i64 { +fn deterministic_i64(seed: u64, counter: u64, stream_id: u64, upper: i64) -> i64 { let upper = u64::try_from(upper).expect("upper is positive"); - i64::try_from(splitmix64(seed ^ counter ^ salt) % upper).expect("range fits in i64") + i64::try_from(splitmix64(seed ^ counter ^ stream_id) % upper).expect("range fits in i64") } fn splitmix64(mut value: u64) -> u64 { @@ -1786,7 +1797,9 @@ fn test_qbft_chain_split(test: ChainSplitTest) { continue; } out_tx.send(msg.clone()).expect(WRITE_CHAN_ERR); - if deterministic_unit(CHAIN_SPLIT_SEED, &msg, *target, 1) < 0.1 { + if deterministic_unit(CHAIN_SPLIT_SEED, &msg, *target, TEST_STREAM_DUPLICATE) + < 0.1 + { out_tx.send(msg.clone()).expect(WRITE_CHAN_ERR); } } diff --git a/crates/core/src/qbft/mod.rs b/crates/core/src/qbft/mod.rs index e444373b..78f111d5 100644 --- a/crates/core/src/qbft/mod.rs +++ b/crates/core/src/qbft/mod.rs @@ -1158,7 +1158,8 @@ where value: V, } -/// Returns all unique-source PREPARE quorums grouped by identical round and value. +/// Returns all unique-source PREPARE quorums grouped by identical round and +/// value. fn get_prepare_quorums( d: &Definition, all: &Vec>, From ed912eebf1bd871d3da44e884832a602752cf4d3 Mon Sep 17 00:00:00 2001 From: Quang Le Date: Mon, 18 May 2026 12:52:18 +0700 Subject: [PATCH 04/28] fix: return error on Context cancelled --- crates/core/src/qbft/internal_test.rs | 4 ++-- crates/core/src/qbft/mod.rs | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/crates/core/src/qbft/internal_test.rs b/crates/core/src/qbft/internal_test.rs index 5f32d92f..87ef8b98 100644 --- a/crates/core/src/qbft/internal_test.rs +++ b/crates/core/src/qbft/internal_test.rs @@ -1281,7 +1281,7 @@ fn duplicate_pre_prepare_rules() { input_value_source_ch, ); - assert!(res.is_ok()); + assert!(matches!(res, Err(QbftError::ContextCanceled))); } #[test] @@ -1309,7 +1309,7 @@ fn idle_run_returns_when_cancelled() { done_rx .recv_timeout(Duration::from_millis(100)) .expect("idle run must unblock on cancellation"), - Ok(()) + Err(QbftError::ContextCanceled) )); } diff --git a/crates/core/src/qbft/mod.rs b/crates/core/src/qbft/mod.rs index 78f111d5..9cf6d163 100644 --- a/crates/core/src/qbft/mod.rs +++ b/crates/core/src/qbft/mod.rs @@ -57,6 +57,9 @@ pub enum QbftError { #[error("bug: expected only comparison or timeout error")] UnexpectedCompareError, + #[error("context canceled")] + ContextCanceled, + #[error("Maximum round reached")] MaxRoundReached, @@ -590,13 +593,11 @@ where default(RUN_CANCELLATION_POLL_INTERVAL) => { if ct.is_canceled() { - break; + return Err(QbftError::ContextCanceled); } } } } - - Ok(()) } /// The callback may cache the local input source and return success/failure. From e01f64fc4969b7e2746e1d8ecc1f01c3375a04e1 Mon Sep 17 00:00:00 2001 From: Quang Le Date: Mon, 18 May 2026 14:13:28 +0700 Subject: [PATCH 05/28] fix: hash from string, not magic number --- crates/core/src/qbft/internal_test.rs | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/crates/core/src/qbft/internal_test.rs b/crates/core/src/qbft/internal_test.rs index 87ef8b98..e2e1b509 100644 --- a/crates/core/src/qbft/internal_test.rs +++ b/crates/core/src/qbft/internal_test.rs @@ -12,8 +12,8 @@ use std::{ const WRITE_CHAN_ERR: &str = "Failed to write to channel"; const READ_CHAN_ERR: &str = "Failed to read from channel"; -// Fixed seed for deterministic chain-split duplicate-message simulation. -const CHAIN_SPLIT_SEED: u64 = 0x4348_4149_4e53_504c; +const TEST_SEED_LABEL: &str = "qbft-test"; +const CHAIN_SPLIT_SEED_LABEL: &str = "chain-split"; const TEST_STREAM_DROP: u64 = 1; const TEST_STREAM_DUPLICATE: u64 = 2; const TEST_STREAM_JITTER: u64 = 3; @@ -583,7 +583,7 @@ fn outcome_is_error(outcome: &RunOutcome, expected: fn(&QbftError) -> bool) -> b } fn test_seed(test: &Test) -> u64 { - let mut seed = 0x5142_4654_5445_5354_u64; + let mut seed = seed_from_label(TEST_SEED_LABEL); seed ^= test.instance as u64; seed ^= u64::from(test.const_period) << 8; seed ^= (test.bcast_jitter_ms as u64) << 16; @@ -595,6 +595,14 @@ fn test_seed(test: &Test) -> u64 { seed } +fn seed_from_label(label: &str) -> u64 { + // Small rolling-hash multiplier; only separates deterministic test labels, + // not used for cryptographic randomness or protocol behavior. + label.bytes().fold(0_u64, |seed, byte| { + seed.wrapping_mul(131).wrapping_add(u64::from(byte)) + }) +} + /// Construct a leader election function. fn make_is_leader(n: i64) -> impl Fn(&i64, i64, i64) -> bool + Clone { move |instance: &i64, round: i64, process: i64| -> bool { (instance + round) % n == process } @@ -1797,8 +1805,12 @@ fn test_qbft_chain_split(test: ChainSplitTest) { continue; } out_tx.send(msg.clone()).expect(WRITE_CHAN_ERR); - if deterministic_unit(CHAIN_SPLIT_SEED, &msg, *target, TEST_STREAM_DUPLICATE) - < 0.1 + if deterministic_unit( + seed_from_label(CHAIN_SPLIT_SEED_LABEL), + &msg, + *target, + TEST_STREAM_DUPLICATE, + ) < 0.1 { out_tx.send(msg.clone()).expect(WRITE_CHAN_ERR); } From b22fb417801ce1d90dab9977ce2da76781e244e7 Mon Sep 17 00:00:00 2001 From: Quang Le Date: Mon, 18 May 2026 14:15:06 +0700 Subject: [PATCH 06/28] fix: one shot cancel when parent is cancelled --- crates/core/src/qbft/mod.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/crates/core/src/qbft/mod.rs b/crates/core/src/qbft/mod.rs index 9cf6d163..4ee494b3 100644 --- a/crates/core/src/qbft/mod.rs +++ b/crates/core/src/qbft/mod.rs @@ -635,6 +635,7 @@ where let compare_ct = compare_cts.token().clone(); let msg = msg.clone(); let input_value_source_ch = input_value_source_ch.clone(); + let mut compare_parent_cancelled = false; thread::spawn(move || { (compare)( @@ -686,8 +687,9 @@ where } default(RUN_CANCELLATION_POLL_INTERVAL) => { - if ct.is_canceled() { + if !compare_parent_cancelled && ct.is_canceled() { compare_cts.cancel(); + compare_parent_cancelled = true; } } } From 081d8a47862fec96e56b644f2b6a6650989e1357 Mon Sep 17 00:00:00 2001 From: Quang Le Date: Mon, 18 May 2026 14:18:48 +0700 Subject: [PATCH 07/28] fix: fix make_is_leader test --- crates/core/src/qbft/internal_test.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/crates/core/src/qbft/internal_test.rs b/crates/core/src/qbft/internal_test.rs index e2e1b509..c52ad70c 100644 --- a/crates/core/src/qbft/internal_test.rs +++ b/crates/core/src/qbft/internal_test.rs @@ -605,7 +605,11 @@ fn seed_from_label(label: &str) -> u64 { /// Construct a leader election function. fn make_is_leader(n: i64) -> impl Fn(&i64, i64, i64) -> bool + Clone { - move |instance: &i64, round: i64, process: i64| -> bool { (instance + round) % n == process } + move |instance: &i64, round: i64, process: i64| -> bool { + // Go's helper uses 0-indexed process IDs. These Rust tests use 1..=n, + // so translate the same modulo schedule into the local process range. + ((instance + round - 1).rem_euclid(n)) + 1 == process + } } /// Returns a new message to be broadcast. @@ -998,7 +1002,11 @@ fn stagger_start_const() { fn very_delayed_value_exp() { test_qbft(Test { instance: 3, - value_delay: HashMap::from([(1, Duration::from_secs(5)), (2, Duration::from_secs(10))]), + value_delay: HashMap::from([ + (1, Duration::from_secs(5)), + (2, Duration::from_secs(10)), + (4, Duration::from_secs(5)), + ]), decide_round: 4, ..Default::default() }); From 1a0a33700001f6a8ffa8452d1bd5a27b9e024905 Mon Sep 17 00:00:00 2001 From: Quang Le Date: Mon, 18 May 2026 14:24:10 +0700 Subject: [PATCH 08/28] fix: minors naming and comments --- crates/core/src/qbft/fake_clock.rs | 8 ++++++-- crates/core/src/qbft/mod.rs | 8 +++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/crates/core/src/qbft/fake_clock.rs b/crates/core/src/qbft/fake_clock.rs index 0a3ab477..8392e7f3 100644 --- a/crates/core/src/qbft/fake_clock.rs +++ b/crates/core/src/qbft/fake_clock.rs @@ -123,7 +123,9 @@ fn multiple_threads_timers() { clock.advance(Duration::from_secs(6)); }); - assert_eq!(vec![true, true], done_rx.try_iter().collect::>()); + let done = done_rx.try_iter().collect::>(); + assert_eq!(2, done.len()); + assert!(done.into_iter().all(|done| done)); assert_eq!(Duration::from_secs(10), clock.elapsed()); } @@ -150,7 +152,9 @@ fn multiple_threads_cancellation() { clock.cancel(); }); - assert_eq!(vec![true, true], done_rx.try_iter().collect::>()); + let done = done_rx.try_iter().collect::>(); + assert_eq!(2, done.len()); + assert!(done.into_iter().all(|done| done)); assert_eq!(Duration::ZERO, clock.elapsed()); } diff --git a/crates/core/src/qbft/mod.rs b/crates/core/src/qbft/mod.rs index 4ee494b3..e064292d 100644 --- a/crates/core/src/qbft/mod.rs +++ b/crates/core/src/qbft/mod.rs @@ -33,7 +33,7 @@ use std::{ type Result = std::result::Result; -const RUN_CANCELLATION_POLL_INTERVAL: time::Duration = time::Duration::from_millis(1); +const CANCELLATION_POLL_INTERVAL: time::Duration = time::Duration::from_millis(1); type CompareFn = dyn Fn( /* ct */ &CancellationToken, @@ -591,7 +591,7 @@ where broadcast_round_change()?; } - default(RUN_CANCELLATION_POLL_INTERVAL) => { + default(CANCELLATION_POLL_INTERVAL) => { if ct.is_canceled() { return Err(QbftError::ContextCanceled); } @@ -637,6 +637,8 @@ where let input_value_source_ch = input_value_source_ch.clone(); let mut compare_parent_cancelled = false; + // Detached by design, matching Charon's goroutine behavior: if a caller-provided + // compare callback ignores cancellation and never reports, it may outlive this call. thread::spawn(move || { (compare)( &compare_ct, @@ -686,7 +688,7 @@ where return (result, Err(QbftError::TimeoutError)); } - default(RUN_CANCELLATION_POLL_INTERVAL) => { + default(CANCELLATION_POLL_INTERVAL) => { if !compare_parent_cancelled && ct.is_canceled() { compare_cts.cancel(); compare_parent_cancelled = true; From 44b9a1335a25924db219d5fff47b62cb4f8433bc Mon Sep 17 00:00:00 2001 From: Quang Le Date: Mon, 18 May 2026 14:48:07 +0700 Subject: [PATCH 09/28] fix: linter --- crates/core/src/qbft/mod.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/crates/core/src/qbft/mod.rs b/crates/core/src/qbft/mod.rs index e064292d..5c21e9cc 100644 --- a/crates/core/src/qbft/mod.rs +++ b/crates/core/src/qbft/mod.rs @@ -637,8 +637,9 @@ where let input_value_source_ch = input_value_source_ch.clone(); let mut compare_parent_cancelled = false; - // Detached by design, matching Charon's goroutine behavior: if a caller-provided - // compare callback ignores cancellation and never reports, it may outlive this call. + // Detached by design, matching Charon's goroutine behavior: if a + // caller-provided compare callback ignores cancellation and never reports, + // it may outlive this call. thread::spawn(move || { (compare)( &compare_ct, From cc59baf6ca395e6acdb9a75a5467ea871bfc181b Mon Sep 17 00:00:00 2001 From: Quang Le Date: Mon, 18 May 2026 18:28:14 +0700 Subject: [PATCH 10/28] fix: early cancel on the loop --- crates/core/src/qbft/internal_test.rs | 52 +++++++++++++++++++++++++++ crates/core/src/qbft/mod.rs | 4 +++ 2 files changed, 56 insertions(+) diff --git a/crates/core/src/qbft/internal_test.rs b/crates/core/src/qbft/internal_test.rs index c52ad70c..8a150546 100644 --- a/crates/core/src/qbft/internal_test.rs +++ b/crates/core/src/qbft/internal_test.rs @@ -1329,6 +1329,58 @@ fn idle_run_returns_when_cancelled() { )); } +#[test] +fn run_cancels_under_hot_receive_stream() { + let cts = CancellationTokenSource::new(); + let token = cts.token().clone(); + let mut def = noop_definition(); + def.nodes = 4; + def.fifo_limit = 100; + + let (receive_tx, receive_rx) = mpmc::bounded::>(1024); + let transport = Transport { + receive: receive_rx, + ..noop_transport() + }; + let (_input_tx, input_rx) = mpmc::bounded::(1); + let (_source_tx, source_rx) = mpmc::bounded::(1); + let (done_tx, done_rx) = mpmc::bounded(1); + + let sender_cts = CancellationTokenSource::new(); + let sender_token = sender_cts.token().clone(); + let sender = thread::spawn(move || { + let msg = new_msg(MSG_PREPARE, 0, 1, 2, 1, 0, 0, 0, None); + while !sender_token.is_canceled() { + match receive_tx.try_send(msg.clone()) { + Ok(()) => {} + Err(mpmc::TrySendError::Full(_)) => thread::yield_now(), + Err(mpmc::TrySendError::Disconnected(_)) => break, + } + } + }); + + thread::spawn(move || { + done_tx + .send(qbft::run( + &token, &def, &transport, &0, 1, input_rx, source_rx, + )) + .expect(WRITE_CHAN_ERR); + }); + + thread::sleep(Duration::from_millis(10)); + cts.cancel(); + + assert!(matches!( + done_rx + .recv_timeout(Duration::from_millis(100)) + .expect("run must unblock on cancellation even with a hot receive stream"), + Err(QbftError::ContextCanceled) + )); + + sender_cts.cancel(); + sender.join().expect("sender thread must exit"); +} + #[test] fn classify_rules() { let mut def = noop_definition(); diff --git a/crates/core/src/qbft/mod.rs b/crates/core/src/qbft/mod.rs index 5c21e9cc..c192d240 100644 --- a/crates/core/src/qbft/mod.rs +++ b/crates/core/src/qbft/mod.rs @@ -429,6 +429,10 @@ where } loop { + if ct.is_canceled() { + return Err(QbftError::ContextCanceled); + } + mpmc::select! { recv(input_value_ch) -> result => { let iv = result?; From ed82b6e6a09711696f8f8478a012f62153749c37 Mon Sep 17 00:00:00 2001 From: Quang Le Date: Mon, 18 May 2026 18:32:19 +0700 Subject: [PATCH 11/28] fix: context cancel in compare --- crates/core/src/qbft/internal_test.rs | 70 ++++++++++++++++++++++++++- crates/core/src/qbft/mod.rs | 15 ++++-- 2 files changed, 80 insertions(+), 5 deletions(-) diff --git a/crates/core/src/qbft/internal_test.rs b/crates/core/src/qbft/internal_test.rs index 8a150546..4a759253 100644 --- a/crates/core/src/qbft/internal_test.rs +++ b/crates/core/src/qbft/internal_test.rs @@ -1610,7 +1610,7 @@ fn compare_parent_cancel_cancels_callback_token() { thread::sleep(Duration::from_millis(1)); } token_cancelled_tx.send(()).expect(WRITE_CHAN_ERR); - return_err.send(Ok(())).expect(WRITE_CHAN_ERR); + let _ = return_err.send(Ok(())); }); let (result_tx, result_rx) = mpmc::bounded(1); @@ -1624,7 +1624,7 @@ fn compare_parent_cancel_cancels_callback_token() { cts.cancel(); match result_rx.recv_timeout(Duration::from_millis(100)) { - Ok(result) => assert!(matches!(result, (0, Ok(())))), + Ok(result) => assert!(matches!(result, (0, Err(QbftError::ContextCanceled)))), Err(err) => { let _ = timer_tx.send(time::Instant::now()); panic!("compare callback token must be canceled by parent token: {err}"); @@ -1635,6 +1635,72 @@ fn compare_parent_cancel_cancels_callback_token() { .expect("callback token must be canceled by parent token"); } +#[test] +fn run_parent_cancel_during_compare_does_not_prepare() { + const LEADER: i64 = 1; + const PROCESS: i64 = 2; + + let cts = CancellationTokenSource::new(); + let token = cts.token().clone(); + let msg = new_msg(MSG_PRE_PREPARE, 0, LEADER, 1, 7, 11, 0, 0, None); + let (receive_tx, receive_rx) = mpmc::bounded(1); + receive_tx.send(msg).expect(WRITE_CHAN_ERR); + + let (compare_started_tx, compare_started_rx) = mpmc::bounded(1); + let (compare_cancelled_tx, compare_cancelled_rx) = mpmc::bounded(1); + let mut def = noop_definition(); + def.nodes = 4; + def.fifo_limit = 100; + def.is_leader = Box::new(|_, _, process| process == LEADER); + def.compare = Arc::new(move |ct, _, _, _, return_err, _| { + compare_started_tx.send(()).expect(WRITE_CHAN_ERR); + while !ct.is_canceled() { + thread::sleep(Duration::from_millis(1)); + } + compare_cancelled_tx.send(()).expect(WRITE_CHAN_ERR); + let _ = return_err.send(Ok(())); + }); + + let (broadcast_tx, broadcast_rx) = mpmc::bounded(1); + let transport = Transport { + broadcast: Box::new(move |_, type_, _, _, _, _, _, _, _| { + broadcast_tx.send(type_).expect(WRITE_CHAN_ERR); + Ok(()) + }), + receive: receive_rx, + }; + let (_input_tx, input_rx) = mpmc::bounded::(1); + let (_source_tx, source_rx) = mpmc::bounded::(1); + let (done_tx, done_rx) = mpmc::bounded(1); + + thread::spawn(move || { + done_tx + .send(qbft::run( + &token, &def, &transport, &0, PROCESS, input_rx, source_rx, + )) + .expect(WRITE_CHAN_ERR); + }); + + compare_started_rx + .recv_timeout(Duration::from_millis(100)) + .expect("compare must start"); + cts.cancel(); + + assert!(matches!( + done_rx + .recv_timeout(Duration::from_millis(100)) + .expect("run must return parent cancellation from compare"), + Err(QbftError::ContextCanceled) + )); + compare_cancelled_rx + .recv_timeout(Duration::from_millis(100)) + .expect("compare callback token must be canceled"); + assert!( + broadcast_rx.try_iter().all(|type_| type_ != MSG_PREPARE), + "parent cancellation during compare must not broadcast PREPARE" + ); +} + fn buffer_by_source(msgs: &[Msg]) -> HashMap>> { let mut buffer = HashMap::new(); for msg in msgs { diff --git a/crates/core/src/qbft/mod.rs b/crates/core/src/qbft/mod.rs index c192d240..cf43da2d 100644 --- a/crates/core/src/qbft/mod.rs +++ b/crates/core/src/qbft/mod.rs @@ -519,6 +519,7 @@ where broadcast_round_change()?; } + QbftError::ContextCanceled => return Err(QbftError::ContextCanceled), _ => return Err(QbftError::UnexpectedCompareError), } } @@ -639,7 +640,6 @@ where let compare_ct = compare_cts.token().clone(); let msg = msg.clone(); let input_value_source_ch = input_value_source_ch.clone(); - let mut compare_parent_cancelled = false; // Detached by design, matching Charon's goroutine behavior: if a // caller-provided compare callback ignores cancellation and never reports, @@ -656,6 +656,11 @@ where }); loop { + if ct.is_canceled() { + compare_cts.cancel(); + return (result, Err(QbftError::ContextCanceled)); + } + mpmc::select! { recv(compare_err_rx) -> msg => { let err = match msg { @@ -671,6 +676,10 @@ where } compare_cts.cancel(); + if ct.is_canceled() { + return (result, Err(QbftError::ContextCanceled)); + } + return match err { Ok(()) => (result, Ok(())), Err(_) => (result, Err(QbftError::CompareError)), @@ -694,9 +703,9 @@ where } default(CANCELLATION_POLL_INTERVAL) => { - if !compare_parent_cancelled && ct.is_canceled() { + if ct.is_canceled() { compare_cts.cancel(); - compare_parent_cancelled = true; + return (result, Err(QbftError::ContextCanceled)); } } } From 7911c23493611716d05d510d4ce30e2be27e89e9 Mon Sep 17 00:00:00 2001 From: Quang Le Date: Mon, 18 May 2026 18:39:21 +0700 Subject: [PATCH 12/28] fix: validate definition --- crates/core/src/qbft/internal_test.rs | 52 ++++++++++++++++++++++++--- crates/core/src/qbft/mod.rs | 24 +++++++++++++ 2 files changed, 72 insertions(+), 4 deletions(-) diff --git a/crates/core/src/qbft/internal_test.rs b/crates/core/src/qbft/internal_test.rs index 4a759253..b8c14b0b 100644 --- a/crates/core/src/qbft/internal_test.rs +++ b/crates/core/src/qbft/internal_test.rs @@ -1248,12 +1248,14 @@ fn duplicate_pre_prepare_rules() { 0, 0, 0, - // Justification not required since nodes and quorum both 0. + // Round 2 is accepted after round 1 records a compare failure. None, ) }; let mut def = noop_definition(); + def.nodes = 4; + def.fifo_limit = 100; def.is_leader = Box::new(|_, _, process| process == LEADER); def.log_upon_rule = Box::new(move |_, _, round, msg, upon_rule| { println!("UponRule: rule={} round={} ", upon_rule, msg.round()); @@ -1271,8 +1273,13 @@ fn duplicate_pre_prepare_rules() { panic!("unexpected round {}", round); }); - def.compare = Arc::new(|_, _, _, _, return_err, _| { - return_err.send(Ok(())).expect(WRITE_CHAN_ERR); + def.compare = Arc::new(|_, msg, _, _, return_err, _| { + let result = if msg.round() == 1 { + Err(QbftError::CompareError) + } else { + Ok(()) + }; + return_err.send(result).expect(WRITE_CHAN_ERR); }); let (r_chan_tx, r_chan_rx) = mpmc::bounded::>(2); @@ -1304,7 +1311,9 @@ fn duplicate_pre_prepare_rules() { fn idle_run_returns_when_cancelled() { let cts = CancellationTokenSource::new(); let token = cts.token().clone(); - let def = noop_definition(); + let mut def = noop_definition(); + def.nodes = 4; + def.fifo_limit = 100; let transport = noop_transport(); let (_input_tx, input_rx) = mpmc::bounded::(1); let (_source_tx, source_rx) = mpmc::bounded::(1); @@ -1329,6 +1338,41 @@ fn idle_run_returns_when_cancelled() { )); } +#[test] +fn invalid_definition_rejected() { + let cts = CancellationTokenSource::new(); + let transport = noop_transport(); + let (_input_tx, input_rx) = mpmc::bounded::(1); + let (_source_tx, source_rx) = mpmc::bounded::(1); + + let mut def = noop_definition(); + def.nodes = 0; + def.fifo_limit = 1; + assert!(matches!( + qbft::run( + cts.token(), + &def, + &transport, + &0, + 1, + input_rx.clone(), + source_rx.clone(), + ), + Err(QbftError::InvalidDefinition( + "nodes must be greater than zero" + )) + )); + + def.nodes = 4; + def.fifo_limit = 0; + assert!(matches!( + qbft::run(cts.token(), &def, &transport, &0, 1, input_rx, source_rx), + Err(QbftError::InvalidDefinition( + "fifo_limit must be greater than zero" + )) + )); +} + #[test] fn run_cancels_under_hot_receive_stream() { let cts = CancellationTokenSource::new(); diff --git a/crates/core/src/qbft/mod.rs b/crates/core/src/qbft/mod.rs index cf43da2d..3e0c6ba6 100644 --- a/crates/core/src/qbft/mod.rs +++ b/crates/core/src/qbft/mod.rs @@ -66,6 +66,9 @@ pub enum QbftError { #[error("Zero input value not supported")] ZeroInputValue, + #[error("Invalid definition: {0}")] + InvalidDefinition(&'static str), + #[error("Failed to read from channel: {0}")] ChannelError(#[from] mpmc::RecvError), } @@ -316,6 +319,8 @@ where V: PartialEq + Eq + Hash + Default + 'static, C: Clone + Send + Sync + Default + 'static, { + validate_definition(d)?; + // === State === let round: Cell = Cell::new(1); let input_value: RefCell = RefCell::new(Default::default()); @@ -605,6 +610,25 @@ where } } +fn validate_definition(d: &Definition) -> Result<()> +where + V: PartialEq, +{ + if d.nodes <= 0 { + return Err(QbftError::InvalidDefinition( + "nodes must be greater than zero", + )); + } + + if d.fifo_limit <= 0 { + return Err(QbftError::InvalidDefinition( + "fifo_limit must be greater than zero", + )); + } + + Ok(()) +} + /// The callback may cache the local input source and return success/failure. /// This helper only preserves that callback result and lets the round timer win /// if the callback blocks. From 829ba5f4f8a44bd0b4d4d0b45024f1a7f9eea2ac Mon Sep 17 00:00:00 2001 From: Quang Le Date: Mon, 18 May 2026 18:43:51 +0700 Subject: [PATCH 13/28] fix: add check pr < r --- crates/core/src/qbft/internal_test.rs | 25 ++++++++++++++++++++++ crates/core/src/qbft/mod.rs | 30 +++++++++++++++++++++++++-- 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/crates/core/src/qbft/internal_test.rs b/crates/core/src/qbft/internal_test.rs index b8c14b0b..33f296b5 100644 --- a/crates/core/src/qbft/internal_test.rs +++ b/crates/core/src/qbft/internal_test.rs @@ -1493,6 +1493,31 @@ fn justified_qrc_j1_and_j2() { ]; assert_eq!(Some(7), contains_justified_qrc(&def, &j2, 2)); assert!(get_justified_qrc(&def, &j2, 2).unwrap().len() >= 6); + + let invalid_pr = vec![ + new_msg(MSG_ROUND_CHANGE, 0, 1, 2, 0, 0, 2, 7, None), + new_msg(MSG_ROUND_CHANGE, 0, 2, 2, 0, 0, 2, 7, None), + new_msg(MSG_ROUND_CHANGE, 0, 3, 2, 0, 0, 2, 7, None), + new_msg(MSG_PREPARE, 0, 1, 2, 7, 0, 0, 0, None), + new_msg(MSG_PREPARE, 0, 2, 2, 7, 0, 0, 0, None), + new_msg(MSG_PREPARE, 0, 3, 2, 7, 0, 0, 0, None), + ]; + assert_eq!(None, contains_justified_qrc(&def, &invalid_pr, 2)); + assert!(get_justified_qrc(&def, &invalid_pr, 2).is_none()); +} + +#[test] +fn round_change_prepared_round_must_be_before_round() { + let mut def = noop_definition(); + def.nodes = 4; + let prepares = vec![ + new_msg(MSG_PREPARE, 0, 1, 2, 7, 0, 0, 0, None), + new_msg(MSG_PREPARE, 0, 2, 2, 7, 0, 0, 0, None), + new_msg(MSG_PREPARE, 0, 3, 2, 7, 0, 0, 0, None), + ]; + let invalid = new_msg(MSG_ROUND_CHANGE, 0, 1, 2, 0, 0, 2, 7, Some(&prepares)); + + assert!(!is_justified_round_change(&def, &invalid)); } #[test] diff --git a/crates/core/src/qbft/mod.rs b/crates/core/src/qbft/mod.rs index 3e0c6ba6..e1786459 100644 --- a/crates/core/src/qbft/mod.rs +++ b/crates/core/src/qbft/mod.rs @@ -913,6 +913,13 @@ where let pr = msg.prepared_round(); let pv = msg.prepared_value(); + // The IBFT paper requires ROUND-CHANGE prepared_round to be lower than the + // target round. Go core currently omits this check, but valid Charon traffic + // already satisfies it. + if !valid_round_change_prepared_round(msg) { + return false; + } + if prepares.is_empty() { return pr == 0 && pv == Default::default(); } @@ -946,6 +953,14 @@ where true } +fn valid_round_change_prepared_round(msg: &Msg) -> bool +where + V: PartialEq, +{ + let pr = msg.prepared_round(); + pr >= 0 && pr < msg.round() +} + /// Returns true if the decided message is justified by quorum COMMIT messages /// of identical round and value. fn is_justified_decided(d: &Definition, msg: &Msg) -> bool @@ -1018,6 +1033,11 @@ where if (qrc.len() as i64) < d.quorum() { return None; } + + if qrc.iter().any(|rc| !valid_round_change_prepared_round(rc)) { + return None; + } + // No need to calculate J1 or J2 for all possible combinations, // since justification should only contain one. @@ -1111,7 +1131,10 @@ where return Some(qrc); } - let round_changes = filter_round_change(all, round); + let round_changes = filter_round_change(all, round) + .into_iter() + .filter(valid_round_change_prepared_round) + .collect::>(); for prepares in get_prepare_quorums(d, all) { // See if we have quorum ROUND-CHANGE with HIGHEST_PREPARED(qrc) == @@ -1259,7 +1282,10 @@ where let null_pr = Default::default(); let null_pv = Some(&Default::default()); - let justification = filter_msgs(all, MSG_ROUND_CHANGE, round, None, Some(null_pr), null_pv); + let justification = filter_msgs(all, MSG_ROUND_CHANGE, round, None, Some(null_pr), null_pv) + .into_iter() + .filter(valid_round_change_prepared_round) + .collect::>(); ( justification.clone(), From d834463c68aa53f399ed26002c66aae0561a8761 Mon Sep 17 00:00:00 2001 From: Quang Le Date: Mon, 18 May 2026 18:46:13 +0700 Subject: [PATCH 14/28] fix: test and document on run --- crates/core/src/qbft/internal_test.rs | 11 +++++++++++ crates/core/src/qbft/mod.rs | 7 +++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/crates/core/src/qbft/internal_test.rs b/crates/core/src/qbft/internal_test.rs index 33f296b5..b8fd8f8a 100644 --- a/crates/core/src/qbft/internal_test.rs +++ b/crates/core/src/qbft/internal_test.rs @@ -2013,6 +2013,17 @@ fn test_qbft_chain_split(test: ChainSplitTest) { } recv(result_chan_rx) -> res => { let q_commit = res.expect(READ_CHAN_ERR); + if test.should_halt { + cts.cancel(); + clock.cancel(); + panic!( + "halt case unexpectedly decided: q_commit={:?} elapsed={:?}\n{}", + q_commit, + clock.elapsed(), + trace.dump() + ); + } + for commit in q_commit.clone() { for previous in results.values() { if previous.value() != commit.value() { diff --git a/crates/core/src/qbft/mod.rs b/crates/core/src/qbft/mod.rs index e1786459..71eb57b7 100644 --- a/crates/core/src/qbft/mod.rs +++ b/crates/core/src/qbft/mod.rs @@ -299,8 +299,11 @@ struct DedupKey { round: i64, } -/// Executes one QBFT consensus instance until it decides, errors, or is -/// cancelled. +/// Executes one QBFT consensus instance until it errors or is cancelled. +/// +/// Decisions are reported via `Definition::decide`. After deciding, `run` +/// remains active so it can answer later `ROUND_CHANGE` messages with `DECIDED` +/// catch-up messages. /// /// `I` identifies the consensus instance, `V` is the comparable proposed value, /// and `C` is the application value used by `Definition::compare` to compare a From 739e3b389b5b6a65cdf717498307ef4dc179e67a Mon Sep 17 00:00:00 2001 From: Quang Le Date: Tue, 19 May 2026 11:04:13 +0700 Subject: [PATCH 15/28] fix: compare callback failed should timeout --- crates/core/src/qbft/internal_test.rs | 37 +++++++++++++++++++++++++++ crates/core/src/qbft/mod.rs | 8 +++--- 2 files changed, 41 insertions(+), 4 deletions(-) diff --git a/crates/core/src/qbft/internal_test.rs b/crates/core/src/qbft/internal_test.rs index b8fd8f8a..495de30e 100644 --- a/crates/core/src/qbft/internal_test.rs +++ b/crates/core/src/qbft/internal_test.rs @@ -1664,6 +1664,43 @@ fn compare_timeout_does_not_wait_for_blocked_callback() { )); } +#[test] +fn compare_callback_exit_without_status_waits_for_timer() { + let cts = CancellationTokenSource::new(); + let msg = new_msg(MSG_PRE_PREPARE, 0, 1, 1, 7, 11, 0, 0, None); + let (_vs_tx, vs_rx) = mpmc::bounded::(1); + let (timer_tx, timer_rx) = mpmc::bounded(1); + let (callback_done_tx, callback_done_rx) = mpmc::bounded(1); + + let mut def = noop_definition(); + def.compare = Arc::new(move |_, _, _, _, _, _| { + callback_done_tx.send(()).expect(WRITE_CHAN_ERR); + }); + + let (result_tx, result_rx) = mpmc::bounded(1); + thread::spawn(move || { + result_tx + .send(compare(cts.token(), &def, &msg, &vs_rx, 0, &timer_rx)) + .expect(WRITE_CHAN_ERR); + }); + + callback_done_rx + .recv_timeout(Duration::from_millis(100)) + .expect("compare callback must exit"); + assert!( + result_rx.try_recv().is_err(), + "compare must wait for timer/cancel if callback exits without status" + ); + + timer_tx.send(time::Instant::now()).expect(WRITE_CHAN_ERR); + assert!(matches!( + result_rx + .recv_timeout(Duration::from_millis(100)) + .expect("compare must return after timer fires"), + (0, Err(QbftError::TimeoutError)) + )); +} + #[test] fn compare_parent_cancel_cancels_callback_token() { let cts = CancellationTokenSource::new(); diff --git a/crates/core/src/qbft/mod.rs b/crates/core/src/qbft/mod.rs index 71eb57b7..0c53de21 100644 --- a/crates/core/src/qbft/mod.rs +++ b/crates/core/src/qbft/mod.rs @@ -648,7 +648,7 @@ where V: PartialEq + 'static, C: Clone + Send + Sync + 'static, { - let (compare_err_tx, compare_err_rx) = mpmc::bounded::>(1); + let (compare_err_tx, mut compare_err_rx) = mpmc::bounded::>(1); let (compare_value_tx, mut compare_value_rx) = mpmc::bounded::(1); // d.Compare has 2 roles: @@ -692,9 +692,9 @@ where recv(compare_err_rx) -> msg => { let err = match msg { Ok(err) => err, - Err(err) => { - compare_cts.cancel(); - return (result, Err(QbftError::ChannelError(err))); + Err(_) => { + compare_err_rx = mpmc::never(); + continue; } }; From 27744cff60b58a851631d9aec036618765536213 Mon Sep 17 00:00:00 2001 From: Quang Le Date: Tue, 19 May 2026 11:13:01 +0700 Subject: [PATCH 16/28] fix: test use timeout channel instead of sleep --- crates/core/src/qbft/internal_test.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/crates/core/src/qbft/internal_test.rs b/crates/core/src/qbft/internal_test.rs index 495de30e..08dd1960 100644 --- a/crates/core/src/qbft/internal_test.rs +++ b/crates/core/src/qbft/internal_test.rs @@ -1708,10 +1708,12 @@ fn compare_parent_cancel_cancels_callback_token() { let msg = new_msg(MSG_PRE_PREPARE, 0, 1, 1, 7, 11, 0, 0, None); let (_vs_tx, vs_rx) = mpmc::bounded::(1); let (timer_tx, timer_rx) = mpmc::bounded(1); + let (compare_started_tx, compare_started_rx) = mpmc::bounded(1); let (token_cancelled_tx, token_cancelled_rx) = mpmc::bounded(1); let mut def = noop_definition(); def.compare = Arc::new(move |ct, _, _, _, return_err, _| { + compare_started_tx.send(()).expect(WRITE_CHAN_ERR); while !ct.is_canceled() { thread::sleep(Duration::from_millis(1)); } @@ -1726,7 +1728,9 @@ fn compare_parent_cancel_cancels_callback_token() { .expect(WRITE_CHAN_ERR); }); - thread::sleep(Duration::from_millis(10)); + compare_started_rx + .recv_timeout(Duration::from_millis(100)) + .expect("compare must start"); cts.cancel(); match result_rx.recv_timeout(Duration::from_millis(100)) { From 0bd6a4f1ae8639e3f2f651d11cc5289f3a349235 Mon Sep 17 00:00:00 2001 From: Quang Le Date: Tue, 19 May 2026 11:31:20 +0700 Subject: [PATCH 17/28] fix: add comment on cancellation poll interval --- crates/core/src/qbft/mod.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/crates/core/src/qbft/mod.rs b/crates/core/src/qbft/mod.rs index 0c53de21..3e5c2062 100644 --- a/crates/core/src/qbft/mod.rs +++ b/crates/core/src/qbft/mod.rs @@ -33,7 +33,10 @@ use std::{ type Result = std::result::Result; -const CANCELLATION_POLL_INTERVAL: time::Duration = time::Duration::from_millis(1); +// The `cancellation` crate is callback-based, not channel-based, so it cannot +// be used directly in `crossbeam::select!`. Keep polling coarse: QBFT shutdown +// does not need sub-millisecond latency, and idle instances should stay cheap. +const CANCELLATION_POLL_INTERVAL: time::Duration = time::Duration::from_millis(50); type CompareFn = dyn Fn( /* ct */ &CancellationToken, From e988c35f4c14899f2582d412d5f9206eaada1057 Mon Sep 17 00:00:00 2001 From: Quang Le Date: Tue, 19 May 2026 11:36:35 +0700 Subject: [PATCH 18/28] fix: use enum for invalid defnintion error --- crates/core/src/qbft/internal_test.rs | 8 ++------ crates/core/src/qbft/mod.rs | 17 +++++++++-------- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/crates/core/src/qbft/internal_test.rs b/crates/core/src/qbft/internal_test.rs index 08dd1960..29f44138 100644 --- a/crates/core/src/qbft/internal_test.rs +++ b/crates/core/src/qbft/internal_test.rs @@ -1358,18 +1358,14 @@ fn invalid_definition_rejected() { input_rx.clone(), source_rx.clone(), ), - Err(QbftError::InvalidDefinition( - "nodes must be greater than zero" - )) + Err(QbftError::InvalidNodes { nodes: 0 }) )); def.nodes = 4; def.fifo_limit = 0; assert!(matches!( qbft::run(cts.token(), &def, &transport, &0, 1, input_rx, source_rx), - Err(QbftError::InvalidDefinition( - "fifo_limit must be greater than zero" - )) + Err(QbftError::InvalidFifoLimit { fifo_limit: 0 }) )); } diff --git a/crates/core/src/qbft/mod.rs b/crates/core/src/qbft/mod.rs index 3e5c2062..071823c2 100644 --- a/crates/core/src/qbft/mod.rs +++ b/crates/core/src/qbft/mod.rs @@ -69,8 +69,11 @@ pub enum QbftError { #[error("Zero input value not supported")] ZeroInputValue, - #[error("Invalid definition: {0}")] - InvalidDefinition(&'static str), + #[error("invalid node count: must be greater than zero, got {nodes}")] + InvalidNodes { nodes: i64 }, + + #[error("invalid FIFO limit: must be greater than zero, got {fifo_limit}")] + InvalidFifoLimit { fifo_limit: i64 }, #[error("Failed to read from channel: {0}")] ChannelError(#[from] mpmc::RecvError), @@ -621,15 +624,13 @@ where V: PartialEq, { if d.nodes <= 0 { - return Err(QbftError::InvalidDefinition( - "nodes must be greater than zero", - )); + return Err(QbftError::InvalidNodes { nodes: d.nodes }); } if d.fifo_limit <= 0 { - return Err(QbftError::InvalidDefinition( - "fifo_limit must be greater than zero", - )); + return Err(QbftError::InvalidFifoLimit { + fifo_limit: d.fifo_limit, + }); } Ok(()) From 6c65b3291d80af3cefc00f3d4d2e367041b8ccd0 Mon Sep 17 00:00:00 2001 From: Quang Le Date: Tue, 19 May 2026 11:59:03 +0700 Subject: [PATCH 19/28] fix: add more test on check valid round --- crates/core/src/qbft/internal_test.rs | 126 ++++++++++++++++++++++++-- crates/core/src/qbft/mod.rs | 9 +- 2 files changed, 122 insertions(+), 13 deletions(-) diff --git a/crates/core/src/qbft/internal_test.rs b/crates/core/src/qbft/internal_test.rs index 29f44138..4226c89a 100644 --- a/crates/core/src/qbft/internal_test.rs +++ b/crates/core/src/qbft/internal_test.rs @@ -654,6 +654,22 @@ fn new_msg( }) } +fn new_prepare_quorum(round: i64, value: i64) -> Vec { + (1..=3) + .map(|source| new_msg(MSG_PREPARE, 0, source, round, value, 0, 0, 0, None)) + .collect() +} + +fn new_round_change(source: i64, round: i64, pr: i64, pv: i64) -> TestMsgRef { + new_msg(MSG_ROUND_CHANGE, 0, source, round, 0, 0, pr, pv, None) +} + +fn new_round_change_quorum(round: i64, pr: i64, pv: i64) -> Vec { + (1..=3) + .map(|source| new_round_change(source, round, pr, pv)) + .collect() +} + // Delays the message broadcast by between 1x and 2x jitter_ms and drops // messages. fn bcast( @@ -1503,17 +1519,111 @@ fn justified_qrc_j1_and_j2() { } #[test] -fn round_change_prepared_round_must_be_before_round() { +fn valid_round_change_prepared_round_boundaries() { + let cases = [ + ("negative", 2, -1, false), + ("null at round one", 1, 0, true), + ("previous round", 2, 1, true), + ("current round", 2, 2, false), + ("future round", 2, 3, false), + ]; + + for (name, round, prepared_round, expected) in cases { + let msg = new_msg(MSG_ROUND_CHANGE, 0, 1, round, 0, 0, prepared_round, 7, None); + assert_eq!(expected, valid_round_change_prepared_round(&msg), "{name}"); + } +} + +#[test] +fn invalid_round_change_prepared_rounds_are_filtered_from_call_sites() { let mut def = noop_definition(); def.nodes = 4; - let prepares = vec![ - new_msg(MSG_PREPARE, 0, 1, 2, 7, 0, 0, 0, None), - new_msg(MSG_PREPARE, 0, 2, 2, 7, 0, 0, 0, None), - new_msg(MSG_PREPARE, 0, 3, 2, 7, 0, 0, 0, None), - ]; - let invalid = new_msg(MSG_ROUND_CHANGE, 0, 1, 2, 0, 0, 2, 7, Some(&prepares)); + let target_round = 2; + let valid_prepared_round = 1; + let value = 7; + let prepares = new_prepare_quorum(valid_prepared_round, value); + let valid_round_changes = new_round_change_quorum(target_round, valid_prepared_round, value); + + for (name, invalid_pr) in [ + ("negative", -1), + ("current round", target_round), + ("future round", target_round + 1), + ] { + let invalid_prepares = new_prepare_quorum(invalid_pr, value); + let invalid_round_change = new_msg( + MSG_ROUND_CHANGE, + 0, + 1, + target_round, + 0, + 0, + invalid_pr, + value, + Some(&invalid_prepares), + ); + assert!( + !is_justified_round_change(&def, &invalid_round_change), + "is_justified_round_change must reject {name} prepared_round" + ); + + let mut only_invalid = new_round_change_quorum(target_round, invalid_pr, value); + only_invalid.extend(invalid_prepares); + assert_eq!( + None, + contains_justified_qrc(&def, &only_invalid, target_round), + "contains_justified_qrc must not count {name} prepared_round" + ); + assert!( + get_justified_qrc(&def, &only_invalid, target_round).is_none(), + "get_justified_qrc must not count {name} prepared_round" + ); + + let mut with_invalid_extra = valid_round_changes.clone(); + with_invalid_extra.push(new_round_change(4, target_round, invalid_pr, value)); + with_invalid_extra.extend(prepares.clone()); + assert_eq!( + Some(value), + contains_justified_qrc(&def, &with_invalid_extra, target_round), + "contains_justified_qrc must ignore extra {name} prepared_round" + ); + let qrc = get_justified_qrc(&def, &with_invalid_extra, target_round) + .expect("valid quorum must remain after filtering invalid prepared_round"); + assert!( + qrc.iter() + .filter(|msg| msg.type_() == MSG_ROUND_CHANGE) + .all(valid_round_change_prepared_round), + "get_justified_qrc must filter extra {name} prepared_round" + ); + } +} - assert!(!is_justified_round_change(&def, &invalid)); +#[test] +fn quorum_null_prepared_filters_invalid_prepared_rounds() { + let mut def = noop_definition(); + def.nodes = 4; + + let valid = new_round_change_quorum(1, 0, 0); + let (qrc, ok) = quorum_null_prepared(&def, &valid, 1); + assert!(ok); + assert_eq!(3, qrc.len()); + + for (name, round, invalid_pr) in [ + ("negative", 1, -1), + ("current round", 1, 1), + ("future round", 1, 2), + ("zero round", 0, 0), + ] { + let invalid = new_round_change_quorum(round, invalid_pr, 0); + let (qrc, ok) = quorum_null_prepared(&def, &invalid, round); + assert!( + !ok, + "quorum_null_prepared must reject {name} prepared_round" + ); + assert!( + qrc.is_empty(), + "quorum_null_prepared must filter {name} prepared_round" + ); + } } #[test] diff --git a/crates/core/src/qbft/mod.rs b/crates/core/src/qbft/mod.rs index 071823c2..54b77949 100644 --- a/crates/core/src/qbft/mod.rs +++ b/crates/core/src/qbft/mod.rs @@ -1036,15 +1036,14 @@ fn contains_justified_qrc( where V: Eq + Hash + Default, { - let qrc = filter_round_change(justification, round); + let qrc = filter_round_change(justification, round) + .into_iter() + .filter(valid_round_change_prepared_round) + .collect::>(); if (qrc.len() as i64) < d.quorum() { return None; } - if qrc.iter().any(|rc| !valid_round_change_prepared_round(rc)) { - return None; - } - // No need to calculate J1 or J2 for all possible combinations, // since justification should only contain one. From b30e616aa0b79a9592d389d912d14eecf242f3a8 Mon Sep 17 00:00:00 2001 From: Quang Le Date: Tue, 19 May 2026 12:10:03 +0700 Subject: [PATCH 20/28] fix: using test-case --- crates/core/src/qbft/internal_test.rs | 553 ++++++++++---------------- 1 file changed, 205 insertions(+), 348 deletions(-) diff --git a/crates/core/src/qbft/internal_test.rs b/crates/core/src/qbft/internal_test.rs index 4226c89a..38409f61 100644 --- a/crates/core/src/qbft/internal_test.rs +++ b/crates/core/src/qbft/internal_test.rs @@ -9,6 +9,7 @@ use std::{ thread, time::Duration, }; +use test_case::test_case; const WRITE_CHAN_ERR: &str = "Failed to write to channel"; const READ_CHAN_ERR: &str = "Failed to read from channel"; @@ -899,107 +900,66 @@ impl SomeMsg for TestMsg { } } -#[test] -fn happy_0() { +#[test_case(0 ; "happy_0")] +#[test_case(1 ; "happy_1")] +fn happy(instance: i64) { test_qbft(Test { - instance: 0, - decide_round: 1, - ..Default::default() - }); -} - -#[test] -fn happy_1() { - test_qbft(Test { - instance: 1, + instance, decide_round: 1, ..Default::default() }); } -#[test] -fn prepare_round_1_decide_round_2() { - test_qbft(Test { - instance: 0, - commits_after: 1, - decide_round: 2, - prepared_val: 1, - ..Default::default() - }); -} - -#[test] -fn prepare_round_2_decide_round_3() { - test_qbft(Test { - instance: 0, - commits_after: 2, - value_delay: HashMap::from([(1, Duration::from_secs(2))]), - decide_round: 3, - prepared_val: 2, - const_period: true, - ..Default::default() - }); -} - -#[test] -fn leader_late_exp() { +#[test_case(1, None, 2, 1, false ; "prepare_round_1_decide_round_2")] +#[test_case(2, Some(2), 3, 2, true ; "prepare_round_2_decide_round_3")] +fn prepare_round( + commits_after: i32, + value_delay_secs: Option, + decide_round: i32, + prepared_val: i32, + const_period: bool, +) { test_qbft(Test { instance: 0, - start_delay: HashMap::from([(1, Duration::from_secs(2))]), - decide_round: 2, + commits_after, + value_delay: value_delay_secs + .map(|secs| HashMap::from([(1, Duration::from_secs(secs))])) + .unwrap_or_default(), + decide_round, + prepared_val, + const_period, ..Default::default() }); } -#[test] -fn leader_down_const() { +#[test_case(false ; "leader_late_exp")] +#[test_case(true ; "leader_down_const")] +fn delayed_leader_start(const_period: bool) { test_qbft(Test { instance: 0, start_delay: HashMap::from([(1, Duration::from_secs(2))]), - const_period: true, decide_round: 2, + const_period, ..Default::default() }); } -#[test] -fn very_late_exp() { - test_qbft(Test { - instance: 3, - start_delay: HashMap::from([(1, Duration::from_secs(5)), (2, Duration::from_secs(10))]), - decide_round: 4, - ..Default::default() - }); -} - -#[test] -fn very_late_const() { +#[test_case(3, false, 4, false ; "very_late_exp")] +#[test_case(1, true, 0, true ; "very_late_const")] +fn very_late_start(instance: i64, const_period: bool, decide_round: i32, random_round: bool) { test_qbft(Test { - instance: 1, + instance, start_delay: HashMap::from([(1, Duration::from_secs(5)), (2, Duration::from_secs(10))]), - const_period: true, - random_round: true, - ..Default::default() - }); -} - -#[test] -fn stagger_start_exp() { - test_qbft(Test { - instance: 0, - start_delay: HashMap::from([ - (1, Duration::from_secs(0)), - (2, Duration::from_secs(1)), - (3, Duration::from_secs(2)), - (4, Duration::from_secs(3)), - ]), - random_round: true, // Takes 1 or 2 rounds. + decide_round, + const_period, + random_round, ..Default::default() }); } -#[test] -fn stagger_start_const() { +#[test_case(false ; "stagger_start_exp")] +#[test_case(true ; "stagger_start_const")] +fn stagger_start(const_period: bool) { test_qbft(Test { instance: 0, start_delay: HashMap::from([ @@ -1008,54 +968,40 @@ fn stagger_start_const() { (3, Duration::from_secs(2)), (4, Duration::from_secs(3)), ]), - const_period: true, + const_period, random_round: true, // Takes 1 or 2 rounds. ..Default::default() }); } -#[test] -fn very_delayed_value_exp() { - test_qbft(Test { - instance: 3, - value_delay: HashMap::from([ - (1, Duration::from_secs(5)), - (2, Duration::from_secs(10)), - (4, Duration::from_secs(5)), - ]), - decide_round: 4, - ..Default::default() - }); -} - -#[test] -fn very_delayed_value_const() { - test_qbft(Test { - instance: 1, - value_delay: HashMap::from([(1, Duration::from_secs(5)), (2, Duration::from_secs(10))]), - const_period: true, - random_round: true, - ..Default::default() - }); -} +#[test_case(3, true, false, 4, false ; "very_delayed_value_exp")] +#[test_case(1, false, true, 0, true ; "very_delayed_value_const")] +fn very_delayed_value( + instance: i64, + include_process_4: bool, + const_period: bool, + decide_round: i32, + random_round: bool, +) { + let mut value_delay = + HashMap::from([(1, Duration::from_secs(5)), (2, Duration::from_secs(10))]); + if include_process_4 { + value_delay.insert(4, Duration::from_secs(5)); + } -#[test] -fn stagger_delayed_value_exp() { test_qbft(Test { - instance: 0, - value_delay: HashMap::from([ - (1, Duration::from_secs(0)), - (2, Duration::from_secs(1)), - (3, Duration::from_secs(2)), - (4, Duration::from_secs(3)), - ]), - random_round: true, + instance, + value_delay, + const_period, + decide_round, + random_round, ..Default::default() }); } -#[test] -fn stagger_delayed_value_const() { +#[test_case(false ; "stagger_delayed_value_exp")] +#[test_case(true ; "stagger_delayed_value_const")] +fn stagger_delayed_value(const_period: bool) { test_qbft(Test { instance: 0, value_delay: HashMap::from([ @@ -1064,7 +1010,7 @@ fn stagger_delayed_value_const() { (3, Duration::from_secs(2)), (4, Duration::from_secs(3)), ]), - const_period: true, + const_period, random_round: true, ..Default::default() }); @@ -1082,80 +1028,53 @@ fn round1_leader_no_value_round2_leader_offline() { }); } -#[test] -fn jitter_500ms_exp() { - test_qbft(Test { - instance: 3, - bcast_jitter_ms: 500, - random_round: true, - ..Default::default() - }); -} - -#[test] -fn jitter_200ms_const() { +#[test_case(500, false ; "jitter_500ms_exp")] +#[test_case(200, true ; "jitter_200ms_const")] +fn jitter(bcast_jitter_ms: i32, const_period: bool) { test_qbft(Test { instance: 3, - bcast_jitter_ms: 200, // 0.2-0.4s network delay * 3msgs/round == 0.6-1.2s delay per 1s round - const_period: true, - random_round: true, - ..Default::default() - }); -} - -#[test] -fn drop_10_percent_const() { - test_qbft(Test { - instance: 1, - drop_prob: HashMap::from([(1, 0.1), (2, 0.1), (3, 0.1), (4, 0.1)]), - const_period: true, + bcast_jitter_ms, + const_period, random_round: true, ..Default::default() }); } -#[test] -fn drop_30_percent_const() { - test_qbft(Test { - instance: 1, - drop_prob: HashMap::from([(1, 0.3), (2, 0.3), (3, 0.3), (4, 0.3)]), - const_period: true, - random_round: true, - ..Default::default() - }); -} - -#[test] -fn fuzz() { - test_qbft(Test { - instance: 1, - fuzz: true, - const_period: true, - decide_round: 1, - ..Default::default() - }); -} - -#[test] -fn fuzz_with_late_leader() { +#[test_case(0.1 ; "drop_10_percent_const")] +#[test_case(0.3 ; "drop_30_percent_const")] +fn dropped_messages(drop_probability: f64) { test_qbft(Test { instance: 1, - fuzz: true, - start_delay: HashMap::from([(1, Duration::from_secs(2)), (2, Duration::from_secs(2))]), + drop_prob: HashMap::from([ + (1, drop_probability), + (2, drop_probability), + (3, drop_probability), + (4, drop_probability), + ]), const_period: true, random_round: true, ..Default::default() }); } -#[test] -fn fuzz_with_very_late_leader() { +#[test_case(None, 1, false ; "fuzz")] +#[test_case(Some(2), 0, true ; "fuzz_with_late_leader")] +#[test_case(Some(10), 0, true ; "fuzz_with_very_late_leader")] +fn fuzzed(start_delay_secs: Option, decide_round: i32, random_round: bool) { test_qbft(Test { instance: 1, fuzz: true, - start_delay: HashMap::from([(1, Duration::from_secs(10)), (2, Duration::from_secs(10))]), + start_delay: start_delay_secs + .map(|secs| { + HashMap::from([ + (1, Duration::from_secs(secs)), + (2, Duration::from_secs(secs)), + ]) + }) + .unwrap_or_default(), const_period: true, - random_round: true, + decide_round, + random_round, ..Default::default() }); } @@ -1181,41 +1100,35 @@ fn noop_transport() -> Transport { } } -#[test] -fn formulas() { - let expected = [ - (1, 1, 0), - (2, 2, 0), - (3, 2, 0), - (4, 3, 1), - (5, 4, 1), - (6, 4, 1), - (7, 5, 2), - (8, 6, 2), - (9, 6, 2), - (10, 7, 3), - (11, 8, 3), - (12, 8, 3), - (13, 9, 4), - (14, 10, 4), - (15, 10, 4), - (16, 11, 5), - (17, 12, 5), - (18, 12, 5), - (19, 13, 6), - (20, 14, 6), - (21, 14, 6), - (22, 15, 7), - ]; - - for (n, q, f) in expected { - let d = Definition:: { - nodes: n, - ..noop_definition() - }; - assert_eq!(q, d.quorum(), "Quorum given N={n}"); - assert_eq!(f, d.faulty(), "Faulty given N={n}"); - } +#[test_case(1, 1, 0 ; "n1")] +#[test_case(2, 2, 0 ; "n2")] +#[test_case(3, 2, 0 ; "n3")] +#[test_case(4, 3, 1 ; "n4")] +#[test_case(5, 4, 1 ; "n5")] +#[test_case(6, 4, 1 ; "n6")] +#[test_case(7, 5, 2 ; "n7")] +#[test_case(8, 6, 2 ; "n8")] +#[test_case(9, 6, 2 ; "n9")] +#[test_case(10, 7, 3 ; "n10")] +#[test_case(11, 8, 3 ; "n11")] +#[test_case(12, 8, 3 ; "n12")] +#[test_case(13, 9, 4 ; "n13")] +#[test_case(14, 10, 4 ; "n14")] +#[test_case(15, 10, 4 ; "n15")] +#[test_case(16, 11, 5 ; "n16")] +#[test_case(17, 12, 5 ; "n17")] +#[test_case(18, 12, 5 ; "n18")] +#[test_case(19, 13, 6 ; "n19")] +#[test_case(20, 14, 6 ; "n20")] +#[test_case(21, 14, 6 ; "n21")] +#[test_case(22, 15, 7 ; "n22")] +fn formulas(n: i64, q: i64, f: i64) { + let d = Definition:: { + nodes: n, + ..noop_definition() + }; + assert_eq!(q, d.quorum(), "Quorum given N={n}"); + assert_eq!(f, d.faulty(), "Faulty given N={n}"); } #[test] @@ -1354,35 +1267,27 @@ fn idle_run_returns_when_cancelled() { )); } -#[test] -fn invalid_definition_rejected() { +#[test_case(0, 1, true ; "invalid_nodes")] +#[test_case(4, 0, false ; "invalid_fifo_limit")] +fn invalid_definition_rejected(nodes: i64, fifo_limit: i64, invalid_nodes: bool) { let cts = CancellationTokenSource::new(); let transport = noop_transport(); let (_input_tx, input_rx) = mpmc::bounded::(1); let (_source_tx, source_rx) = mpmc::bounded::(1); let mut def = noop_definition(); - def.nodes = 0; - def.fifo_limit = 1; - assert!(matches!( - qbft::run( - cts.token(), - &def, - &transport, - &0, - 1, - input_rx.clone(), - source_rx.clone(), - ), - Err(QbftError::InvalidNodes { nodes: 0 }) - )); - - def.nodes = 4; - def.fifo_limit = 0; - assert!(matches!( - qbft::run(cts.token(), &def, &transport, &0, 1, input_rx, source_rx), - Err(QbftError::InvalidFifoLimit { fifo_limit: 0 }) - )); + def.nodes = nodes; + def.fifo_limit = fifo_limit; + + let result = qbft::run(cts.token(), &def, &transport, &0, 1, input_rx, source_rx); + if invalid_nodes { + assert!(matches!(result, Err(QbftError::InvalidNodes { nodes: 0 }))); + } else { + assert!(matches!( + result, + Err(QbftError::InvalidFifoLimit { fifo_limit: 0 }) + )); + } } #[test] @@ -1518,24 +1423,20 @@ fn justified_qrc_j1_and_j2() { assert!(get_justified_qrc(&def, &invalid_pr, 2).is_none()); } -#[test] -fn valid_round_change_prepared_round_boundaries() { - let cases = [ - ("negative", 2, -1, false), - ("null at round one", 1, 0, true), - ("previous round", 2, 1, true), - ("current round", 2, 2, false), - ("future round", 2, 3, false), - ]; - - for (name, round, prepared_round, expected) in cases { - let msg = new_msg(MSG_ROUND_CHANGE, 0, 1, round, 0, 0, prepared_round, 7, None); - assert_eq!(expected, valid_round_change_prepared_round(&msg), "{name}"); - } +#[test_case(2, -1, false ; "negative")] +#[test_case(1, 0, true ; "null_at_round_one")] +#[test_case(2, 1, true ; "previous_round")] +#[test_case(2, 2, false ; "current_round")] +#[test_case(2, 3, false ; "future_round")] +fn valid_round_change_prepared_round_boundaries(round: i64, prepared_round: i64, expected: bool) { + let msg = new_msg(MSG_ROUND_CHANGE, 0, 1, round, 0, 0, prepared_round, 7, None); + assert_eq!(expected, valid_round_change_prepared_round(&msg)); } -#[test] -fn invalid_round_change_prepared_rounds_are_filtered_from_call_sites() { +#[test_case(-1 ; "negative")] +#[test_case(2 ; "current_round")] +#[test_case(3 ; "future_round")] +fn invalid_round_change_prepared_rounds_are_filtered_from_call_sites(invalid_pr: i64) { let mut def = noop_definition(); def.nodes = 4; let target_round = 2; @@ -1544,61 +1445,49 @@ fn invalid_round_change_prepared_rounds_are_filtered_from_call_sites() { let prepares = new_prepare_quorum(valid_prepared_round, value); let valid_round_changes = new_round_change_quorum(target_round, valid_prepared_round, value); - for (name, invalid_pr) in [ - ("negative", -1), - ("current round", target_round), - ("future round", target_round + 1), - ] { - let invalid_prepares = new_prepare_quorum(invalid_pr, value); - let invalid_round_change = new_msg( - MSG_ROUND_CHANGE, - 0, - 1, - target_round, - 0, - 0, - invalid_pr, - value, - Some(&invalid_prepares), - ); - assert!( - !is_justified_round_change(&def, &invalid_round_change), - "is_justified_round_change must reject {name} prepared_round" - ); - - let mut only_invalid = new_round_change_quorum(target_round, invalid_pr, value); - only_invalid.extend(invalid_prepares); - assert_eq!( - None, - contains_justified_qrc(&def, &only_invalid, target_round), - "contains_justified_qrc must not count {name} prepared_round" - ); - assert!( - get_justified_qrc(&def, &only_invalid, target_round).is_none(), - "get_justified_qrc must not count {name} prepared_round" - ); - - let mut with_invalid_extra = valid_round_changes.clone(); - with_invalid_extra.push(new_round_change(4, target_round, invalid_pr, value)); - with_invalid_extra.extend(prepares.clone()); - assert_eq!( - Some(value), - contains_justified_qrc(&def, &with_invalid_extra, target_round), - "contains_justified_qrc must ignore extra {name} prepared_round" - ); - let qrc = get_justified_qrc(&def, &with_invalid_extra, target_round) - .expect("valid quorum must remain after filtering invalid prepared_round"); - assert!( - qrc.iter() - .filter(|msg| msg.type_() == MSG_ROUND_CHANGE) - .all(valid_round_change_prepared_round), - "get_justified_qrc must filter extra {name} prepared_round" - ); - } + let invalid_prepares = new_prepare_quorum(invalid_pr, value); + let invalid_round_change = new_msg( + MSG_ROUND_CHANGE, + 0, + 1, + target_round, + 0, + 0, + invalid_pr, + value, + Some(&invalid_prepares), + ); + assert!(!is_justified_round_change(&def, &invalid_round_change)); + + let mut only_invalid = new_round_change_quorum(target_round, invalid_pr, value); + only_invalid.extend(invalid_prepares); + assert_eq!( + None, + contains_justified_qrc(&def, &only_invalid, target_round) + ); + assert!(get_justified_qrc(&def, &only_invalid, target_round).is_none()); + + let mut with_invalid_extra = valid_round_changes.clone(); + with_invalid_extra.push(new_round_change(4, target_round, invalid_pr, value)); + with_invalid_extra.extend(prepares.clone()); + assert_eq!( + Some(value), + contains_justified_qrc(&def, &with_invalid_extra, target_round) + ); + let qrc = get_justified_qrc(&def, &with_invalid_extra, target_round) + .expect("valid quorum must remain after filtering invalid prepared_round"); + assert!( + qrc.iter() + .filter(|msg| msg.type_() == MSG_ROUND_CHANGE) + .all(valid_round_change_prepared_round) + ); } -#[test] -fn quorum_null_prepared_filters_invalid_prepared_rounds() { +#[test_case(1, -1 ; "negative")] +#[test_case(1, 1 ; "current_round")] +#[test_case(1, 2 ; "future_round")] +#[test_case(0, 0 ; "zero_round")] +fn quorum_null_prepared_filters_invalid_prepared_rounds(round: i64, invalid_pr: i64) { let mut def = noop_definition(); def.nodes = 4; @@ -1607,23 +1496,10 @@ fn quorum_null_prepared_filters_invalid_prepared_rounds() { assert!(ok); assert_eq!(3, qrc.len()); - for (name, round, invalid_pr) in [ - ("negative", 1, -1), - ("current round", 1, 1), - ("future round", 1, 2), - ("zero round", 0, 0), - ] { - let invalid = new_round_change_quorum(round, invalid_pr, 0); - let (qrc, ok) = quorum_null_prepared(&def, &invalid, round); - assert!( - !ok, - "quorum_null_prepared must reject {name} prepared_round" - ); - assert!( - qrc.is_empty(), - "quorum_null_prepared must filter {name} prepared_round" - ); - } + let invalid = new_round_change_quorum(round, invalid_pr, 0); + let (qrc, ok) = quorum_null_prepared(&def, &invalid, round); + assert!(!ok); + assert!(qrc.is_empty()); } #[test] @@ -1936,43 +1812,24 @@ struct ChainSplitTest { should_halt: bool, } -#[test] -fn chain_split_same_value() { - test_qbft_chain_split(ChainSplitTest { - decide_round: 1, - value_source: HashMap::from([(1, 1), (2, 1), (3, 1), (4, 1)]), - prepared_val: 1, - should_halt: false, - }); -} - -#[test] -fn chain_split_non_leader_peer_has_different_value() { - test_qbft_chain_split(ChainSplitTest { - decide_round: 1, - value_source: HashMap::from([(1, 1), (2, 3), (3, 1), (4, 1)]), - prepared_val: 1, - should_halt: false, - }); -} - -#[test] -fn chain_split_first_leader_has_different_value_second_leader_succeeds() { - test_qbft_chain_split(ChainSplitTest { - decide_round: 2, - value_source: HashMap::from([(1, 3), (2, 1), (3, 1), (4, 1)]), - prepared_val: 1, - should_halt: false, - }); -} - -#[test] -fn zz_chain_split_no_consensus_halt() { +#[test_case(1, 1, 1, 1, 1, 1, false ; "same_value")] +#[test_case(1, 3, 1, 1, 1, 1, false ; "non_leader_peer_has_different_value")] +#[test_case(3, 1, 1, 1, 2, 1, false ; "first_leader_has_different_value_second_leader_succeeds")] +#[test_case(1, 1, 3, 3, 0, 0, true ; "zz_no_consensus_halt")] +fn chain_split( + value_1: i64, + value_2: i64, + value_3: i64, + value_4: i64, + decide_round: i32, + prepared_val: i32, + should_halt: bool, +) { test_qbft_chain_split(ChainSplitTest { - decide_round: 0, - value_source: HashMap::from([(1, 1), (2, 1), (3, 3), (4, 3)]), - prepared_val: 0, - should_halt: true, + decide_round, + value_source: HashMap::from([(1, value_1), (2, value_2), (3, value_3), (4, value_4)]), + prepared_val, + should_halt, }); } From 4a364dff40811433efe1604c1abc11aea7ee0e0a Mon Sep 17 00:00:00 2001 From: Quang Le Date: Tue, 19 May 2026 13:32:36 +0700 Subject: [PATCH 21/28] fix: small fixes --- crates/core/src/qbft/fake_clock.rs | 1 + crates/core/src/qbft/internal_test.rs | 2 +- crates/core/src/qbft/mod.rs | 12 +++++++++--- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/crates/core/src/qbft/fake_clock.rs b/crates/core/src/qbft/fake_clock.rs index 8392e7f3..75d07ef6 100644 --- a/crates/core/src/qbft/fake_clock.rs +++ b/crates/core/src/qbft/fake_clock.rs @@ -92,6 +92,7 @@ impl FakeClock { inner.now - inner.start } + /// Explicit cleanup; dropping one clone must not cancel timers owned by other clones. pub fn cancel(&self) { let mut inner = self.inner.lock().unwrap(); inner.clients.clear(); diff --git a/crates/core/src/qbft/internal_test.rs b/crates/core/src/qbft/internal_test.rs index 38409f61..1ed16e11 100644 --- a/crates/core/src/qbft/internal_test.rs +++ b/crates/core/src/qbft/internal_test.rs @@ -1815,7 +1815,7 @@ struct ChainSplitTest { #[test_case(1, 1, 1, 1, 1, 1, false ; "same_value")] #[test_case(1, 3, 1, 1, 1, 1, false ; "non_leader_peer_has_different_value")] #[test_case(3, 1, 1, 1, 2, 1, false ; "first_leader_has_different_value_second_leader_succeeds")] -#[test_case(1, 1, 3, 3, 0, 0, true ; "zz_no_consensus_halt")] +#[test_case(1, 1, 3, 3, 0, 0, true ; "no_consensus_halt")] fn chain_split( value_1: i64, value_2: i64, diff --git a/crates/core/src/qbft/mod.rs b/crates/core/src/qbft/mod.rs index 54b77949..2537611d 100644 --- a/crates/core/src/qbft/mod.rs +++ b/crates/core/src/qbft/mod.rs @@ -57,8 +57,8 @@ pub enum QbftError { #[error("Compare leader value with local value failed")] CompareError, - #[error("bug: expected only comparison or timeout error")] - UnexpectedCompareError, + #[error("bug: expected only comparison or timeout error, got {0}")] + UnexpectedCompareError(Box), #[error("context canceled")] ContextCanceled, @@ -534,7 +534,11 @@ where broadcast_round_change()?; } QbftError::ContextCanceled => return Err(QbftError::ContextCanceled), - _ => return Err(QbftError::UnexpectedCompareError), + _ => { + return Err(QbftError::UnexpectedCompareError(Box::new( + qbft_err, + ))); + } } } } @@ -1288,6 +1292,8 @@ where let null_pr = Default::default(); let null_pv = Some(&Default::default()); + // Normal callers pass round >= 1, which makes pr=0 valid. Keep the explicit + // prepared-round filter so this helper remains safe for direct/future calls. let justification = filter_msgs(all, MSG_ROUND_CHANGE, round, None, Some(null_pr), null_pv) .into_iter() .filter(valid_round_change_prepared_round) From e759a81c8112593f68701a78affe82e260c1e07d Mon Sep 17 00:00:00 2001 From: Quang Le Date: Tue, 19 May 2026 13:38:42 +0700 Subject: [PATCH 22/28] fix: linter --- crates/core/src/qbft/fake_clock.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crates/core/src/qbft/fake_clock.rs b/crates/core/src/qbft/fake_clock.rs index 75d07ef6..2d6c239d 100644 --- a/crates/core/src/qbft/fake_clock.rs +++ b/crates/core/src/qbft/fake_clock.rs @@ -92,7 +92,8 @@ impl FakeClock { inner.now - inner.start } - /// Explicit cleanup; dropping one clone must not cancel timers owned by other clones. + /// Explicit cleanup; dropping one clone must not cancel timers owned by + /// other clones. pub fn cancel(&self) { let mut inner = self.inner.lock().unwrap(); inner.clients.clear(); From 998b9ce0bc3e449a29944dfaa698c93f6a291253 Mon Sep 17 00:00:00 2001 From: Quang Le Date: Tue, 19 May 2026 14:59:02 +0700 Subject: [PATCH 23/28] fix: remove unnecessary filter --- crates/core/src/qbft/internal_test.rs | 3 +-- crates/core/src/qbft/mod.rs | 7 +------ 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/crates/core/src/qbft/internal_test.rs b/crates/core/src/qbft/internal_test.rs index 1ed16e11..af4cd57d 100644 --- a/crates/core/src/qbft/internal_test.rs +++ b/crates/core/src/qbft/internal_test.rs @@ -1486,8 +1486,7 @@ fn invalid_round_change_prepared_rounds_are_filtered_from_call_sites(invalid_pr: #[test_case(1, -1 ; "negative")] #[test_case(1, 1 ; "current_round")] #[test_case(1, 2 ; "future_round")] -#[test_case(0, 0 ; "zero_round")] -fn quorum_null_prepared_filters_invalid_prepared_rounds(round: i64, invalid_pr: i64) { +fn quorum_null_prepared_requires_null_prepared_rounds(round: i64, invalid_pr: i64) { let mut def = noop_definition(); def.nodes = 4; diff --git a/crates/core/src/qbft/mod.rs b/crates/core/src/qbft/mod.rs index 2537611d..956f17e8 100644 --- a/crates/core/src/qbft/mod.rs +++ b/crates/core/src/qbft/mod.rs @@ -1292,12 +1292,7 @@ where let null_pr = Default::default(); let null_pv = Some(&Default::default()); - // Normal callers pass round >= 1, which makes pr=0 valid. Keep the explicit - // prepared-round filter so this helper remains safe for direct/future calls. - let justification = filter_msgs(all, MSG_ROUND_CHANGE, round, None, Some(null_pr), null_pv) - .into_iter() - .filter(valid_round_change_prepared_round) - .collect::>(); + let justification = filter_msgs(all, MSG_ROUND_CHANGE, round, None, Some(null_pr), null_pv); ( justification.clone(), From 0451f2030c14c69a2101e90458e25b32eb780093 Mon Sep 17 00:00:00 2001 From: Quang Le Date: Tue, 19 May 2026 23:42:51 +0700 Subject: [PATCH 24/28] fix: make_is_leader now 0-based --- crates/core/src/qbft/internal_test.rs | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/crates/core/src/qbft/internal_test.rs b/crates/core/src/qbft/internal_test.rs index af4cd57d..8b8aa42d 100644 --- a/crates/core/src/qbft/internal_test.rs +++ b/crates/core/src/qbft/internal_test.rs @@ -607,9 +607,7 @@ fn seed_from_label(label: &str) -> u64 { /// Construct a leader election function. fn make_is_leader(n: i64) -> impl Fn(&i64, i64, i64) -> bool + Clone { move |instance: &i64, round: i64, process: i64| -> bool { - // Go's helper uses 0-indexed process IDs. These Rust tests use 1..=n, - // so translate the same modulo schedule into the local process range. - ((instance + round - 1).rem_euclid(n)) + 1 == process + (instance + round).rem_euclid(n) == process } } @@ -974,24 +972,12 @@ fn stagger_start(const_period: bool) { }); } -#[test_case(3, true, false, 4, false ; "very_delayed_value_exp")] -#[test_case(1, false, true, 0, true ; "very_delayed_value_const")] -fn very_delayed_value( - instance: i64, - include_process_4: bool, - const_period: bool, - decide_round: i32, - random_round: bool, -) { - let mut value_delay = - HashMap::from([(1, Duration::from_secs(5)), (2, Duration::from_secs(10))]); - if include_process_4 { - value_delay.insert(4, Duration::from_secs(5)); - } - +#[test_case(3, false, 4, false ; "very_delayed_value_exp")] +#[test_case(1, true, 0, true ; "very_delayed_value_const")] +fn very_delayed_value(instance: i64, const_period: bool, decide_round: i32, random_round: bool) { test_qbft(Test { instance, - value_delay, + value_delay: HashMap::from([(1, Duration::from_secs(5)), (2, Duration::from_secs(10))]), const_period, decide_round, random_round, From e0e51f78b21ead3047bb1cfcd669118e531db508 Mon Sep 17 00:00:00 2001 From: Quang Le Date: Wed, 20 May 2026 00:11:31 +0700 Subject: [PATCH 25/28] fix: add cancelled state for fake clock --- crates/core/src/qbft/fake_clock.rs | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/crates/core/src/qbft/fake_clock.rs b/crates/core/src/qbft/fake_clock.rs index 2d6c239d..686b549c 100644 --- a/crates/core/src/qbft/fake_clock.rs +++ b/crates/core/src/qbft/fake_clock.rs @@ -15,6 +15,7 @@ struct FakeClockInner { start: Instant, now: Instant, last_id: usize, + cancelled: bool, clients: HashMap, Instant)>, } @@ -25,6 +26,7 @@ impl FakeClock { start: now, now, last_id: 1, + cancelled: false, clients: Default::default(), })), } @@ -41,6 +43,10 @@ impl FakeClock { let client_id = { let mut inner = self.inner.lock().unwrap(); + if inner.cancelled { + return (rx, Box::new(|| {})); + } + let id = inner.last_id; let deadline = inner.now + duration; @@ -92,10 +98,11 @@ impl FakeClock { inner.now - inner.start } - /// Explicit cleanup; dropping one clone must not cancel timers owned by - /// other clones. + /// Explicit terminal cleanup; do not reintroduce `Drop`, since dropping one + /// clone must not cancel timers owned by other clones. pub fn cancel(&self) { let mut inner = self.inner.lock().unwrap(); + inner.cancelled = true; inner.clients.clear(); } } @@ -160,6 +167,20 @@ fn multiple_threads_cancellation() { assert_eq!(Duration::ZERO, clock.elapsed()); } +#[test] +fn timer_created_after_cancel_is_closed() { + let clock = FakeClock::new(Instant::now()); + clock.cancel(); + + let (ch, cancel) = clock.new_timer(Duration::from_secs(5)); + + assert!(matches!( + ch.try_recv(), + Err(mpmc::TryRecvError::Disconnected) + )); + cancel(); +} + #[test] fn cancel_one_timer_only() { let clock = FakeClock::new(Instant::now()); From 6cc22da8e7566ea3ebac42f99586099e29ed7c6f Mon Sep 17 00:00:00 2001 From: Quang Le Date: Wed, 20 May 2026 00:52:33 +0700 Subject: [PATCH 26/28] fix: flanky tests by ordering and add small settle window time --- crates/core/src/qbft/fake_clock.rs | 8 +++++--- crates/core/src/qbft/internal_test.rs | 12 +++++++----- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/crates/core/src/qbft/fake_clock.rs b/crates/core/src/qbft/fake_clock.rs index 686b549c..d03063c2 100644 --- a/crates/core/src/qbft/fake_clock.rs +++ b/crates/core/src/qbft/fake_clock.rs @@ -1,6 +1,6 @@ use crossbeam::channel as mpmc; use std::{ - collections::HashMap, + collections::BTreeMap, sync::{Arc, Mutex}, thread, time::{Duration, Instant}, @@ -16,7 +16,7 @@ struct FakeClockInner { now: Instant, last_id: usize, cancelled: bool, - clients: HashMap, Instant)>, + clients: BTreeMap, Instant)>, } impl FakeClock { @@ -75,7 +75,9 @@ impl FakeClock { inner.now += duration; let now = inner.now; - for (&id, (ch, deadline)) in inner.clients.iter() { + // Preserve timer creation order for equal deadlines. The QBFT + // fake-clock harness asserts exact rounds, so timer ordering matters. + for (&id, (ch, deadline)) in &inner.clients { if *deadline <= now { expired.push((id, ch.clone())); } diff --git a/crates/core/src/qbft/internal_test.rs b/crates/core/src/qbft/internal_test.rs index 8b8aa42d..4d321b16 100644 --- a/crates/core/src/qbft/internal_test.rs +++ b/crates/core/src/qbft/internal_test.rs @@ -276,10 +276,11 @@ fn test_qbft(test: Test) { .as_ref() .expect("value sender kept until run returns") .clone(); + let (delay_ch, cancel) = clock.new_timer(delay); s.spawn(move || { - let (delay_ch, cancel) = clock.new_timer(delay); - _ = delay_ch.recv(); - _ = v_chan_tx_send.send(i); + if delay_ch.recv().is_ok() { + _ = v_chan_tx_send.send(i); + } cancel(); }); @@ -378,7 +379,6 @@ fn test_qbft(test: Test) { } BroadcastEvent::Delayed(delayed) => pending.push(delayed), } - clock.advance(Duration::from_millis(1)); if clock.elapsed() > Duration::from_secs(180) || real_start.elapsed() > Duration::from_secs(20) { cts.cancel(); clock.cancel(); @@ -517,7 +517,9 @@ fn test_qbft(test: Test) { } default => { - thread::sleep(time::Duration::from_micros(1)); + // Give worker threads a small scheduling window before + // advancing fake time; these tests assert exact rounds. + thread::sleep(Duration::from_micros(50)); clock.advance(Duration::from_millis(1)); if clock.elapsed() > Duration::from_secs(180) || real_start.elapsed() > Duration::from_secs(20) { cts.cancel(); From 7e48b5658c0dddef31e0c7878263925b4c61d594 Mon Sep 17 00:00:00 2001 From: Quang Le Date: Wed, 20 May 2026 16:29:31 +0700 Subject: [PATCH 27/28] fix: harden fake-clock to avoid flanky scheduling --- crates/core/src/qbft/fake_clock.rs | 126 +++++++-- crates/core/src/qbft/internal_test.rs | 357 ++++++++++++++++++++++---- 2 files changed, 411 insertions(+), 72 deletions(-) diff --git a/crates/core/src/qbft/fake_clock.rs b/crates/core/src/qbft/fake_clock.rs index d03063c2..160cae56 100644 --- a/crates/core/src/qbft/fake_clock.rs +++ b/crates/core/src/qbft/fake_clock.rs @@ -1,7 +1,10 @@ use crossbeam::channel as mpmc; use std::{ collections::BTreeMap, - sync::{Arc, Mutex}, + sync::{ + Arc, Mutex, + atomic::{AtomicIsize, Ordering}, + }, thread, time::{Duration, Instant}, }; @@ -11,15 +14,25 @@ pub struct FakeClock { inner: Arc>, } +#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd)] +pub enum TimerPriority { + // Match the Go fake-clock harness at equal deadlines: delayed nodes start + // before protocol timers, while delayed input values arrive after them. + StartDelay, + Protocol, + InputValue, +} + struct FakeClockInner { start: Instant, now: Instant, last_id: usize, cancelled: bool, - clients: BTreeMap, Instant)>, + clients: BTreeMap, Instant, TimerPriority)>, } impl FakeClock { + /// Create a fake clock pinned to an initial instant. pub fn new(now: Instant) -> Self { Self { inner: Arc::new(Mutex::new(FakeClockInner { @@ -32,6 +45,7 @@ impl FakeClock { } } + /// Register a protocol timer with the default protocol priority. pub fn new_timer( &self, duration: Duration, @@ -39,7 +53,22 @@ impl FakeClock { mpmc::Receiver, Box, ) { - let (tx, rx) = mpmc::bounded::(1); + self.new_timer_with_priority(duration, TimerPriority::Protocol) + } + + /// Register a timer with explicit same-deadline ordering priority. + pub fn new_timer_with_priority( + &self, + duration: Duration, + priority: TimerPriority, + ) -> ( + mpmc::Receiver, + Box, + ) { + // Synchronous expiry handoff: advancing fake time must wait until the + // timer owner observes the tick, otherwise exact-round QBFT tests race + // worker scheduling. + let (tx, rx) = mpmc::bounded::(0); let client_id = { let mut inner = self.inner.lock().unwrap(); @@ -51,7 +80,7 @@ impl FakeClock { let deadline = inner.now + duration; inner.last_id += 1; - inner.clients.insert(id, (tx, deadline)); + inner.clients.insert(id, (tx, deadline, priority)); id }; @@ -65,7 +94,26 @@ impl FakeClock { (rx, cancel) } - pub fn advance(&self, duration: Duration) { + /// Advance fake time and deliver all timers expired by the new time. + pub fn advance(&self, duration: Duration) -> usize { + self.advance_inner(duration, None) + } + + /// Advance fake time and wait for each delivered timer action to complete. + pub fn advance_and_wait( + &self, + duration: Duration, + pending_timer_actions: &AtomicIsize, + ) -> usize { + self.advance_inner(duration, Some(pending_timer_actions)) + } + + /// Shared advance path; optionally synchronizes timer delivery with the test harness. + fn advance_inner( + &self, + duration: Duration, + pending_timer_actions: Option<&AtomicIsize>, + ) -> usize { // Advance time and collect expired senders under lock, but perform sends // without holding lock. let mut expired = vec![]; @@ -75,31 +123,55 @@ impl FakeClock { inner.now += duration; let now = inner.now; - // Preserve timer creation order for equal deadlines. The QBFT - // fake-clock harness asserts exact rounds, so timer ordering matters. - for (&id, (ch, deadline)) in &inner.clients { + for (&id, (ch, deadline, priority)) in &inner.clients { if *deadline <= now { - expired.push((id, ch.clone())); + expired.push((id, *deadline, *priority, ch.clone())); } } - for (id, _) in expired.iter() { + for (id, _, _, _) in expired.iter() { inner.clients.remove(id); } now }; - for (_, ch) in expired { - let _ = ch.send(now); + // Equal-deadline order is part of the test harness contract: these + // tests assert exact rounds, and Go's fake-clock scheduling is stable. + expired.sort_by_key(|(id, deadline, priority, _)| (*deadline, *priority, *id)); + + let mut delivered = 0; + for (_, _, _, ch) in expired { + if let Some(pending_timer_actions) = pending_timer_actions { + pending_timer_actions.fetch_add(1, Ordering::SeqCst); + } + + if ch.send(now).is_ok() { + delivered += 1; + if let Some(pending_timer_actions) = pending_timer_actions { + while pending_timer_actions.load(Ordering::SeqCst) > 0 { + thread::yield_now(); + } + } + } else if let Some(pending_timer_actions) = pending_timer_actions { + pending_timer_actions.fetch_sub(1, Ordering::SeqCst); + } } + + delivered } + /// Return fake time elapsed since clock creation. pub fn elapsed(&self) -> Duration { let inner = self.inner.lock().unwrap(); inner.now - inner.start } + /// Return currently registered timers. + pub fn timer_count(&self) -> usize { + self.inner.lock().unwrap().clients.len() + } + /// Explicit terminal cleanup; do not reintroduce `Drop`, since dropping one /// clone must not cancel timers owned by other clones. pub fn cancel(&self) { @@ -110,6 +182,7 @@ impl FakeClock { } #[test] +/// Timers registered by different threads fire after fake time passes deadlines. fn multiple_threads_timers() { let clock = FakeClock::new(Instant::now()); let (done_tx, done_rx) = mpmc::bounded(2); @@ -141,6 +214,7 @@ fn multiple_threads_timers() { } #[test] +/// Cancelling the clock closes outstanding timers without advancing fake time. fn multiple_threads_cancellation() { let clock = FakeClock::new(Instant::now()); let (done_tx, done_rx) = mpmc::bounded(2); @@ -170,6 +244,7 @@ fn multiple_threads_cancellation() { } #[test] +/// A timer created after clock cancellation is immediately closed. fn timer_created_after_cancel_is_closed() { let clock = FakeClock::new(Instant::now()); clock.cancel(); @@ -184,25 +259,42 @@ fn timer_created_after_cancel_is_closed() { } #[test] +/// Cancelling one timer does not affect other timers with the same deadline. fn cancel_one_timer_only() { let clock = FakeClock::new(Instant::now()); let (ch_1, cancel_1) = clock.new_timer(Duration::from_secs(5)); let (ch_2, _) = clock.new_timer(Duration::from_secs(5)); + let (done_tx, done_rx) = mpmc::bounded(1); cancel_1(); - clock.advance(Duration::from_secs(5)); + thread::scope(|s| { + s.spawn(move || { + done_tx.send(ch_2.recv().is_ok()).unwrap(); + }); + + clock.advance(Duration::from_secs(5)); + }); assert!(ch_1.try_recv().is_err()); - assert!(ch_2.try_recv().is_ok()); + assert_eq!(Ok(true), done_rx.try_recv()); } #[test] +/// An expired timer is delivered once and removed from the clock. fn expired_timer_delivers_once() { let clock = FakeClock::new(Instant::now()); let (ch, _) = clock.new_timer(Duration::from_secs(5)); + let (done_tx, done_rx) = mpmc::bounded(1); + thread::scope(|s| { + s.spawn(move || { + done_tx.send(ch.recv().is_ok()).unwrap(); + }); + + clock.advance(Duration::from_secs(5)); + }); + + assert_eq!(Ok(true), done_rx.try_recv()); clock.advance(Duration::from_secs(5)); - assert!(ch.try_recv().is_ok()); - clock.advance(Duration::from_secs(5)); - assert!(ch.try_recv().is_err()); + assert!(done_rx.try_recv().is_err()); } diff --git a/crates/core/src/qbft/internal_test.rs b/crates/core/src/qbft/internal_test.rs index 4d321b16..0973c924 100644 --- a/crates/core/src/qbft/internal_test.rs +++ b/crates/core/src/qbft/internal_test.rs @@ -1,11 +1,18 @@ -use crate::qbft::{self, fake_clock::FakeClock, *}; +use crate::qbft::{ + self, + fake_clock::{FakeClock, TimerPriority}, + *, +}; use cancellation::CancellationTokenSource; use crossbeam::channel as mpmc; use std::{ collections::{BTreeMap, HashMap}, fmt::Write as _, panic::{self, AssertUnwindSafe}, - sync::{Arc, Mutex}, + sync::{ + Arc, Mutex, + atomic::{AtomicIsize, AtomicUsize, Ordering}, + }, thread, time::Duration, }; @@ -24,10 +31,28 @@ const TEST_STREAM_MSG_ROUND: u64 = 11; const TEST_STREAM_MSG_VALUE: u64 = 12; const TEST_STREAM_MSG_PREPARED_ROUND: u64 = 13; const TEST_STREAM_MSG_PREPARED_VALUE: u64 = 14; +const TEST_WAIT_TIMEOUT: Duration = Duration::from_secs(1); +// Wall-clock guard catches lack of harness progress. Fake time still controls +// protocol progress, so slow-but-progressing parallel runs should not fail. +const TEST_STALL_TIMEOUT: Duration = Duration::from_secs(20); type RunOutcome = std::thread::Result>; type TestMsgRef = Msg; +struct PendingCompareGuard { + pending_compares: Arc, +} + +impl Drop for PendingCompareGuard { + fn drop(&mut self) { + self.pending_compares.fetch_sub(1, Ordering::SeqCst); + } +} + +fn complete_timer_action(pending_timer_actions: &AtomicIsize) { + pending_timer_actions.fetch_sub(1, Ordering::SeqCst); +} + struct PendingBroadcast { deliver_at: Duration, key: u64, @@ -65,6 +90,12 @@ struct Test { pub fuzz: bool, } +// Main QBFT simulation harness: +// 1. build one fake clock, four node transports, and QBFT callbacks; +// 2. spawn one thread per node running `qbft::run`; +// 3. route broadcasts through the in-memory network with optional drop/jitter/fuzz; +// 4. advance fake time only after pending compare/timer work is drained; +// 5. collect all decisions and assert same value plus expected round/value. fn test_qbft(test: Test) { const N: usize = 4; const MAX_ROUND: usize = 50; @@ -77,6 +108,8 @@ fn test_qbft(test: Test) { let clock = FakeClock::new(start_time); let cts = CancellationTokenSource::new(); + let pending_compares = Arc::new(AtomicUsize::new(0)); + let pending_timer_actions = Arc::new(AtomicIsize::new(0)); // Keep peer iteration deterministic. These fake-clock tests assert exact // rounds, and broadcast fanout order affects which node observes quorums // first when tests run in parallel. @@ -91,6 +124,11 @@ fn test_qbft(test: Test) { let (unjust_tx, unjust_rx) = mpmc::unbounded::(); let (result_chan_tx, result_chan_rx) = mpmc::bounded::>>(N); let (run_chan_tx, run_chan_rx) = mpmc::bounded::<(i64, RunOutcome)>(N); + let expected_initial_timers = N + test + .value_delay + .keys() + .filter(|process| !test.start_delay.contains_key(process)) + .count(); let is_leader = Box::new(make_is_leader(N as i64)); @@ -116,16 +154,27 @@ fn test_qbft(test: Test) { result_chan_tx.send(q_commit.clone()).expect(WRITE_CHAN_ERR); }) }, - compare: Arc::new(|_, _, _, _, return_err, _| { - return_err.send(Ok(())).expect(WRITE_CHAN_ERR); - }), + compare: { + let pending_compares = pending_compares.clone(); + Arc::new(move |_, _, _, _, return_err, _| { + let _guard = PendingCompareGuard { + pending_compares: pending_compares.clone(), + }; + return_err.send(Ok(())).expect(WRITE_CHAN_ERR); + }) + }, nodes: N as i64, fifo_limit: FIFO_LIMIT as i64, log_round_change: { let clock = clock.clone(); let trace = trace.clone(); + let pending_timer_actions = pending_timer_actions.clone(); Box::new(move |_, process, round, new_round, upon_rule, _| { + if upon_rule == UPON_ROUND_TIMEOUT { + complete_timer_action(&pending_timer_actions); + } + trace.push(format!( "{:?} - {}@{} change to {} ~= {}", clock.elapsed(), @@ -151,7 +200,12 @@ fn test_qbft(test: Test) { log_upon_rule: { let clock = clock.clone(); let trace = trace.clone(); + let pending_compares = pending_compares.clone(); Box::new(move |_, process, round, msg, upon_rule| { + if upon_rule == UPON_JUSTIFIED_PRE_PREPARE { + pending_compares.fetch_add(1, Ordering::SeqCst); + } + trace.push(format!( "{:?} {} => {}@{} -> {}@{} ~= {}", clock.elapsed(), @@ -239,9 +293,11 @@ fn test_qbft(test: Test) { let run_chan_tx = run_chan_tx.clone(); let defs = defs.clone(); let is_leader = is_leader.clone(); + let pending_timer_actions = pending_timer_actions.clone(); let trace = trace.clone(); s.spawn(move || { + let mut start_timer_fired = false; if let Some(delay) = start_delay { trace.push(format!( "{:?} Node {} start delay {:?}", @@ -249,14 +305,17 @@ fn test_qbft(test: Test) { i, delay )); - let (delay_ch, _) = clock.new_timer(delay); - _ = delay_ch.recv(); - trace.push(format!( - "{:?} Node {} starting {:?}", - clock.elapsed(), - i, - delay - )); + let (delay_ch, _) = + clock.new_timer_with_priority(delay, TimerPriority::StartDelay); + if delay_ch.recv().is_ok() { + start_timer_fired = true; + trace.push(format!( + "{:?} Node {} starting {:?}", + clock.elapsed(), + i, + delay + )); + } } if start_delay.is_some() { @@ -265,6 +324,9 @@ fn test_qbft(test: Test) { _ = receiver.recv().expect(READ_CHAN_ERR); } } + if start_timer_fired { + complete_timer_action(&pending_timer_actions); + } let (v_chan_tx, v_chan_rx) = mpmc::bounded::(1); let (vs_chan_tx, vs_chan_rx) = mpmc::bounded::(1); @@ -276,10 +338,13 @@ fn test_qbft(test: Test) { .as_ref() .expect("value sender kept until run returns") .clone(); - let (delay_ch, cancel) = clock.new_timer(delay); + let pending_timer_actions = pending_timer_actions.clone(); + let (delay_ch, cancel) = + clock.new_timer_with_priority(delay, TimerPriority::InputValue); s.spawn(move || { if delay_ch.recv().is_ok() { _ = v_chan_tx_send.send(i); + complete_timer_action(&pending_timer_actions); } cancel(); @@ -322,6 +387,21 @@ fn test_qbft(test: Test) { }); } + while clock.timer_count() < expected_initial_timers { + thread::yield_now(); + if real_start.elapsed() > TEST_STALL_TIMEOUT { + cts.cancel(); + clock.cancel(); + panic!( + "qbft test setup hang: timers={} expected={} seed={}\n{}", + clock.timer_count(), + expected_initial_timers, + seed, + trace.dump() + ); + } + } + let mut results = BTreeMap::>::new(); let mut count = 0; let mut decided = false; @@ -330,9 +410,10 @@ fn test_qbft(test: Test) { let mut pending = Vec::::new(); let mut next_fuzz_at = test.fuzz.then_some(Duration::from_millis(100)); let mut fuzz_counter = 0_u64; + let mut last_progress = time::Instant::now(); loop { - broadcasts += deliver_ready_broadcasts( + let delivered = deliver_ready_broadcasts( &mut pending, &receives, &test.drop_prob, @@ -340,6 +421,10 @@ fn test_qbft(test: Test) { &trace, &clock, ); + broadcasts += delivered; + if delivered > 0 { + last_progress = time::Instant::now(); + } if decided { next_fuzz_at = None; @@ -361,6 +446,7 @@ fn test_qbft(test: Test) { )); broadcasts += fanout_broadcast(&receives, &test.drop_prob, seed, &trace, &clock, msg); + last_progress = time::Instant::now(); next_fuzz_at = Some(next + Duration::from_millis(100)); } @@ -379,7 +465,8 @@ fn test_qbft(test: Test) { } BroadcastEvent::Delayed(delayed) => pending.push(delayed), } - if clock.elapsed() > Duration::from_secs(180) || real_start.elapsed() > Duration::from_secs(20) { + last_progress = time::Instant::now(); + if clock.elapsed() > Duration::from_secs(180) { cts.cancel(); clock.cancel(); panic!( @@ -405,6 +492,7 @@ fn test_qbft(test: Test) { recv(result_chan_rx) -> res => { let q_commit = res.expect(READ_CHAN_ERR); + last_progress = time::Instant::now(); for commit in q_commit.clone() { for (_, previous) in results.iter() { @@ -490,6 +578,7 @@ fn test_qbft(test: Test) { recv(run_chan_rx) -> res => { let (node, outcome) = res.expect(READ_CHAN_ERR); + last_progress = time::Instant::now(); if !matches!(outcome, Ok(Ok(()))) { if !decided { @@ -517,11 +606,36 @@ fn test_qbft(test: Test) { } default => { - // Give worker threads a small scheduling window before - // advancing fake time; these tests assert exact rounds. - thread::sleep(Duration::from_micros(50)); - clock.advance(Duration::from_millis(1)); - if clock.elapsed() > Duration::from_secs(180) || real_start.elapsed() > Duration::from_secs(20) { + if pending_compares.load(Ordering::SeqCst) != 0 + || pending_timer_actions.load(Ordering::SeqCst) > 0 + { + thread::yield_now(); + if last_progress.elapsed() > TEST_STALL_TIMEOUT { + cts.cancel(); + clock.cancel(); + panic!( + "qbft test hang: pending_compares={} pending_timer_actions={} decided={} done={} count={} elapsed={:?} real_elapsed={:?} broadcasts={} seed={}\n{}", + pending_compares.load(Ordering::SeqCst), + pending_timer_actions.load(Ordering::SeqCst), + decided, + done, + count, + clock.elapsed(), + real_start.elapsed(), + broadcasts, + seed, + trace.dump() + ); + } + continue; + } + + // Matches the Go harness throttle; ordering correctness + // comes from the pending-work barriers, not this duration. + thread::sleep(Duration::from_micros(1)); + clock.advance_and_wait(Duration::from_millis(1), &pending_timer_actions); + last_progress = time::Instant::now(); + if clock.elapsed() > Duration::from_secs(180) { cts.cancel(); clock.cancel(); panic!( @@ -900,6 +1014,8 @@ impl SomeMsg for TestMsg { } } +// Tests the normal-case path with an available round-1 leader. +// Expect all nodes to decide in round 1 on the leader value. #[test_case(0 ; "happy_0")] #[test_case(1 ; "happy_1")] fn happy(instance: i64) { @@ -910,6 +1026,8 @@ fn happy(instance: i64) { }); } +// Tests prepared-value carryover when commits are suppressed in earlier rounds. +// Expect later rounds to decide the highest prepared value, not a new leader value. #[test_case(1, None, 2, 1, false ; "prepare_round_1_decide_round_2")] #[test_case(2, Some(2), 3, 2, true ; "prepare_round_2_decide_round_3")] fn prepare_round( @@ -932,6 +1050,8 @@ fn prepare_round( }); } +// Tests round change when the first leader starts late. +// Expect the next live leader to drive consensus in round 2. #[test_case(false ; "leader_late_exp")] #[test_case(true ; "leader_down_const")] fn delayed_leader_start(const_period: bool) { @@ -944,6 +1064,8 @@ fn delayed_leader_start(const_period: bool) { }); } +// Tests recovery when two nodes, including early leaders, start much later. +// Expect consensus after enough round changes, with exact round only when deterministic. #[test_case(3, false, 4, false ; "very_late_exp")] #[test_case(1, true, 0, true ; "very_late_const")] fn very_late_start(instance: i64, const_period: bool, decide_round: i32, random_round: bool) { @@ -957,6 +1079,8 @@ fn very_late_start(instance: i64, const_period: bool, decide_round: i32, random_ }); } +// Tests staggered node startup and message buffering/draining. +// Expect consensus once enough live nodes join, with round allowed to vary. #[test_case(false ; "stagger_start_exp")] #[test_case(true ; "stagger_start_const")] fn stagger_start(const_period: bool) { @@ -974,6 +1098,8 @@ fn stagger_start(const_period: bool) { }); } +// Tests late input values on nodes that otherwise participate in the protocol. +// Expect round changes until a valid leader value is available. #[test_case(3, false, 4, false ; "very_delayed_value_exp")] #[test_case(1, true, 0, true ; "very_delayed_value_const")] fn very_delayed_value(instance: i64, const_period: bool, decide_round: i32, random_round: bool) { @@ -987,6 +1113,8 @@ fn very_delayed_value(instance: i64, const_period: bool, decide_round: i32, rand }); } +// Tests input values arriving at different fake times for all nodes. +// Expect consensus once enough nodes can validate/propose values. #[test_case(false ; "stagger_delayed_value_exp")] #[test_case(true ; "stagger_delayed_value_const")] fn stagger_delayed_value(const_period: bool) { @@ -1004,6 +1132,8 @@ fn stagger_delayed_value(const_period: bool) { }); } +// Tests a round-1 leader without input and a round-2 leader that is offline. +// Expect consensus to skip both blocked leaders and decide in round 3. #[test] fn round1_leader_no_value_round2_leader_offline() { test_qbft(Test { @@ -1016,6 +1146,8 @@ fn round1_leader_no_value_round2_leader_offline() { }); } +// Tests delayed broadcast delivery under fake network jitter. +// Expect safety and eventual consensus despite delayed messages. #[test_case(500, false ; "jitter_500ms_exp")] #[test_case(200, true ; "jitter_200ms_const")] fn jitter(bcast_jitter_ms: i32, const_period: bool) { @@ -1028,6 +1160,8 @@ fn jitter(bcast_jitter_ms: i32, const_period: bool) { }); } +// Tests deterministic message loss at 10% and 30%. +// Expect eventual consensus without conflicting decisions. #[test_case(0.1 ; "drop_10_percent_const")] #[test_case(0.3 ; "drop_30_percent_const")] fn dropped_messages(drop_probability: f64) { @@ -1045,6 +1179,8 @@ fn dropped_messages(drop_probability: f64) { }); } +// Tests bogus message injection during normal and delayed-leader scenarios. +// Expect unjust fuzz traffic to be ignored and honest nodes to still decide. #[test_case(None, 1, false ; "fuzz")] #[test_case(Some(2), 0, true ; "fuzz_with_late_leader")] #[test_case(Some(10), 0, true ; "fuzz_with_very_late_leader")] @@ -1088,6 +1224,8 @@ fn noop_transport() -> Transport { } } +// Tests quorum/faulty formulas across node counts. +// Expect quorum and tolerated-fault counts to match the Charon formula. #[test_case(1, 1, 0 ; "n1")] #[test_case(2, 2, 0 ; "n2")] #[test_case(3, 2, 0 ; "n3")] @@ -1119,6 +1257,8 @@ fn formulas(n: i64, q: i64, f: i64) { assert_eq!(f, d.faulty(), "Faulty given N={n}"); } +// Tests PRE-PREPARE justification with mixed ROUND_CHANGE and PREPARE evidence. +// Expect the proposal to be accepted when it carries a justified prepared value. #[test] fn is_justified_pre_prepare_mixed_round_change_prepare_fixture() { let preprepare = new_msg( @@ -1147,6 +1287,8 @@ fn is_justified_pre_prepare_mixed_round_change_prepare_fixture() { assert!(is_justified_pre_prepare(&def, &1, &preprepare, 0)); } +// Tests duplicate PRE-PREPARE rule handling after compare failure. +// Expect the next round proposal to trigger once and exit by cancellation. #[test] fn duplicate_pre_prepare_rules() { let cts = CancellationTokenSource::new(); @@ -1224,6 +1366,8 @@ fn duplicate_pre_prepare_rules() { assert!(matches!(res, Err(QbftError::ContextCanceled))); } +// Tests idle cancellation while no inputs, timers, or messages are available. +// Expect `run` to unblock and return `ContextCanceled`. #[test] fn idle_run_returns_when_cancelled() { let cts = CancellationTokenSource::new(); @@ -1235,8 +1379,10 @@ fn idle_run_returns_when_cancelled() { let (_input_tx, input_rx) = mpmc::bounded::(1); let (_source_tx, source_rx) = mpmc::bounded::(1); let (done_tx, done_rx) = mpmc::bounded(1); + let (started_tx, started_rx) = mpmc::bounded(1); thread::spawn(move || { + started_tx.send(()).expect(WRITE_CHAN_ERR); done_tx .send(qbft::run( &token, &def, &transport, &0, 1, input_rx, source_rx, @@ -1244,17 +1390,21 @@ fn idle_run_returns_when_cancelled() { .expect(WRITE_CHAN_ERR); }); - thread::sleep(Duration::from_millis(10)); + started_rx + .recv_timeout(TEST_WAIT_TIMEOUT) + .expect("run thread must start before cancellation"); cts.cancel(); assert!(matches!( done_rx - .recv_timeout(Duration::from_millis(100)) + .recv_timeout(TEST_WAIT_TIMEOUT) .expect("idle run must unblock on cancellation"), Err(QbftError::ContextCanceled) )); } +// Tests definition validation at the `run` boundary. +// Expect invalid node count and FIFO limit to return typed errors. #[test_case(0, 1, true ; "invalid_nodes")] #[test_case(4, 0, false ; "invalid_fifo_limit")] fn invalid_definition_rejected(nodes: i64, fifo_limit: i64, invalid_nodes: bool) { @@ -1278,6 +1428,8 @@ fn invalid_definition_rejected(nodes: i64, fifo_limit: i64, invalid_nodes: bool) } } +// Tests cancellation under a continuously hot receive channel. +// Expect cancellation to win even when incoming traffic is always ready. #[test] fn run_cancels_under_hot_receive_stream() { let cts = CancellationTokenSource::new(); @@ -1294,6 +1446,7 @@ fn run_cancels_under_hot_receive_stream() { let (_input_tx, input_rx) = mpmc::bounded::(1); let (_source_tx, source_rx) = mpmc::bounded::(1); let (done_tx, done_rx) = mpmc::bounded(1); + let (started_tx, started_rx) = mpmc::bounded(1); let sender_cts = CancellationTokenSource::new(); let sender_token = sender_cts.token().clone(); @@ -1309,6 +1462,7 @@ fn run_cancels_under_hot_receive_stream() { }); thread::spawn(move || { + started_tx.send(()).expect(WRITE_CHAN_ERR); done_tx .send(qbft::run( &token, &def, &transport, &0, 1, input_rx, source_rx, @@ -1316,12 +1470,14 @@ fn run_cancels_under_hot_receive_stream() { .expect(WRITE_CHAN_ERR); }); - thread::sleep(Duration::from_millis(10)); + started_rx + .recv_timeout(TEST_WAIT_TIMEOUT) + .expect("run thread must start before cancellation"); cts.cancel(); assert!(matches!( done_rx - .recv_timeout(Duration::from_millis(100)) + .recv_timeout(TEST_WAIT_TIMEOUT) .expect("run must unblock on cancellation even with a hot receive stream"), Err(QbftError::ContextCanceled) )); @@ -1330,6 +1486,8 @@ fn run_cancels_under_hot_receive_stream() { sender.join().expect("sender thread must exit"); } +// Tests message classification into QBFT upon-rules. +// Expect each fixture to map to the rule required by the protocol transition. #[test] fn classify_rules() { let mut def = noop_definition(); @@ -1376,6 +1534,8 @@ fn classify_rules() { ); } +// Tests ROUND_CHANGE quorum justification forms J1 and J2. +// Expect null-prepared and highest-prepared quorums to be accepted, invalid `pr` rejected. #[test] fn justified_qrc_j1_and_j2() { let mut def = noop_definition(); @@ -1411,6 +1571,8 @@ fn justified_qrc_j1_and_j2() { assert!(get_justified_qrc(&def, &invalid_pr, 2).is_none()); } +// Tests ROUND_CHANGE prepared-round bounds. +// Expect only null prepared round or strictly previous prepared rounds to be valid. #[test_case(2, -1, false ; "negative")] #[test_case(1, 0, true ; "null_at_round_one")] #[test_case(2, 1, true ; "previous_round")] @@ -1421,6 +1583,8 @@ fn valid_round_change_prepared_round_boundaries(round: i64, prepared_round: i64, assert_eq!(expected, valid_round_change_prepared_round(&msg)); } +// Tests invalid prepared rounds at every justification call site. +// Expect invalid ROUND_CHANGE messages to be filtered while valid quorums survive. #[test_case(-1 ; "negative")] #[test_case(2 ; "current_round")] #[test_case(3 ; "future_round")] @@ -1471,6 +1635,8 @@ fn invalid_round_change_prepared_rounds_are_filtered_from_call_sites(invalid_pr: ); } +// Tests null-prepared quorum filtering. +// Expect only `prepared_round = 0` and `prepared_value = 0` messages to form J1. #[test_case(1, -1 ; "negative")] #[test_case(1, 1 ; "current_round")] #[test_case(1, 2 ; "future_round")] @@ -1489,6 +1655,8 @@ fn quorum_null_prepared_requires_null_prepared_rounds(round: i64, invalid_pr: i6 assert!(qrc.is_empty()); } +// Tests duplicate sender handling in quorum filters. +// Expect at most one message per source to count toward a quorum. #[test] fn filter_msgs_keeps_one_per_source() { let msgs = vec![ @@ -1506,6 +1674,8 @@ fn filter_msgs_keeps_one_per_source() { ); } +// Tests compare outcomes: success, error, cached value source, and timeout. +// Expect cached value sources to be preserved and timer expiry to return `TimeoutError`. #[test] fn compare_success_error_cached_value_source_and_timeout() { let cts = CancellationTokenSource::new(); @@ -1602,6 +1772,8 @@ fn compare_success_error_cached_value_source_and_timeout() { )); } +// Tests compare timeout with a cooperative but blocked callback. +// Expect `compare` to return timeout without joining the callback first. #[test] fn compare_timeout_does_not_wait_for_blocked_callback() { let cts = CancellationTokenSource::new(); @@ -1627,12 +1799,14 @@ fn compare_timeout_does_not_wait_for_blocked_callback() { assert!(matches!( result_rx - .recv_timeout(Duration::from_millis(100)) + .recv_timeout(TEST_WAIT_TIMEOUT) .expect("compare must return on timer without waiting for blocked callback"), (0, Err(QbftError::TimeoutError)) )); } +// Tests a compare callback that exits without sending status. +// Expect `compare` to wait for timer/cancel instead of treating disconnect as final. #[test] fn compare_callback_exit_without_status_waits_for_timer() { let cts = CancellationTokenSource::new(); @@ -1654,7 +1828,7 @@ fn compare_callback_exit_without_status_waits_for_timer() { }); callback_done_rx - .recv_timeout(Duration::from_millis(100)) + .recv_timeout(TEST_WAIT_TIMEOUT) .expect("compare callback must exit"); assert!( result_rx.try_recv().is_err(), @@ -1664,12 +1838,14 @@ fn compare_callback_exit_without_status_waits_for_timer() { timer_tx.send(time::Instant::now()).expect(WRITE_CHAN_ERR); assert!(matches!( result_rx - .recv_timeout(Duration::from_millis(100)) + .recv_timeout(TEST_WAIT_TIMEOUT) .expect("compare must return after timer fires"), (0, Err(QbftError::TimeoutError)) )); } +// Tests parent cancellation propagation into the compare callback token. +// Expect `compare` to return `ContextCanceled` and the callback token to be canceled. #[test] fn compare_parent_cancel_cancels_callback_token() { let cts = CancellationTokenSource::new(); @@ -1698,11 +1874,11 @@ fn compare_parent_cancel_cancels_callback_token() { }); compare_started_rx - .recv_timeout(Duration::from_millis(100)) + .recv_timeout(TEST_WAIT_TIMEOUT) .expect("compare must start"); cts.cancel(); - match result_rx.recv_timeout(Duration::from_millis(100)) { + match result_rx.recv_timeout(TEST_WAIT_TIMEOUT) { Ok(result) => assert!(matches!(result, (0, Err(QbftError::ContextCanceled)))), Err(err) => { let _ = timer_tx.send(time::Instant::now()); @@ -1710,10 +1886,12 @@ fn compare_parent_cancel_cancels_callback_token() { } } token_cancelled_rx - .recv_timeout(Duration::from_millis(100)) + .recv_timeout(TEST_WAIT_TIMEOUT) .expect("callback token must be canceled by parent token"); } +// Tests parent cancellation while `run` is waiting inside compare. +// Expect `run` to return cancellation without broadcasting PREPARE. #[test] fn run_parent_cancel_during_compare_does_not_prepare() { const LEADER: i64 = 1; @@ -1761,18 +1939,18 @@ fn run_parent_cancel_during_compare_does_not_prepare() { }); compare_started_rx - .recv_timeout(Duration::from_millis(100)) + .recv_timeout(TEST_WAIT_TIMEOUT) .expect("compare must start"); cts.cancel(); assert!(matches!( done_rx - .recv_timeout(Duration::from_millis(100)) + .recv_timeout(TEST_WAIT_TIMEOUT) .expect("run must return parent cancellation from compare"), Err(QbftError::ContextCanceled) )); compare_cancelled_rx - .recv_timeout(Duration::from_millis(100)) + .recv_timeout(TEST_WAIT_TIMEOUT) .expect("compare callback token must be canceled"); assert!( broadcast_rx.try_iter().all(|type_| type_ != MSG_PREPARE), @@ -1799,6 +1977,8 @@ struct ChainSplitTest { should_halt: bool, } +// Tests value-source disagreement across nodes. +// Expect agreement when a quorum can compare equal values, or halt when no quorum can. #[test_case(1, 1, 1, 1, 1, 1, false ; "same_value")] #[test_case(1, 3, 1, 1, 1, 1, false ; "non_leader_peer_has_different_value")] #[test_case(3, 1, 1, 1, 2, 1, false ; "first_leader_has_different_value_second_leader_succeeds")] @@ -1826,8 +2006,11 @@ fn test_qbft_chain_split(test: ChainSplitTest) { const FIFO_LIMIT: i64 = 100; let clock = FakeClock::new(time::Instant::now()); + let real_start = time::Instant::now(); let cts = CancellationTokenSource::new(); let trace = Trace::new(); + let pending_compares = Arc::new(AtomicUsize::new(0)); + let pending_timer_actions = Arc::new(AtomicIsize::new(0)); // Keep peer iteration deterministic. These fake-clock tests assert exact // rounds, and broadcast fanout order affects which node observes quorums // first when tests run in parallel. @@ -1857,33 +2040,49 @@ fn test_qbft_chain_split(test: ChainSplitTest) { result_chan_tx.send(q_commit.clone()).expect(WRITE_CHAN_ERR); }) }, - compare: Arc::new( - |_, qcommit, input_value_source_ch, input_value_source, return_err, return_value| { - let leader_value_source = qcommit.value_source().expect("value source"); - let local = if *input_value_source == 0 { - let value = input_value_source_ch.recv().expect(READ_CHAN_ERR); - return_value.send(value).expect(WRITE_CHAN_ERR); - value - } else { - *input_value_source - }; - - if leader_value_source != local { - return_err - .send(Err(QbftError::CompareError)) - .expect(WRITE_CHAN_ERR); - return; - } + compare: { + let pending_compares = pending_compares.clone(); + Arc::new( + move |_, + qcommit, + input_value_source_ch, + input_value_source, + return_err, + return_value| { + let _guard = PendingCompareGuard { + pending_compares: pending_compares.clone(), + }; + let leader_value_source = qcommit.value_source().expect("value source"); + let local = if *input_value_source == 0 { + let value = input_value_source_ch.recv().expect(READ_CHAN_ERR); + return_value.send(value).expect(WRITE_CHAN_ERR); + value + } else { + *input_value_source + }; + + if leader_value_source != local { + return_err + .send(Err(QbftError::CompareError)) + .expect(WRITE_CHAN_ERR); + return; + } - return_err.send(Ok(())).expect(WRITE_CHAN_ERR); - }, - ), + return_err.send(Ok(())).expect(WRITE_CHAN_ERR); + }, + ) + }, nodes: N as i64, fifo_limit: FIFO_LIMIT, log_round_change: { let clock = clock.clone(); let trace = trace.clone(); + let pending_timer_actions = pending_timer_actions.clone(); Box::new(move |_, process, round, new_round, upon_rule, _| { + if upon_rule == UPON_ROUND_TIMEOUT { + complete_timer_action(&pending_timer_actions); + } + trace.push(format!( "{:?} - {}@{} change to {} ~= {}", clock.elapsed(), @@ -1903,7 +2102,12 @@ fn test_qbft_chain_split(test: ChainSplitTest) { log_upon_rule: { let clock = clock.clone(); let trace = trace.clone(); + let pending_compares = pending_compares.clone(); Box::new(move |_, process, round, msg, upon_rule| { + if upon_rule == UPON_JUSTIFIED_PRE_PREPARE { + pending_compares.fetch_add(1, Ordering::SeqCst); + } + trace.push(format!( "{:?} {} => {}@{} -> {}@{} ~= {}", clock.elapsed(), @@ -1977,15 +2181,32 @@ fn test_qbft_chain_split(test: ChainSplitTest) { }); } + while clock.timer_count() < N { + thread::yield_now(); + if real_start.elapsed() > TEST_STALL_TIMEOUT { + cts.cancel(); + clock.cancel(); + panic!( + "chain split setup hang: timers={} expected={} elapsed={:?}\n{}", + clock.timer_count(), + N, + clock.elapsed(), + trace.dump() + ); + } + } + let mut results = BTreeMap::>::new(); let mut count = 0; let mut decided = false; let mut done = 0; + let mut last_progress = time::Instant::now(); loop { mpmc::select! { recv(broadcast_rx) -> msg => { let msg = msg.expect(READ_CHAN_ERR); + last_progress = time::Instant::now(); for (target, (out_tx, _)) in receives.iter() { if *target == msg.source() { continue; @@ -2004,6 +2225,7 @@ fn test_qbft_chain_split(test: ChainSplitTest) { } recv(result_chan_rx) -> res => { let q_commit = res.expect(READ_CHAN_ERR); + last_progress = time::Instant::now(); if test.should_halt { cts.cancel(); clock.cancel(); @@ -2066,6 +2288,7 @@ fn test_qbft_chain_split(test: ChainSplitTest) { } recv(run_chan_rx) -> res => { let (node, outcome) = res.expect(READ_CHAN_ERR); + last_progress = time::Instant::now(); let expected_halt = test.should_halt && outcome_is_error(&outcome, |err| matches!(err, QbftError::MaxRoundReached)); if !(decided || expected_halt) { @@ -2091,13 +2314,37 @@ fn test_qbft_chain_split(test: ChainSplitTest) { } } default => { + if pending_compares.load(Ordering::SeqCst) != 0 + || pending_timer_actions.load(Ordering::SeqCst) > 0 + { + thread::yield_now(); + if last_progress.elapsed() > TEST_STALL_TIMEOUT { + cts.cancel(); + clock.cancel(); + panic!( + "chain split hang: pending_compares={} pending_timer_actions={} decided={decided} done={done} count={count} elapsed={:?}\n{}", + pending_compares.load(Ordering::SeqCst), + pending_timer_actions.load(Ordering::SeqCst), + clock.elapsed(), + trace.dump() + ); + } + continue; + } + + // Matches the Go harness throttle; ordering correctness comes + // from the pending-work barriers, not this duration. thread::sleep(Duration::from_micros(1)); + // The no-consensus halt case must reach round 11; using Go's + // 1ms tick here makes this Rust harness exceed its real-time + // guard, so only that halt path fast-forwards fake time. let tick = if test.should_halt { Duration::from_millis(100) } else { Duration::from_millis(1) }; - clock.advance(tick); + clock.advance_and_wait(tick, &pending_timer_actions); + last_progress = time::Instant::now(); let limit = if test.should_halt { let max_round = u32::try_from(MAX_ROUND).expect("MAX_ROUND fits u32"); let seconds = 1_u64 From 024bff78d7ae5c8397647dfb486097194e63895c Mon Sep 17 00:00:00 2001 From: Quang Le Date: Wed, 20 May 2026 16:34:48 +0700 Subject: [PATCH 28/28] fix: lint --- crates/core/src/qbft/fake_clock.rs | 8 +++--- crates/core/src/qbft/internal_test.rs | 36 ++++++++++++++++++--------- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/crates/core/src/qbft/fake_clock.rs b/crates/core/src/qbft/fake_clock.rs index 160cae56..6bc32495 100644 --- a/crates/core/src/qbft/fake_clock.rs +++ b/crates/core/src/qbft/fake_clock.rs @@ -108,7 +108,8 @@ impl FakeClock { self.advance_inner(duration, Some(pending_timer_actions)) } - /// Shared advance path; optionally synchronizes timer delivery with the test harness. + /// Shared advance path; optionally synchronizes timer delivery with the + /// test harness. fn advance_inner( &self, duration: Duration, @@ -129,7 +130,7 @@ impl FakeClock { } } - for (id, _, _, _) in expired.iter() { + for (id, ..) in expired.iter() { inner.clients.remove(id); } @@ -182,7 +183,8 @@ impl FakeClock { } #[test] -/// Timers registered by different threads fire after fake time passes deadlines. +/// Timers registered by different threads fire after fake time passes +/// deadlines. fn multiple_threads_timers() { let clock = FakeClock::new(Instant::now()); let (done_tx, done_rx) = mpmc::bounded(2); diff --git a/crates/core/src/qbft/internal_test.rs b/crates/core/src/qbft/internal_test.rs index 0973c924..167f2c95 100644 --- a/crates/core/src/qbft/internal_test.rs +++ b/crates/core/src/qbft/internal_test.rs @@ -93,7 +93,8 @@ struct Test { // Main QBFT simulation harness: // 1. build one fake clock, four node transports, and QBFT callbacks; // 2. spawn one thread per node running `qbft::run`; -// 3. route broadcasts through the in-memory network with optional drop/jitter/fuzz; +// 3. route broadcasts through the in-memory network with optional +// drop/jitter/fuzz; // 4. advance fake time only after pending compare/timer work is drained; // 5. collect all decisions and assert same value plus expected round/value. fn test_qbft(test: Test) { @@ -1027,7 +1028,8 @@ fn happy(instance: i64) { } // Tests prepared-value carryover when commits are suppressed in earlier rounds. -// Expect later rounds to decide the highest prepared value, not a new leader value. +// Expect later rounds to decide the highest prepared value, not a new leader +// value. #[test_case(1, None, 2, 1, false ; "prepare_round_1_decide_round_2")] #[test_case(2, Some(2), 3, 2, true ; "prepare_round_2_decide_round_3")] fn prepare_round( @@ -1065,7 +1067,8 @@ fn delayed_leader_start(const_period: bool) { } // Tests recovery when two nodes, including early leaders, start much later. -// Expect consensus after enough round changes, with exact round only when deterministic. +// Expect consensus after enough round changes, with exact round only when +// deterministic. #[test_case(3, false, 4, false ; "very_late_exp")] #[test_case(1, true, 0, true ; "very_late_const")] fn very_late_start(instance: i64, const_period: bool, decide_round: i32, random_round: bool) { @@ -1258,7 +1261,8 @@ fn formulas(n: i64, q: i64, f: i64) { } // Tests PRE-PREPARE justification with mixed ROUND_CHANGE and PREPARE evidence. -// Expect the proposal to be accepted when it carries a justified prepared value. +// Expect the proposal to be accepted when it carries a justified prepared +// value. #[test] fn is_justified_pre_prepare_mixed_round_change_prepare_fixture() { let preprepare = new_msg( @@ -1535,7 +1539,8 @@ fn classify_rules() { } // Tests ROUND_CHANGE quorum justification forms J1 and J2. -// Expect null-prepared and highest-prepared quorums to be accepted, invalid `pr` rejected. +// Expect null-prepared and highest-prepared quorums to be accepted, invalid +// `pr` rejected. #[test] fn justified_qrc_j1_and_j2() { let mut def = noop_definition(); @@ -1572,7 +1577,8 @@ fn justified_qrc_j1_and_j2() { } // Tests ROUND_CHANGE prepared-round bounds. -// Expect only null prepared round or strictly previous prepared rounds to be valid. +// Expect only null prepared round or strictly previous prepared rounds to be +// valid. #[test_case(2, -1, false ; "negative")] #[test_case(1, 0, true ; "null_at_round_one")] #[test_case(2, 1, true ; "previous_round")] @@ -1584,7 +1590,8 @@ fn valid_round_change_prepared_round_boundaries(round: i64, prepared_round: i64, } // Tests invalid prepared rounds at every justification call site. -// Expect invalid ROUND_CHANGE messages to be filtered while valid quorums survive. +// Expect invalid ROUND_CHANGE messages to be filtered while valid quorums +// survive. #[test_case(-1 ; "negative")] #[test_case(2 ; "current_round")] #[test_case(3 ; "future_round")] @@ -1636,7 +1643,8 @@ fn invalid_round_change_prepared_rounds_are_filtered_from_call_sites(invalid_pr: } // Tests null-prepared quorum filtering. -// Expect only `prepared_round = 0` and `prepared_value = 0` messages to form J1. +// Expect only `prepared_round = 0` and `prepared_value = 0` messages to form +// J1. #[test_case(1, -1 ; "negative")] #[test_case(1, 1 ; "current_round")] #[test_case(1, 2 ; "future_round")] @@ -1675,7 +1683,8 @@ fn filter_msgs_keeps_one_per_source() { } // Tests compare outcomes: success, error, cached value source, and timeout. -// Expect cached value sources to be preserved and timer expiry to return `TimeoutError`. +// Expect cached value sources to be preserved and timer expiry to return +// `TimeoutError`. #[test] fn compare_success_error_cached_value_source_and_timeout() { let cts = CancellationTokenSource::new(); @@ -1806,7 +1815,8 @@ fn compare_timeout_does_not_wait_for_blocked_callback() { } // Tests a compare callback that exits without sending status. -// Expect `compare` to wait for timer/cancel instead of treating disconnect as final. +// Expect `compare` to wait for timer/cancel instead of treating disconnect as +// final. #[test] fn compare_callback_exit_without_status_waits_for_timer() { let cts = CancellationTokenSource::new(); @@ -1845,7 +1855,8 @@ fn compare_callback_exit_without_status_waits_for_timer() { } // Tests parent cancellation propagation into the compare callback token. -// Expect `compare` to return `ContextCanceled` and the callback token to be canceled. +// Expect `compare` to return `ContextCanceled` and the callback token to be +// canceled. #[test] fn compare_parent_cancel_cancels_callback_token() { let cts = CancellationTokenSource::new(); @@ -1978,7 +1989,8 @@ struct ChainSplitTest { } // Tests value-source disagreement across nodes. -// Expect agreement when a quorum can compare equal values, or halt when no quorum can. +// Expect agreement when a quorum can compare equal values, or halt when no +// quorum can. #[test_case(1, 1, 1, 1, 1, 1, false ; "same_value")] #[test_case(1, 3, 1, 1, 1, 1, false ; "non_leader_peer_has_different_value")] #[test_case(3, 1, 1, 1, 2, 1, false ; "first_leader_has_different_value_second_leader_succeeds")]