From 646e551587bddb6ebaa37760580b02d21bd39ae1 Mon Sep 17 00:00:00 2001
From: Carl 
Date: Fri, 29 May 2020 20:00:14 -0700
Subject: [PATCH 1/5] test

---
 local-cluster/tests/local_cluster.rs | 132 ++++++++++++++++++++-------
 1 file changed, 97 insertions(+), 35 deletions(-)

diff --git a/local-cluster/tests/local_cluster.rs b/local-cluster/tests/local_cluster.rs
index 9ef8e3f3bdb46b..c92b3fc38f7a47 100644
--- a/local-cluster/tests/local_cluster.rs
+++ b/local-cluster/tests/local_cluster.rs
@@ -207,16 +207,21 @@ fn test_leader_failure_4() {
 /// * `leader_schedule` - An option that specifies whether the cluster should
 ///   run with a fixed, predetermined leader schedule
 #[allow(clippy::cognitive_complexity)]
-fn run_cluster_partition(
-    partitions: &[&[(usize, bool)]],
+fn run_cluster_partition<E, F>(
+    partitions: &[&[usize]],
     leader_schedule: Option<(LeaderSchedule, Vec<Arc<Keypair>>)>,
-) {
+    on_partition_start: E,
+    on_partition_resolved: F,
+) where
+    E: Fn(&mut LocalCluster) -> (),
+    F: Fn(&mut LocalCluster) -> (),
+{
     solana_logger::setup();
     info!("PARTITION_TEST!");
     let num_nodes = partitions.len();
     let node_stakes: Vec<_> = partitions
         .iter()
-        .flat_map(|p| p.iter().map(|(stake_weight, _)| 100 * *stake_weight as u64))
+        .flat_map(|p| p.iter().map(|stake_weight| 100 * *stake_weight as u64))
         .collect();
     assert_eq!(node_stakes.len(), num_nodes);
     let cluster_lamports = node_stakes.iter().sum::<u64>() * 2;
@@ -226,7 +231,7 @@ fn run_cluster_partition(
     validator_config.enable_partition = Some(enable_partition.clone());
 
     // Returns:
-    // 1) The keys for the validiators
+    // 1) The keys for the validators
     // 2) The amount of time it would take to iterate through one full iteration of the given
     //    leader schedule
     let (validator_keys, leader_schedule_time): (Vec<_>, u64) = {
@@ -252,7 +257,6 @@ fn run_cluster_partition(
         }
     };
 
-    let validator_pubkeys: Vec<_> = validator_keys.iter().map(|v| v.pubkey()).collect();
     let config = ClusterConfig {
         cluster_lamports,
         node_stakes,
@@ -288,6 +292,7 @@ fn run_cluster_partition(
         if reached_epoch {
             info!("PARTITION_TEST start partition");
             enable_partition.clone().store(false, Ordering::Relaxed);
+            on_partition_start(&mut cluster);
             break;
         } else {
             sleep(Duration::from_millis(100));
@@ -298,13 +303,6 @@ fn run_cluster_partition(
     info!("PARTITION_TEST remove partition");
     enable_partition.store(true, Ordering::Relaxed);
 
-    let mut dead_nodes = HashSet::new();
-    let mut alive_node_contact_infos = vec![];
-    let should_exits: Vec<_> = partitions
-        .iter()
-        .flat_map(|p| p.iter().map(|(_, should_exit)| should_exit))
-        .collect();
-    assert_eq!(should_exits.len(), validator_pubkeys.len());
     let timeout = 10;
     if timeout > 0 {
         // Give partitions time to propagate their blocks from during the partition
@@ -318,26 +316,15 @@ fn run_cluster_partition(
         );
         sleep(Duration::from_millis(propagation_time));
         info!("PARTITION_TEST resuming normal operation");
-        for (pubkey, should_exit) in validator_pubkeys.iter().zip(should_exits) {
-            if *should_exit {
-                info!("Killing validator with id: {}", pubkey);
-                cluster.exit_node(pubkey);
-                dead_nodes.insert(*pubkey);
-            } else {
-                alive_node_contact_infos.push(
-                    cluster
-                        .validators
-                        .get(pubkey)
-                        .unwrap()
-                        .info
-                        .contact_info
-                        .clone(),
-                );
-            }
-        }
+        on_partition_resolved(&mut cluster);
     }
 
-    assert_eq!(alive_node_contact_infos.is_empty(), false);
+    let alive_node_contact_infos: Vec<_> = cluster
+        .validators
+        .values()
+        .map(|v| v.info.contact_info.clone())
+        .collect();
+    assert!(!alive_node_contact_infos.is_empty());
     info!("PARTITION_TEST discovering 
nodes"); let cluster_nodes = discover_cluster( &alive_node_contact_infos[0].gossip, @@ -355,7 +342,8 @@ fn run_cluster_partition( #[test] #[serial] fn test_cluster_partition_1_2() { - run_cluster_partition(&[&[(1, false)], &[(1, false), (1, false)]], None) + let empty = |_: &mut LocalCluster| {}; + run_cluster_partition(&[&[1], &[1, 1]], None, empty.clone(), empty) } #[allow(unused_attributes)] @@ -363,13 +351,15 @@ fn test_cluster_partition_1_2() { #[test] #[serial] fn test_cluster_partition_1_1() { - run_cluster_partition(&[&[(1, false)], &[(1, false)]], None) + let empty = |_: &mut LocalCluster| {}; + run_cluster_partition(&[&[1], &[1]], None, empty.clone(), empty) } #[test] #[serial] fn test_cluster_partition_1_1_1() { - run_cluster_partition(&[&[(1, false)], &[(1, false)], &[(1, false)]], None) + let empty = |_: &mut LocalCluster| {}; + run_cluster_partition(&[&[1], &[1], &[1]], None, empty.clone(), empty) } #[test] @@ -387,7 +377,7 @@ fn test_kill_partition() { // 5) Check for recovery let mut leader_schedule = vec![]; let num_slots_per_validator = 8; - let partitions: [&[(usize, bool)]; 3] = [&[(9, true)], &[(10, false)], &[(10, false)]]; + let partitions: [&[usize]; 3] = [&[9], &[10], &[10]]; let validator_keys: Vec<_> = iter::repeat_with(|| Arc::new(Keypair::new())) .take(partitions.len()) .collect(); @@ -406,12 +396,84 @@ fn test_kill_partition() { } info!("leader_schedule: {}", leader_schedule.len()); + let empty = |_: &mut LocalCluster| {}; + let validator_to_kill = validator_keys[0].pubkey(); + let exit_pubkey = |cluster: &mut LocalCluster| { + info!("Killing validator with id: {}", validator_to_kill); + cluster.exit_node(&validator_to_kill); + }; + run_cluster_partition( + &partitions, + Some(( + LeaderSchedule::new_from_schedule(leader_schedule), + validator_keys, + )), + empty, + exit_pubkey, + ) +} + +#[test] +#[serial] +fn test_kill_partition_switch_threshold() { + // Needs to be at least 1/3 or there will be no overlap + // with the confirmation supermajority 2/3 + assert!(SWITCH_FORK_THRESHOLD >= 1f64 / 3f64); + let max_switch_threshold_failure_pct = 1.0 - 2.0 * SWITCH_FORK_THRESHOLD; + let total_stake = 10_000; + let max_failures_stake = (max_switch_threshold_failure_pct * total_stake as f64) as u64; + let failures_stake = max_failures_stake - 1; + let total_alive_stake = total_stake - failures_stake; + let alive_stake_1 = total_alive_stake / 2; + let alive_stake_2 = total_alive_stake - alive_stake_1; + info!( + "stakes: {} {} {}", + failures_stake, alive_stake_1, alive_stake_2 + ); + + // This test: + // 1) Spins up three partitions + // 2) Kills the first partition with the stake `failures_stake` + // 5) Check for recovery + let mut leader_schedule = vec![]; + let num_slots_per_validator = 8; + let partitions: [&[usize]; 3] = [ + &[(failures_stake as usize)], + &[(alive_stake_1 as usize)], + &[(alive_stake_2 as usize)], + ]; + let validator_keys: Vec<_> = iter::repeat_with(|| Arc::new(Keypair::new())) + .take(partitions.len()) + .collect(); + for (i, k) in validator_keys.iter().enumerate() { + let num_slots = { + if i == 0 { + // Set up the leader to have 50% of the slots + num_slots_per_validator * (partitions.len() - 1) + } else { + num_slots_per_validator + } + }; + for _ in 0..num_slots { + leader_schedule.push(k.pubkey()) + } + } + info!("leader_schedule: {}", leader_schedule.len()); + + let empty = |_: &mut LocalCluster| {}; + let validator_to_kill = validator_keys[0].pubkey(); + let exit_pubkey = |cluster: &mut LocalCluster| { + info!("Killing validator with 
id: {}", validator_to_kill); + cluster.exit_node(&validator_to_kill); + }; run_cluster_partition( &partitions, Some(( LeaderSchedule::new_from_schedule(leader_schedule), validator_keys, )), + exit_pubkey, + empty, ) } From 3e677e1385350e678c360eb319caba3204719037 Mon Sep 17 00:00:00 2001 From: Carl Date: Sat, 30 May 2020 00:47:41 -0700 Subject: [PATCH 2/5] Account for slots < root now being in BankForks --- core/src/consensus.rs | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/core/src/consensus.rs b/core/src/consensus.rs index f1d31645815642..445e6c969b463c 100644 --- a/core/src/consensus.rs +++ b/core/src/consensus.rs @@ -383,6 +383,7 @@ impl Tower { total_stake: u64, epoch_vote_accounts: &HashMap, ) -> SwitchForkDecision { + let root = self.lockouts.root_slot.unwrap_or(0); self.last_vote() .slots .last() @@ -407,12 +408,18 @@ impl Tower { let mut locked_out_stake = 0; let mut locked_out_vote_accounts = HashSet::new(); for (candidate_slot, descendants) in descendants.iter() { - // 1) Only consider lockouts a tips of forks as that - // includes all ancestors of that fork. - // 2) Don't consider lockouts on the `last_vote` itself - // 3) Don't consider lockouts on any descendants of + // 1) Don't consider any banks that haven't been frozen yet + // because the neededd stats are unavailable + // 2) Only consider lockouts at the latest `frozen` bank + // on each fork, as that bank will contain all the + // lockout intervals for ancestors on that fork as well. + // 3) Don't consider lockouts on the `last_vote` itself + // 4) Don't consider lockouts on any descendants of // `last_vote` - if !descendants.is_empty() + // 5) Don't consider any banks before the root because + // all lockouts must be ancestors of `last_vote` + if !progress.get_fork_stats(*candidate_slot).map(|stats| stats.computed).unwrap_or(false) + || descendants.iter().any(|d| progress.get_fork_stats(*d).map(|stats| stats.computed).unwrap_or(false)) || candidate_slot == last_vote || ancestors .get(&candidate_slot) @@ -421,6 +428,7 @@ impl Tower { exist in the ancestors map", ) .contains(last_vote) + || *candidate_slot <= root { continue; } @@ -448,10 +456,11 @@ impl Tower { // 2) Not from before the current root as we can't determine if // anything before the root was an ancestor of `last_vote` or not if !last_vote_ancestors.contains(lockout_interval_start) - // The check if the key exists in the ancestors map - // is equivalent to checking if the key is above the - // current root. 
- && ancestors.contains_key(lockout_interval_start) + // Given a `lockout_interval_start` < root that appears in a + // bank for a `candidate_slot`, it must be that `lockout_interval_start` + // is an ancestor of the current root, because `candidate_slot` is a + // descendant of the current root + && *lockout_interval_start > root && !locked_out_vote_accounts.contains(vote_account_pubkey) { let stake = epoch_vote_accounts From 684125bb27dd58e7f847b36bcfae05939eb1db76 Mon Sep 17 00:00:00 2001 From: Carl Date: Sat, 30 May 2020 17:15:38 -0700 Subject: [PATCH 3/5] Refactor local cluster --- local-cluster/src/local_cluster.rs | 24 ++++++- local-cluster/tests/local_cluster.rs | 98 ++++++++++++++-------------- 2 files changed, 71 insertions(+), 51 deletions(-) diff --git a/local-cluster/src/local_cluster.rs b/local-cluster/src/local_cluster.rs index 960db5dbfb49a7..6252b11a478b89 100644 --- a/local-cluster/src/local_cluster.rs +++ b/local-cluster/src/local_cluster.rs @@ -1,4 +1,7 @@ -use crate::cluster::{Cluster, ClusterValidatorInfo, ValidatorInfo}; +use crate::{ + cluster::{Cluster, ClusterValidatorInfo, ValidatorInfo}, + cluster_tests, +}; use itertools::izip; use log::*; use solana_client::thin_client::{create_client, ThinClient}; @@ -336,6 +339,25 @@ impl LocalCluster { Self::transfer_with_client(&client, source_keypair, dest_pubkey, lamports) } + pub fn check_for_new_roots(&self, num_new_roots: usize, test_name: &str) { + let alive_node_contact_infos: Vec<_> = self + .validators + .values() + .map(|v| v.info.contact_info.clone()) + .collect(); + assert!(!alive_node_contact_infos.is_empty()); + info!("{} discovering nodes", test_name); + let cluster_nodes = discover_cluster( + &alive_node_contact_infos[0].gossip, + alive_node_contact_infos.len(), + ) + .unwrap(); + info!("{} discovered {} nodes", test_name, cluster_nodes.len()); + info!("{} looking for new roots on all nodes", test_name); + cluster_tests::check_for_new_roots(num_new_roots, &alive_node_contact_infos); + info!("{} done waiting for roots", test_name); + } + fn transfer_with_client( client: &ThinClient, source_keypair: &Keypair, diff --git a/local-cluster/tests/local_cluster.rs b/local-cluster/tests/local_cluster.rs index c92b3fc38f7a47..b227bfdea25627 100644 --- a/local-cluster/tests/local_cluster.rs +++ b/local-cluster/tests/local_cluster.rs @@ -4,8 +4,10 @@ use serial_test_derive::serial; use solana_client::rpc_client::RpcClient; use solana_client::thin_client::create_client; use solana_core::{ - broadcast_stage::BroadcastStageType, consensus::VOTE_THRESHOLD_DEPTH, - gossip_service::discover_cluster, validator::ValidatorConfig, + broadcast_stage::BroadcastStageType, + consensus::{SWITCH_FORK_THRESHOLD, VOTE_THRESHOLD_DEPTH}, + gossip_service::discover_cluster, + validator::ValidatorConfig, }; use solana_download_utils::download_snapshot; use solana_ledger::bank_forks::CompressionType; @@ -303,38 +305,22 @@ fn run_cluster_partition( info!("PARTITION_TEST remove partition"); enable_partition.store(true, Ordering::Relaxed); - let timeout = 10; - if timeout > 0 { - // Give partitions time to propagate their blocks from during the partition - // after the partition resolves - let propagation_time = leader_schedule_time; - info!("PARTITION_TEST resolving partition. 
sleeping {}ms", timeout); - sleep(Duration::from_millis(10_000)); - info!( - "PARTITION_TEST waiting for blocks to propagate after partition {}ms", - propagation_time - ); - sleep(Duration::from_millis(propagation_time)); - info!("PARTITION_TEST resuming normal operation"); - on_partition_resolved(&mut cluster); - } - - let alive_node_contact_infos: Vec<_> = cluster - .validators - .values() - .map(|v| v.info.contact_info.clone()) - .collect(); - assert!(!alive_node_contact_infos.is_empty()); - info!("PARTITION_TEST discovering nodes"); - let cluster_nodes = discover_cluster( - &alive_node_contact_infos[0].gossip, - alive_node_contact_infos.len(), - ) - .unwrap(); - info!("PARTITION_TEST discovered {} nodes", cluster_nodes.len()); - info!("PARTITION_TEST looking for new roots on all nodes"); - cluster_tests::check_for_new_roots(16, &alive_node_contact_infos); - info!("PARTITION_TEST done waiting for roots"); + // Give partitions time to propagate their blocks from during the partition + // after the partition resolves + let timeout = 10_000; + let propagation_time = leader_schedule_time; + info!( + "PARTITION_TEST resolving partition. sleeping {} ms", + timeout + ); + sleep(Duration::from_millis(timeout)); + info!( + "PARTITION_TEST waiting for blocks to propagate after partition {}ms", + propagation_time + ); + sleep(Duration::from_millis(propagation_time)); + info!("PARTITION_TEST resuming normal operation"); + on_partition_resolved(&mut cluster); } #[allow(unused_attributes)] @@ -343,7 +329,10 @@ fn run_cluster_partition( #[serial] fn test_cluster_partition_1_2() { let empty = |_: &mut LocalCluster| {}; - run_cluster_partition(&[&[1], &[1, 1]], None, empty.clone(), empty) + let on_partition_resolved = |cluster: &mut LocalCluster| { + cluster.check_for_new_roots(16, &"PARTITION_TEST"); + }; + run_cluster_partition(&[&[1], &[1, 1]], None, empty, on_partition_resolved) } #[allow(unused_attributes)] @@ -352,14 +341,20 @@ fn test_cluster_partition_1_2() { #[serial] fn test_cluster_partition_1_1() { let empty = |_: &mut LocalCluster| {}; - run_cluster_partition(&[&[1], &[1]], None, empty.clone(), empty) + let on_partition_resolved = |cluster: &mut LocalCluster| { + cluster.check_for_new_roots(16, &"PARTITION_TEST"); + }; + run_cluster_partition(&[&[1], &[1]], None, empty, on_partition_resolved) } #[test] #[serial] fn test_cluster_partition_1_1_1() { let empty = |_: &mut LocalCluster| {}; - run_cluster_partition(&[&[1], &[1], &[1]], None, empty.clone(), empty) + let on_partition_resolved = |cluster: &mut LocalCluster| { + cluster.check_for_new_roots(16, &"PARTITION_TEST"); + }; + run_cluster_partition(&[&[1], &[1], &[1]], None, empty, on_partition_resolved) } #[test] @@ -398,9 +393,10 @@ fn test_kill_partition() { let empty = |_: &mut LocalCluster| {}; let validator_to_kill = validator_keys[0].pubkey(); - let exit_pubkey = |cluster: &mut LocalCluster| { + let on_partition_resolved = |cluster: &mut LocalCluster| { info!("Killing validator with id: {}", validator_to_kill); cluster.exit_node(&validator_to_kill); + cluster.check_for_new_roots(16, &"PARTITION_TEST"); }; run_cluster_partition( &partitions, @@ -409,23 +405,28 @@ fn test_kill_partition() { validator_keys, )), empty, - exit_pubkey, + on_partition_resolved, ) } #[test] #[serial] +#[allow(clippy::assertions_on_constants)] fn test_kill_partition_switch_threshold() { // Needs to be at least 1/3 or there will be no overlap // with the confirmation supermajority 2/3 assert!(SWITCH_FORK_THRESHOLD >= 1f64 / 3f64); + let 
max_switch_threshold_failure_pct = 1.0 - 2.0 * SWITCH_FORK_THRESHOLD; let total_stake = 10_000; let max_failures_stake = (max_switch_threshold_failure_pct * total_stake as f64) as u64; - let failures_stake = max_failures_stake - 1; + let failures_stake = max_failures_stake - 2; let total_alive_stake = total_stake - failures_stake; let alive_stake_1 = total_alive_stake / 2; let alive_stake_2 = total_alive_stake - alive_stake_1; + + assert!(alive_stake_1 as f64 / total_stake as f64 > SWITCH_FORK_THRESHOLD); + assert!(alive_stake_2 as f64 / total_stake as f64 > SWITCH_FORK_THRESHOLD); info!( "stakes: {} {} {}", failures_stake, alive_stake_1, alive_stake_2 @@ -460,20 +461,22 @@ fn test_kill_partition_switch_threshold() { } info!("leader_schedule: {}", leader_schedule.len()); - let empty = |_: &mut LocalCluster| {}; let validator_to_kill = validator_keys[0].pubkey(); - let exit_pubkey = |cluster: &mut LocalCluster| { + let on_partition_start = |cluster: &mut LocalCluster| { info!("Killing validator with id: {}", validator_to_kill); cluster.exit_node(&validator_to_kill); }; + let on_partition_resolved = |cluster: &mut LocalCluster| { + cluster.check_for_new_roots(16, &"PARTITION_TEST"); + }; run_cluster_partition( &partitions, Some(( LeaderSchedule::new_from_schedule(leader_schedule), validator_keys, )), - exit_pubkey, - empty, + on_partition_start, + on_partition_resolved, ) } @@ -1175,12 +1178,7 @@ fn test_faulty_node(faulty_node_type: BroadcastStageType) { let cluster = LocalCluster::new(&cluster_config); // Check for new roots - let alive_node_contact_infos: Vec<_> = cluster - .validators - .values() - .map(|v| v.info.contact_info.clone()) - .collect(); - cluster_tests::check_for_new_roots(16, &alive_node_contact_infos); + cluster.check_for_new_roots(16, &"test_faulty_node"); } #[test] From 42ec0fa2413988c4a3020abe28ddb3e71db17304 Mon Sep 17 00:00:00 2001 From: Carl Date: Sat, 30 May 2020 21:09:33 -0700 Subject: [PATCH 4/5] Add test for making sure switch doesn't happen past failure threshold --- local-cluster/src/cluster_tests.rs | 58 +++++++++++++++++++- local-cluster/src/local_cluster.rs | 21 ++++++- local-cluster/tests/local_cluster.rs | 82 ++++++++++++++++++++++------ 3 files changed, 140 insertions(+), 21 deletions(-) diff --git a/local-cluster/src/cluster_tests.rs b/local-cluster/src/cluster_tests.rs index 0b47ef48df87ea..836282db9526e1 100644 --- a/local-cluster/src/cluster_tests.rs +++ b/local-cluster/src/cluster_tests.rs @@ -284,7 +284,7 @@ pub fn kill_entry_and_spend_and_verify_rest( } } -pub fn check_for_new_roots(num_new_roots: usize, contact_infos: &[ContactInfo]) { +pub fn check_for_new_roots(num_new_roots: usize, contact_infos: &[ContactInfo], test_name: &str) { let mut roots = vec![HashSet::new(); contact_infos.len()]; let mut done = false; let mut last_print = Instant::now(); @@ -295,7 +295,7 @@ pub fn check_for_new_roots(num_new_roots: usize, contact_infos: &[ContactInfo]) roots[i].insert(slot); let min_node = roots.iter().map(|r| r.len()).min().unwrap_or(0); if last_print.elapsed().as_secs() > 3 { - info!("PARTITION_TEST min observed roots {}/16", min_node); + info!("{} min observed roots {}/16", test_name, min_node); last_print = Instant::now(); } done = min_node >= num_new_roots; @@ -304,6 +304,60 @@ pub fn check_for_new_roots(num_new_roots: usize, contact_infos: &[ContactInfo]) } } +pub fn check_no_new_roots( + num_slots_to_wait: usize, + contact_infos: &[ContactInfo], + test_name: &str, +) { + assert!(!contact_infos.is_empty()); + let mut roots = vec![0; 
contact_infos.len()]; + let max_slot = contact_infos + .iter() + .enumerate() + .map(|(i, ingress_node)| { + let client = create_client(ingress_node.client_facing_addr(), VALIDATOR_PORT_RANGE); + let initial_root = client + .get_slot() + .unwrap_or_else(|_| panic!("get_slot for {} failed", ingress_node.id)); + roots[i] = initial_root; + client + .get_slot_with_commitment(CommitmentConfig::recent()) + .unwrap_or_else(|_| panic!("get_slot for {} failed", ingress_node.id)) + }) + .max() + .unwrap(); + + let end_slot = max_slot + num_slots_to_wait as u64; + let mut current_slot; + let mut last_print = Instant::now(); + let client = create_client(contact_infos[0].client_facing_addr(), VALIDATOR_PORT_RANGE); + loop { + current_slot = client + .get_slot_with_commitment(CommitmentConfig::recent()) + .unwrap_or_else(|_| panic!("get_slot for {} failed", contact_infos[0].id)); + if current_slot > end_slot { + break; + } + if last_print.elapsed().as_secs() > 3 { + info!( + "{} current slot: {}, waiting for slot: {}", + test_name, current_slot, end_slot + ); + last_print = Instant::now(); + } + } + + for (i, ingress_node) in contact_infos.iter().enumerate() { + let client = create_client(ingress_node.client_facing_addr(), VALIDATOR_PORT_RANGE); + assert_eq!( + client + .get_slot() + .unwrap_or_else(|_| panic!("get_slot for {} failed", ingress_node.id)), + roots[i] + ); + } +} + fn poll_all_nodes_for_signature( entry_point_info: &ContactInfo, cluster_nodes: &[ContactInfo], diff --git a/local-cluster/src/local_cluster.rs b/local-cluster/src/local_cluster.rs index 6252b11a478b89..87f1fdfb3fd797 100644 --- a/local-cluster/src/local_cluster.rs +++ b/local-cluster/src/local_cluster.rs @@ -354,7 +354,26 @@ impl LocalCluster { .unwrap(); info!("{} discovered {} nodes", test_name, cluster_nodes.len()); info!("{} looking for new roots on all nodes", test_name); - cluster_tests::check_for_new_roots(num_new_roots, &alive_node_contact_infos); + cluster_tests::check_for_new_roots(num_new_roots, &alive_node_contact_infos, test_name); + info!("{} done waiting for roots", test_name); + } + + pub fn check_no_new_roots(&self, num_slots_to_wait: usize, test_name: &str) { + let alive_node_contact_infos: Vec<_> = self + .validators + .values() + .map(|v| v.info.contact_info.clone()) + .collect(); + assert!(!alive_node_contact_infos.is_empty()); + info!("{} discovering nodes", test_name); + let cluster_nodes = discover_cluster( + &alive_node_contact_infos[0].gossip, + alive_node_contact_infos.len(), + ) + .unwrap(); + info!("{} discovered {} nodes", test_name, cluster_nodes.len()); + info!("{} making sure no new roots on any nodes", test_name); + cluster_tests::check_no_new_roots(num_slots_to_wait, &alive_node_contact_infos, test_name); info!("{} done waiting for roots", test_name); } diff --git a/local-cluster/tests/local_cluster.rs b/local-cluster/tests/local_cluster.rs index b227bfdea25627..7b474404519dc4 100644 --- a/local-cluster/tests/local_cluster.rs +++ b/local-cluster/tests/local_cluster.rs @@ -409,24 +409,18 @@ fn test_kill_partition() { ) } -#[test] -#[serial] #[allow(clippy::assertions_on_constants)] -fn test_kill_partition_switch_threshold() { +fn run_kill_partition_switch_threshold( + failures_stake: u64, + alive_stake_1: u64, + alive_stake_2: u64, + on_partition_resolved: F, +) where + F: Fn(&mut LocalCluster) -> (), +{ // Needs to be at least 1/3 or there will be no overlap // with the confirmation supermajority 2/3 assert!(SWITCH_FORK_THRESHOLD >= 1f64 / 3f64); - - let max_switch_threshold_failure_pct = 1.0 
- 2.0 * SWITCH_FORK_THRESHOLD; - let total_stake = 10_000; - let max_failures_stake = (max_switch_threshold_failure_pct * total_stake as f64) as u64; - let failures_stake = max_failures_stake - 2; - let total_alive_stake = total_stake - failures_stake; - let alive_stake_1 = total_alive_stake / 2; - let alive_stake_2 = total_alive_stake - alive_stake_1; - - assert!(alive_stake_1 as f64 / total_stake as f64 > SWITCH_FORK_THRESHOLD); - assert!(alive_stake_2 as f64 / total_stake as f64 > SWITCH_FORK_THRESHOLD); info!( "stakes: {} {} {}", failures_stake, alive_stake_1, alive_stake_2 @@ -435,7 +429,7 @@ fn test_kill_partition_switch_threshold() { // This test: // 1) Spins up three partitions // 2) Kills the first partition with the stake `failures_stake` - // 5) Check for recovery + // 5) runs `on_partition_resolved` let mut leader_schedule = vec![]; let num_slots_per_validator = 8; let partitions: [&[usize]; 3] = [ @@ -466,9 +460,6 @@ fn test_kill_partition_switch_threshold() { info!("Killing validator with id: {}", validator_to_kill); cluster.exit_node(&validator_to_kill); }; - let on_partition_resolved = |cluster: &mut LocalCluster| { - cluster.check_for_new_roots(16, &"PARTITION_TEST"); - }; run_cluster_partition( &partitions, Some(( @@ -480,6 +471,61 @@ fn test_kill_partition_switch_threshold() { ) } +#[test] +#[serial] +fn test_kill_partition_switch_threshold_no_progress() { + let max_switch_threshold_failure_pct = 1.0 - 2.0 * SWITCH_FORK_THRESHOLD; + let total_stake = 10_000; + let max_failures_stake = (max_switch_threshold_failure_pct * total_stake as f64) as u64; + + let failures_stake = max_failures_stake; + let total_alive_stake = total_stake - failures_stake; + let alive_stake_1 = total_alive_stake / 2; + let alive_stake_2 = total_alive_stake - alive_stake_1; + + // Check that no new roots were set 400 slots after partition resolves (gives time + // for lockouts built during partition to resolve and gives validators an opportunity + // to try and switch forks) + let on_partition_resolved = |cluster: &mut LocalCluster| { + cluster.check_no_new_roots(400, &"PARTITION_TEST"); + }; + + // This kills `max_failures_stake`, so no progress should be made + run_kill_partition_switch_threshold( + failures_stake, + alive_stake_1, + alive_stake_2, + on_partition_resolved, + ); +} + +#[test] +#[serial] +fn test_kill_partition_switch_threshold() { + let max_switch_threshold_failure_pct = 1.0 - 2.0 * SWITCH_FORK_THRESHOLD; + let total_stake = 10_000; + let max_failures_stake = (max_switch_threshold_failure_pct * total_stake as f64) as u64; + let failures_stake = max_failures_stake - 2; + let total_alive_stake = total_stake - failures_stake; + let alive_stake_1 = total_alive_stake / 2; + let alive_stake_2 = total_alive_stake - alive_stake_1; + + // Both remaining validators must be > SWITCH_FORK_THRESHOLD in order + // to guarantee progress in reasonable amount of time + assert!(alive_stake_1 as f64 / total_stake as f64 > SWITCH_FORK_THRESHOLD); + assert!(alive_stake_2 as f64 / total_stake as f64 > SWITCH_FORK_THRESHOLD); + + let on_partition_resolved = |cluster: &mut LocalCluster| { + cluster.check_for_new_roots(16, &"PARTITION_TEST"); + }; + run_kill_partition_switch_threshold( + failures_stake, + alive_stake_1, + alive_stake_2, + on_partition_resolved, + ); +} + #[test] #[serial] fn test_two_unbalanced_stakes() { From 7c9b962f1a2edfa8ae8d8e4fc6808dbdf6a0ef4c Mon Sep 17 00:00:00 2001 From: Carl Date: Sat, 30 May 2020 21:59:19 -0700 Subject: [PATCH 5/5] Update unit tests --- core/src/consensus.rs | 
40 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/core/src/consensus.rs b/core/src/consensus.rs index 445e6c969b463c..2a8f7583d07309 100644 --- a/core/src/consensus.rs +++ b/core/src/consensus.rs @@ -1053,8 +1053,11 @@ pub mod test { // Fill the BankForks according to the above fork structure vote_simulator.fill_bank_forks(forks, &HashMap::new()); + for (_, fork_progress) in vote_simulator.progress.iter_mut() { + fork_progress.fork_stats.computed = true; + } let ancestors = vote_simulator.bank_forks.read().unwrap().ancestors(); - let descendants = vote_simulator.bank_forks.read().unwrap().descendants(); + let mut descendants = vote_simulator.bank_forks.read().unwrap().descendants(); let mut tower = Tower::new_with_key(&my_pubkey); // Last vote is 47 @@ -1131,6 +1134,23 @@ pub mod test { SwitchForkDecision::FailedSwitchThreshold ); + // Adding another validator lockout on a different fork, and the lockout + // covers the last vote would count towards the switch threshold, + // unless the bank is not the most recent frozen bank on the fork (14 is a + // frozen/computed bank > 13 on the same fork in this case) + vote_simulator.simulate_lockout_interval(13, (12, 47), &other_vote_account); + assert_eq!( + tower.check_switch_threshold( + 110, + &ancestors, + &descendants, + &vote_simulator.progress, + total_stake, + bank0.epoch_vote_accounts(0).unwrap(), + ), + SwitchForkDecision::FailedSwitchThreshold + ); + // Adding another validator lockout on a different fork, and the lockout // covers the last vote, should satisfy the switch threshold vote_simulator.simulate_lockout_interval(14, (12, 47), &other_vote_account); @@ -1146,10 +1166,26 @@ pub mod test { SwitchForkDecision::SwitchProof(Hash::default()) ); + // Adding another unfrozen descendant of the tip of 14 should not remove + // slot 14 from consideration because it is still the most recent frozen + // bank on its fork + descendants.get_mut(&14).unwrap().insert(10000); + assert_eq!( + tower.check_switch_threshold( + 110, + &ancestors, + &descendants, + &vote_simulator.progress, + total_stake, + bank0.epoch_vote_accounts(0).unwrap(), + ), + SwitchForkDecision::SwitchProof(Hash::default()) + ); + // If we set a root, then any lockout intervals below the root shouldn't // count toward the switch threshold. This means the other validator's // vote lockout no longer counts - vote_simulator.set_root(43); + tower.lockouts.root_slot = Some(43); assert_eq!( tower.check_switch_threshold( 110,