fix: Update logic used to evaluate control plane IPs to ensure at least 2 AZs each have 5 or more IPs available (#38)
bryantbiggs authored Mar 11, 2023
1 parent 9bdc48f commit 6e94e1a
Showing 8 changed files with 116 additions and 77 deletions.
10 changes: 10 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion docs/process/checks.md
@@ -54,7 +54,7 @@ Checks that are specific to Amazon EKS

**❌ Remediation required**

-There are at least 5 available IPs for the control plane to upgrade; required for cross account ENI creation.
+There are at least 2 subnets in different availability zones, each with at least 5 available IPs for the control plane to upgrade.

#### EKS002

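Taken together, the reworded check and the code changes below implement a simple predicate: group the cluster's subnets by availability zone ID, sum the available IPs per zone, and require at least two zones at or above five IPs. A minimal standalone sketch of that rule (hypothetical code, not part of the commit; the real implementation operates on AWS SDK types):

```rust
use std::collections::HashMap;

/// EKS001 rule: at least 2 availability zones must each have >= 5 available IPs.
fn control_plane_has_ip_headroom(subnets: &[(&str, i32)]) -> bool {
    // Sum available IPs per availability zone ID.
    let mut ips_per_az: HashMap<&str, i32> = HashMap::new();
    for &(az_id, available_ips) in subnets {
        *ips_per_az.entry(az_id).or_insert(0) += available_ips;
    }
    // Count the zones that clear the 5-IP threshold.
    ips_per_az.values().filter(|ips| **ips >= 5).count() >= 2
}

fn main() {
    // Two subnets in use1-az1 (3 + 4 = 7) plus one in use1-az2 (6): passes.
    assert!(control_plane_has_ip_headroom(&[
        ("use1-az1", 3),
        ("use1-az1", 4),
        ("use1-az2", 6),
    ]));
    // Nine IPs in a single zone no longer passes under the new rule.
    assert!(!control_plane_has_ip_headroom(&[("use1-az1", 9)]));
}
```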
1 change: 1 addition & 0 deletions eksup/Cargo.toml
@@ -33,6 +33,7 @@ aws-sdk-eks = "0.24"
aws-types = "0.54"
clap = { version = "4.0", features = ["derive", "string"] }
handlebars = { version = "4.3", features = ["rust-embed"] }
+itertools = "0.10"
# https://kube.rs/kubernetes-version/
k8s-openapi = { version = "0.17.0", default-features = false, features = ["v1_22"] }
kube = { version = "0.80.0", default-features = false, features = [ "client", "derive", "rustls-tls" ] }
122 changes: 72 additions & 50 deletions eksup/src/eks/checks.rs
@@ -5,6 +5,7 @@ use aws_sdk_eks::{
model::{Addon, Cluster, Nodegroup},
Client as EksClient,
};
+use itertools::Itertools;
use kube::Client as K8sClient;
use serde::{Deserialize, Serialize};
use tabled::{locator::ByColumnName, Disable, Margin, Style, Table, Tabled};
@@ -100,57 +101,69 @@ pub(crate) async fn cluster_health(cluster: &Cluster) -> Result<Vec<ClusterHealt
pub struct InsufficientSubnetIps {
#[tabled(inline)]
pub finding: finding::Finding,
-#[tabled(display_with = "tabled_vec_to_string")]
-pub ids: Vec<String>,
+pub id: String,
pub available_ips: i32,
}

-impl Findings for Option<InsufficientSubnetIps> {
+impl Findings for Vec<InsufficientSubnetIps> {
fn to_markdown_table(&self, leading_whitespace: &str) -> Result<String> {
-match self {
-Some(finding) => {
-let mut table = Table::new(vec![finding]);
-table
-.with(Disable::column(ByColumnName::new("CHECK")))
-.with(Margin::new(1, 0, 0, 0).set_fill('\t', 'x', 'x', 'x'))
-.with(Style::markdown());
-
-Ok(format!("{table}\n"))
-}
-None => Ok(format!(
+if self.is_empty() {
+return Ok(format!(
"{leading_whitespace}✅ - There is sufficient IP space in the subnets provided"
-)),
+));
}
+
+let mut table = Table::new(self);
+table
+.with(Disable::column(ByColumnName::new("CHECK")))
+.with(Margin::new(1, 0, 0, 0).set_fill('\t', 'x', 'x', 'x'))
+.with(Style::markdown());
+
+Ok(format!("{table}\n"))
}

fn to_stdout_table(&self) -> Result<String> {
-match self {
-None => Ok("".to_owned()),
-Some(finding) => {
-let mut table = Table::new(vec![finding]);
-table.with(Style::sharp());
-
-Ok(format!("{table}\n"))
-}
+if self.is_empty() {
+return Ok("".to_owned());
}
+
+let mut table = Table::new(self);
+table.with(Style::sharp());
+
+Ok(format!("{table}\n"))
}
}

-pub(crate) async fn control_plane_ips(
-ec2_client: &Ec2Client,
-cluster: &Cluster,
-) -> Result<Option<InsufficientSubnetIps>> {
+pub(crate) async fn control_plane_ips(ec2_client: &Ec2Client, cluster: &Cluster) -> Result<Vec<InsufficientSubnetIps>> {
let subnet_ids = match cluster.resources_vpc_config() {
Some(vpc_config) => match vpc_config.subnet_ids() {
Some(subnet_ids) => subnet_ids.to_owned(),
-None => return Ok(None),
+None => return Ok(vec![]),
},
-None => return Ok(None),
+None => return Ok(vec![]),
};

let subnet_ips = resources::get_subnet_ips(ec2_client, subnet_ids).await?;
-if subnet_ips.available_ips >= 5 {
-return Ok(None);
+
+let availability_zone_ips: Vec<(String, i32)> = subnet_ips
+.iter()
+.group_by(|subnet| subnet.availablity_zone_id.clone())
+.into_iter()
+.map(|(az, subnets)| {
+let total_ips = subnets.map(|subnet| subnet.available_ips).sum();
+(az, total_ips)
+})
+.collect();
+
+// There are at least 2 different availability zones with 5 or more IPs; no finding
+if availability_zone_ips
+.iter()
+.filter(|(_az, ips)| ips >= &5)
+.collect::<Vec<_>>()
+.len()
+>= 2
+{
+return Ok(vec![]);
}

let remediation = finding::Remediation::Required;
@@ -160,13 +173,16 @@ pub(crate) async fn control_plane_ips(
remediation,
};

-let finding = InsufficientSubnetIps {
-finding,
-ids: subnet_ips.ids,
-available_ips: subnet_ips.available_ips,
-};
-
-Ok(Some(finding))
+Ok(
+availability_zone_ips
+.iter()
+.map(|(az, ips)| InsufficientSubnetIps {
+finding: finding.clone(),
+id: az.clone(),
+available_ips: *ips,
+})
+.collect(),
+)
}

/// Check if the subnets used by the pods will support an upgrade
@@ -179,10 +195,10 @@ pub(crate) async fn pod_ips(
k8s_client: &K8sClient,
required_ips: i32,
recommended_ips: i32,
-) -> Result<Option<InsufficientSubnetIps>> {
+) -> Result<Vec<InsufficientSubnetIps>> {
let eniconfigs = k8s::get_eniconfigs(k8s_client).await?;
if eniconfigs.is_empty() {
-return Ok(None);
+return Ok(vec![]);
}

let subnet_ids = eniconfigs
@@ -191,12 +207,13 @@ pub(crate) async fn pod_ips(
.collect();

let subnet_ips = resources::get_subnet_ips(ec2_client, subnet_ids).await?;
+let available_ips: i32 = subnet_ips.iter().map(|subnet| subnet.available_ips).sum();

-if subnet_ips.available_ips >= recommended_ips {
-return Ok(None);
+if available_ips >= recommended_ips {
+return Ok(vec![]);
}

-let remediation = if subnet_ips.available_ips >= required_ips {
+let remediation = if available_ips >= required_ips {
finding::Remediation::Required
} else {
finding::Remediation::Recommended
@@ -208,13 +225,18 @@
remediation,
};

-let subnetips = InsufficientSubnetIps {
-finding,
-ids: subnet_ips.ids,
-available_ips: subnet_ips.available_ips,
-};
-
-Ok(Some(subnetips))
+Ok(
+subnet_ips
+.iter()
+.group_by(|subnet| subnet.availablity_zone_id.clone())
+.into_iter()
+.map(|(az, subnets)| InsufficientSubnetIps {
+finding: finding.clone(),
+id: az,
+available_ips: subnets.map(|subnet| subnet.available_ips).sum(),
+})
+.collect(),
+)
}

/// Details of the addon as viewed from an upgrade perspective
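A detail worth knowing about the adapter used in both rewritten functions: itertools' `group_by` only groups *consecutive* elements that share a key, so a subnet list that is not ordered by availability zone can yield more than one group for the same zone. A small hypothetical sketch (not from the commit) illustrating the behavior:

```rust
use itertools::Itertools;

// Sum available IPs per availability zone, using the same
// group_by-then-sum chain as the commit.
fn sum_per_az(subnets: &[(&str, i32)]) -> Vec<(String, i32)> {
    subnets
        .iter()
        .group_by(|(az, _)| az.to_string())
        .into_iter()
        .map(|(az, group)| (az, group.map(|(_, ips)| *ips).sum()))
        .collect()
}

fn main() {
    // Ordered by zone: one group per zone.
    let sorted = [("az1", 2), ("az1", 4), ("az2", 6)];
    assert_eq!(
        sum_per_az(&sorted),
        vec![("az1".to_string(), 6), ("az2".to_string(), 6)]
    );

    // Unordered: group_by only merges adjacent keys, so az1 splits in two.
    let unsorted = [("az1", 2), ("az2", 6), ("az1", 4)];
    assert_eq!(
        sum_per_az(&unsorted),
        vec![
            ("az1".to_string(), 2),
            ("az2".to_string(), 6),
            ("az1".to_string(), 4)
        ]
    );
}
```

Sorting the slice by zone ID before grouping (or aggregating through a map, as in the earlier sketch) sidesteps the split-group case.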
4 changes: 2 additions & 2 deletions eksup/src/eks/findings.rs
@@ -28,10 +28,10 @@ pub async fn get_cluster_findings(cluster: &Cluster) -> Result<ClusterFindings>
#[derive(Debug, Serialize, Deserialize)]
pub struct SubnetFindings {
/// The Amazon EKS service requires at least 5 available IPs in order to upgrade a cluster in-place
-pub control_plane_ips: Option<checks::InsufficientSubnetIps>,
+pub control_plane_ips: Vec<checks::InsufficientSubnetIps>,
/// This is the number of IPs available to pods when custom networking is enabled on the AWS VPC CNI,
/// pulling the available number of IPs for the subnets listed in the ENIConfig resource(s)
-pub pod_ips: Option<checks::InsufficientSubnetIps>,
+pub pod_ips: Vec<checks::InsufficientSubnetIps>,
}

/// Collects findings related to networking and subnets
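The Option-to-Vec change in `SubnetFindings` swaps the "nothing to report" sentinel from `None` to an empty vector, which is what lets each check emit one row per availability zone instead of a single aggregate row. A hypothetical miniature of that convention (the real `Findings` trait renders `tabled` tables):

```rust
/// Hypothetical stand-in for the project's Findings trait.
trait Findings {
    fn to_stdout_table(&self) -> String;
}

/// An empty Vec now plays the role `None` used to: render nothing.
impl Findings for Vec<String> {
    fn to_stdout_table(&self) -> String {
        if self.is_empty() {
            return String::new();
        }
        self.join("\n")
    }
}

fn main() {
    let no_findings: Vec<String> = vec![];
    assert_eq!(no_findings.to_stdout_table(), "");

    let findings = vec!["use1-az1: 3 IPs".to_string(), "use1-az2: 4 IPs".to_string()];
    assert_eq!(findings.to_stdout_table(), "use1-az1: 3 IPs\nuse1-az2: 4 IPs");
}
```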
34 changes: 20 additions & 14 deletions eksup/src/eks/resources.rs
@@ -34,9 +34,10 @@ pub async fn get_cluster(client: &EksClient, name: &str) -> Result<Cluster> {

/// Container for the subnet IDs and their total available IPs
#[derive(Clone, Debug, Serialize, Deserialize)]
-pub(crate) struct SubnetIPs {
-pub(crate) ids: Vec<String>,
+pub(crate) struct VpcSubnet {
+pub(crate) id: String,
pub(crate) available_ips: i32,
+pub(crate) availablity_zone_id: String,
}

/// Describe the subnets provided by ID
Expand All @@ -45,7 +46,7 @@ pub(crate) struct SubnetIPs {
/// IP contention/exhaustion across the various subnets in use
/// by the control plane ENIs, the nodes, and the pods (when custom
/// networking is enabled)
-pub(crate) async fn get_subnet_ips(client: &Ec2Client, subnet_ids: Vec<String>) -> Result<SubnetIPs> {
+pub(crate) async fn get_subnet_ips(client: &Ec2Client, subnet_ids: Vec<String>) -> Result<Vec<VpcSubnet>> {
let subnets = client
.describe_subnets()
.set_subnet_ids(Some(subnet_ids))
@@ -54,17 +55,22 @@ pub(crate) async fn get_subnet_ips(client: &Ec2Client, subnet_ids: Vec<String>)
.subnets
.context("Subnets not found")?;

-let available_ips = subnets
-.iter()
-.map(|subnet| subnet.available_ip_address_count.unwrap_or_default())
-.sum();
-
-let ids = subnets
-.iter()
-.map(|subnet| subnet.subnet_id().unwrap_or_default().to_string())
-.collect::<Vec<String>>();
-
-Ok(SubnetIPs { ids, available_ips })
+Ok(
+subnets
+.iter()
+.map(|subnet| {
+let id = subnet.subnet_id().unwrap_or_default().to_string();
+let available_ips = subnet.available_ip_address_count.unwrap_or_default();
+let availablity_zone_id = subnet.availability_zone_id().unwrap_or_default().to_string();
+
+VpcSubnet {
+id,
+available_ips,
+availablity_zone_id,
+}
+})
+.collect(),
+)
}

pub async fn get_addons(client: &EksClient, cluster_name: &str) -> Result<Vec<Addon>> {
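`get_subnet_ips` now returns one record per subnet, keeping the availability zone ID alongside the IP count, rather than collapsing everything into a single aggregate; that per-subnet shape is what the AZ grouping upstream depends on. A hypothetical, SDK-free sketch of the same mapping (the `SdkSubnet` stand-in and its fields mirror, but are not, the aws_sdk_ec2 model):

```rust
/// Hypothetical stand-in for the aws_sdk_ec2 Subnet model fields used here.
struct SdkSubnet {
    subnet_id: Option<String>,
    available_ip_address_count: Option<i32>,
    availability_zone_id: Option<String>,
}

/// Mirrors the commit's VpcSubnet (field spelling kept as in the source).
#[derive(Debug)]
struct VpcSubnet {
    id: String,
    available_ips: i32,
    availablity_zone_id: String,
}

fn to_vpc_subnets(subnets: &[SdkSubnet]) -> Vec<VpcSubnet> {
    subnets
        .iter()
        .map(|s| VpcSubnet {
            // Missing fields fall back to defaults, as in the commit.
            id: s.subnet_id.clone().unwrap_or_default(),
            available_ips: s.available_ip_address_count.unwrap_or_default(),
            availablity_zone_id: s.availability_zone_id.clone().unwrap_or_default(),
        })
        .collect()
}

fn main() {
    let subnets = vec![SdkSubnet {
        subnet_id: Some("subnet-0abc".to_string()),
        available_ip_address_count: Some(7),
        availability_zone_id: Some("use1-az1".to_string()),
    }];
    let mapped = to_vpc_subnets(&subnets);
    assert_eq!(mapped[0].available_ips, 7);
    assert_eq!(mapped[0].availablity_zone_id, "use1-az1");
}
```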
4 changes: 2 additions & 2 deletions eksup/src/lib.rs
@@ -131,8 +131,8 @@ pub async fn create(args: &Create) -> Result<()> {

let results = analysis::analyze(&aws_config, &cluster).await?;

-if let Err(_err) = playbook::create(playbook, region, &cluster, results) {
-// eprintln!("{err}");
+if let Err(err) = playbook::create(playbook, region, &cluster, results) {
+eprintln!("{err}");
process::exit(2);
}
}
16 changes: 8 additions & 8 deletions examples/test-mixed_v1.24_upgrade.md
@@ -76,12 +76,12 @@
| K8S001 || v1.21 | v1.23 | +2 | 2 |
| K8S001 || v1.22 | v1.23 | +1 | 2 |

-| | NAME | NODE | CONTROL PLANE | SKEW |
-|----|----------------------------|-------|---------------|------|
-|| ip-10-0-10-49.ec2.internal | v1.21 | v1.23 | +2 |
-|| ip-10-0-14-22.ec2.internal | v1.22 | v1.23 | +1 |
-|| ip-10-0-20-62.ec2.internal | v1.22 | v1.23 | +1 |
-|| ip-10-0-7-12.ec2.internal | v1.21 | v1.23 | +2 |
+| | NAME | NODE | CONTROL PLANE | SKEW |
+|----|-----------------------------|-------|---------------|------|
+|| ip-10-0-0-100.ec2.internal | v1.21 | v1.23 | +2 |
+|| ip-10-0-14-188.ec2.internal | v1.22 | v1.23 | +1 |
+|| ip-10-0-19-35.ec2.internal | v1.21 | v1.23 | +2 |
+|| ip-10-0-40-93.ec2.internal | v1.22 | v1.23 | +1 |


3. Verify that there are at least 5 free IPs in the VPC subnets used by the control plane. Amazon EKS creates new elastic network interfaces (ENIs) in any of the subnets specified for the control plane. If there are not enough available IPs, then the upgrade will fail (your control plane will stay on the prior version).
@@ -337,7 +337,7 @@ The default update strategy for EKS managed nodegroups is a surge, rolling updat
Check [[EKS006]](https://clowdhaus.github.io/eksup/process/checks/#eks006)
| | MANAGED NODEGROUP | LAUNCH TEMP ID | CURRENT | LATEST |
|---|-------------------------------------|----------------------|---------|--------|
-| ⚠️ | standard-20230310135434793800000027 | lt-0d8873f5c893efaa0 | 1 | 2 |
+| ⚠️ | standard-20230311143408696200000027 | lt-0a9ebcea03f330711 | 1 | 2 |
##### Upgrade
Expand Down Expand Up @@ -413,7 +413,7 @@ A starting point for the instance refresh configuration is to use a value of 70%
Check [[EKS007]](https://clowdhaus.github.io/eksup/process/checks/#eks007)
| | AUTOSCALING GROUP | LAUNCH TEMP ID | CURRENT | LATEST |
|---|--------------------------------------|----------------------|---------|--------|
-| ⚠️ | different-20230310135435081600000029 | lt-0a880c2680a8cf174 | 1 | 2 |
+| ⚠️ | different-20230311143408778000000029 | lt-061e6a6f3cc5c1db9 | 1 | 2 |
##### Upgrade
