From 56970ac613c963cf04f688f75d4cffdf2f05c64c Mon Sep 17 00:00:00 2001
From: Bryant Biggs
Date: Thu, 16 Feb 2023 17:57:23 -0500
Subject: [PATCH] feat: Add support for reporting on the use of readiness
 probes [K8S006] (#21)

---
 docs/process/checks.md               |  6 --
 eksup/src/analysis.rs                |  1 +
 eksup/src/k8s/checks.rs              | 47 +++++++++++---
 eksup/src/k8s/findings.rs            |  3 +
 eksup/src/k8s/resources.rs           | 93 ++++++++++++++++++++++------
 eksup/src/playbook.rs                |  2 +
 eksup/templates/playbook.md          |  3 +
 examples/test-mixed_v1.24_upgrade.md | 11 +++-
 8 files changed, 130 insertions(+), 36 deletions(-)

diff --git a/docs/process/checks.md b/docs/process/checks.md
index bbdf461..f7df6cc 100644
--- a/docs/process/checks.md
+++ b/docs/process/checks.md
@@ -211,16 +211,10 @@ Either `.spec.affinity.podAntiAffinity` or `.spec.topologySpreadConstraints` is
 
 #### K8S006
 
-!!! info "🚧 _Not yet implemented_"
-
 **❌ Remediation required**
 
 A `readinessProbe` must be set to ensure traffic is not sent to pods before they are ready following their re-deployment from a node replacement.
 
-**⚠️ Remediation recommended**
-
-If a `livenessProbe` is provided, it should not be the same as `readinessProbe`, and a `startupProbe` should also accompany it.
-
 #### K8S007
 
 !!! info "🚧 _Not yet implemented_"
diff --git a/eksup/src/analysis.rs b/eksup/src/analysis.rs
index 394e4c6..12830d7 100644
--- a/eksup/src/analysis.rs
+++ b/eksup/src/analysis.rs
@@ -34,6 +34,7 @@ impl Results {
     output.push_str(&self.data_plane.version_skew.to_stdout_table()?);
     output.push_str(&self.kubernetes.min_replicas.to_stdout_table()?);
     output.push_str(&self.kubernetes.min_ready_seconds.to_stdout_table()?);
+    output.push_str(&self.kubernetes.readiness_probe.to_stdout_table()?);
 
     Ok(output)
   }
diff --git a/eksup/src/k8s/checks.rs b/eksup/src/k8s/checks.rs
index d7d39af..5c65736 100644
--- a/eksup/src/k8s/checks.rs
+++ b/eksup/src/k8s/checks.rs
@@ -246,16 +246,43 @@ pub(crate) struct PodTopologyDistributionFinding {
   pub(crate) fcode: finding::Code,
 }
 
-#[derive(Debug, Serialize, Deserialize)]
-pub(crate) struct ProbeFinding {
+#[derive(Debug, Serialize, Deserialize, Tabled)]
+#[tabled(rename_all = "UpperCase")]
+pub struct Probe {
+  #[tabled(inline)]
+  pub finding: finding::Finding,
+
+  #[tabled(inline)]
   pub(crate) resource: Resource,
-  ///
-  pub(crate) readiness: Option<String>,
-  pub(crate) liveness: Option<String>,
-  pub(crate) startup: Option<String>,
+}
 
-  pub(crate) remediation: finding::Remediation,
-  pub(crate) fcode: finding::Code,
+impl Findings for Vec<Probe> {
+  fn to_markdown_table(&self, leading_whitespace: &str) -> Result<String> {
+    if self.is_empty() {
+      return Ok(format!(
+        "{leading_whitespace}✅ - All relevant Kubernetes workloads have a readiness probe configured"
+      ));
+    }
+
+    let mut table = Table::new(self);
+    table
+      .with(Disable::column(ByColumnName::new("CHECK")))
+      .with(Margin::new(1, 0, 0, 0).set_fill('\t', 'x', 'x', 'x'))
+      .with(Style::markdown());
+
+    Ok(format!("{table}\n"))
+  }
+
+  fn to_stdout_table(&self) -> Result<String> {
+    if self.is_empty() {
+      return Ok("".to_owned());
+    }
+
+    let mut table = Table::new(self);
+    table.with(Style::sharp());
+
+    Ok(format!("{table}\n"))
+  }
 }
 
 #[derive(Debug, Serialize, Deserialize)]
@@ -286,8 +313,8 @@ pub trait K8sFindings {
   // fn pod_disruption_budget(&self) -> Result<Option<PodDisruptionBudgetFinding>>;
   // /// K8S005 - check if resources have podAntiAffinity or topologySpreadConstraints
   // fn pod_topology_distribution(&self) -> Result<Option<PodTopologyDistributionFinding>>;
-  // /// K8S006 - check if resources have readinessProbe
-  // fn readiness_probe(&self) -> Result<Option<ProbeFinding>>;
+  /// K8S006 - check if resources have readinessProbe
+  fn readiness_probe(&self) -> Option<Probe>;
   // /// K8S008 - check if resources use the Docker socket
   // fn docker_socket(&self) -> Result<Option<DockerSocketFinding>>;
 }
diff --git a/eksup/src/k8s/findings.rs b/eksup/src/k8s/findings.rs
index fc7fe6c..22cc487 100644
--- a/eksup/src/k8s/findings.rs
+++ b/eksup/src/k8s/findings.rs
@@ -11,6 +11,7 @@ use crate::k8s::{
 pub struct KubernetesFindings {
   pub min_replicas: Vec<checks::MinReplicas>,
   pub min_ready_seconds: Vec<checks::MinReadySeconds>,
+  pub readiness_probe: Vec<checks::Probe>,
 }
 
 pub async fn get_kubernetes_findings(k8s_client: &K8sClient) -> Result<KubernetesFindings> {
@@ -19,9 +20,11 @@
 
   let min_replicas: Vec<checks::MinReplicas> = resources.iter().filter_map(|s| s.min_replicas()).collect();
   let min_ready_seconds: Vec<checks::MinReadySeconds> = resources.iter().filter_map(|s| s.min_ready_seconds()).collect();
+  let readiness_probe: Vec<checks::Probe> = resources.iter().filter_map(|s| s.readiness_probe()).collect();
 
   Ok(KubernetesFindings {
     min_replicas,
     min_ready_seconds,
+    readiness_probe,
   })
 }
diff --git a/eksup/src/k8s/resources.rs b/eksup/src/k8s/resources.rs
index 27da6c4..a6de6a1 100644
--- a/eksup/src/k8s/resources.rs
+++ b/eksup/src/k8s/resources.rs
@@ -7,10 +7,7 @@ use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
 use tabled::Tabled;
 
-use crate::{
-  finding,
-  k8s::checks::{K8sFindings, MinReadySeconds, MinReplicas},
-};
+use crate::{finding, k8s::checks};
 
 /// Custom resource definition for ENIConfig as specified in the AWS VPC CNI
 ///
@@ -33,6 +30,31 @@ pub struct EniConfigSpec {
   pub subnet: Option<String>,
   pub security_groups: Option<Vec<String>>,
 }
 
+#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
+pub enum Kind {
+  DaemonSet,
+  Deployment,
+  ReplicaSet,
+  ReplicationController,
+  StatefulSet,
+  CronJob,
+  Job,
+}
+
+impl std::fmt::Display for Kind {
+  fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+    match *self {
+      Kind::DaemonSet => write!(f, "DaemonSet"),
+      Kind::Deployment => write!(f, "Deployment"),
+      Kind::ReplicaSet => write!(f, "ReplicaSet"),
+      Kind::ReplicationController => write!(f, "ReplicationController"),
+      Kind::StatefulSet => write!(f, "StatefulSet"),
+      Kind::CronJob => write!(f, "CronJob"),
+      Kind::Job => write!(f, "Job"),
+    }
+  }
+}
+
 /// Returns all of the ENIConfigs in the cluster, if any are present
 ///
 /// This is used to extract the subnet ID(s) to retrieve the number of
@@ -58,7 +80,7 @@ async fn get_deployments(client: &Client) -> Result<Vec<StdResource>> {
       let metadata = StdMetadata {
         name: objmeta.name.unwrap(),
         namespace: objmeta.namespace.unwrap(),
-        kind: "Deployment".to_string(),
+        kind: Kind::Deployment,
         labels: objmeta.labels.unwrap_or_default(),
         annotations: objmeta.annotations.unwrap_or_default(),
       };
@@ -90,7 +112,7 @@ async fn _get_replicasets(client: &Client) -> Result<Vec<StdResource>> {
      let metadata = StdMetadata {
         name: objmeta.name.unwrap(),
         namespace: objmeta.namespace.unwrap(),
-        kind: "ReplicaSet".to_string(),
+        kind: Kind::ReplicaSet,
         labels: objmeta.labels.unwrap_or_default(),
         annotations: objmeta.annotations.unwrap_or_default(),
       };
@@ -122,7 +144,7 @@ async fn get_statefulsets(client: &Client) -> Result<Vec<StdResource>> {
       let metadata = StdMetadata {
         name: objmeta.name.unwrap(),
         namespace: objmeta.namespace.unwrap(),
-        kind: "StatefulSet".to_string(),
+        kind: Kind::StatefulSet,
         labels: objmeta.labels.unwrap_or_default(),
         annotations: objmeta.annotations.unwrap_or_default(),
       };
@@ -154,7 +176,7 @@ async fn get_daemonsets(client: &Client) -> Result<Vec<StdResource>> {
       let metadata = StdMetadata {
         name: objmeta.name.unwrap(),
         namespace: objmeta.namespace.unwrap(),
-        kind: "DaemonSet".to_string(),
+        kind: Kind::DaemonSet,
         labels: objmeta.labels.unwrap_or_default(),
         annotations: objmeta.annotations.unwrap_or_default(),
       };
@@ -186,7 +208,7 @@ async fn get_jobs(client: &Client) -> Result<Vec<StdResource>> {
       let metadata = StdMetadata {
         name: objmeta.name.unwrap(),
         namespace: objmeta.namespace.unwrap(),
-        kind: "Job".to_string(),
+        kind: Kind::Job,
         labels: objmeta.labels.unwrap_or_default(),
         annotations: objmeta.annotations.unwrap_or_default(),
       };
@@ -218,7 +240,7 @@ async fn get_cronjobs(client: &Client) -> Result<Vec<StdResource>> {
       let metadata = StdMetadata {
         name: objmeta.name.unwrap(),
         namespace: objmeta.namespace.unwrap(),
-        kind: "CronJob".to_string(),
+        kind: Kind::CronJob,
         labels: objmeta.labels.unwrap_or_default(),
         annotations: objmeta.annotations.unwrap_or_default(),
       };
@@ -265,14 +287,14 @@ pub struct Resource {
   /// Namespace where the resource is provisioned
   pub namespace: String,
   /// Kind of the resource
-  pub kind: String,
+  pub kind: Kind,
 }
 
 #[derive(Debug, Serialize, Deserialize)]
 pub struct StdMetadata {
   pub name: String,
   pub namespace: String,
-  pub kind: String,
+  pub kind: Kind,
   pub labels: BTreeMap<String, String>,
   pub annotations: BTreeMap<String, String>,
 }
@@ -297,16 +319,16 @@ pub struct StdResource {
   pub spec: StdSpec,
 }
 
-impl K8sFindings for StdResource {
+impl checks::K8sFindings for StdResource {
   fn get_resource(&self) -> Resource {
     Resource {
       name: self.metadata.name.to_owned(),
       namespace: self.metadata.namespace.to_owned(),
-      kind: self.metadata.kind.to_string(),
+      kind: self.metadata.kind.to_owned(),
     }
   }
 
-  fn min_replicas(&self) -> Option<MinReplicas> {
+  fn min_replicas(&self) -> Option<checks::MinReplicas> {
     let replicas = self.spec.replicas;
 
     match replicas {
@@ -318,7 +340,7 @@ impl K8sFindings for StdResource {
           symbol: remediation.symbol(),
           remediation,
         };
-        Some(MinReplicas {
+        Some(checks::MinReplicas {
           finding,
           resource: self.get_resource(),
           replicas,
@@ -331,7 +353,7 @@ impl K8sFindings for StdResource {
     }
   }
 
-  fn min_ready_seconds(&self) -> Option<MinReadySeconds> {
+  fn min_ready_seconds(&self) -> Option<checks::MinReadySeconds> {
     let seconds = self.spec.min_ready_seconds;
 
     match seconds {
@@ -344,7 +366,7 @@ impl K8sFindings for StdResource {
           symbol: remediation.symbol(),
           remediation,
         };
-        Some(MinReadySeconds {
+        Some(checks::MinReadySeconds {
           finding,
           resource: self.get_resource(),
           seconds,
@@ -356,6 +378,41 @@ impl K8sFindings for StdResource {
       None => None,
     }
   }
+
+  fn readiness_probe(&self) -> Option<checks::Probe> {
+    let pod_template = self.spec.template.to_owned();
+
+    let resource = self.get_resource();
+    match resource.kind {
+      Kind::DaemonSet | Kind::Job | Kind::CronJob => return None,
+      _ => (),
+    }
+
+    match pod_template {
+      Some(pod_template) => {
+        let containers = pod_template.spec.unwrap_or_default().containers;
+
+        for container in containers {
+          if container.readiness_probe.is_none() {
+            let remediation = finding::Remediation::Required;
+            let finding = finding::Finding {
+              code: finding::Code::K8S006,
+              symbol: remediation.symbol(),
+              remediation,
+            };
+
+            // As soon as we find one container without a readiness probe, we return the finding
+            return Some(checks::Probe {
+              finding,
+              resource: self.get_resource(),
+            });
+          }
+        }
+        None
+      }
+      None => None,
+    }
+  }
 }
 
 pub async fn get_resources(client: &Client) -> Result<Vec<StdResource>> {
diff --git a/eksup/src/playbook.rs b/eksup/src/playbook.rs
index 7286ead..4e8970c 100644
--- a/eksup/src/playbook.rs
+++ b/eksup/src/playbook.rs
@@ -57,6 +57,7 @@ pub struct TemplateData {
   // kubernetes_findings: k8s::KubernetesFindings,
   min_replicas: String,
   min_ready_seconds: String,
+  readiness_probe: String,
 }
 
 fn get_release_data() -> Result<HashMap<String, Release>> {
@@ -177,6 +178,7 @@ pub(crate) fn create(args: &Playbook, cluster: &Cluster, analysis: analysis::Res
     // kubernetes_findings,
     min_replicas: kubernetes_findings.min_replicas.to_markdown_table("\t")?,
     min_ready_seconds: kubernetes_findings.min_ready_seconds.to_markdown_table("\t")?,
+    readiness_probe: kubernetes_findings.readiness_probe.to_markdown_table("\t")?,
   };
 
   let filename = match &args.filename {
diff --git a/eksup/templates/playbook.md b/eksup/templates/playbook.md
index 2ac4f3b..881a333 100644
--- a/eksup/templates/playbook.md
+++ b/eksup/templates/playbook.md
@@ -209,6 +209,9 @@ When upgrading the control plane, Amazon EKS performs standard infrastructure an
    #### Check [[K8S003]](https://clowdhaus.github.io/eksup/process/checks/#k8s003)
 {{ min_ready_seconds }}
 
+   #### Check [[K8S006]](https://clowdhaus.github.io/eksup/process/checks/#k8s006)
+{{ readiness_probe }}
+
 2. Inspect [AWS service quotas](https://docs.aws.amazon.com/general/latest/gr/aws_service_limits.html) before upgrading. Accounts that are multi-tenant or already have a number of resources provisioned may be at risk of hitting service quota limits which will cause the cluster upgrade to fail, or impede the upgrade process.
 
 {{#if pod_ips}}
diff --git a/examples/test-mixed_v1.24_upgrade.md b/examples/test-mixed_v1.24_upgrade.md
index 7f8806f..818bd2a 100644
--- a/examples/test-mixed_v1.24_upgrade.md
+++ b/examples/test-mixed_v1.24_upgrade.md
@@ -73,8 +73,8 @@
    #### Check [[K8S001]](https://clowdhaus.github.io/eksup/process/checks/#k8s001)
    | CHECK  |    | NODE  | CONTROL PLANE | SKEW | QUANTITY |
    |--------|----|-------|---------------|------|----------|
-   | K8S001 | ❌ | v1.21 | v1.23         | +2   | 2        |
    | K8S001 | ⚠️ | v1.22 | v1.23         | +1   | 2        |
+   | K8S001 | ❌ | v1.21 | v1.23         | +2   | 2        |
 
    |    | NAME                        | NODE  | CONTROL PLANE | SKEW |
    |----|-----------------------------|-------|---------------|------|
@@ -142,7 +142,7 @@
    #### Check [[EKS005]](https://clowdhaus.github.io/eksup/process/checks/#eks005)
    |    | NAME       | CURRENT             | LATEST             | DEFAULT            |
    |----|------------|---------------------|--------------------|--------------------|
-   | ⚠️ | coredns    | v1.8.4-eksbuild.2   | v1.8.7-eksbuild.3  | v1.8.7-eksbuild.3  |
+   | ⚠️ | coredns    | v1.8.4-eksbuild.2   | v1.9.3-eksbuild.2  | v1.8.7-eksbuild.3  |
    | ❌ | kube-proxy | v1.21.14-eksbuild.3 | v1.24.9-eksbuild.1 | v1.24.7-eksbuild.2 |
    | ❌ | vpc-cni    | v1.11.3-eksbuild.3  | v1.12.2-eksbuild.1 | v1.11.4-eksbuild.1 |
 
@@ -221,6 +221,13 @@ When upgrading the control plane, Amazon EKS performs standard infrastructure an
    #### Check [[K8S003]](https://clowdhaus.github.io/eksup/process/checks/#k8s003)
    ✅ - All relevant Kubernetes workloads minReadySeconds set to more than 0
 
+   #### Check [[K8S006]](https://clowdhaus.github.io/eksup/process/checks/#k8s006)
+   |    | NAME    | NAMESPACE   | KIND        |
+   |----|---------|-------------|-------------|
+   | ❌ | bad-dpl | deployment  | Deployment  |
+   | ❌ | bad-ss  | statefulset | StatefulSet |
+
+
 2. Inspect [AWS service quotas](https://docs.aws.amazon.com/general/latest/gr/aws_service_limits.html) before upgrading. Accounts that are multi-tenant or already have a number of resources provisioned may be at risk of hitting service quota limits which will cause the cluster upgrade to fail, or impede the upgrade process.
 
 3. Verify that there is sufficient IP space available to the pods running in the cluster when using custom networking. With the in-place, surge upgrade process, there will be higher IP consumption during the upgrade.
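
The heart of the new K8S006 check is the per-container scan in `readiness_probe()` above: a workload is flagged as soon as any container in its pod template omits a `readinessProbe` (DaemonSets, Jobs, and CronJobs are skipped). The snippet below is a minimal standalone sketch of that scan; `PodSpec`, `Container`, and `Probe` here are simplified stand-ins for the k8s-openapi types the real code uses, not eksup's actual API.

```rust
// Simplified stand-ins for the k8s-openapi pod types (illustrative only).
struct Probe;

struct Container {
  name: String,
  readiness_probe: Option<Probe>,
}

struct PodSpec {
  containers: Vec<Container>,
}

/// Returns the name of the first container missing a readiness probe, if any.
/// Mirrors the early-return behavior of the check: one offending container is
/// enough to flag the whole workload.
fn first_container_without_readiness_probe(spec: &PodSpec) -> Option<&str> {
  spec
    .containers
    .iter()
    .find(|c| c.readiness_probe.is_none())
    .map(|c| c.name.as_str())
}

fn main() {
  let spec = PodSpec {
    containers: vec![
      Container { name: "app".to_string(), readiness_probe: Some(Probe) },
      Container { name: "sidecar".to_string(), readiness_probe: None },
    ],
  };

  // Prints: K8S006 ❌ - container "sidecar" has no readinessProbe
  match first_container_without_readiness_probe(&spec) {
    Some(name) => println!("K8S006 ❌ - container {name:?} has no readinessProbe"),
    None => println!("K8S006 ✅ - all containers define a readinessProbe"),
  }
}
```

In the actual check, a hit is wrapped in a `checks::Probe` finding with `finding::Remediation::Required`, which is what feeds the markdown and stdout tables shown in the diff above.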