From eb80c097557dffff21d2bcbbc07520c400dfbf39 Mon Sep 17 00:00:00 2001 From: Pranshu Srivastava Date: Thu, 13 Jun 2024 02:45:58 +0530 Subject: [PATCH 1/2] enhancement: add `livez` endpoint Add a `livez` endpoint to identify network outages. This helps in restarting the binary if such a case is observed. Signed-off-by: Pranshu Srivastava Signed-off-by: Pranshu Srivastava --- README.md | 8 +++++ README.md.tpl | 8 +++++ examples/autosharding/statefulset.yaml | 2 +- examples/daemonsetsharding/daemonset.yaml | 2 +- .../deployment-no-node-pods.yaml | 2 +- examples/daemonsetsharding/deployment.yaml | 2 +- examples/standard/deployment.yaml | 2 +- .../kube-state-metrics.libsonnet | 2 +- pkg/app/server.go | 35 +++++++++++++++---- 9 files changed, 51 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index fbebaaf9fc..e263c0dbec 100644 --- a/README.md +++ b/README.md @@ -342,6 +342,14 @@ Note that your GCP identity is case sensitive but `gcloud info` as of Google Clo After running the above, if you see `Clusterrolebinding "cluster-admin-binding" created`, then you are able to continue with the setup of this service. +#### Healthcheck Endpoints + +The following healthcheck endpoints are available, some of which are used to determine the result of the aforementioned probes: + +* `/livez`: Returns a 200 status code if the application is not affected by an outage of the Kubernetes API Server. We recommend to use this as a liveness probe. +* `/metrics`: Returns a 200 status code if the application is able to serve metrics. While this is available for both ports, we recommend to use the telemetry metrics endpoint as a readiness probe. +* `/healthz`: Returns a 200 status code if the application is running. We recommend to use this as a startup probe. 
+ #### Limited privileges environment If you want to run kube-state-metrics in an environment where you don't have cluster-reader role, you can: diff --git a/README.md.tpl b/README.md.tpl index 932b186c12..66bfe837ba 100644 --- a/README.md.tpl +++ b/README.md.tpl @@ -343,6 +343,14 @@ Note that your GCP identity is case sensitive but `gcloud info` as of Google Clo After running the above, if you see `Clusterrolebinding "cluster-admin-binding" created`, then you are able to continue with the setup of this service. +#### Healthcheck Endpoints + +The following healthcheck endpoints are available, some of which are used to determine the result of the aforementioned probes: + +* `/livez`: Returns a 200 status code if the application is not affected by an outage of the Kubernetes API Server. We recommend to use this as a liveness probe. +* `/metrics`: Returns a 200 status code if the application is able to serve metrics. While this is available for both ports, we recommend to use the telemetry metrics endpoint as a readiness probe. +* `/healthz`: Returns a 200 status code if the application is running. We recommend to use this as a startup probe. 
+ #### Limited privileges environment If you want to run kube-state-metrics in an environment where you don't have cluster-reader role, you can: diff --git a/examples/autosharding/statefulset.yaml b/examples/autosharding/statefulset.yaml index 37dd37153f..cc12a09152 100644 --- a/examples/autosharding/statefulset.yaml +++ b/examples/autosharding/statefulset.yaml @@ -37,7 +37,7 @@ spec: image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.12.0 livenessProbe: httpGet: - path: /healthz + path: /livez port: 8080 initialDelaySeconds: 5 timeoutSeconds: 5 diff --git a/examples/daemonsetsharding/daemonset.yaml b/examples/daemonsetsharding/daemonset.yaml index 7a4ea43d87..897a296848 100644 --- a/examples/daemonsetsharding/daemonset.yaml +++ b/examples/daemonsetsharding/daemonset.yaml @@ -32,7 +32,7 @@ spec: image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.12.0 livenessProbe: httpGet: - path: /healthz + path: /livez port: 8080 initialDelaySeconds: 5 timeoutSeconds: 5 diff --git a/examples/daemonsetsharding/deployment-no-node-pods.yaml b/examples/daemonsetsharding/deployment-no-node-pods.yaml index a4b4032214..c5995a30d1 100644 --- a/examples/daemonsetsharding/deployment-no-node-pods.yaml +++ b/examples/daemonsetsharding/deployment-no-node-pods.yaml @@ -27,7 +27,7 @@ spec: image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.12.0 livenessProbe: httpGet: - path: /healthz + path: /livez port: 8080 initialDelaySeconds: 5 timeoutSeconds: 5 diff --git a/examples/daemonsetsharding/deployment.yaml b/examples/daemonsetsharding/deployment.yaml index cbd6d7dd8d..2973ddc2fb 100644 --- a/examples/daemonsetsharding/deployment.yaml +++ b/examples/daemonsetsharding/deployment.yaml @@ -26,7 +26,7 @@ spec: image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.12.0 livenessProbe: httpGet: - path: /healthz + path: /livez port: 8080 initialDelaySeconds: 5 timeoutSeconds: 5 diff --git a/examples/standard/deployment.yaml 
b/examples/standard/deployment.yaml index 85c3cec3e4..3f6ce8bc21 100644 --- a/examples/standard/deployment.yaml +++ b/examples/standard/deployment.yaml @@ -24,7 +24,7 @@ spec: - image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.12.0 livenessProbe: httpGet: - path: /healthz + path: /livez port: 8080 initialDelaySeconds: 5 timeoutSeconds: 5 diff --git a/jsonnet/kube-state-metrics/kube-state-metrics.libsonnet b/jsonnet/kube-state-metrics/kube-state-metrics.libsonnet index a7c2904123..e392152dfe 100644 --- a/jsonnet/kube-state-metrics/kube-state-metrics.libsonnet +++ b/jsonnet/kube-state-metrics/kube-state-metrics.libsonnet @@ -193,7 +193,7 @@ }, livenessProbe: { timeoutSeconds: 5, initialDelaySeconds: 5, httpGet: { port: 8080, - path: '/healthz', + path: '/livez', } }, readinessProbe: { timeoutSeconds: 5, initialDelaySeconds: 5, httpGet: { port: 8081, diff --git a/pkg/app/server.go b/pkg/app/server.go index 3efd97b354..6a7db543e5 100644 --- a/pkg/app/server.go +++ b/pkg/app/server.go @@ -30,6 +30,12 @@ import ( "strings" "time" + "gopkg.in/yaml.v3" + "k8s.io/client-go/kubernetes" + _ "k8s.io/client-go/plugin/pkg/client/auth" // Initialize common client auth plugins. + "k8s.io/client-go/tools/clientcmd" + "k8s.io/klog/v2" + "github.com/oklog/run" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/collectors" @@ -38,10 +44,6 @@ import ( "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/prometheus/common/version" "github.com/prometheus/exporter-toolkit/web" - "gopkg.in/yaml.v3" - _ "k8s.io/client-go/plugin/pkg/client/auth" // Initialize common client auth plugins. 
- "k8s.io/client-go/tools/clientcmd" - "k8s.io/klog/v2" "k8s.io/kube-state-metrics/v2/internal/discovery" "k8s.io/kube-state-metrics/v2/internal/store" @@ -59,6 +61,7 @@ import ( const ( metricsPath = "/metrics" healthzPath = "/healthz" + livezPath = "/livez" ) // promLogger implements promhttp.Logger @@ -321,7 +324,7 @@ func RunKubeStateMetrics(ctx context.Context, opts *options.Options) error { WebConfigFile: &tlsConfig, } - metricsMux := buildMetricsServer(m, durationVec) + metricsMux := buildMetricsServer(m, durationVec, kubeClient) metricsServerListenAddress := net.JoinHostPort(opts.Host, strconv.Itoa(opts.Port)) metricsServer := http.Server{ Handler: metricsMux, @@ -393,7 +396,7 @@ func buildTelemetryServer(registry prometheus.Gatherer) *http.ServeMux { return mux } -func buildMetricsServer(m *metricshandler.MetricsHandler, durationObserver prometheus.ObserverVec) *http.ServeMux { +func buildMetricsServer(m *metricshandler.MetricsHandler, durationObserver prometheus.ObserverVec, client kubernetes.Interface) *http.ServeMux { mux := http.NewServeMux() // TODO: This doesn't belong into serveMetrics @@ -403,7 +406,23 @@ func buildMetricsServer(m *metricshandler.MetricsHandler, durationObserver prome mux.Handle("/debug/pprof/symbol", http.HandlerFunc(pprof.Symbol)) mux.Handle("/debug/pprof/trace", http.HandlerFunc(pprof.Trace)) + // Add metricsPath mux.Handle(metricsPath, promhttp.InstrumentHandlerDuration(durationObserver, m)) + + // Add livezPath + mux.Handle(livezPath, http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + + // Query the Kube API to make sure we are not affected by a network outage. 
+ got := client.CoreV1().RESTClient().Get().AbsPath("/livez").Do(context.Background()) + if got.Error() != nil { + w.WriteHeader(http.StatusServiceUnavailable) + w.Write([]byte(http.StatusText(http.StatusServiceUnavailable))) + return + } + w.WriteHeader(http.StatusOK) + w.Write([]byte(http.StatusText(http.StatusOK))) + })) + // Add healthzPath mux.HandleFunc(healthzPath, func(w http.ResponseWriter, _ *http.Request) { w.WriteHeader(http.StatusOK) @@ -424,6 +443,10 @@ func buildMetricsServer(m *metricshandler.MetricsHandler, durationObserver prome Address: healthzPath, Text: "Healthz", }, + { + Address: livezPath, + Text: "Livez", + }, }, } landingPage, err := web.NewLandingPage(landingConfig) From 6f8f7d1f7be92178302e4f21aec59fb84ae42091 Mon Sep 17 00:00:00 2001 From: Pranshu Srivastava Date: Tue, 25 Jun 2024 15:00:20 +0530 Subject: [PATCH 2/2] fixup! enhancement: add `livez` endpoint --- examples/autosharding/statefulset.yaml | 2 +- examples/daemonsetsharding/daemonset.yaml | 2 +- examples/daemonsetsharding/deployment-no-node-pods.yaml | 2 +- examples/daemonsetsharding/deployment.yaml | 2 +- examples/standard/deployment.yaml | 2 +- jsonnet/kube-state-metrics/kube-state-metrics.libsonnet | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/autosharding/statefulset.yaml b/examples/autosharding/statefulset.yaml index cc12a09152..2b3a8e39b5 100644 --- a/examples/autosharding/statefulset.yaml +++ b/examples/autosharding/statefulset.yaml @@ -49,7 +49,7 @@ spec: name: telemetry readinessProbe: httpGet: - path: / + path: /metrics port: 8081 initialDelaySeconds: 5 timeoutSeconds: 5 diff --git a/examples/daemonsetsharding/daemonset.yaml b/examples/daemonsetsharding/daemonset.yaml index 897a296848..e856c673f7 100644 --- a/examples/daemonsetsharding/daemonset.yaml +++ b/examples/daemonsetsharding/daemonset.yaml @@ -44,7 +44,7 @@ spec: name: telemetry readinessProbe: httpGet: - path: / + path: /metrics port: 8081 initialDelaySeconds: 5 timeoutSeconds: 
5 diff --git a/examples/daemonsetsharding/deployment-no-node-pods.yaml b/examples/daemonsetsharding/deployment-no-node-pods.yaml index c5995a30d1..8f6e7b4482 100644 --- a/examples/daemonsetsharding/deployment-no-node-pods.yaml +++ b/examples/daemonsetsharding/deployment-no-node-pods.yaml @@ -39,7 +39,7 @@ spec: name: telemetry readinessProbe: httpGet: - path: / + path: /metrics port: 8081 initialDelaySeconds: 5 timeoutSeconds: 5 diff --git a/examples/daemonsetsharding/deployment.yaml b/examples/daemonsetsharding/deployment.yaml index 2973ddc2fb..3b51b034d6 100644 --- a/examples/daemonsetsharding/deployment.yaml +++ b/examples/daemonsetsharding/deployment.yaml @@ -38,7 +38,7 @@ spec: name: telemetry readinessProbe: httpGet: - path: / + path: /metrics port: 8081 initialDelaySeconds: 5 timeoutSeconds: 5 diff --git a/examples/standard/deployment.yaml b/examples/standard/deployment.yaml index 3f6ce8bc21..ce74e4fbf9 100644 --- a/examples/standard/deployment.yaml +++ b/examples/standard/deployment.yaml @@ -36,7 +36,7 @@ spec: name: telemetry readinessProbe: httpGet: - path: / + path: /metrics port: 8081 initialDelaySeconds: 5 timeoutSeconds: 5 diff --git a/jsonnet/kube-state-metrics/kube-state-metrics.libsonnet b/jsonnet/kube-state-metrics/kube-state-metrics.libsonnet index e392152dfe..82695482b4 100644 --- a/jsonnet/kube-state-metrics/kube-state-metrics.libsonnet +++ b/jsonnet/kube-state-metrics/kube-state-metrics.libsonnet @@ -197,7 +197,7 @@ } }, readinessProbe: { timeoutSeconds: 5, initialDelaySeconds: 5, httpGet: { port: 8081, - path: '/', + path: '/metrics', } }, };