Skip to content

Commit

Permalink
Merge pull request #2418 from rexagod/livez
Browse files Browse the repository at this point in the history
feat: add `livez` endpoint
  • Loading branch information
k8s-ci-robot authored Jun 25, 2024
2 parents f4ab888 + 6f8f7d1 commit d862cac
Show file tree
Hide file tree
Showing 9 changed files with 57 additions and 18 deletions.
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,14 @@ Note that your GCP identity is case sensitive but `gcloud info` as of Google Clo

After running the above, if you see `Clusterrolebinding "cluster-admin-binding" created`, then you are able to continue with the setup of this service.

#### Healthcheck Endpoints

The following healthcheck endpoints are available, some of which are used to determine the result of the aforementioned probes:

* `/livez`: Returns a 200 status code if the application is not affected by an outage of the Kubernetes API Server. We recommend using this as a liveness probe.
* `/metrics`: Returns a 200 status code if the application is able to serve metrics. Although this endpoint is available on both ports, we recommend using the one on the telemetry port as a readiness probe.
* `/healthz`: Returns a 200 status code if the application is running. We recommend using this as a startup probe.

#### Limited privileges environment

If you want to run kube-state-metrics in an environment where you don't have cluster-reader role, you can:
Expand Down
8 changes: 8 additions & 0 deletions README.md.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,14 @@ Note that your GCP identity is case sensitive but `gcloud info` as of Google Clo

After running the above, if you see `Clusterrolebinding "cluster-admin-binding" created`, then you are able to continue with the setup of this service.

#### Healthcheck Endpoints

The following healthcheck endpoints are available, some of which are used to determine the result of the aforementioned probes:

* `/livez`: Returns a 200 status code if the application is not affected by an outage of the Kubernetes API Server. We recommend using this as a liveness probe.
* `/metrics`: Returns a 200 status code if the application is able to serve metrics. Although this endpoint is available on both ports, we recommend using the one on the telemetry port as a readiness probe.
* `/healthz`: Returns a 200 status code if the application is running. We recommend using this as a startup probe.

#### Limited privileges environment

If you want to run kube-state-metrics in an environment where you don't have cluster-reader role, you can:
Expand Down
4 changes: 2 additions & 2 deletions examples/autosharding/statefulset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ spec:
image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.12.0
livenessProbe:
httpGet:
path: /healthz
path: /livez
port: 8080
initialDelaySeconds: 5
timeoutSeconds: 5
Expand All @@ -49,7 +49,7 @@ spec:
name: telemetry
readinessProbe:
httpGet:
path: /
path: /metrics
port: 8081
initialDelaySeconds: 5
timeoutSeconds: 5
Expand Down
4 changes: 2 additions & 2 deletions examples/daemonsetsharding/daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ spec:
image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.12.0
livenessProbe:
httpGet:
path: /healthz
path: /livez
port: 8080
initialDelaySeconds: 5
timeoutSeconds: 5
Expand All @@ -44,7 +44,7 @@ spec:
name: telemetry
readinessProbe:
httpGet:
path: /
path: /metrics
port: 8081
initialDelaySeconds: 5
timeoutSeconds: 5
Expand Down
4 changes: 2 additions & 2 deletions examples/daemonsetsharding/deployment-no-node-pods.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ spec:
image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.12.0
livenessProbe:
httpGet:
path: /healthz
path: /livez
port: 8080
initialDelaySeconds: 5
timeoutSeconds: 5
Expand All @@ -39,7 +39,7 @@ spec:
name: telemetry
readinessProbe:
httpGet:
path: /
path: /metrics
port: 8081
initialDelaySeconds: 5
timeoutSeconds: 5
Expand Down
4 changes: 2 additions & 2 deletions examples/daemonsetsharding/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ spec:
image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.12.0
livenessProbe:
httpGet:
path: /healthz
path: /livez
port: 8080
initialDelaySeconds: 5
timeoutSeconds: 5
Expand All @@ -38,7 +38,7 @@ spec:
name: telemetry
readinessProbe:
httpGet:
path: /
path: /metrics
port: 8081
initialDelaySeconds: 5
timeoutSeconds: 5
Expand Down
4 changes: 2 additions & 2 deletions examples/standard/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ spec:
- image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.12.0
livenessProbe:
httpGet:
path: /healthz
path: /livez
port: 8080
initialDelaySeconds: 5
timeoutSeconds: 5
Expand All @@ -36,7 +36,7 @@ spec:
name: telemetry
readinessProbe:
httpGet:
path: /
path: /metrics
port: 8081
initialDelaySeconds: 5
timeoutSeconds: 5
Expand Down
4 changes: 2 additions & 2 deletions jsonnet/kube-state-metrics/kube-state-metrics.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -193,11 +193,11 @@
},
livenessProbe: { timeoutSeconds: 5, initialDelaySeconds: 5, httpGet: {
port: 8080,
path: '/healthz',
path: '/livez',
} },
readinessProbe: { timeoutSeconds: 5, initialDelaySeconds: 5, httpGet: {
port: 8081,
path: '/',
path: '/metrics',
} },
};

Expand Down
35 changes: 29 additions & 6 deletions pkg/app/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,12 @@ import (
"strings"
"time"

"gopkg.in/yaml.v3"
"k8s.io/client-go/kubernetes"
_ "k8s.io/client-go/plugin/pkg/client/auth" // Initialize common client auth plugins.
"k8s.io/client-go/tools/clientcmd"
"k8s.io/klog/v2"

"github.com/oklog/run"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/collectors"
Expand All @@ -38,10 +44,6 @@ import (
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/prometheus/common/version"
"github.com/prometheus/exporter-toolkit/web"
"gopkg.in/yaml.v3"
_ "k8s.io/client-go/plugin/pkg/client/auth" // Initialize common client auth plugins.
"k8s.io/client-go/tools/clientcmd"
"k8s.io/klog/v2"

"k8s.io/kube-state-metrics/v2/internal/discovery"
"k8s.io/kube-state-metrics/v2/internal/store"
Expand All @@ -59,6 +61,7 @@ import (
const (
metricsPath = "/metrics"
healthzPath = "/healthz"
livezPath = "/livez"
)

// promLogger implements promhttp.Logger
Expand Down Expand Up @@ -321,7 +324,7 @@ func RunKubeStateMetrics(ctx context.Context, opts *options.Options) error {
WebConfigFile: &tlsConfig,
}

metricsMux := buildMetricsServer(m, durationVec)
metricsMux := buildMetricsServer(m, durationVec, kubeClient)
metricsServerListenAddress := net.JoinHostPort(opts.Host, strconv.Itoa(opts.Port))
metricsServer := http.Server{
Handler: metricsMux,
Expand Down Expand Up @@ -393,7 +396,7 @@ func buildTelemetryServer(registry prometheus.Gatherer) *http.ServeMux {
return mux
}

func buildMetricsServer(m *metricshandler.MetricsHandler, durationObserver prometheus.ObserverVec) *http.ServeMux {
func buildMetricsServer(m *metricshandler.MetricsHandler, durationObserver prometheus.ObserverVec, client kubernetes.Interface) *http.ServeMux {
mux := http.NewServeMux()

// TODO: This doesn't belong into serveMetrics
Expand All @@ -403,7 +406,23 @@ func buildMetricsServer(m *metricshandler.MetricsHandler, durationObserver prome
mux.Handle("/debug/pprof/symbol", http.HandlerFunc(pprof.Symbol))
mux.Handle("/debug/pprof/trace", http.HandlerFunc(pprof.Trace))

// Add metricsPath
mux.Handle(metricsPath, promhttp.InstrumentHandlerDuration(durationObserver, m))

// Add livezPath
//
// The handler reports 200 when the Kubernetes API server's own /livez endpoint
// can be queried without error through the supplied client, and 503 otherwise.
mux.Handle(livezPath, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
	// Query the Kube API to make sure we are not affected by a network outage.
	// The request's context is used (rather than context.Background()) so that the
	// upstream call is abandoned as soon as the prober disconnects or times out,
	// instead of piling up in the background during an outage.
	if err := client.CoreV1().RESTClient().Get().AbsPath("/livez").Do(r.Context()).Error(); err != nil {
		w.WriteHeader(http.StatusServiceUnavailable)
		// The status line is already committed; a failed body write cannot be
		// reported back to the client, so the error is deliberately discarded.
		_, _ = w.Write([]byte(http.StatusText(http.StatusServiceUnavailable)))
		return
	}
	w.WriteHeader(http.StatusOK)
	// See above: nothing useful can be done if writing the body fails.
	_, _ = w.Write([]byte(http.StatusText(http.StatusOK)))
}))

// Add healthzPath
mux.HandleFunc(healthzPath, func(w http.ResponseWriter, _ *http.Request) {
w.WriteHeader(http.StatusOK)
Expand All @@ -424,6 +443,10 @@ func buildMetricsServer(m *metricshandler.MetricsHandler, durationObserver prome
Address: healthzPath,
Text: "Healthz",
},
{
Address: livezPath,
Text: "Livez",
},
},
}
landingPage, err := web.NewLandingPage(landingConfig)
Expand Down

0 comments on commit d862cac

Please sign in to comment.