Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Liveness probe support #389

Merged
merged 1 commit into from
Mar 26, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions cmd/node-termination-handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,12 @@ func main() {
log.Fatal().Err(err).Msg("Unable to instantiate observability metrics,")
}

err = observability.InitProbes(nthConfig.EnableProbes, nthConfig.ProbesPort, nthConfig.ProbesEndpoint)
if err != nil {
nthConfig.Print()
log.Fatal().Err(err).Msg("Unable to instantiate probes service,")
}

imds := ec2metadata.New(nthConfig.MetadataURL, nthConfig.MetadataTries)

interruptionEventStore := interruptioneventstore.New(nthConfig)
Expand Down
3 changes: 3 additions & 0 deletions config/helm/aws-node-termination-handler/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ Parameter | Description | Default
`logLevel` | Sets the log level (INFO, DEBUG, or ERROR) | `INFO`
`enablePrometheusServer` | If true, start an http server exposing `/metrics` endpoint for prometheus. | `false`
`prometheusServerPort` | Replaces the default HTTP port for exposing prometheus metrics. | `9092`
`enableProbesServer` | If true, start an http server exposing the `/healthz` endpoint for probes. | `false`
`probesServerPort` | Replaces the default HTTP port for exposing the probes endpoint. | `8080`
`probesServerEndpoint` | Replaces the default path at which the probes endpoint is served. | `/healthz`
`podMonitor.create` | if `true`, create a PodMonitor | `false`
`podMonitor.interval` | Prometheus scrape interval | `30s`
`podMonitor.sampleLimit` | Number of scraped samples accepted | `5000`
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,12 @@ spec:
value: {{ .Values.enablePrometheusServer | quote }}
- name: PROMETHEUS_SERVER_PORT
value: {{ .Values.prometheusServerPort | quote }}
- name: ENABLE_PROBES_SERVER
value: {{ .Values.enableProbesServer | quote }}
- name: PROBES_SERVER_PORT
value: {{ .Values.probesServerPort | quote }}
- name: PROBES_SERVER_ENDPOINT
value: {{ .Values.probesServerEndpoint | quote }}
resources:
{{- toYaml .Values.resources | nindent 12 }}
{{- if .Values.enablePrometheusServer }}
Expand All @@ -175,6 +181,13 @@ spec:
name: http-metrics
protocol: TCP
{{- end }}
{{- if .Values.enableProbesServer }}
ports:
- containerPort: {{ .Values.probesServerPort }}
hostPort: {{ .Values.probesServerPort }}
name: liveness-probe
protocol: TCP
{{- end }}
nodeSelector:
{{ include "aws-node-termination-handler.nodeSelectorTermsOs" . }}: linux
{{- with .Values.nodeSelector }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,12 @@ spec:
value: {{ .Values.enablePrometheusServer | quote }}
- name: PROMETHEUS_SERVER_PORT
value: {{ .Values.prometheusServerPort | quote }}
- name: ENABLE_PROBES_SERVER
value: {{ .Values.enableProbesServer | quote }}
- name: PROBES_SERVER_PORT
value: {{ .Values.probesServerPort | quote }}
- name: PROBES_SERVER_ENDPOINT
value: {{ .Values.probesServerEndpoint | quote }}
resources:
{{- toYaml .Values.resources | nindent 12 }}
{{- if .Values.enablePrometheusServer }}
Expand All @@ -149,6 +155,13 @@ spec:
name: http-metrics
protocol: TCP
{{- end }}
{{- if .Values.enableProbesServer }}
ports:
- containerPort: {{ .Values.probesServerPort }}
hostPort: {{ .Values.probesServerPort }}
name: liveness-probe
protocol: TCP
{{- end }}
nodeSelector:
{{ include "aws-node-termination-handler.nodeSelectorTermsOs" . }}: windows
{{- with .Values.nodeSelector }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,8 @@ spec:
value: {{ .Values.webhookProxy | quote }}
- name: ENABLE_PROMETHEUS_SERVER
value: {{ .Values.enablePrometheusServer | quote }}
- name: ENABLE_PROBES_SERVER
value: {{ .Values.enableProbesServer | quote }}
- name: ENABLE_SPOT_INTERRUPTION_DRAINING
value: "false"
- name: ENABLE_SCHEDULED_EVENT_DRAINING
Expand All @@ -130,6 +132,10 @@ spec:
value: {{ .Values.queueURL | quote }}
- name: PROMETHEUS_SERVER_PORT
value: {{ .Values.prometheusServerPort | quote }}
- name: PROBES_SERVER_PORT
value: {{ .Values.probesServerPort | quote }}
- name: PROBES_SERVER_ENDPOINT
value: {{ .Values.probesServerEndpoint | quote }}
- name: AWS_REGION
value: {{ .Values.awsRegion | quote }}
- name: AWS_ENDPOINT
Expand All @@ -155,6 +161,13 @@ spec:
name: http-metrics
protocol: TCP
{{- end }}
{{- if .Values.enableProbesServer }}
ports:
- containerPort: {{ .Values.probesServerPort }}
hostPort: {{ .Values.probesServerPort }}
name: liveness-probe
protocol: TCP
{{- end }}
nodeSelector:
{{ include "aws-node-termination-handler.nodeSelectorTermsOs" . }}: linux
{{- with .Values.nodeSelector }}
Expand Down
12 changes: 12 additions & 0 deletions config/helm/aws-node-termination-handler/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,14 @@ podLabels: {}
linuxPodLabels: {}
windowsPodLabels: {}

# liveness probe settings.
probes:
httpGet:
path: /healthz
port: 8080
initialDelaySeconds: 5
periodSeconds: 5

resources:
requests:
memory: "64Mi"
Expand Down Expand Up @@ -144,6 +152,10 @@ nodeSelectorTermsArch: ""
enablePrometheusServer: false
prometheusServerPort: 9092

enableProbesServer: false
probesServerPort: 8080
probesServerEndpoint: "/healthz"

tolerations:
- operator: "Exists"

Expand Down
13 changes: 13 additions & 0 deletions pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,13 @@ const (
// https://github.com/prometheus/prometheus/wiki/Default-port-allocations
prometheusPortDefault = 9092
prometheusPortConfigKey = "PROMETHEUS_SERVER_PORT"
// probes
enableProbesDefault = false
enableProbesConfigKey = "ENABLE_PROBES_SERVER"
probesPortDefault = 8080
probesPortConfigKey = "PROBES_SERVER_PORT"
probesEndpointDefault = "/healthz"
probesEndpointConfigKey = "PROBES_SERVER_ENDPOINT"
region = ""
awsRegionConfigKey = "AWS_REGION"
awsEndpointConfigKey = "AWS_ENDPOINT"
Expand Down Expand Up @@ -115,6 +122,9 @@ type Config struct {
UptimeFromFile string
EnablePrometheus bool
PrometheusPort int
EnableProbes bool
ProbesPort int
ProbesEndpoint string
AWSRegion string
AWSEndpoint string
QueueURL string
Expand Down Expand Up @@ -162,6 +172,9 @@ func ParseCliArgs() (config Config, err error) {
flag.StringVar(&config.UptimeFromFile, "uptime-from-file", getEnv(uptimeFromFileConfigKey, uptimeFromFileDefault), "If specified, read system uptime from the file path (useful for testing).")
flag.BoolVar(&config.EnablePrometheus, "enable-prometheus-server", getBoolEnv(enablePrometheusConfigKey, enablePrometheusDefault), "If true, a http server is used for exposing prometheus metrics in /metrics endpoint.")
flag.IntVar(&config.PrometheusPort, "prometheus-server-port", getIntEnv(prometheusPortConfigKey, prometheusPortDefault), "The port for running the prometheus http server.")
flag.BoolVar(&config.EnableProbes, "enable-probes-server", getBoolEnv(enableProbesConfigKey, enableProbesDefault), "If true, a http server is used for exposing probes in /healthz endpoint.")
flag.IntVar(&config.ProbesPort, "probes-server-port", getIntEnv(probesPortConfigKey, probesPortDefault), "The port for running the probes http server.")
flag.StringVar(&config.ProbesEndpoint, "probes-server-endpoint", getEnv(probesEndpointConfigKey, probesEndpointDefault), "If specified, use this endpoint to make liveness probe")
flag.StringVar(&config.AWSRegion, "aws-region", getEnv(awsRegionConfigKey, ""), "If specified, use the AWS region for AWS API calls")
flag.StringVar(&config.AWSEndpoint, "aws-endpoint", getEnv(awsEndpointConfigKey, ""), "[testing] If specified, use the AWS endpoint to make API calls")
flag.StringVar(&config.QueueURL, "queue-url", getEnv(queueURLConfigKey, ""), "Listens for messages on the specified SQS queue URL")
Expand Down
41 changes: 41 additions & 0 deletions pkg/observability/probes.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package observability

import (
"net"
"net/http"
"strconv"
"time"

"github.com/rs/zerolog/log"
)

// InitProbes starts an HTTP server that answers liveness probes.
//
// When enabled is false it does nothing and returns nil. Otherwise it serves
// livenessHandler at endpoint on the given port from a background goroutine
// and returns immediately.
//
// The handler is registered on a dedicated ServeMux instead of
// http.DefaultServeMux, so the probes port exposes only the probe endpoint
// and a repeated call to InitProbes does not panic on a duplicate pattern
// registration.
//
// The returned error is currently always nil: failures to bind or serve
// happen inside the goroutine and are only logged.
func InitProbes(enabled bool, port int, endpoint string) error {
	if !enabled {
		return nil
	}

	mux := http.NewServeMux()
	mux.HandleFunc(endpoint, livenessHandler)

	probes := &http.Server{
		Addr:         net.JoinHostPort("", strconv.Itoa(port)),
		Handler:      mux,
		ReadTimeout:  1 * time.Second,
		WriteTimeout: 1 * time.Second,
	}

	// Serve in the background so the caller's start-up is not blocked.
	go func() {
		log.Info().Msgf("Starting to serve handler %s, port %d", endpoint, port)
		if err := probes.ListenAndServe(); err != nil && err != http.ErrServerClosed {
			log.Err(err).Msg("Failed to listen and serve http server")
		}
	}()

	return nil
}

// livenessHandler replies to any request with HTTP 200, a JSON content type
// and a fixed health payload. The request itself is not inspected.
func livenessHandler(rw http.ResponseWriter, _ *http.Request) {
	const payload = `{"health":"OK"}`

	rw.Header().Add("Content-Type", "application/json")
	rw.WriteHeader(http.StatusOK)

	// The write result is deliberately discarded.
	_, _ = rw.Write([]byte(payload))
}
30 changes: 30 additions & 0 deletions pkg/observability/probes_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
package observability

import (
"net/http"
"net/http/httptest"
"testing"
)

// TestLivenessHandler verifies that livenessHandler replies with HTTP 200,
// a JSON content type and the expected health payload.
func TestLivenessHandler(t *testing.T) {
	const (
		wantContentType = "application/json"
		wantBody        = `{"health":"OK"}`
	)

	req := httptest.NewRequest("GET", "/healthz", nil)
	rr := httptest.NewRecorder()

	http.HandlerFunc(livenessHandler).ServeHTTP(rr, req)

	if got := rr.Header().Get("Content-Type"); got != wantContentType {
		t.Errorf("handler returned wrong content type: got %v want %v",
			got, wantContentType)
	}

	if got := rr.Code; got != http.StatusOK {
		t.Errorf("handler returned wrong status code: got %v want %v",
			got, http.StatusOK)
	}

	// Report the expected payload itself on mismatch, not the status text.
	if got := rr.Body.String(); got != wantBody {
		t.Errorf("handler returned wrong body: got %v want %v",
			got, wantBody)
	}
}