Skip to content

Commit

Permalink
feat(healthcheck): Add better exporter health check
Browse files Browse the repository at this point in the history
Better logic around the exporter health.

Related #110

Signed-off-by: oluwole fadeyi <[email protected]>
  • Loading branch information
tfadeyi committed Feb 2, 2024
1 parent 736dd75 commit 549c307
Show file tree
Hide file tree
Showing 5 changed files with 124 additions and 5 deletions.
9 changes: 4 additions & 5 deletions deploy/charts/auth0-exporter/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -76,11 +76,10 @@ spec:
- "--web.listen-address"
- "{{- .Values.exporter.port }}"
{{- end }}
{{/* TODO add back in new release*/}}
{{/* {{- if .Values.exporter.timeout }}*/}}
{{/* - "--web.timeout"*/}}
{{/* - {{ .Values.exporter.timeout }}*/}}
{{/* {{- end }}*/}}
{{- /* Container args must be strings: quote the value so a purely numeric timeout is not rendered as a YAML integer. */ -}}
{{- if .Values.exporter.timeout }}
- "--web.timeout"
- {{ .Values.exporter.timeout | quote }}
{{- end }}
{{- if (not .Values.exporter.metrics.users.enabled) }}
- "--metrics.users.disabled"
{{- end }}
Expand Down
64 changes: 64 additions & 0 deletions pkg/barrier/barrier.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
package barrier

import (
"context"
"github.com/juju/errors"
"sync"
"time"
)

// Barrier is a mutex-guarded counter of remaining "good" budget.
//
// RecordBadEvent lowers Value by one (never below zero), refill raises it by
// one (never above Capacity), and IsBadState reports whether the budget has
// been used up.
type Barrier struct {
	// Capacity is the ceiling that refills bring Value back up to.
	Capacity int
	// Value is the budget currently left; New initialises it to Capacity.
	Value int
	// RefillPeriod is the interval at which Start restores one unit of Value.
	RefillPeriod time.Duration
	// mutex serialises access to Value from the methods on Barrier.
	mutex sync.RWMutex
}

// New builds a Barrier that starts full: both Capacity and Value are set to
// cap, and rate becomes the RefillPeriod used by Start.
func New(cap int, rate time.Duration) *Barrier {
	b := new(Barrier)
	b.Capacity = cap
	b.Value = cap
	b.RefillPeriod = rate
	return b
}

// RecordBadEvent consumes one unit of the barrier's budget.
// Once Value has reached zero (or is below it) the call is a no-op, so the
// counter is never pushed further down.
func (b *Barrier) RecordBadEvent() {
	b.mutex.Lock()
	defer b.mutex.Unlock()

	if b.Value > 0 {
		b.Value--
	}
}

// IsBadState reports whether the barrier's budget is exhausted, i.e. Value is
// zero or lower.
//
// The comparison is <= 0 rather than == 0 so that it agrees with the guard in
// RecordBadEvent: a Barrier whose Value is negative (for example one built
// with a negative capacity) is reported as bad instead of healthy.
func (b *Barrier) IsBadState() bool {
	// Only reads Value, so a shared lock is enough.
	b.mutex.RLock()
	defer b.mutex.RUnlock()
	return b.Value <= 0
}

// refill restores one unit of budget, stopping at Capacity.
//
// The guard is >= rather than ==: Capacity and Value are exported, so Value
// can end up above Capacity, and an equality check would then keep
// incrementing it without bound.
func (b *Barrier) refill() {
	b.mutex.Lock()
	defer b.mutex.Unlock()
	if b.Value >= b.Capacity {
		return
	}
	b.Value++
}

func (b *Barrier) Start(ctx context.Context) error {
if b.Value < 0 {
return errors.New("The instance cannot be initialised with a negative number")
}

for {
select {
case <-ctx.Done():
return nil
case <-time.Tick(b.RefillPeriod):

Check failure on line 60 in pkg/barrier/barrier.go

View workflow job for this annotation

GitHub Actions / lint

SA1015: using time.Tick leaks the underlying ticker, consider using it only in endless functions, tests and the main package, and use time.NewTicker here (staticcheck)
b.refill()
}
}
}
43 changes: 43 additions & 0 deletions pkg/barrier/barrier_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
package barrier

import (
"context"
"testing"
"time"

"github.com/stretchr/testify/assert"
)

func TestBarrier(t *testing.T) {
t.Parallel()
t.Run("successfully record bad event", func(t *testing.T) {
b := New(10, 1*time.Second)
b.RecordBadEvent()
assert.Equal(t, 9, b.Value)
})
t.Run("errors if the instance capacity is set to a negative number", func(t *testing.T) {
ctx := context.Background()
b := New(-1, 1*time.Second)
assert.Error(t, b.Start(ctx))
})
t.Run("record bad event doesn't go to a negative number", func(t *testing.T) {
b := New(0, 1*time.Second)
b.RecordBadEvent()
assert.Equal(t, 0, b.Value)
})
t.Run("isBadState returns true when instance reaches 0", func(t *testing.T) {
b := New(0, 1*time.Second)
assert.True(t, true, b.IsBadState())
})
t.Run("successfully refill barrier at the refill rate", func(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
b := New(10, 1*time.Second)
go b.Start(ctx)

Check failure on line 35 in pkg/barrier/barrier_test.go

View workflow job for this annotation

GitHub Actions / lint

Error return value of `b.Start` is not checked (errcheck)
b.RecordBadEvent()
b.RecordBadEvent()
assert.Equal(t, 8, b.Value)
time.Sleep(3 * time.Second)
cancel()
assert.Equal(t, 10, b.Value)
})
}
6 changes: 6 additions & 0 deletions pkg/exporter/exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"github.com/labstack/echo/v4"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/tfadeyi/auth0-simple-exporter/pkg/barrier"
"github.com/tfadeyi/auth0-simple-exporter/pkg/client"
"github.com/tfadeyi/auth0-simple-exporter/pkg/client/logs"
"github.com/tfadeyi/auth0-simple-exporter/pkg/exporter/metrics"
Expand All @@ -28,6 +29,8 @@ type (
// exporter
namespace string
subsystem string
// detects whether the exporter is in a bad state
state *barrier.Barrier
// checkpoint from where to start fetching logs
startTime time.Time
userMetricDisabled bool
Expand Down Expand Up @@ -63,6 +66,7 @@ func New(ctx context.Context, opts ...Option) *exporter {
namespace: "auth0",
subsystem: "",
ctx: ctx,
state: barrier.New(10, 5*time.Minute),
startTime: time.Now(),
targetScrapeRequestErrors: prometheus.NewCounter(
prometheus.CounterOpts{
Expand Down Expand Up @@ -149,6 +153,7 @@ func (e *exporter) collect(ctx context.Context, m *metrics.Metrics) error {
eventLogs := list.([]*management.Log)
e.logger.V(0).Error(err, "Request was terminated by Prometheus. The exporter could not finish polling the Auth0 log client to fetch the tenant logs."+
"Please try increase the prometheus scrape period", "logs_events_found", len(eventLogs), "from", e.startTime)
e.state.RecordBadEvent()
case errors.Is(err, context.DeadlineExceeded):
eventLogs := list.([]*management.Log)
e.logger.V(0).Error(err, "Request could not be completed in the current request timeout. The exporter could not finish polling the Auth0 log client to fetch the tenant logs."+
Expand Down Expand Up @@ -182,6 +187,7 @@ func (e *exporter) collect(ctx context.Context, m *metrics.Metrics) error {
eventUsers := list.([]*management.User)
e.logger.V(0).Error(err, "Request was terminated by Prometheus. The exporter could not finish polling the Auth0 user client to fetch the tenant users."+
"Please increase the prometheus scrape period ", "users_found", len(eventUsers))
e.state.RecordBadEvent()
case errors.Is(err, context.DeadlineExceeded):
eventUsers := list.([]*management.User)
e.logger.V(0).Error(err, "Request could not be completed in the current request timeout. The exporter could not finish polling the Auth0 user client to fetch the tenant users."+
Expand Down
7 changes: 7 additions & 0 deletions pkg/exporter/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,10 @@ func (e *exporter) Export() error {
</html>`, e.metricsAddr))
})
server.GET("/healthz", func(ctx echo.Context) error {
// check if the exporter might be stuck
if e.state.IsBadState() {
return echo.ErrInternalServerError
}
return ctx.JSON(http.StatusOK, "ok")
})

Expand Down Expand Up @@ -112,6 +116,9 @@ func (e *exporter) Export() error {
return server.Start(fmt.Sprintf(":%d", e.hostPort))
})
}
grp.Go(func() error {
return e.state.Start(ctx)
})
grp.Go(func() error {
<-ctx.Done()
return server.Shutdown(context.Background())
Expand Down

0 comments on commit 549c307

Please sign in to comment.