Skip to content

Commit

Permalink
[dbnode] Add reason tag to bootstrap retries metric (#3317)
Browse files Browse the repository at this point in the history
  • Loading branch information
vpranckaitis authored Mar 5, 2021
1 parent 0068083 commit 3e78412
Show file tree
Hide file tree
Showing 4 changed files with 228 additions and 23 deletions.
125 changes: 107 additions & 18 deletions src/dbnode/integration/bootstrap_retries_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,24 +24,27 @@ package integration

import (
"errors"
"strings"
"testing"
"time"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/uber-go/tally"

"github.com/m3db/m3/src/dbnode/namespace"
"github.com/m3db/m3/src/dbnode/retention"
"github.com/m3db/m3/src/dbnode/storage"
"github.com/m3db/m3/src/dbnode/storage/bootstrap"
"github.com/m3db/m3/src/dbnode/storage/bootstrap/bootstrapper"
"github.com/m3db/m3/src/x/context"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

func TestBootstrapRetriesDueToError(t *testing.T) {
// Setup the test bootstrapper to only proceed when a signal is sent.
signalCh := make(chan bool)

setup := bootstrapRetryTestSetup(t, func(
setup, testScope := bootstrapRetryTestSetup(t, func(
ctx context.Context,
namespaces bootstrap.Namespaces,
cache bootstrap.Cache,
Expand All @@ -51,14 +54,13 @@ func TestBootstrapRetriesDueToError(t *testing.T) {
return bootstrap.NamespaceResults{}, errors.New("error in bootstrapper")
}
// Mark all as fulfilled
noopNone := bootstrapper.NewNoOpAllBootstrapperProvider()
bs, err := noopNone.Provide()
bs, err := bootstrapper.NewNoOpAllBootstrapperProvider().Provide()
require.NoError(t, err)
return bs.Bootstrap(ctx, namespaces, cache)
})

go func() {
// Wait for server to start
// Wait for server to get started by the main test method.
setup.WaitUntilServerIsUp()

// First bootstrap pass. Bootstrapper produces an error. Check if DB is not marked bootstrapped.
Expand All @@ -70,7 +72,7 @@ func TestBootstrapRetriesDueToError(t *testing.T) {
signalCh <- false
assert.False(t, setup.DB().IsBootstrapped(), "database should not yet be bootstrapped")

// Still boostrap retry. Bootstrapper completes in-memory range without errors. DB finishes bootstrapping.
// Still bootstrap retry. Bootstrapper completes in-memory range without errors. DB finishes bootstrapping.
signalCh <- false
}()

Expand All @@ -80,13 +82,67 @@ func TestBootstrapRetriesDueToError(t *testing.T) {
}()

assert.True(t, setup.DB().IsBootstrapped(), "database should be bootstrapped")
assertRetryMetric(t, testScope, "other")
}

func TestBootstrapRetriesDueToObsoleteRanges(t *testing.T) {
// Setup the test bootstrapper to only proceed when a signal is sent.
signalCh := make(chan struct{})

setup, testScope := bootstrapRetryTestSetup(t, func(
ctx context.Context,
namespaces bootstrap.Namespaces,
cache bootstrap.Cache,
) (bootstrap.NamespaceResults, error) {
// read from signalCh twice so we could advance the clock exactly in between of those signals
<-signalCh
<-signalCh
bs, err := bootstrapper.NewNoOpAllBootstrapperProvider().Provide()
require.NoError(t, err)
return bs.Bootstrap(ctx, namespaces, cache)
})

go func() {
// Wait for server to get started by the main test method.
setup.WaitUntilServerIsUp()

// First bootstrap pass, persist ranges. Check if DB is not marked bootstrapped and advance clock.
signalCh <- struct{}{}
assert.False(t, setup.DB().IsBootstrapped(), "database should not yet be bootstrapped")
setup.SetNowFn(setup.NowFn()().Add(2 * time.Hour))
signalCh <- struct{}{}

// Still first bootstrap pass, in-memory ranges. Due to advanced clock previously calculated
// ranges are obsolete. Check if DB is not marked bootstrapped.
signalCh <- struct{}{}
assert.False(t, setup.DB().IsBootstrapped(), "database should not yet be bootstrapped")
signalCh <- struct{}{}

// Bootstrap retry, persist ranges. Check if DB isn't marked as bootstrapped on the second pass.
signalCh <- struct{}{}
assert.False(t, setup.DB().IsBootstrapped(), "database should not yet be bootstrapped")
signalCh <- struct{}{}

// Still bootstrap retry, in-memory ranges. DB finishes bootstrapping.
signalCh <- struct{}{}
assert.False(t, setup.DB().IsBootstrapped(), "database should not yet be bootstrapped")
signalCh <- struct{}{}
}()

require.NoError(t, setup.StartServer()) // Blocks until bootstrap is complete
defer func() {
require.NoError(t, setup.StopServer())
}()

assert.True(t, setup.DB().IsBootstrapped(), "database should be bootstrapped")
assertRetryMetric(t, testScope, "obsolete-ranges")
}

func TestBootstrapRetriesDueToUnfulfilledRanges(t *testing.T) {
// Setup the test bootstrapper to only proceed when a signal is sent.
signalCh := make(chan bool)

setup := bootstrapRetryTestSetup(t, func(
setup, testScope := bootstrapRetryTestSetup(t, func(
ctx context.Context,
namespaces bootstrap.Namespaces,
cache bootstrap.Cache,
Expand All @@ -104,7 +160,7 @@ func TestBootstrapRetriesDueToUnfulfilledRanges(t *testing.T) {
})

go func() {
// Wait for server to start
// Wait for server to get started by the main test method.
setup.WaitUntilServerIsUp()

// First bootstrap pass. Bootstrap produces unfulfilled ranges for persist range.
Expand Down Expand Up @@ -132,6 +188,8 @@ func TestBootstrapRetriesDueToUnfulfilledRanges(t *testing.T) {
}()

assert.True(t, setup.DB().IsBootstrapped(), "database should be bootstrapped")

assertRetryMetric(t, testScope, "other")
}

type bootstrapFn = func(
Expand All @@ -140,7 +198,9 @@ type bootstrapFn = func(
cache bootstrap.Cache,
) (bootstrap.NamespaceResults, error)

func bootstrapRetryTestSetup(t *testing.T, bootstrapFn bootstrapFn) TestSetup {
func bootstrapRetryTestSetup(t *testing.T, bootstrapFn bootstrapFn) (TestSetup, tally.TestScope) {
testScope := tally.NewTestScope("testScope", map[string]string{})

rOpts := retention.NewOptions().
SetRetentionPeriod(12 * time.Hour).
SetBufferPast(5 * time.Minute).
Expand All @@ -151,7 +211,9 @@ func bootstrapRetryTestSetup(t *testing.T, bootstrapFn bootstrapFn) TestSetup {
opts := NewTestOptions(t).
SetNamespaces([]namespace.Metadata{ns1})

setup, err := NewTestSetup(t, opts, nil)
setup, err := NewTestSetup(t, opts, nil, func(storageOpts storage.Options) storage.Options {
return storageOpts.SetInstrumentOptions(storageOpts.InstrumentOptions().SetMetricsScope(testScope))
})
require.NoError(t, err)
defer setup.Close()

Expand All @@ -160,16 +222,43 @@ func bootstrapRetryTestSetup(t *testing.T, bootstrapFn bootstrapFn) TestSetup {

bootstrapOpts = newDefaulTestResultOptions(setup.StorageOpts())
bootstrapperSourceOpts = testBootstrapperSourceOptions{read: bootstrapFn}
boostrapper = newTestBootstrapperSource(bootstrapperSourceOpts, bootstrapOpts, nil)

processOpts = bootstrap.NewProcessOptions().
SetTopologyMapProvider(setup).
SetOrigin(setup.Origin())
processOpts = bootstrap.NewProcessOptions().
SetTopologyMapProvider(setup).
SetOrigin(setup.Origin())
)
bootstrapOpts.SetInstrumentOptions(bootstrapOpts.InstrumentOptions().SetMetricsScope(testScope))
boostrapper := newTestBootstrapperSource(bootstrapperSourceOpts, bootstrapOpts, nil)

processProvider, err := bootstrap.NewProcessProvider(
boostrapper, processOpts, bootstrapOpts, fsOpts)
require.NoError(t, err)
setup.SetStorageOpts(setup.StorageOpts().SetBootstrapProcessProvider(processProvider))
return setup
return setup, testScope
}

func assertRetryMetric(t *testing.T, testScope tally.TestScope, expectedReason string) {
const (
metricName = "bootstrap-retries"
reasonTag = "reason"
)
valuesByReason := make(map[string]int)
for _, counter := range testScope.Snapshot().Counters() {
if strings.Contains(counter.Name(), metricName) {
reason := ""
if r, ok := counter.Tags()[reasonTag]; ok {
reason = r
}
valuesByReason[reason] = int(counter.Value())
}
}

val, ok := valuesByReason[expectedReason]
if assert.True(t, ok, "missing metric for expected reason") {
assert.Equal(t, 1, val)
}
for r, val := range valuesByReason {
if r != expectedReason {
assert.Equal(t, 0, val)
}
}
}
2 changes: 1 addition & 1 deletion src/dbnode/storage/bootstrap.go
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ func (m *bootstrapManager) Bootstrap() (BootstrapResult, error) {
// NB(r): Last bootstrap failed, since this could be due to transient
// failure we retry the bootstrap again. This is to avoid operators
// needing to manually intervene for cases where failures are transient.
m.instrumentation.bootstrapFailed(i + 1)
m.instrumentation.bootstrapFailed(i+1, bootstrapErr)
m.sleepFn(bootstrapRetryInterval)
continue
}
Expand Down
34 changes: 30 additions & 4 deletions src/dbnode/storage/bootstrap_instrumentation.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@ import (
"go.uber.org/zap"
"go.uber.org/zap/zapcore"

"github.com/m3db/m3/src/dbnode/storage/bootstrap"
"github.com/m3db/m3/src/x/clock"
xerrors "github.com/m3db/m3/src/x/errors"
"github.com/m3db/m3/src/x/ident"
"github.com/m3db/m3/src/x/instrument"
)
Expand Down Expand Up @@ -111,14 +113,33 @@ func (i *instrumentationContext) emitAndLogInvariantViolation(err error, msg str
})
}

type bootstrapRetriesMetrics struct {
obsoleteRanges tally.Counter
other tally.Counter
}

func newBootstrapRetriesMetrics(scope tally.Scope) bootstrapRetriesMetrics {
const metricName = "bootstrap-retries"

reason := func(reason string) map[string]string {
return map[string]string{
"reason": reason,
}
}
return bootstrapRetriesMetrics{
obsoleteRanges: scope.Tagged(reason("obsolete-ranges")).Counter(metricName),
other: scope.Tagged(reason("other")).Counter(metricName),
}
}

type bootstrapInstrumentation struct {
opts Options
scope tally.Scope
log *zap.Logger
nowFn clock.NowFn
status tally.Gauge
durableStatus tally.Gauge
numRetries tally.Counter
numRetries bootstrapRetriesMetrics
}

func newBootstrapInstrumentation(opts Options) *bootstrapInstrumentation {
Expand All @@ -130,7 +151,7 @@ func newBootstrapInstrumentation(opts Options) *bootstrapInstrumentation {
nowFn: opts.ClockOptions().NowFn(),
status: scope.Gauge("bootstrapped"),
durableStatus: scope.Gauge("bootstrapped-durable"),
numRetries: scope.Counter("bootstrap-retries"),
numRetries: newBootstrapRetriesMetrics(scope),
}
}

Expand All @@ -139,8 +160,13 @@ func (i *bootstrapInstrumentation) bootstrapPreparing() *instrumentationContext
return newInstrumentationContext(i.nowFn, i.log, i.scope, i.opts)
}

func (i *bootstrapInstrumentation) bootstrapFailed(retry int) {
i.numRetries.Inc(1)
func (i *bootstrapInstrumentation) bootstrapFailed(retry int, err error) {
numRetries := i.numRetries.other
if xerrors.Is(err, bootstrap.ErrFileSetSnapshotTypeRangeAdvanced) {
numRetries = i.numRetries.obsoleteRanges
}
numRetries.Inc(1)

i.log.Warn("retrying bootstrap after backoff",
zap.Duration("backoff", bootstrapRetryInterval),
zap.Int("numRetries", retry))
Expand Down
90 changes: 90 additions & 0 deletions src/dbnode/storage/bootstrap_instrumentation_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
// Copyright (c) 2021 Uber Technologies, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

package storage

import (
"errors"
"strings"
"testing"

"github.com/stretchr/testify/assert"
"github.com/uber-go/tally"

"github.com/m3db/m3/src/dbnode/storage/bootstrap"
xerrors "github.com/m3db/m3/src/x/errors"
)

func TestBootstrapRetryMetricReason(t *testing.T) {
tests := []struct {
name string
err error
expectedReason string
}{
{
name: "any error",
err: xerrors.NewInvalidParamsError(errors.New("some error")),
expectedReason: "other",
},
{
name: "obsolete ranges error",
err: xerrors.NewInvalidParamsError(bootstrap.ErrFileSetSnapshotTypeRangeAdvanced),
expectedReason: "obsolete-ranges",
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
testScope := tally.NewTestScope("testScope", map[string]string{})
options := NewOptions()
options = options.SetInstrumentOptions(options.InstrumentOptions().SetMetricsScope(testScope))
instr := newBootstrapInstrumentation(options)

instr.bootstrapFailed(1, tt.err)

retryMetrics := getBootstrapRetryMetrics(testScope)
assert.NotEmpty(t, retryMetrics)
assert.Equal(t, 1, retryMetrics[tt.expectedReason], "metric with reason:%v should be set to 1")
for reason, value := range retryMetrics {
if reason != tt.expectedReason {
assert.Equal(t, 0, value, "metric with reason:%v should stay 0")
}
}
})
}
}

func getBootstrapRetryMetrics(testScope tally.TestScope) map[string]int {
const (
metricName = "bootstrap-retries"
reasonTag = "reason"
)
valuesByReason := make(map[string]int)
for _, counter := range testScope.Snapshot().Counters() {
if strings.Contains(counter.Name(), metricName) {
reason := ""
if r, ok := counter.Tags()[reasonTag]; ok {
reason = r
}
valuesByReason[reason] = int(counter.Value())
}
}
return valuesByReason
}

0 comments on commit 3e78412

Please sign in to comment.