-
Notifications
You must be signed in to change notification settings - Fork 3.8k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
kv: update kvprober with quarantine pool
These changes update the kvprober to add ranges that fail probing to a quarantine pool, where they are continuously probed. A metric that indicates the duration of the longest-tenured range in the pool has also been added. Resolves #74407 Release justification: low-risk, high-benefit changes to existing functionality. Release note: None
- Loading branch information
1 parent
34089c8
commit b3f9b08
Showing
4 changed files
with
166 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
// Copyright 2022 The Cockroach Authors. | ||
// | ||
// Use of this software is governed by the Business Source License | ||
// included in the file licenses/BSL.txt. | ||
// | ||
// As of the Change Date specified in that file, in accordance with | ||
// the Business Source License, use of this software will be governed | ||
// by the Apache License, Version 2.0, included in the file | ||
// licenses/APL.txt. | ||
|
||
// Package kvprober sends queries to KV in a loop, with configurable sleep | ||
// times, in order to generate data about the healthiness or unhealthiness of | ||
// kvclient & below. | ||
// | ||
// Prober increments metrics that SRE & other operators can use as alerting | ||
// signals. It also writes to logs to help narrow down the problem (e.g. which | ||
// range(s) are acting up). | ||
package kvprober | ||
|
||
import ( | ||
"context" | ||
"time" | ||
|
||
"github.com/cockroachdb/cockroach/pkg/roachpb" | ||
"github.com/cockroachdb/cockroach/pkg/settings/cluster" | ||
"github.com/cockroachdb/cockroach/pkg/util/log" | ||
"github.com/cockroachdb/cockroach/pkg/util/timeutil" | ||
"github.com/cockroachdb/errors" | ||
) | ||
|
||
type quarantinePool struct { | ||
steps []Step | ||
size int64 | ||
entryTimeMap map[roachpb.RangeID]time.Time | ||
} | ||
|
||
func newQuarantinePool(settings *cluster.Settings) *quarantinePool { | ||
poolSize := quarantinePoolSize.Get(&settings.SV) | ||
return &quarantinePool{ | ||
size: poolSize, | ||
entryTimeMap: make(map[roachpb.RangeID]time.Time), | ||
steps: make([]Step, poolSize), | ||
} | ||
} | ||
|
||
func (qp *quarantinePool) add(ctx context.Context, step Step) { | ||
if int64(len(qp.steps)) >= qp.size-1 { | ||
log.Health.Errorf(ctx, "cannot add range %s to quarantine pool, at capacity", step.RangeID.String()) | ||
} else { | ||
qp.steps = append(qp.steps, step) | ||
qp.entryTimeMap[step.RangeID] = timeutil.Now() | ||
} | ||
} | ||
|
||
func (qp *quarantinePool) remove(ctx context.Context, step Step) { | ||
if len(qp.steps) < 1 { | ||
log.Health.Errorf(ctx, "cannot remove range %s from quarantine pool, pool is empty", step.RangeID.String()) | ||
return | ||
} | ||
idx := -1 | ||
for k, v := range qp.steps { | ||
if v.RangeID == step.RangeID { | ||
idx = k | ||
break | ||
} | ||
} | ||
if idx == -1 { | ||
log.Health.Errorf(ctx, "cannot remove range %s from quarantine pool, not found", step.RangeID.String()) | ||
return | ||
} | ||
// Expensive op if pool size is very large. | ||
qp.steps = append(qp.steps[:idx], qp.steps[idx+1:]...) | ||
delete(qp.entryTimeMap, step.RangeID) | ||
} | ||
|
||
func (qp *quarantinePool) next(ctx context.Context) (Step, error) { | ||
if len(qp.steps) > 0 { | ||
step := qp.steps[0] | ||
return step, nil | ||
} | ||
return Step{}, errors.New("there are no keys in quarantine") | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters