Skip to content

Commit

Permalink
colflow: introduce a cluster setting for max retries of FD acquisition
Browse files Browse the repository at this point in the history
We recently introduced a mechanism of retrying the acquision of file
descriptors needed for the disk-spilling queries to be able to get out
of a deadlock. We hard-coded the number of retries at 8, and this commit
makes that number configurable via a cluster setting (the idea is that
some users might be ok retrying for longer, so they will have an option
to do that). This cluster setting will also be the escape hatch to the
previous behavior (indefinite wait on `Acquire`) when the setting is set
to zero.

Release note: None
  • Loading branch information
yuzefovich committed Jul 20, 2022
1 parent bfe4fa6 commit 9d9bf55
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 20 deletions.
1 change: 1 addition & 0 deletions pkg/sql/colflow/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ go_library(
"//pkg/col/coldataext",
"//pkg/roachpb",
"//pkg/rpc/nodedialer",
"//pkg/settings",
"//pkg/sql/catalog/descs",
"//pkg/sql/colcontainer",
"//pkg/sql/colexec",
Expand Down
34 changes: 20 additions & 14 deletions pkg/sql/colflow/vectorized_flow.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/col/coldataext"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/rpc/nodedialer"
"github.com/cockroachdb/cockroach/pkg/settings"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/descs"
"github.com/cockroachdb/cockroach/pkg/sql/colcontainer"
"github.com/cockroachdb/cockroach/pkg/sql/colexec"
Expand Down Expand Up @@ -62,9 +63,9 @@ import (
// but due to the method name conflict doesn't.
type fdCountingSemaphore struct {
semaphore.Semaphore
globalCount *metric.Gauge
count int64
testingAcquireMaxRetries int
globalCount *metric.Gauge
count int64
acquireMaxRetries int
}

var fdCountingSemaphorePool = sync.Pool{
Expand All @@ -74,12 +75,12 @@ var fdCountingSemaphorePool = sync.Pool{
}

func newFDCountingSemaphore(
sem semaphore.Semaphore, globalCount *metric.Gauge, testingAcquireMaxRetries int,
sem semaphore.Semaphore, globalCount *metric.Gauge, sv *settings.Values,
) *fdCountingSemaphore {
s := fdCountingSemaphorePool.Get().(*fdCountingSemaphore)
s.Semaphore = sem
s.globalCount = globalCount
s.testingAcquireMaxRetries = testingAcquireMaxRetries
s.acquireMaxRetries = int(fdCountingSemaphoreMaxRetries.Get(sv))
return s
}

Expand All @@ -89,6 +90,16 @@ var errAcquireTimeout = pgerror.New(
"COCKROACH_VEC_MAX_OPEN_FDS environment variable",
)

var fdCountingSemaphoreMaxRetries = settings.RegisterIntSetting(
settings.TenantWritable,
"sql.distsql.acquire_vec_fds.max_retries",
"determines the number of retries performed during the acquisition of "+
"file descriptors needed for disk-spilling operations, set to 0 for "+
"unlimited retries",
8,
settings.NonNegativeInt,
)

func (s *fdCountingSemaphore) Acquire(ctx context.Context, n int) error {
if s.TryAcquire(n) {
return nil
Expand All @@ -108,17 +119,13 @@ func (s *fdCountingSemaphore) Acquire(ctx context.Context, n int) error {
// so the initial backoff time of 100ms seems ok (we are spilling to disk
// after all, so the query is likely to experience significant latency). The
// current choice of options is such that we'll spend on the order of 25s
// in the retry loop before timing out.
maxRetries := s.testingAcquireMaxRetries
if maxRetries <= 0 {
// Make sure that the retry loop is finite.
maxRetries = 8
}
// in the retry loop before timing out with the default value of the
// 'sql.distsql.acquire_vec_fds.max_retries' cluster settings.
opts := retry.Options{
InitialBackoff: 100 * time.Millisecond,
Multiplier: 2.0,
RandomizationFactor: 0.25,
MaxRetries: maxRetries,
MaxRetries: s.acquireMaxRetries,
}
for r := retry.StartWithCtx(ctx, opts); r.Next(); {
if s.TryAcquire(n) {
Expand Down Expand Up @@ -245,8 +252,7 @@ func (f *vectorizedFlow) Setup(
}
flowCtx := f.GetFlowCtx()
f.countingSemaphore = newFDCountingSemaphore(
f.Cfg.VecFDSemaphore, f.Cfg.Metrics.VecOpenFDs,
flowCtx.TestingKnobs().VecFDsAcquireMaxRetriesCount,
f.Cfg.VecFDSemaphore, f.Cfg.Metrics.VecOpenFDs, &flowCtx.EvalCtx.Settings.SV,
)
f.creator = newVectorizedFlowCreator(
helper,
Expand Down
5 changes: 3 additions & 2 deletions pkg/sql/colflow/vectorized_flow_deadlock_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,6 @@ func TestVectorizedFlowDeadlocksWhenSpilling(t *testing.T) {
// Set the testing knob so that the first operator to spill would
// use up the whole FD limit.
VecFDsToAcquire: vecFDsLimit,
// Allow just one retry to speed up the test.
VecFDsAcquireMaxRetriesCount: 1,
}},
}
tc := testcluster.StartTestCluster(t, 1, base.TestClusterArgs{ServerArgs: serverArgs})
Expand All @@ -60,6 +58,9 @@ func TestVectorizedFlowDeadlocksWhenSpilling(t *testing.T) {
// disk.
_, err = conn.ExecContext(ctx, "SET distsql_workmem = '1KiB'")
require.NoError(t, err)
// Allow just one retry to speed up the test.
_, err = conn.ExecContext(ctx, "SET CLUSTER SETTING sql.distsql.acquire_vec_fds.max_retries = 1")
require.NoError(t, err)

queryCtx, queryCtxCancel := context.WithDeadline(ctx, timeutil.Now().Add(10*time.Second))
defer queryCtxCancel()
Expand Down
4 changes: 0 additions & 4 deletions pkg/sql/execinfra/server_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -248,10 +248,6 @@ type TestingKnobs struct {
// that should be acquired by a single disk-spilling operator in the
// vectorized engine.
VecFDsToAcquire int
// VecFDsAcquireMaxRetriesCount, if positive, determines the maximum number
// of retries done when acquiring the file descriptors for a disk-spilling
// operator in the vectorized engine.
VecFDsAcquireMaxRetriesCount int

// TableReaderBatchBytesLimit, if not 0, overrides the limit that the
// TableReader will set on the size of results it wants to get for individual
Expand Down

0 comments on commit 9d9bf55

Please sign in to comment.