From 9bc1393f5c860e790997eb727eef42f335a7e591 Mon Sep 17 00:00:00 2001 From: Rebecca Taft Date: Wed, 4 Jan 2023 16:46:31 +0000 Subject: [PATCH] workload: fix non-determinism in TPC-H data generation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit fixes the non-determinism in the TPC-H data generation by using a local slice for the random permutation of indexes into randPartNames. It also fixes the permutation algorithm to use a modified Fisher–Yates shuffle. Fixes #93958 Release note: None --- pkg/ccl/workloadccl/allccl/all_test.go | 4 +--- pkg/workload/tpch/generate.go | 2 +- pkg/workload/tpch/random.go | 13 +++++++++---- pkg/workload/tpch/random_test.go | 6 +----- pkg/workload/tpch/tpch.go | 10 +--------- 5 files changed, 13 insertions(+), 22 deletions(-) diff --git a/pkg/ccl/workloadccl/allccl/all_test.go b/pkg/ccl/workloadccl/allccl/all_test.go index 06cce9643b4d..5946d800c00e 100644 --- a/pkg/ccl/workloadccl/allccl/all_test.go +++ b/pkg/ccl/workloadccl/allccl/all_test.go @@ -253,8 +253,6 @@ func hashTableInitialData( func TestDeterministicInitialData(t *testing.T) { defer leaktest.AfterTest(t)() - skip.WithIssue(t, 93958, "flaky test") - // There are other tests that run initial data generation under race, so we // don't get anything from running this one under race as well. skip.UnderRace(t, "uninteresting under race") @@ -280,7 +278,7 @@ func TestDeterministicInitialData(t *testing.T) { `sqlsmith`: 0xcbf29ce484222325, `startrek`: 0xa0249fbdf612734c, `tpcc`: 0xab32e4f5e899eb2f, - `tpch`: 0xe013881749bb67e8, + `tpch`: 0xe4fd28db230b9149, `ycsb`: 0x1244ea1c29ef67f6, } diff --git a/pkg/workload/tpch/generate.go b/pkg/workload/tpch/generate.go index 4c3af60dd9aa..fe216e8621f2 100644 --- a/pkg/workload/tpch/generate.go +++ b/pkg/workload/tpch/generate.go @@ -156,7 +156,7 @@ func (w *tpch) tpchPartInitialRowBatch(batchIdx int, cb coldata.Batch, a *bufall // P_PARTKEY unique within [SF * 200,000]. cb.ColVec(0).Int64()[0] = int64(partKey) // P_NAME generated by concatenating five unique randomly selected part name strings. - cb.ColVec(1).Bytes().Set(0, randPartName(rng, l.namePerm, a)) + cb.ColVec(1).Bytes().Set(0, randPartName(rng, a)) m, mfgr := randMfgr(rng, a) // P_MFGR text appended with digit ["Manufacturer#",M], where M = random value [1,5]. cb.ColVec(2).Bytes().Set(0, mfgr) // diff --git a/pkg/workload/tpch/random.go b/pkg/workload/tpch/random.go index 039e390296be..eb36f9a5779d 100644 --- a/pkg/workload/tpch/random.go +++ b/pkg/workload/tpch/random.go @@ -125,11 +125,16 @@ const nPartNames = 5 // randPartName concatenates 5 random unique strings from randPartNames, separated // by spaces. -func randPartName(rng *rand.Rand, namePerm []int, a *bufalloc.ByteAllocator) []byte { - // do nPartNames iterations of rand.Perm, to get a random 5-subset of the - // indexes into randPartNames. +func randPartName(rng *rand.Rand, a *bufalloc.ByteAllocator) []byte { + namePerm := make([]int, len(randPartNames)) + for i := range namePerm { + namePerm[i] = i + } + // Create a random 5-subset of the indexes into randPartNames using a modified + // Fisher–Yates shuffle. for i := 0; i < nPartNames; i++ { - j := rng.Intn(len(namePerm)) + // N.B. Correctness requires that i <= j < len(namePerm) + j := rng.Intn(len(namePerm)-i) + i namePerm[i], namePerm[j] = namePerm[j], namePerm[i] } var buf []byte diff --git a/pkg/workload/tpch/random_test.go b/pkg/workload/tpch/random_test.go index 00e926838f23..65fd1f75ca88 100644 --- a/pkg/workload/tpch/random_test.go +++ b/pkg/workload/tpch/random_test.go @@ -25,11 +25,7 @@ func TestRandPartName(t *testing.T) { rng := rand.New(rand.NewSource(uint64(timeutil.Now().UnixNano()))) seen := make(map[string]int) runOneRound := func() { - namePerm := make([]int, len(randPartNames)) - for i := range namePerm { - namePerm[i] = i - } - res := randPartName(rng, namePerm, &a) + res := randPartName(rng, &a) names := strings.Split(string(res), " ") assert.Equal(t, len(names), nPartNames) seenLocal := make(map[string]int) diff --git a/pkg/workload/tpch/tpch.go b/pkg/workload/tpch/tpch.go index 71773f70de08..cb02b39cabc8 100644 --- a/pkg/workload/tpch/tpch.go +++ b/pkg/workload/tpch/tpch.go @@ -190,9 +190,6 @@ func (w *tpch) Hooks() workload.Hooks { type generateLocals struct { rng *rand.Rand - // namePerm is a slice of ordinals into randPartNames. - namePerm []int - orderData *orderSharedRandomData } @@ -201,13 +198,8 @@ func (w *tpch) Tables() []workload.Table { if w.localsPool == nil { w.localsPool = &sync.Pool{ New: func() interface{} { - namePerm := make([]int, len(randPartNames)) - for i := range namePerm { - namePerm[i] = i - } return &generateLocals{ - rng: rand.New(rand.NewSource(uint64(timeutil.Now().UnixNano()))), - namePerm: namePerm, + rng: rand.New(rand.NewSource(uint64(timeutil.Now().UnixNano()))), orderData: &orderSharedRandomData{ partKeys: make([]int, 0, 7), shipDates: make([]int64, 0, 7),