Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[dnm] repro for #106108 #106254

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 0 additions & 13 deletions pkg/cmd/roachtest/roachstress.sh
Original file line number Diff line number Diff line change
Expand Up @@ -100,19 +100,6 @@ if [ $# -gt 0 ] ; then
shift 1
fi

# Sanity-check the GCE project in use. You still need to set a non-default GCE project even if running on AWS.
if [ -z "${local}" ] && [ "${GCE_PROJECT-cockroach-ephemeral}" == "cockroach-ephemeral" ]; then
cat <<EOF
Please do not use roachstress on the cockroach-ephemeral project.
This may compete over quota with scheduled roachtest builds.
Use the andrei-jepsen project instead or reach out to dev-inf.

The project can be specified via the environment:
export GCE_PROJECT=XXX
EOF
exit 2
fi

# Define the artifacts base dir, within which both the built binaries and the
# artifacts will be stored.
sha=$(git rev-parse --short HEAD)
Expand Down
19 changes: 15 additions & 4 deletions pkg/cmd/roachtest/tests/failover.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ func registerFailover(r registry.Registry) {
Name: "failover/chaos" + suffix,
Owner: registry.OwnerKV,
Benchmark: true,
Timeout: 60 * time.Minute,
Timeout: 15 * time.Minute,
Cluster: r.MakeClusterSpec(10, spec.CPU(2), spec.PreferLocalSSD(false)), // uses disk stalls
Leases: leases,
SkipPostValidations: registry.PostValidationNoDeadNodes, // cleanup kills nodes
Expand Down Expand Up @@ -226,13 +226,24 @@ func runFailoverChaos(ctx context.Context, t test.Test, c cluster.Cluster, readO
// 100.000 keys.
var insertCount int
if readOnly {
insertCount = 100000
// insertCount = 100000
insertCount = 1 // HACK
}
t.L().Printf("creating workload database")
_, err := conn.ExecContext(ctx, `CREATE DATABASE kv`)
require.NoError(t, err)
c.Run(ctx, c.Node(10), fmt.Sprintf(
`./cockroach workload init kv --splits 1000 --insert-count %d {pgurl:1}`, insertCount))

m.Go(func(ctx context.Context) error {
c.Run(ctx, c.Node(10), fmt.Sprintf(
`./cockroach workload init kv --splits 1000 --insert-count %d {pgurl:1}`, insertCount))
time.Sleep(45*time.Second) // give deadlocked mutex time to explode
return nil
})
m.Wait()

if rand.Intn(100) < 100 { // avoid linters complaining about unreachable code...
return // HACK
}

// Scatter the ranges, then relocate them off of the SQL gateways n1-n2.
t.L().Printf("scattering table")
Expand Down
1 change: 1 addition & 0 deletions pkg/util/syncutil/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ go_test(
"int_map_bench_test.go",
"int_map_reference_test.go",
"int_map_test.go",
"mutex_stuck_test.go",
"mutex_sync_race_test.go", # keep
],
args = ["-test.timeout=55s"],
Expand Down
31 changes: 31 additions & 0 deletions pkg/util/syncutil/mutex_stuck_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
// Copyright 2023 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package syncutil

import (
"testing"
"time"
)

// TestStuckMutex is a manual repro: it is skipped unless the t.Skip call is
// removed by hand. When enabled, a background goroutine write-locks an
// RWMutex from inside three nested closures and never unlocks it, after which
// the test goroutine idles in one-second naps (9999 of them) so that whatever
// watches the mutex has time to react.
func TestStuckMutex(t *testing.T) {
	t.Skip("manual")

	var mu RWMutex
	go func() {
		// The extra closure levels only add depth to the acquiring call stack.
		func() {
			func() {
				mu.Lock() // oops
			}()
		}()
	}()

	const naps = 9999
	for remaining := naps; remaining > 0; remaining-- {
		time.Sleep(time.Second)
	}
}
48 changes: 47 additions & 1 deletion pkg/util/syncutil/mutex_sync.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,14 @@

package syncutil

import "sync"
import (
"fmt"
"os"
"runtime"
"strings"
"sync"
"time"
)

// DeadlockEnabled is true if the deadlock detector is enabled.
const DeadlockEnabled = false
Expand All @@ -37,6 +44,45 @@ func (m *Mutex) AssertHeld() {
// An RWMutex is a reader/writer mutual exclusion lock.
type RWMutex struct {
sync.RWMutex
pcs []uintptr
watching *time.Timer // from time.AfterFunc
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

anchor comment that I'll reference from elsewhere

}

// fatalWithStack writes the call stack identified by pcs to stderr, preceded
// by a "stuck mutex, acquired at:" banner, and then terminates the process
// with exit code 17. It never returns.
//
// pcs holds program counters in the form produced by runtime.Callers. Each
// frame is rendered as "file:line function" on its own line.
//
// Every frame is printed, including the last one: runtime.Frames.Next returns
// a valid frame even when it reports that no further frames follow, so the
// `more` flag is only consulted after the frame has been written.
func fatalWithStack(pcs []uintptr) {
	var buf strings.Builder
	// With no program counters Next would hand back a single zero Frame; skip
	// the walk entirely so that only the banner is printed.
	if len(pcs) > 0 {
		frames := runtime.CallersFrames(pcs)
		for {
			frame, more := frames.Next()
			// Writes to a strings.Builder cannot fail, so the error is ignored.
			_, _ = fmt.Fprintf(&buf, "%s:%d %s\n", frame.File, frame.Line, frame.Function)
			if !more {
				break
			}
		}
	}
	// Best effort: the process is about to exit, nothing useful can be done
	// if stderr is unwritable.
	_, _ = fmt.Fprintf(os.Stderr, "stuck mutex, acquired at:\n%s", buf.String())
	os.Exit(17)
}

// Lock acquires rw for writing and arms a watchdog for this acquisition.
//
// Once the embedded sync.RWMutex is held, the stack of the function that
// called Lock is captured into rw.pcs (the slice is allocated on first use
// with room for 8 program counters and reused afterwards). A timer is then
// started that, if it is still running 25 seconds later, calls
// fatalWithStack with the captured stack. The timer is stored in rw.watching.
func (rw *RWMutex) Lock() {
	rw.RWMutex.Lock()

	// Allocate the capture buffer lazily (also re-done if the previous capture
	// came back empty).
	if len(rw.pcs) == 0 {
		rw.pcs = make([]uintptr, 8)
	}

	// Skip two frames: runtime.Callers itself and this method.
	scratch := rw.pcs[:cap(rw.pcs)]
	depth := runtime.Callers(2, scratch)
	rw.pcs = scratch[:depth]

	const stuckAfter = 25 * time.Second
	onStuck := func() {
		fatalWithStack(rw.pcs) // data race but we have bigger problems!
	}
	rw.watching = time.AfterFunc(stuckAfter, onStuck)
}

// Unlock releases the write lock held on rw, first stopping and clearing the
// watchdog timer stored in rw.watching, if any.
//
// rw.watching can legitimately be nil here: the TryLock method promoted from
// the embedded sync.RWMutex acquires the write lock without arming a timer.
// The nil check keeps Unlock from dereferencing a nil *time.Timer in that
// case. Unlocking a mutex that is not locked is still a fatal error raised by
// sync.RWMutex.Unlock.
func (rw *RWMutex) Unlock() {
	if rw.watching != nil {
		rw.watching.Stop()
		rw.watching = nil
	}
	rw.RWMutex.Unlock()
}

// AssertHeld may panic if the mutex is not locked for writing (but it is not
Expand Down