Skip to content

Commit

Permalink
roachtest: unskip fuse disk-stall roachtest variant
Browse files Browse the repository at this point in the history
The `fuse` variant of the `disk-stalled` roachtest was skipped in
\cockroachdb#95865.

Re-enable the skipped variant, updating it to make use of our forked
version of `charybdefs`. This fork includes a patch that allows for
specifying a delay time for syscalls, making it possible to simulate a
complete disk stall. Previously, delay times were limited to 50ms, which
meant that the detection time had to be even lower (e.g. 40ms), which
was not representative of how Cockroach is configured in practice.

Allow the roachprod infrastructure to interpolate strings such as
`{store-dir}`, etc. when provided as `ExtraArgs` or the `KeyCmd`. Fix by
expanding expanding all args, rather than just `ExtraArgs`.

Fixes cockroachdb#95874.

Release note: None.
  • Loading branch information
nicktrav committed Feb 4, 2023
1 parent ca70b82 commit 6ed2fb0
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 33 deletions.
67 changes: 44 additions & 23 deletions pkg/cmd/roachtest/tests/disk_stall.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ import (
"github.com/stretchr/testify/require"
)

const maxSyncDur = 10 * time.Second

// registerDiskStalledDetection registers the disk stall test.
func registerDiskStalledDetection(r registry.Registry) {
stallers := map[string]func(test.Test, cluster.Cluster) diskStaller{
Expand All @@ -48,10 +50,10 @@ func registerDiskStalledDetection(r registry.Registry) {
r.Add(registry.TestSpec{
Name: fmt.Sprintf("disk-stalled/%s", name),
Owner: registry.OwnerStorage,
Cluster: r.MakeClusterSpec(4),
Cluster: r.MakeClusterSpec(4, spec.ReuseNone()),
Timeout: 20 * time.Minute,
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
runDiskStalledDetection(ctx, t, c, makeStaller(t, c))
runDiskStalledDetection(ctx, t, c, makeStaller(t, c), true /* doStall */)
},
// Encryption is implemented within the virtual filesystem layer,
// just like disk-health monitoring. It's important to exercise
Expand All @@ -69,31 +71,34 @@ func registerDiskStalledDetection(r registry.Registry) {
stallLogDir := stallLogDir
stallDataDir := stallDataDir
r.Add(registry.TestSpec{
Skip: "#95886",
SkipDetails: "The test current only induces a 50us disk-stall, which cannot be reliably detected.",
Name: fmt.Sprintf(
"disk-stalled/fuse/log=%t,data=%t",
stallLogDir, stallDataDir,
),
Owner: registry.OwnerStorage,
Cluster: r.MakeClusterSpec(4),
Cluster: r.MakeClusterSpec(4, spec.ReuseNone()),
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
runDiskStalledDetection(ctx, t, c, &fuseDiskStaller{
t: t,
c: c,
stallLogs: stallLogDir,
stallData: stallDataDir,
})
}, stallLogDir || stallDataDir /* doStall */)
},
EncryptionSupport: registry.EncryptionMetamorphic,
})
}
}
}

func runDiskStalledDetection(ctx context.Context, t test.Test, c cluster.Cluster, s diskStaller) {
const maxSyncDur = 10 * time.Second
func runDiskStalledDetection(
ctx context.Context, t test.Test, c cluster.Cluster, s diskStaller, doStall bool,
) {
startOpts := option.DefaultStartOpts()
startOpts.RoachprodOpts.ExtraArgs = []string{
"--store", s.DataDir(),
"--log", fmt.Sprintf(`{sinks: {stderr: {filter: INFO}}, file-defaults: {dir: "%s"}}`, s.LogDir()),
}
startSettings := install.MakeClusterSettings()
startSettings.Env = append(startSettings.Env,
"COCKROACH_AUTO_BALLAST=false",
Expand All @@ -109,11 +114,13 @@ func runDiskStalledDetection(ctx context.Context, t test.Test, c cluster.Cluster
c.Start(ctx, t.L(), startOpts, startSettings, c.Range(1, 3))

// Assert the process monotonic times are as expected.
start, ok := getProcessStartMonotonic(ctx, t, c, 1)
var ok bool
var start, exit time.Duration
start, ok = getProcessStartMonotonic(ctx, t, c, 1)
if !ok {
t.Fatal("unable to retrieve process start time; did Cockroach not start?")
}
if exit, exitOk := getProcessExitMonotonic(ctx, t, c, 1); exitOk {
if exit, ok = getProcessExitMonotonic(ctx, t, c, 1); ok && exit > 0 {
t.Fatalf("process has an exit monotonic time of %d; did Cockroach already exit?", exit)
}

Expand Down Expand Up @@ -165,7 +172,9 @@ func runDiskStalledDetection(ctx context.Context, t test.Test, c cluster.Cluster
t.L().PrintfCtx(ctx, "%.2f transactions completed before stall", totalTxnsPreStall)

t.Status("inducing write stall")
m.ExpectDeath()
if doStall {
m.ExpectDeath()
}
s.Stall(ctx, c.Node(1))
defer s.Unstall(ctx, c.Node(1))

Expand All @@ -181,8 +190,10 @@ func runDiskStalledDetection(ctx context.Context, t test.Test, c cluster.Cluster
t.Status("pinging SQL connection to n1")
err := n1Conn.PingContext(ctx)
t.L().PrintfCtx(ctx, "pinging n1's connection: %s", err)
if err == nil {
if doStall && err == nil {
t.Fatal("connection to n1 is still alive")
} else if !doStall && err != nil {
t.Fatalf("connection to n1 is dead: %s", err)
}
}

Expand Down Expand Up @@ -223,22 +234,29 @@ func runDiskStalledDetection(ctx context.Context, t test.Test, c cluster.Cluster
// Unstall the stalled node. It should be able to be reaped.
s.Unstall(ctx, c.Node(1))
time.Sleep(1 * time.Second)
exit, ok := getProcessExitMonotonic(ctx, t, c, 1)
if !ok {
t.Fatalf("unable to retrieve process exit time; stall went undetected")
exit, ok = getProcessExitMonotonic(ctx, t, c, 1)
if doStall {
if !ok {
t.Fatalf("unable to retrieve process exit time; stall went undetected")
}
t.L().PrintfCtx(ctx, "node exited at %s after test start\n", exit-start)
} else if ok && exit > 0 {
t.Fatal("no stall induced, but process exited")
}
t.L().PrintfCtx(ctx, "node exited at %s after test start\n", exit-start)

// Shut down the nodes, allowing any devices to be unmounted during cleanup.
c.Stop(ctx, t.L(), option.DefaultStopOpts(), c.Range(1, 3))
}

func getProcessStartMonotonic(
ctx context.Context, t test.Test, c cluster.Cluster, nodeID int,
) (time.Duration, bool) {
) (since time.Duration, ok bool) {
return getProcessMonotonicTimestamp(ctx, t, c, nodeID, "ActiveEnterTimestampMonotonic")
}

func getProcessExitMonotonic(
ctx context.Context, t test.Test, c cluster.Cluster, nodeID int,
) (time.Duration, bool) {
) (since time.Duration, ok bool) {
return getProcessMonotonicTimestamp(ctx, t, c, nodeID, "ActiveExitTimestampMonotonic")
}

Expand All @@ -262,7 +280,7 @@ func getProcessMonotonicTimestamp(
t.Fatalf("unable to parse monotonic timestamp %q: %s", parts[1], err)
}
if u == 0 {
return 0, false
return 0, true
}
return time.Duration(u) * time.Microsecond, true
}
Expand All @@ -287,7 +305,7 @@ func (s *dmsetupDiskStaller) device() string { return getDevice(s.t, s.c.Spec())

func (s *dmsetupDiskStaller) Setup(ctx context.Context) {
dev := s.device()
s.c.Run(ctx, s.c.All(), `sudo umount /mnt/data1`)
s.c.Run(ctx, s.c.All(), `sudo umount -f /mnt/data1 || true`)
s.c.Run(ctx, s.c.All(), `sudo dmsetup remove_all`)
s.c.Run(ctx, s.c.All(), `echo "0 $(sudo blockdev --getsz `+dev+`) linear `+dev+` 0" | `+
`sudo dmsetup create data1`)
Expand Down Expand Up @@ -377,7 +395,7 @@ func (s *cgroupDiskStaller) setThroughput(
))
}

// fuseDiskStaller uses a FUSE filesystem (charybdefs) to insert an artifical
// fuseDiskStaller uses a FUSE filesystem (charybdefs) to insert an artificial
// delay on all I/O.
type fuseDiskStaller struct {
t test.Test
Expand Down Expand Up @@ -417,11 +435,14 @@ func (s *fuseDiskStaller) Setup(ctx context.Context) {
}

func (s *fuseDiskStaller) Cleanup(ctx context.Context) {
s.c.Run(ctx, s.c.All(), "sudo umount {store-dir}/faulty")
s.c.Run(ctx, s.c.All(), "sudo umount -f {store-dir}/faulty || true")
}

func (s *fuseDiskStaller) Stall(ctx context.Context, nodes option.NodeListOption) {
s.c.Run(ctx, nodes, "charybdefs-nemesis --delay")
// Stall for 2x the max sync duration. The tool expects an integer
// representing the delay time, in microseconds.
stallMicros := (2 * maxSyncDur).Microseconds()
s.c.Run(ctx, nodes, "charybdefs-nemesis", "--delay", strconv.FormatInt(stallMicros, 10))
}

func (s *fuseDiskStaller) Unstall(ctx context.Context, nodes option.NodeListOption) {
Expand Down
40 changes: 32 additions & 8 deletions pkg/roachprod/install/cockroach.go
Original file line number Diff line number Diff line change
Expand Up @@ -420,10 +420,14 @@ func (c *SyncedCluster) generateStartCmd(
if err != nil {
return "", err
}
keyCmd, err := c.generateKeyCmd(ctx, l, node, startOpts)
if err != nil {
return "", err
}

return execStartTemplate(startTemplateData{
LogDir: c.LogDir(node),
KeyCmd: c.generateKeyCmd(node, startOpts),
KeyCmd: keyCmd,
EnvVars: append(append([]string{
fmt.Sprintf("ROACHPROD=%s", c.roachprodEnvValue(node)),
"GOTRACEBACK=crash",
Expand Down Expand Up @@ -552,16 +556,18 @@ func (c *SyncedCluster) generateStartArgs(
args = append(args, c.generateStartFlagsSQL()...)
}

args = append(args, startOpts.ExtraArgs...)

// Argument template expansion is node specific (e.g. for {store-dir}).
e := expander{
node: node,
}
for _, arg := range startOpts.ExtraArgs {
for i, arg := range args {
expandedArg, err := e.expand(ctx, l, c, arg)
if err != nil {
return nil, err
}
args = append(args, expandedArg)
args[i] = expandedArg
}

return args, nil
Expand All @@ -585,9 +591,14 @@ func (c *SyncedCluster) generateStartFlagsKV(node Node, startOpts StartOpts) []s
args = append(args, `--store`,
fmt.Sprintf(`path=%s,attrs=store%d:node%d:node%dstore%d`, storeDir, i, node, node, i))
}
} else {
} else if startOpts.ExtraArgs[idx] == "--store=" {
// The flag and path were provided together. Strip the flag prefix.
storeDir := strings.TrimPrefix(startOpts.ExtraArgs[idx], "--store=")
storeDirs = append(storeDirs, storeDir)
} else {
// Else, the store flag and path were specified as separate arguments. The
// path is the subsequent arg.
storeDirs = append(storeDirs, startOpts.ExtraArgs[idx+1])
}

if startOpts.EncryptedStores {
Expand Down Expand Up @@ -710,9 +721,11 @@ func (c *SyncedCluster) generateInitCmd(node Node) string {
return initCmd
}

func (c *SyncedCluster) generateKeyCmd(node Node, startOpts StartOpts) string {
func (c *SyncedCluster) generateKeyCmd(
ctx context.Context, l *logger.Logger, node Node, startOpts StartOpts,
) (string, error) {
if !startOpts.EncryptedStores {
return ""
return "", nil
}

var storeDirs []string
Expand All @@ -721,9 +734,14 @@ func (c *SyncedCluster) generateKeyCmd(node Node, startOpts StartOpts) string {
storeDir := c.NodeDir(node, i)
storeDirs = append(storeDirs, storeDir)
}
} else {
} else if startOpts.ExtraArgs[storeArgIdx] == "--store=" {
// The flag and path were provided together. Strip the flag prefix.
storeDir := strings.TrimPrefix(startOpts.ExtraArgs[storeArgIdx], "--store=")
storeDirs = append(storeDirs, storeDir)
} else {
// Else, the store flag and path were specified as separate arguments. The
// path is the subsequent arg.
storeDirs = append(storeDirs, startOpts.ExtraArgs[storeArgIdx+1])
}

// Command to create the store key.
Expand All @@ -735,7 +753,13 @@ func (c *SyncedCluster) generateKeyCmd(node Node, startOpts StartOpts) string {
openssl rand -out %[1]s/aes-128.key 48;
fi;`, storeDir)
}
return keyCmd.String()

e := expander{node: node}
expanded, err := e.expand(ctx, l, c, keyCmd.String())
if err != nil {
return "", err
}
return expanded, nil
}

func (c *SyncedCluster) useStartSingleNode() bool {
Expand Down
3 changes: 1 addition & 2 deletions pkg/roachprod/install/install.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,7 @@ sudo service cassandra stop;
sudo rm -rf "${charybde_dir}" "${nemesis_path}" /usr/local/bin/charybdefs{,-nemesis}
sudo mkdir -p "${charybde_dir}"
sudo chmod 777 "${charybde_dir}"
# TODO(bilal): Change URL back to scylladb/charybdefs once https://github.com/scylladb/charybdefs/pull/28 is merged.
git clone --depth 1 "https://github.com/itsbilal/charybdefs.git" "${charybde_dir}"
git clone --depth 1 --branch crl "https://github.com/cockroachdb/charybdefs.git" "${charybde_dir}"
cd "${charybde_dir}"
thrift -r --gen cpp server.thrift
Expand Down

0 comments on commit 6ed2fb0

Please sign in to comment.