Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

test(cosmovisor): fix typo daemon + fix flaky test #21487

Merged
merged 7 commits into from
Sep 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions tools/cosmovisor/process_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ import (
func TestLaunchProcess(t *testing.T) {
// binaries from testdata/validate directory
home := copyTestData(t, "validate")
cfg := &cosmovisor.Config{Home: home, Name: "dummyd", PollInterval: 20, UnsafeSkipBackup: true}
cfg := &cosmovisor.Config{Home: home, Name: "dummyd", PollInterval: 15, UnsafeSkipBackup: true}
logger := log.NewTestLogger(t).With(log.ModuleKey, "cosmosvisor")

// should run the genesis binary and produce expected output
Expand Down Expand Up @@ -145,7 +145,7 @@ func TestLaunchProcessWithRestartDelay(t *testing.T) {
func TestPlanShutdownGrace(t *testing.T) {
// binaries from testdata/validate directory
home := copyTestData(t, "dontdie")
cfg := &cosmovisor.Config{Home: home, Name: "dummyd", PollInterval: 20, UnsafeSkipBackup: true, ShutdownGrace: 2 * time.Second}
cfg := &cosmovisor.Config{Home: home, Name: "dummyd", PollInterval: 15, UnsafeSkipBackup: true, ShutdownGrace: 2 * time.Second}
logger := log.NewTestLogger(t).With(log.ModuleKey, "cosmosvisor")

// should run the genesis binary and produce expected output
Expand Down
40 changes: 29 additions & 11 deletions tools/cosmovisor/scanner.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ import (
)

type fileWatcher struct {
deamonHome string
daemonHome string
filename string // full path to a watched file
interval time.Duration

Expand Down Expand Up @@ -53,7 +53,7 @@ func newUpgradeFileWatcher(cfg *Config) (*fileWatcher, error) {
}

return &fileWatcher{
deamonHome: cfg.Home,
daemonHome: cfg.Home,
currentBin: bin,
filename: filenameAbs,
interval: cfg.PollInterval,
Expand Down Expand Up @@ -109,7 +109,32 @@ func (fw *fileWatcher) CheckUpdate(currentUpgrade upgradetypes.Plan) bool {

stat, err := os.Stat(fw.filename)
if err != nil {
// file doesn't exists
if os.IsNotExist(err) {
return false
} else {
panic(fmt.Errorf("failed to stat upgrade info file: %w", err))
}
}

// check https://github.com/cosmos/cosmos-sdk/issues/21086
// If new file is still empty, wait a small amount of time for write to complete
if stat.Size() == 0 {
for range 10 {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: do you think this could be improved with exponential backoff? I think that this is too arbitrary and might cause again flakyness in the test.

for i range 10 {
...
time.Sleep(math.Pow(2, float64(i)) * time.Millisecond)

If it were to fail all 10 times, we would have waited a total of ~2046ms. Let's maybe reduce the amount of retries to 5 (62ms).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can modify in a follow up PR. In practice the loop never runs more than a few times since the file is quickly finalized. The failing test was related more to 1) not waiting at all (fixed here) and 2) test case delay being too similar to timeout (already fixed).

In production, retry or waiting some amount of time is necessary to avoid reading empty files (race condition).

time.Sleep(2 * time.Millisecond)
stat, err = os.Stat(fw.filename)
if err != nil {
if os.IsNotExist(err) {
return false
} else {
panic(fmt.Errorf("failed to stat upgrade info file: %w", err))
}
}
if stat.Size() == 0 {
break
}
}
}
if stat.Size() == 0 {
return false
}

Expand All @@ -118,13 +143,6 @@ func (fw *fileWatcher) CheckUpdate(currentUpgrade upgradetypes.Plan) bool {
return false
}

// if fw.lastModTime.IsZero() { // check https://github.com/cosmos/cosmos-sdk/issues/21086
// // first initialization or daemon restart while upgrading-info.json exists.
// // it could be that it was just created and not fully written to disk.
// // wait tiniest bit of time to allow the file to be fully written.
// time.Sleep(2 * time.Millisecond)
// }

info, err := parseUpgradeInfoFile(fw.filename, fw.disableRecase)
if err != nil {
panic(fmt.Errorf("failed to parse upgrade info file: %w", err))
Expand Down Expand Up @@ -167,7 +185,7 @@ func (fw *fileWatcher) checkHeight() (int64, error) {
return 0, nil
}

result, err := exec.Command(fw.currentBin, "status", "--home", fw.deamonHome).CombinedOutput() //nolint:gosec // we want to execute the status command
result, err := exec.Command(fw.currentBin, "status", "--home", fw.daemonHome).CombinedOutput() //nolint:gosec // we want to execute the status command
if err != nil {
return 0, err
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@ sleep 1
test -z $4 && exit 1001
echo 'UPGRADE "Chain2" NEEDED at height: 49: {}'
echo '{"name":"Chain2","height":49,"info":""}' > $4
# Shutdown grace test waits 2 seconds for flush
# Flush within 1 second
sleep 1
echo 'Flushed'
sleep 1
# Now chain is halted for shutdown grace test.
sleep 2
echo Did not kill in time. Never should be printed!!!
Loading