Skip to content

Commit

Permalink
roachprod: provide workaround for long-running AWS clusters
Browse files Browse the repository at this point in the history
In cockroachdb#98076, we started validating hostnames before running any commands
to avoid situations where a stale cache could lead to unintended
interference with other clusters due to public IP reuse. The check
relies on the VM's `hostname` matching the expected cluster name in
the cache. GCP and Azure clusters set the hostname to the instance
name by default, but that is not the case for AWS; the aforementioned
PR explicitly sets the hostname when the instance is created.

However, in the case of long running AWS clusters (created before host
validation was introduced) or clusters that are created with an
outdated version of `roachprod`, the hostname will still be the
default AWS hostname, and any interaction with that cluster will fail
if using a recent `roachprod` version. To remedy this situation, this
commit includes:

* better error reporting. When we attempt to run a command on an AWS
cluster and host validation fails, we display a message to the user
explaining that their hostnames may need fixing.

* if the user confirms that the cluster still exists (by running
`roachprod list`), they are able to automatically fix the hostnames to
the expected value by running a new `fix-long-running-aws-hostnames`
command. This is a temporary workaround that should be removed once we
no longer have clusters that would be affected by this issue.

This commit will be reverted once we no longer have clusters created
with the default hostnames; this will be easier to achieve once we
have an easy way for everyone to upgrade their `roachprod` (see cockroachdb#97311).

Epic: none

Release note: None
  • Loading branch information
renatolabs committed Mar 15, 2023
1 parent ddc737e commit e6cf25d
Show file tree
Hide file tree
Showing 4 changed files with 84 additions and 3 deletions.
13 changes: 13 additions & 0 deletions pkg/cmd/roachprod/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -1050,6 +1050,18 @@ var storageSnapshotCmd = &cobra.Command{
}),
}

var fixLongRunningAWSHostnamesCmd = &cobra.Command{
Use: "fix-long-running-aws-hostnames <cluster>",
Short: "changes the hostnames of VMs in long-running AWS clusters",
Long: `This is a temporary workaround, and will be removed once we no longer ` +
`have AWS clusters that were created with the default hostname.`,

Args: cobra.ExactArgs(1),
Run: wrap(func(cmd *cobra.Command, args []string) (retErr error) {
return roachprod.FixLongRunningAWSHostnames(context.Background(), roachprodLibraryLogger, args[0])
}),
}

func main() {
loggerCfg := logger.Config{Stdout: os.Stdout, Stderr: os.Stderr}
var loggerError error
Expand Down Expand Up @@ -1105,6 +1117,7 @@ func main() {
grafanaDumpCmd,
grafanaURLCmd,
rootStorageCmd,
fixLongRunningAWSHostnamesCmd,
)
setBashCompletionFunction()

Expand Down
48 changes: 45 additions & 3 deletions pkg/roachprod/install/cluster_synced.go
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,19 @@ func (c *SyncedCluster) roachprodEnvRegex(node Node) string {
// By wrapping every command with a hostname check as is done here, we
// ensure that the cached cluster information is still correct.
func (c *SyncedCluster) validateHostnameCmd(cmd string, node Node) string {
// TODO(renato): remove this logic once we no longer have AWS clusters
// created with the default hostnames.
var awsNote string
nodeVM := c.VMs[node-1]
if nodeVM.Provider == aws.ProviderName {
awsNote = fmt.Sprintf(
"\nNOTE: host validation failed in AWS cluster. If you are sure this cluster still "+
"exists (i.e., you can see it when you run 'roachprod list'), then please run:\n\t"+
"roachprod fix-long-running-aws-hostnames %s",
c.Name,
)
}

isValidHost := fmt.Sprintf("[[ `hostname` == '%s' ]]", vm.Name(c.Name, int(node)))
errMsg := fmt.Sprintf("expected host to be part of %s, but is `hostname`", c.Name)
elseBranch := "fi"
Expand All @@ -343,10 +356,10 @@ fi

return fmt.Sprintf(`
if ! %s; then
echo "%s"
echo "%s%s"
exit 1
%s
`, isValidHost, errMsg, elseBranch)
`, isValidHost, errMsg, awsNote, elseBranch)
}

// validateHost will run `validateHostnameCmd` on the node passed to
Expand All @@ -373,12 +386,15 @@ func (c *SyncedCluster) newSession(
node: node,
user: c.user(node),
host: c.Host(node),
cmd: c.validateHostnameCmd(cmd, node),
cmd: cmd,
}

for _, opt := range options {
opt(command)
}
if !command.hostValidationDisabled {
command.cmd = c.validateHostnameCmd(cmd, node)
}
return newRemoteSession(l, command)
}

Expand Down Expand Up @@ -2542,6 +2558,32 @@ func (c *SyncedCluster) Init(ctx context.Context, l *logger.Logger, node Node) e
return nil
}

// FixLongRunningAWSHostnames updates the hostname on an AWS cluster
// that was created with the default hostname.
//
// TODO(renato): remove this function (and corresponding roachprod
// command) once we no longer have clusters created with the default
// hostnames.
func (c *SyncedCluster) FixLongRunningAWSHostnames(ctx context.Context, l *logger.Logger) error {
for i, nodeVM := range c.VMs {
node := Node(i + 1)
newHostname := vm.Name(c.Name, int(node))
if nodeVM.Provider == aws.ProviderName {
l.Printf("changing hostname of VM %d", node)
cmd := fmt.Sprintf("sudo hostnamectl set-hostname %s", newHostname)
s := c.newSession(l, node, cmd, withHostValidationDisabled())
defer s.Close()
out, err := s.CombinedOutput(ctx)
if err != nil {
return fmt.Errorf("could not fix hostname for node %d: %w\n%s", node, err, string(out))
}
l.Printf("hostname successfully changed to %s", newHostname)
}
}

return nil
}

// GenFilenameFromArgs given a list of cmd args, returns an alphahumeric string up to
// `maxLen` in length with hyphen delimiters, suitable for use in a filename.
// e.g. ["/bin/bash", "-c", "'sudo dmesg > dmesg.txt'"] -> binbash-c-sudo-dmesg
Expand Down
10 changes: 10 additions & 0 deletions pkg/roachprod/install/session.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,10 @@ type remoteCommand struct {
cmd string
debugDisabled bool
debugName string

// TODO(renato): remove this option once we no longer have AWS
// clusters with default hostnames.
hostValidationDisabled bool
}

type remoteSessionOption = func(c *remoteCommand)
Expand All @@ -70,6 +74,12 @@ func withDebugName(name string) remoteSessionOption {
}
}

func withHostValidationDisabled() remoteSessionOption {
return func(c *remoteCommand) {
c.hostValidationDisabled = true
}
}

func newRemoteSession(l *logger.Logger, command *remoteCommand) *remoteSession {
var loggingArgs []string

Expand Down
16 changes: 16 additions & 0 deletions pkg/roachprod/roachprod.go
Original file line number Diff line number Diff line change
Expand Up @@ -1811,3 +1811,19 @@ func createAttachMountVolumes(
}
return nil
}

func FixLongRunningAWSHostnames(ctx context.Context, l *logger.Logger, clusterName string) error {
if err := LoadClusters(); err != nil {
return err
}
c, err := newCluster(l, clusterName)
if err != nil {
return err
}

if err := c.FixLongRunningAWSHostnames(ctx, l); err != nil {
return err
}
l.Printf("Done! You are now able to use your AWS cluster normally.")
return nil
}

0 comments on commit e6cf25d

Please sign in to comment.