Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cmd: wholesale replace roachprod, roachtest, workload from master #55865

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pkg/cmd/roachprod/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
roachprod
43 changes: 43 additions & 0 deletions pkg/cmd/roachprod/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,29 @@ marc-foo: 23h59m42s remaining
Syncing...
```

#### Choosing a Provider

Use the `--clouds` flag to set which cloud provider(s) to use. Ex:

```
$ roachprod create foo --clouds gce,aws
```

#### Node Distribution Options

There are a couple of flags that interact to create nodes in one zone or in
geographically distributed zones:

- `--geo`
- the `--[provider]-zones` flags (`--gce-zones`, `--aws-zones`, `--azure-locations`)

Here's what to expect when the options are combined:

- _If neither is set_: nodes are all placed within one of the provider's default zones
- _`--geo` only_: nodes are spread across the provider's default zones
- _`--[provider]-zones` or `--geo --[provider]-zones`_: nodes are spread across
all the specified zones

### Interact using crl-prod tools

`roachprod` populates hosts files in `~/.roachprod/hosts`. These are used by
Expand Down Expand Up @@ -150,6 +173,26 @@ OK

See `roachprod help <command>` for further details.

## Return Codes

`roachprod` uses return codes to provide information about the exit status.
These are the codes and what they mean:

- 0: everything ran as expected
- 1: an unclassified roachprod error
- 10: a problem with an SSH connection to a server in the cluster
- 20: a problem running a non-cockroach command on a remote cluster server or on a local node
- 30: a problem running a cockroach command on a remote cluster server or a local node

Each of these codes has a corresponding easy-to-search-for string that is
emitted to output when an error of that type occurs. The strings are emitted
near the end of output and for each error that happens during an ssh
connection to a remote cluster node. The strings for each error code are:

- 1: `UNCLASSIFIED_PROBLEM`
- 10: `SSH_PROBLEM`
- 20: `COMMAND_PROBLEM`
- 30: `DEAD_ROACH_PROBLEM`

# Future improvements

Expand Down
10 changes: 7 additions & 3 deletions pkg/cmd/roachprod/cloud/cluster_cloud.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ import (

"github.com/cockroachdb/cockroach/pkg/cmd/roachprod/config"
"github.com/cockroachdb/cockroach/pkg/cmd/roachprod/vm"
"github.com/pkg/errors"
"github.com/cockroachdb/errors"
)

const vmNameFormat = "user-<clusterid>-<nodeid>"
Expand Down Expand Up @@ -212,7 +212,7 @@ func ListCloud() (*Cloud, error) {
}

// CreateCluster TODO(peter): document
func CreateCluster(name string, nodes int, opts vm.CreateOpts) error {
func CreateCluster(nodes int, opts vm.CreateOpts) error {
providerCount := len(opts.VMProviders)
if providerCount == 0 {
return errors.New("no VMProviders configured")
Expand All @@ -222,7 +222,7 @@ func CreateCluster(name string, nodes int, opts vm.CreateOpts) error {
vmLocations := map[string][]string{}
for i, p := 1, 0; i <= nodes; i++ {
pName := opts.VMProviders[p]
vmName := vm.Name(name, i)
vmName := vm.Name(opts.ClusterName, i)
vmLocations[pName] = append(vmLocations[pName], vmName)

p = (p + 1) % providerCount
Expand All @@ -236,6 +236,10 @@ func CreateCluster(name string, nodes int, opts vm.CreateOpts) error {
// DestroyCluster tears down the VMs of cluster c by handing c.VMs to
// vm.FanOut together with a per-provider destroy callback. It returns
// whatever error vm.FanOut reports.
func DestroyCluster(c *Cluster) error {
	destroy := func(p vm.Provider, vms vm.List) error {
		// Providers that implement vm.DeleteCluster can remove the whole
		// cluster by name in one shot; prefer that fast path when present.
		if fast, ok := p.(vm.DeleteCluster); ok {
			return fast.DeleteCluster(c.Name)
		}
		// Otherwise delete the VMs handed to this provider.
		return p.Delete(vms)
	}
	return vm.FanOut(c.VMs, destroy)
}
Expand Down
29 changes: 13 additions & 16 deletions pkg/cmd/roachprod/cloud/gc.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,12 @@ import (
"github.com/cockroachdb/cockroach/pkg/cmd/roachprod/config"
"github.com/cockroachdb/cockroach/pkg/cmd/roachprod/vm"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/errors"
"github.com/nlopes/slack"
)

// errNoSlackClient is the sentinel error returned by findUserChannel when it
// is called with a nil Slack client. GCClusters checks for it with errors.Is
// so that a missing client is not logged as a delivery failure.
var errNoSlackClient = fmt.Errorf("no Slack client")

type status struct {
good []*Cluster
warn []*Cluster
Expand Down Expand Up @@ -100,23 +103,15 @@ func findChannel(client *slack.Client, name string) (string, error) {
}

func findUserChannel(client *slack.Client, email string) (string, error) {
if client != nil {
// TODO(peter): GetUserByEmail doesn't seem to work. Why?
users, err := client.GetUsers()
if err != nil {
return "", err
}
for _, user := range users {
if user.Profile.Email == email {
_, _, channelID, err := client.OpenIMChannel(user.ID)
if err != nil {
return "", err
}
return channelID, nil
}
}
if client == nil {
return "", errNoSlackClient
}
return "", fmt.Errorf("not found")
u, err := client.GetUserByEmail(email)
if err != nil {
return "", err
}
_, _, channelID, err := client.OpenIMChannel(u.ID)
return channelID, err
}

func postStatus(client *slack.Client, channel string, dryrun bool, s *status, badVMs vm.List) {
Expand Down Expand Up @@ -320,6 +315,8 @@ func GCClusters(cloud *Cloud, dryrun bool) error {
userChannel, err := findUserChannel(client, user+config.EmailDomain)
if err == nil {
postStatus(client, userChannel, dryrun, status, nil)
} else if !errors.Is(err, errNoSlackClient) {
log.Printf("could not deliver Slack DM to %s: %v", user+config.EmailDomain, err)
}
}
}
Expand Down
3 changes: 3 additions & 0 deletions pkg/cmd/roachprod/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,7 @@ const (
DefaultHostDir = "${HOME}/.roachprod/hosts"
EmailDomain = "@cockroachlabs.com"
Local = "local"

// SharedUser is the linux username for shared use on all vms.
SharedUser = "ubuntu"
)
5 changes: 5 additions & 0 deletions pkg/cmd/roachprod/docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Image for running roachprod; based on the golang:1.13 image.
FROM golang:1.13
WORKDIR /build
# Copy the build context into /build so build.sh and entrypoint.sh are available.
COPY . .
# Install the cloud CLIs and roachprod at image build time.
RUN ["/build/build.sh"]
# At container start, entrypoint.sh configures the CLIs from /secrets and then
# execs roachprod.
ENTRYPOINT ["/build/entrypoint.sh"]
11 changes: 11 additions & 0 deletions pkg/cmd/roachprod/docker/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Roachprod Docker Image

This dockerfile will build roachprod from master and create an image
with the supporting CLI toolchains. The entrypoint for the image will
configure the CLI tools using files to be mounted into the `/secrets`
directory.

The easiest way to build this is to run the following from this directory.
```
gcloud builds submit -t gcr.io/cockroach-dev-inf/cockroachlabs/roachprod:master
```
32 changes: 32 additions & 0 deletions pkg/cmd/roachprod/docker/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/bin/bash
# This script is used to build the docker image.
# It registers the Google Cloud SDK and Azure CLI apt repositories, installs
# the gcloud, AWS and Azure command-line tools, and finally fetches roachprod
# with `go get`.

# Stop at the first failing command, and treat a failure anywhere in a
# pipeline as a failure of the whole pipeline.
set -e
set -o pipefail

# Install AWS, Azure, GCP SDKs per
# https://cloud.google.com/sdk/docs/quickstart-debian-ubuntu
# https://docs.microsoft.com/en-us/cli/azure/install-azure-cli-apt?view=azure-cli-latest
# Register the Google Cloud SDK apt repository, pinned to its own keyring.
echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" |
tee -a /etc/apt/sources.list.d/google-cloud-sdk.list

# Import the Google package signing key into the keyring named by the
# signed-by option above.
curl https://packages.cloud.google.com/apt/doc/apt-key.gpg |
apt-key --keyring /usr/share/keyrings/cloud.google.gpg add -

# Azure
# lsb-release provides the lsb_release command used below to pick the
# distribution codename for the Azure repository.
apt-get update -y
apt-get install -y lsb-release

# Import the Microsoft package signing key into its own keyring.
curl -sL https://packages.microsoft.com/keys/microsoft.asc |
apt-key --keyring /usr/share/keyrings/microsoft.gpg add -

# Register the Azure CLI apt repository for this distribution's codename.
AZ_REPO=$(lsb_release -cs)
echo "deb [arch=amd64 signed-by=/usr/share/keyrings/microsoft.gpg] https://packages.microsoft.com/repos/azure-cli/ $AZ_REPO main" |
tee /etc/apt/sources.list.d/azure-cli.list

# Install packages and clean up
# The second update picks up the repositories registered above; the package
# lists are removed afterwards since they are not needed in the final image.
apt-get update -y
apt-get install google-cloud-sdk awscli azure-cli -y
rm -rf /var/lib/apt/lists/*

# Fetch and install roachprod itself.
go get github.com/cockroachdb/cockroach/pkg/cmd/roachprod
9 changes: 9 additions & 0 deletions pkg/cmd/roachprod/docker/entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash
# Entrypoint for the roachprod docker image: configures the gcloud, aws and
# az CLIs from the credential files mounted under /secrets, then replaces
# this shell with roachprod, forwarding the container's arguments.
set -e

# Unpack all of the keys, configs, etc. and then start roachprod.
gcloud auth activate-service-account --key-file /secrets/gcloud.json

# Every $(cat ...) is quoted so a secret containing whitespace or glob
# characters is passed to the CLI as exactly one argument.
aws configure set aws_access_key_id "$(cat /secrets/aws_access_key_id)"
aws configure set aws_secret_access_key "$(cat /secrets/aws_secret_access_key)"
az login --service-principal \
  -u "$(cat /secrets/azure_user_id)" \
  -p "$(cat /secrets/azure_password)" \
  -t "$(cat /secrets/azure_tenant_id)"

# "$@" (quoted) forwards each original argument unchanged; an unquoted $@
# would re-split arguments that contain spaces and expand glob characters.
exec roachprod "$@"
178 changes: 178 additions & 0 deletions pkg/cmd/roachprod/errors/errors.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
// Copyright 2020 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package errors

import (
"fmt"
"os/exec"

"github.com/cockroachdb/errors"
)

// Error is an interface for error types used by the main.wrap() function
// to output correctly classified log messages and exit codes.
//
// In this file it is implemented by Cmd, SSH and Unclassified.
type Error interface {
error

// ExitCode returns the process exit code roachprod should exit with when
// this error is the one being reported.
ExitCode() int
}

// Exit codes for the errors. Each value is returned by the ExitCode method
// of the corresponding Error type below.
const (
// cmdExitCode is returned by Cmd: a problem running a non-cockroach command.
cmdExitCode = 20
// sshExitCode is returned by SSH: a problem with an SSH connection.
sshExitCode = 10
// unclassifiedExitCode is returned by Unclassified: any other roachprod error.
unclassifiedExitCode = 1
)

// Cmd is the Error reported when a command run against the cluster fails.
// The underlying failure is kept in Err.
type Cmd struct {
	Err error
}

// Error returns the wrapped error's message prefixed with the searchable
// COMMAND_PROBLEM marker.
func (ce Cmd) Error() string {
	return "COMMAND_PROBLEM: " + ce.Err.Error()
}

// ExitCode gives the process exit code to return for non-cockroach command
// errors.
func (ce Cmd) ExitCode() int {
	return cmdExitCode
}

// Format hands all formatting work over to cockroachdb/errors.
func (ce Cmd) Format(s fmt.State, verb rune) {
	errors.FormatError(ce, s, verb)
}

// Unwrap returns the wrapped non-cockroach command error.
func (ce Cmd) Unwrap() error {
	return ce.Err
}

// SSH is the Error reported for ssh-specific failures on connections to
// remote hosts. The underlying failure is kept in Err.
type SSH struct {
	Err error
}

// Error returns the wrapped error's message prefixed with the searchable
// SSH_PROBLEM marker.
func (se SSH) Error() string {
	return "SSH_PROBLEM: " + se.Err.Error()
}

// ExitCode gives the process exit code to return for SSH errors.
func (se SSH) ExitCode() int {
	return sshExitCode
}

// Format hands all formatting work over to cockroachdb/errors.
func (se SSH) Format(s fmt.State, verb rune) {
	errors.FormatError(se, s, verb)
}

// Unwrap returns the wrapped SSH error.
func (se SSH) Unwrap() error {
	return se.Err
}

// Unclassified is the Error reported for roachprod errors that fit no more
// specific class. The underlying failure is kept in Err.
type Unclassified struct {
	Err error
}

// Error returns the wrapped error's message prefixed with the searchable
// UNCLASSIFIED_PROBLEM marker.
func (ue Unclassified) Error() string {
	return "UNCLASSIFIED_PROBLEM: " + ue.Err.Error()
}

// ExitCode gives the process exit code to return for unclassified errors.
func (ue Unclassified) ExitCode() int {
	return unclassifiedExitCode
}

// Format hands all formatting work over to cockroachdb/errors.
func (ue Unclassified) Format(s fmt.State, verb rune) {
	errors.FormatError(ue, s, verb)
}

// Unwrap returns the wrapped unclassified error.
func (ue Unclassified) Unwrap() error {
	return ue.Err
}

// ClassifyCmdError maps an error received while executing a non-cockroach
// command remotely over an ssh connection onto the matching Error type:
//
//   - nil stays nil
//   - an error tree without an *exec.ExitError becomes Unclassified
//   - an *exec.ExitError with exit code 255 becomes SSH
//   - any other *exec.ExitError becomes Cmd
func ClassifyCmdError(err error) Error {
	if err == nil {
		return nil
	}

	exitErr, isExit := asExitError(err)
	switch {
	case !isExit:
		return Unclassified{err}
	case exitErr.ExitCode() == 255:
		return SSH{err}
	default:
		return Cmd{err}
	}
}

// asExitError extracts the *exec.ExitError from err's error tree, or returns
// (nil, false) if the tree contains none.
func asExitError(err error) (*exec.ExitError, bool) {
	var target *exec.ExitError
	if !errors.As(err, &target) {
		return nil, false
	}
	return target, true
}

// AsError looks through err's error tree for a value implementing Error.
// It returns that value and true when one is found, and (nil, false)
// otherwise.
func AsError(err error) (Error, bool) {
	var found Error
	if !errors.As(err, &found) {
		return nil, false
	}
	return found, true
}

// SelectPriorityError selects an error from the list in this priority order:
//
//   - the Error with the highest exit code
//   - one of the `error`s
//   - nil
//
// Nil entries are ignored. When several Errors share the highest exit code,
// the earliest one in the list wins. When no entry carries an Error in its
// error tree, the first non-nil entry is returned unchanged. Note that for
// the Error case the value returned is the Error extracted by AsError, which
// may be nested inside the list entry rather than being the entry itself.
//
// The parameter name shadows the errors package inside this function, so the
// package is reached only indirectly, through AsError.
func SelectPriorityError(errors []error) error {
	var (
		best     Error // Error with the highest exit code seen so far
		fallback error // first non-nil entry, used when no Error is present
	)
	for _, err := range errors {
		if err == nil {
			continue
		}
		if fallback == nil {
			fallback = err
		}

		rpErr, ok := AsError(err)
		if !ok {
			// A plain error has no exit code to compare. Calling ExitCode on
			// the nil Error returned here would panic, so skip it; it stays
			// eligible as the fallback.
			continue
		}
		if best == nil || rpErr.ExitCode() > best.ExitCode() {
			best = rpErr
		}
	}

	if best != nil {
		return best
	}
	return fallback
}
Loading