Skip to content

Commit

Permalink
roachprod: create geo-distributed clusters with extra nodes placed at…
Browse files Browse the repository at this point in the history
… the end

Prior to this change, roachprod would create geo-distributed clusters by
placing nodes in AZs in contiguous chunks. If the number of total nodes
was not evenly divisible by the number of regions, the first regions
would be allocated one additional node. This allocation pattern is
rarely desirable. A user will commonly allocate a single extra node as a
load generator and would generally like that load node to be the final
node and for that final node to be the extra node.

This changes the allocation where the extra nodes are placed in the same
regions as before but are given node indices at the end rather than with
the other nodes in their region.

After this change a cluster created with `roachprod create $CLUSTER -n 7 --geo`
will look like:

```
ajwerner-test-roachprod-gce: [gce] 12h47m58s remaining
  ajwerner-test-roachprod-gce-0001      ajwerner-test-roachprod-gce-0001.us-east1-b.cockroach-ephemeral 10.142.0.70      34.74.58.108
  ajwerner-test-roachprod-gce-0002      ajwerner-test-roachprod-gce-0002.us-east1-b.cockroach-ephemeral 10.142.0.5       35.237.74.155
  ajwerner-test-roachprod-gce-0003      ajwerner-test-roachprod-gce-0003.us-west1-b.cockroach-ephemeral 10.138.0.99      35.199.159.104
  ajwerner-test-roachprod-gce-0004      ajwerner-test-roachprod-gce-0004.us-west1-b.cockroach-ephemeral 10.138.0.100     35.197.94.83
  ajwerner-test-roachprod-gce-0005      ajwerner-test-roachprod-gce-0005.europe-west2-b.cockroach-ephemeral      10.154.15.237   35.230.143.190
  ajwerner-test-roachprod-gce-0006      ajwerner-test-roachprod-gce-0006.europe-west2-b.cockroach-ephemeral      10.154.15.236   35.234.156.121
  ajwerner-test-roachprod-gce-0007      ajwerner-test-roachprod-gce-0007.us-east1-b.cockroach-ephemeral 10.142.0.33      35.185.62.76
```

Instead of the previous:

```
ajwerner-test-old: [gce] 12h19m21s remaining
  ajwerner-test-old-0001        ajwerner-test-old-0001.us-east1-b.cockroach-ephemeral   10.142.0.139    34.74.150.216
  ajwerner-test-old-0002        ajwerner-test-old-0002.us-east1-b.cockroach-ephemeral   10.142.0.140    34.73.154.246
  ajwerner-test-old-0003        ajwerner-test-old-0003.us-east1-b.cockroach-ephemeral   10.142.0.141    35.243.176.131
  ajwerner-test-old-0004        ajwerner-test-old-0004.us-west1-b.cockroach-ephemeral   10.138.0.71     34.83.16.1
  ajwerner-test-old-0005        ajwerner-test-old-0005.us-west1-b.cockroach-ephemeral   10.138.0.60     34.83.78.172
  ajwerner-test-old-0006        ajwerner-test-old-0006.europe-west2-b.cockroach-ephemeral       10.154.15.200    35.234.148.191
  ajwerner-test-old-0007        ajwerner-test-old-0007.europe-west2-b.cockroach-ephemeral       10.154.15.199    35.242.179.144
```

Release note: None
  • Loading branch information
ajwerner committed May 23, 2019
1 parent 390c25e commit 4fb25e8
Show file tree
Hide file tree
Showing 10 changed files with 234 additions and 112 deletions.
44 changes: 17 additions & 27 deletions pkg/cmd/roachprod/vm/aws/aws.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ import (
"encoding/json"
"fmt"
"log"
"math"
"math/rand"
"os"
"os/exec"
Expand Down Expand Up @@ -228,38 +227,29 @@ func (p *Provider) Create(names []string, opts vm.CreateOpts) error {
regions = []string{regions[0]}
}

nodeCount := len(names)

var g errgroup.Group
// We're looping over regions to create all of the nodes in one region
// in the same iteration so they're contiguous.
node := 0
const rateLimit = 2 // per second
limiter := rate.NewLimiter(rateLimit, 2 /* buckets */)
// Choose a random availability zone for each region.
zones := make([]string, len(regions))
for i, region := range regions {
zones, err := p.regionZones(region, p.opts.CreateZones)
regionZones, err := p.regionZones(region, p.opts.CreateZones)
if err != nil {
return err
}
nodesPerRegion := int(math.Ceil(float64(nodeCount-node) / float64(len(regions)-i)))
// We're choosing a random availability zone now which will be consistent
// per region.
availabilityZone := rand.Int31n(int32(len(zones)))
for j := 0; j < nodesPerRegion; j++ {
if node >= nodeCount {
break
}
capName := names[node]
placement := zones[availabilityZone]
res := limiter.Reserve()
g.Go(func() error {
time.Sleep(res.Delay())
return p.runInstance(capName, placement, opts)
})
node++
}
zones[i] = regionZones[rand.Int31n(int32(len(regionZones)))]
}
nodeZones := vm.ZonePlacement(len(zones), len(names))

var g errgroup.Group
const rateLimit = 2 // per second
limiter := rate.NewLimiter(rateLimit, 2 /* buckets */)
for i := range names {
capName := names[i]
placement := zones[nodeZones[i]]
res := limiter.Reserve()
g.Go(func() error {
time.Sleep(res.Delay())
return p.runInstance(capName, placement, opts)
})
}
return g.Wait()
}

Expand Down
31 changes: 9 additions & 22 deletions pkg/cmd/roachprod/vm/gce/gcloud.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ import (
"context"
"encoding/json"
"fmt"
"math"
"os"
"os/exec"
"regexp"
Expand Down Expand Up @@ -332,13 +331,6 @@ func (p *Provider) Create(names []string, opts vm.CreateOpts) error {
p.opts.Zones = []string{p.opts.Zones[0]}
}

totalNodes := float64(len(names))
totalZones := float64(len(p.opts.Zones))
nodesPerZone := int(math.Ceil(totalNodes / totalZones))

ct := int(0)
i := 0

// Fixed args.
args := []string{
"compute", "instances", "create",
Expand Down Expand Up @@ -384,20 +376,15 @@ func (p *Provider) Create(names []string, opts vm.CreateOpts) error {

var g errgroup.Group

// This is calculating the number of machines to allocate per zone by taking the ceiling of the the total number
// of machines left divided by the number of zones left. If the the number of machines isn't
// divisible by the number of zones, then the extra machines will be allocated one per zone until there are
// no more extra machines left.
for i < len(names) {
argsWithZone := append(args[:len(args):len(args)], "--zone", p.opts.Zones[ct])
ct++
argsWithZone = append(argsWithZone, names[i:i+nodesPerZone]...)
i += nodesPerZone

totalNodes -= float64(nodesPerZone)
totalZones--
nodesPerZone = int(math.Ceil(totalNodes / totalZones))

nodeZones := vm.ZonePlacement(len(p.opts.Zones), len(names))
zoneHostNames := make([][]string, len(p.opts.Zones))
for i, name := range names {
zone := nodeZones[i]
zoneHostNames[zone] = append(zoneHostNames[zone], name)
}
for i, zoneHosts := range zoneHostNames {
argsWithZone := append(args[:len(args):len(args)], "--zone", p.opts.Zones[i])
argsWithZone = append(argsWithZone, zoneHosts...)
g.Go(func() error {
cmd := exec.Command("gcloud", argsWithZone...)

Expand Down
23 changes: 23 additions & 0 deletions pkg/cmd/roachprod/vm/vm.go
Original file line number Diff line number Diff line change
Expand Up @@ -289,3 +289,26 @@ func ProvidersSequential(named []string, action func(Provider) error) error {
}
return nil
}

// ZonePlacement allocates zones to numNodes in an equally sized groups in the
// same order as zones. If numNodes is not divisible by len(zones) the remainder
// is allocated in a round-robin fashion and placed at the end of the returned
// slice. The returned slice has a length of numNodes where each value is in
// [0, numZones).
//
// For example:
//
// ZonePlacement(3, 8) = []int{0, 0, 1, 1, 2, 2, 0, 1}
//
func ZonePlacement(numZones, numNodes int) (nodeZones []int) {
numPerZone := numNodes / numZones
extraStartIndex := numPerZone * numZones
nodeZones = make([]int, numNodes)
for i := 0; i < numNodes; i++ {
nodeZones[i] = i / numPerZone
if i >= extraStartIndex {
nodeZones[i] = i % numZones
}
}
return nodeZones
}
40 changes: 40 additions & 0 deletions pkg/cmd/roachprod/vm/vm_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
// Copyright 2018 The Cockroach Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License. See the AUTHORS file
// for names of contributors.

package vm

import (
"strconv"
"testing"

"github.com/stretchr/testify/assert"
)

func TestZonePlacement(t *testing.T) {
for i, c := range []struct {
numZones, numNodes int
expected []int
}{
{1, 1, []int{0}},
{1, 2, []int{0, 0}},
{2, 4, []int{0, 0, 1, 1}},
{2, 5, []int{0, 0, 1, 1, 0}},
{3, 11, []int{0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 1}},
} {
t.Run(strconv.Itoa(i), func(t *testing.T) {
assert.EqualValues(t, c.expected, ZonePlacement(c.numZones, c.numNodes))
})
}
}
53 changes: 53 additions & 0 deletions pkg/cmd/roachtest/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -1988,3 +1988,56 @@ func waitForFullReplication(t *test, db *gosql.DB) {
}
}
}

type loadGroup struct {
roachNodes nodeListOption
loadNodes nodeListOption
}

type loadGroupList []loadGroup

func (lg loadGroupList) roachNodes() nodeListOption {
var roachNodes nodeListOption
for _, g := range lg {
roachNodes = roachNodes.merge(g.roachNodes)
}
return roachNodes
}

func (lg loadGroupList) loadNodes() nodeListOption {
var loadNodes nodeListOption
for _, g := range lg {
loadNodes = loadNodes.merge(g.loadNodes)
}
return loadNodes
}

// makeLoadGroups create a loadGroupList that has an equal number of cockroach
// nodes per zone. It assumes that numLoadNodes <= numZones and that numZones is
// divisible by numLoadNodes.
func makeLoadGroups(c *cluster, numZones, numRoachNodes, numLoadNodes int) loadGroupList {
if numLoadNodes > numZones {
panic("cannot have more than one load node per zone")
} else if numZones%numLoadNodes != 0 {
panic("numZones must be divisible by numLoadNodes")
}
// roachprod allocates nodes over regions in a round-robin fashion.
// If the number of nodes is not divisible by the number of regions, the
// extra nodes are allocated in a round-robin fashion over the regions at
// the end of cluster.
loadNodesAtTheEnd := numLoadNodes%numZones != 0
loadGroups := make(loadGroupList, numLoadNodes)
roachNodesPerGroup := numRoachNodes / numLoadNodes
for i := range loadGroups {
if loadNodesAtTheEnd {
first := i*roachNodesPerGroup + 1
loadGroups[i].roachNodes = c.Range(first, first+roachNodesPerGroup-1)
loadGroups[i].loadNodes = c.Node(numRoachNodes + i + 1)
} else {
first := i*(roachNodesPerGroup+1) + 1
loadGroups[i].roachNodes = c.Range(first, first+roachNodesPerGroup-1)
loadGroups[i].loadNodes = c.Node((i + 1) * (roachNodesPerGroup + 1))
}
}
return loadGroups
}
73 changes: 73 additions & 0 deletions pkg/cmd/roachtest/cluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (

"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/pkg/errors"
"github.com/stretchr/testify/require"
)

func TestClusterNodes(t *testing.T) {
Expand Down Expand Up @@ -253,3 +254,75 @@ func TestClusterMachineType(t *testing.T) {
})
}
}

func TestLoadGroups(t *testing.T) {
cfg := &loggerConfig{stdout: os.Stdout, stderr: os.Stderr}
logger, err := cfg.newLogger("" /* path */)
if err != nil {
t.Fatal(err)
}
for _, tc := range []struct {
numZones, numRoachNodes, numLoadNodes int
loadGroups loadGroupList
}{
{
3, 9, 3,
loadGroupList{
{
nodeListOption{1, 2, 3},
nodeListOption{4},
},
{
nodeListOption{5, 6, 7},
nodeListOption{8},
},
{
nodeListOption{9, 10, 11},
nodeListOption{12},
},
},
},
{
3, 9, 1,
loadGroupList{
{
nodeListOption{1, 2, 3, 4, 5, 6, 7, 8, 9},
nodeListOption{10},
},
},
},
{
4, 8, 2,
loadGroupList{
{
nodeListOption{1, 2, 3, 4},
nodeListOption{9},
},
{
nodeListOption{5, 6, 7, 8},
nodeListOption{10},
},
},
},
} {
t.Run(fmt.Sprintf("%d/%d/%d", tc.numZones, tc.numRoachNodes, tc.numLoadNodes),
func(t *testing.T) {
c := &cluster{t: testWrapper{t}, l: logger, nodes: tc.numRoachNodes + tc.numLoadNodes}
lg := makeLoadGroups(c, tc.numZones, tc.numRoachNodes, tc.numLoadNodes)
require.EqualValues(t, lg, tc.loadGroups)
})
}
t.Run("panics with too many load nodes", func(t *testing.T) {
require.Panics(t, func() {

numZones, numRoachNodes, numLoadNodes := 2, 4, 3
makeLoadGroups(nil, numZones, numRoachNodes, numLoadNodes)
}, "Failed to panic when number of load nodes exceeded number of zones")
})
t.Run("panics with unequal zones per load node", func(t *testing.T) {
require.Panics(t, func() {
numZones, numRoachNodes, numLoadNodes := 4, 4, 3
makeLoadGroups(nil, numZones, numRoachNodes, numLoadNodes)
}, "Failed to panic when number of zones is not divisible by number of load nodes")
})
}
16 changes: 3 additions & 13 deletions pkg/cmd/roachtest/indexes.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,19 +31,9 @@ func registerNIndexes(r *registry, secondaryIndexes int) {
Cluster: makeClusterSpec(nodes+1, cpu(16), geo(), zones(geoZonesStr)),
Run: func(ctx context.Context, t *test, c *cluster) {
firstAZ := geoZones[0]
lastNodeInFirstAZ := nodes / 3
var roachNodes, gatewayNodes, loadNode nodeListOption
for i := 0; i < c.nodes; i++ {
n := c.Node(i + 1)
if i == lastNodeInFirstAZ {
loadNode = n
} else {
roachNodes = roachNodes.merge(n)
if i < lastNodeInFirstAZ {
gatewayNodes = gatewayNodes.merge(n)
}
}
}
roachNodes := c.Range(1, nodes)
gatewayNodes := c.Range(1, nodes/3)
loadNode := c.Node(nodes + 1)

c.Put(ctx, cockroach, "./cockroach", roachNodes)
c.Put(ctx, workload, "./workload", loadNode)
Expand Down
19 changes: 10 additions & 9 deletions pkg/cmd/roachtest/interleavedpartitioned.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,15 +39,16 @@ func registerInterleaved(r *registry) {
c *cluster,
config config,
) {
cockroachWest := c.Range(1, 3)
workloadWest := c.Node(4)
cockroachEast := c.Range(5, 7)
workloadEast := c.Node(8)
cockroachCentral := c.Range(9, 11)
workloadCentral := c.Node(12)

cockroachNodes := cockroachWest.merge(cockroachEast.merge(cockroachCentral))
workloadNodes := workloadWest.merge(workloadEast.merge(workloadCentral))
numZones, numRoachNodes, numLoadNodes := 3, 9, 3
loadGroups := makeLoadGroups(c, numZones, numRoachNodes, numLoadNodes)
cockroachWest := loadGroups[0].roachNodes
workloadWest := loadGroups[0].loadNodes
cockroachEast := loadGroups[1].roachNodes
workloadEast := loadGroups[1].loadNodes
cockroachCentral := loadGroups[2].roachNodes
workloadCentral := loadGroups[2].loadNodes
cockroachNodes := loadGroups.roachNodes()
workloadNodes := loadGroups.loadNodes()

c.l.Printf("cockroach nodes: %s", cockroachNodes.String()[1:])
c.l.Printf("workload nodes: %s", workloadNodes.String()[1:])
Expand Down
16 changes: 3 additions & 13 deletions pkg/cmd/roachtest/ledger.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,19 +27,9 @@ func registerLedger(r *registry) {
Name: fmt.Sprintf("ledger/nodes=%d/multi-az", nodes),
Cluster: makeClusterSpec(nodes+1, cpu(16), geo(), zones(azs)),
Run: func(ctx context.Context, t *test, c *cluster) {
lastNodeInFirstAZ := nodes / 3
var roachNodes, gatewayNodes, loadNode nodeListOption
for i := 0; i < c.nodes; i++ {
n := c.Node(i + 1)
if i == lastNodeInFirstAZ {
loadNode = n
} else {
roachNodes = roachNodes.merge(n)
if i < lastNodeInFirstAZ {
gatewayNodes = gatewayNodes.merge(n)
}
}
}
roachNodes := c.Range(1, nodes)
gatewayNodes := c.Range(1, nodes/3)
loadNode := c.Node(nodes + 1)

c.Put(ctx, cockroach, "./cockroach", roachNodes)
c.Put(ctx, workload, "./workload", loadNode)
Expand Down
Loading

0 comments on commit 4fb25e8

Please sign in to comment.