Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Auto-scale DynamoDB provision based on Prometheus metrics #841

Merged
merged 33 commits into from
Aug 7, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
7dc58c4
Refactor: Move AWS-specific function into aws directory
bboreham May 29, 2018
9edb50f
Refactor: push DynamoDB-specific check into aws directory
bboreham May 29, 2018
d39ca13
Refactor: pull DynamoDB expected table results out to functions
bboreham Jun 7, 2018
f61ec92
Refactor: pull DynamoDB fixture config out to functions
bboreham Jun 7, 2018
f945f12
Refactor: Simplify test fixtures
bboreham Jun 7, 2018
907623d
Extract test function
bboreham Jun 7, 2018
ee97484
Auto-scale DynamoDB provision based on Prometheus metrics
bboreham May 30, 2018
8005779
Vendor required prometheus libraries
bboreham Jul 28, 2018
2d3c342
Improve logging of DynamoDB settings updates
bboreham Jun 8, 2018
6985ed2
Respect max and min capacity
bboreham Jun 8, 2018
672cfa8
Warn but don't fail on limit-exceeded error
bboreham Jun 8, 2018
c6b7cc0
Allow scaling parameters to be changed from command-line
bboreham Jun 8, 2018
b2489e6
Make test trigger max-capacity check
bboreham Jun 8, 2018
eb457a4
Respect cooldown periods in metrics autoscaling
bboreham Jun 8, 2018
0c85d76
Refactor: extract function to extract per-table rates from matrix
bboreham Jun 9, 2018
325f9fd
Change promQL test fixture to take slices
bboreham Jun 9, 2018
c9b1712
don't hard-code name of table in test
bboreham Jun 9, 2018
8c48093
Base scale-down on actual capacity usage
bboreham Jun 9, 2018
fe774f0
Scale back on zero errors
bboreham Jun 9, 2018
1d5dbfb
Reject small scale-downs since AWS rate-limits them
bboreham Jun 10, 2018
ef5a54a
Smooth out metrics used to control DynamoDB
bboreham Jun 10, 2018
208ccac
Check that the ingesters are doing some work before scale-down
bboreham Jun 11, 2018
541c85f
Bail-out of scale-up in simple cases
bboreham Jun 11, 2018
194c504
Make scale-up factor configurable
bboreham Jun 14, 2018
110fcc6
Change cooldown default from 50 to 30 minutes
bboreham Jun 14, 2018
d2c4d8d
Ensure metrics is nil if no MetricsURL set
bboreham Jun 21, 2018
e21e2ae
Remove redundant check on nil map
bboreham Jun 21, 2018
c9e88d9
Move methods from dynamoTableClient to metricsData
bboreham Jun 21, 2018
655345a
Put DynamoDB autoscaling behind an interface
bboreham Jun 21, 2018
1cb8877
Review feedback - move and rename
bboreham Jul 6, 2018
0b8e0ac
Scale up more when current setting is low.
bboreham Jul 28, 2018
dc1a5ca
Move metrics autoscaling flags to metrics_autoscaling.go
bboreham Jul 28, 2018
f93ace9
Make metrics query expressions configurable
bboreham Jul 28, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion Gopkg.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions cmd/table-manager/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ func main() {
schemaConfig.IndexTables.WriteScale.Enabled ||
schemaConfig.ChunkTables.InactiveWriteScale.Enabled ||
schemaConfig.IndexTables.InactiveWriteScale.Enabled) &&
storageConfig.AWSStorageConfig.ApplicationAutoScaling.URL == nil {
level.Error(util.Logger).Log("msg", "WriteScale is enabled but no ApplicationAutoScaling URL has been provided")
(storageConfig.AWSStorageConfig.ApplicationAutoScaling.URL == nil && storageConfig.AWSStorageConfig.Metrics.URL == "") {
level.Error(util.Logger).Log("msg", "WriteScale is enabled but no ApplicationAutoScaling or Metrics URL has been provided")
os.Exit(1)
}

Expand Down
226 changes: 226 additions & 0 deletions pkg/chunk/aws/aws_autoscaling.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
package aws

import (
"context"
"fmt"

"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/service/applicationautoscaling"
"github.com/aws/aws-sdk-go/service/applicationautoscaling/applicationautoscalingiface"
"github.com/go-kit/kit/log/level"
"github.com/prometheus/client_golang/prometheus"

"github.com/weaveworks/common/instrument"
"github.com/weaveworks/cortex/pkg/chunk"
"github.com/weaveworks/cortex/pkg/util"
)

// autoScalingPolicyNamePrefix is prepended to a table's name to form the name
// of the Application Auto Scaling policy created, looked up and deleted for
// that table.
const autoScalingPolicyNamePrefix = "DynamoScalingPolicy_cortex_"

// applicationAutoScalingRequestDuration is the histogram passed to
// instrument.TimeRequestHistogram for every ApplicationAutoScaling request in
// this file, labelled by operation and status code.
var applicationAutoScalingRequestDuration = prometheus.NewHistogramVec(
	prometheus.HistogramOpts{
		Namespace: "cortex",
		Name:      "application_autoscaling_request_duration_seconds",
		Help:      "Time spent doing ApplicationAutoScaling requests.",

		// Eight exponential buckets starting at 128us and growing by a factor
		// of four, topping out at roughly 2s; AWS latency seems to range from
		// a few ms to a few sec.
		// TODO: Confirm that this is the case for ApplicationAutoScaling.
		Buckets: prometheus.ExponentialBuckets(0.000128, 4, 8),
	},
	[]string{"operation", "status_code"},
)

// init registers the ApplicationAutoScaling request-duration histogram with
// the default Prometheus registry; MustRegister panics if registration fails.
func init() {
prometheus.MustRegister(applicationAutoScalingRequestDuration)
}

// awsAutoscale configures write-capacity autoscaling for DynamoDB tables
// through the AWS Application Auto Scaling API.
type awsAutoscale struct {
// call wraps every API request made by this type in backoffAndRetry.
call callManager
// ApplicationAutoScaling is the client used to issue all
// Application Auto Scaling requests.
ApplicationAutoScaling applicationautoscalingiface.ApplicationAutoScalingAPI
}

// newAWSAutoscale builds an awsAutoscale whose client talks to the endpoint
// given by cfg.ApplicationAutoScaling.URL and whose requests are issued
// through callManager. It returns an error if the AWS session cannot be
// created from that URL.
func newAWSAutoscale(cfg DynamoDBConfig, callManager callManager) (*awsAutoscale, error) {
	sess, err := awsSessionFromURL(cfg.ApplicationAutoScaling.URL)
	if err != nil {
		return nil, err
	}

	autoscale := &awsAutoscale{call: callManager}
	autoscale.ApplicationAutoScaling = applicationautoscaling.New(sess)
	return autoscale, nil
}

// PostCreateTable enables autoscaling for desc when its WriteScale settings
// are enabled; otherwise it does nothing and returns nil.
func (a *awsAutoscale) PostCreateTable(ctx context.Context, desc chunk.TableDesc) error {
	if !desc.WriteScale.Enabled {
		return nil
	}
	return a.enableAutoScaling(ctx, desc)
}

// DescribeTable fills desc.WriteScale with the autoscaling settings currently
// registered for the DynamoDB table named desc.Name.
//
// It issues two requests, each wrapped in backoffAndRetry and timed in
// applicationAutoScalingRequestDuration:
//
//  1. DescribeScalableTargets: if exactly one target exists, WriteScale.Enabled
//     is set to true and RoleARN, MinCapacity and MaxCapacity are copied from
//     it (fields that are nil in the response are left untouched).
//  2. DescribeScalingPolicies for the policy named
//     autoScalingPolicyNamePrefix+desc.Name: if exactly one policy exists, the
//     cooldowns and target value are copied from its target-tracking
//     configuration.
//
// Finding no target or no policy is not an error; desc is simply left as it
// was for that part. Finding more than one of either returns an error, as
// does any request failure. The second request is made even when the first
// found no scalable target.
func (a *awsAutoscale) DescribeTable(ctx context.Context, desc *chunk.TableDesc) error {
	describeTargets := func(ctx context.Context) error {
		out, err := a.ApplicationAutoScaling.DescribeScalableTargetsWithContext(ctx, &applicationautoscaling.DescribeScalableTargetsInput{
			ResourceIds:       []*string{aws.String("table/" + desc.Name)},
			ScalableDimension: aws.String("dynamodb:table:WriteCapacityUnits"),
			ServiceNamespace:  aws.String("dynamodb"),
		})
		if err != nil {
			return err
		}
		switch len(out.ScalableTargets) {
		case 0:
			// No target registered: autoscaling is not configured for this table.
			return nil
		case 1:
			desc.WriteScale.Enabled = true
			if target := out.ScalableTargets[0]; target != nil {
				if target.RoleARN != nil {
					desc.WriteScale.RoleARN = *target.RoleARN
				}
				if target.MinCapacity != nil {
					desc.WriteScale.MinCapacity = *target.MinCapacity
				}
				if target.MaxCapacity != nil {
					desc.WriteScale.MaxCapacity = *target.MaxCapacity
				}
			}
			return nil
		default:
			return fmt.Errorf("more than one scalable target found for DynamoDB table")
		}
	}

	describePolicies := func(ctx context.Context) error {
		out, err := a.ApplicationAutoScaling.DescribeScalingPoliciesWithContext(ctx, &applicationautoscaling.DescribeScalingPoliciesInput{
			PolicyNames:       []*string{aws.String(autoScalingPolicyNamePrefix + desc.Name)},
			ResourceId:        aws.String("table/" + desc.Name),
			ScalableDimension: aws.String("dynamodb:table:WriteCapacityUnits"),
			ServiceNamespace:  aws.String("dynamodb"),
		})
		if err != nil {
			return err
		}
		switch len(out.ScalingPolicies) {
		case 0:
			return nil
		case 1:
			// Guard against a nil element, mirroring the nil check applied to
			// the scalable target above, so a sparse response cannot panic.
			policy := out.ScalingPolicies[0]
			if policy == nil {
				return nil
			}
			if config := policy.TargetTrackingScalingPolicyConfiguration; config != nil {
				if config.ScaleInCooldown != nil {
					desc.WriteScale.InCooldown = *config.ScaleInCooldown
				}
				if config.ScaleOutCooldown != nil {
					desc.WriteScale.OutCooldown = *config.ScaleOutCooldown
				}
				if config.TargetValue != nil {
					desc.WriteScale.TargetValue = *config.TargetValue
				}
			}
			return nil
		default:
			return fmt.Errorf("more than one scaling policy found for DynamoDB table")
		}
	}

	if err := a.call.backoffAndRetry(ctx, func(ctx context.Context) error {
		return instrument.TimeRequestHistogram(ctx, "ApplicationAutoScaling.DescribeScalableTargetsWithContext", applicationAutoScalingRequestDuration, describeTargets)
	}); err != nil {
		return err
	}

	return a.call.backoffAndRetry(ctx, func(ctx context.Context) error {
		return instrument.TimeRequestHistogram(ctx, "ApplicationAutoScaling.DescribeScalingPoliciesWithContext", applicationAutoScalingRequestDuration, describePolicies)
	})
}

// UpdateTable reconciles the autoscaling configuration of a table from its
// current state to the expected one:
//
//   - disabled -> enabled: autoscaling is enabled with expected's settings;
//   - enabled -> disabled: autoscaling is disabled;
//   - enabled -> enabled with different WriteScale settings: the settings are
//     re-applied via enableAutoScaling, which registers or updates in place;
//   - otherwise nothing is done and nil is returned.
//
// The error from the enable/disable call, if any, is returned.
func (a *awsAutoscale) UpdateTable(ctx context.Context, current chunk.TableDesc, expected *chunk.TableDesc) error {
	switch {
	case !current.WriteScale.Enabled && expected.WriteScale.Enabled:
		// The "table" key previously had no value, so the table name was
		// never logged; supply it.
		level.Info(util.Logger).Log("msg", "enabling autoscaling on table", "table", expected.Name)
		return a.enableAutoScaling(ctx, *expected)

	case current.WriteScale.Enabled && !expected.WriteScale.Enabled:
		level.Info(util.Logger).Log("msg", "disabling autoscaling on table", "table", expected.Name)
		return a.disableAutoScaling(ctx, *expected)

	case current.WriteScale.Enabled && current.WriteScale != expected.WriteScale:
		// Both sides are enabled here; only the settings differ.
		level.Info(util.Logger).Log("msg", "enabling autoscaling on table", "table", expected.Name)
		return a.enableAutoScaling(ctx, *expected)
	}
	return nil
}

// enableAutoScaling turns on (or updates) write-capacity autoscaling for the
// table described by desc. It first registers the table as a scalable target
// using desc.WriteScale's MinCapacity, MaxCapacity and RoleARN, then puts a
// TargetTrackingScaling policy named autoScalingPolicyNamePrefix+desc.Name
// using InCooldown, OutCooldown and TargetValue. Each request goes through
// backoffAndRetry and is timed in applicationAutoScalingRequestDuration. If
// registering the target fails the policy is not written.
//
// NOTE(review): both SDK calls are the non-context variants, so ctx is not
// passed to the requests themselves.
func (a *awsAutoscale) enableAutoScaling(ctx context.Context, desc chunk.TableDesc) error {
	tableResource := "table/" + desc.Name

	// Registers or updates a scalable target.
	registerTarget := func(ctx context.Context) error {
		_, err := a.ApplicationAutoScaling.RegisterScalableTarget(&applicationautoscaling.RegisterScalableTargetInput{
			MinCapacity:       aws.Int64(desc.WriteScale.MinCapacity),
			MaxCapacity:       aws.Int64(desc.WriteScale.MaxCapacity),
			ResourceId:        aws.String(tableResource),
			RoleARN:           aws.String(desc.WriteScale.RoleARN),
			ScalableDimension: aws.String("dynamodb:table:WriteCapacityUnits"),
			ServiceNamespace:  aws.String("dynamodb"),
		})
		return err
	}
	err := a.call.backoffAndRetry(ctx, func(ctx context.Context) error {
		return instrument.TimeRequestHistogram(ctx, "ApplicationAutoScaling.RegisterScalableTarget", applicationAutoScalingRequestDuration, registerTarget)
	})
	if err != nil {
		return err
	}

	// Puts or updates a scaling policy.
	putPolicy := func(ctx context.Context) error {
		tracking := &applicationautoscaling.TargetTrackingScalingPolicyConfiguration{
			PredefinedMetricSpecification: &applicationautoscaling.PredefinedMetricSpecification{
				PredefinedMetricType: aws.String("DynamoDBWriteCapacityUtilization"),
			},
			ScaleInCooldown:  aws.Int64(desc.WriteScale.InCooldown),
			ScaleOutCooldown: aws.Int64(desc.WriteScale.OutCooldown),
			TargetValue:      aws.Float64(desc.WriteScale.TargetValue),
		}
		_, err := a.ApplicationAutoScaling.PutScalingPolicy(&applicationautoscaling.PutScalingPolicyInput{
			PolicyName:        aws.String(autoScalingPolicyNamePrefix + desc.Name),
			PolicyType:        aws.String("TargetTrackingScaling"),
			ResourceId:        aws.String(tableResource),
			ScalableDimension: aws.String("dynamodb:table:WriteCapacityUnits"),
			ServiceNamespace:  aws.String("dynamodb"),
			TargetTrackingScalingPolicyConfiguration: tracking,
		})
		return err
	}
	return a.call.backoffAndRetry(ctx, func(ctx context.Context) error {
		return instrument.TimeRequestHistogram(ctx, "ApplicationAutoScaling.PutScalingPolicy", applicationAutoScalingRequestDuration, putPolicy)
	})
}

// disableAutoScaling removes write-capacity autoscaling from the table
// described by desc. It deregisters the table's scalable target and then
// deletes the scaling policy named autoScalingPolicyNamePrefix+desc.Name.
// Each request goes through backoffAndRetry and is timed in
// applicationAutoScalingRequestDuration. If deregistration fails the policy
// deletion is not attempted.
//
// NOTE(review): both SDK calls are the non-context variants, so ctx is not
// passed to the requests themselves.
func (a *awsAutoscale) disableAutoScaling(ctx context.Context, desc chunk.TableDesc) error {
	tableResource := "table/" + desc.Name

	// Deregister scalable target.
	deregisterTarget := func(ctx context.Context) error {
		_, err := a.ApplicationAutoScaling.DeregisterScalableTarget(&applicationautoscaling.DeregisterScalableTargetInput{
			ResourceId:        aws.String(tableResource),
			ScalableDimension: aws.String("dynamodb:table:WriteCapacityUnits"),
			ServiceNamespace:  aws.String("dynamodb"),
		})
		return err
	}
	err := a.call.backoffAndRetry(ctx, func(ctx context.Context) error {
		return instrument.TimeRequestHistogram(ctx, "ApplicationAutoScaling.DeregisterScalableTarget", applicationAutoScalingRequestDuration, deregisterTarget)
	})
	if err != nil {
		return err
	}

	// Delete scaling policy.
	deletePolicy := func(ctx context.Context) error {
		_, err := a.ApplicationAutoScaling.DeleteScalingPolicy(&applicationautoscaling.DeleteScalingPolicyInput{
			PolicyName:        aws.String(autoScalingPolicyNamePrefix + desc.Name),
			ResourceId:        aws.String(tableResource),
			ScalableDimension: aws.String("dynamodb:table:WriteCapacityUnits"),
			ServiceNamespace:  aws.String("dynamodb"),
		})
		return err
	}
	return a.call.backoffAndRetry(ctx, func(ctx context.Context) error {
		return instrument.TimeRequestHistogram(ctx, "ApplicationAutoScaling.DeleteScalingPolicy", applicationAutoScalingRequestDuration, deletePolicy)
	})
}
Loading