Skip to content

Commit

Permalink
Add compatibility check based on output feature flags
Browse files Browse the repository at this point in the history
We may in the future store configuration inside the cluster, but
until then we can track the effect of configuration on nodegroups
using outputs. The theory is that only very few things actually
break backwards compatibility in relation to the configuration of
nodegroups.
  • Loading branch information
errordeveloper committed Jan 16, 2019
1 parent abbda99 commit 1ec6ae2
Show file tree
Hide file tree
Showing 7 changed files with 105 additions and 64 deletions.
3 changes: 3 additions & 0 deletions pkg/cfn/builder/nodegroup.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ func (n *NodeGroupResourceSet) AddAllResources() error {
n.spec.AMIFamily, n.spec.AllowSSH, n.spec.SubnetTopology(),
templateDescriptionSuffix)

n.rs.newOutput(CfnOutputNodeGroupFeaturePrivateNetworking, n.spec.PrivateNetworking, false)
n.rs.newOutput(CfnOutputNodeGroupFeatureSharedSecurityGroup, n.spec.SharedSecurityGroup, false)

n.vpc = makeImportValue(n.clusterStackName, CfnOutputClusterVPC)

userData, err := nodebootstrap.NewUserData(n.clusterSpec, n.spec)
Expand Down
5 changes: 5 additions & 0 deletions pkg/cfn/builder/outputs.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@ const (

// outputs from nodegroup stack
CfnOutputNodeGroupInstanceRoleARN = "InstanceRoleARN"
// outputs to indicate configuration attributes that may have critical effect
// on integration with the rest of the cluster and/or forward-compatibility
// with respect to e.g. upgrades
CfnOutputNodeGroupFeaturePrivateNetworking = "FeaturePrivateNetworking"
CfnOutputNodeGroupFeatureSharedSecurityGroup = "FeatureSharedSecurityGroup"
)

// newOutput defines a new output and optionally exports it
Expand Down
17 changes: 13 additions & 4 deletions pkg/cfn/manager/nodegroup.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,15 +83,21 @@ func (c *StackCollection) DescribeNodeGroupStacks() ([]*Stack, error) {
return nodeGroupStacks, nil
}

// DescribeResourcesOfNodeGroupStacks calls DescribeNodeGroupStacks and fetches all resources,
// NodeGroupStacksAndResources holds a nodegroup stack together with the
// CloudFormation resources that belong to it
type NodeGroupStacksAndResources struct {
	// Stack is the CloudFormation stack of the nodegroup
	Stack *Stack
	// Resources are all resources of the stack, as returned by
	// DescribeStackResources
	Resources []*cfn.StackResource
}

// DescribeNodeGroupStacksAndResources calls DescribeNodeGroupStacks and fetches all resources,
// then returns them in a map keyed by nodegroup name
func (c *StackCollection) DescribeResourcesOfNodeGroupStacks() (map[string]cfn.DescribeStackResourcesOutput, error) {
func (c *StackCollection) DescribeNodeGroupStacksAndResources() (map[string]NodeGroupStacksAndResources, error) {
stacks, err := c.DescribeNodeGroupStacks()
if err != nil {
return nil, err
}

allResources := make(map[string]cfn.DescribeStackResourcesOutput)
allResources := make(map[string]NodeGroupStacksAndResources)

for _, s := range stacks {
input := &cfn.DescribeStackResourcesInput{
Expand All @@ -101,7 +107,10 @@ func (c *StackCollection) DescribeResourcesOfNodeGroupStacks() (map[string]cfn.D
if err != nil {
return nil, errors.Wrapf(err, "getting all resources for %q stack", *s.StackName)
}
allResources[getNodeGroupName(s)] = *resources
allResources[getNodeGroupName(s)] = NodeGroupStacksAndResources{
Resources: resources.StackResources,
Stack: s,
}
}

return allResources, nil
Expand Down
9 changes: 7 additions & 2 deletions pkg/ctl/create/nodegroup.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,12 @@ func doCreateNodeGroup(p *api.ProviderConfig, cfg *api.ClusterConfig, ng *api.No
return err
}

stackManager := ctl.NewStackManager(cfg)

if err := ctl.ValidateClusterForCompatibility(cfg, stackManager); err != nil {
return errors.Wrap(err, "cluster compatibility check failed")
}

if err := ctl.GetClusterVPC(cfg); err != nil {
return errors.Wrapf(err, "getting VPC configuration for cluster %q", cfg.Metadata.Name)
}
Expand All @@ -116,7 +122,6 @@ func doCreateNodeGroup(p *api.ProviderConfig, cfg *api.ClusterConfig, ng *api.No
}

{
stackManager := ctl.NewStackManager(cfg)
logger.Info("will create a Cloudformation stack for nodegroup %s in cluster %s", ng.Name, cfg.Metadata.Name)
errs := stackManager.CreateOneNodeGroup(ng)
if len(errs) > 0 {
Expand Down Expand Up @@ -157,7 +162,7 @@ func doCreateNodeGroup(p *api.ProviderConfig, cfg *api.ClusterConfig, ng *api.No
logger.Success("created nodegroup %q in cluster %q", ng.Name, cfg.Metadata.Name)

logger.Info("will inspect security group configuration for all nodegroups")
if err := ctl.ValidateConfigForExistingNodeGroups(cfg); err != nil {
if err := ctl.ValidateExistingNodeGroupsForCompatibility(cfg, stackManager); err != nil {
logger.Critical("failed checking nodegroups", err.Error())
}

Expand Down
71 changes: 71 additions & 0 deletions pkg/eks/compatibility.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
package eks

import (
"fmt"
"strings"

"github.com/kris-nova/logger"
"github.com/pkg/errors"

api "github.com/weaveworks/eksctl/pkg/apis/eksctl.io/v1alpha3"
"github.com/weaveworks/eksctl/pkg/cfn/builder"
"github.com/weaveworks/eksctl/pkg/cfn/manager"
)

// ValidateClusterForCompatibility looks at the cluster stack and checks whether
// it's compatible with the current nodegroup configuration; if it finds issues
// it returns an error.
func (c *ClusterProvider) ValidateClusterForCompatibility(cfg *api.ClusterConfig, stackManager *manager.StackCollection) error {
	// TODO: must move this before we try to create the nodegroup actually
	cluster, err := stackManager.DescribeClusterStack()
	if err != nil {
		return errors.Wrap(err, "getting cluster stacks")
	}

	// The shared node security group output is what nodegroups rely on to
	// communicate with each other; a cluster stack created without it is
	// incompatible with newly created nodegroups.
	sharedClusterNodeSG := ""
	for _, x := range cluster.Outputs {
		if *x.OutputKey == builder.CfnOutputClusterSharedNodeSecurityGroup {
			sharedClusterNodeSG = *x.OutputValue
			break // output keys are unique, no need to scan further
		}
	}

	if sharedClusterNodeSG == "" {
		return fmt.Errorf("cluster %q does not have shared node security group", cfg.Metadata.Name)
	}

	return nil
}

// ValidateExistingNodeGroupsForCompatibility looks at each of the existing
// nodegroups and validates configuration; if it finds issues it logs messages.
// It only returns an error when the nodegroup stacks cannot be described.
func (c *ClusterProvider) ValidateExistingNodeGroupsForCompatibility(cfg *api.ClusterConfig, stackManager *manager.StackCollection) error {
	resourcesByNodeGroup, err := stackManager.DescribeNodeGroupStacksAndResources()
	if err != nil {
		return errors.Wrap(err, "getting resources of all nodegroup stacks")
	}

	// A nodegroup stack created by a recent version of eksctl exports the
	// FeatureSharedSecurityGroup output; treat its absence as incompatible.
	incompatibleNodeGroups := []string{}
	for ng, resources := range resourcesByNodeGroup {
		compatible := false
		for _, x := range resources.Stack.Outputs {
			if *x.OutputKey == builder.CfnOutputNodeGroupFeatureSharedSecurityGroup {
				compatible = true
				break // found the feature output, no need to scan further
			}
		}
		if !compatible {
			incompatibleNodeGroups = append(incompatibleNodeGroups, ng)
		}
	}

	numIncompatibleNodeGroups := len(incompatibleNodeGroups)
	if numIncompatibleNodeGroups == 0 {
		return nil
	}

	logger.Critical("found %d nodegroup(s) (%s) without shared security group, cluster networking may be broken",
		numIncompatibleNodeGroups, strings.Join(incompatibleNodeGroups, ", "))
	logger.Critical("it's recommended to delete these nodegroups and create new ones instead")
	logger.Critical("as a temporary fix, you can patch the configuration and add each of these nodegroup(s) to %q",
		cfg.VPC.SharedNodeSecurityGroup)

	return nil
}
6 changes: 6 additions & 0 deletions pkg/eks/eks.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,12 @@ func (c *ClusterProvider) GetClusterVPC(spec *api.ClusterConfig) error {
return fmt.Errorf(requiredKeyErrFmt, builder.CfnOutputClusterSecurityGroup)
}

if sharedNodeSecurityGroup, ok := outputs[builder.CfnOutputClusterSharedNodeSecurityGroup]; ok {
spec.VPC.SharedNodeSecurityGroup = sharedNodeSecurityGroup
} else {
return fmt.Errorf(requiredKeyErrFmt, builder.CfnOutputClusterSharedNodeSecurityGroup)
}

for _, topology := range api.SubnetTopologies() {
// either of subnet topologies are optional
if subnets, ok := outputs[builder.CfnOutputClusterSubnets+string(topology)]; ok {
Expand Down
58 changes: 0 additions & 58 deletions pkg/eks/nodegroup.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@ import (
"fmt"
"time"

"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/service/ec2"
"github.com/kris-nova/logger"
"github.com/pkg/errors"

Expand Down Expand Up @@ -128,59 +126,3 @@ func (c *ClusterProvider) WaitForNodes(clientSet *clientset.Clientset, ng *api.N

return nil
}

// ValidateConfigForExistingNodeGroups looks at each of the existing nodegroups
// and validates configuration; if it finds issues it logs messages.
// It only returns an error when AWS API calls fail.
func (c *ClusterProvider) ValidateConfigForExistingNodeGroups(cfg *api.ClusterConfig) error {
	stackManager := c.NewStackManager(cfg)
	resourcesByNodeGroup, err := stackManager.DescribeResourcesOfNodeGroupStacks()
	if err != nil {
		return errors.Wrap(err, "getting resources of all nodegroup stacks")
	}

	// Collect the IDs of all security groups owned by nodegroup stacks, and
	// remember which nodegroup each one belongs to.
	securityGroupIDs := []string{}
	securityGroupIDsToNodeGroup := map[string]string{}
	for ng, resources := range resourcesByNodeGroup {
		for n := range resources.StackResources {
			r := resources.StackResources[n]
			if *r.ResourceType == "AWS::EC2::SecurityGroup" {
				securityGroupIDs = append(securityGroupIDs, *r.PhysicalResourceId)
				securityGroupIDsToNodeGroup[*r.PhysicalResourceId] = ng
			}
		}
	}

	// Avoid calling DescribeSecurityGroups with an empty GroupIds filter,
	// which would return every security group in the account.
	if len(securityGroupIDs) == 0 {
		return nil
	}

	input := &ec2.DescribeSecurityGroupsInput{
		GroupIds: aws.StringSlice(securityGroupIDs),
	}
	output, err := c.Provider.EC2().DescribeSecurityGroups(input)
	if err != nil {
		return errors.Wrap(err, "describing security groups")
	}

	for _, sg := range output.SecurityGroups {
		id := *sg.GroupId
		ng := securityGroupIDsToNodeGroup[id]
		logger.Debug("%s/%s = %#v", ng, id, sg)
		// Count ingress rules matching port 53; we expect exactly two — one
		// for TCP and one for UDP.
		hasDNS := 0
		for _, p := range sg.IpPermissions {
			if p.FromPort != nil && *p.FromPort == 53 && p.ToPort != nil && *p.ToPort == 53 {
				if *p.IpProtocol == "tcp" || *p.IpProtocol == "udp" {
					// we cannot check p.IpRanges as we don't have VPC CIDR info when
					// we create the nodegroup, it may become important, but for now
					// it doesn't appear critical to check it
					hasDNS++
				}
			}
		}
		if hasDNS != 2 {
			logger.Critical("nodegroup %q may not have DNS port open to other nodegroups, so cluster DNS may be broken", ng)
			// BUG FIX: the %q verb previously had no corresponding argument.
			logger.Critical("it's recommended to delete the nodegroup %q and use new one instead", ng)
			// BUG FIX: previously passed the whole security-group struct to %q;
			// use its ID instead.
			logger.Critical("check/update %q ingress rules - port 53 (TCP & UDP) has to be open for all sources inside the VPC", id)
		}
	}

	return nil
}

0 comments on commit 1ec6ae2

Please sign in to comment.