Skip to content

Commit

Permalink
Try all available fault domains in case of out of host capacity
Browse files Browse the repository at this point in the history
  • Loading branch information
shyamradhakrishnan committed Aug 30, 2022
1 parent f638fc5 commit a670a80
Show file tree
Hide file tree
Showing 3 changed files with 140 additions and 3 deletions.
6 changes: 6 additions & 0 deletions cloud/ociutil/ociutil.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"context"
"fmt"
"net/http"
"strings"
"time"

nlb "github.com/oracle/cluster-api-provider-oci/cloud/services/networkloadbalancer"
Expand All @@ -38,6 +39,7 @@ const (
CreatedBy = "CreatedBy"
OCIClusterAPIProvider = "OCIClusterAPIProvider"
ClusterResourceIdentifier = "ClusterResourceIdentifier"
OutOfHostCapacityErr = "Out of host capacity"
)

// ErrNotFound is for simulation during testing, OCI SDK does not have a way
Expand All @@ -58,6 +60,10 @@ func IsNotFound(err error) bool {
return ok && serviceErr.GetHTTPStatusCode() == http.StatusNotFound
}

// IsOutOfHostCapacity reports whether err carries the OCI "Out of host
// capacity" message, identified by a substring match on err.Error() against
// OutOfHostCapacityErr. A nil err is never a capacity error and yields false.
func IsOutOfHostCapacity(err error) bool {
	// Guard against nil so callers can pass the raw result of an API call
	// without risking a nil-interface method call panic.
	if err == nil {
		return false
	}
	return strings.Contains(err.Error(), OutOfHostCapacityErr)
}

// AwaitLBWorkRequest waits for the LB work request to either succeed or fail. See k8s.io/apimachinery/pkg/util/wait
func AwaitLBWorkRequest(ctx context.Context, networkLoadBalancerClient nlb.NetworkLoadBalancerClient, workRequestId *string) (*networkloadbalancer.WorkRequest, error) {
var wr *networkloadbalancer.WorkRequest
Expand Down
42 changes: 39 additions & 3 deletions cloud/scope/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,11 @@ import (
"encoding/base64"
"fmt"
"math/rand"
"sigs.k8s.io/cluster-api/util/conditions"
"strconv"
"time"

"sigs.k8s.io/cluster-api/util/conditions"

"github.com/oracle/cluster-api-provider-oci/cloud/services/vcn"

"github.com/go-logr/logr"
Expand Down Expand Up @@ -246,16 +247,51 @@ func (m *MachineScope) GetOrCreateMachine(ctx context.Context) (*core.Instance,
if (shapeConfig != core.LaunchInstanceShapeConfigDetails{}) {
launchDetails.ShapeConfig = &shapeConfig
}
if faultDomain != "" {
launchDetails.FaultDomain = common.String(faultDomain)
initialFaultDomain := faultDomain
adMap := m.OCICluster.Status.AvailabilityDomains[availabilityDomain]
if initialFaultDomain == "" {
// pick a random fault domain
rand.Seed(time.Now().UnixNano())
// rand.Intn(3) will produce a random number from 0(inclusive) to 3(exclusive)
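// we add one to get a number from 1 to 3
// NOTE(review): the result is used directly as a slice index into FaultDomains; confirm index 3 cannot be out of range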
faultDomainIndex := rand.Intn(3) + 1
initialFaultDomain = adMap.FaultDomains[faultDomainIndex]
}

m.Logger.Info("Fault Domain being used", "fault-domain", initialFaultDomain)
m.Logger.Info("AD being used", "ad", availabilityDomain)

launchDetails.FaultDomain = common.String(initialFaultDomain)
if nsgId != nil {
launchDetails.CreateVnicDetails.NsgIds = []string{*nsgId}
}
req := core.LaunchInstanceRequest{LaunchInstanceDetails: launchDetails,
OpcRetryToken: ociutil.GetOPCRetryToken(string(m.OCIMachine.UID))}
resp, err := m.ComputeClient.LaunchInstance(ctx, req)
if err != nil {
// try other fault domains unless user specified a specific one
if ociutil.IsOutOfHostCapacity(err) && faultDomain != "" {
m.Logger.Info("The chosen fault domain did not have capacity, trying other fault domains")
for fdIndex, fd := range adMap.FaultDomains {
if fd != faultDomain {
m.Logger.Info("Fault Domain being used for retry", "fault-domain", fd)
launchDetails.FaultDomain = common.String(fd)
req := core.LaunchInstanceRequest{LaunchInstanceDetails: launchDetails,
OpcRetryToken: ociutil.GetOPCRetryToken(string(m.OCIMachine.UID))}
resp, err = m.ComputeClient.LaunchInstance(ctx, req)
if err != nil {
// if another out of host error comes, try other fault domains
// till we are out of fault domains, in which case return the last error
if ociutil.IsOutOfHostCapacity(err) && fdIndex != (len(adMap.FaultDomains)-1) {
continue
} else {
return nil, err
}
}
return &resp.Instance, nil
}
}
}
return nil, err
} else {
return &resp.Instance, nil
Expand Down
95 changes: 95 additions & 0 deletions cloud/scope/machine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client/fake"
)

// fdList holds the fault domain names shared by the test fixtures: it is
// assigned to every availability domain in setupAllParams and is the set of
// values anyFdMatcher accepts.
var fdList = []string{
	"FAULT-DOMAIN-1",
	"FAULT-DOMAIN-2",
	"FAULT-DOMAIN-3",
}

func TestInstanceReconciliation(t *testing.T) {
var (
ms *MachineScope
Expand Down Expand Up @@ -277,6 +279,47 @@ func TestInstanceReconciliation(t *testing.T) {
})).Return(core.LaunchInstanceResponse{}, nil)
},
},
{
name: "try all fds",
errorExpected: true,
matchError: TestError{errorString: ociutil.OutOfHostCapacityErr},
testSpecificSetup: func(machineScope *MachineScope, computeClient *mock_compute.MockComputeClient) {
setupAllParams(ms)
computeClient.EXPECT().ListInstances(gomock.Any(), gomock.Eq(core.ListInstancesRequest{
DisplayName: common.String("name"),
CompartmentId: common.String("test"),
})).Return(core.ListInstancesResponse{}, nil)

computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error {
return instanceFDMatcher(request, "FAULT-DOMAIN-1")
})).Return(core.LaunchInstanceResponse{}, TestError{errorString: ociutil.OutOfHostCapacityErr})
computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error {
return instanceFDMatcher(request, "FAULT-DOMAIN-2")
})).Return(core.LaunchInstanceResponse{}, TestError{errorString: ociutil.OutOfHostCapacityErr})
computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error {
return instanceFDMatcher(request, "FAULT-DOMAIN-3")
})).Return(core.LaunchInstanceResponse{}, TestError{errorString: ociutil.OutOfHostCapacityErr})
},
},
{
name: "second fd works",
errorExpected: false,
matchError: TestError{errorString: ociutil.OutOfHostCapacityErr},
testSpecificSetup: func(machineScope *MachineScope, computeClient *mock_compute.MockComputeClient) {
setupAllParams(ms)
computeClient.EXPECT().ListInstances(gomock.Any(), gomock.Eq(core.ListInstancesRequest{
DisplayName: common.String("name"),
CompartmentId: common.String("test"),
})).Return(core.ListInstancesResponse{}, nil)

computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error {
return anyFdMatcher(request)
})).Return(core.LaunchInstanceResponse{}, TestError{errorString: ociutil.OutOfHostCapacityErr})
computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error {
return anyFdMatcher(request)
})).Return(core.LaunchInstanceResponse{}, nil)
},
},
{
name: "check compartment at cluster",
errorExpected: false,
Expand Down Expand Up @@ -341,6 +384,7 @@ func TestInstanceReconciliation(t *testing.T) {
BaselineOcpuUtilization: core.LaunchInstanceShapeConfigDetailsBaselineOcpuUtilization8,
},
AvailabilityDomain: common.String("ad2"),
FaultDomain: common.String("FAULT-DOMAIN-2"),
CompartmentId: common.String("test"),
IsPvEncryptionInTransitEnabled: common.Bool(true),
DefinedTags: map[string]map[string]interface{}{},
Expand Down Expand Up @@ -379,6 +423,7 @@ func TestInstanceReconciliation(t *testing.T) {
},
Shape: common.String("shape"),
AvailabilityDomain: common.String("ad2"),
FaultDomain: common.String("FAULT-DOMAIN-2"),
CompartmentId: common.String("test"),
IsPvEncryptionInTransitEnabled: common.Bool(true),
DefinedTags: map[string]map[string]interface{}{},
Expand Down Expand Up @@ -425,6 +470,7 @@ func TestInstanceReconciliation(t *testing.T) {
BaselineOcpuUtilization: core.LaunchInstanceShapeConfigDetailsBaselineOcpuUtilization8,
},
AvailabilityDomain: common.String("ad2"),
FaultDomain: common.String("FAULT-DOMAIN-2"),
CompartmentId: common.String("test"),
IsPvEncryptionInTransitEnabled: common.Bool(true),
DefinedTags: map[string]map[string]interface{}{},
Expand Down Expand Up @@ -473,6 +519,7 @@ func TestInstanceReconciliation(t *testing.T) {
BaselineOcpuUtilization: core.LaunchInstanceShapeConfigDetailsBaselineOcpuUtilization8,
},
AvailabilityDomain: common.String("ad2"),
FaultDomain: common.String("FAULT-DOMAIN-2"),
CompartmentId: common.String("test"),
IsPvEncryptionInTransitEnabled: common.Bool(true),
DefinedTags: map[string]map[string]interface{}{},
Expand Down Expand Up @@ -525,6 +572,7 @@ func TestInstanceReconciliation(t *testing.T) {
BaselineOcpuUtilization: core.LaunchInstanceShapeConfigDetailsBaselineOcpuUtilization8,
},
AvailabilityDomain: common.String("ad2"),
FaultDomain: common.String("FAULT-DOMAIN-2"),
CompartmentId: common.String("test"),
IsPvEncryptionInTransitEnabled: common.Bool(true),
DefinedTags: map[string]map[string]interface{}{},
Expand Down Expand Up @@ -578,6 +626,7 @@ func TestInstanceReconciliation(t *testing.T) {
BaselineOcpuUtilization: core.LaunchInstanceShapeConfigDetailsBaselineOcpuUtilization8,
},
AvailabilityDomain: common.String("ad2"),
FaultDomain: common.String("FAULT-DOMAIN-2"),
CompartmentId: common.String("test"),
IsPvEncryptionInTransitEnabled: common.Bool(true),
DefinedTags: map[string]map[string]interface{}{},
Expand Down Expand Up @@ -636,6 +685,30 @@ func instanceCompartmentIDMatcher(request interface{}, matchStr string) error {
return nil
}

// instanceFDMatcher checks that request is a core.LaunchInstanceRequest whose
// fault domain equals matchStr. It returns nil on a match and a descriptive
// error otherwise, including when the request has no fault domain set.
func instanceFDMatcher(request interface{}, matchStr string) error {
	r, ok := request.(core.LaunchInstanceRequest)
	if !ok {
		return errors.New("expecting LaunchInstanceRequest type")
	}
	// Guard the dereference: an unset fault domain is a mismatch, not a panic
	// inside the mock matcher.
	fd := r.LaunchInstanceDetails.FaultDomain
	if fd == nil {
		return fmt.Errorf("expecting fd as %s, but no fault domain is set", matchStr)
	}
	if *fd != matchStr {
		return fmt.Errorf("expecting fd as %s", matchStr)
	}
	return nil
}

// anyFdMatcher checks that request is a core.LaunchInstanceRequest whose
// fault domain is one of the entries in fdList. It returns nil when the fault
// domain is recognised and an error otherwise, including when it is unset.
func anyFdMatcher(request interface{}) error {
	r, ok := request.(core.LaunchInstanceRequest)
	if !ok {
		return errors.New("expecting LaunchInstanceRequest type")
	}
	// Guard the dereference: a request without a fault domain cannot match
	// any entry and must not panic inside the mock matcher.
	if r.FaultDomain == nil {
		return errors.New("invalid fd")
	}
	for _, f := range fdList {
		if f == *r.FaultDomain {
			return nil
		}
	}
	return errors.New("invalid fd")
}

func TestLBReconciliationCreation(t *testing.T) {
var (
ms *MachineScope
Expand Down Expand Up @@ -1304,6 +1377,7 @@ func setupAllParams(ms *MachineScope) {
"2": {
Attributes: map[string]string{
"AvailabilityDomain": "ad2",
"FaultDomain": "FAULT-DOMAIN-2",
},
},
"3": {
Expand All @@ -1312,6 +1386,17 @@ func setupAllParams(ms *MachineScope) {
},
},
}
ms.OCICluster.Status.AvailabilityDomains = map[string]infrastructurev1beta1.OCIAvailabilityDomain{
"ad1": {
FaultDomains: fdList,
},
"ad2": {
FaultDomains: fdList,
},
"ad3": {
FaultDomains: fdList,
},
}
ms.Machine.Spec.FailureDomain = common.String("2")
ms.OCICluster.Spec.NetworkSpec.Vcn.Subnets = []*infrastructurev1beta1.Subnet{
{
Expand All @@ -1323,3 +1408,13 @@ func setupAllParams(ms *MachineScope) {
ms.OCICluster.Spec.OCIResourceIdentifier = "resource_uid"
ms.OCIMachine.UID = "machineuid"
}

// TestError is a minimal error implementation used by the tests: it wraps a
// fixed message so mocked calls can return an error with known text.
type TestError struct {
	errorString string
}

// Error implements the error interface by returning the stored message.
func (e TestError) Error() string {
	return e.errorString
}

0 comments on commit a670a80

Please sign in to comment.