Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Try all available fault domains in case of out of host capacity #134

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions cloud/ociutil/ociutil.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"context"
"fmt"
"net/http"
"strings"
"time"

nlb "github.com/oracle/cluster-api-provider-oci/cloud/services/networkloadbalancer"
Expand All @@ -38,6 +39,7 @@ const (
CreatedBy = "CreatedBy"
OCIClusterAPIProvider = "OCIClusterAPIProvider"
ClusterResourceIdentifier = "ClusterResourceIdentifier"
OutOfHostCapacityErr = "Out of host capacity"
)

// ErrNotFound is for simulation during testing, OCI SDK does not have a way
Expand All @@ -58,6 +60,10 @@ func IsNotFound(err error) bool {
return ok && serviceErr.GetHTTPStatusCode() == http.StatusNotFound
}

// IsOutOfHostCapacity reports whether the message of err contains the
// OutOfHostCapacityErr text. A nil error is never treated as a capacity
// error.
func IsOutOfHostCapacity(err error) bool {
	if err == nil {
		// Calling Error() on a nil error value would panic, so bail out early.
		return false
	}
	return strings.Contains(err.Error(), OutOfHostCapacityErr)
}

// AwaitLBWorkRequest waits for the LB work request to either succeed or fail. See k8s.io/apimachinery/pkg/util/wait
func AwaitLBWorkRequest(ctx context.Context, networkLoadBalancerClient nlb.NetworkLoadBalancerClient, workRequestId *string) (*networkloadbalancer.WorkRequest, error) {
var wr *networkloadbalancer.WorkRequest
Expand Down
41 changes: 38 additions & 3 deletions cloud/scope/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,11 @@ import (
"encoding/base64"
"fmt"
"math/rand"
"sigs.k8s.io/cluster-api/util/conditions"
"strconv"
"time"

"sigs.k8s.io/cluster-api/util/conditions"

"github.com/oracle/cluster-api-provider-oci/cloud/services/vcn"

"github.com/go-logr/logr"
Expand Down Expand Up @@ -246,16 +247,50 @@ func (m *MachineScope) GetOrCreateMachine(ctx context.Context) (*core.Instance,
if (shapeConfig != core.LaunchInstanceShapeConfigDetails{}) {
launchDetails.ShapeConfig = &shapeConfig
}
if faultDomain != "" {
launchDetails.FaultDomain = common.String(faultDomain)
initialFaultDomain := faultDomain
adMap := m.OCICluster.Status.AvailabilityDomains[availabilityDomain]
if initialFaultDomain == "" {
// pick a random fault domain
rand.Seed(time.Now().UnixNano())
// rand.Intn(3) will produce a random number from 0(inclusive) to 3(exclusive)
faultDomainIndex := rand.Intn(3)
initialFaultDomain = adMap.FaultDomains[faultDomainIndex]
}

m.Logger.Info("Fault Domain being used", "fault-domain", initialFaultDomain)
m.Logger.Info("AD being used", "ad", availabilityDomain)

launchDetails.FaultDomain = common.String(initialFaultDomain)
if nsgId != nil {
launchDetails.CreateVnicDetails.NsgIds = []string{*nsgId}
}
req := core.LaunchInstanceRequest{LaunchInstanceDetails: launchDetails,
OpcRetryToken: ociutil.GetOPCRetryToken(string(m.OCIMachine.UID))}
resp, err := m.ComputeClient.LaunchInstance(ctx, req)
if err != nil {
// try other fault domains unless user specified a specific one
if ociutil.IsOutOfHostCapacity(err) && faultDomain != "" {
m.Logger.Info("The chosen fault domain did not have capacity, trying other fault domains")
for fdIndex, fd := range adMap.FaultDomains {
if fd != faultDomain {
m.Logger.Info("Fault Domain being used for retry", "fault-domain", fd)
launchDetails.FaultDomain = common.String(fd)
req := core.LaunchInstanceRequest{LaunchInstanceDetails: launchDetails,
OpcRetryToken: ociutil.GetOPCRetryToken(string(m.OCIMachine.UID))}
resp, err = m.ComputeClient.LaunchInstance(ctx, req)
if err != nil {
// if another out of host error comes, try other fault domains
// till we are out of fault domains, in which case return the last error
if ociutil.IsOutOfHostCapacity(err) && fdIndex != (len(adMap.FaultDomains)-1) {
continue
} else {
return nil, err
}
}
return &resp.Instance, nil
}
}
}
return nil, err
} else {
return &resp.Instance, nil
Expand Down
95 changes: 95 additions & 0 deletions cloud/scope/machine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client/fake"
)

// fdList holds the fault domain names shared by the tests in this file: it is
// assigned as the FaultDomains of every availability domain in setupAllParams
// and is the set of values accepted by anyFdMatcher.
var fdList = []string{
	"FAULT-DOMAIN-1",
	"FAULT-DOMAIN-2",
	"FAULT-DOMAIN-3",
}

func TestInstanceReconciliation(t *testing.T) {
var (
ms *MachineScope
Expand Down Expand Up @@ -277,6 +279,47 @@ func TestInstanceReconciliation(t *testing.T) {
})).Return(core.LaunchInstanceResponse{}, nil)
},
},
{
name: "try all fds",
errorExpected: true,
matchError: TestError{errorString: ociutil.OutOfHostCapacityErr},
testSpecificSetup: func(machineScope *MachineScope, computeClient *mock_compute.MockComputeClient) {
setupAllParams(ms)
computeClient.EXPECT().ListInstances(gomock.Any(), gomock.Eq(core.ListInstancesRequest{
DisplayName: common.String("name"),
CompartmentId: common.String("test"),
})).Return(core.ListInstancesResponse{}, nil)

computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error {
return instanceFDMatcher(request, "FAULT-DOMAIN-1")
})).Return(core.LaunchInstanceResponse{}, TestError{errorString: ociutil.OutOfHostCapacityErr})
computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error {
return instanceFDMatcher(request, "FAULT-DOMAIN-2")
})).Return(core.LaunchInstanceResponse{}, TestError{errorString: ociutil.OutOfHostCapacityErr})
computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error {
return instanceFDMatcher(request, "FAULT-DOMAIN-3")
})).Return(core.LaunchInstanceResponse{}, TestError{errorString: ociutil.OutOfHostCapacityErr})
},
},
{
name: "second fd works",
errorExpected: false,
matchError: TestError{errorString: ociutil.OutOfHostCapacityErr},
testSpecificSetup: func(machineScope *MachineScope, computeClient *mock_compute.MockComputeClient) {
setupAllParams(ms)
computeClient.EXPECT().ListInstances(gomock.Any(), gomock.Eq(core.ListInstancesRequest{
DisplayName: common.String("name"),
CompartmentId: common.String("test"),
})).Return(core.ListInstancesResponse{}, nil)

computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error {
return anyFdMatcher(request)
})).Return(core.LaunchInstanceResponse{}, TestError{errorString: ociutil.OutOfHostCapacityErr})
computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error {
return anyFdMatcher(request)
})).Return(core.LaunchInstanceResponse{}, nil)
},
},
{
name: "check compartment at cluster",
errorExpected: false,
Expand Down Expand Up @@ -341,6 +384,7 @@ func TestInstanceReconciliation(t *testing.T) {
BaselineOcpuUtilization: core.LaunchInstanceShapeConfigDetailsBaselineOcpuUtilization8,
},
AvailabilityDomain: common.String("ad2"),
FaultDomain: common.String("FAULT-DOMAIN-2"),
CompartmentId: common.String("test"),
IsPvEncryptionInTransitEnabled: common.Bool(true),
DefinedTags: map[string]map[string]interface{}{},
Expand Down Expand Up @@ -379,6 +423,7 @@ func TestInstanceReconciliation(t *testing.T) {
},
Shape: common.String("shape"),
AvailabilityDomain: common.String("ad2"),
FaultDomain: common.String("FAULT-DOMAIN-2"),
CompartmentId: common.String("test"),
IsPvEncryptionInTransitEnabled: common.Bool(true),
DefinedTags: map[string]map[string]interface{}{},
Expand Down Expand Up @@ -425,6 +470,7 @@ func TestInstanceReconciliation(t *testing.T) {
BaselineOcpuUtilization: core.LaunchInstanceShapeConfigDetailsBaselineOcpuUtilization8,
},
AvailabilityDomain: common.String("ad2"),
FaultDomain: common.String("FAULT-DOMAIN-2"),
CompartmentId: common.String("test"),
IsPvEncryptionInTransitEnabled: common.Bool(true),
DefinedTags: map[string]map[string]interface{}{},
Expand Down Expand Up @@ -473,6 +519,7 @@ func TestInstanceReconciliation(t *testing.T) {
BaselineOcpuUtilization: core.LaunchInstanceShapeConfigDetailsBaselineOcpuUtilization8,
},
AvailabilityDomain: common.String("ad2"),
FaultDomain: common.String("FAULT-DOMAIN-2"),
CompartmentId: common.String("test"),
IsPvEncryptionInTransitEnabled: common.Bool(true),
DefinedTags: map[string]map[string]interface{}{},
Expand Down Expand Up @@ -525,6 +572,7 @@ func TestInstanceReconciliation(t *testing.T) {
BaselineOcpuUtilization: core.LaunchInstanceShapeConfigDetailsBaselineOcpuUtilization8,
},
AvailabilityDomain: common.String("ad2"),
FaultDomain: common.String("FAULT-DOMAIN-2"),
CompartmentId: common.String("test"),
IsPvEncryptionInTransitEnabled: common.Bool(true),
DefinedTags: map[string]map[string]interface{}{},
Expand Down Expand Up @@ -578,6 +626,7 @@ func TestInstanceReconciliation(t *testing.T) {
BaselineOcpuUtilization: core.LaunchInstanceShapeConfigDetailsBaselineOcpuUtilization8,
},
AvailabilityDomain: common.String("ad2"),
FaultDomain: common.String("FAULT-DOMAIN-2"),
CompartmentId: common.String("test"),
IsPvEncryptionInTransitEnabled: common.Bool(true),
DefinedTags: map[string]map[string]interface{}{},
Expand Down Expand Up @@ -636,6 +685,30 @@ func instanceCompartmentIDMatcher(request interface{}, matchStr string) error {
return nil
}

// instanceFDMatcher verifies that request is a core.LaunchInstanceRequest
// whose fault domain equals matchStr.
//
// It returns nil on a match. It returns an error when request has a different
// type, when no fault domain is set on the request, or when the fault domain
// differs from matchStr.
func instanceFDMatcher(request interface{}, matchStr string) error {
	r, ok := request.(core.LaunchInstanceRequest)
	if !ok {
		return errors.New("expecting LaunchInstanceRequest type")
	}
	fd := r.LaunchInstanceDetails.FaultDomain
	// An unset fault domain is reported as a mismatch instead of being
	// dereferenced, which would panic inside the mock matcher.
	if fd == nil || *fd != matchStr {
		return fmt.Errorf("expecting fd as %s", matchStr)
	}
	return nil
}

// anyFdMatcher verifies that request is a core.LaunchInstanceRequest whose
// fault domain is one of the entries in fdList.
//
// It returns nil when the fault domain is found in fdList. It returns an error
// when request has a different type, when no fault domain is set on the
// request, or when the fault domain is not in fdList.
func anyFdMatcher(request interface{}) error {
	r, ok := request.(core.LaunchInstanceRequest)
	if !ok {
		return errors.New("expecting LaunchInstanceRequest type")
	}
	// An unset fault domain can never match; report it rather than
	// dereferencing a nil pointer.
	if r.FaultDomain == nil {
		return errors.New("invalid fd")
	}
	for _, f := range fdList {
		if f == *r.FaultDomain {
			return nil
		}
	}
	return errors.New("invalid fd")
}

func TestLBReconciliationCreation(t *testing.T) {
var (
ms *MachineScope
Expand Down Expand Up @@ -1304,6 +1377,7 @@ func setupAllParams(ms *MachineScope) {
"2": {
Attributes: map[string]string{
"AvailabilityDomain": "ad2",
"FaultDomain": "FAULT-DOMAIN-2",
},
},
"3": {
Expand All @@ -1312,6 +1386,17 @@ func setupAllParams(ms *MachineScope) {
},
},
}
ms.OCICluster.Status.AvailabilityDomains = map[string]infrastructurev1beta1.OCIAvailabilityDomain{
"ad1": {
FaultDomains: fdList,
},
"ad2": {
FaultDomains: fdList,
},
"ad3": {
FaultDomains: fdList,
},
}
ms.Machine.Spec.FailureDomain = common.String("2")
ms.OCICluster.Spec.NetworkSpec.Vcn.Subnets = []*infrastructurev1beta1.Subnet{
{
Expand All @@ -1323,3 +1408,13 @@ func setupAllParams(ms *MachineScope) {
ms.OCICluster.Spec.OCIResourceIdentifier = "resource_uid"
ms.OCIMachine.UID = "machineuid"
}

// TestError is a minimal error implementation for tests: it carries a fixed
// message so that mocked calls can return an error with known text.
type TestError struct {
	// errorString is the message returned by Error.
	errorString string
}

// Error returns the stored message, satisfying the built-in error interface.
func (e TestError) Error() string {
	msg := e.errorString
	return msg
}