From b53df268e1688254d8c94e835510e63eab6b8452 Mon Sep 17 00:00:00 2001 From: Shyam Radhakrishnan Date: Tue, 30 Aug 2022 12:49:17 +0530 Subject: [PATCH] Try all available fault domains in case of out of host capacity --- cloud/ociutil/ociutil.go | 6 +++ cloud/scope/machine.go | 41 ++++++++++++++-- cloud/scope/machine_test.go | 95 +++++++++++++++++++++++++++++++++++++ 3 files changed, 139 insertions(+), 3 deletions(-) diff --git a/cloud/ociutil/ociutil.go b/cloud/ociutil/ociutil.go index f42a9d6a..0613133e 100644 --- a/cloud/ociutil/ociutil.go +++ b/cloud/ociutil/ociutil.go @@ -20,6 +20,7 @@ import ( "context" "fmt" "net/http" + "strings" "time" nlb "github.com/oracle/cluster-api-provider-oci/cloud/services/networkloadbalancer" @@ -38,6 +39,7 @@ const ( CreatedBy = "CreatedBy" OCIClusterAPIProvider = "OCIClusterAPIProvider" ClusterResourceIdentifier = "ClusterResourceIdentifier" + OutOfHostCapacityErr = "Out of host capacity" ) // ErrNotFound is for simulation during testing, OCI SDK does not have a way @@ -58,6 +60,10 @@ func IsNotFound(err error) bool { return ok && serviceErr.GetHTTPStatusCode() == http.StatusNotFound } +func IsOutOfHostCapacity(err error) bool { + return strings.Contains(err.Error(), OutOfHostCapacityErr) +} + // AwaitLBWorkRequest waits for the LB work request to either succeed, fail. See k8s.io/apimachinery/pkg/util/wait func AwaitLBWorkRequest(ctx context.Context, networkLoadBalancerClient nlb.NetworkLoadBalancerClient, workRequestId *string) (*networkloadbalancer.WorkRequest, error) { var wr *networkloadbalancer.WorkRequest diff --git a/cloud/scope/machine.go b/cloud/scope/machine.go index da00c346..f3490f87 100644 --- a/cloud/scope/machine.go +++ b/cloud/scope/machine.go @@ -21,10 +21,11 @@ import ( "encoding/base64" "fmt" "math/rand" - "sigs.k8s.io/cluster-api/util/conditions" "strconv" "time" + "sigs.k8s.io/cluster-api/util/conditions" + "github.com/oracle/cluster-api-provider-oci/cloud/services/vcn" "github.com/go-logr/logr" @@ -246,9 +247,20 @@ func (m *MachineScope) GetOrCreateMachine(ctx context.Context) (*core.Instance, if (shapeConfig != core.LaunchInstanceShapeConfigDetails{}) { launchDetails.ShapeConfig = &shapeConfig } - if faultDomain != "" { - launchDetails.FaultDomain = common.String(faultDomain) + initialFaultDomain := faultDomain + adMap := m.OCICluster.Status.AvailabilityDomains[availabilityDomain] + if initialFaultDomain == "" { + // pick a random fault domain + rand.Seed(time.Now().UnixNano()) + // rand.Intn(3) will produce a random number from 0(inclusive) to 3(exclusive) + faultDomainIndex := rand.Intn(3) + initialFaultDomain = adMap.FaultDomains[faultDomainIndex] } + + m.Logger.Info("Fault Domain being used", "fault-domain", initialFaultDomain) + m.Logger.Info("AD being used", "ad", availabilityDomain) + + launchDetails.FaultDomain = common.String(initialFaultDomain) if nsgId != nil { launchDetails.CreateVnicDetails.NsgIds = []string{*nsgId} } @@ -256,6 +268,29 @@ func (m *MachineScope) GetOrCreateMachine(ctx context.Context) (*core.Instance, OpcRetryToken: ociutil.GetOPCRetryToken(string(m.OCIMachine.UID))} resp, err := m.ComputeClient.LaunchInstance(ctx, req) if err != nil { + // try other fault domains unless user specified a specific one + if ociutil.IsOutOfHostCapacity(err) && faultDomain != "" { + m.Logger.Info("The chosen fault domain did not have capacity, trying other fault domains") + for fdIndex, fd := range adMap.FaultDomains { + if fd != faultDomain { + m.Logger.Info("Fault Domain being used for retry", "fault-domain", fd) + launchDetails.FaultDomain = common.String(fd) + req := core.LaunchInstanceRequest{LaunchInstanceDetails: launchDetails, + OpcRetryToken: ociutil.GetOPCRetryToken(string(m.OCIMachine.UID))} + resp, err = m.ComputeClient.LaunchInstance(ctx, req) + if err != nil { + // if another out of host error comes, try other fault domains + // till we are out of fault domains, in which case return the last error + if ociutil.IsOutOfHostCapacity(err) && fdIndex != (len(adMap.FaultDomains)-1) { + continue + } else { + return nil, err + } + } + return &resp.Instance, nil + } + } + } return nil, err } else { return &resp.Instance, nil diff --git a/cloud/scope/machine_test.go b/cloud/scope/machine_test.go index f4edfd38..93c89346 100644 --- a/cloud/scope/machine_test.go +++ b/cloud/scope/machine_test.go @@ -40,6 +40,8 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client/fake" ) +var fdList = []string{"FAULT-DOMAIN-1", "FAULT-DOMAIN-2", "FAULT-DOMAIN-3"} + func TestInstanceReconciliation(t *testing.T) { var ( ms *MachineScope @@ -277,6 +279,47 @@ func TestInstanceReconciliation(t *testing.T) { })).Return(core.LaunchInstanceResponse{}, nil) }, }, + { + name: "try all fds", + errorExpected: true, + matchError: TestError{errorString: ociutil.OutOfHostCapacityErr}, + testSpecificSetup: func(machineScope *MachineScope, computeClient *mock_compute.MockComputeClient) { + setupAllParams(ms) + computeClient.EXPECT().ListInstances(gomock.Any(), gomock.Eq(core.ListInstancesRequest{ + DisplayName: common.String("name"), + CompartmentId: common.String("test"), + })).Return(core.ListInstancesResponse{}, nil) + + computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { + return instanceFDMatcher(request, "FAULT-DOMAIN-1") + })).Return(core.LaunchInstanceResponse{}, TestError{errorString: ociutil.OutOfHostCapacityErr}) + computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { + return instanceFDMatcher(request, "FAULT-DOMAIN-2") + })).Return(core.LaunchInstanceResponse{}, TestError{errorString: ociutil.OutOfHostCapacityErr}) + computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { + return instanceFDMatcher(request, "FAULT-DOMAIN-3") + })).Return(core.LaunchInstanceResponse{}, TestError{errorString: ociutil.OutOfHostCapacityErr}) + }, + }, + { + name: "second fd works", + errorExpected: false, + matchError: TestError{errorString: ociutil.OutOfHostCapacityErr}, + testSpecificSetup: func(machineScope *MachineScope, computeClient *mock_compute.MockComputeClient) { + setupAllParams(ms) + computeClient.EXPECT().ListInstances(gomock.Any(), gomock.Eq(core.ListInstancesRequest{ + DisplayName: common.String("name"), + CompartmentId: common.String("test"), + })).Return(core.ListInstancesResponse{}, nil) + + computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { + return anyFdMatcher(request) + })).Return(core.LaunchInstanceResponse{}, TestError{errorString: ociutil.OutOfHostCapacityErr}) + computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { + return anyFdMatcher(request) + })).Return(core.LaunchInstanceResponse{}, nil) + }, + }, { name: "check compartment at cluster", errorExpected: false, @@ -341,6 +384,7 @@ func TestInstanceReconciliation(t *testing.T) { BaselineOcpuUtilization: core.LaunchInstanceShapeConfigDetailsBaselineOcpuUtilization8, }, AvailabilityDomain: common.String("ad2"), + FaultDomain: common.String("FAULT-DOMAIN-2"), CompartmentId: common.String("test"), IsPvEncryptionInTransitEnabled: common.Bool(true), DefinedTags: map[string]map[string]interface{}{}, @@ -379,6 +423,7 @@ func TestInstanceReconciliation(t *testing.T) { }, Shape: common.String("shape"), AvailabilityDomain: common.String("ad2"), + FaultDomain: common.String("FAULT-DOMAIN-2"), CompartmentId: common.String("test"), IsPvEncryptionInTransitEnabled: common.Bool(true), DefinedTags: map[string]map[string]interface{}{}, @@ -425,6 +470,7 @@ func TestInstanceReconciliation(t *testing.T) { BaselineOcpuUtilization: core.LaunchInstanceShapeConfigDetailsBaselineOcpuUtilization8, }, AvailabilityDomain: common.String("ad2"), + FaultDomain: common.String("FAULT-DOMAIN-2"), CompartmentId: common.String("test"), IsPvEncryptionInTransitEnabled: common.Bool(true), DefinedTags: map[string]map[string]interface{}{}, @@ -473,6 +519,7 @@ func TestInstanceReconciliation(t *testing.T) { BaselineOcpuUtilization: core.LaunchInstanceShapeConfigDetailsBaselineOcpuUtilization8, }, AvailabilityDomain: common.String("ad2"), + FaultDomain: common.String("FAULT-DOMAIN-2"), CompartmentId: common.String("test"), IsPvEncryptionInTransitEnabled: common.Bool(true), DefinedTags: map[string]map[string]interface{}{}, @@ -525,6 +572,7 @@ func TestInstanceReconciliation(t *testing.T) { BaselineOcpuUtilization: core.LaunchInstanceShapeConfigDetailsBaselineOcpuUtilization8, }, AvailabilityDomain: common.String("ad2"), + FaultDomain: common.String("FAULT-DOMAIN-2"), CompartmentId: common.String("test"), IsPvEncryptionInTransitEnabled: common.Bool(true), DefinedTags: map[string]map[string]interface{}{}, @@ -578,6 +626,7 @@ func TestInstanceReconciliation(t *testing.T) { BaselineOcpuUtilization: core.LaunchInstanceShapeConfigDetailsBaselineOcpuUtilization8, }, AvailabilityDomain: common.String("ad2"), + FaultDomain: common.String("FAULT-DOMAIN-2"), CompartmentId: common.String("test"), IsPvEncryptionInTransitEnabled: common.Bool(true), DefinedTags: map[string]map[string]interface{}{}, @@ -636,6 +685,30 @@ func instanceCompartmentIDMatcher(request interface{}, matchStr string) error { return nil } +func instanceFDMatcher(request interface{}, matchStr string) error { + r, ok := request.(core.LaunchInstanceRequest) + if !ok { + return errors.New("expecting LaunchInstanceRequest type") + } + if *r.LaunchInstanceDetails.FaultDomain != matchStr { + return errors.New(fmt.Sprintf("expecting fd as %s", matchStr)) + } + return nil +} + +func anyFdMatcher(request interface{}) error { + r, ok := request.(core.LaunchInstanceRequest) + if !ok { + return errors.New("expecting LaunchInstanceRequest type") + } + for _, f := range fdList { + if f == *r.FaultDomain { + return nil + } + } + return errors.New(fmt.Sprintf("invalid fd")) +} + func TestLBReconciliationCreation(t *testing.T) { var ( ms *MachineScope @@ -1304,6 +1377,7 @@ func setupAllParams(ms *MachineScope) { "2": { Attributes: map[string]string{ "AvailabilityDomain": "ad2", + "FaultDomain": "FAULT-DOMAIN-2", }, }, "3": { @@ -1312,6 +1386,17 @@ func setupAllParams(ms *MachineScope) { }, }, } + ms.OCICluster.Status.AvailabilityDomains = map[string]infrastructurev1beta1.OCIAvailabilityDomain{ + "ad1": { + FaultDomains: fdList, + }, + "ad2": { + FaultDomains: fdList, + }, + "ad3": { + FaultDomains: fdList, + }, + } ms.Machine.Spec.FailureDomain = common.String("2") ms.OCICluster.Spec.NetworkSpec.Vcn.Subnets = []*infrastructurev1beta1.Subnet{ { @@ -1323,3 +1408,13 @@ func setupAllParams(ms *MachineScope) { ms.OCICluster.Spec.OCIResourceIdentifier = "resource_uid" ms.OCIMachine.UID = "machineuid" } + +// The error built-in interface type is the conventional interface for +// representing an error condition, with the nil value representing no error. +type TestError struct { + errorString string +} + +func (t TestError) Error() string { + return t.errorString +}