diff --git a/DESIGN.md b/DESIGN.md index 120bed221..2df851232 100644 --- a/DESIGN.md +++ b/DESIGN.md @@ -1,2 +1 @@ # Admiral - diff --git a/Makefile b/Makefile index 3d5ca6244..e430d3796 100644 --- a/Makefile +++ b/Makefile @@ -167,4 +167,4 @@ gen-yaml: cp ./install/sample/gtp_topology.yaml ./out/yaml/gtp_topology.yaml cp ./install/sample/grpc-client.yaml ./out/yaml/grpc-client.yaml cp ./install/prometheus/prometheus.yaml ./out/yaml/prometheus.yaml - cp ./install/scripts/*.sh ./out/scripts/ + cp ./install/scripts/*.sh ./out/scripts/ \ No newline at end of file diff --git a/admiral/cmd/admiral/cmd/root.go b/admiral/cmd/admiral/cmd/root.go index 8c1661ff0..dc211b589 100644 --- a/admiral/cmd/admiral/cmd/root.go +++ b/admiral/cmd/admiral/cmd/root.go @@ -133,6 +133,10 @@ func GetRootCmd(args []string) *cobra.Command { rootCmd.PersistentFlags().StringVar(¶ms.SecretResolverConfigPath, "secret_resolver_config_path", "/etc/config/resolver_config.yaml", "Path to the secret resolver config") rootCmd.PersistentFlags().BoolVar(¶ms.MetricsEnabled, "metrics", true, "Enable prometheus metrics collections") + rootCmd.PersistentFlags().StringVar(¶ms.AdmiralStateCheckerName,"admiral_state_checker_name","NoOPStateChecker","The value of the admiral_state_checker_name label to configure the DR Strategy for Admiral") + rootCmd.PersistentFlags().StringVar(¶ms.DRStateStoreConfigPath,"dr_state_store_config_path","","Location of config file which has details for data store. Ex:- Dynamo DB connection details") + rootCmd.PersistentFlags().StringVar(¶ms.ServiceEntryIPPrefix,"se_ip_prefix","240.0","IP prefix for the auto generated IPs for service entries. Only the first two octets: Eg- 240.0") + return rootCmd } diff --git a/admiral/pkg/apis/admiral/routes/handlers.go b/admiral/pkg/apis/admiral/routes/handlers.go index 736468196..033c0b3cf 100644 --- a/admiral/pkg/apis/admiral/routes/handlers.go +++ b/admiral/pkg/apis/admiral/routes/handlers.go @@ -4,6 +4,7 @@ import ( "encoding/json" "fmt" "net/http" + "strconv" "strings" "github.com/gorilla/mux" @@ -27,10 +28,37 @@ type IdentityServiceEntry struct { ClusterNames []string `json:"Clusters,omitempty"` } -func (opts *RouteOpts) ReturnSuccessGET(w http.ResponseWriter, r *http.Request) { - w.WriteHeader(200) - response := fmt.Sprintf("Heath check method called: %v, URI: %v, Method: %v\n", r.Host, r.RequestURI, r.Method) +/* +We expect the DNS health checker to include the query param checkifreadonly with value set to true. +The query param is used to check if the current Admiral instance is running in Active Mode or Passive Mode (also called read only mode). +If Running in passive mode, the health check returns 502 which forces DNS lookup to always return reference to Admiral in Active state. +*/ +func (opts *RouteOpts) ReturnSuccessGET(w http.ResponseWriter, r *http.Request) { + allQueryParams:= r.URL.Query() + checkIfReadOnlyStringVal := allQueryParams.Get("checkifreadonly") + //Remove all spaces + checkIfReadOnlyStringVal = strings.ReplaceAll(checkIfReadOnlyStringVal," ","") + // checkIfReadOnlyStringVal will be empty in case ""checkifreadonly" query param is not sent in the request. 
checkIfReadOnlyBoolVal will be false + checkIfReadOnlyBoolVal, err := strconv.ParseBool(checkIfReadOnlyStringVal) + var response string + if len(checkIfReadOnlyStringVal) ==0 || nil==err { + if checkIfReadOnlyBoolVal{ + admiralState := opts.RemoteRegistry.AdmiralState + if(*admiralState).ReadOnly{ + //Force fail health check if Admiral is in Readonly mode + w.WriteHeader(503) + }else { + w.WriteHeader(200) + } + }else { + w.WriteHeader(200) + } + response = fmt.Sprintf("Heath check method called: %v, URI: %v, Method: %v\n", r.Host, r.RequestURI, r.Method) + }else { + w.WriteHeader(400) + response = fmt.Sprintf("Health check method called with bad query param value %v for checkifreadonly",checkIfReadOnlyStringVal) + } _, writeErr := w.Write([]byte(response)) if writeErr != nil { log.Printf("Error writing body: %v", writeErr) diff --git a/admiral/pkg/clusters/DRUtil.go b/admiral/pkg/clusters/DRUtil.go new file mode 100644 index 000000000..832f8d45a --- /dev/null +++ b/admiral/pkg/clusters/DRUtil.go @@ -0,0 +1,47 @@ +package clusters + +import ( + "context" + "github.com/istio-ecosystem/admiral/admiral/pkg/controller/common" + log "github.com/sirupsen/logrus" + "strings" +) +const ReadWriteEnabled = false +const ReadOnlyEnabled = true; + +type AdmiralState struct { + ReadOnly bool +} + +type AdmiralStateChecker interface { + runStateCheck(ctx context.Context,as *AdmiralState) + shouldRunOnIndependentGoRoutine() bool +} +/* +Utility function to start Admiral DR checks. +DR checks can be run either on the main go routine or a new go routine +*/ +func RunAdmiralStateCheck(ctx context.Context,asc AdmiralStateChecker, as *AdmiralState){ + log.Infof("Starting DR checks") + if asc.shouldRunOnIndependentGoRoutine() { + log.Info("Starting Admiral State Checker on a new Go Routine") + go asc.runStateCheck(ctx,as) + }else { + log.Infof("Starting Admiral State Checker on existing Go Routine") + asc.runStateCheck(ctx,as) + } +} + +/* +utility function to identify the Admiral DR implementation based on the program parameters +*/ +func startAdmiralStateChecker (ctx context.Context,params common.AdmiralParams,as *AdmiralState){ + var admiralStateChecker AdmiralStateChecker + switch strings.ToLower(params.AdmiralStateCheckerName) { + case "dynamodbbasedstatechecker": + admiralStateChecker = DynamoDBBasedStateChecker{params.DRStateStoreConfigPath} + default: + admiralStateChecker = NoOPStateChecker{} + } + RunAdmiralStateCheck(ctx,admiralStateChecker,as) +} \ No newline at end of file diff --git a/admiral/pkg/clusters/DynamoDBBasedStateCheck.go b/admiral/pkg/clusters/DynamoDBBasedStateCheck.go new file mode 100644 index 000000000..ef13662c1 --- /dev/null +++ b/admiral/pkg/clusters/DynamoDBBasedStateCheck.go @@ -0,0 +1,109 @@ +package clusters + +import ( + "context" + log "github.com/sirupsen/logrus" + "time" +) + +/* +The skip lease pod can be used for testing DynamoDB based DR. +Update the podname field to "SKIP-LEASE-POD" to test Admiral pods in passive mode. 
+*/ +const SKIP_LEASE_CHECK_POD_NAME = "SKIP-LEASE-POD" + +type DynamoDBBasedStateChecker struct { + drConfigFileLocation string +} + +func (DynamoDBBasedStateChecker) shouldRunOnIndependentGoRoutine() bool{ + return true; +} + +/* +This method has the logic to update the ReadOnly field within the AdmiralState object based on the lease obtained on the shared lock object +The AdmiralState object is referenced everywhere in the code before trying to create/update/delete Istio custom objects + +Below is the logic for Admiral instance in Active state +1. Get the latest lease information from DynamoDB table +2. If the current pod owns the lease, update the last updated field with current timestamp +3. Update ReadOnly field to false. +4. Sleep for configured duration +5. Admiral instance which is constantly monitoring all the clusters for changes and is responsible to creating , updating and deleting the Istio custom objects +like Service Entry, Destination rule, Virtual Service , Sidecar and others. + +Below is the logic for Admiral instance in Passive state +1. Get the latest lease information from DynamoDB table +2. If the current pod does not own the lease, check if the last updated time field is within the configured wait threshold. +3. If the last updated time field is older than the computed threshold, update self as the owner of the lease with current timestamp as last updated time +4. If the last updated time field is within the computed threshold,mark current pod as read only +5. Sleep for configured duration +*/ +func (dr DynamoDBBasedStateChecker) runStateCheck(ctx context.Context,as *AdmiralState){ + as.ReadOnly = ReadOnlyEnabled + var dynamodbClient *DynamoClient + dynamoDBConfig,err := BuildDynamoDBConfig(dr.drConfigFileLocation) + if nil!= err { + log.Error("DynamoDR: Could not start DynamoDBBasedStateChecker ", err) + panic("Could not start DynamoDBBasedStateChecker") + } + dynamodbClient = NewDynamoClient(dynamoDBConfig) + waitDuration := time.Duration(dynamoDBConfig.WaitTimeInSeconds) * time.Second + ticker := time.NewTicker(waitDuration) + tickChan := ticker.C + + for { + select { + case <-ctx.Done(): + log.Infoln("DynamoDR: context done stopping ticker") + ticker.Stop() + + case <-tickChan: + ExecuteStateCheck(dynamoDBConfig, dynamodbClient, as) + } + } +} + +func ExecuteStateCheck(dynamoDBConfig DynamoDBConfig, dynamodbClient *DynamoClient ,as *AdmiralState){ + leaseName := dynamoDBConfig.LeaseName + podIdentifier := dynamoDBConfig.PodIdentifier + waitTimeInSeconds :=dynamoDBConfig.WaitTimeInSeconds + failureThreshold := dynamoDBConfig.FailureThreshold + log.Infof("DynamoDR: CurrentPod = %v LeaseName = %v WaitTime= %v sec tableName= %v role= %v region= %v" ,podIdentifier, leaseName, waitTimeInSeconds, dynamoDBConfig.TableName,dynamoDBConfig.Role,dynamoDBConfig.Region) + + currentTime := time.Now().UTC().Unix() + log.Infof("DynamoDR: Retrieving latest value of read write value for leaseName : %v , timestamp : %v " , leaseName,currentTime ) + readWriteLeases, err := dynamodbClient.getReadWriteLease() + if nil!=err{ + log.WithFields(log.Fields{ + "error": err.Error(), + }).Error("DynamoDR: Error retrieving the latest lease") + } + readWriteLease := filterOrCreateLeaseIfNotFound(readWriteLeases,leaseName) + if "" == readWriteLease.LeaseOwner { + log.Infof("DynamoDR: Lease with name=%v does not exist. 
Creating a new lease with owner=%v" , leaseName,podIdentifier) + readWriteLease.LeaseOwner = podIdentifier + readWriteLease.UpdatedTime = currentTime + dynamodbClient.updatedReadWriteLease(readWriteLease,dynamoDBConfig.TableName) + //Not updating read-write mode until we confirm this pod has the lease + }else if SKIP_LEASE_CHECK_POD_NAME == readWriteLease.LeaseOwner { + log.Info("DynamoDR: Lease held by skip lease check pod. Setting Admiral to read only mode") + as.ReadOnly = ReadOnlyEnabled; + }else if podIdentifier == readWriteLease.LeaseOwner { + as.ReadOnly = ReadWriteEnabled + log.Infof("DynamoDR: Lease with name=%v is owned by the current pod. Extending lease ownership till %v. Admiral will write",leaseName, currentTime) + readWriteLease.UpdatedTime = currentTime + dynamodbClient.updatedReadWriteLease(readWriteLease,dynamoDBConfig.TableName) + }else if readWriteLease.UpdatedTime < (currentTime - int64(waitTimeInSeconds*failureThreshold)){ + diffSecs := currentTime -readWriteLease.UpdatedTime + log.Infof("DynamoDR: Current time %v is more than the lastUpdated time of lease %v by %v sec. Taking over the lease from %v.",currentTime, readWriteLease.UpdatedTime,diffSecs, readWriteLease.LeaseOwner) + readWriteLease.LeaseOwner = podIdentifier + readWriteLease.UpdatedTime = currentTime + dynamodbClient.updatedReadWriteLease(readWriteLease,dynamoDBConfig.TableName) + //Not updating read-write mode until we confirm this pod has the lease + }else { + log.Infof("DynamoDR: Lease held by %v till %v . Admiral will not write ", readWriteLease.LeaseOwner, readWriteLease.UpdatedTime) + as.ReadOnly = ReadOnlyEnabled; + } + +} diff --git a/admiral/pkg/clusters/DynamoDBBasedStateCheck_test.go b/admiral/pkg/clusters/DynamoDBBasedStateCheck_test.go new file mode 100644 index 000000000..0cd5c19f0 --- /dev/null +++ b/admiral/pkg/clusters/DynamoDBBasedStateCheck_test.go @@ -0,0 +1,199 @@ +package clusters + +import ( + "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/service/dynamodb" + "github.com/aws/aws-sdk-go/service/dynamodb/dynamodbiface" + "github.com/stretchr/testify/assert" + "testing" +) + +type mockDynamoClient struct { + dynamodbiface.DynamoDBAPI +} + +func (m *mockDynamoClient) Scan(input *dynamodb.ScanInput) (*dynamodb.ScanOutput, error){ + output := &dynamodb.ScanOutput{ + Items: []map[string]*dynamodb.AttributeValue{ + {"leaseName": { S: aws.String("testLease1"),}, "leaseOwner": {S: aws.String("testPod"),},"notes": {S: aws.String("test1"),},"updatedTime": {N: aws.String("1655875287"),},}, + {"leaseName": { S: aws.String("testLease2"),}, "leaseOwner": {S: aws.String("someotherPod"),},"notes": {S: aws.String("test2"),},"updatedTime": {N: aws.String("9999999999"),},}, + {"leaseName": { S: aws.String("testLease3"),}, "leaseOwner": {S: aws.String("someOtherPod"),},"notes": {S: aws.String("test3"),},"updatedTime": {N: aws.String("11111"),},}, + {"leaseName": { S: aws.String("skipLease"),}, "leaseOwner": {S: aws.String("SKIP-LEASE-POD"),},"notes": {S: aws.String("test3"),},"updatedTime": {N: aws.String("11111"),},}, + }, + } + + return output, nil +} + + +func (m *mockDynamoClient) GetItem(input *dynamodb.GetItemInput) (*dynamodb.GetItemOutput, error){ + output := &dynamodb.GetItemOutput{ + Item: map[string]*dynamodb.AttributeValue{"accountNumber": { S: aws.String("123123123"),}, "roleName": {S: aws.String("PowerUser"),},}, + } + return output, nil +} + +func (m *mockDynamoClient) PutItem(input *dynamodb.PutItemInput) (*dynamodb.PutItemOutput, error) { + 
return &dynamodb.PutItemOutput{}, nil +} + + +func Test_RunStateCheckReadOnlyToReadWriteTransition(t *testing.T) { + dynamoDbConfig:=DynamoDBConfig{ + LeaseName: "testLease1", + PodIdentifier: "testPod", + WaitTimeInSeconds: 15, + FailureThreshold: 3, + TableName: "admiral-lease", + Role: "dummyRole", + Region: "us-west-2", + } + dynamodbClient := DynamoClient{ + &mockDynamoClient{}, + } + as:= &AdmiralState{ + ReadOnly: ReadOnlyEnabled, + } + ExecuteStateCheck(dynamoDbConfig,&dynamodbClient,as) + assert.Equal(t, as.ReadOnly, ReadWriteEnabled) +} + +func Test_RunStateCheckReadWriteToReadOnlyTransition(t *testing.T) { + dynamoDbConfig:=DynamoDBConfig{ + LeaseName: "testLease2", + PodIdentifier: "testPod", + WaitTimeInSeconds: 15, + FailureThreshold: 3, + TableName: "admiral-lease", + Role: "dummyRole", + Region: "us-west-2", + } + dynamodbClient := DynamoClient{ + &mockDynamoClient{}, + } + as:= &AdmiralState{ + ReadOnly: ReadWriteEnabled, + } + ExecuteStateCheck(dynamoDbConfig,&dynamodbClient,as) + assert.Equal(t, as.ReadOnly, ReadOnlyEnabled) +} + +func Test_RunStateCheckReadWriteToReadWrite(t *testing.T) { + dynamoDbConfig:=DynamoDBConfig{ + LeaseName: "testLease1", + PodIdentifier: "testPod", + WaitTimeInSeconds: 15, + FailureThreshold: 3, + TableName: "admiral-lease", + Role: "dummyRole", + Region: "us-west-2", + } + dynamodbClient := DynamoClient{ + &mockDynamoClient{}, + } + as:= &AdmiralState{ + ReadOnly: ReadWriteEnabled, + } + ExecuteStateCheck(dynamoDbConfig,&dynamodbClient,as) + assert.Equal(t, as.ReadOnly, ReadWriteEnabled) +} + +func Test_RunStateCheckReadOnlyToReadOnly(t *testing.T) { + dynamoDbConfig:=DynamoDBConfig{ + LeaseName: "testLease2", + PodIdentifier: "testPod", + WaitTimeInSeconds: 15, + FailureThreshold: 3, + TableName: "admiral-lease", + Role: "dummyRole", + Region: "us-west-2", + } + dynamodbClient := DynamoClient{ + &mockDynamoClient{}, + } + as:= &AdmiralState{ + ReadOnly: ReadOnlyEnabled, + } + ExecuteStateCheck(dynamoDbConfig,&dynamodbClient,as) + assert.Equal(t, as.ReadOnly, ReadOnlyEnabled) +} + +func Test_RunStateCheckReadOnlyModeGrabbingLock(t *testing.T){ + dynamoDbConfig:=DynamoDBConfig{ + LeaseName: "testLease3", + PodIdentifier: "testPod", + WaitTimeInSeconds: 15, + FailureThreshold: 3, + TableName: "admiral-lease", + Role: "dummyRole", + Region: "us-west-2", + } + dynamodbClient := DynamoClient{ + &mockDynamoClient{}, + } + as:= &AdmiralState{ + ReadOnly: ReadOnlyEnabled, + } + ExecuteStateCheck(dynamoDbConfig,&dynamodbClient,as) + assert.Equal(t, as.ReadOnly, ReadOnlyEnabled) +} + +func Test_RunStateCheckNewLockUseCase(t *testing.T){ + dynamoDbConfig:=DynamoDBConfig{ + LeaseName: "testnewlease", + PodIdentifier: "testPod", + WaitTimeInSeconds: 15, + FailureThreshold: 3, + TableName: "admiral-lease", + Role: "dummyRole", + Region: "us-west-2", + } + dynamodbClient := DynamoClient{ + &mockDynamoClient{}, + } + as:= &AdmiralState{ + ReadOnly: ReadOnlyEnabled, + } + ExecuteStateCheck(dynamoDbConfig,&dynamodbClient,as) + assert.Equal(t, as.ReadOnly, ReadOnlyEnabled) +} + +func Test_RunStateCheckReadWriteModeSkipLeaseTransition(t *testing.T){ + dynamoDbConfig:=DynamoDBConfig{ + LeaseName: "skipLease", + PodIdentifier: "testPod", + WaitTimeInSeconds: 15, + FailureThreshold: 3, + TableName: "admiral-lease", + Role: "dummyRole", + Region: "us-west-2", + } + dynamodbClient := DynamoClient{ + &mockDynamoClient{}, + } + as:= &AdmiralState{ + ReadOnly: ReadWriteEnabled, + } + ExecuteStateCheck(dynamoDbConfig,&dynamodbClient,as) + assert.Equal(t, as.ReadOnly, 
ReadOnlyEnabled) +} + +func Test_RunStateCheckReadOnlyModeSkipLeaseNoChange(t *testing.T){ + dynamoDbConfig:=DynamoDBConfig{ + LeaseName: "skipLease", + PodIdentifier: "testPod", + WaitTimeInSeconds: 15, + FailureThreshold: 3, + TableName: "admiral-lease", + Role: "dummyRole", + Region: "us-west-2", + } + dynamodbClient := DynamoClient{ + &mockDynamoClient{}, + } + as:= &AdmiralState{ + ReadOnly: ReadOnlyEnabled, + } + ExecuteStateCheck(dynamoDbConfig,&dynamodbClient,as) + assert.Equal(t, as.ReadOnly, ReadOnlyEnabled) +} \ No newline at end of file diff --git a/admiral/pkg/clusters/DynamoDBUtil.go b/admiral/pkg/clusters/DynamoDBUtil.go new file mode 100644 index 000000000..a72828917 --- /dev/null +++ b/admiral/pkg/clusters/DynamoDBUtil.go @@ -0,0 +1,170 @@ +package clusters + +import ( + "fmt" + "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/aws/credentials/stscreds" + "github.com/aws/aws-sdk-go/aws/session" + "github.com/aws/aws-sdk-go/service/dynamodb" + "github.com/aws/aws-sdk-go/service/dynamodb/dynamodbattribute" + "github.com/aws/aws-sdk-go/service/dynamodb/dynamodbiface" + log "github.com/sirupsen/logrus" + "gopkg.in/yaml.v2" + "io/ioutil" + "strconv" + "time" +) + +/* +Utility function to block the go-routine for duration specified +*/ +func sleep(sleepDuration time.Duration, sleepSeconds int){ + log.Info("Sleeping for ", sleepSeconds, " seconds") + time.Sleep(sleepDuration) +} + +/* +Utility function to filter lease from all the leases returned from DynamoDB +The DynamoDB table maybe used for multiple environments +*/ +func filterOrCreateLeaseIfNotFound(allLeases []ReadWriteLease, leaseName string) ReadWriteLease { + for _, readWriteLease := range allLeases { + if readWriteLease.LeaseName == leaseName { + return readWriteLease + } + } + readWriteLease := ReadWriteLease{} + readWriteLease.LeaseName = leaseName; + readWriteLease.Notes ="Created at "+strconv.FormatInt(time.Now().UTC().Unix(), 10) + return readWriteLease +} + + +type ReadWriteLease struct { + LeaseName string `json:"leaseName"` + LeaseOwner string `json:"leaseOwner"` + UpdatedTime int64 `json:"updatedTime"` + Notes string `json:"notes"` +} + +type DynamoClient struct { + svc dynamodbiface.DynamoDBAPI +} + +func NewDynamoClient(dynamoDBConfig DynamoDBConfig) *DynamoClient { + return &DynamoClient{ + svc: GetDynamoSvc(dynamoDBConfig.Role,dynamoDBConfig.Region), + } +} + +/* +Utility function to update lease duration . +This will be called in configured interval by Active instance +Passive instance calls this when it finds the existing Active instance has not udpated the lease within the duration specified. 
+*/ +func (client *DynamoClient) updatedReadWriteLease(lease ReadWriteLease, tableName string) error { + svc := client.svc + av, err := dynamodbattribute.MarshalMap(lease) + if err != nil { + log.WithFields(log.Fields{ + "error" : err.Error(), + }).Error("Error marshalling readWriteLease item.") + return err + } + + input := &dynamodb.PutItemInput{ + Item: av, + TableName: aws.String(tableName), + } + _, err = svc.PutItem(input) + if err != nil { + log.WithFields(log.Fields{ + "error": err.Error(), + }).Error("Got error calling PutItem:") + return err + } + log.WithFields(log.Fields{ + "leaseName": lease.LeaseName, + "leaseOwner": lease.LeaseOwner, + "updatedTime": lease.UpdatedTime, + "notes": lease.Notes, + }).Info("Successfully added item to table " + tableName) + + return err +} + +/* +Utility function to get all the entries from the Dynamo DB table +*/ +func (client *DynamoClient) getReadWriteLease() ([]ReadWriteLease, error) { + var readWriteLeases []ReadWriteLease + svc := client.svc + log.Info("Fetching existing readWrite entries...") + readWriteLeaseEntries, err := svc.Scan(&dynamodb.ScanInput{ + TableName: aws.String("admiral-lease"), + }) + if err != nil { + log.WithFields(log.Fields{ + "error": err.Error(), + }).Error("Failed to scan dynamo table") + return nil, err + } + + log.WithFields(log.Fields{ + "readWriteLeaseEntries": readWriteLeaseEntries, + }).Debug("retrieved records...") + + item := ReadWriteLease{} + + for _, v := range readWriteLeaseEntries.Items { + err = dynamodbattribute.UnmarshalMap(v, &item) + if err != nil { + log.WithFields(log.Fields{ + "error": err.Error(), + }).Panic("Failed to unmarshall record") + } + readWriteLeases = append(readWriteLeases, item) + } + return readWriteLeases, nil +} + +/* +Utility function to initialize AWS seassion for DynamoDB connection +*/ +func GetDynamoSvc(dynamoArn string,region string) *dynamodb.DynamoDB { + log.Info("dynamoArn: "+dynamoArn) + sess := session.Must(session.NewSession()) + // Create the credentials from AssumeRoleProvider to assume the role + // referenced by the "myRoleARN" ARN. + creds := stscreds.NewCredentials(sess, dynamoArn) + // Create a Session with a custom region + dynamoSession := session.Must(session.NewSession(&aws.Config{ + Credentials: creds, + Region: ®ion, + })) + // Create service client value configured for credentials + // from assumed role. + svc := dynamodb.New(dynamoSession) + return svc +} +/* +utility function to read the yaml file containing the DynamoDB configuration. +The file will be present inside the pod. File name should be provided as a program argument. 
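An example of the expected structure is included with this change at admiral/pkg/clusters/testdata/fake-dynamodb-config.yaml: a top-level dynamoDB block containing leaseName, podIdentifier, waitTimeInSeconds, failureThreshold, tableName, role and region, matching the fields of DynamoDBConfig.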
+*/ +func BuildDynamoDBConfig(configFile string) (DynamoDBConfig, error) { + + data, err := ioutil.ReadFile(configFile) + dynamoDBConfigWrapper := &DynamoDBConfigWrapper{} + + if err != nil { + return DynamoDBConfig{}, fmt.Errorf("error reading config file to build Dynamo DB config: %v", err) + } + + err = yaml.Unmarshal(data, &dynamoDBConfigWrapper) + + if err != nil { + return DynamoDBConfig{}, fmt.Errorf("error unmarshaling config file err: %v", err) + } + + return dynamoDBConfigWrapper.DynamoDBConfig,nil +} diff --git a/admiral/pkg/clusters/DynamoDBUtil_test.go b/admiral/pkg/clusters/DynamoDBUtil_test.go new file mode 100644 index 000000000..5535bc81b --- /dev/null +++ b/admiral/pkg/clusters/DynamoDBUtil_test.go @@ -0,0 +1,124 @@ +package clusters + +import ( + "errors" + "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/service/dynamodb" + "github.com/aws/aws-sdk-go/service/dynamodb/dynamodbiface" + "github.com/stretchr/testify/assert" + "testing" +) + +type mockDynamoDBClient struct { + dynamodbiface.DynamoDBAPI +} + +func (m *mockDynamoDBClient) Scan(input *dynamodb.ScanInput) (*dynamodb.ScanOutput, error){ + output := &dynamodb.ScanOutput{ + Items: []map[string]*dynamodb.AttributeValue{ + {"leaseName": { S: aws.String("qal"),}, "leaseOwner": {S: aws.String("qal-west"),},"notes": {S: aws.String("test1"),},"updatedTime": {N: aws.String("1655875287"),},}, + {"leaseName": { S: aws.String("e2e"),}, "leaseOwner": {S: aws.String("e2e-west"),},"notes": {S: aws.String("test2"),},"updatedTime": {N: aws.String("1655875287"),},}, + }, + } + + return output, nil +} + + +func (m *mockDynamoDBClient) GetItem(input *dynamodb.GetItemInput) (*dynamodb.GetItemOutput, error){ + output := &dynamodb.GetItemOutput{ + Item: map[string]*dynamodb.AttributeValue{"accountNumber": { S: aws.String("123123123"),}, "roleName": {S: aws.String("PowerUser"),},}, + } + return output, nil +} + +func (m *mockDynamoDBClient) PutItem(input *dynamodb.PutItemInput) (*dynamodb.PutItemOutput, error) { + return &dynamodb.PutItemOutput{}, nil +} + + +type mockDynamoDBClientWithErrors struct { + dynamodbiface.DynamoDBAPI +} + +func (m *mockDynamoDBClientWithErrors) GetItem(input *dynamodb.GetItemInput) (*dynamodb.GetItemOutput, error){ + return &dynamodb.GetItemOutput{}, errors.New("error occurred retrieving the item") +} + +func (m *mockDynamoDBClientWithErrors) Scan(input *dynamodb.ScanInput) (*dynamodb.ScanOutput, error){ + return &dynamodb.ScanOutput{}, errors.New("dynamodb scan error") +} + +func (m *mockDynamoDBClientWithErrors) PutItem(input *dynamodb.PutItemInput) (*dynamodb.PutItemOutput, error) { + return &dynamodb.PutItemOutput{}, errors.New("error occured adding record to dynamodb") +} + +func Test_GetReadWriteLease(t *testing.T) { + + DynamodbClient := DynamoClient{ + &mockDynamoDBClient{}, + } + readWriteRecords, err := DynamodbClient.getReadWriteLease() + + assert.Nil(t, err) + assert.Equal(t, len(readWriteRecords), 2) +} + +func Test_GetReadWriteLeaseError(t *testing.T) { + + DynamodbClient := DynamoClient{ + &mockDynamoDBClientWithErrors{}, + } + whitelistRecords, err := DynamodbClient.getReadWriteLease() + + assert.NotNil(t, err) + assert.Equal(t, len(whitelistRecords), 0) + assert.Equal(t, err.Error(), "dynamodb scan error") +} + +func Test_WriteWhitelistRecord(t *testing.T) { + + lease := ReadWriteLease{ + LeaseName: "qal", + LeaseOwner: "qal-west", + Notes: "tester", + UpdatedTime: 1655875287, + } + DynamodbClient := DynamoClient{ + &mockDynamoDBClient{}, + } + + 
err := DynamodbClient.updatedReadWriteLease(lease,"test-table") + + assert.Nil(t, err) +} + +func Test_WriteWhitelistRecordWithError(t *testing.T) { + + lease := ReadWriteLease{ + LeaseName: "qal", + LeaseOwner: "qal-west", + Notes: "tester", + UpdatedTime: 1655875287, + } + DynamodbClient := DynamoClient{ + &mockDynamoDBClientWithErrors{}, + } + + err := DynamodbClient.updatedReadWriteLease(lease,"test-table") + + assert.NotNil(t, err) +} + +func Test_BuildDynamoDBConfig(t *testing.T) { + dynamoDBConfig,err := BuildDynamoDBConfig("testdata/fake-dynamodb-config.yaml") + assert.Nil(t, err) + assert.NotNil(t, dynamoDBConfig) + assert.Equal(t,"somelease" ,dynamoDBConfig.LeaseName) + assert.Equal(t,"arn:aws:iam::someaccount:role/somerole" ,dynamoDBConfig.Role) + assert.Equal(t,"sometable" ,dynamoDBConfig.TableName) + assert.Equal(t,3 ,dynamoDBConfig.FailureThreshold) + assert.Equal(t,"somename" ,dynamoDBConfig.PodIdentifier) + assert.Equal(t,"us-west-2" ,dynamoDBConfig.Region) + assert.Equal(t,15 ,dynamoDBConfig.WaitTimeInSeconds) +} \ No newline at end of file diff --git a/admiral/pkg/clusters/DynamoDbTypes.go b/admiral/pkg/clusters/DynamoDbTypes.go new file mode 100644 index 000000000..ff745a3f7 --- /dev/null +++ b/admiral/pkg/clusters/DynamoDbTypes.go @@ -0,0 +1,18 @@ +package clusters + +type DynamoDBConfigWrapper struct { + DynamoDBConfig DynamoDBConfig `yaml:"dynamoDB,omitempty"` +} + +/* +Reference struct used to unmarshall the DynamoDB config present in the yaml config file +*/ +type DynamoDBConfig struct { + LeaseName string `yaml:"leaseName,omitempty"` + PodIdentifier string `yaml:"podIdentifier,omitempty"` + WaitTimeInSeconds int `yaml:"waitTimeInSeconds,omitempty"` + FailureThreshold int `yaml:"failureThreshold,omitempty"` + TableName string `yaml:"tableName,omitempty"` + Role string `yaml:"role,omitempty"` + Region string `yaml:"region,omitempty"` +} diff --git a/admiral/pkg/clusters/NoOpDR.go b/admiral/pkg/clusters/NoOpDR.go new file mode 100644 index 000000000..7062d317e --- /dev/null +++ b/admiral/pkg/clusters/NoOpDR.go @@ -0,0 +1,21 @@ +package clusters + +import ( + "fmt" + "context" +) +/* +Default implementation of the interface defined for DR +*/ + +type NoOPStateChecker struct {} + +func (NoOPStateChecker) shouldRunOnIndependentGoRoutine() bool{ + return false; +} + +func (NoOPStateChecker) runStateCheck(ctx context.Context,as *AdmiralState){ + fmt.Print("NoOP State Checker called. 
Marking Admiral state as Read/Write enabled") + as.ReadOnly = ReadWriteEnabled +} + diff --git a/admiral/pkg/clusters/handler.go b/admiral/pkg/clusters/handler.go index 8e61f72c8..e51b989d0 100644 --- a/admiral/pkg/clusters/handler.go +++ b/admiral/pkg/clusters/handler.go @@ -323,33 +323,15 @@ func handleDestinationRuleEvent(obj *v1alpha3.DestinationRule, dh *DestinationRu if event == common.Delete { - err := rc.DestinationRuleController.IstioClient.NetworkingV1alpha3().DestinationRules(syncNamespace).Delete(obj.Name, &v12.DeleteOptions{}) - if err != nil { - log.Infof(LogFormat, "Delete", "DestinationRule", obj.Name, clusterId, "success") - } else { - log.Error(LogFormat, err) - } - err = rc.ServiceEntryController.IstioClient.NetworkingV1alpha3().ServiceEntries(syncNamespace).Delete(seName, &v12.DeleteOptions{}) - if err != nil { - log.Infof(LogFormat, "Delete", "ServiceEntry", seName, clusterId, "success") - } else { - log.Error(LogFormat, err) - } + deleteDestinationRulePostStateCheck(rc,syncNamespace,obj.Name, clusterId,r.AdmiralState) + + deleteServiceEntriesPostStateCheck(rc,syncNamespace,seName,clusterId,r.AdmiralState) + for _, subset := range destinationRule.Subsets { sseName := seName + common.Dash + subset.Name - err = rc.ServiceEntryController.IstioClient.NetworkingV1alpha3().ServiceEntries(syncNamespace).Delete(sseName, &v12.DeleteOptions{}) - if err != nil { - log.Infof(LogFormat, "Delete", "ServiceEntry", sseName, clusterId, "success") - } else { - log.Error(LogFormat, err) - } - } - err = rc.DestinationRuleController.IstioClient.NetworkingV1alpha3().DestinationRules(syncNamespace).Delete(localDrName, &v12.DeleteOptions{}) - if err != nil { - log.Infof(LogFormat, "Delete", "DestinationRule", localDrName, clusterId, "success") - } else { - log.Error(LogFormat, err) + deleteServiceEntriesPostStateCheck(rc,syncNamespace,sseName,clusterId,r.AdmiralState) } + deleteDestinationRulePostStateCheck(rc,syncNamespace,localDrName,clusterId,r.AdmiralState) } else { @@ -357,7 +339,7 @@ func handleDestinationRuleEvent(obj *v1alpha3.DestinationRule, dh *DestinationRu //copy destination rule only to other clusters if dependentCluster != clusterId { - addUpdateDestinationRule(obj, exist, syncNamespace, rc) + addUpdateDestinationRule(obj, exist, syncNamespace, rc,r.AdmiralState) } for _seName, se := range drServiceEntries { @@ -367,7 +349,7 @@ func handleDestinationRuleEvent(obj *v1alpha3.DestinationRule, dh *DestinationRu log.Warnf(LogErrFormat, "Create", "ServiceEntry", seName, clusterId, err) } if newServiceEntry != nil { - addUpdateServiceEntry(newServiceEntry, existsServiceEntry, syncNamespace, rc) + addUpdateServiceEntry(newServiceEntry, existsServiceEntry, syncNamespace, rc,r.AdmiralState) r.AdmiralCache.SeClusterCache.Put(newServiceEntry.Spec.Hosts[0], rc.ClusterID, rc.ClusterID) } //cache the subset service entries for updating them later for pod events @@ -378,7 +360,7 @@ func handleDestinationRuleEvent(obj *v1alpha3.DestinationRule, dh *DestinationRu if dependentCluster == clusterId { //we need a destination rule with local fqdn for destination rules created with cnames to work in local cluster - createDestinationRuleForLocal(rc, localDrName, localIdentityId, clusterId, &destinationRule) + createDestinationRuleForLocal(rc, localDrName, localIdentityId, clusterId, &destinationRule,r.AdmiralState) } } @@ -392,22 +374,17 @@ func handleDestinationRuleEvent(obj *v1alpha3.DestinationRule, dh *DestinationRu for _, rc := range r.RemoteControllers { if rc.ClusterID != clusterId { if event == 
common.Delete { - err := rc.DestinationRuleController.IstioClient.NetworkingV1alpha3().DestinationRules(syncNamespace).Delete(obj.Name, &v12.DeleteOptions{}) - if err != nil { - log.Infof(LogErrFormat, "Delete", "DestinationRule", obj.Name, clusterId, err) - } else { - log.Infof(LogFormat, "Delete", "DestinationRule", obj.Name, clusterId, "Success") - } + deleteDestinationRulePostStateCheck(rc,syncNamespace,obj.Name,clusterId,r.AdmiralState) } else { exist, _ := rc.DestinationRuleController.IstioClient.NetworkingV1alpha3().DestinationRules(syncNamespace).Get(obj.Name, v12.GetOptions{}) - addUpdateDestinationRule(obj, exist, syncNamespace, rc) + addUpdateDestinationRule(obj, exist, syncNamespace, rc,r.AdmiralState) } } } } func createDestinationRuleForLocal(remoteController *RemoteController, localDrName string, identityId string, clusterId string, - destinationRule *v1alpha32.DestinationRule) { + destinationRule *v1alpha32.DestinationRule,admiralState *AdmiralState) { deployment := remoteController.DeploymentController.Cache.Get(identityId) @@ -442,7 +419,7 @@ func createDestinationRuleForLocal(remoteController *RemoteController, localDrNa newDestinationRule := createDestinationRuleSkeletion(*destinationRule, localDrName, syncNamespace) if newDestinationRule != nil { - addUpdateDestinationRule(newDestinationRule, existsDestinationRule, syncNamespace, remoteController) + addUpdateDestinationRule(newDestinationRule, existsDestinationRule, syncNamespace, remoteController,admiralState) } } } @@ -523,7 +500,7 @@ func handleVirtualServiceEvent(obj *v1alpha3.VirtualService, vh *VirtualServiceH } } - addUpdateVirtualService(obj, exist, syncNamespace, rc) + addUpdateVirtualService(obj, exist, syncNamespace, rc,r.AdmiralState) } } } @@ -537,31 +514,45 @@ func handleVirtualServiceEvent(obj *v1alpha3.VirtualService, vh *VirtualServiceH for _, rc := range r.RemoteControllers { if rc.ClusterID != clusterId { if event == common.Delete { - err := rc.VirtualServiceController.IstioClient.NetworkingV1alpha3().VirtualServices(syncNamespace).Delete(obj.Name, &v12.DeleteOptions{}) - if err != nil { - log.Infof(LogErrFormat, "Delete", "VirtualService", obj.Name, clusterId, err) + err:= deleteVirtualServicePostStateCheck(rc,syncNamespace,obj.Name,clusterId,r.AdmiralState) + if nil!= err { return err - } else { - log.Infof(LogFormat, "Delete", "VirtualService", obj.Name, clusterId, "Success") } } else { exist, _ := rc.VirtualServiceController.IstioClient.NetworkingV1alpha3().VirtualServices(syncNamespace).Get(obj.Name, v12.GetOptions{}) - addUpdateVirtualService(obj, exist, syncNamespace, rc) + addUpdateVirtualService(obj, exist, syncNamespace, rc,r.AdmiralState) } } } return nil } +/* +Add/Update Virtual service after checking if the current pod is in ReadOnly mode. +Virtual Service object is not added/updated if the current pod is in ReadOnly mode. 
+*/ -func addUpdateVirtualService(obj *v1alpha3.VirtualService, exist *v1alpha3.VirtualService, namespace string, rc *RemoteController) { +func addUpdateVirtualService(obj *v1alpha3.VirtualService, exist *v1alpha3.VirtualService, namespace string, rc *RemoteController,admiralState *AdmiralState) { var err error var op string if obj.Annotations == nil { obj.Annotations = map[string]string{} } obj.Annotations["app.kubernetes.io/created-by"] = "admiral" - if exist == nil || len(exist.Spec.Hosts) == 0 { + vsIsNew:= (exist == nil || len(exist.Spec.Hosts) == 0) + + // If current Admiral pod is in read-only mode, do not create/update/delete virtual service objects + if (*admiralState).ReadOnly { + if vsIsNew { + op = "Add" + }else { + op = "Update" + } + log.Infof(LogFormat, op, "VirtualService", obj.Name, rc.ClusterID, "Skipped as Admiral pod is in read only mode") + return + } + + if vsIsNew { obj.Namespace = namespace obj.ResourceVersion = "" _, err = rc.VirtualServiceController.IstioClient.NetworkingV1alpha3().VirtualServices(namespace).Create(obj) @@ -580,15 +571,31 @@ func addUpdateVirtualService(obj *v1alpha3.VirtualService, exist *v1alpha3.Virtu log.Infof(LogFormat, op, "VirtualService", obj.Name, rc.ClusterID, "Success") } } - -func addUpdateServiceEntry(obj *v1alpha3.ServiceEntry, exist *v1alpha3.ServiceEntry, namespace string, rc *RemoteController) { +/* +Add/Update Service Entry objects after checking if the current pod is in ReadOnly mode. +Service Entry object is not added/updated if the current pod is in ReadOnly mode. +*/ +func addUpdateServiceEntry(obj *v1alpha3.ServiceEntry, exist *v1alpha3.ServiceEntry, namespace string, rc *RemoteController, admiralState *AdmiralState) { var err error var op string if obj.Annotations == nil { obj.Annotations = map[string]string{} } obj.Annotations["app.kubernetes.io/created-by"] = "admiral" - if exist == nil || exist.Spec.Hosts == nil { + seIsNew:= (exist == nil || exist.Spec.Hosts == nil) + + // If current Admiral pod is in read-only mode, do not create/update/delete service entry objects + if (*admiralState).ReadOnly { + if seIsNew { + op = "Add" + }else { + op = "Update" + } + log.Infof(LogFormat, op, "ServiceEntry", obj.Name, rc.ClusterID, "Skipped as Admiral pod is in read only mode") + return + } + + if seIsNew { obj.Namespace = namespace obj.ResourceVersion = "" _, err = rc.ServiceEntryController.IstioClient.NetworkingV1alpha3().ServiceEntries(namespace).Create(obj) @@ -670,9 +677,17 @@ func getServiceEntryDiff(new *v1alpha3.ServiceEntry, old *v1alpha3.ServiceEntry) diff = buffer.String() return destructive, diff } +/* +Delete Service entry after checking if the current pod is in ReadOnly mode. +Service Entry object is not deleted if the current pod is in ReadOnly mode. 
+*/ -func deleteServiceEntry(exist *v1alpha3.ServiceEntry, namespace string, rc *RemoteController) { +func deleteServiceEntry(exist *v1alpha3.ServiceEntry, namespace string, rc *RemoteController,admiralState *AdmiralState) { if exist != nil { + if (*admiralState).ReadOnly { + log.Infof(LogFormat, "Delete", "ServiceEntry", exist.Name, rc.ClusterID, "Skipped as Admiral pod is in read only mode") + return + } err := rc.ServiceEntryController.IstioClient.NetworkingV1alpha3().ServiceEntries(namespace).Delete(exist.Name, &v12.DeleteOptions{}) if err != nil { log.Errorf(LogErrFormat, "Delete", "ServiceEntry", exist.Name, rc.ClusterID, err) @@ -682,14 +697,26 @@ func deleteServiceEntry(exist *v1alpha3.ServiceEntry, namespace string, rc *Remo } } -func addUpdateDestinationRule(obj *v1alpha3.DestinationRule, exist *v1alpha3.DestinationRule, namespace string, rc *RemoteController) { +func addUpdateDestinationRule(obj *v1alpha3.DestinationRule, exist *v1alpha3.DestinationRule, namespace string, rc *RemoteController,admiralState *AdmiralState) { var err error var op string if obj.Annotations == nil { obj.Annotations = map[string]string{} } obj.Annotations["app.kubernetes.io/created-by"] = "admiral" - if exist == nil || exist.Name == "" || exist.Spec.Host == "" { + // If current Admiral pod is in read-only mode, do not create/update/delete destination rule objects + drIsNew:=(exist == nil || exist.Name == "" || exist.Spec.Host == "") + if (*admiralState).ReadOnly { + if drIsNew { + op = "Add" + }else { + op = "Update" + } + log.Infof(LogFormat, op, "DestinationRule", obj.Name, rc.ClusterID, "Skipped as Admiral pod is in read only mode") + return + } + + if drIsNew { obj.Namespace = namespace obj.ResourceVersion = "" _, err = rc.DestinationRuleController.IstioClient.NetworkingV1alpha3().DestinationRules(namespace).Create(obj) @@ -708,9 +735,16 @@ func addUpdateDestinationRule(obj *v1alpha3.DestinationRule, exist *v1alpha3.Des log.Infof(LogFormat, op, "DestinationRule", obj.Name, rc.ClusterID, "Success") } } - -func deleteDestinationRule(exist *v1alpha3.DestinationRule, namespace string, rc *RemoteController) { +/* +Delete Destination rule after checking if the current pod is in ReadOnly mode. +Destination rule object is not deleted if the current pod is in ReadOnly mode. 
+*/ +func deleteDestinationRule(exist *v1alpha3.DestinationRule, namespace string, rc *RemoteController,admiralState *AdmiralState) { if exist != nil { + if (*admiralState).ReadOnly { + log.Infof(LogFormat, "Delete", "DestinationRule", exist.Name, rc.ClusterID, "Skipped as Admiral pod is in read only mode") + return + } err := rc.DestinationRuleController.IstioClient.NetworkingV1alpha3().DestinationRules(namespace).Delete(exist.Name, &v12.DeleteOptions{}) if err != nil { log.Errorf(LogErrFormat, "Delete", "DestinationRule", exist.Name, rc.ClusterID, err) @@ -945,3 +979,58 @@ func getServiceForRollout(rc *RemoteController, rollout *argo.Rollout) map[strin } return matchedServices } + +/* +Utility function to delete Destination rule after checking if the current Admiral instance is in ReadOnly mode +Destination rule is not deleted if the Admiral instance is in Readonly mode +*/ +func deleteDestinationRulePostStateCheck(rc *RemoteController,syncNamespace string, objName string, clusterId string, admiralState *AdmiralState){ + if(*admiralState).ReadOnly { + log.Infof(LogFormat, "Delete", "DestinationRule", objName, clusterId, "Skipped deleting as Admiral pod is in read only ") + return + } + err := rc.DestinationRuleController.IstioClient.NetworkingV1alpha3().DestinationRules(syncNamespace).Delete(objName, &v12.DeleteOptions{}) + + if err != nil { + log.Infof(LogErrFormat, "Delete", "DestinationRule", objName, clusterId, err) + } else { + log.Infof(LogFormat, "Delete", "DestinationRule", objName, clusterId, "Success") + } +} + +/* +Utility function to delete Service Entry after checking if the current Admiral instance is in ReadOnly mode +Service Entry is not deleted if the Admiral instance is in Readonly mode +*/ +func deleteServiceEntriesPostStateCheck(rc *RemoteController,syncNamespace string, objName string, clusterId string, admiralState *AdmiralState){ + if(*admiralState).ReadOnly { + log.Infof(LogFormat, "Delete", "ServiceEntry", objName, clusterId, "Skipped deleting as Admiral pod is in read only") + return + } + err := rc.ServiceEntryController.IstioClient.NetworkingV1alpha3().ServiceEntries(syncNamespace).Delete(objName, &v12.DeleteOptions{}) + if err != nil { + log.Infof(LogErrFormat, "Delete", "ServiceEntry", objName, clusterId, err) + } else { + log.Infof(LogFormat, "Delete", "ServiceEntry", objName, clusterId, "Success") + } + +} + +/* +Utility function to delete Virtual Service after checking if the current Admiral instance is in ReadOnly mode +Virtual Service is not deleted if the Admiral instance is in Readonly mode +*/ +func deleteVirtualServicePostStateCheck(rc *RemoteController,syncNamespace string, objName string, clusterId string, admiralState *AdmiralState) (e error){ + if(*admiralState).ReadOnly { + log.Infof(LogFormat, "Delete", "VirtualService", objName, clusterId, "Skipped deleting as Admiral pod is in read only") + return nil + } + err := rc.VirtualServiceController.IstioClient.NetworkingV1alpha3().VirtualServices(syncNamespace).Delete(objName, &v12.DeleteOptions{}) + if err != nil { + log.Infof(LogErrFormat, "Delete", "VirtualService", objName, clusterId, err) + return err + } else { + log.Infof(LogFormat, "Delete", "VirtualService", objName, clusterId, "Success") + } + return nil +} diff --git a/admiral/pkg/clusters/handler_test.go b/admiral/pkg/clusters/handler_test.go index b49e0bc8b..dca4109ec 100644 --- a/admiral/pkg/clusters/handler_test.go +++ b/admiral/pkg/clusters/handler_test.go @@ -645,7 +645,9 @@ func TestHandleVirtualServiceEvent(t *testing.T) { //Run 
the test for every provided case for _, c := range testCases { t.Run(c.name, func(t *testing.T) { - + c.handler.RemoteRegistry.AdmiralState =&AdmiralState{ + ReadOnly: ReadOnlyEnabled, + } err := handleVirtualServiceEvent(c.vs, c.handler, c.event, common.VirtualService) if err != c.expectedError { t.Fatalf("Error mismatch, expected %v but got %v", c.expectedError, err) @@ -1524,7 +1526,7 @@ func TestAddUpdateServiceEntry(t *testing.T) { //Run the test for every provided case for _, c := range testCases { t.Run(c.name, func(t *testing.T) { - addUpdateServiceEntry(c.newSe, c.oldSe, "namespace", c.rc) + addUpdateServiceEntry(c.newSe, c.oldSe, "namespace", c.rc,&AdmiralState{ReadWriteEnabled}) if c.skipDestructive { //verify the update did not go through se, _ := c.rc.ServiceEntryController.IstioClient.NetworkingV1alpha3().ServiceEntries("namespace").Get(c.oldSe.Name, v12.GetOptions{}) diff --git a/admiral/pkg/clusters/registry.go b/admiral/pkg/clusters/registry.go index c8265b623..945175254 100644 --- a/admiral/pkg/clusters/registry.go +++ b/admiral/pkg/clusters/registry.go @@ -21,14 +21,19 @@ const ( LogErrFormat = "op=%s type=%v name=%v cluster=%s, e=%v" ) + func InitAdmiral(ctx context.Context, params common.AdmiralParams) (*RemoteRegistry, error) { log.Infof("Initializing Admiral with params: %v", params) common.InitializeConfig(params) + as:= AdmiralState{ReadOnlyEnabled} + startAdmiralStateChecker(ctx,params,&as) + w := RemoteRegistry{ ctx: ctx, + AdmiralState: &as, } wd := DependencyHandler{ @@ -67,7 +72,7 @@ func InitAdmiral(ctx context.Context, params common.AdmiralParams) (*RemoteRegis log.Info("argo rollouts disabled") } - configMapController, err := admiral.NewConfigMapController() + configMapController, err := admiral.NewConfigMapController(params.ServiceEntryIPPrefix) if err != nil { return nil, fmt.Errorf(" Error with configmap controller init: %v", err) } diff --git a/admiral/pkg/clusters/registry_test.go b/admiral/pkg/clusters/registry_test.go index e05fa7753..de222c202 100644 --- a/admiral/pkg/clusters/registry_test.go +++ b/admiral/pkg/clusters/registry_test.go @@ -153,7 +153,7 @@ func TestCreateDestinationRuleForLocalNoDeployLabel(t *testing.T) { }, } - createDestinationRuleForLocal(&rc, "local.name", "identity", "cluster1", &des) + createDestinationRuleForLocal(&rc, "local.name", "identity", "cluster1", &des,&AdmiralState{ReadWriteEnabled}) } @@ -175,7 +175,7 @@ func TestCreateDestinationRuleForLocal(t *testing.T) { }, } - createDestinationRuleForLocal(rc, "local.name", "bar", "cluster1", &des) + createDestinationRuleForLocal(rc, "local.name", "bar", "cluster1", &des,&AdmiralState{ReadWriteEnabled}) } diff --git a/admiral/pkg/clusters/serviceentry.go b/admiral/pkg/clusters/serviceentry.go index bb7a17ef9..5476f7dc6 100644 --- a/admiral/pkg/clusters/serviceentry.go +++ b/admiral/pkg/clusters/serviceentry.go @@ -175,7 +175,7 @@ func modifyServiceEntryForNewServiceOrPod(event admiral.EventType, env string, s for key, serviceEntry := range serviceEntries { if len(serviceEntry.Endpoints) == 0 { AddServiceEntriesWithDr(remoteRegistry.AdmiralCache, map[string]string{sourceCluster: sourceCluster}, remoteRegistry.RemoteControllers, - map[string]*networking.ServiceEntry{key: serviceEntry}) + map[string]*networking.ServiceEntry{key: serviceEntry},remoteRegistry.AdmiralState) } clusterIngress, _ := rc.ServiceController.Cache.GetLoadBalancer(common.GetAdmiralParams().LabelSet.GatewayApp, common.NamespaceIstioSystem) for _, ep := range serviceEntry.Endpoints { @@ -187,7 +187,7 @@ func 
modifyServiceEntryForNewServiceOrPod(event admiral.EventType, env string, s updateEndpointsForBlueGreen(sourceRollouts[sourceCluster], sourceWeightedServices[sourceCluster], cnames, ep, sourceCluster, key) AddServiceEntriesWithDr(remoteRegistry.AdmiralCache, map[string]string{sourceCluster: sourceCluster}, remoteRegistry.RemoteControllers, - map[string]*networking.ServiceEntry{key: serviceEntry}) + map[string]*networking.ServiceEntry{key: serviceEntry},remoteRegistry.AdmiralState) //swap it back to use for next iteration ep.Address = clusterIngress ep.Ports = oldPorts @@ -197,13 +197,13 @@ func modifyServiceEntryForNewServiceOrPod(event admiral.EventType, env string, s var se = copyServiceEntry(serviceEntry) updateEndpointsForWeightedServices(se, sourceWeightedServices[sourceCluster], clusterIngress, meshPorts) AddServiceEntriesWithDr(remoteRegistry.AdmiralCache, map[string]string{sourceCluster: sourceCluster}, remoteRegistry.RemoteControllers, - map[string]*networking.ServiceEntry{key: se}) + map[string]*networking.ServiceEntry{key: se},remoteRegistry.AdmiralState) } else { ep.Address = localFqdn oldPorts := ep.Ports ep.Ports = meshPorts AddServiceEntriesWithDr(remoteRegistry.AdmiralCache, map[string]string{sourceCluster: sourceCluster}, remoteRegistry.RemoteControllers, - map[string]*networking.ServiceEntry{key: serviceEntry}) + map[string]*networking.ServiceEntry{key: serviceEntry},remoteRegistry.AdmiralState) //swap it back to use for next iteration ep.Address = clusterIngress ep.Ports = oldPorts @@ -213,7 +213,7 @@ func modifyServiceEntryForNewServiceOrPod(event admiral.EventType, env string, s } if common.GetWorkloadSidecarUpdate() == "enabled" { - modifySidecarForLocalClusterCommunication(serviceInstance.Namespace, remoteRegistry.AdmiralCache.DependencyNamespaceCache.Get(sourceIdentity), rc) + modifySidecarForLocalClusterCommunication(serviceInstance.Namespace, remoteRegistry.AdmiralCache.DependencyNamespaceCache.Get(sourceIdentity), rc,remoteRegistry.AdmiralState) } for _, val := range dependents { @@ -235,7 +235,7 @@ func modifyServiceEntryForNewServiceOrPod(event admiral.EventType, env string, s remoteRegistry.AdmiralCache.CnameDependentClusterCache.Put(cname, clusterId, clusterId) } - AddServiceEntriesWithDr(remoteRegistry.AdmiralCache, dependentClusters, remoteRegistry.RemoteControllers, serviceEntries) + AddServiceEntriesWithDr(remoteRegistry.AdmiralCache, dependentClusters, remoteRegistry.RemoteControllers, serviceEntries,remoteRegistry.AdmiralState) util.LogElapsedTimeSince("WriteServiceEntryToDependentClusters", sourceIdentity, env, "", start) @@ -330,7 +330,7 @@ func updateEndpointsForWeightedServices(serviceEntry *networking.ServiceEntry, w serviceEntry.Endpoints = endpoints } -func modifySidecarForLocalClusterCommunication(sidecarNamespace string, sidecarEgressMap map[string]common.SidecarEgress, rc *RemoteController) { +func modifySidecarForLocalClusterCommunication(sidecarNamespace string, sidecarEgressMap map[string]common.SidecarEgress, rc *RemoteController,admiralState *AdmiralState) { //get existing sidecar from the cluster sidecarConfig := rc.SidecarController @@ -369,21 +369,25 @@ func modifySidecarForLocalClusterCommunication(sidecarNamespace string, sidecarE //insert into cluster if newSidecarConfig != nil { - addUpdateSidecar(newSidecarConfig, sidecar, sidecarNamespace, rc) + addUpdateSidecar(newSidecarConfig, sidecar, sidecarNamespace, rc,admiralState) } } -func addUpdateSidecar(obj *v1alpha3.Sidecar, exist *v1alpha3.Sidecar, namespace string, rc 
*RemoteController) { +func addUpdateSidecar(obj *v1alpha3.Sidecar, exist *v1alpha3.Sidecar, namespace string, rc *RemoteController, admiralState *AdmiralState) { var err error exist.Labels = obj.Labels exist.Annotations = obj.Annotations exist.Spec = obj.Spec + if(*admiralState).ReadOnly { + log.Infof(LogErrFormat, "Update", "Sidecar", obj.Name, rc.ClusterID, "Skipped as Admiral pod is in read only mode") + return + } _, err = rc.SidecarController.IstioClient.NetworkingV1alpha3().Sidecars(namespace).Update(exist) if err != nil { - log.Errorf(LogErrFormat, "Update", "Sidecar", obj.Name, rc.ClusterID, err) + log.Infof(LogErrFormat, "Update", "Sidecar", obj.Name, rc.ClusterID, err) } else { - log.Infof(LogFormat, "Update", "Sidecar", obj.Name, rc.ClusterID, "Success") + log.Infof(LogErrFormat, "Update", "Sidecar", obj.Name, rc.ClusterID, "Success") } } @@ -439,7 +443,7 @@ func createSeWithDrLabels(remoteController *RemoteController, localCluster bool, } //This will create the default service entries and also additional ones specified in GTP -func AddServiceEntriesWithDr(cache *AdmiralCache, sourceClusters map[string]string, rcs map[string]*RemoteController, serviceEntries map[string]*networking.ServiceEntry) { +func AddServiceEntriesWithDr(cache *AdmiralCache, sourceClusters map[string]string, rcs map[string]*RemoteController, serviceEntries map[string]*networking.ServiceEntry,admiralState *AdmiralState) { syncNamespace := common.GetSyncNamespace() for _, se := range serviceEntries { @@ -490,22 +494,22 @@ func AddServiceEntriesWithDr(cache *AdmiralCache, sourceClusters map[string]stri } if len(seDr.ServiceEntry.Endpoints) == 0 { - deleteServiceEntry(oldServiceEntry, syncNamespace, rc) + deleteServiceEntry(oldServiceEntry, syncNamespace, rc,admiralState) cache.SeClusterCache.Delete(seDr.ServiceEntry.Hosts[0]) // after deleting the service entry, destination rule also need to be deleted if the service entry host no longer exists - deleteDestinationRule(oldDestinationRule, syncNamespace, rc) + deleteDestinationRule(oldDestinationRule, syncNamespace, rc,admiralState) } else { newServiceEntry := createServiceEntrySkeletion(*seDr.ServiceEntry, seDr.SeName, syncNamespace) if newServiceEntry != nil { newServiceEntry.Annotations = map[string]string{common.GetWorkloadIdentifier(): fmt.Sprintf("%v", identityId)} - addUpdateServiceEntry(newServiceEntry, oldServiceEntry, syncNamespace, rc) + addUpdateServiceEntry(newServiceEntry, oldServiceEntry, syncNamespace, rc,admiralState) cache.SeClusterCache.Put(newServiceEntry.Spec.Hosts[0], rc.ClusterID, rc.ClusterID) } newDestinationRule := createDestinationRuleSkeletion(*seDr.DestinationRule, seDr.DrName, syncNamespace) // if event was deletion when this function was called, then GlobalTrafficCache should already deleted the cache globalTrafficPolicy is an empty shell object - addUpdateDestinationRule(newDestinationRule, oldDestinationRule, syncNamespace, rc) + addUpdateDestinationRule(newDestinationRule, oldDestinationRule, syncNamespace, rc,admiralState) } } } @@ -633,7 +637,7 @@ func GenerateNewAddressAndAddToConfigMap(seName string, configMapController admi secondIndex := (len(newAddressState.Addresses) / 255) + 10 firstIndex := (len(newAddressState.Addresses) % 255) + 1 - address := common.LocalAddressPrefix + common.Sep + strconv.Itoa(secondIndex) + common.Sep + strconv.Itoa(firstIndex) + address := configMapController.GetIPPrefixForServiceEntries() + common.Sep + strconv.Itoa(secondIndex) + common.Sep + strconv.Itoa(firstIndex) for 
util.Contains(newAddressState.Addresses, address) { if firstIndex < 255 { @@ -642,7 +646,7 @@ func GenerateNewAddressAndAddToConfigMap(seName string, configMapController admi secondIndex++ firstIndex = 0 } - address = common.LocalAddressPrefix + common.Sep + strconv.Itoa(secondIndex) + common.Sep + strconv.Itoa(firstIndex) + address = configMapController.GetIPPrefixForServiceEntries() + common.Sep + strconv.Itoa(secondIndex) + common.Sep + strconv.Itoa(firstIndex) } newAddressState.Addresses = append(newAddressState.Addresses, address) newAddressState.EntryAddresses[seName] = address diff --git a/admiral/pkg/clusters/serviceentry_test.go b/admiral/pkg/clusters/serviceentry_test.go index 8d00b93de..8a5cd8bb4 100644 --- a/admiral/pkg/clusters/serviceentry_test.go +++ b/admiral/pkg/clusters/serviceentry_test.go @@ -125,8 +125,8 @@ func TestAddServiceEntriesWithDr(t *testing.T) { }, } - AddServiceEntriesWithDr(&admiralCache, map[string]string{"cl1": "cl1"}, map[string]*RemoteController{"cl1": rc}, map[string]*istionetworkingv1alpha3.ServiceEntry{"se1": &se}) - AddServiceEntriesWithDr(&admiralCache, map[string]string{"cl1": "cl1"}, map[string]*RemoteController{"cl1": rc}, map[string]*istionetworkingv1alpha3.ServiceEntry{"se1": &emptyEndpointSe}) + AddServiceEntriesWithDr(&admiralCache, map[string]string{"cl1": "cl1"}, map[string]*RemoteController{"cl1": rc}, map[string]*istionetworkingv1alpha3.ServiceEntry{"se1": &se},&AdmiralState{ReadWriteEnabled}) + AddServiceEntriesWithDr(&admiralCache, map[string]string{"cl1": "cl1"}, map[string]*RemoteController{"cl1": rc}, map[string]*istionetworkingv1alpha3.ServiceEntry{"se1": &emptyEndpointSe},&AdmiralState{ReadWriteEnabled}) } func TestCreateSeAndDrSetFromGtp(t *testing.T) { @@ -505,7 +505,7 @@ func TestModifyNonExistingSidecarForLocalClusterCommunication(t *testing.T) { sidecarEgressMap := make(map[string]common.SidecarEgress) sidecarEgressMap["test-dependency-namespace"] = common.SidecarEgress{Namespace: "test-dependency-namespace", FQDN: "test-local-fqdn"} - modifySidecarForLocalClusterCommunication("test-sidecar-namespace", sidecarEgressMap, remoteController) + modifySidecarForLocalClusterCommunication("test-sidecar-namespace", sidecarEgressMap, remoteController,&AdmiralState{ReadWriteEnabled}) sidecarObj, _ := sidecarController.IstioClient.NetworkingV1alpha3().Sidecars("test-sidecar-namespace").Get(common.GetWorkloadSidecarName(), v12.GetOptions{}) @@ -540,7 +540,7 @@ func TestModifyExistingSidecarForLocalClusterCommunication(t *testing.T) { sidecarEgressMap := make(map[string]common.SidecarEgress) sidecarEgressMap["test-dependency-namespace"] = common.SidecarEgress{Namespace: "test-dependency-namespace", FQDN: "test-local-fqdn", CNAMEs: map[string]string{"test.myservice.global": "1"}} - modifySidecarForLocalClusterCommunication("test-sidecar-namespace", sidecarEgressMap, remoteController) + modifySidecarForLocalClusterCommunication("test-sidecar-namespace", sidecarEgressMap, remoteController,&AdmiralState{ReadWriteEnabled}) updatedSidecar, err := sidecarController.IstioClient.NetworkingV1alpha3().Sidecars("test-sidecar-namespace").Get("default", v12.GetOptions{}) diff --git a/admiral/pkg/clusters/testdata/fake-dynamodb-config.yaml b/admiral/pkg/clusters/testdata/fake-dynamodb-config.yaml new file mode 100644 index 000000000..5598ce06c --- /dev/null +++ b/admiral/pkg/clusters/testdata/fake-dynamodb-config.yaml @@ -0,0 +1,8 @@ +dynamoDB: + leaseName: somelease + podIdentifier: somename + waitTimeInSeconds: 15 + failureThreshold: 3 + tableName: 
sometable + role: arn:aws:iam::someaccount:role/somerole + region: us-west-2 \ No newline at end of file diff --git a/admiral/pkg/clusters/types.go b/admiral/pkg/clusters/types.go index 5e2b79bc5..c0693ca64 100644 --- a/admiral/pkg/clusters/types.go +++ b/admiral/pkg/clusters/types.go @@ -59,6 +59,7 @@ type RemoteRegistry struct { secretClient k8s.Interface ctx context.Context AdmiralCache *AdmiralCache + AdmiralState *AdmiralState } func (r *RemoteRegistry) shutdown() { diff --git a/admiral/pkg/controller/admiral/configmap.go b/admiral/pkg/controller/admiral/configmap.go index 76f53873c..7f730c805 100644 --- a/admiral/pkg/controller/admiral/configmap.go +++ b/admiral/pkg/controller/admiral/configmap.go @@ -15,18 +15,21 @@ const configmapName = "se-address-configmap" type ConfigMapControllerInterface interface { GetConfigMap() (*v1.ConfigMap, error) PutConfigMap(newMap *v1.ConfigMap) error + GetIPPrefixForServiceEntries()(seIPPrefix string) } type ConfigMapController struct { K8sClient kubernetes.Interface ConfigmapNamespace string + ServiceEntryIPPrefix string } //todo this is a temp state, eventually changes will have to be made to give each cluster it's own configmap -func NewConfigMapController() (*ConfigMapController, error) { +func NewConfigMapController(seIPPrefix string) (*ConfigMapController, error) { kubeconfigPath := common.GetKubeconfigPath() namespaceToUse := common.GetSyncNamespace() + if kubeconfigPath == "" { config, err := rest.InClusterConfig() if err != nil { @@ -39,6 +42,7 @@ func NewConfigMapController() (*ConfigMapController, error) { controller := ConfigMapController{ K8sClient: client, ConfigmapNamespace: namespaceToUse, + ServiceEntryIPPrefix: seIPPrefix, } return &controller, nil } else { @@ -56,6 +60,7 @@ func NewConfigMapController() (*ConfigMapController, error) { controller := ConfigMapController{ K8sClient: client, ConfigmapNamespace: namespaceToUse, + ServiceEntryIPPrefix: seIPPrefix, } return &controller, nil } @@ -85,3 +90,7 @@ func (c *ConfigMapController) PutConfigMap(newMap *v1.ConfigMap) error { _, err := c.K8sClient.CoreV1().ConfigMaps(c.ConfigmapNamespace).Update(newMap) return err } + +func (c *ConfigMapController)GetIPPrefixForServiceEntries() (string) { + return c.ServiceEntryIPPrefix +} diff --git a/admiral/pkg/controller/admiral/configmap_test.go b/admiral/pkg/controller/admiral/configmap_test.go index fefe55863..cbb2eb364 100644 --- a/admiral/pkg/controller/admiral/configmap_test.go +++ b/admiral/pkg/controller/admiral/configmap_test.go @@ -121,7 +121,7 @@ func TestNewConfigMapController(t *testing.T) { for _, c := range testCases { t.Run(c.name, func(t *testing.T) { common.SetKubeconfigPath(c.kubeconfigPath) - controller, err := NewConfigMapController() + controller, err := NewConfigMapController("240.0") if err == nil && c.expectedError == nil { //only do these in an error-less context if c.namespace != controller.ConfigmapNamespace { diff --git a/admiral/pkg/controller/common/types.go b/admiral/pkg/controller/common/types.go index 97d91d871..68f7a697b 100644 --- a/admiral/pkg/controller/common/types.go +++ b/admiral/pkg/controller/common/types.go @@ -37,15 +37,18 @@ type AdmiralParams struct { SyncNamespace string EnableSAN bool SANPrefix string - SecretResolver string - SecretResolverConfigPath string - LabelSet *LabelSet - LogLevel int - HostnameSuffix string - PreviewHostnamePrefix string - MetricsEnabled bool - WorkloadSidecarUpdate string - WorkloadSidecarName string + SecretResolver string + SecretResolverConfigPath string + LabelSet 
*LabelSet + LogLevel int + HostnameSuffix string + PreviewHostnamePrefix string + MetricsEnabled bool + WorkloadSidecarUpdate string + WorkloadSidecarName string + AdmiralStateCheckerName string + DRStateStoreConfigPath string + ServiceEntryIPPrefix string } func (b AdmiralParams) String() string { @@ -56,7 +59,10 @@ func (b AdmiralParams) String() string { fmt.Sprintf("EnableSAN=%v ", b.EnableSAN) + fmt.Sprintf("SANPrefix=%v ", b.SANPrefix) + fmt.Sprintf("LabelSet=%v ", b.LabelSet) + - fmt.Sprintf("SecretResolver=%v ", b.SecretResolver) + fmt.Sprintf("SecretResolver=%v ", b.SecretResolver)+ + fmt.Sprintf("AdmiralStateCheckername=%v ", b.AdmiralStateCheckerName)+ + fmt.Sprintf("DRStateStoreConfigPath=%v ", b.DRStateStoreConfigPath)+ + fmt.Sprintf("ServiceEntryIPPrefix=%v ", b.ServiceEntryIPPrefix) } type LabelSet struct { diff --git a/admiral/pkg/test/types.go b/admiral/pkg/test/types.go index e30b2b734..035a64d67 100644 --- a/admiral/pkg/test/types.go +++ b/admiral/pkg/test/types.go @@ -17,3 +17,6 @@ func (c *FakeConfigMapController) GetConfigMap() (*k8sCoreV1.ConfigMap, error) { func (c *FakeConfigMapController) PutConfigMap(newMap *k8sCoreV1.ConfigMap) error { return c.PutError } +func (c *FakeConfigMapController)GetIPPrefixForServiceEntries() (seIpPrefix string) { + return "240.0" +} diff --git a/docs/Admiral-DynamoDB-view.png b/docs/Admiral-DynamoDB-view.png new file mode 100644 index 000000000..fe0c59695 Binary files /dev/null and b/docs/Admiral-DynamoDB-view.png differ diff --git a/docs/Admiral-dynamoDB-DR.png b/docs/Admiral-dynamoDB-DR.png new file mode 100644 index 000000000..91710a0d5 Binary files /dev/null and b/docs/Admiral-dynamoDB-DR.png differ diff --git a/docs/Admiral-health-check.png b/docs/Admiral-health-check.png new file mode 100644 index 000000000..eb07eb4c7 Binary files /dev/null and b/docs/Admiral-health-check.png differ diff --git a/docs/Admiral-state-changes.png b/docs/Admiral-state-changes.png new file mode 100644 index 000000000..2893ef842 Binary files /dev/null and b/docs/Admiral-state-changes.png differ diff --git a/docs/DisasterRecovery.md b/docs/DisasterRecovery.md new file mode 100644 index 000000000..dc85d4014 --- /dev/null +++ b/docs/DisasterRecovery.md @@ -0,0 +1,101 @@
+# Disaster recovery
+
+## Key terms
+
+### Active/Read-Write Admiral
+The Admiral instance that constantly monitors all the clusters for changes and is responsible for creating, updating and deleting Istio custom objects
+such as Service Entry, Destination Rule, Virtual Service, Sidecar and others.
+
+### Passive/Read-Only Admiral
+The Admiral instance that constantly monitors all the clusters for changes but does not manipulate any Istio objects. It does not create, update or delete Istio custom objects.
+
+### Lock object
+A reference object used to determine whether an Admiral instance is active or passive.
+
+### Lease
+The duration for which usage rights are held on the lock object.
+
+## What is the need for this?
+
+As of this writing, we can only run one instance of Admiral in a Service mesh environment. We risk running into data inconsistency issues if Admiral is scaled out.
+Multiple Admiral instances can overwrite Service Entry and other Istio custom objects and cause non-deterministic behaviour.
+Running only one instance of Admiral exposes us to the risk of not being able to modify Istio custom objects when that instance is unresponsive or has network issues due to infrastructure failures.
+
+In a Service mesh spanning multiple availability zones/DR regions, we need the ability to run a passive Admiral instance that can take over the responsibility of
+updating and creating Istio custom objects whenever the active instance encounters any issues.
+
+## Admiral state changes
+* An Admiral instance gets promoted to the active state when it obtains a lease on the lock object.
+* Once the lease is obtained, the active Admiral keeps renewing the lease every x seconds.
+* An Admiral instance in the passive state checks every x seconds whether the lease has been updated. If the lease is not updated for y*x seconds, where y is the failure threshold, the passive Admiral takes over as the active Admiral.
+
+![](Admiral-state-changes.png)
+
+
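To make the bullet points above concrete, here is a minimal sketch of such a lease loop. It is illustrative only and is not the implementation shipped in this change: `Lease`, `LeaseStore`, `ClaimLease` and `runLeaseLoop` are hypothetical names, while `AdmiralState` and the `ReadOnlyEnabled`/`ReadWriteEnabled` constants are the ones added in `DRUtil.go`.

```
package clusters

import (
	"context"
	"time"

	log "github.com/sirupsen/logrus"
)

// Lease is a hypothetical snapshot of the lock object: who owns it and when it was last refreshed.
type Lease struct {
	Owner       string
	UpdatedTime time.Time
}

// LeaseStore is a hypothetical abstraction over the backing store (for example a DynamoDB table).
type LeaseStore interface {
	// GetLease returns the current owner of the lock object and when it was last refreshed.
	GetLease(ctx context.Context) (Lease, error)
	// ClaimLease records this pod as the owner of the lock object and refreshes the timestamp.
	ClaimLease(ctx context.Context, owner string) error
}

// runLeaseLoop sketches the state transitions described above: the instance holding the lease
// stays read-write and keeps renewing it; every other instance stays read-only and takes over
// only after the lease has not been refreshed for failureThreshold consecutive check intervals.
func runLeaseLoop(ctx context.Context, store LeaseStore, podName string, wait time.Duration, failureThreshold int, as *AdmiralState) {
	for {
		select {
		case <-ctx.Done():
			return
		case <-time.After(wait):
		}

		lease, err := store.GetLease(ctx)
		if err != nil {
			// Fail safe: if the lock object cannot be read, stop writing so that two
			// instances never manipulate Istio objects at the same time.
			log.Warnf("lease check failed, staying read-only: %v", err)
			as.ReadOnly = ReadOnlyEnabled
			continue
		}

		switch {
		case lease.Owner == podName:
			// We hold the lease: stay read-write and refresh it.
			as.ReadOnly = ReadWriteEnabled
			if err := store.ClaimLease(ctx, podName); err != nil {
				log.Warnf("lease renewal failed: %v", err)
			}
		case time.Since(lease.UpdatedTime) > time.Duration(failureThreshold)*wait:
			// The owner has missed failureThreshold renewals: try to take over.
			if err := store.ClaimLease(ctx, podName); err == nil {
				as.ReadOnly = ReadWriteEnabled
			}
		default:
			// Someone else holds a live lease: remain read-only.
			as.ReadOnly = ReadOnlyEnabled
		}
	}
}
```

Note that a real store would have to make the take-over write conditional (a compare-and-swap on the lock object); otherwise two passive instances could promote themselves during the same check interval.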
+## Changes to the health check API
+As of this writing, Admiral exposes REST APIs that can be used to check the clusters currently being monitored and also data about various Istio custom objects.
+Since data from the active Admiral is considered the source of truth, we need a way to direct all REST API calls to the active instance. To support this, the health check endpoint has been enhanced to
+accept an optional query param ```checkifreadonly```. If this query param is set to true, a passive Admiral returns a 503 whereas an active Admiral
+returns a successful response. This query param can be used in a health check/diagnostic URL to pick the active instance out of a pool of Admiral instances.
+Do not use this query param on [readiness or liveness check probes](https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/).
+
+![](Admiral-health-check.png)
+
+## Creating custom DR solutions
+
+To create your own DR implementation, create a struct that implements the interface methods below.
+
+* ```runStateCheck``` should contain the DR logic. It takes an AdmiralState object as a parameter and modifies its ReadOnly flag to transition Admiral between the active and passive modes.
+* ```shouldRunOnIndependentGoRoutine``` should return true if you want the DR logic in the ```runStateCheck``` method to run on a separate goroutine.
+
+```
+type AdmiralStateChecker interface {
+	runStateCheck(ctx context.Context, as *AdmiralState)
+	shouldRunOnIndependentGoRoutine() bool
+}
+```
+* Once you have the struct that implements the interface, update the ```startAdmiralStateChecker``` function to invoke your implementation.
+* Select your custom implementation by setting the program argument ```--admiral_state_checker_name``` to the name used in the ```startAdmiralStateChecker``` function.
+* Please contribute your implementation to this project.
+
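As an illustration only (not part of this change), a minimal custom checker might look like the sketch below. `FileBasedStateChecker` and its marker-file logic are hypothetical; the `AdmiralStateChecker` interface, `AdmiralState`, `ReadOnlyEnabled` and `ReadWriteEnabled` are the ones introduced in this change. Because the interface methods are unexported, a custom checker has to live in the `clusters` package.

```
package clusters

import (
	"context"
	"os"
	"time"

	log "github.com/sirupsen/logrus"
)

// FileBasedStateChecker is a hypothetical example: the pod is treated as active (read-write)
// only while a marker file exists on disk. Any other signal (an API call, a database row,
// a cloud lock service) could drive the same transition.
type FileBasedStateChecker struct {
	MarkerFilePath string
}

// runStateCheck polls the marker file and flips the ReadOnly flag when the state changes.
func (c FileBasedStateChecker) runStateCheck(ctx context.Context, as *AdmiralState) {
	ticker := time.NewTicker(15 * time.Second)
	defer ticker.Stop()
	for {
		newState := ReadOnlyEnabled
		if _, err := os.Stat(c.MarkerFilePath); err == nil {
			newState = ReadWriteEnabled
		}
		if newState != as.ReadOnly {
			log.Infof("Admiral read-only state changing from %v to %v", as.ReadOnly, newState)
			as.ReadOnly = newState
		}
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
		}
	}
}

// shouldRunOnIndependentGoRoutine returns true because the loop above never exits on its own
// and must not block Admiral's main go routine.
func (c FileBasedStateChecker) shouldRunOnIndependentGoRoutine() bool {
	return true
}
```

Wiring this in would mean adding a matching case (for example ```case "filebasedstatechecker":```) to the switch in ```startAdmiralStateChecker``` and starting Admiral with ```--admiral_state_checker_name filebasedstatechecker```.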
+## What is supported today?
+We have introduced a DynamoDB based DR solution for Admiral. The active and passive Admiral instances try to obtain a lease on a lock object
+maintained in a DynamoDB table. DynamoDB supports [Global tables](https://aws.amazon.com/dynamodb/global-tables/) which can replicate the lock object information between the regions where Admiral is hosted.
+Writes are supported in multiple regions.
+
+![](Admiral-dynamoDB-DR.png)
+
+## Configuration changes to use this
+Create a DynamoDB table with the specifications below. You can follow the instructions provided [here](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/V2globaltables.tutorial.html#V2creategt_console) to create Global tables spanning your DR regions.
+```
+Table name: admiral-lease
+Attributes:
+leaseName    type: string (partition key)
+leaseOwner   type: string
+notes        type: string
+updatedTime  type: Number
+```
+![](Admiral-DynamoDB-view.png)
+
+Below is a sample config file that can be used to configure the DynamoDB connection settings. It can be added as a configmap and made available to the Admiral pod using [Volume mounts](https://kubernetes.io/docs/tasks/configure-pod-container/configure-volume-storage/).
+```
+dynamoDB:
+  leaseName: mylease
+  podIdentifier: myname
+  waitTimeInSeconds: 15
+  failureThreshold: 3
+  tableName: admiral-lease
+  role: arn:aws:iam:::role/
+  region: us-west-2
+
+```
+
+Include the program arguments below
+```
+--admiral_state_checker_name dynamodbbasedstatechecker
+--dr_state_store_config_path <path to the DynamoDB config file>
+--se_ip_prefix 242.0
+```
+Please use a different value for ```se_ip_prefix``` for each Admiral deployment. This is needed to ensure that the same IP address is not used in different service entries, which causes 404 issues with Istio.
+
+
diff --git a/docs/Index.md b/docs/Index.md index a62bebc15..8dd47b56e 100644 --- a/docs/Index.md +++ b/docs/Index.md @@ -5,3 +5,5 @@ ### [Examples](Examples.md) ### [Compatibility](Compatibility.md) + +### [Disaster recovery support](DisasterRecovery.md) diff --git a/go.mod b/go.mod index c6e95203e..d5318ef38 100644 --- a/go.mod +++ b/go.mod @@ -10,6 +10,7 @@ require ( git.apache.org/thrift.git v0.0.0-20180902110319-2566ecd5d999 // indirect github.com/alecthomas/units v0.0.0-20190924025748-f65c72e2690d // indirect github.com/argoproj/argo-rollouts v0.9.3 + github.com/aws/aws-sdk-go v1.24.0 // indirect github.com/cenkalti/backoff v2.2.1+incompatible github.com/emicklei/go-restful v2.11.2+incompatible // indirect github.com/go-openapi/spec v0.19.6 // indirect diff --git a/go.sum b/go.sum index 10d4bf5ff..4bd1e5015 100644 --- a/go.sum +++ b/go.sum @@ -49,6 +49,7 @@ github.com/alecthomas/units v0.0.0-20190924025748-f65c72e2690d/go.mod h1:rBZYJk5 github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883/go.mod h1:rCTlJbsFo29Kk6CurOXKm700vrz8f0KW0JNfpkRJY/8= github.com/anmitsu/go-shlex v0.0.0-20161002113705-648efa622239/go.mod h1:2FmKhYUyUczH0OGQWaF5ceTx0UBShxjsH6f8oGKYe2c= github.com/antlr/antlr4 v0.0.0-20191011202612-ad2bd05285ca/go.mod h1:T7PbCXFs94rrTttyxjbyT5+/1V8T2TYDejxUfHJjw1Y= +github.com/aws/aws-sdk-go v1.16.26 h1:GWkl3rkRO/JGRTWoLLIqwf7AWC4/W/1hMOUZqmX0js4= github.com/antonmedv/expr v1.4.2/go.mod h1:xesgliOuukGf21740qhh8PvFdN66yZ9lJJ/PzSFAmzI= github.com/argoproj/argo-rollouts v0.7.2 h1:eUtsstL3DWNv+SxjnwJEBOHH2KOF5lu5G/zF5yKgC3A= github.com/argoproj/argo-rollouts v0.7.2/go.mod h1:zjMEXhycwvFGimOzpeiSmt/Cv58I63nGgVuROKFIfB8=