From 3cfbb33da94a8a5b56c535e33682e952456aa60d Mon Sep 17 00:00:00 2001 From: Emika Hammond Date: Fri, 30 Aug 2024 15:37:32 -0400 Subject: [PATCH] Update GCP Verifier Documentation (#266) * add Terraform script to make GCP VPC with firewall * make minor changes to README * add minor change to README * Update README * Update README * Update README * remove optional Terraform options * changed cloud NAT use to private subnet only * increase nat ports for more reliable connection * update GCP verifier docs * update gcp entry_point.go comments * update dummy probe comments * update gcp.go comments * update comments for gcp_verifier.go --- docs/gcp/gcp.md | 18 ++++----- pkg/clients/gcp/gcp.go | 15 ++++---- pkg/probes/dummy/dummy.go | 7 +++- pkg/verifier/gcp/entry_point.go | 15 ++++++-- pkg/verifier/gcp/gcp_verifier.go | 63 ++++++++++++++++---------------- 5 files changed, 63 insertions(+), 55 deletions(-) diff --git a/docs/gcp/gcp.md b/docs/gcp/gcp.md index a8f53814..2100f582 100644 --- a/docs/gcp/gcp.md +++ b/docs/gcp/gcp.md @@ -14,16 +14,14 @@ ### GCP Environment ### Set up your environment to use the correct VPC name, project ID, credentials of the GCP account for the target cluster. - Make sure to have a Service Account with the permissions required within your GCP account (in that project). 
This can be done in the following ways: - - Follow the steps to use a script as prescibed in [this document.](https://github.com/openshift/ops-sop/blob/master/gcp/create-ccs-project.md) or create a service account manually [this] (https://cloud.google.com/iam/docs/creating-managing-service-accounts#iam-service-accounts-create-gcloud) - - Export these GCP environment variables: + - Follow the steps to use a script as prescribed in [this document](https://github.com/openshift/ops-sop/blob/master/v4/howto/gcp/create-ccs-project.md), or create a service account manually by following [this guide](https://cloud.google.com/iam/docs/creating-managing-service-accounts#iam-service-accounts-create-gcloud) + - Export the GCP project ID environment variable: ```shell - export GCP_VPC_NAME= export GCP_PROJECT_ID= ``` - Export any other GCP environment vars: + Export the GCP region environment variable or use the `--region` flag: ```shell export GCP_REGION= - export GOOGLE_APPLICATION_CREDENTIALS= ```` ### IAM permissions ### @@ -52,25 +50,25 @@ repeat the verification process for each subnet ID. This generates `osd-network-verifier` executable in project root directory. 4. Obtain params: - 1. subnet_id: Obtain the subnet id to be verified. + 1. subnet-id: Obtain the subnet id to be verified. + 2. vpc-name: Obtain the name of the VPC that the subnet belongs to. 5. Execute: ```shell # GCP - ./osd-network-verifier egress --platform gcp-classic --subnet-id $SUBNET_ID + ./osd-network-verifier egress --platform gcp-classic --subnet-id $SUBNET_ID --vpc-name $VPC_NAME Additional optional flags for overriding defaults (image-id, kms-key will be added in the future): ```shell --cloud-tags stringToString (optional) comma-seperated list of tags to assign to cloud resources e.g. 
--cloud-tags key1=value1,key2=value2 (default [osd-network-verifier=owned,red-hat-managed=true,Name=osd-network-verifier]) --debug (optional) if true, enable additional debug-level logging -- TODO image-id string (optional) cloud image for the compute instance - --instance-type string (optional) compute instance type (default "e2-standard-2") + --instance-type string (optional) compute instance type (default "e2-micro") -- TODO kms-key-id string (optional) ID of KMS key used to encrypt root volumes of compute instances. Defaults to cloud account default key --region string (optional) compute instance region. If absent, environment var GCP_REGION will be used, if set (default "us-east1") - --subnet-id string source subnet ID - --timeout duration (optional) timeout for individual egress verification requests (default 2s). If timeout is less than 2s, it would likely cause false negatives test results. + --timeout duration (optional) timeout for individual egress verification requests (default 5s). 
``` Get cli help: diff --git a/pkg/clients/gcp/gcp.go b/pkg/clients/gcp/gcp.go index 57221f39..fc4e944e 100644 --- a/pkg/clients/gcp/gcp.go +++ b/pkg/clients/gcp/gcp.go @@ -13,11 +13,11 @@ type Client struct { } func NewClient(credentials *google.Credentials) (*Client, error) { - //use oauth2 token in credentials struct to create a client, + // Use oauth2 token in credentials struct to create a client, // https://pkg.go.dev/golang.org/x/oauth2/google#Credentials // https://cloud.google.com/docs/authentication/production - //service account credentials order/priority - env variable, service account attached to resource, error + // Service account credentials order/priority - env variable, service account attached to resource, error computeService, err := computev1.NewService(context.TODO()) if err != nil { @@ -27,14 +27,14 @@ func NewClient(credentials *google.Credentials) (*Client, error) { return &Client{computeService: computeService}, nil } -// terminateComputeServiceInstance terminates target ComputeService instance -// uses c.output to store result of the execution +// Terminates target ComputeService instance +// Uses c.output to store result of the execution func (c *Client) TerminateComputeServiceInstance(projectID, zone, instanceName string) error { _, err := c.computeService.Instances.Delete(projectID, zone, instanceName).Context(context.TODO()).Do() return err } -// returns a map of all machineTypes with the machinetype string as the key and bool true if found +// Returns a map of all machineTypes with the machinetype string as the key and bool true if found func (c *Client) ListMachineTypes(projectID, zone string) (map[string]bool, error) { machineTypesMap := map[string]bool{} req := c.computeService.MachineTypes.List(projectID, zone) @@ -59,7 +59,7 @@ func (c *Client) CreateInstance(projectID, zone string, instance *computev1.Inst return nil } -// gets instance given an ID , zone , and instance name +// Gets instance given an ID , zone , and instance 
name func (c *Client) GetInstance(projectID, zone, instanceName string) (computev1.Instance, error) { instance, err := c.computeService.Instances.Get(projectID, zone, instanceName).Do() if err != nil { @@ -68,7 +68,7 @@ func (c *Client) GetInstance(projectID, zone, instanceName string) (computev1.In return *instance, nil } -// send request to apply tags, return error if tags are invalid +// Send request to apply tags, return error if tags are invalid func (c *Client) SetInstanceLabels(projectID, zone, instanceName string, labelReq *computev1.InstancesSetLabelsRequest) error { _, err := c.computeService.Instances.SetLabels(projectID, zone, instanceName, labelReq).Do() if err != nil { @@ -77,6 +77,7 @@ func (c *Client) SetInstanceLabels(projectID, zone, instanceName string, labelRe return nil } +// Gets serial port output for the specified instance func (c *Client) GetInstancePorts(projectID, zone, instanceName string) (*computev1.SerialPortOutput, error) { resp, err := c.computeService.Instances.GetSerialPortOutput(projectID, zone, instanceName).Do() if err != nil { diff --git a/pkg/probes/dummy/dummy.go b/pkg/probes/dummy/dummy.go index c715d052..b32eb8f0 100644 --- a/pkg/probes/dummy/dummy.go +++ b/pkg/probes/dummy/dummy.go @@ -5,6 +5,9 @@ import ( "github.com/openshift/osd-network-verifier/pkg/output" ) +// dummy.Probe is an implementation of the probes.Probe interface for testing / building the verifier client. +// MachineID and UserData are hardcoded for testing purposes. 
+ type Probe struct{} const ( @@ -20,7 +23,7 @@ func (prb Probe) GetEndingToken() string { return endingToken } // GetMachineImageID returns the string ID of the VM image to be used for the probe instance func (prb Probe) GetMachineImageID(string, cpu.Architecture, string) (string, error) { - return "rhel-9", nil + return "rhel-9-v20240709", nil } // GetExpandedUserData returns a bash-formatted userdata string @@ -28,7 +31,7 @@ func (prb Probe) GetExpandedUserData(map[string]string) (string, error) { return `#!/bin/sh systemctl mask --now serial-getty@ttyS0.service systemctl disable --now syslog.socket rsyslog.service - sysctl -w kernel.printk="0 4 0 7"ss + sysctl -w kernel.printk="0 4 0 7" echo DUMMY_START > /dev/ttyS0 echo "hello world" > /dev/ttyS0 echo DUMMY_END > /dev/ttyS0`, nil diff --git a/pkg/verifier/gcp/entry_point.go b/pkg/verifier/gcp/entry_point.go index f91dc3d5..5f224d5a 100644 --- a/pkg/verifier/gcp/entry_point.go +++ b/pkg/verifier/gcp/entry_point.go @@ -22,7 +22,7 @@ const ( // validateEgress performs validation process for egress // Basic workflow is: // - prepare for ComputeService instance creation -// - create instance and wait till it gets ready, wait for gcpUserData script execution +// - create instance and wait till it gets ready, wait for startup script execution // - find unreachable endpoints & parse output, then terminate instance // - return `g.output` which stores the execution results func (g *GcpVerifier) ValidateEgress(vei verifier.ValidateEgressInput) *output.Output { @@ -30,9 +30,11 @@ func (g *GcpVerifier) ValidateEgress(vei verifier.ValidateEgressInput) *output.O if vei.PlatformType == "" { vei.PlatformType = helpers.PlatformGCP } + // Validate CPUArchitecture and default to ArchX86 if not specified if !vei.CPUArchitecture.IsValid() { vei.CPUArchitecture = cpu.ArchX86 } + // Default to curl.Probe if no Probe specified if vei.Probe == nil { vei.Probe = curl.Probe{} @@ -55,6 +57,7 @@ func (g *GcpVerifier) ValidateEgress(vei 
verifier.ValidateEgressInput) *output.O g.Logger.Debug(vei.Ctx, fmt.Sprintf("defaulted to instance type %s", vei.InstanceType)) } + // Validate machine type if err := g.validateMachineType(vei.GCP.ProjectID, vei.GCP.Zone, vei.InstanceType); err != nil { return g.Output.AddError(fmt.Errorf("instance type %s is invalid: %s", vei.InstanceType, err)) } @@ -103,13 +106,14 @@ func (g *GcpVerifier) ValidateEgress(vei verifier.ValidateEgressInput) *output.O "value": "$value", "USE_SYSTEMD": "true", } - userData, err := vei.Probe.GetExpandedUserData(userDataVariables) if err != nil { return g.Output.AddError(err) } g.Logger.Debug(vei.Ctx, "Generated userdata script:\n---\n%s\n---", userData) + // if no cloudImageID specified, get string ID of the VM image to be used for the probe instance + // image list https://cloud.google.com/compute/docs/images/os-details#red_hat_enterprise_linux_rhel if vei.CloudImageID == "" { vei.CloudImageID, err = vei.Probe.GetMachineImageID(vei.PlatformType, vei.CPUArchitecture, vei.GCP.Region) if err != nil { @@ -117,7 +121,7 @@ func (g *GcpVerifier) ValidateEgress(vei verifier.ValidateEgressInput) *output.O } } - //image list https://cloud.google.com/compute/docs/images/os-details#red_hat_enterprise_linux_rhel + // Create the ComputeService instance instance, err := g.createComputeServiceInstance(createComputeServiceInstanceInput{ projectID: vei.GCP.ProjectID, zone: vei.GCP.Zone, @@ -130,12 +134,14 @@ func (g *GcpVerifier) ValidateEgress(vei verifier.ValidateEgressInput) *output.O tags: vei.Tags, serialportenable: "true", }) + // Try to terminate instance if instance creation fails if err != nil { g.Output.AddError(err) err = g.GcpClient.TerminateComputeServiceInstance(vei.GCP.ProjectID, vei.GCP.Zone, instance.Name) return g.Output.AddError(err) // fatal } + // Wait for the ComputeService instance to be running g.Logger.Debug(vei.Ctx, "Waiting for ComputeService instance %s to be running", instance.Name) if instanceReadyErr := 
g.waitForComputeServiceInstanceCompletion(vei.GCP.ProjectID, vei.GCP.Zone, instance.Name); instanceReadyErr != nil { // try to terminate instance if instance is not running @@ -146,13 +152,14 @@ func (g *GcpVerifier) ValidateEgress(vei verifier.ValidateEgressInput) *output.O return g.Output.AddError(instanceReadyErr) // fatal } + // Wait for console output and parse g.Logger.Info(vei.Ctx, "Gathering and parsing console log output...") - err = g.findUnreachableEndpoints(vei.GCP.ProjectID, vei.GCP.Zone, instance.Name, vei.Probe) if err != nil { g.Output.AddError(err) } + // Terminate the ComputeService instance after probe output is parsed and stored err = g.GcpClient.TerminateComputeServiceInstance(vei.GCP.ProjectID, vei.GCP.Zone, instance.Name) if err != nil { g.Output.AddError(err) diff --git a/pkg/verifier/gcp/gcp_verifier.go b/pkg/verifier/gcp/gcp_verifier.go index 654d2772..162ddd46 100644 --- a/pkg/verifier/gcp/gcp_verifier.go +++ b/pkg/verifier/gcp/gcp_verifier.go @@ -22,6 +22,20 @@ type GcpVerifier struct { Output output.Output } +type createComputeServiceInstanceInput struct { + projectID string + zone string + vpcSubnetID string + userdata string + machineType string + instanceName string + sourceImage string + networkName string + tags map[string]string + serialportenable string +} + +// Creates new GCP verifier with ocm logger func NewGcpVerifier(creds *google.Credentials, debug bool) (*GcpVerifier, error) { // Create logger builder := ocmlog.NewStdLoggerBuilder() @@ -39,6 +53,7 @@ func NewGcpVerifier(creds *google.Credentials, debug bool) (*GcpVerifier, error) return &GcpVerifier{*gcpClient, logger, output.Output{}}, nil } +// Check that instance type is supported in zone func (g *GcpVerifier) validateMachineType(projectID, zone, instanceType string) error { g.Logger.Debug(context.TODO(), "Gathering description of instance type %s from ComputeService API in zone %s", instanceType, zone) @@ -56,20 +71,7 @@ func (g *GcpVerifier) 
validateMachineType(projectID, zone, instanceType string) return nil } -type createComputeServiceInstanceInput struct { - projectID string - zone string - vpcSubnetID string - userdata string - machineType string - instanceName string - sourceImage string - networkName string - tags map[string]string - serialportenable string -} - -// this fuciton is a logic function that lieves some where else +// Creates the ComputeService instance described by input, then applies input.tags to it as GCP labels func (g *GcpVerifier) createComputeServiceInstance(input createComputeServiceInstanceInput) (computev1.Instance, error) { req := &computev1.Instance{ @@ -93,7 +95,7 @@ func (g *GcpVerifier) createComputeServiceInstance(input createComputeServiceIns Name: input.networkName, Subnetwork: input.vpcSubnetID, // Only one accessConfigs exist which is ONE_TO_ONE_NAT - // needed for external internet access including egress + // Needed for external internet access including egress AccessConfigs: []*computev1.AccessConfig{ { Name: "External NAT", @@ -123,41 +125,38 @@ func (g *GcpVerifier) createComputeServiceInstance(input createComputeServiceIns }, } - //send request to computeService - + // Send request to create instance err := g.GcpClient.CreateInstance(input.projectID, input.zone, req) if err != nil { return computev1.Instance{}, fmt.Errorf("unable to create instance: %v", err) } - g.Logger.Info(context.TODO(), "Created instance with ID: %s", input.instanceName) - //get fingerprint from instance + // Get fingerprint from instance inst, err := g.GcpClient.GetInstance(input.projectID, input.zone, input.instanceName) if err != nil { g.Logger.Debug(context.TODO(), "Failed to get fingerprint to apply tags to instance %v", err) } - //Add tags - known as labels in gcp + // Add tags - known as labels in gcp g.Logger.Info(context.TODO(), "Applying labels") - labelReq := &computev1.InstancesSetLabelsRequest{ LabelFingerprint: inst.LabelFingerprint, Labels: input.tags, } - //send request to apply tags, return error if tags are 
invalid + // Send request to apply tags, return error if tags are invalid err = g.GcpClient.SetInstanceLabels(input.projectID, input.zone, input.instanceName, labelReq) if err != nil { return computev1.Instance{}, fmt.Errorf("unable to create labels: %v", err) } - g.Logger.Info(context.TODO(), "Successfully applied labels ") return inst, nil } +// Gets the console output from the ComputeService instance, extracts the probe's output from it, and parses it func (g *GcpVerifier) findUnreachableEndpoints(projectID, zone, instanceName string, probe probes.Probe) error { var consoleOutput string g.Logger.Debug(context.TODO(), "Scraping console output and waiting for user data script to complete...") @@ -170,10 +169,12 @@ func (g *GcpVerifier) findUnreachableEndpoints(projectID, zone, instanceName str return false, err } + // Return and resume waiting if console output is still nil if output == nil { return false, nil } + // In the early stages, a ComputeService instance may be running but the console is not populated with any data if len(output.Contents) == 0 { g.Logger.Debug(context.TODO(), "ComputeService console output not yet populated with data, continuing to wait...") return false, nil @@ -196,7 +197,6 @@ func (g *GcpVerifier) findUnreachableEndpoints(projectID, zone, instanceName str g.Logger.Debug(context.TODO(), "consoleOutput contains data, but probe has not yet printed endingToken, continuing to wait...") return false, nil } - // If we make it this far, we know that both startingTokenSeen and endingTokenSeen are true // Separate the probe's output from the rest of the console output (using startingToken and endingToken) @@ -206,6 +206,7 @@ func (g *GcpVerifier) findUnreachableEndpoints(projectID, zone, instanceName str g.Output.AddException(handledErrors.NewGenericError(fmt.Errorf("probe output corrupted: no data between startingToken and endingToken"))) return false, nil } + // Send probe's output off to the Probe interface for parsing 
g.Logger.Debug(context.TODO(), "probe output:\n---\n%s\n---", rawProbeOutput) probe.ParseProbeOutput(rawProbeOutput, &g.Output) @@ -216,13 +217,11 @@ func (g *GcpVerifier) findUnreachableEndpoints(projectID, zone, instanceName str return err } +// Describes the instance status +// States: PROVISIONING, STAGING, RUNNING, STOPPING, STOPPED, TERMINATED, SUSPENDED +// https://cloud.google.com/compute/docs/instances/instance-life-cycle +// Error Codes: https://cloud.google.com/apis/design/errors func (c *GcpVerifier) describeComputeServiceInstances(projectID, zone, instanceName string) (string, error) { - // States - //PROVISIONING, STAGING, RUNNING, STOPPING, STOPPED, TERMINATED, SUSPENDED - // https://cloud.google.com/compute/docs/instances/instance-life-cycle - - //Error Codes https://cloud.google.com/apis/design/errors - resp, err := c.GcpClient.GetInstance(projectID, zone, instanceName) if err != nil { c.Logger.Error(context.TODO(), "Errors while describing the instance status: %v", err.Error()) @@ -243,13 +242,13 @@ func (c *GcpVerifier) describeComputeServiceInstances(projectID, zone, instanceN return resp.Status, nil } +// Waits for the ComputeService instance to be in a RUNNING state func (c *GcpVerifier) waitForComputeServiceInstanceCompletion(projectID, zone, instanceName string) error { - //wait for the instance to run err := helpers.PollImmediate(5*time.Second, 2*time.Minute, func() (bool, error) { code, descError := c.describeComputeServiceInstances(projectID, zone, instanceName) switch code { case "RUNNING": - //instance is running, break + // Instance is running, break c.Logger.Info(context.TODO(), "ComputeService Instance: %s %s", instanceName, code) return true, nil