github-aws-runners · GuptaNavdeep1983 · Nov 8, 2023 · Oct 20, 2023 · Oct 20, 2023 · Oct 20, 2023
@@ -32,6 +32,7 @@ This [Terraform](https://www.terraform.io/) module creates the required infrastr
 - [Examples](#examples)
 - [Sub modules](#sub-modules)
 - [Logging](#logging)
+- [Tracing](#tracing)
 - [Debugging](#debugging)
 - [Security Considerations](#security-considerations)
 - [Requirements](#requirements)
@@ -427,6 +428,16 @@ An example log message of the scale-up function:
     }
 }
 ```
+## Tracing
+Because of the distributed architecture of this application, it can be difficult to troubleshoot.
+We support the option to enable tracing for all the Lambda functions created by this application. To enable tracing, simply provide the `tracing_config` option in the root module or the inner modules.
+
+This tracing config generates timelines for the following events:
+- Basic lifecycle of the Lambda function
+- Traces for GitHub API calls (can be configured with `capture_http_requests`)
+- Traces for all AWS SDK calls
+
+
 
 ## Debugging
 
@@ -543,6 +554,7 @@ We welcome any improvement to the standard module to make the default as secure
 | <a name="input_lambda_s3_bucket"></a> [lambda\_s3\_bucket](#input\_lambda\_s3\_bucket) | S3 bucket from which to specify lambda functions. This is an alternative to providing local files directly. | `string` | `null` | no |
 | <a name="input_lambda_security_group_ids"></a> [lambda\_security\_group\_ids](#input\_lambda\_security\_group\_ids) | List of security group IDs associated with the Lambda function. | `list(string)` | `[]` | no |
 | <a name="input_lambda_subnet_ids"></a> [lambda\_subnet\_ids](#input\_lambda\_subnet\_ids) | List of subnets in which the action runners will be launched, the subnets needs to be subnets in the `vpc_id`. | `list(string)` | `[]` | no |
+| <a name="input_lambda_tracing_mode"></a> [lambda\_tracing\_mode](#input\_lambda\_tracing\_mode) | DEPRECATED: Replaced by `tracing_config`. | `string` | `null` | no |
 | <a name="input_log_level"></a> [log\_level](#input\_log\_level) | Logging level for lambda logging. Valid values are  'silly', 'trace', 'debug', 'info', 'warn', 'error', 'fatal'. | `string` | `"info"` | no |
 | <a name="input_logging_kms_key_id"></a> [logging\_kms\_key\_id](#input\_logging\_kms\_key\_id) | Specifies the kms key id to encrypt the logs with. | `string` | `null` | no |
 | <a name="input_logging_retention_in_days"></a> [logging\_retention\_in\_days](#input\_logging\_retention\_in\_days) | Specifies the number of days you want to retain log events for the lambda log group. Possible values are: 0, 1, 3, 5, 7, 14, 30, 60, 90, 120, 150, 180, 365, 400, 545, 731, 1827, and 3653. | `number` | `180` | no |

@@ -39,5 +39,5 @@ export interface RunnerInputParameters {
   };
   numberOfRunners?: number;
   amiIdSsmParameterName?: string;
-  runnerTracingEnabled?: boolean;
+  tracingEnabled?: boolean;
 }
@@ -240,10 +240,10 @@
   it('calls create fleet of 1 instance with runner tracing enabled', async () => {
     tracer.getRootXrayTraceId = jest.fn().mockReturnValue('123');
 
-    await createRunner(createRunnerConfig({ ...defaultRunnerConfig, runnerTracingEnabled: true }));
+    await createRunner(createRunnerConfig({ ...defaultRunnerConfig, tracingEnabled: true }));
 
     expect(mockEC2Client).toHaveReceivedCommandWith(CreateFleetCommand, {
-      ...expectedCreateFleetRequest({ ...defaultExpectedFleetRequestValues, runnerTracingEnabled: true }),
+      ...expectedCreateFleetRequest({ ...defaultExpectedFleetRequestValues, tracingEnabled: true }),
     });
   });
 });
@@ -360,7 +360,7 @@
   allocationStrategy: SpotAllocationStrategy;
   maxSpotPrice?: string;
   amiIdSsmParameterName?: string;
-  runnerTracingEnabled?: boolean;
+  tracingEnabled?: boolean;
 }
 
 function createRunnerConfig(runnerConfig: RunnerConfig): RunnerInputParameters {
@@ -377,7 +377,7 @@
     },
     subnets: ['subnet-123', 'subnet-456'],
     amiIdSsmParameterName: runnerConfig.amiIdSsmParameterName,
-    runnerTracingEnabled: runnerConfig.runnerTracingEnabled,
+    tracingEnabled: runnerConfig.tracingEnabled,
   };
 }
 
@@ -388,7 +388,7 @@
   maxSpotPrice?: string;
   totalTargetCapacity: number;
   imageId?: string;
-  runnerTracingEnabled?: boolean;
+  tracingEnabled?: boolean;
 }
 
 function expectedCreateFleetRequest(expectedValues: ExpectedFleetRequestValues): CreateFleetCommandInput {
@@ -398,10 +398,10 @@
     { Key: 'ghr:Type', Value: expectedValues.type },
     { Key: 'ghr:Owner', Value: REPO_NAME },
   ];
-  if (expectedValues.runnerTracingEnabled) {
+  if (expectedValues.tracingEnabled) {
     const traceId = tracer.getRootXrayTraceId();
     tags.push({ Key: 'ghr:trace_id', Value: traceId! });
   }
  const request: CreateFleetCommandInput = {
    LaunchTemplateConfigs: [
      {

@@ -1,4 +1,4 @@
 import {
  CreateFleetCommand,
  CreateFleetResult,
  DescribeInstancesCommand,
@@ -127,38 +127,38 @@
    },
  });

  const ec2Client = getTracedAWSV3Client(new EC2Client({ region: process.env.AWS_REGION }));

  let amiIdOverride = undefined;

  if (runnerParameters.amiIdSsmParameterName) {
    try {
      amiIdOverride = await getParameter(runnerParameters.amiIdSsmParameterName);
      logger.debug(`AMI override SSM parameter (${runnerParameters.amiIdSsmParameterName}) set to: ${amiIdOverride}`);
    } catch (e) {
      logger.error(
        `Failed to lookup runner AMI ID from SSM parameter: ${runnerParameters.amiIdSsmParameterName}. ` +
          'Please ensure that the given parameter exists on this region and contains a valid runner AMI ID',
        { error: e },
      );
      throw e;
    }
  }

  const numberOfRunners = runnerParameters.numberOfRunners ? runnerParameters.numberOfRunners : 1;

  const tags = [
    { Key: 'ghr:Application', Value: 'github-action-runner' },
    { Key: 'ghr:created_by', Value: numberOfRunners === 1 ? 'scale-up-lambda' : 'pool-lambda' },
    { Key: 'ghr:Type', Value: runnerParameters.runnerType },
     { Key: 'ghr:Owner', Value: runnerParameters.runnerOwner },
   ];
 
-  if (runnerParameters.runnerTracingEnabled) {
+  if (runnerParameters.tracingEnabled) {
     const traceId = tracer.getRootXrayTraceId();
     tags.push({ Key: 'ghr:trace_id', Value: traceId! });
   }

  let fleet: CreateFleetResult;
  try {
    // see for spec https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_CreateFleet.html

@@ -59,6 +59,7 @@ export const addMiddleware = () => {
   middy(scaleUpHandler).use(handler);
   middy(scaleDownHandler).use(handler);
   middy(adjustPool).use(handler);
+  middy(ssmHousekeeper).use(handler);
 };
 addMiddleware();
 

@@ -36,90 +36,90 @@
   const instanceAllocationStrategy = process.env.INSTANCE_ALLOCATION_STRATEGY || 'lowest-price'; // same as AWS default
   const runnerOwner = process.env.RUNNER_OWNER;
   const amiIdSsmParameterName = process.env.AMI_ID_SSM_PARAMETER_NAME;
-  const runnerTracingEnabled = yn(process.env.POWERTOOLS_TRACE_ENABLED, { default: false });
+  const tracingEnabled = yn(process.env.POWERTOOLS_TRACE_ENABLED, { default: false });
 
   let ghesApiUrl = '';
   if (ghesBaseUrl) {
    ghesApiUrl = `${ghesBaseUrl}/api/v3`;
  }

  const installationId = await getInstallationId(ghesApiUrl, runnerOwner);
  const ghAuth = await createGithubInstallationAuth(installationId, ghesApiUrl);
  const githubInstallationClient = await createOctoClient(ghAuth.token, ghesApiUrl);

  // Look up the runners registered in GitHub, could be also non managed by this module.
  const runners = await githubInstallationClient.paginate(
    githubInstallationClient.actions.listSelfHostedRunnersForOrg,
    {
      org: runnerOwner,
      per_page: 100,
    },
  );
  const runnerStatus = new Map<string, RunnerStatus>();
  for (const runner of runners) {
    runner.name = runnerNamePrefix ? runner.name.replace(runnerNamePrefix, '') : runner.name;
    runnerStatus.set(runner.name, { busy: runner.busy, status: runner.status });
  }

  // Look up the managed ec2 runners in AWS, but running does not mean idle
  const ec2runners = await listEC2Runners({
    environment,
    runnerOwner,
    runnerType: 'Org',
    statuses: ['running'],
  });

  // Runner should be considered idle if it is still booting, or is idle in GitHub
  let numberOfRunnersInPool = 0;
  for (const ec2Instance of ec2runners) {
    if (
      runnerStatus.get(ec2Instance.instanceId)?.busy === false &&
      runnerStatus.get(ec2Instance.instanceId)?.status === 'online'
    ) {
      numberOfRunnersInPool++;
      logger.debug(`Runner ${ec2Instance.instanceId} is idle in GitHub and counted as part of the pool`);
    } else if (runnerStatus.get(ec2Instance.instanceId) != null) {
      logger.debug(`Runner ${ec2Instance.instanceId} is not idle in GitHub and NOT counted as part of the pool`);
    } else if (!bootTimeExceeded(ec2Instance)) {
      numberOfRunnersInPool++;
      logger.info(`Runner ${ec2Instance.instanceId} is still booting and counted as part of the pool`);
    } else {
      logger.debug(
        `Runner ${ec2Instance.instanceId} is not idle in GitHub nor booting and not counted as part of the pool`,
      );
    }
  }

  const topUp = event.poolSize - numberOfRunnersInPool;
  if (topUp > 0) {
    logger.info(`The pool will be topped up with ${topUp} runners.`);
    await createRunners(
      {
        ephemeral,
        enableJitConfig,
        ghesBaseUrl,
        runnerLabels,
        runnerGroup,
        runnerOwner,
        runnerNamePrefix,
        runnerType: 'Org',
        disableAutoUpdate: disableAutoUpdate,
        ssmTokenPath,
        ssmConfigPath,
      },
      {
        ec2instanceCriteria: {
          instanceTypes,
          targetCapacityType: instanceTargetTargetCapacityType,
          maxSpotPrice: instanceMaxSpotPrice,
          instanceAllocationStrategy: instanceAllocationStrategy,
        },
        environment,
        launchTemplateName,
         subnets,
         numberOfRunners: topUp,
         amiIdSsmParameterName,
-        runnerTracingEnabled,
+        tracingEnabled,
       },
       githubInstallationClient,
     );

@@ -77,7 +77,7 @@ const EXPECTED_RUNNER_PARAMS: RunnerInputParameters = {
     instanceAllocationStrategy: 'lowest-price',
   },
   subnets: ['subnet-123'],
-  runnerTracingEnabled: false,
+  tracingEnabled: false,
 };
 let expectedRunnerParams: RunnerInputParameters;
 

@@ -50,7 +50,7 @@
   ec2instanceCriteria: RunnerInputParameters['ec2instanceCriteria'];
   numberOfRunners?: number;
   amiIdSsmParameterName?: string;
-  runnerTracingEnabled?: boolean;
+  tracingEnabled?: boolean;
 }
 
 function generateRunnerServiceConfig(githubRunnerConfig: CreateGitHubRunnerConfig, token: string) {
@@ -236,77 +236,77 @@
   const amiIdSsmParameterName = process.env.AMI_ID_SSM_PARAMETER_NAME;
   const runnerNamePrefix = process.env.RUNNER_NAME_PREFIX || '';
   const ssmConfigPath = process.env.SSM_CONFIG_PATH || '';
-  const runnerTracingEnabled = yn(process.env.POWERTOOLS_TRACE_ENABLED, { default: false });
+  const tracingEnabled = yn(process.env.POWERTOOLS_TRACE_ENABLED, { default: false });
 
   if (ephemeralEnabled && payload.eventType !== 'workflow_job') {
     logger.warn(`${payload.eventType} event is not supported in combination with ephemeral runners.`);
    throw Error(
      `The event type ${payload.eventType} is not supported in combination with ephemeral runners.` +
        `Please ensure you have enabled workflow_job events.`,
    );
  }
  const ephemeral = ephemeralEnabled && payload.eventType === 'workflow_job';
  const runnerType = enableOrgLevel ? 'Org' : 'Repo';
  const runnerOwner = enableOrgLevel ? payload.repositoryOwner : `${payload.repositoryOwner}/${payload.repositoryName}`;

  addPersistentContextToChildLogger({
    runner: {
      type: runnerType,
      owner: runnerOwner,
    },
    github: {
      event: payload.eventType,
      workflow_job_id: payload.id.toString(),
    },
  });

  logger.info(`Received event`);

  let ghesApiUrl = '';
  if (ghesBaseUrl) {
    ghesApiUrl = `${ghesBaseUrl}/api/v3`;
  }

  const installationId = await getInstallationId(ghesApiUrl, enableOrgLevel, payload);
  const ghAuth = await createGithubInstallationAuth(installationId, ghesApiUrl);
  const githubInstallationClient = await createOctoClient(ghAuth.token, ghesApiUrl);
  if (!enableJobQueuedCheck || (await isJobQueued(githubInstallationClient, payload))) {
    const currentRunners = await listEC2Runners({
      environment,
      runnerType,
      runnerOwner,
    });
    logger.info(`Current runners: ${currentRunners.length} of ${maximumRunners}`);

    if (currentRunners.length < maximumRunners) {
      logger.info(`Attempting to launch a new runner`);

      await createRunners(
        {
          ephemeral,
          enableJitConfig,
          ghesBaseUrl,
          runnerLabels,
          runnerGroup,
          runnerNamePrefix,
          runnerOwner,
          runnerType,
          disableAutoUpdate,
          ssmTokenPath,
          ssmConfigPath,
        },
        {
          ec2instanceCriteria: {
            instanceTypes,
            targetCapacityType: instanceTargetTargetCapacityType,
            maxSpotPrice: instanceMaxSpotPrice,
            instanceAllocationStrategy: instanceAllocationStrategy,
          },
          environment,
           launchTemplateName,
           subnets,
           amiIdSsmParameterName,
-          runnerTracingEnabled,
+          tracingEnabled,
         },
         githubInstallationClient,
       );

@@ -65,13 +65,6 @@ cleanup() {
 
   if [ "$exit_code" -ne 0 ]; then
     echo "ERROR: runner-start-failed with exit code $exit_code occurred on $error_location"
-    # Create a CloudWatch metric for error
-    aws cloudwatch put-metric-data \
-      --metric-name "RunnerInstanceUnhealthy" \
-      --namespace "Github Runners metrics" \
-      --value 1 \
-      --region "$region" \
-      --dimensions "InstanceId=$instance_id"
     create_xray_error_segment "$SEGMENT" "runner-start-failed with exit code $exit_code occurred on $error_location - $error_lineno"
   else
     create_xray_success_segment "$SEGMENT"
@@ -160,7 +153,7 @@ if [[ "$xray_trace_id" != "" ]]; then
   # run xray service
   curl https://s3.us-east-2.amazonaws.com/aws-xray-assets.us-east-2/xray-daemon/aws-xray-daemon-linux-3.x.zip -o aws-xray-daemon-linux-3.x.zip
   unzip aws-xray-daemon-linux-3.x.zip -d aws-xray-daemon-linux-3.x
-  sudo chmod +x ./aws-xray-daemon-linux-3.x/xray
+  chmod +x ./aws-xray-daemon-linux-3.x/xray
   ./aws-xray-daemon-linux-3.x/xray -o -n "$region" &
 
 

@@ -0,0 +1,10 @@
+variable "lambda_tracing_mode" { # deprecated input; the validation below rejects any non-null value
+  description = "DEPRECATED: Replaced by `tracing_config`."
+  type        = string
+  default     = null # the default is the only value that passes validation
+
+  validation {
+    condition     = anytrue([var.lambda_tracing_mode == null]) # true only while the variable is left null
+    error_message = "DEPRECATED, Replaced by `tracing_config`."
+  }
+}