wellcomecollection · kenoir · Dec 16, 2024 · Dec 11, 2024 · Dec 11, 2024 · Dec 11, 2024
@@ -2,6 +2,9 @@ FROM public.ecr.aws/amazoncorretto/amazoncorretto:11 AS base
 
 LABEL maintainer="Wellcome Collection <[email protected]>"
 
+RUN yum install awscli -y
+COPY bash_secrets_extension.sh /opt/extensions/bash_secrets_extension.sh
+
 ADD target/universal/stage /opt/docker
 ENV CLASSPATH="/opt/docker/lib/*"
 WORKDIR /opt/docker
@@ -11,6 +14,7 @@ FROM base AS ecs
 ENTRYPOINT ["/opt/docker/bin/main"]
 
 FROM base AS lambda_rie
+
 ADD https://github.com/aws/aws-lambda-runtime-interface-emulator/releases/download/v1.22/aws-lambda-rie /aws-lambda/aws-lambda-rie
 RUN chmod +x /aws-lambda/aws-lambda-rie
 ENTRYPOINT ["/aws-lambda/aws-lambda-rie"]

@@ -21,13 +21,7 @@ You must provide an accessible profile with access to the index secrets.
 
 You can run the Lambda version locally thus:
 
-Build the appropriate Docker
-
-`docker build --target lambda_rie -t lambda_relation_embedder .`
-
-Run it with the port available
-
-`docker run -p 9000:8080 lambda_relation_embedder`
+`./scripts/run_local.sh <PIPELINE_DATE>`
 
 You can now post JSON SQS messages to it. Because SQS-fed-by-SNS is so awkwardly verbose,
 a convenience script will fill out the boilerplate for you. As with CLIMain, you can pipe some

@@ -0,0 +1,110 @@
+#!/usr/bin/env bash
+
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+
+set -euo pipefail
+
+OWN_FILENAME="$(basename $0)"
+LAMBDA_EXTENSION_NAME="$OWN_FILENAME" # (external) extension name has to match the filename
+TMP_FILE=/tmp/$OWN_FILENAME
+
+# Graceful Shutdown
+_term() {
+  echo "[${LAMBDA_EXTENSION_NAME}] Received SIGTERM"
+  # forward SIGTERM to child procs and exit
+  kill -TERM "$PID" 2>/dev/null
+  echo "[${LAMBDA_EXTENSION_NAME}] Exiting"
+  exit 0
+}
+
+forward_sigterm_and_wait() {
+  trap _term SIGTERM
+  wait "$PID"
+  trap - SIGTERM
+}
+
+# Initialization
+# To run any extension processes that need to start before the runtime initializes, run them before the /register
+echo "[${LAMBDA_EXTENSION_NAME}] Initialization"
+
+# Read secret from AWS Secrets Manager
+get_secret() {
+  local secret_name=$1
+  local secret_value
+  echo "[${LAMBDA_EXTENSION_NAME}] Getting secret: $secret_name" > /dev/tty
+
+  secret_value=$(aws secretsmanager get-secret-value --secret-id "$secret_name" --query SecretString --output text)
+  if [[ -z "$secret_value" ]]; then
+    echo "[${LAMBDA_EXTENSION_NAME}] Secret not found: $secret_name" > /dev/tty
+    exit 1
+  fi
+  echo "$secret_value"
+}
+
+# Extract secret value from environment variable and write to config file
+extract_secret_value_to_config() {
+  local env_var=$1
+  local env_var_value
+  local env_var_key
+  local secret_key
+  local secret_value
+
+  env_var_value=$(echo "$env_var" | cut -d= -f2)
+  env_var_key=$(echo "$env_var" | cut -d= -f1)
+
+  if [[ $env_var_value == "secret:"* ]]; then
+    secret_key=$(echo "$env_var_value" | cut -d: -f2)
+    secret_value=$(get_secret "$secret_key")
+    echo "$env_var_key=\"$secret_value\"" >> /tmp/config
+  fi
+}
+
+# Create a configuration file with secrets
+create_config_file() {
+  local env_vars
+  local env_var
+
+  # Ensure the file exists, and is empty
+  touch /tmp/config
+  echo -n > /tmp/config
+
+  env_vars=$(printenv)
+  for env_var in $env_vars; do
+    extract_secret_value_to_config "$env_var"
+  done
+}
+
+create_config_file
+
+# Registration
+# The extension registration also signals to Lambda to start initializing the runtime.
+HEADERS="$(mktemp)"
+echo "[${LAMBDA_EXTENSION_NAME}] Registering at http://${AWS_LAMBDA_RUNTIME_API}/2020-01-01/extension/register"
+curl -sS -LD "$HEADERS" \
+  -XPOST "http://${AWS_LAMBDA_RUNTIME_API}/2020-01-01/extension/register" \
+  --header "Lambda-Extension-Name: ${LAMBDA_EXTENSION_NAME}" \
+  -d "{ \"events\": [\"SHUTDOWN\"]}" > "$TMP_FILE"
+
+RESPONSE=$(<"$TMP_FILE")
+# Extract Extension ID from response headers
+EXTENSION_ID=$(grep -Fi Lambda-Extension-Identifier "$HEADERS" | tr -d '[:space:]' | cut -d: -f2)
+echo "[${LAMBDA_EXTENSION_NAME}] Registration response: ${RESPONSE} with EXTENSION_ID $(grep -Fi Lambda-Extension-Identifier "$HEADERS" | tr -d '[:space:]' | cut -d: -f2)"
+
+# Event processing
+# Continuous loop to wait for events from Extensions API
+while true
+do
+  echo "[${LAMBDA_EXTENSION_NAME}] Waiting for event. Get /next event from http://${AWS_LAMBDA_RUNTIME_API}/2020-01-01/extension/event/next"
+
+  # Get an event. The HTTP request will block until one is received
+  curl -sS -L -XGET "http://${AWS_LAMBDA_RUNTIME_API}/2020-01-01/extension/event/next" --header "Lambda-Extension-Identifier: ${EXTENSION_ID}" > $TMP_FILE &
+  PID=$!
+  forward_sigterm_and_wait
+
+  EVENT_DATA=$(<"$TMP_FILE")
+  if [[ $EVENT_DATA == *"SHUTDOWN"* ]]; then
+    echo "[extension: ${LAMBDA_EXTENSION_NAME}] Received SHUTDOWN event. Exiting."  1>&2;
+    exit 0 # Exit if we receive a SHUTDOWN event
+  fi
+done
@@ -1,4 +1,5 @@
 version: '3.3'
+
 services:
   localstack:
     image: "public.ecr.aws/localstack/localstack:3.5.0"

@@ -0,0 +1,22 @@
+# Run locally using `docker compose build lambda && docker compose run --rm  --service-ports lambda`
+services:
+  lambda:
+    build:
+      context: .
+      dockerfile: Dockerfile
+      target: lambda_rie
+    volumes:
+      - ~/.aws:/root/.aws
+    ports:
+      - "9000:8080"
+    environment:
+      - AWS_PROFILE=platform-developer
+      - metrics_namespace=catalogue-dev_relation_embedder
+      - affected_works_scroll_size=50
+      - complete_tree_scroll_size=800
+      - index_batch_size=100
+      - index_flush_interval_seconds=60
+      - es_protocol=https
+      - es_port=443
+    env_file:
+      - .env
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+if [ "$#" -lt 1 ]; then
+  echo "Usage: $0 <PIPELINE_DATE> [--skip-build]"
+  exit 1
+fi
+
+export PIPELINE_DATE=$1
+SKIP_BUILD=false
+if [ "$#" -eq 2 ] && [ "$2" == "--skip-build" ]; then
+  SKIP_BUILD=true
+fi
+
+PROJECT_NAME="relation_embedder"
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+cd "$DIR"/..
+
+# Read .template.env, substitute variables, and write to .env
+envsubst < template.env > .env
+
+# Build the project, skipping if requested
+if [ "$SKIP_BUILD" = true ]; then
+  echo "Skipping build"
+else
+  pushd ../../..
+  sbt "project $PROJECT_NAME" ";stage"
+  popd
+fi
+
+# Build the docker image
+docker compose -f local.docker-compose.yml \
+  build lambda
+
+# Run the docker image
+docker compose -f local.docker-compose.yml \
+  run --rm  --service-ports lambda
@@ -11,3 +11,4 @@ es.host=${?es_host}
 es.port=${?es_port}
 es.protocol=${?es_protocol}
 es.apikey=${?es_apikey}
+relation_embedder.use_downstream=${?use_downstream}
@@ -3,16 +3,19 @@ package weco.pipeline.relation_embedder
 import com.amazonaws.services.lambda.runtime.{Context, RequestHandler}
 import grizzled.slf4j.Logging
 import com.amazonaws.services.lambda.runtime.events.SQSEvent
-import com.typesafe.config.ConfigFactory
 import org.apache.pekko.actor.ActorSystem
+import weco.pipeline.relation_embedder.lib._
 
 import scala.concurrent.duration.DurationInt
 import scala.concurrent.{Await, ExecutionContext, Future}
 
-object LambdaMain extends RequestHandler[SQSEvent, String] with Logging {
-  import weco.pipeline.relation_embedder.lib.SQSEventOps._
+object LambdaMain
+    extends RequestHandler[SQSEvent, String]
+    with Logging
+    with LambdaConfiguration {
+
+  import SQSEventOps._
 
-  private val config = ConfigFactory.load("application")
   override def handleRequest(
     event: SQSEvent,
     context: Context
@@ -24,6 +27,7 @@ object LambdaMain extends RequestHandler[SQSEvent, String] with Logging {
     val batchProcessor = BatchProcessor(config)
 
     info(s"running relation_embedder lambda, got event: $event")
+
     // Wait here so that lambda can finish executing correctly.
     // 15 minutes is the maximum time allowed for a lambda to run.
     Await.result(

@@ -0,0 +1,33 @@
+package weco.pipeline.relation_embedder.lib
+
+import java.io.File
+import com.typesafe.config.{Config, ConfigFactory}
+
+trait Configuration {
+  val config: Config
+}
+
+trait LambdaConfiguration extends Configuration {
+  private val defaultResolveFromFile: String = "/tmp/config"
+  private val defaultApplicationConfig: String = "application.conf"
+
+  private val lambdaConfigFile: File =
+    new File(defaultResolveFromFile)
+
+  protected val baseConfig: Config =
+    ConfigFactory.load()
+
+  protected val applicationConfig: Config =
+    ConfigFactory.parseResources(defaultApplicationConfig)
+
+  protected val lambdaConfig: Config = if (lambdaConfigFile.exists()) {
+    ConfigFactory.parseFile(lambdaConfigFile)
+  } else {
+    ConfigFactory.empty()
+  }
+
+  lazy val config = lambdaConfig
+    .withFallback(applicationConfig)
+    .withFallback(baseConfig)
+    .resolve()
+}
@@ -0,0 +1,36 @@
+package weco.pipeline.relation_embedder.helpers
+
+import com.typesafe.config.{Config, ConfigFactory}
+import weco.fixtures.TestWith
+import weco.pipeline.relation_embedder.lib.LambdaConfiguration
+
+trait ConfigurationTestHelpers {
+
+  class TestLambdaConfiguration(
+                                 override val baseConfig: Config,
+                                 override val applicationConfig: Config,
+                                 override val lambdaConfig: Config
+                               ) extends LambdaConfiguration
+
+  def createConfig(configString: String): Config = {
+    ConfigFactory.parseString(configString.stripMargin)
+  }
+
+  implicit class AsConfig(config: String) {
+    def asConfig: Config = createConfig(config)
+  }
+
+  val emptyConfig = ConfigFactory.empty()
+
+  def withLayeredConfig[R](
+                            baseConfig: Config = emptyConfig,
+                            applicationConfig: Config = emptyConfig,
+                            lambdaConfig: Config = emptyConfig
+                          )(testWith: TestWith[Config, R]): R = {
+    testWith(new TestLambdaConfiguration(
+      baseConfig,
+      applicationConfig,
+      lambdaConfig
+    ).config)
+  }
+}
@@ -0,0 +1,4 @@
+es_apikey=secret:elasticsearch/pipeline_storage_${PIPELINE_DATE}/relation_embedder/api_key
+es_host=secret:elasticsearch/pipeline_storage_${PIPELINE_DATE}/public_host
+es_denormalised_index=works-denormalised-${PIPELINE_DATE}
+es_merged_index=works-merged-${PIPELINE_DATE}
@@ -9,7 +9,7 @@ module "pipeline_step" {
   description  = var.description
 
   environment = {
-    variables = var.environment_variables
+    variables = local.environment_variables_with_secrets
   }
 }
 

@@ -0,0 +1,38 @@
+locals {
+  secret_env_vars = {
+    for k, v in var.secret_env_vars
+    : k => "secret:${v}"
+  }
+
+  secret_names = values(var.secret_env_vars)
+
+  environment_variables_with_secrets = merge(var.environment_variables, local.secret_env_vars)
+}
+
+data "aws_region" "current" {}
+data "aws_caller_identity" "current" {}
+
+# data block for aws_iam_policy_document to read each secret in secret_names (presuming they are in the same account as the lambda)
+data "aws_iam_policy_document" "secrets_policy" {
+  statement {
+    actions = ["secretsmanager:GetSecretValue"]
+    resources = [
+      for secret_name in local.secret_names
+      : "arn:aws:secretsmanager:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:secret:${secret_name}-*"
+    ]
+  }
+}
+
+resource "aws_iam_policy" "secret_policy" {
+  count = length(local.secret_names) > 0 ? 1 : 0
+
+  name_prefix = "${local.name}-secrets-policy"
+  policy      = data.aws_iam_policy_document.secrets_policy.json
+}
+
+resource "aws_iam_role_policy_attachment" "lambda_secret_role_policy" {
+  count = length(local.secret_names) > 0 ? 1 : 0
+
+  role       = module.pipeline_step.lambda_role.name
+  policy_arn = aws_iam_policy.secret_policy[0].arn
+}
@@ -20,6 +20,10 @@ variable "pipeline_date" {
   type = string
 }
 
+variable "secret_env_vars" {
+  type    = map(string)
+  default = {}
+}
 
 variable "environment_variables" {
   type        = map(string)

@@ -26,6 +26,9 @@ module "embedder_lambda" {
     index_batch_size             = 100 // NOTE: too large results in 413 from ES
     index_flush_interval_seconds = 60
   }
+
+  secret_env_vars = var.pipeline_storage_es_service_secrets["relation_embedder"]
+
   # see comment on fargate service's queue_visibility_timeout_seconds
   # 15 minutes is the max for lambda, is it going to be enough?
   timeout = 60 * 15 # 15 Minutes