From bdb9af9299e028bf9434f9a6f51dd2f9c6882c91 Mon Sep 17 00:00:00 2001
From: Sakari Poussa
Date: Thu, 24 Oct 2024 22:38:33 +0300
Subject: [PATCH] terraform: add AWS/EKS deployment for ChatQnA (#480)

* terraform: add AWS/EKS deployment for ChatQnA

Signed-off-by: Sakari Poussa

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: Sakari Poussa
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 cloud-service-provider/aws/eks/README.MD      |  90 ++++++++++
 .../aws/eks/eks-efs-csi-pvc.yaml              |  14 ++
 cloud-service-provider/aws/eks/main.tf        | 167 ++++++++++++++++++
 .../aws/eks/opea-chatqna.tfvars               |   4 +
 cloud-service-provider/aws/eks/outputs.tf     |  27 +++
 cloud-service-provider/aws/eks/terraform.tf   |  12 ++
 cloud-service-provider/aws/eks/variables.tf   |  59 +++++++
 7 files changed, 373 insertions(+)
 create mode 100644 cloud-service-provider/aws/eks/README.MD
 create mode 100644 cloud-service-provider/aws/eks/eks-efs-csi-pvc.yaml
 create mode 100644 cloud-service-provider/aws/eks/main.tf
 create mode 100644 cloud-service-provider/aws/eks/opea-chatqna.tfvars
 create mode 100644 cloud-service-provider/aws/eks/outputs.tf
 create mode 100644 cloud-service-provider/aws/eks/terraform.tf
 create mode 100644 cloud-service-provider/aws/eks/variables.tf

diff --git a/cloud-service-provider/aws/eks/README.MD b/cloud-service-provider/aws/eks/README.MD
new file mode 100644
index 00000000..22aa68b5
--- /dev/null
+++ b/cloud-service-provider/aws/eks/README.MD
@@ -0,0 +1,90 @@

# OPEA applications AWS EKS deployment guide

This guide shows how to deploy OPEA applications on Amazon Web Services (AWS) Elastic Kubernetes Service (EKS) using Terraform.

## Prerequisites

- Access to AWS EKS
- [Terraform](https://developer.hashicorp.com/terraform/tutorials/aws-get-started/install-cli), [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) and [Helm](https://helm.sh/docs/helm/helm_install/) installed on your local machine.

## Setup

The setup uses Terraform to create an EKS cluster with the following properties:

- 1-node EKS cluster with a 50 GB disk and an `m7i.8xlarge` SPOT instance (32 vCPU and 128 GB memory)
- Node group scaling up to 10 nodes
- Storage Class (SC) `efs-sc` and Persistent Volume Claim (PVC) `model-volume` for storing the model data
- `LoadBalancer` address type for the service for external consumption
- Updates the kubeconfig file for `kubectl` access

Initialize the Terraform environment.

```bash
terraform init
```

Add the OPEA Helm chart repository.

```bash
helm repo add opea https://opea-project.github.io/GenAIInfra
helm repo update
```

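Optionally, sanity-check the tooling and your AWS credentials before proceeding. This is a minimal check, assuming the AWS CLI is already configured with a profile that can create EKS clusters:

```bash
# Confirm the required tools are installed
terraform version
aws --version
helm version

# Confirm the AWS CLI is authenticated (prints the account ID and caller ARN)
aws sts get-caller-identity
```
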
## EKS cluster

By default, a 1-node cluster is created, which is suitable for running the OPEA application. See `variables.tf` and `opea-<application>.tfvars` (e.g., `opea-chatqna.tfvars`) if you want to tune the cluster properties, e.g., the number of nodes, instance types or disk size.

## Persistent Volume Claim

OPEA needs a volume to store the model. For that we need to create a Kubernetes Persistent Volume Claim (PVC). OPEA requires the `ReadWriteMany` access mode, since multiple pods need access to the storage and they can be on different nodes. On EKS, only EFS supports `ReadWriteMany`. Thus, each OPEA application below uses the file `eks-efs-csi-pvc.yaml` to create a PVC in its namespace.

## OPEA Applications

### ChatQnA

Use the commands below to create the EKS cluster.

```bash
terraform plan --var-file opea-chatqna.tfvars -out opea-chatqna.plan
terraform apply "opea-chatqna.plan"
```

Once the cluster is ready, the kubeconfig file to access the new cluster is updated automatically. By default, the file is `~/.kube/config`.

Now you should have access to the cluster via the `kubectl` command.

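For a quick check, list the worker nodes; with the defaults above a single node should be reported:

```bash
kubectl get nodes -o wide
```
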
Deploy the ChatQnA application with Helm. The command reads your Hugging Face Hub token from the `HFTOKEN` environment variable, so export it first.

```bash
helm install -n chatqna --create-namespace chatqna opea/chatqna --set service.type=LoadBalancer --set global.modelUsePVC=model-volume --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN}
```

Create the PVC as mentioned [above](#persistent-volume-claim). Note that pods mounting the model volume remain `Pending` until the PVC exists and binds.

```bash
kubectl apply -f eks-efs-csi-pvc.yaml -n chatqna
```

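You can confirm that the claim binds. Assuming the `aws-efs-csi-driver` add-on is healthy, the `efs-sc` storage class provisions an EFS access point on demand and `STATUS` should become `Bound` shortly:

```bash
kubectl get pvc -n chatqna model-volume
```
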
After a while (the model download alone can take several minutes), the OPEA application should be running. You can check the status via `kubectl`.

```bash
kubectl get pod -n chatqna
```

You can now start using the OPEA application. The `LoadBalancer` hostname can take a minute or two to become resolvable after the service is created.

```bash
OPEA_SERVICE=$(kubectl get svc -n chatqna chatqna -o jsonpath='{.status.loadBalancer.ingress[0].hostname}')
curl http://${OPEA_SERVICE}:8888/v1/chatqna \
    -H "Content-Type: application/json" \
    -d '{"messages": "What is the revenue of Nike in 2023?"}'
```

## Cleanup

Delete the application and the cluster via the following commands. Uninstall the Helm release first, so that the load balancer it created is removed before Terraform destroys the VPC.

```bash
helm uninstall -n chatqna chatqna
terraform destroy -var-file opea-chatqna.tfvars
```

diff --git a/cloud-service-provider/aws/eks/eks-efs-csi-pvc.yaml b/cloud-service-provider/aws/eks/eks-efs-csi-pvc.yaml
new file mode 100644
index 00000000..4120ea0c
--- /dev/null
+++ b/cloud-service-provider/aws/eks/eks-efs-csi-pvc.yaml
@@ -0,0 +1,14 @@

# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: model-volume
spec:
  accessModes:
    - ReadWriteMany
  storageClassName: efs-sc
  resources:
    requests:
      storage: 50Gi

diff --git a/cloud-service-provider/aws/eks/main.tf b/cloud-service-provider/aws/eks/main.tf
new file mode 100644
index 00000000..f02e6f09
--- /dev/null
+++ b/cloud-service-provider/aws/eks/main.tf
@@ -0,0 +1,167 @@

provider "aws" {
  region = var.region
}

provider "kubernetes" {
  config_path = "~/.kube/config"
}

data "aws_availability_zones" "available" {
  filter {
    name   = "opt-in-status"
    values = ["opt-in-not-required"]
  }
}

data "aws_caller_identity" "current" {}

locals {
  vpc_cidr = "10.0.0.0/16"
  azs      = slice(data.aws_availability_zones.available.names, 0, 3)
}

# VPC with private/public subnets across three AZs; a single NAT gateway keeps costs down
module "vpc" {
  source = "terraform-aws-modules/vpc/aws"

  name = "${var.cluster_name}-vpc"
  cidr = local.vpc_cidr
  azs  = local.azs

  private_subnets = ["10.0.1.0/24", "10.0.2.0/24", "10.0.3.0/24"]
  public_subnets  = ["10.0.4.0/24", "10.0.5.0/24", "10.0.6.0/24"]

  enable_nat_gateway   = true
  single_nat_gateway   = true
  enable_dns_hostnames = true

  public_subnet_tags = {
    "kubernetes.io/role/elb" = 1
  }

  private_subnet_tags = {
    "kubernetes.io/role/internal-elb" = 1
  }
}

module "eks" {
  source  = "terraform-aws-modules/eks/aws"
  version = "20.8.5"

  cluster_name    = var.cluster_name
  cluster_version = var.cluster_version

  cluster_endpoint_public_access           = true
  enable_cluster_creator_admin_permissions = true

  cluster_addons = {
    coredns            = { most_recent = true }
    kube-proxy         = { most_recent = true }
    vpc-cni            = { most_recent = true }
    aws-efs-csi-driver = { most_recent = true }
  }

  vpc_id     = module.vpc.vpc_id
  subnet_ids = module.vpc.private_subnets

  eks_managed_node_group_defaults = {
    instance_types = var.instance_types
    capacity_type  = var.capacity_type
    min_size       = var.min_size
    max_size       = var.max_size
    desired_size   = var.desired_size
    block_device_mappings = {
      xvda = {
        device_name = "/dev/xvda"
        ebs = {
          volume_size           = var.disk_size
          volume_type           = "gp3"
          iops                  = 3000
          throughput            = 125
          encrypted             = true
          delete_on_termination = true
        }
      }
    }
  }

  eks_managed_node_groups = {
    default = {
      # Allow the nodes to mount EFS via the CSI driver
      iam_role_additional_policies = {
        AmazonEFSCSIDriverPolicy = "arn:aws:iam::aws:policy/service-role/AmazonEFSCSIDriverPolicy"
      }
    }
  }
}

module "efs" {
  source = "terraform-aws-modules/efs/aws"

  # File system
  name           = var.cluster_name
  creation_token = var.cluster_name
  encrypted      = true
  kms_key_arn    = module.kms.key_arn

  # File system policy
  policy_statements = [
    {
      sid     = "Example"
      actions = ["elasticfilesystem:ClientMount"]
      principals = [
        {
          type        = "AWS"
          identifiers = [data.aws_caller_identity.current.arn]
        }
      ]
    }
  ]

  # Mount targets / security group
  mount_targets              = { for k, v in zipmap(local.azs, module.vpc.private_subnets) : k => { subnet_id = v } }
  security_group_description = "EFS security group"
  security_group_vpc_id      = module.vpc.vpc_id
  security_group_rules = {
    vpc = {
      description = "NFS ingress from VPC private subnets"
      cidr_blocks = module.vpc.private_subnets_cidr_blocks
    }
  }

  tags = {
    Terraform   = "true"
    Environment = "dev"
  }
}

module "kms" {
  source = "terraform-aws-modules/kms/aws"

  aliases               = ["efs/${var.cluster_name}"]
  description           = "EFS customer managed key"
  enable_default_policy = true

  # For example use only
  deletion_window_in_days = 7
}

# Refresh the local kubeconfig so kubectl (and the kubernetes provider) can reach the new cluster
resource "null_resource" "kubectl" {
  provisioner "local-exec" {
    command = "aws eks --region ${var.region} update-kubeconfig --name ${var.cluster_name}"
  }
  depends_on = [module.eks]
}

resource "kubernetes_storage_class_v1" "eks_efs" {
  metadata {
    name = "efs-sc"
  }
  storage_provisioner = "efs.csi.aws.com"
  reclaim_policy      = "Retain"
  parameters = {
    provisioningMode = "efs-ap"
    fileSystemId     = module.efs.id
    directoryPerms   = "700"
  }
  depends_on = [
    null_resource.kubectl
  ]
}

diff --git a/cloud-service-provider/aws/eks/opea-chatqna.tfvars b/cloud-service-provider/aws/eks/opea-chatqna.tfvars
new file mode 100644
index 00000000..8e85aea2
--- /dev/null
+++ b/cloud-service-provider/aws/eks/opea-chatqna.tfvars
@@ -0,0 +1,4 @@

cluster_name   = "opea-chatqna"
instance_types = ["m7i.8xlarge"]
capacity_type  = "SPOT" # cheaper
disk_size      = 50

diff --git a/cloud-service-provider/aws/eks/outputs.tf b/cloud-service-provider/aws/eks/outputs.tf
new file mode 100644
index 00000000..acd1a9de
--- /dev/null
+++ b/cloud-service-provider/aws/eks/outputs.tf
@@ -0,0 +1,27 @@

# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: MPL-2.0

output "cluster_endpoint" {
  description = "Endpoint for EKS control plane"
  value       = module.eks.cluster_endpoint
}

output "cluster_oidc_issuer_url" {
  description = "The URL on the EKS cluster for the OpenID Connect identity provider"
  value       = module.eks.cluster_oidc_issuer_url
}

output "cluster_security_group_id" {
  description = "Security group IDs attached to the cluster control plane"
  value       = module.eks.cluster_security_group_id
}

output "region" {
  description = "AWS region"
  value       = var.region
}

output "cluster_name" {
  description = "Kubernetes cluster name"
  value       = module.eks.cluster_name
}

diff --git a/cloud-service-provider/aws/eks/terraform.tf b/cloud-service-provider/aws/eks/terraform.tf
new file mode 100644
index 00000000..ffd53436
--- /dev/null
+++ b/cloud-service-provider/aws/eks/terraform.tf
@@ -0,0 +1,12 @@

terraform {
  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 5.49.0"
    }
    kubernetes = {
      source  = "hashicorp/kubernetes"
      version = "2.33.0"
    }
  }
}

diff --git a/cloud-service-provider/aws/eks/variables.tf b/cloud-service-provider/aws/eks/variables.tf
new file mode 100644
index 00000000..2350e260
--- /dev/null
+++ b/cloud-service-provider/aws/eks/variables.tf
@@ -0,0 +1,59 @@

variable "region" {
  description = "AWS region"
  type        = string
  default     = "eu-west-1"
}

variable "cluster_name" {
  description = "EKS cluster name"
  type        = string
  default     = null
}

variable "cluster_version" {
  description = "EKS cluster version"
  type        = string
  default     = "1.31"
}

variable "instance_types" {
  description = "EC2 instance types for nodes"
  type        = list(string)
  default     = ["t3.medium"]
}

variable "use_custom_launch_template" {
  description = "Use a custom launch template for the node group"
  type        = bool
  default     = true
}

variable "disk_size" {
  description = "Disk size in GiB for nodes"
  type        = number
  default     = 20
}

variable "capacity_type" {
  description = "EC2 capacity type for nodes, ON_DEMAND or SPOT"
  type        = string
  default     = "ON_DEMAND"
}

variable "min_size" {
  description = "Minimum number of nodes in the node group"
  type        = number
  default     = 1
}

variable "max_size" {
  description = "Maximum number of nodes in the node group"
  type        = number
  default     = 10
}

variable "desired_size" {
  description = "Desired number of nodes in the node group"
  type        = number
  default     = 1
}
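A closing note on the variables: `opea-chatqna.tfvars` overrides these defaults per application, and for one-off experiments the same variables can also be overridden on the Terraform command line with `-var`. A sketch, with hypothetical values:

```bash
# Example: use on-demand capacity and start with two nodes instead of one
terraform plan --var-file opea-chatqna.tfvars \
  -var capacity_type=ON_DEMAND \
  -var desired_size=2 \
  -out opea-chatqna.plan
terraform apply "opea-chatqna.plan"
```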