From 8c398dce7687398fd8429366c3af172de7fc0834 Mon Sep 17 00:00:00 2001 From: Sarah Gibson Date: Thu, 13 Oct 2022 14:14:31 +0100 Subject: [PATCH 01/29] Add jsonnet file and ssh keys for nasa-cryo eksctl cluster --- eksctl/nasa-cryo.jsonnet | 109 +++++++++++++++++++++++++++ eksctl/ssh-keys/nasa-cryo.key.pub | 1 + eksctl/ssh-keys/secret/nasa-cryo.key | 21 ++++++ 3 files changed, 131 insertions(+) create mode 100644 eksctl/nasa-cryo.jsonnet create mode 100644 eksctl/ssh-keys/nasa-cryo.key.pub create mode 100644 eksctl/ssh-keys/secret/nasa-cryo.key diff --git a/eksctl/nasa-cryo.jsonnet b/eksctl/nasa-cryo.jsonnet new file mode 100644 index 0000000000..384a79ec26 --- /dev/null +++ b/eksctl/nasa-cryo.jsonnet @@ -0,0 +1,109 @@ +// Exports an eksctl config file for the nasa-cryo cluster +local ng = import "./libsonnet/nodegroup.jsonnet"; + +// place all cluster nodes here +local clusterRegion = "us-west-2"; +local masterAzs = ["us-west-2a", "us-west-2b", "us-west-2c"]; +local nodeAz = "us-west-2a"; + +// Node definitions for notebook nodes. Config here is merged +// with our notebook node definition. +// A `node.kubernetes.io/instance-type` label is added, so pods +// can request a particular kind of node with a nodeSelector +local notebookNodes = [ + { instanceType: "m5.large" }, + { instanceType: "m5.xlarge" }, + { instanceType: "m5.2xlarge" }, + { instanceType: "m5.8xlarge" }, +]; + +// Node definitions for dask worker nodes. Config here is merged +// with our dask worker node definition, which uses spot instances. +// A `node.kubernetes.io/instance-type` label is set to the name of the +// *first* item in instancesDistribution.instanceTypes, to match +// what we do with notebook nodes. 
Pods can request a particular +// kind of node with a nodeSelector +local daskNodes = [ + { instancesDistribution+: { instanceTypes: ["m5.large"] }}, + { instancesDistribution+: { instanceTypes: ["m5.xlarge"] }}, + { instancesDistribution+: { instanceTypes: ["m5.2xlarge"] }}, + { instancesDistribution+: { instanceTypes: ["m5.8xlarge"] }}, +]; + +{ + apiVersion: 'eksctl.io/v1alpha5', + kind: 'ClusterConfig', + metadata+: { + name: "nasa-cryo", + region: clusterRegion, + version: '1.22' + }, + availabilityZones: masterAzs, + iam: { + withOIDC: true, + }, + nodeGroups: [ + ng { + name: 'core-a', + availabilityZones: [nodeAz], + ssh: { + publicKeyPath: 'ssh-keys/nasa-cryo.key.pub' + }, + instanceType: "m5.xlarge", + minSize: 1, + maxSize: 6, + labels+: { + "hub.jupyter.org/node-purpose": "core", + "k8s.dask.org/node-purpose": "core" + }, + }, + ] + [ + ng { + // NodeGroup names can't have a '.' in them, while + // instanceTypes always have a . + name: "nb-%s" % std.strReplace(n.instanceType, ".", "-"), + availabilityZones: [nodeAz], + minSize: 0, + maxSize: 500, + instanceType: n.instanceType, + ssh: { + publicKeyPath: 'ssh-keys/nasa-cryo.key.pub' + }, + labels+: { + "hub.jupyter.org/node-purpose": "user", + "k8s.dask.org/node-purpose": "scheduler" + }, + taints+: { + "hub.jupyter.org_dedicated": "user:NoSchedule", + "hub.jupyter.org/dedicated": "user:NoSchedule" + }, + + } + n for n in notebookNodes + ] + [ + ng { + // NodeGroup names can't have a '.' in them, while + // instanceTypes always have a . 
+ name: "dask-%s" % std.strReplace(n.instancesDistribution.instanceTypes[0], ".", "-"), + availabilityZones: [nodeAz], + minSize: 0, + maxSize: 500, + ssh: { + publicKeyPath: 'ssh-keys/nasa-cryo.key.pub' + }, + labels+: { + "k8s.dask.org/node-purpose": "worker" + }, + taints+: { + "k8s.dask.org_dedicated" : "worker:NoSchedule", + "k8s.dask.org/dedicated" : "worker:NoSchedule" + }, + instancesDistribution+: { + onDemandBaseCapacity: 0, + onDemandPercentageAboveBaseCapacity: 0, + spotAllocationStrategy: "capacity-optimized", + }, + } + n for n in daskNodes + ] + + +} \ No newline at end of file diff --git a/eksctl/ssh-keys/nasa-cryo.key.pub b/eksctl/ssh-keys/nasa-cryo.key.pub new file mode 100644 index 0000000000..b807f58500 --- /dev/null +++ b/eksctl/ssh-keys/nasa-cryo.key.pub @@ -0,0 +1 @@ +ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQCypRMw4nZaf7oER+dgggklFH6pYOm8zgznRVSD9OrWP2zDXu2xYNFGmgsXIEvAxxy7DZM8c1yRCjJlSVgmBSXd1ARuO6LnnLK7THkz9jAprIIQXmO14qZNZxRV4zdVXVQMAU74vkDI2RkUGo5/XDuUCltUIAtjHcdnrk6B3CONORc96UsAHFgX3keVWmlrxcBavoh2lxrlJwQwAJP91hJm53cZURdV16jpiuseUPoZ4gUlctmECByb2Bc0PaCPP4nQy6Gph6Eo8VLFtqvGtmW6LXr9meQ7G87G7B/C0mlF/X6p75LDUnhfPyVkB2nNYS2m3xyXOS/lNSerGvUShyNqCp8MV+L5m7bBHsrfuBgZwpiSvSpqIudcUJesK2CRiLJ1Z0/mnjYwxtzSFP26JQ7xsqNahF5hu1+NrZbjx4OQea6Qk5p7yZcAEsQrjIruI+57KA3cKTq2Vq49m863wL8nLW5dICg54riqRoOq0E1JRgvpXLOK+hKo4yQKZYcJnRM= sgibson@Athena.broadband diff --git a/eksctl/ssh-keys/secret/nasa-cryo.key b/eksctl/ssh-keys/secret/nasa-cryo.key new file mode 100644 index 0000000000..9471f7fd71 --- /dev/null +++ b/eksctl/ssh-keys/secret/nasa-cryo.key @@ -0,0 +1,21 @@ +{ + "data": 
"ENC[AES256_GCM,data:/NEMqURGW8vfvJKpGNHuvI1KAF5gI8c2kXApVoC76e/djJat9H/jQU66NPGONrcQUH38fC0wH3zSp1ZRge/T9TU3s+GGJM/Q+mh1g9tc2Wnwf5TUlB49FhgBS/ZAeV3i49kC+S+COtt/5HW04Z2N05i3Rz3YckVFaliRC5P5aRN8uMry/3bfLJykqoWlNDC4tifx5j0s1WMo9pHFmrB3i/Zeb1nIz+eklAzTpLa/4CkVlythGLPxXqPQrj9XKe8suDW/8192cd/Y+WgwUlH8O/zZUaY20BLqHr7jDtdR9RRoULwZKoh7tTeoGCVWVYXb6Sw43JlyhCFo7HL/7r+uxA9H03KMh4yIlLUAPLStxmoH7aQ+uwz93QpWVJo6+t6/XUVO13kP1hmIgfpLSC66M7sB0MkK0j5gHI1pL4Z3TADN7LMb4RHTI7LX9J/umhN9N5PmuIESxWST+ZJ31LKDU/NSLMYjLKV1T4DSUgN1ZX1hM+TQOLUa6Hxu8iDwONg86WA0os8rQe1NAVmls4fPMwNRcRb0laTVs/wWJZI1u7ntBvdC5gxd7DK8jm2/UVS6rphNk3K38AE+E2M9YEhIW/u7MSuz4KFteynyTv0flNV3716apRpWpwg11Z39ACeXT0w/HqHXCLcFO3bBmIZFWnRc+XGXXFrPGdON/RFk2NBVHm/5t/MPxhp0H2DBwmRjLLGLEg2bXO+RWZhkVOQQAoGHhPin15xPumQTTu5oQYYn2fsRK4rpi3CcZSt9uJOXH2nk5t6mRGwXeR+jfeKBejPVfAA0pXDxAYU0+9hk9OAS4hLQSWTFlK3bLXCTZ4J2U2+aMLpT23b4uipt1+l7qr9i6aJPK56P/+ZJnb5KFGZ5jT7FA12jnDhrSlxWDBrPHG+PcbyWzqnnVvFKvlHC63BGURsTcQBM5Yi/cHNbZe7j8+87pK7rb99QmWRUPEH9HqbgZ0SR3/rKqBpMW00Iu5zJLzLaAXMVdMBNaqmOsD4c+H4a/wElsfPyctLtjw5AVyDkMMtrLoX4QeiCIhIPwNL2SEAeLP5N9hcmHr0d4t7gDi7dVtWtiij/MZDtofOVoLAFErk6u2V+TMcXCssMXLEgW4VZWOJWemjbibk0zWYNhZDV/UVgEFrPn/pOWZqkQyRHiCYkwhorgypLPyRQRjC9cQ4F1cUDoxGYV056I978kWigiVXPu602bo+j/fifZ7we0iXMz6CUNnQVOgNmC1RX4IEFkQ+ry+lkYegfc8R2Yyt6VjWG3k62wR0HoVFmtOdFvyIPsfblp/pqsbTktDSiHB0hH40n+wVrNAeXKDwYRGPundylbzjMr/FldoJChje0jwcI1oLxuo5KTGXTNA9bxKkU4hehSr67TduPeJLc1NNIoIS3XEtvKvfJJj61RkQ0deyiHKiBgdbfMzku+7A4F2GBDrp5uIdRo81PRACM6wNlAvVrOssE/jU79tOM2IJ6lVUdELacRtTiOg91JMLYoIZiTInQ1lVAYNkcSerWt7udW6tQjRR97lLIwLXjV9d45F9dugUiwtTXm8EbNzgPFr7isjAa2SOG/TIVKxG0/KODzK/sVr2Zh1hU5+EBJ59uqFvOemetBiw2EdOk2M0FeB/D278PHXa8klvw/rh5mMjfnkid1RyDFJf0i4GYEJXxyJv3Xfdebv7naK0gLCe6B6tsREtq2vDtmYd1M7YDwI15BvaN7lrp3nNpcpQNju+/HvgqydmoVRVcryWKk/OWOBgpu2rOrP+nu7BcbBv7znWnEFUIZCMNmxgPKTCwcgmvokjBs+79UOicphDNcpwd88t/hMz+/t1b38RKfsW3L0pfzALxZ7XPneA2r917b96SBVuUgZc3RU+EKqOPOZ2b3s5TJbWotiNo+fAhCjSqSI6Fzl83suPCTXaL9qejhdP7FST0QlLHS7SIrCGB0XqAu462rp5z9WfzTpB7U04DiXI9Y7XMEKwcurU
edqSjk/dWhyiEgrzT1QTjCYJLxn6PSMQpwNQHEdRnwp69XMr7SilbgXAfKn5kdaSP1fn/tneG4JhEiwoskPhK3+sitDVqhxu9BgdZ/033NLyT5CI1RCH7n0rzl6enHtIUBYIJac/Skb58ACqjgGoDqoisg+lbHkOyRAYNFgnjH9LxUUyOHbTl8LfvmWPLbmg9Z0drOEC2/fQ5z8ZvJihkCXDmtLghxLbDJxJBKJUs5kyVc+bppZuThRFk7kNVNAkQqr/N15ORvL58k02hEbGd2xmhFpVqtsmtDQjmO+AxTOZj/bXvt3ZenQPVBONtgeEH9WoiCS8/9sLyeM/9xSL2yWw1mAOzqx0Sf40+4sUJmcKIlzx1X15oeVN1qckzUT5ey/s/f+4/zlYOMl9DEKPSGw5qoynRy5bjQyl2L2qgWq5aHCb09cYZtdrozllv2PXn0W1+PBocMLumb65k06jCblDYsMpPjnH4WsvrDrsm0JKY/6//sdhGpjfuRGBTU5qS7Pte4jDcm9RV1soWcF0Dix1wSnsw+5PQwK9nMoJOc2v6I57BBLs6MSk8G/h2Sdd/7m5tC1yJunH1Gn5nKkDdGbWVyQ7XasqnH/1O9lHNY6gq5WcdmP/Ad9qvkkfxE7mPu1LndHoXiJEQCL1ryd0C1B5G82PXT0BNIdy6eAfrawvLEtopdue5Wf4gmGPSVubBl0vBybHXzrMWkAJJT3eaxR2RrJlGQ6ZtbvG+Gi59ihD6tMMewkNJGQsPLo/90j5mUhL/uAD5k7h2s3IB9nZ4IWITiSY8nJS8knfrTZCh7ycMYjyFbePboefa/lWz+Uyt3OKeNA3cqyziVUhT544i22VpCazBN1XD7jvUr7IF7EUAcRHjIaCUsDvI0g5DSfPtmFS9bsr/twSMqEVPtJ6nyZPGK4+vfLJO50ZRv3ANCWNzXXJwtuuPlrFDxH+hvYzxMX6Julx5wqqwRPfqerEd9bqPPnC7eDnAXW8piHvd/CWFLNS3jkBu9MHex3PuqjfW4uBlQ2yt42YEPrm18zNMnR2DYPS39EdEvw71Mh8DuEAY34K4fEXx9jerVFIPVu7ghWjPK7T8uV+x7f1tt48QHpINSrHJj4JcfWhKHpavaqUILGtqlQyfgZQ1TXzBpzdCMR+rEZNTPgn4OvnI0NbUSlC6ZZo6bH3iHLBu8fkmvX0r4o9zHcT76d+LXeXT1Qg3SSNtAtUU+EB1BsYkoWhTFIINmP1ZC3d+GoEcHjMRR/L0MTSYL1igAGvHNdWZevA/4Z116Rdh4ubnYFX3UcS9+yWLe3yGC3m06kuAmuivYZeAJFmnbq+1LSK4G3oJmJ87WxYyAp1vrXhQ7aXtKl0QDUtDtE/yUR6eQifZ4dChF7t2PBTtYegy/gggO/NbE3sxS/5ELk6+B3pdlFaV+Cn473SU0bRB0eQhzDVnxkQ9c8JMIMb3xdaugF6X1F3BmqhMPyoSh5s3e5Pb+Pq4yZd1PNMMJdYb,iv:lm7ue452sJqQxte2ZqeZ6iG9Ax/yAx49qvSSS77g1Lc=,tag:VUqgkrulKhIn3JE8sEnihA==,type:str]", + "sops": { + "kms": null, + "gcp_kms": [ + { + "resource_id": "projects/two-eye-two-see/locations/global/keyRings/sops-keys/cryptoKeys/similar-hubs", + "created_at": "2022-10-13T11:07:26Z", + "enc": "CiQA4OM7eO8TgVMM6V+MGJSs/zEfe1mxUD3XJ1kzvMiDE5EdDOwSSQDuy/p8OaLOnUtGy1hDWAUSw4xtBYWqZdDWQ4JDJqrbl+o2b2RHyGZQTkgOYyci833liLZuOOLrjMjsYIvGn3eFAqFJY9W5K44=" + } + ], + "azure_kv": null, + "hc_vault": null, + 
"age": null, + "lastmodified": "2022-10-13T11:07:26Z", + "mac": "ENC[AES256_GCM,data:G6QlD7JSFC1B+KWOm5w9ZaSAB9mf8Kj2gpJvbilbcNs30YTyFGXyxvD+ThAbkoe2NIOd7WZ+YTZRtqNXn6lF3p//SmbomKlnN2l8JlxJWJ2+lMIazDKwpSkBGztsZUWhHrmyE5wbxy8IbWQwFGIVPMHIqLCGebp2OQ6ZTL/1vI0=,iv:3zdrXp3LkYQTYio0r086bgJHesgjn/IbnFj+56mjpcI=,tag:uD78bJPI12xm/xo9nmb3vA==,type:str]", + "pgp": null, + "unencrypted_suffix": "_unencrypted", + "version": "3.7.3" + } +} \ No newline at end of file From f3e48e57f118d6bcdd4d20083d32a3f1b7147ca6 Mon Sep 17 00:00:00 2001 From: Sarah Gibson Date: Thu, 13 Oct 2022 14:42:51 +0100 Subject: [PATCH 02/29] Add generated .tfvars file for nasa-cryo --- terraform/aws/projects/nasa-cryo.tfvars | 32 +++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 terraform/aws/projects/nasa-cryo.tfvars diff --git a/terraform/aws/projects/nasa-cryo.tfvars b/terraform/aws/projects/nasa-cryo.tfvars new file mode 100644 index 0000000000..52a48e9cdb --- /dev/null +++ b/terraform/aws/projects/nasa-cryo.tfvars @@ -0,0 +1,32 @@ +region = "us-west-2" + +cluster_name = "nasa-cryo" + +cluster_nodes_location = "us-west-2a" + +user_buckets = { + "scratch-staging": { + "delete_after" : 7 + }, + "scratch": { + "delete_after": 7 + }, +} + + +hub_cloud_permissions = { + "staging" : { + requestor_pays: true, + bucket_admin_access: ["scratch-staging"], + extra_iam_policy: "" + }, + "prod" : { + requestor_pays: true, + bucket_admin_access: ["scratch"], + extra_iam_policy: "" + }, +} + +# Sarah added this value because terraform asked for it to be provided on the +# command line when she executed `tf plan` +db_instance_identifier = "nasa-cryo" \ No newline at end of file From 92ea20583e2284e46a4bc4bf5b204071ed7b00c5 Mon Sep 17 00:00:00 2001 From: Sarah Gibson Date: Fri, 14 Oct 2022 10:39:07 +0100 Subject: [PATCH 03/29] Remove db_instance_identifier from .tfvars file --- terraform/aws/projects/nasa-cryo.tfvars | 4 ---- 1 file changed, 4 deletions(-) diff --git 
a/terraform/aws/projects/nasa-cryo.tfvars b/terraform/aws/projects/nasa-cryo.tfvars index 52a48e9cdb..b7aa0c6639 100644 --- a/terraform/aws/projects/nasa-cryo.tfvars +++ b/terraform/aws/projects/nasa-cryo.tfvars @@ -26,7 +26,3 @@ hub_cloud_permissions = { extra_iam_policy: "" }, } - -# Sarah added this value because terraform asked for it to be provided on the -# command line when she executed `tf plan` -db_instance_identifier = "nasa-cryo" \ No newline at end of file From 99b53291dfcb66f0a82be65512c24d1742d72786 Mon Sep 17 00:00:00 2001 From: Sarah Gibson Date: Fri, 14 Oct 2022 10:50:21 +0100 Subject: [PATCH 04/29] Add cluster creds for nasa-cryo --- .../enc-deployer-credentials.secret.json | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 config/clusters/nasa-cryo/enc-deployer-credentials.secret.json diff --git a/config/clusters/nasa-cryo/enc-deployer-credentials.secret.json b/config/clusters/nasa-cryo/enc-deployer-credentials.secret.json new file mode 100644 index 0000000000..12f7847e1a --- /dev/null +++ b/config/clusters/nasa-cryo/enc-deployer-credentials.secret.json @@ -0,0 +1,25 @@ +{ + "AccessKey": { + "AccessKeyId": "ENC[AES256_GCM,data:Qs17cI1Br2buAKGPXJSBhI25R/0=,iv:MNvfYHTTihBsQZlW7Ypo8kd9eMOjZMmYGVoCJUDvQcE=,tag:VCJSIunnsZIev0//ngsaBg==,type:str]", + "SecretAccessKey": "ENC[AES256_GCM,data:ZeycPXTpldXIcKkPpE7rNXWknlb2Pk5PtfHnF9z4abDfhSgRlL9KdA==,iv:m0kMDT9FcG4JMnnlJAa4j94xjvjqTLvTtP+EYBeUhrs=,tag:UHFVmJPlG6vmoU26OnhXew==,type:str]", + "UserName": "ENC[AES256_GCM,data:OEun1XAVil5EV9gpu/mHJSStlQY7Mlc=,iv:zbUKP95qoRMo78OmNY3wTLqV6d4uCjjADdnVEKPUom4=,tag:LgzON7vrSVsXRLF2mfcKtg==,type:str]" + }, + "sops": { + "kms": null, + "gcp_kms": [ + { + "resource_id": "projects/two-eye-two-see/locations/global/keyRings/sops-keys/cryptoKeys/similar-hubs", + "created_at": "2022-10-14T09:49:47Z", + "enc": 
"CiQA4OM7eDHVgX84efoMlXia7IJF6oWWNGU+KSlL6i+UAtE6mucSSQDuy/p875FuqYZ2d+Bv5/mK1qb44qE7g2yWYzzW5XlCRLQRxJq8ZC44UsqRHexuFMVhfyILN4bCBLDnWlMFNhiFgVuaZ0JtsUQ=" + } + ], + "azure_kv": null, + "hc_vault": null, + "age": null, + "lastmodified": "2022-10-14T09:49:48Z", + "mac": "ENC[AES256_GCM,data:xm5zfxfo1CDv3Pm05saNZZ3+Z158cMZCS8Sf/blm/xdvJn9gztSWY/UbgVJB48yD4Utvc5l61arwUAAzFd38CSXl8w+Mn2Iu1i8FdfFwW0rxJYzHtYm6stNZZ8SRgPfe+TGpNBrfGgoKLysm+kBqyspQ5OcoDkn1tigoET3lVLE=,iv:gg/OP8i9zjrc0yfe6Jne6/t3XSzWu5HMeVE8kvDzVo8=,tag:TfT23nbLaaJOjzUArbZzyg==,type:str]", + "pgp": null, + "unencrypted_suffix": "_unencrypted", + "version": "3.7.3" + } +} \ No newline at end of file From 6038f973c5bd66165b7c011c43818300954256f3 Mon Sep 17 00:00:00 2001 From: Sarah Gibson Date: Fri, 14 Oct 2022 10:54:27 +0100 Subject: [PATCH 05/29] Add a minimal cluster.yaml file for nasa-cryo --- config/clusters/nasa-cryo/cluster.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 config/clusters/nasa-cryo/cluster.yaml diff --git a/config/clusters/nasa-cryo/cluster.yaml b/config/clusters/nasa-cryo/cluster.yaml new file mode 100644 index 0000000000..0a0670c06b --- /dev/null +++ b/config/clusters/nasa-cryo/cluster.yaml @@ -0,0 +1,10 @@ +name: nasa-cryo +provider: aws +aws: + key: enc-deployer-credentials.secret.json + clusterType: eks + clusterName: nasa-cryo + region: us-west-2 +support: + helm_chart_values_files: [] +hubs: [] From a635dd9c7e0f5bbacc326676f634539e9e13485d Mon Sep 17 00:00:00 2001 From: Sarah Gibson Date: Fri, 14 Oct 2022 11:44:45 +0100 Subject: [PATCH 06/29] Update cluster deployer credentials --- .../nasa-cryo/enc-deployer-credentials.secret.json | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/config/clusters/nasa-cryo/enc-deployer-credentials.secret.json b/config/clusters/nasa-cryo/enc-deployer-credentials.secret.json index 12f7847e1a..a12c3adac2 100644 --- a/config/clusters/nasa-cryo/enc-deployer-credentials.secret.json +++ 
b/config/clusters/nasa-cryo/enc-deployer-credentials.secret.json @@ -1,23 +1,23 @@ { "AccessKey": { - "AccessKeyId": "ENC[AES256_GCM,data:Qs17cI1Br2buAKGPXJSBhI25R/0=,iv:MNvfYHTTihBsQZlW7Ypo8kd9eMOjZMmYGVoCJUDvQcE=,tag:VCJSIunnsZIev0//ngsaBg==,type:str]", - "SecretAccessKey": "ENC[AES256_GCM,data:ZeycPXTpldXIcKkPpE7rNXWknlb2Pk5PtfHnF9z4abDfhSgRlL9KdA==,iv:m0kMDT9FcG4JMnnlJAa4j94xjvjqTLvTtP+EYBeUhrs=,tag:UHFVmJPlG6vmoU26OnhXew==,type:str]", - "UserName": "ENC[AES256_GCM,data:OEun1XAVil5EV9gpu/mHJSStlQY7Mlc=,iv:zbUKP95qoRMo78OmNY3wTLqV6d4uCjjADdnVEKPUom4=,tag:LgzON7vrSVsXRLF2mfcKtg==,type:str]" + "AccessKeyId": "ENC[AES256_GCM,data:ioqTaGNPPAsFQ+z53JU9Ez/Rfr0=,iv:9XkzxSy5SsjBad1tbYrUFUBoyjR0J7wn46CurIklma0=,tag:tzVvIgUktuJEd+7EGGFxFw==,type:str]", + "SecretAccessKey": "ENC[AES256_GCM,data:SgRFSnIv5XZXNuR7nmpuvvlYFN9YHy1J7qvn5beTgIq3tN796Y5G4Q==,iv:Aln0SJ9ZJiKJkmr4euv0kSHdozJf+2kfB+mwJwmjGOY=,tag:0ZlJMNVkPlKEoTeWJ5PNXg==,type:str]", + "UserName": "ENC[AES256_GCM,data:W8nJRay/DwepDtMsAnmzqrcrjKmooDo=,iv:Myt8UMplSKawWlpTBDpGl9t+Sza3YxXpYhsSwn5ZY7I=,tag:tKCwd9KSuzqKXhtBmp901A==,type:str]" }, "sops": { "kms": null, "gcp_kms": [ { "resource_id": "projects/two-eye-two-see/locations/global/keyRings/sops-keys/cryptoKeys/similar-hubs", - "created_at": "2022-10-14T09:49:47Z", - "enc": "CiQA4OM7eDHVgX84efoMlXia7IJF6oWWNGU+KSlL6i+UAtE6mucSSQDuy/p875FuqYZ2d+Bv5/mK1qb44qE7g2yWYzzW5XlCRLQRxJq8ZC44UsqRHexuFMVhfyILN4bCBLDnWlMFNhiFgVuaZ0JtsUQ=" + "created_at": "2022-10-14T10:43:45Z", + "enc": "CiQA4OM7eLig1/XSPsMWyx4XY/gig7gwO2T2emDKfoKSQcs+fEkSSQDuy/p8kFUJVK51783eXMVY4C6rIWB89LJY8IcBA3zVrzCiF2l0PIaOGe7TWKG97ArDOB7tdrJQjM5AQeLpuHDLikvDAVtdM7w=" } ], "azure_kv": null, "hc_vault": null, "age": null, - "lastmodified": "2022-10-14T09:49:48Z", - "mac": 
"ENC[AES256_GCM,data:xm5zfxfo1CDv3Pm05saNZZ3+Z158cMZCS8Sf/blm/xdvJn9gztSWY/UbgVJB48yD4Utvc5l61arwUAAzFd38CSXl8w+Mn2Iu1i8FdfFwW0rxJYzHtYm6stNZZ8SRgPfe+TGpNBrfGgoKLysm+kBqyspQ5OcoDkn1tigoET3lVLE=,iv:gg/OP8i9zjrc0yfe6Jne6/t3XSzWu5HMeVE8kvDzVo8=,tag:TfT23nbLaaJOjzUArbZzyg==,type:str]", + "lastmodified": "2022-10-14T10:43:46Z", + "mac": "ENC[AES256_GCM,data:6ruSzoTanQRXt71z/JFdPah3PDLZjb+lP88fEBkwr3+hHTxuHghKQDpG0xWN21P3cAfAvNK5cn5By8V6RLIMGSZ1IUey0OU0uRUm2lrikXZ/R5kJBG6iMXiAWsXw+ARY2MXFpyZBZ1KKeVDTGANIwbCdhUUb8M32PWo4EMrEN9g=,iv:pMHDea1SZc4Y2q145ydorlSL4wHbTaXH8Xc2XcKdfF4=,tag:jQ1US/1CwPpXgbX651x17g==,type:str]", "pgp": null, "unencrypted_suffix": "_unencrypted", "version": "3.7.3" From bad6fcc881c4328d555745654dd3ef800bf0c881 Mon Sep 17 00:00:00 2001 From: Sarah Gibson Date: Fri, 14 Oct 2022 11:51:45 +0100 Subject: [PATCH 07/29] Add support chart config --- config/clusters/nasa-cryo/cluster.yaml | 4 +++- .../nasa-cryo/enc-support.secret.values.yaml | 17 ++++++++++++++ config/clusters/nasa-cryo/support.values.yaml | 22 +++++++++++++++++++ 3 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 config/clusters/nasa-cryo/enc-support.secret.values.yaml create mode 100644 config/clusters/nasa-cryo/support.values.yaml diff --git a/config/clusters/nasa-cryo/cluster.yaml b/config/clusters/nasa-cryo/cluster.yaml index 0a0670c06b..6642194ea6 100644 --- a/config/clusters/nasa-cryo/cluster.yaml +++ b/config/clusters/nasa-cryo/cluster.yaml @@ -6,5 +6,7 @@ aws: clusterName: nasa-cryo region: us-west-2 support: - helm_chart_values_files: [] + helm_chart_values_files: + - support.values.yaml + - enc-support.secret.values.yaml hubs: [] diff --git a/config/clusters/nasa-cryo/enc-support.secret.values.yaml b/config/clusters/nasa-cryo/enc-support.secret.values.yaml new file mode 100644 index 0000000000..5f845e5720 --- /dev/null +++ b/config/clusters/nasa-cryo/enc-support.secret.values.yaml @@ -0,0 +1,17 @@ +prometheusIngressAuthSecret: + username: 
ENC[AES256_GCM,data:ebPGMvUPCEaDzGY144yJZkI/+0FJuvFxcs0+lxKt1o1s6wMzkL5di+HB3NY98VIOFtbHro0cOBjtFyes3UkgSA==,iv:j4SXSqWNJjXhPcnDep2tRPsdF+9vUG+d9QW0ZRqNxGM=,tag:lAG+hK1dLSwJHVy5tT1kjA==,type:str] + password: ENC[AES256_GCM,data:zHPuOyaT5+HC+3IKin54as4YI1nJrxCa7huIP85MEKWaHa11lyQLjAW4bQsQY1hBH/0kgW+kzVtkDlcIghtjGA==,iv:8nHsm5KipEKs7o4KV7mBQ31a6xCXMp8oBL7Ip1IWpy4=,tag:tuKlTIAiMCWnuME0EpJJKQ==,type:str] +sops: + kms: [] + gcp_kms: + - resource_id: projects/two-eye-two-see/locations/global/keyRings/sops-keys/cryptoKeys/similar-hubs + created_at: "2022-10-14T10:37:09Z" + enc: CiQA4OM7eI66DIfP/2zTsYsei1C4UzAv+/lOYjahuA9Bmh8CgXYSSQDuy/p8pHTxiQlqkmY/NUjPdfxlAT5uiYBjNBX3RdE5ikDvQnpY9BwyQy4bDs075GlkLAU4BwMax0iC0s20gPu8+01pphDS0DM= + azure_kv: [] + hc_vault: [] + age: [] + lastmodified: "2022-10-14T10:37:09Z" + mac: ENC[AES256_GCM,data:rIOhzKLT0i55m9Ro+1DZ56DFnM5/uyMoGgaKWsgFCCxlIXcbSjwY3opyKjI8yn2JTZjxh1/8aScxHiLfI3tVsa1hYQqnI7HVtmbqR8PYLxi4W7OTWmoc8xoaYGuSD2SWRM7s/JXcZqcBLA3JqqZUiAmfnHfqo6bcU5WclItI6I0=,iv:o+c+SeWnt9E+b9zoDnUr3lxfrDVD7aUNl7fyvK8QVm4=,tag:g01sgSXHjlZEIy3DSrKH5g==,type:str] + pgp: [] + unencrypted_suffix: _unencrypted + version: 3.7.3 diff --git a/config/clusters/nasa-cryo/support.values.yaml b/config/clusters/nasa-cryo/support.values.yaml new file mode 100644 index 0000000000..1d0c7c8021 --- /dev/null +++ b/config/clusters/nasa-cryo/support.values.yaml @@ -0,0 +1,22 @@ +prometheusIngressAuthSecret: + enabled: true + +grafana: + ingress: + hosts: + - grafana.cryointhecloud.2i2c.cloud + tls: + - secretName: grafana-tls + hosts: + - grafana.cryointhecloud.2i2c.cloud + +prometheus: + server: + ingress: + enabled: true + hosts: + - prometheus.cryointhecloud.2i2c.cloud + tls: + - secretName: prometheus-tls + hosts: + - prometheus.cryointhecloud.2i2c.cloud From 79af854cdfcbe414612204a52752a0e6ee1c40df Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Fri, 14 Oct 2022 12:42:34 -0700 Subject: [PATCH 08/29] Add a command to output correct eksctl iam command to run --- 
docs/howto/operate/new-cluster/aws.md | 12 +++++++++++- terraform/aws/cd.tf | 12 ++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/docs/howto/operate/new-cluster/aws.md b/docs/howto/operate/new-cluster/aws.md index 50ffe19893..a1c90d3c7e 100644 --- a/docs/howto/operate/new-cluster/aws.md +++ b/docs/howto/operate/new-cluster/aws.md @@ -165,9 +165,12 @@ have least amount of permissions possible. actually encrypted by `sops` before checking it in to the git repo. Otherwise this can be a serious security leak! -5. Grant the freshly created IAM user access to the kubernetes cluster. +5. Grant the freshly created IAM user access to the kubernetes cluster. As this requires + passing in some parameters that match the created cluster, we have a `terraform output` + that can give you the exact command to run. ```bash + $ terraform output -raw eksctl_iam_command eksctl create iamidentitymapping \ --cluster \ --region \ @@ -176,6 +179,9 @@ have least amount of permissions possible. --group system:masters ``` + Run the command output by `terraform output -raw eksctl_iam_command`, and that should + give the continuous deployer user access. + 6. In your hub deployment file (`config/clusters//cluster.yaml`), provide enough information for the deployer to find the correct credentials. @@ -192,6 +198,10 @@ have least amount of permissions possible. The `aws.key` file is defined _relative_ to the location of the `cluster.yaml` file. ``` +7. Test the access by running `python deployer use-cluster-credentials ` and + running `kubectl get node`. It should show you the provisioned node on the cluster if + everything works out ok. 
+ ## Scaling up a nodegroup in a cluster `eksctl` creates nodepools that are mostly immutable, except for autoscaling properties - diff --git a/terraform/aws/cd.tf b/terraform/aws/cd.tf index ed8011e1c7..7fe02db345 100644 --- a/terraform/aws/cd.tf +++ b/terraform/aws/cd.tf @@ -40,3 +40,15 @@ output "continuous_deployer_creds" { value = jsonencode(local.cd_creds) sensitive = true } + +output "eksctl_iam_command" { + description = "eksctl command to grant cluster access to our CD" + value = <<-EOT + eksctl create iamidentitymapping \ + --cluster ${var.cluster_name} \ + --region ${var.region} \ + --arn ${aws_iam_user.continuous_deployer.arn} \ + --username ${aws_iam_user.continuous_deployer.name} \ + --group system:masters + EOT +} From 140e01bdd10871dfbb17cf6b59570be23210dc68 Mon Sep 17 00:00:00 2001 From: Sarah Gibson <44771837+sgibson91@users.noreply.github.com> Date: Mon, 17 Oct 2022 11:22:50 +0100 Subject: [PATCH 09/29] Remove remnant of merge conflict resolution --- docs/howto/operate/new-cluster/aws.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/howto/operate/new-cluster/aws.md b/docs/howto/operate/new-cluster/aws.md index 291b31e386..2c39d39360 100644 --- a/docs/howto/operate/new-cluster/aws.md +++ b/docs/howto/operate/new-cluster/aws.md @@ -232,7 +232,6 @@ Get the address a hub on this cluster should use for connecting to NFS with `terraform output nfs_server_dns`, and set it in the hub's config under `nfs.pv.serverIP` (nested under `basehub` when necessary) in the appropriate `.values.yaml` file. 
->>>>>>> upstream/master ## Scaling up a nodegroup in a cluster From 94b835a60773a8bfa2d1f84e29e719f92926c4b0 Mon Sep 17 00:00:00 2001 From: Sarah Gibson Date: Mon, 17 Oct 2022 11:38:12 +0100 Subject: [PATCH 10/29] Update docs to create a new terraform workspace --- docs/howto/operate/new-cluster/aws.md | 7 ++++++- docs/topic/terraform.md | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/howto/operate/new-cluster/aws.md b/docs/howto/operate/new-cluster/aws.md index 2c39d39360..a926f1de8a 100644 --- a/docs/howto/operate/new-cluster/aws.md +++ b/docs/howto/operate/new-cluster/aws.md @@ -158,7 +158,12 @@ in GCP, so you also need to have `gcloud` set up and authenticated already. terraform init ``` -3. Deploy the terraform-managed infrastructure +3. Create a new [terraform workspace](topic:terraform:workspaces) + ```bash + terraform workspace new <cluster-name> + ``` + +4. Deploy the terraform-managed infrastructure ```bash terraform apply -var-file projects/.tfvars ``` diff --git a/docs/topic/terraform.md b/docs/topic/terraform.md index 889dca368b..342e2f48b0 100644 --- a/docs/topic/terraform.md +++ b/docs/topic/terraform.md @@ -50,6 +50,7 @@ terraform init -backend-config=backends/pangeo-backend.hcl If prior backend data exists in a `terraform.lock.hcl`, you might see an `Error: Backend configuration changed` when trying to initialize that backend. To reconfigure this backend, ignoring any saved configuration, add the `-reconfigure` flag to the init command. 
``` +(topic:terraform:workspaces)= ## Workspaces We use [terraform workspaces](https://www.terraform.io/docs/language/state/workspaces.html) From eb9680a01ce936ae86a4550f8b9c5fb8895eefc5 Mon Sep 17 00:00:00 2001 From: Sarah Gibson Date: Mon, 17 Oct 2022 11:44:31 +0100 Subject: [PATCH 11/29] Update cluster creds --- .../nasa-cryo/enc-deployer-credentials.secret.json | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/config/clusters/nasa-cryo/enc-deployer-credentials.secret.json b/config/clusters/nasa-cryo/enc-deployer-credentials.secret.json index a12c3adac2..a051b50626 100644 --- a/config/clusters/nasa-cryo/enc-deployer-credentials.secret.json +++ b/config/clusters/nasa-cryo/enc-deployer-credentials.secret.json @@ -1,23 +1,23 @@ { "AccessKey": { - "AccessKeyId": "ENC[AES256_GCM,data:ioqTaGNPPAsFQ+z53JU9Ez/Rfr0=,iv:9XkzxSy5SsjBad1tbYrUFUBoyjR0J7wn46CurIklma0=,tag:tzVvIgUktuJEd+7EGGFxFw==,type:str]", - "SecretAccessKey": "ENC[AES256_GCM,data:SgRFSnIv5XZXNuR7nmpuvvlYFN9YHy1J7qvn5beTgIq3tN796Y5G4Q==,iv:Aln0SJ9ZJiKJkmr4euv0kSHdozJf+2kfB+mwJwmjGOY=,tag:0ZlJMNVkPlKEoTeWJ5PNXg==,type:str]", - "UserName": "ENC[AES256_GCM,data:W8nJRay/DwepDtMsAnmzqrcrjKmooDo=,iv:Myt8UMplSKawWlpTBDpGl9t+Sza3YxXpYhsSwn5ZY7I=,tag:tKCwd9KSuzqKXhtBmp901A==,type:str]" + "AccessKeyId": "ENC[AES256_GCM,data:ypqptk+AINfBDTr9yT8XI1i12+A=,iv:ZSzJEMsXfqMDwz1PUQWFD4UmDjxixs0ARfVa//iCOOw=,tag:WkM0TZhtdd4YTOdBGbimGg==,type:str]", + "SecretAccessKey": "ENC[AES256_GCM,data:gh7waVFBVdh5/7cCyq53ZWZm00HQA0W3lxKVmt01/mE9uRHlASLZfw==,iv:6azuIVBjSnEx/aXwi4Xt2LxdFXbKKOGnpAfeLdoEBtM=,tag:+MMmBcnkYXh9udYZBwA1eQ==,type:str]", + "UserName": "ENC[AES256_GCM,data:yecSFFtr/KTIFK5Z6LH4ElihJ7oE7d4=,iv:9A1RIWrNfKoOcDSJmdyvmdfGWTv09p2onTUjTzQpxa4=,tag:FMAmHMsZZ2VZ+VpI6RNqHw==,type:str]" }, "sops": { "kms": null, "gcp_kms": [ { "resource_id": "projects/two-eye-two-see/locations/global/keyRings/sops-keys/cryptoKeys/similar-hubs", - "created_at": "2022-10-14T10:43:45Z", - "enc": 
"CiQA4OM7eLig1/XSPsMWyx4XY/gig7gwO2T2emDKfoKSQcs+fEkSSQDuy/p8kFUJVK51783eXMVY4C6rIWB89LJY8IcBA3zVrzCiF2l0PIaOGe7TWKG97ArDOB7tdrJQjM5AQeLpuHDLikvDAVtdM7w=" + "created_at": "2022-10-17T10:43:57Z", + "enc": "CiQA4OM7eHvbSMs2Ln1giHqphcJf8uMlHXoFfuThnBwPiA6bYQUSSQDuy/p8VPlHP9sg3d5csL432NPi4NNGwe5VUeeHR9RdIcauiP5KhHiFOI7rVrGcz1SIlW5XTYmQ5Ochj4mbXgGhiZ2qKzn/PCE=" } ], "azure_kv": null, "hc_vault": null, "age": null, - "lastmodified": "2022-10-14T10:43:46Z", - "mac": "ENC[AES256_GCM,data:6ruSzoTanQRXt71z/JFdPah3PDLZjb+lP88fEBkwr3+hHTxuHghKQDpG0xWN21P3cAfAvNK5cn5By8V6RLIMGSZ1IUey0OU0uRUm2lrikXZ/R5kJBG6iMXiAWsXw+ARY2MXFpyZBZ1KKeVDTGANIwbCdhUUb8M32PWo4EMrEN9g=,iv:pMHDea1SZc4Y2q145ydorlSL4wHbTaXH8Xc2XcKdfF4=,tag:jQ1US/1CwPpXgbX651x17g==,type:str]", + "lastmodified": "2022-10-17T10:43:58Z", + "mac": "ENC[AES256_GCM,data:Fr6V4xZ+3/EAKr9JBFX96ag1eMR8qm7p1HZ8yS+mZfCgWjlciHosSVjgd+gQG84xnilFh+dSpTjy4W8Vev5vfHAAdQZfl7FVg8GjdCfqT3LJabJDQMksdFNHKfux3bBHN4Uw0EB/n3VYJuNg7zocl21PjtLKMSzZtuaR3cruh9M=,iv:pFwAedyytIU1OaQfwfs4ipnAEmGhVXABHxQhSfR8xc8=,tag:vslkISuohP6V4Qlnynwk7Q==,type:str]", "pgp": null, "unencrypted_suffix": "_unencrypted", "version": "3.7.3" From 1c5734e7602631097cfa34f0e636a3742dfe44c3 Mon Sep 17 00:00:00 2001 From: Sarah Gibson Date: Mon, 17 Oct 2022 11:58:55 +0100 Subject: [PATCH 12/29] Move eksctl access section to after terraform section, Reference new tf output command --- docs/howto/operate/new-cluster/aws.md | 67 +++++++++++++++------------ 1 file changed, 37 insertions(+), 30 deletions(-) diff --git a/docs/howto/operate/new-cluster/aws.md b/docs/howto/operate/new-cluster/aws.md index a926f1de8a..c73f994111 100644 --- a/docs/howto/operate/new-cluster/aws.md +++ b/docs/howto/operate/new-cluster/aws.md @@ -108,36 +108,7 @@ aws eks update-kubeconfig --name= --region= \ - --region \ - --arn arn:aws:iam:::user/ \ - --username \ - --group system:masters -``` - -This gives all the users full access to the entire kubernetes cluster. 
They can -fetch local config with `aws eks update-kubeconfig --name= --region=` -after this step is done. - -This should eventually be converted to use an [IAM Role](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles.html) -instead, so we need not give each individual user access, but just grant access to the -role - and users can modify them as they wish. - +(new-cluster:aws:terraform)= ## Deploy Terraform-managed infrastructure Our AWS *terraform* code is now used to deploy supporting infrastructure for the EKS cluster, including: @@ -169,6 +140,7 @@ in GCP, so you also need to have `gcloud` set up and authenticated already. ``` Observe the plan carefully, and accept it. +(new-cluster:aws:terraform:cicd)= ### Export account credentials with finely scoped permissions for automatic deployment In the previous step, we will have created an AWS IAM user with just @@ -227,6 +199,41 @@ have least amount of permissions possible. running `kubectl get node`. It should show you the provisioned node on the cluster if everything works out ok. +## Grant `eksctl` access to other users + +```{note} +This section is still required even if the account is managed by SSO, +although a user could also run `python deployer use-cluster-credentials` to gain access. +``` + +AWS EKS has a strange access control problem, where the IAM user who creates +the cluster has [full access without any visible settings +changes](https://docs.aws.amazon.com/eks/latest/userguide/add-user-role.html), +and nobody else does. You need to explicitly grant access to other users. Find +the usernames of the 2i2c engineers on this particular AWS account, and run the +following command to give them access: + +```{note} +You can adapt the command printed by `terraform output -raw eksctl_iam_command`, as described in [](new-cluster:aws:terraform:cicd), by substituting each user's ARN and username. 
+``` + +```bash +eksctl create iamidentitymapping \ + --cluster \ + --region \ + --arn arn:aws:iam:::user/ \ + --username \ + --group system:masters +``` + +This gives all the users full access to the entire kubernetes cluster. They can +fetch local config with `aws eks update-kubeconfig --name= --region=` +after this step is done. + +This should eventually be converted to use an [IAM Role](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles.html) +instead, so we need not give each individual user access, but just grant access to the +role - and users can modify them as they wish. + ## Export the EFS IP address for home directories The terraform run in the previous step will have also created an From 4da183b26b50fe369dd048c4782c5223e0cf938a Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Mon, 17 Oct 2022 22:22:10 -0700 Subject: [PATCH 13/29] Fix cases where AWS_ variables are set in terminal This produces weird error messages otherwise --- deployer/cluster.py | 168 +++++++++++++++++++++----------------------- deployer/utils.py | 21 ++++++ 2 files changed, 100 insertions(+), 89 deletions(-) diff --git a/deployer/cluster.py b/deployer/cluster.py index 439bf39d70..4d7e1c2461 100644 --- a/deployer/cluster.py +++ b/deployer/cluster.py @@ -7,7 +7,7 @@ from file_acquisition import get_decrypted_file, get_decrypted_files from hub import Hub -from utils import print_colour +from utils import print_colour, unset_env_vars class Cluster: @@ -138,8 +138,9 @@ def auth_kubeconfig(self): config = self.spec["kubeconfig"] config_path = self.config_path.joinpath(config["file"]) - with get_decrypted_file(config_path) as decrypted_key_path: - # FIXME: Unset this after our yield + with get_decrypted_file(config_path) as decrypted_key_path, unset_env_vars( + ["KUBECONFIG"] + ): os.environ["KUBECONFIG"] = decrypted_key_path yield @@ -157,44 +158,37 @@ def auth_aws(self): cluster_name = config["clusterName"] region = config["region"] - with tempfile.NamedTemporaryFile() as kubeconfig: 
- orig_kubeconfig = os.environ.get("KUBECONFIG", None) - orig_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID", None) - orig_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY", None) - try: - with get_decrypted_file(key_path) as decrypted_key_path: - - decrypted_key_abspath = os.path.abspath(decrypted_key_path) - if not os.path.isfile(decrypted_key_abspath): - raise FileNotFoundError("The decrypted key file does not exist") - with open(decrypted_key_abspath) as f: - creds = json.load(f) - - os.environ["AWS_ACCESS_KEY_ID"] = creds["AccessKey"]["AccessKeyId"] - os.environ["AWS_SECRET_ACCESS_KEY"] = creds["AccessKey"][ - "SecretAccessKey" - ] - - os.environ["KUBECONFIG"] = kubeconfig.name - - subprocess.check_call( - [ - "aws", - "eks", - "update-kubeconfig", - f"--name={cluster_name}", - f"--region={region}", - ] - ) + # Unset all env vars that start with AWS_, as that might affect the aws + # commandline we call. This could make some weird error messages. + unset_envs = ["KUBECONFIG"] + [k for k in os.environ if k.startswith("AWS_")] + + with tempfile.NamedTemporaryFile() as kubeconfig, unset_env_vars(unset_envs): + with get_decrypted_file(key_path) as decrypted_key_path: + + decrypted_key_abspath = os.path.abspath(decrypted_key_path) + if not os.path.isfile(decrypted_key_abspath): + raise FileNotFoundError("The decrypted key file does not exist") + with open(decrypted_key_abspath) as f: + creds = json.load(f) + + os.environ["AWS_ACCESS_KEY_ID"] = creds["AccessKey"]["AccessKeyId"] + os.environ["AWS_SECRET_ACCESS_KEY"] = creds["AccessKey"][ + "SecretAccessKey" + ] + + os.environ["KUBECONFIG"] = kubeconfig.name + + subprocess.check_call( + [ + "aws", + "eks", + "update-kubeconfig", + f"--name={cluster_name}", + f"--region={region}", + ] + ) - yield - finally: - if orig_kubeconfig is not None: - os.environ["KUBECONFIG"] = orig_kubeconfig - if orig_access_key_id is not None: - os.environ["AWS_ACCESS_KEY_ID"] = orig_access_key_id - if orig_secret_access_key is 
not None: - os.environ["AWS_SECRET_ACCESS_KEY"] = orig_secret_access_key + yield def auth_azure(self): """ @@ -207,58 +201,54 @@ def auth_azure(self): cluster = config["cluster"] resource_group = config["resource_group"] - with tempfile.NamedTemporaryFile() as kubeconfig: - orig_kubeconfig = os.environ.get("KUBECONFIG", None) - - try: - os.environ["KUBECONFIG"] = kubeconfig.name - - with get_decrypted_file(key_path) as decrypted_key_path: - - decrypted_key_abspath = os.path.abspath(decrypted_key_path) - if not os.path.isfile(decrypted_key_abspath): - raise FileNotFoundError("The decrypted key file does not exist") - - with open(decrypted_key_path) as f: - service_principal = json.load(f) - - # Login to Azure - subprocess.check_call( - [ - "az", - "login", - "--service-principal", - f"--username={service_principal['service_principal_id']}", - f"--password={service_principal['service_principal_password']}", - f"--tenant={service_principal['tenant_id']}", - ] - ) + with tempfile.NamedTemporaryFile() as kubeconfig, unset_env_vars( + ["KUBECONFIG"] + ): + os.environ["KUBECONFIG"] = kubeconfig.name + + with get_decrypted_file(key_path) as decrypted_key_path: + + decrypted_key_abspath = os.path.abspath(decrypted_key_path) + if not os.path.isfile(decrypted_key_abspath): + raise FileNotFoundError("The decrypted key file does not exist") + + with open(decrypted_key_path) as f: + service_principal = json.load(f) + + # Login to Azure + subprocess.check_call( + [ + "az", + "login", + "--service-principal", + f"--username={service_principal['service_principal_id']}", + f"--password={service_principal['service_principal_password']}", + f"--tenant={service_principal['tenant_id']}", + ] + ) - # Set the Azure subscription - subprocess.check_call( - [ - "az", - "account", - "set", - f"--subscription={service_principal['subscription_id']}", - ] - ) + # Set the Azure subscription + subprocess.check_call( + [ + "az", + "account", + "set", + 
f"--subscription={service_principal['subscription_id']}", + ] + ) - # Get cluster creds - subprocess.check_call( - [ - "az", - "aks", - "get-credentials", - f"--name={cluster}", - f"--resource-group={resource_group}", - ] - ) + # Get cluster creds + subprocess.check_call( + [ + "az", + "aks", + "get-credentials", + f"--name={cluster}", + f"--resource-group={resource_group}", + ] + ) - yield - finally: - if orig_kubeconfig is not None: - os.environ["KUBECONFIG"] = orig_kubeconfig + yield def auth_gcp(self): config = self.spec["gcp"] diff --git a/deployer/utils.py b/deployer/utils.py index 0dc53b583f..895109f014 100644 --- a/deployer/utils.py +++ b/deployer/utils.py @@ -1,5 +1,6 @@ import os import subprocess +from contextlib import contextmanager from markdownTable import markdownTable @@ -141,3 +142,23 @@ def create_markdown_comment(support_staging_matrix, prod_matrix): # Save comment body to a file to be uploaded as an atrifact by GitHub Actions with open("comment-body.txt", "w") as f: f.write(comment_body) + + +@contextmanager +def unset_env_vars(vars): + """ + Temporarily unset env vars in vars if they exist + """ + orig_values = {} + for e in vars: + if e in os.environ: + orig_values[e] = os.environ[e] + # Clear values from os.environ if they are present! 
+ del os.environ[e] + + try: + yield + finally: + for e in orig_values: + # Put values back into os.environ when contextmanager returns + os.environ[e] = orig_values[e] From 04c5ef2e9aee2e24488612a3e7fc9c57921ba34e Mon Sep 17 00:00:00 2001 From: Sarah Gibson Date: Tue, 18 Oct 2022 11:34:08 +0100 Subject: [PATCH 14/29] Setup grafana dashboards for nasa-cryo cluster --- .github/workflows/deploy-grafana-dashboards.yaml | 1 + .../nasa-cryo/enc-grafana-token.secret.yaml | 15 +++++++++++++++ 2 files changed, 16 insertions(+) create mode 100644 config/clusters/nasa-cryo/enc-grafana-token.secret.yaml diff --git a/.github/workflows/deploy-grafana-dashboards.yaml b/.github/workflows/deploy-grafana-dashboards.yaml index 106fe2fb65..4247d174e0 100644 --- a/.github/workflows/deploy-grafana-dashboards.yaml +++ b/.github/workflows/deploy-grafana-dashboards.yaml @@ -24,6 +24,7 @@ jobs: - cluster_name: awi-ciroh - cluster_name: callysto - cluster_name: 2i2c-uk + - cluster_name: nasa-cryo steps: - name: Checkout repo uses: actions/checkout@v3 diff --git a/config/clusters/nasa-cryo/enc-grafana-token.secret.yaml b/config/clusters/nasa-cryo/enc-grafana-token.secret.yaml new file mode 100644 index 0000000000..dd5a7bfb51 --- /dev/null +++ b/config/clusters/nasa-cryo/enc-grafana-token.secret.yaml @@ -0,0 +1,15 @@ +grafana_token: ENC[AES256_GCM,data:ni6X5Vo/tkhe7yHD3lMz+7NY+rf289VToX5WBwHfH6yEbgzZBM7TRndXYo9wAvlmMVqmsrMsc3Ay7cuYnTUed5zND+lataA/iWjeKoefLxpNiv3uUl6vzYzMiymRUwLElmQe3te8iTWgAwgAY29b/w==,iv:FmihMyr/ojx1U0H4iXq+GYk7QN55zH2TuQRDyCGBWOk=,tag:kiuK3u2ajLRrSzwnDJIVug==,type:str] +sops: + kms: [] + gcp_kms: + - resource_id: projects/two-eye-two-see/locations/global/keyRings/sops-keys/cryptoKeys/similar-hubs + created_at: "2022-10-18T10:32:19Z" + enc: CiQA4OM7eHhF9zEUU0TU6TneV3t9Lo/iGr79TcOPVLMNzA3YuGUSSQDuy/p8Cg17FCCUEdV+ER7ttX8eNgM09WgCIb6jj4vJgAhvI5OZdPL1t2GOQMJOOeFkYt/dH/ClrttGqJISI1yMhaiZEniCfWU= + azure_kv: [] + hc_vault: [] + age: [] + lastmodified: "2022-10-18T10:32:20Z" + mac: 
ENC[AES256_GCM,data:O6ro11Orl2Y9SCRj+9LWtBN8d6vMFiubXeCdnxxyCEzcqZnzCquVvs68WxSiP+xZCvGlQtJZmO05IFMamVKHBrDfk8P172T5RyjFvZZ8J3VvvTgkGiRLaw2oZXhZrsnqcIxdNSGAUaapuApcDH69pEyhixRLXFFIxRw2yMyXfdk=,iv:JrIPlPvh4sJy2zLehDgneJfnTFi+crsrgC6QtVpwdN8=,tag:Sn1yYKcaGCfzdIRPLoqrRA==,type:str] + pgp: [] + unencrypted_suffix: _unencrypted + version: 3.7.3 From 1d7e8bbcf4a149c5f1c04b7f31847fd8cfd25cb8 Mon Sep 17 00:00:00 2001 From: Sarah Gibson Date: Tue, 18 Oct 2022 16:36:00 +0100 Subject: [PATCH 15/29] Sketch out common.vlaues.yaml for nasa-cryo --- config/clusters/nasa-cryo/common.values.yaml | 119 +++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 config/clusters/nasa-cryo/common.values.yaml diff --git a/config/clusters/nasa-cryo/common.values.yaml b/config/clusters/nasa-cryo/common.values.yaml new file mode 100644 index 0000000000..d099a28c51 --- /dev/null +++ b/config/clusters/nasa-cryo/common.values.yaml @@ -0,0 +1,119 @@ +basehub: + nfs: + pv: + # from https://docs.aws.amazon.com/efs/latest/ug/mounting-fs-nfs-mount-settings.html + mountOptions: + - rsize=1048576 + - wsize=1048576 + - timeo=600 + - soft # We pick soft over hard, so NFS lockups don't lead to hung processes + - retrans=2 + - noresvport + serverIP: fs-0872256335d483d5f.efs.us-west-2.amazonaws.com + baseShareName: / + jupyterhub: + custom: + 2i2c: + add_staff_user_ids_to_admin_users: true + add_staff_user_ids_of_type: "github" + homepage: + templateVars: + org: + name: Cryo in the Cloud + logo_url: https://github.com/CryoInTheCloud/CryoCloudWebsite/blob/main/cryocloud.png + url: https://github.com/CryoInTheCloud + designed_by: + name: 2i2c + url: https://2i2c.org + operated_by: + name: 2i2c + url: https://2i2c.org + funded_by: + name: "" + url: "" + hub: + config: + Authenticator: + # This hub uses GitHub Teams auth and so we don't set + # allowed_users in order to not deny access to valid members of + # the listed teams. These people should have admin access though. 
+ admin_users: + - tsnow03 + - JessicaS11 + - dfelikson + JupyterHub: + authenticator_class: github + GitHubOAuthenticator: + allowed_organizations: + - 2i2c-org:tech-team + - CryoInTheCloud:CryoCloudUser + - CryoInTheCloud:CryoCloudAdvanced + singleuser: + serviceAccountName: cloud-user-sa + defaultUrl: /lab + # User image repo: https://github.com/CryoInTheCloud/CryoCloudWebsite/tree/main/conda + image: + # This image is available on both Docker Hub and quay.io. We use quay.io + # here due to its more generous pull rate limits. + name: quay.io/cryointhecloud/cryocloudwebsite + tag: "2022.10.12" + storage: + extraVolumeMounts: + - name: home + mountPath: /home/jovyan/shared + subPath: _shared + readOnly: false + profileList: + # The mem-guarantees are here so k8s doesn't schedule other pods + # on these nodes. + - display_name: "Small: m5.large" + description: "~2 CPU, ~8G RAM" + default: true + allowed_teams: + - 2i2c-org:tech-team + - CryoInTheCloud:CryoCloudUser + - CryoInTheCloud:CryoCloudAdvanced + kubespawner_override: + # Explicitly unset mem_limit, so it overrides the default memory limit we set in + # basehub/values.yaml + mem_limit: null + mem_guarantee: 6.5G + node_selector: + node.kubernetes.io/instance-type: m5.large + - display_name: "Medium: m5.xlarge" + description: "~4 CPU, ~15G RAM" + allowed_teams: + - 2i2c-org:tech-team + - CryoInTheCloud:CryoCloudUser + - CryoInTheCloud:CryoCloudAdvanced + kubespawner_override: + mem_limit: null + mem_guarantee: 12G + node_selector: + node.kubernetes.io/instance-type: m5.xlarge + - display_name: "Large: m5.2xlarge" + description: "~8 CPU, ~30G RAM" + allowed_teams: + - 2i2c-org:tech-team + - CryoInTheCloud:CryoCloudAdvanced + kubespawner_override: + mem_limit: null + mem_guarantee: 26G + node_selector: + node.kubernetes.io/instance-type: m5.2xlarge + - display_name: "Huge: m5.8xlarge" + description: "~32 CPU, ~128G RAM" + allowed_teams: + - 2i2c-org:tech-team + - CryoInTheCloud:CryoCloudAdvanced + 
kubespawner_override: + mem_limit: null + mem_guarantee: 115G + node_selector: + node.kubernetes.io/instance-type: m5.8xlarge + scheduling: + userPlaceholder: + enabled: false + replicas: 0 + userScheduler: + enabled: false From df66638c7d8838f544cbff928ce1840baeaf6f1b Mon Sep 17 00:00:00 2001 From: Sarah Gibson Date: Tue, 18 Oct 2022 16:36:15 +0100 Subject: [PATCH 16/29] Begin creating hub definitions in cluster.yaml for nasa-cryo --- config/clusters/nasa-cryo/cluster.yaml | 28 +++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/config/clusters/nasa-cryo/cluster.yaml b/config/clusters/nasa-cryo/cluster.yaml index 6642194ea6..3a688cf520 100644 --- a/config/clusters/nasa-cryo/cluster.yaml +++ b/config/clusters/nasa-cryo/cluster.yaml @@ -9,4 +9,30 @@ support: helm_chart_values_files: - support.values.yaml - enc-support.secret.values.yaml -hubs: [] +hubs: + - name: staging + display_name: "NASA Cryo in the Cloud (staging)" + domain: staging.cryointhecloud.2i2c.cloud + helm_chart: daskhub + auth0: + # connection update? Also ensure the basehub Helm chart is provided a + # matching value for jupyterhub.custom.2i2c.add_staff_user_ids_of_type! + enabled: false + helm_chart_values_files: + # The order in which you list files here is the order they will be passed + # to the helm upgrade command in, and that has meaning. Please check + # that you intend for these files to be applied in this order. + - common.values.yaml + - name: prod + display_name: "NASA Cryo in the Cloud (prod)" + domain: cryointhecloud.2i2c.cloud + helm_chart: daskhub + auth0: + # connection update? Also ensure the basehub Helm chart is provided a + # matching value for jupyterhub.custom.2i2c.add_staff_user_ids_of_type! + enabled: false + helm_chart_values_files: + # The order in which you list files here is the order they will be passed + # to the helm upgrade command in, and that has meaning. Please check + # that you intend for these files to be applied in this order.
+ - common.values.yaml From c488aa50fab9cf7d50062afe281ea1015f465f65 Mon Sep 17 00:00:00 2001 From: Sarah Gibson Date: Tue, 18 Oct 2022 16:43:29 +0100 Subject: [PATCH 17/29] Update funded_by section of common values --- config/clusters/nasa-cryo/common.values.yaml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/config/clusters/nasa-cryo/common.values.yaml b/config/clusters/nasa-cryo/common.values.yaml index d099a28c51..149ea29ab1 100644 --- a/config/clusters/nasa-cryo/common.values.yaml +++ b/config/clusters/nasa-cryo/common.values.yaml @@ -28,9 +28,12 @@ basehub: operated_by: name: 2i2c url: https://2i2c.org + # Ideally, this community would like to list more than one funder + # Issue tracking implementation of this feature: + # https://github.com/2i2c-org/default-hub-homepage/issues/16 funded_by: - name: "" - url: "" + name: "NASA ICESat-2 Science Team" + url: https://icesat-2.gsfc.nasa.gov/science_definition_team hub: config: Authenticator: From 3bbfa5642e1236a6d449493b52e7efcc7fca0945 Mon Sep 17 00:00:00 2001 From: Sarah Gibson <44771837+sgibson91@users.noreply.github.com> Date: Tue, 18 Oct 2022 20:34:25 +0100 Subject: [PATCH 18/29] Make the shared directory read only Co-authored-by: Yuvi Panda --- config/clusters/nasa-cryo/common.values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/clusters/nasa-cryo/common.values.yaml b/config/clusters/nasa-cryo/common.values.yaml index 149ea29ab1..33375f9b7a 100644 --- a/config/clusters/nasa-cryo/common.values.yaml +++ b/config/clusters/nasa-cryo/common.values.yaml @@ -65,7 +65,7 @@ basehub: - name: home mountPath: /home/jovyan/shared subPath: _shared - readOnly: false + readOnly: true profileList: # The mem-guarantees are here so k8s doesn't schedule other pods # on these nodes. 
From 6f27234c688b4ee10e84140467d78127fd57bd5e Mon Sep 17 00:00:00 2001 From: Sarah Gibson Date: Wed, 19 Oct 2022 10:59:43 +0100 Subject: [PATCH 19/29] Add Authenticator config to allow restricting profiles based on GitHub Team membership --- config/clusters/nasa-cryo/common.values.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/config/clusters/nasa-cryo/common.values.yaml b/config/clusters/nasa-cryo/common.values.yaml index 33375f9b7a..f6090ae8d3 100644 --- a/config/clusters/nasa-cryo/common.values.yaml +++ b/config/clusters/nasa-cryo/common.values.yaml @@ -37,6 +37,9 @@ basehub: hub: config: Authenticator: + # We are restricting profiles based on GitHub Team membership and + # so need to persist auth state + enable_auth_state: true # This hub uses GitHub Teams auth and so we don't set # allowed_users in order to not deny access to valid members of # the listed teams. These people should have admin access though. @@ -47,10 +50,15 @@ basehub: JupyterHub: authenticator_class: github GitHubOAuthenticator: + # We are restricting profiles based on GitHub Team membership and + # so need to populate the teams in the auth state + populate_teams_in_auth_state: true allowed_organizations: - 2i2c-org:tech-team - CryoInTheCloud:CryoCloudUser - CryoInTheCloud:CryoCloudAdvanced + scope: + - read:org singleuser: serviceAccountName: cloud-user-sa defaultUrl: /lab From 7751d60e5df99f805e91a24458f6faedd391a8ec Mon Sep 17 00:00:00 2001 From: Sarah Gibson Date: Wed, 19 Oct 2022 11:44:00 +0100 Subject: [PATCH 20/29] Correct the URL for the logo --- config/clusters/nasa-cryo/common.values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/clusters/nasa-cryo/common.values.yaml b/config/clusters/nasa-cryo/common.values.yaml index f6090ae8d3..ad3b8dcf58 100644 --- a/config/clusters/nasa-cryo/common.values.yaml +++ b/config/clusters/nasa-cryo/common.values.yaml @@ -20,7 +20,7 @@ basehub: templateVars: org: name: Cryo in the Cloud - logo_url: 
https://github.com/CryoInTheCloud/CryoCloudWebsite/blob/main/cryocloud.png + logo_url: https://raw.githubusercontent.com/CryoInTheCloud/CryoCloudWebsite/main/cryocloud.png url: https://github.com/CryoInTheCloud designed_by: name: 2i2c From 793c6e70e15fc1bd4989a1555c58faac0dcb5fb6 Mon Sep 17 00:00:00 2001 From: Sarah Gibson Date: Wed, 19 Oct 2022 11:44:28 +0100 Subject: [PATCH 21/29] Add config for staging hub --- config/clusters/nasa-cryo/cluster.yaml | 2 ++ .../nasa-cryo/enc-staging.secret.values.yaml | 21 +++++++++++++++++++ config/clusters/nasa-cryo/staging.values.yaml | 9 ++++++++ 3 files changed, 32 insertions(+) create mode 100644 config/clusters/nasa-cryo/enc-staging.secret.values.yaml create mode 100644 config/clusters/nasa-cryo/staging.values.yaml diff --git a/config/clusters/nasa-cryo/cluster.yaml b/config/clusters/nasa-cryo/cluster.yaml index 3a688cf520..56f187033e 100644 --- a/config/clusters/nasa-cryo/cluster.yaml +++ b/config/clusters/nasa-cryo/cluster.yaml @@ -23,6 +23,8 @@ hubs: # to the helm upgrade command in, and that has meaning. Please check # that you intend for these files to be applied in this order. 
- common.values.yaml + - staging.values.yaml + - enc-staging.secret.values.yaml - name: prod display_name: "NASA Cryo in the Cloud (prod)" domain: cryointhecloud.2i2c.cloud diff --git a/config/clusters/nasa-cryo/enc-staging.secret.values.yaml b/config/clusters/nasa-cryo/enc-staging.secret.values.yaml new file mode 100644 index 0000000000..ba38a71f9f --- /dev/null +++ b/config/clusters/nasa-cryo/enc-staging.secret.values.yaml @@ -0,0 +1,21 @@ +basehub: + jupyterhub: + hub: + config: + GitHubOAuthenticator: + client_id: ENC[AES256_GCM,data:sJhtxTkW9/1/ytqDQjBfrC/U5BY=,iv:W7tJJWrTL69fzjr7BZnvvMAc487rpcgquytIzh8qCxU=,tag:oyb6+YeRPRmh/N9J7lJCog==,type:str] + client_secret: ENC[AES256_GCM,data:Mpdx/rFOEZ5J5aHb1tQe1PMB/V0J2NSEdjos+GTek9xeM3iUcIsbqQ==,iv:FLgvbSmzs8n1BXtPKeUp5dqj7k5R4DDr3FtTjFlxoLw=,tag:WzfGnrXWPrWbrjZ4E3Zolg==,type:str] +sops: + kms: [] + gcp_kms: + - resource_id: projects/two-eye-two-see/locations/global/keyRings/sops-keys/cryptoKeys/similar-hubs + created_at: "2022-10-19T10:40:58Z" + enc: CiQA4OM7eJLTngjXLJHRpB6CqAPHIrdQnOXV+uIL/hvLdhrX3VsSSQDuy/p8Hc7fqevvelfq7RtQEofei3NfeOG0fs0cTWalKp4A9C6/X2gdUhtO0fbxQT7g3489b/vo3un1BKduXwESQjsZMaxGV18= + azure_kv: [] + hc_vault: [] + age: [] + lastmodified: "2022-10-19T10:40:58Z" + mac: ENC[AES256_GCM,data:2SZIGenwl1+TxEwZ8afNW/RbfBJd50BrkAVUMFqstp2wdo3xlLY0oUKh88cEJSSeUE3fnB7fiqZru58Q9/yAuavXeMUkF2QTT+t6GEIiHMcoUzXvzrC3xGPt7/1WDRZSsf1uFw0tY3f2yHQFIaQnxY3qpWadSeknPqqvlfDrthg=,iv:IGbuX99q+4AgddeSwhoqXB8NgbZq1ZPxKDfLL+aTOnE=,tag:cKYljtQuEU2WS0v48wzR6w==,type:str] + pgp: [] + unencrypted_suffix: _unencrypted + version: 3.7.3 diff --git a/config/clusters/nasa-cryo/staging.values.yaml b/config/clusters/nasa-cryo/staging.values.yaml new file mode 100644 index 0000000000..2845ff9964 --- /dev/null +++ b/config/clusters/nasa-cryo/staging.values.yaml @@ -0,0 +1,9 @@ +basehub: + userServiceAccount: + annotations: + eks.amazonaws.com/role-arn: arn:aws:iam::574251165169:role/nasa-cryo-staging + jupyterhub: + hub: + config: + 
GitHubOAuthenticator: + oauth_callback_url: https://staging.cryointhecloud.2i2c.cloud/hub/oauth_callback From 3bbd847231d59ed80b6393ec0e0a3fdd085f6968 Mon Sep 17 00:00:00 2001 From: Sarah Gibson Date: Wed, 19 Oct 2022 13:28:36 +0100 Subject: [PATCH 22/29] Add config for prod hub --- config/clusters/nasa-cryo/cluster.yaml | 2 ++ .../nasa-cryo/enc-prod.secret.values.yaml | 21 +++++++++++++++++++ config/clusters/nasa-cryo/prod.values.yaml | 9 ++++++++ 3 files changed, 32 insertions(+) create mode 100644 config/clusters/nasa-cryo/enc-prod.secret.values.yaml create mode 100644 config/clusters/nasa-cryo/prod.values.yaml diff --git a/config/clusters/nasa-cryo/cluster.yaml b/config/clusters/nasa-cryo/cluster.yaml index 56f187033e..904963665b 100644 --- a/config/clusters/nasa-cryo/cluster.yaml +++ b/config/clusters/nasa-cryo/cluster.yaml @@ -38,3 +38,5 @@ hubs: # to the helm upgrade command in, and that has meaning. Please check # that you intend for these files to be applied in this order. - common.values.yaml + - prod.values.yaml + - enc-prod.secret.values.yaml diff --git a/config/clusters/nasa-cryo/enc-prod.secret.values.yaml b/config/clusters/nasa-cryo/enc-prod.secret.values.yaml new file mode 100644 index 0000000000..6e38980a68 --- /dev/null +++ b/config/clusters/nasa-cryo/enc-prod.secret.values.yaml @@ -0,0 +1,21 @@ +basehub: + jupyterhub: + hub: + config: + GitHubOAuthenticator: + client_id: ENC[AES256_GCM,data:ilEVTmjN83Y3YnOlBKjA9EOfUuM=,iv:C8143cwC6pB0sFueQq7T1XbbmABAHu6kiLnxbZy9hVc=,tag:Jc34Po9xZZShfq5l/2CEbQ==,type:str] + client_secret: ENC[AES256_GCM,data:cSq7uA39SxQ9wIXAWp2FkEkGuzPXHCixp6iwRnkya9YngnUhufpIYg==,iv:dSXXOKhEt/ipOD1bWhcghks+Mjpn2oUexzJhjwEt6ek=,tag:8woToirTzLvX1WqHjbLpGw==,type:str] +sops: + kms: [] + gcp_kms: + - resource_id: projects/two-eye-two-see/locations/global/keyRings/sops-keys/cryptoKeys/similar-hubs + created_at: "2022-10-19T12:21:15Z" + enc: 
CiQA4OM7eHhdeYFpxkSDTcwgJSCnGcXyuW+RXURXZyqVtBExJLgSSQDuy/p8cJdVxnALmGegCdGggKwgIqy27Dtr97EyJnjFxmQMGDQsWh3vL/2xXyL4Gw8E1cYoxh7r+ecwI4YmBOO+q6rBIM2fS4I= + azure_kv: [] + hc_vault: [] + age: [] + lastmodified: "2022-10-19T12:21:15Z" + mac: ENC[AES256_GCM,data:DyxGjuM/4q9M8zWES9QZcIJ+hD08HPlqyGucuZOrqwLQ51UegEG0hx94bt5ZDDpbAbwDl+0tUj59NKG98dPE0cZfwS+7mKmn7Ym+2l4rmuQpGQuZv2MCUBlVt+E5xZXxOTWV83HIbPgkP8+u/LtlNAhTFR9ehRG/0sFEmeLL5/w=,iv:ANa5xKiHr+06hatYIVsO/EFMiPymuXp8RndXzAcMl0Q=,tag:UDVpNvW8rudwk2x10uFPKw==,type:str] + pgp: [] + unencrypted_suffix: _unencrypted + version: 3.7.3 diff --git a/config/clusters/nasa-cryo/prod.values.yaml b/config/clusters/nasa-cryo/prod.values.yaml new file mode 100644 index 0000000000..960961c70a --- /dev/null +++ b/config/clusters/nasa-cryo/prod.values.yaml @@ -0,0 +1,9 @@ +basehub: + userServiceAccount: + annotations: + eks.amazonaws.com/role-arn: arn:aws:iam::574251165169:role/nasa-cryo-prod + jupyterhub: + hub: + config: + GitHubOAuthenticator: + oauth_callback_url: https://cryointhecloud.2i2c.cloud/hub/oauth_callback From 008b03d577b5546d49ac80980059f3e97ca3fa30 Mon Sep 17 00:00:00 2001 From: Sarah Gibson Date: Wed, 19 Oct 2022 13:31:08 +0100 Subject: [PATCH 23/29] Add new cluster to deploy and validate workflow files --- .github/workflows/deploy-hubs.yaml | 1 + .github/workflows/validate-clusters.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/deploy-hubs.yaml b/.github/workflows/deploy-hubs.yaml index 5adfeccbb5..ee383a6c3d 100644 --- a/.github/workflows/deploy-hubs.yaml +++ b/.github/workflows/deploy-hubs.yaml @@ -156,6 +156,7 @@ jobs: failure_linked-earth: "${{ steps.declare-failure-status.outputs.failure_linked-earth }}" failure_awi-ciroh: "${{ steps.declare-failure-status.outputs.failure_awi-ciroh }}" failure_callysto: "${{ steps.declare-failure-status.outputs.failure_callysto }}" + failure_nasa-cryo: "${{ steps.declare-failure-status.outputs.failure_nasa-cryo }}" # Only run this job on pushes to the 
default branch and when the job output is not # an empty list diff --git a/.github/workflows/validate-clusters.yaml b/.github/workflows/validate-clusters.yaml index 0f7b3cd48d..46061b2fee 100644 --- a/.github/workflows/validate-clusters.yaml +++ b/.github/workflows/validate-clusters.yaml @@ -50,6 +50,7 @@ jobs: - cluster_name: linked-earth - cluster_name: awi-ciroh - cluster_name: callysto + - cluster_name: nasa-cryo steps: - uses: actions/checkout@v3 From 559c1e384635c75d55c791380d40fb93a0160dc1 Mon Sep 17 00:00:00 2001 From: Sarah Gibson Date: Wed, 19 Oct 2022 15:33:21 +0100 Subject: [PATCH 24/29] Remove serviceAccountName from common config I think this was copy-pasta from openscapes and was causing user servers to not spawn --- config/clusters/nasa-cryo/common.values.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/config/clusters/nasa-cryo/common.values.yaml b/config/clusters/nasa-cryo/common.values.yaml index ad3b8dcf58..900c7d3891 100644 --- a/config/clusters/nasa-cryo/common.values.yaml +++ b/config/clusters/nasa-cryo/common.values.yaml @@ -60,7 +60,6 @@ basehub: scope: - read:org singleuser: - serviceAccountName: cloud-user-sa defaultUrl: /lab # User image repo: https://github.com/CryoInTheCloud/CryoCloudWebsite/tree/main/conda image: From fedcfb2c2d7a10bfa06d95fb7adfaa40bea59611 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Wed, 19 Oct 2022 10:47:28 -0700 Subject: [PATCH 25/29] Enable autoscaler for the nasa-cryo cluster --- config/clusters/nasa-cryo/support.values.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/config/clusters/nasa-cryo/support.values.yaml b/config/clusters/nasa-cryo/support.values.yaml index 1d0c7c8021..004fa9ff5a 100644 --- a/config/clusters/nasa-cryo/support.values.yaml +++ b/config/clusters/nasa-cryo/support.values.yaml @@ -1,6 +1,12 @@ prometheusIngressAuthSecret: enabled: true +cluster-autoscaler: + enabled: true + autoDiscovery: + clusterName: nasa-cryo + awsRegion: us-west-2 + grafana: ingress: hosts: From 
c46e9b57554626289554c2044d9fa12888c00dd7 Mon Sep 17 00:00:00 2001 From: Sarah Gibson Date: Thu, 20 Oct 2022 11:44:21 +0100 Subject: [PATCH 26/29] Add note on AWS quotas to docs --- docs/howto/cloud-accounts/new-aws-account.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/howto/cloud-accounts/new-aws-account.md b/docs/howto/cloud-accounts/new-aws-account.md index 40ed220749..7b4aa84fe3 100644 --- a/docs/howto/cloud-accounts/new-aws-account.md +++ b/docs/howto/cloud-accounts/new-aws-account.md @@ -35,6 +35,14 @@ Finally, we should check what quotas are enforced on the account and increase th 4. Click the "Request quota increase" button in the "Recent quota increase requests" section of the page 5. Fill in the form that pops up and change the quota value (must be greater than the current quota value), then click "Request" +The quotas we most often need to increase are the [EC2 quotas](https://us-east-1.console.aws.amazon.com/servicequotas/home/services/ec2/quotas), so that new nodes are able to spin up. +In particular, we need to increase: + +- `All Standard (A, C, D, H, I, M, R, T, Z) Spot Instance Requests`: This is what dask instances use (as they are spot instances) +- `Running On-Demand Standard (A, C, D, H, I, M, R, T, Z) instances`: This is what is used for core and notebook instances + +These quotas are expressed as a total number of CPUs, so larger nodes consume more quota.
+ [^1]: AWS documentation on creating new accounts in an Organization: [^2]: AWS documentation on managing account access: [^3]: AWS documentation on service quotas: From 95bae944836a9939e225dc646935db845b64c10a Mon Sep 17 00:00:00 2001 From: Sarah Gibson Date: Thu, 20 Oct 2022 11:55:18 +0100 Subject: [PATCH 27/29] Include warning about enabling the cluster-autoscaler subchart in support for AWS clusters --- docs/howto/operate/grafana.md | 13 +++++++++++++ docs/howto/operate/new-cluster/aws.md | 15 +++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/docs/howto/operate/grafana.md b/docs/howto/operate/grafana.md index 4aa3b8b9b3..c169b95c15 100644 --- a/docs/howto/operate/grafana.md +++ b/docs/howto/operate/grafana.md @@ -116,6 +116,19 @@ prometheus: - ``` +````{warning} +If you are deploying the support chart on an AWS cluster, you **must** enable the `cluster-autoscaler` sub-chart, otherwise the node groups will not automatically scale. +Include the following in your `support.values.yaml` file: + +``` +cluster-autoscaler: + enabled: true + autoDiscovery: + clusterName: <cluster-name> + awsRegion: <aws-region> +``` +```` + #### Create a `enc-support.secret.values.yaml` file Only 2i2c staff + our centralized grafana should be able to access the diff --git a/docs/howto/operate/new-cluster/aws.md b/docs/howto/operate/new-cluster/aws.md index c73f994111..313468b8b7 100644 --- a/docs/howto/operate/new-cluster/aws.md +++ b/docs/howto/operate/new-cluster/aws.md @@ -303,3 +303,18 @@ The [CI deploy-hubs workflow](https://github.com/2i2c-org/infrastructure/tree/HEAD/.github/workflows/deploy-hubs.yaml#L31-L36) contains the list of clusters being automatically deployed by our CI/CD system. Make sure there is an entry for new AWS cluster. + +## A note on the support chart for AWS clusters + +````{warning} +When you deploy the support chart on an AWS cluster, you **must** enable the `cluster-autoscaler` sub-chart, otherwise the node groups will not automatically scale.
+Include the following in your `support.values.yaml` file: + +``` +cluster-autoscaler: + enabled: true + autoDiscovery: + clusterName: <cluster-name> + awsRegion: <aws-region> +``` +```` From e21b25f665eb621f0073b6571cdef73e13f2d97e Mon Sep 17 00:00:00 2001 From: Sarah Gibson Date: Fri, 21 Oct 2022 10:38:51 +0100 Subject: [PATCH 28/29] Update team names to correct capitalisation --- config/clusters/nasa-cryo/common.values.yaml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/config/clusters/nasa-cryo/common.values.yaml b/config/clusters/nasa-cryo/common.values.yaml index 900c7d3891..dc11a44eba 100644 --- a/config/clusters/nasa-cryo/common.values.yaml +++ b/config/clusters/nasa-cryo/common.values.yaml @@ -55,8 +55,8 @@ basehub: populate_teams_in_auth_state: true allowed_organizations: - 2i2c-org:tech-team - - CryoInTheCloud:CryoCloudUser - - CryoInTheCloud:CryoCloudAdvanced + - CryoInTheCloud:cryoclouduser + - CryoInTheCloud:cryocloudadvanced scope: - read:org singleuser: @@ -81,8 +81,8 @@ basehub: default: true allowed_teams: - 2i2c-org:tech-team - - CryoInTheCloud:CryoCloudUser - - CryoInTheCloud:CryoCloudAdvanced + - CryoInTheCloud:cryoclouduser + - CryoInTheCloud:cryocloudadvanced kubespawner_override: # Explicitly unset mem_limit, so it overrides the default memory limit we set in # basehub/values.yaml @@ -94,8 +94,8 @@ basehub: description: "~4 CPU, ~15G RAM" allowed_teams: - 2i2c-org:tech-team - - CryoInTheCloud:CryoCloudUser - - CryoInTheCloud:CryoCloudAdvanced + - CryoInTheCloud:cryoclouduser + - CryoInTheCloud:cryocloudadvanced kubespawner_override: mem_limit: null mem_guarantee: 12G @@ -105,7 +105,7 @@ basehub: description: "~8 CPU, ~30G RAM" allowed_teams: - 2i2c-org:tech-team - - CryoInTheCloud:CryoCloudAdvanced + - CryoInTheCloud:cryocloudadvanced kubespawner_override: mem_limit: null mem_guarantee: 26G @@ -115,7 +115,7 @@ basehub: description: "~32 CPU, ~128G RAM" allowed_teams: - 2i2c-org:tech-team - - CryoInTheCloud:CryoCloudAdvanced + -
CryoInTheCloud:cryocloudadvanced kubespawner_override: mem_limit: null mem_guarantee: 115G From f6ba2bd4561223154d2a8d041df95917beceaf70 Mon Sep 17 00:00:00 2001 From: Sarah Gibson Date: Fri, 21 Oct 2022 10:50:53 +0100 Subject: [PATCH 29/29] Update domains to what the community would like to use --- config/clusters/nasa-cryo/cluster.yaml | 4 ++-- config/clusters/nasa-cryo/prod.values.yaml | 2 +- config/clusters/nasa-cryo/staging.values.yaml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/config/clusters/nasa-cryo/cluster.yaml b/config/clusters/nasa-cryo/cluster.yaml index 904963665b..8a1e5270f1 100644 --- a/config/clusters/nasa-cryo/cluster.yaml +++ b/config/clusters/nasa-cryo/cluster.yaml @@ -12,7 +12,7 @@ support: hubs: - name: staging display_name: "NASA Cryo in the Cloud (staging)" - domain: staging.cryointhecloud.2i2c.cloud + domain: staging.hub.cryointhecloud.com helm_chart: daskhub auth0: # connection update? Also ensure the basehub Helm chart is provided a @@ -27,7 +27,7 @@ hubs: - enc-staging.secret.values.yaml - name: prod display_name: "NASA Cryo in the Cloud (prod)" - domain: cryointhecloud.2i2c.cloud + domain: hub.cryointhecloud.com helm_chart: daskhub auth0: # connection update? 
Also ensure the basehub Helm chart is provided a diff --git a/config/clusters/nasa-cryo/prod.values.yaml b/config/clusters/nasa-cryo/prod.values.yaml index 960961c70a..69e57ad842 100644 --- a/config/clusters/nasa-cryo/prod.values.yaml +++ b/config/clusters/nasa-cryo/prod.values.yaml @@ -6,4 +6,4 @@ basehub: hub: config: GitHubOAuthenticator: - oauth_callback_url: https://cryointhecloud.2i2c.cloud/hub/oauth_callback + oauth_callback_url: https://hub.cryointhecloud.com/hub/oauth_callback diff --git a/config/clusters/nasa-cryo/staging.values.yaml b/config/clusters/nasa-cryo/staging.values.yaml index 2845ff9964..9d3179dd51 100644 --- a/config/clusters/nasa-cryo/staging.values.yaml +++ b/config/clusters/nasa-cryo/staging.values.yaml @@ -6,4 +6,4 @@ basehub: hub: config: GitHubOAuthenticator: - oauth_callback_url: https://staging.cryointhecloud.2i2c.cloud/hub/oauth_callback + oauth_callback_url: https://staging.hub.cryointhecloud.com/hub/oauth_callback