diff --git a/docs/howto/features/buckets.md b/docs/howto/features/buckets.md index 2342bb2f4d..dd8efa0617 100644 --- a/docs/howto/features/buckets.md +++ b/docs/howto/features/buckets.md @@ -80,4 +80,46 @@ on why users want this! You can also add other env vars pointing to other buckets users requested. 5. Get this change deployed, and users should now be able to use the buckets! - Currently running users might have to restart their pods for the change to take effect. \ No newline at end of file + Currently running users might have to restart their pods for the change to take effect. + + +## Allowing access to buckets from outside the JupyterHub + +### GCP + +Some hub users want to be able to write to the bucket from outside the hub, +primarily for large data transfer from on-premise systems. Since +[Google Groups](https://groups.google.com) can be used to control access to +GCS buckets, it can be used to allow arbitrary users to write to the bucket! + +1. With your `2i2c.org` google account, go to [Google Groups](https://groups.google.com) and create a new Google Group with the name + "<bucket-name>-writers", where "<bucket-name>" is the name of the bucket + we are going to grant write access to. + +2. Grant "Group Owner" access to the community champion requesting this feature. + They will be able to add / remove users from the group as necessary, and + thus manage access without needing to involve 2i2c engineers. + +3. In the `user_buckets` definition for the bucket in question, add the group + name as an `extra_admin_members`: + + ```terraform + user_buckets = { + "persistent": { + "delete_after": null, + "extra_admin_members": [ + "group:<bucket-name>-writers@googlegroups.com" + ] + } + } + ``` + + Apply this terraform change to create the appropriate permissions for members + of the group to have full read/write access to that GCS bucket. + +4. 
We want the community champions to handle granting / revoking access to + this google group, as well as produce community specific documentation on + how to actually upload data here. We currently do not have a template of + how end users can use this, but something can be stolen from the + [documentation for LEAP users](https://leap-stc.github.io/leap-pangeo/jupyterhub.html#i-have-a-dataset-and-want-to-work-with-it-on-the-hub-how-do-i-upload-it) + diff --git a/docs/howto/features/gpu.md b/docs/howto/features/gpu.md index 77385d5c00..0af5c6039d 100644 --- a/docs/howto/features/gpu.md +++ b/docs/howto/features/gpu.md @@ -6,6 +6,70 @@ GPUs on all major cloud providers. ## Setting up GPU nodes +### GCP + +#### Requesting quota increase + +New GCP projects start with no GPU quota, so we must ask for some to enable +GPUs. + +1. Go to the [GCP Quotas page](https://console.cloud.google.com/apis/api/compute.googleapis.com/quotas), + **and make sure you are in the right project**. + +2. Search for "NVIDIA T4 GPU", and find the entry for the **correct region**. + This is very important, as getting a quota increase in the wrong region means + we have to do this all over again. + +3. Check the box next to the correct quota, and click "Edit Quotas" button + just above the list. + +4. Enter the number of GPUs we want quota for on the right. For a brand new + project, 4 is a good starting number. We can consistently ask for more, + if these get used. GCP requires we provide a description for this quota + increase request - "We need GPUs to work on some ML based research" is + a good start. + +5. Click "Next", and then "Submit Request". + +6. Sometimes the request is immediately granted, other times it takes a few + days. + +#### Setting up GPU nodepools with terraform + +The `notebook_nodes` variable for our GCP terraform accepts a `gpu` +parameter, which can be used to provision a GPU nodepool. 
An example +would look like: + +```terraform +notebook_nodes = { + "gpu-t4": { + min: 0, + max: 20, + machine_type: "n1-highmem-8", + gpu: { + enabled: true, + type: "nvidia-tesla-t4", + count: 1 + }, + # Optional, in case we run into resource exhaustion in the main zone + zones: [ + "us-central1-a", + "us-central1-b", + "us-central1-c", + "us-central1-f" + ] + } +} +``` + +This provisions a `n1-highmem-8` node, where each node has 1 NVidia +T4 GPU. + +In addition, we could ask for GPU nodes to be spawned in whatever zone +available in the same region, rather than just the same zone as the rest +of our notebook nodes. This should only be used if we run into GPU scarcity +issues in the zone! + ### AWS #### Requesting Quota Increase @@ -78,7 +142,7 @@ AWS, and we can configure a node group there to provide us GPUs. autoscaler should recognize this! `eksctl` will also setup the appropriate driver installer, so you won't have to. -#### Setting up a GPU user profile +## Setting up a GPU user profile Finally, we need to give users the option of using the GPU via a profile. This should be placed in the hub configuration: @@ -141,7 +205,7 @@ jupyterhub: Do a deployment with this config, and then we can test to make sure this works! -#### Testing +## Testing 1. Login to the hub, and start a server with the GPU profile you just set up. 
diff --git a/terraform/gcp/buckets.tf b/terraform/gcp/buckets.tf index 04892a8b4e..59044a42bc 100644 --- a/terraform/gcp/buckets.tf +++ b/terraform/gcp/buckets.tf @@ -28,7 +28,7 @@ resource "google_storage_bucket" "user_buckets" { locals { # Nested for loop, thanks to https://www.daveperrett.com/articles/2021/08/19/nested-for-each-with-terraform/ - bucket_permissions = distinct(flatten([ + bucket_admin_permissions = distinct(flatten([ for hub_name, permissions in var.hub_cloud_permissions : [ for bucket_name in permissions.bucket_admin_access : { hub_name = hub_name @@ -36,15 +36,47 @@ locals { } ] ])) + + bucket_readonly_permissions = distinct(flatten([ + for hub_name, permissions in var.hub_cloud_permissions : [ + for bucket_name in permissions.bucket_readonly_access : { + hub_name = hub_name + bucket_name = bucket_name + } + ] + ])) + + bucket_extra_admin_members = distinct(flatten([ + for bucket_name, properties in var.user_buckets : [ + for extra_member in properties.extra_admin_members : { + bucket_name = bucket_name + member = extra_member + } + ] + ])) } resource "google_storage_bucket_iam_member" "member" { - for_each = { for bp in local.bucket_permissions : "${bp.hub_name}.${bp.bucket_name}" => bp } + for_each = { for bp in local.bucket_admin_permissions : "${bp.hub_name}.${bp.bucket_name}" => bp } bucket = google_storage_bucket.user_buckets[each.value.bucket_name].name role = "roles/storage.admin" member = "serviceAccount:${google_service_account.workload_sa[each.value.hub_name].email}" } +resource "google_storage_bucket_iam_member" "member_readonly" { + for_each = { for bp in local.bucket_readonly_permissions : "${bp.hub_name}.${bp.bucket_name}" => bp } + bucket = google_storage_bucket.user_buckets[each.value.bucket_name].name + role = "roles/storage.objectViewer" + member = "serviceAccount:${google_service_account.workload_sa[each.value.hub_name].email}" +} + +resource "google_storage_bucket_iam_member" "extra_admin_members" { + for_each = { for bm in 
local.bucket_extra_admin_members : "${bm.bucket_name}.${bm.member}" => bm } + bucket = google_storage_bucket.user_buckets[each.value.bucket_name].name + role = "roles/storage.admin" + member = each.value.member +} + resource "google_storage_default_object_access_control" "public_rule" { for_each = toset(var.bucket_public_access) bucket = google_storage_bucket.user_buckets[each.key].name diff --git a/terraform/gcp/cluster.tf b/terraform/gcp/cluster.tf index 114ab163f5..e270d09007 100644 --- a/terraform/gcp/cluster.tf +++ b/terraform/gcp/cluster.tf @@ -236,14 +236,16 @@ resource "google_container_node_pool" "core" { # resource ref: https://registry.terraform.io/providers/hashicorp/google-beta/latest/docs/resources/container_node_pool resource "google_container_node_pool" "notebook" { - name = "nb-${each.key}" - cluster = google_container_cluster.cluster.name - project = google_container_cluster.cluster.project - location = google_container_cluster.cluster.location - version = var.k8s_versions.notebook_nodes_version - for_each = var.notebook_nodes + name = "nb-${each.key}" + cluster = google_container_cluster.cluster.name + project = google_container_cluster.cluster.project + version = var.k8s_versions.notebook_nodes_version + + node_locations = each.value.zones == null ? 
google_container_cluster.cluster.node_locations : each.value.zones + + initial_node_count = each.value.min autoscaling { min_node_count = each.value.min @@ -335,12 +337,12 @@ resource "google_container_node_pool" "notebook" { # resource ref: https://registry.terraform.io/providers/hashicorp/google-beta/latest/docs/resources/container_node_pool resource "google_container_node_pool" "dask_worker" { - name = "dask-${each.key}" - cluster = google_container_cluster.cluster.name - project = google_container_cluster.cluster.project - location = google_container_cluster.cluster.location - version = var.k8s_versions.dask_nodes_version + name = "dask-${each.key}" + cluster = google_container_cluster.cluster.name + project = google_container_cluster.cluster.project + version = var.k8s_versions.dask_nodes_version + node_locations = each.value.zones == null ? google_container_cluster.cluster.node_locations : each.value.zones # Default to same config as notebook nodepools config for_each = var.dask_nodes diff --git a/terraform/gcp/projects/leap.tfvars b/terraform/gcp/projects/leap.tfvars index e7342858fd..0382edd486 100644 --- a/terraform/gcp/projects/leap.tfvars +++ b/terraform/gcp/projects/leap.tfvars @@ -25,17 +25,30 @@ filestore_capacity_gb = 2048 user_buckets = { "scratch-staging" : { - "delete_after" : 7 + "delete_after" : 7, + "extra_admin_members" : [] }, "scratch" : { - "delete_after" : 7 + "delete_after" : 7, + "extra_admin_members" : [] } # For https://github.com/2i2c-org/infrastructure/issues/1230#issuecomment-1278183441 "persistent" : { - "delete_after" : null + "delete_after" : null, + "extra_admin_members" : ["group:leap-persistent-bucket-writers@googlegroups.com"] }, "persistent-staging" : { - "delete_after" : null + "delete_after" : null, + "extra_admin_members" : ["group:leap-persistent-bucket-writers@googlegroups.com"] + } + # For https://github.com/2i2c-org/infrastructure/issues/1230#issuecomment-1278183441 + "persistent-ro" : { + 
"delete_after" : null, + "extra_admin_members" : ["group:leap-persistent-bucket-writers@googlegroups.com"] + }, + "persistent-ro-staging" : { + "delete_after" : null, + "extra_admin_members" : ["group:leap-persistent-bucket-writers@googlegroups.com"] } } @@ -43,11 +56,13 @@ hub_cloud_permissions = { "staging" : { requestor_pays : true, bucket_admin_access : ["scratch-staging", "persistent-staging"], + bucket_readonly_access : ["persistent-ro-staging"], hub_namespace : "staging" }, "prod" : { requestor_pays : true, bucket_admin_access : ["scratch", "persistent"], + bucket_readonly_access : ["persistent-ro"], hub_namespace : "prod" } } @@ -69,7 +84,15 @@ notebook_nodes = { enabled : true, type : "nvidia-tesla-t4", count : 1 - } + }, + zones : [ + # Get GPUs wherever they are available, as sometimes a single + # zone might be out of GPUs. + "us-central1-a", + "us-central1-b", + "us-central1-c", + "us-central1-f" + ] }, } diff --git a/terraform/gcp/projects/m2lines.tfvars b/terraform/gcp/projects/m2lines.tfvars index 92bf225e0b..bf4c67d9ff 100644 --- a/terraform/gcp/projects/m2lines.tfvars +++ b/terraform/gcp/projects/m2lines.tfvars @@ -22,13 +22,16 @@ user_buckets = { }, # For https://2i2c.freshdesk.com/a/tickets/218 "persistent" : { - "delete_after" : null + "delete_after" : null, + "extra_admin_members" : ["group:m2lines-persistent-bucket-writers@googlegroups.com"] }, "persistent-staging" : { - "delete_after" : null + "delete_after" : null, + "extra_admin_members" : ["group:m2lines-persistent-bucket-writers@googlegroups.com"] }, "public-persistent" : { - "delete_after" : null + "delete_after" : null, + "extra_admin_members" : ["group:m2lines-persistent-bucket-writers@googlegroups.com"] }, } diff --git a/terraform/gcp/variables.tf b/terraform/gcp/variables.tf index ff741f99c0..80d674d1db 100644 --- a/terraform/gcp/variables.tf +++ b/terraform/gcp/variables.tf @@ -83,7 +83,8 @@ variable "notebook_nodes" { }), {} ), - resource_labels : optional(map(string), {}) + 
resource_labels : optional(map(string), {}), + zones : optional(list(string), null) })) description = "Notebook node pools to create" default = {} @@ -109,7 +110,8 @@ variable "dask_nodes" { }), {} ), - resource_labels : optional(map(string), {}) + resource_labels : optional(map(string), {}), + zones : optional(list(string), null) })) description = "Dask node pools to create. Defaults to notebook_nodes" default = {} @@ -223,7 +225,7 @@ variable "enable_network_policy" { } variable "user_buckets" { - type = map(object({ delete_after : number })) + type = map(object({ delete_after : number, extra_admin_members : optional(list(string), []) })) default = {} description = <<-EOT GCS Buckets to be created. @@ -231,9 +233,16 @@ variable "user_buckets" { The key for each entry will be prefixed with {var.prefix}- to form the name of the bucket. - The value is a map, with 'delete_after' the only accepted key in that - map - it lists the number of days after which any content in the - bucket will be deleted. Set to null to not delete data. + The value is a map, accepting the following keys: + + 'delete_after' specifies the number of days after which any content + in the bucket will be deleted. Set to null to not delete data. + + 'extra_admin_members' describes extra identies (user groups, user accounts, + service accounts, etc) that will have *full* access to this bucket. This + is primarily useful for moving data into and out of buckets from outside + the cloud. See https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_iam#member/members + for the format this would be specified in. 
EOT } @@ -341,7 +350,14 @@ variable "max_cpu" { } variable "hub_cloud_permissions" { - type = map(object({ requestor_pays : bool, bucket_admin_access : set(string), hub_namespace : string })) + type = map( + object({ + requestor_pays : bool, + bucket_admin_access : set(string), + bucket_readonly_access : optional(set(string), []), + hub_namespace : string + }) + ) default = {} description = <<-EOT Map of cloud permissions given to a particular hub