diff --git a/config/clusters/uwhackweeks/common.values.yaml b/config/clusters/uwhackweeks/common.values.yaml
index 08882e7b26..021129287f 100644
--- a/config/clusters/uwhackweeks/common.values.yaml
+++ b/config/clusters/uwhackweeks/common.values.yaml
@@ -28,6 +28,10 @@ basehub:
             name: ICESat Hackweek
             url: https://icesat-2.hackweek.io
     singleuser:
+      extraEnv:
+        # Temporarily set for *all* pods, including pods without any GPUs,
+        # to work around https://github.com/2i2c-org/infrastructure/issues/1530
+        NVIDIA_DRIVER_CAPABILITIES: compute,utility
       defaultUrl: /lab
       # User image repo: https://github.com/ICESAT-2HackWeek/website2022
       image:
@@ -72,18 +76,50 @@ basehub:
             mem_guarantee: 115G
             node_selector:
               node.kubernetes.io/instance-type: m5.8xlarge
-        - display_name: "Large + GPU: p2.xlarge"
-          description: "~4CPUs, 60G RAM, 1 NVIDIA K80 GPU"
+
+        - display_name: "GPU"
+          # P2.xlarge has 64G of RAM per GPU while g4dn has 16?!
+          description: |
+            ~4CPUs, Nvidia K80 or T4 GPU.
+
+            K80 comes with 55G of RAM, while T4 comes with about 14G
+          profile_options:
+            gpu:
+              display_name: GPU
+              choices:
+                k80:
+                  display_name: NVidia Tesla K80
+                  slug: k80
+                  kubespawner_override:
+                    mem_guarantee: 55G
+                    node_selector:
+                      node.kubernetes.io/instance-type: p2.xlarge
+                t4:
+                  display_name: NVidia Tesla T4
+                  slug: t4
+                  default: true
+                  kubespawner_override:
+                    mem_guarantee: 14G
+                    node_selector:
+                      node.kubernetes.io/instance-type: g4dn.xlarge
+            image:
+              display_name: Image
+              choices:
+                tensorflow:
+                  display_name: Pangeo Tensorflow ML Notebook
+                  slug: "tensorflow"
+                  kubespawner_override:
+                    image: "pangeo/ml-notebook:2022.10.13"
+                pytorch:
+                  display_name: Pangeo PyTorch ML Notebook
+                  default: true
+                  slug: "pytorch"
+                  kubespawner_override:
+                    image: "pangeo/pytorch-notebook:2022.10.13"
           kubespawner_override:
             mem_limit: null
-            mem_guarantee: 55G
-            image: "pangeo/ml-notebook:master"
-            environment:
-              NVIDIA_DRIVER_CAPABILITIES: compute,utility
             extra_resource_limits:
               nvidia.com/gpu: "1"
-            node_selector:
-              node.kubernetes.io/instance-type: p2.xlarge
     scheduling:
       userPlaceholder:
         enabled: false
diff --git a/config/clusters/uwhackweeks/support.values.yaml b/config/clusters/uwhackweeks/support.values.yaml
index adc81d13fc..2ea1760805 100644
--- a/config/clusters/uwhackweeks/support.values.yaml
+++ b/config/clusters/uwhackweeks/support.values.yaml
@@ -1,10 +1,6 @@
 prometheusIngressAuthSecret:
   enabled: true
 
-nvidiaDevicePlugin:
-  aws:
-    enabled: true
-
 prometheus:
   server:
     ingress:
diff --git a/docs/howto/features/gpu.md b/docs/howto/features/gpu.md
index 473dcf679e..2d81277061 100644
--- a/docs/howto/features/gpu.md
+++ b/docs/howto/features/gpu.md
@@ -24,13 +24,14 @@ series nodes.
 5. Select 'Request Quota Increase'.
 6. Input the *number of vCPUs* needed. This translates to a total
    number of GPU nodes based on how many CPUs the nodes we want have.
-   For example, if we are using [P2 nodes](https://aws.amazon.com/ec2/instance-types/p2/)
-   with NVIDIA K80 GPUs, each `p2.xlarge` node gives us 1 GPU and
+   For example, if we are using [G4 nodes](https://aws.amazon.com/ec2/instance-types/g4/)
+   with NVIDIA K80 GPUs, each `g4dn.xlarge` node gives us 1 GPU and
    4 vCPUs, so a quota of 8 vCPUs will allow us to spawn 2 GPU
    nodes. We should fine tune this calculation for later, but for now, the
-   recommendation is to give users a `p2.xlarge` each, so the number
+   recommendation is to give users a single `g4dn.xlarge` each, so the number
    of vCPUs requested should be `4 * max number of GPU nodes`.
-7. Ask for the increase, and wait. This can take *several working days*.
+7. Ask for the increase, and wait. This can take *several working days*,
+   so do it as early as possible!
 
 #### Setup GPU nodegroup on eksctl
 
@@ -43,14 +44,14 @@ AWS, and we can configure a node group there to provide us GPUs.
 
    ```
    {
-       instanceType: "p2.xlarge",
+       instanceType: "g4dn.xlarge",
        tags+: {
            "k8s.io/cluster-autoscaler/node-template/resources/nvidia.com/gpu": "1"
        },
    }
   ```
 
-   `p2.xlarge` gives us 1 K80 GPU and ~4 CPUs. The `tags` definition
+   `g4dn.xlarge` gives us 1 Nvidia T4 GPU and ~4 CPUs. The `tags` definition
    is necessary to let the autoscaler know that this nodegroup has
    1 GPU per node. If you're using a different machine type with
    more GPUs, adjust this definition accordingly.
@@ -64,15 +65,12 @@ AWS, and we can configure a node group there to provide us GPUs.
 3. Create the nodegroup
 
    ```bash
-   eksctl create nodegroup -f .eksctl.yaml --install-nvidia-plugin=false
+   eksctl create nodegroup -f .eksctl.yaml
   ```
 
-   The `--install-nvidia-plugin=false` is required until
-   [this bug](https://github.com/weaveworks/eksctl/issues/5277)
-   is fixed.
-
    This should create the nodegroup with 0 nodes in it, and the
-   autoscaler should recognize this!
+   autoscaler should recognize this! `eksctl` will also setup the
+   appropriate driver installer, so you won't have to.
 
 #### Setting up a GPU user profile
 
@@ -81,29 +79,51 @@ a profile. This should be placed in the hub configuration:
 
 ```yaml
 jupyterhub:
-  singleuser:
-    profileList:
-    - display_name: "Large + GPU: p2.xlarge"
-      description: "~4CPUs, 60G RAM, 1 NVIDIA K80 GPU"
+  extraEnv:
+    # Temporarily set for *all* pods, including pods without any GPUs,
+    # to work around https://github.com/2i2c-org/infrastructure/issues/1530
+    NVIDIA_DRIVER_CAPABILITIES: compute,utility
+  singleuser:
+    profileList:
+    - display_name: Large + GPU
+      description: 14GB RAM, 4 CPUs, T4 GPU
+      profile_options:
+        gpu:
+        image:
+          display_name: Image
+          choices:
+            tensorflow:
+              display_name: Pangeo Tensorflow ML Notebook
+              slug: "tensorflow"
+              kubespawner_override:
+                node.kubernetes.io/instance-type: g4dn.xlarge
+                image: "pangeo/ml-notebook:<tag>"
+            pytorch:
+              display_name: Pangeo PyTorch ML Notebook
+              default: true
+              slug: "pytorch"
+              kubespawner_override:
+                node.kubernetes.io/instance-type: g4dn.xlarge
+                image: "pangeo/pytorch-notebook:<tag>"
       kubespawner_override:
         mem_limit: null
-        mem_guarantee: 55G
-        image: "pangeo/ml-notebook:<tag>"
-        environment:
-          NVIDIA_DRIVER_CAPABILITIES: compute,utility
+        mem_guarantee: 14G
         extra_resource_limits:
           nvidia.com/gpu: "1"
-      node_selector:
-        node.kubernetes.io/instance-type: p2.xlarge
 ```
 
 1. If using a `daskhub`, place this under the `basehub` key.
 2. The image used should have ML tools (pytorch, cuda, etc)
-   installed. The recommendation is to use Pangeo's
+   installed. The recommendation is to provide Pangeo's
    [ml-notebook](https://hub.docker.com/r/pangeo/ml-notebook) for
    tensorflow and [pytorch-notebook](https://hub.docker.com/r/pangeo/pytorch-notebook)
-   for pytorch. **Do not** use the `latest` or `master` tags - find
+   for pytorch. We expose these as options so users can pick what they want
+   to use.
+
+   ```{warning}
+   **Do not** use the `latest` or `master` tags - find
    a specific tag listed for the image you want, and use that.
+   ```
 3. The [NVIDIA_DRIVER_CAPABILITIES](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/user-guide.html#driver-capabilities)
    environment variable tells the GPU driver what kind of libraries
    and tools to inject into the container. Without setting this,
@@ -134,6 +154,15 @@ this works!
    ```
    [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
   ```
+
+   If on an image with pytorch instead, try this:
+   ```python
+   import torch
+
+   torch.cuda.is_available()
+   ```
+
+   This should return `True`.
 
 4. Remember to explicitly shut down your server after testing,
    as GPU instances can get expensive!
diff --git a/docs/reference/tools.md b/docs/reference/tools.md
index 993d004fc1..fc088e6f27 100644
--- a/docs/reference/tools.md
+++ b/docs/reference/tools.md
@@ -136,5 +136,5 @@ With just one tool to download and configure, you can control multiple AWS servi
 
 `eksctl` is a simple CLI tool for creating and managing clusters on EKS - Amazon's managed Kubernetes service for EC2. See [the `eksctl` documentation for more information](https://docs.aws.amazon.com/eks/latest/userguide/getting-started-eksctl.html).
 
-Make sure you are using at least version 0.97. You
+Make sure you are using at least version 0.115. You
 can check the installed version with `eksctl version`
diff --git a/eksctl/uwhackweeks.jsonnet b/eksctl/uwhackweeks.jsonnet
index 5bd576114b..9b791b6117 100644
--- a/eksctl/uwhackweeks.jsonnet
+++ b/eksctl/uwhackweeks.jsonnet
@@ -26,6 +26,12 @@ local notebookNodes = [
             "k8s.io/cluster-autoscaler/node-template/resources/nvidia.com/gpu": "1"
         },
     },
+    {
+        instanceType: "g4dn.xlarge", minSize: 0,
+        tags+: {
+            "k8s.io/cluster-autoscaler/node-template/resources/nvidia.com/gpu": "1"
+        },
+    },
 ];
 
 // Node definitions for dask worker nodes. Config here is merged
diff --git a/helm-charts/support/templates/aws-nvidia-device-plugin.yaml b/helm-charts/support/templates/aws-nvidia-device-plugin.yaml
deleted file mode 100644
index 6579f1b72e..0000000000
--- a/helm-charts/support/templates/aws-nvidia-device-plugin.yaml
+++ /dev/null
@@ -1,76 +0,0 @@
-# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Sourced from $ kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.11.0/nvidia-device-plugin.yml
-# Could be made automatic if https://github.com/weaveworks/eksctl/issues/5277 is fixed
-
-{{- if .Values.nvidiaDevicePlugin.aws.enabled }}
-apiVersion: apps/v1
-kind: DaemonSet
-metadata:
-  name: nvidia-device-plugin-daemonset
-  namespace: kube-system
-spec:
-  selector:
-    matchLabels:
-      name: nvidia-device-plugin-ds
-  updateStrategy:
-    type: RollingUpdate
-  template:
-    metadata:
-      # This annotation is deprecated. Kept here for backward compatibility
-      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
-      annotations:
-        scheduler.alpha.kubernetes.io/critical-pod: ""
-      labels:
-        name: nvidia-device-plugin-ds
-    spec:
-      tolerations:
-        # This toleration is deprecated. Kept here for backward compatibility
-        # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
-        - key: CriticalAddonsOnly
-          operator: Exists
-        - key: nvidia.com/gpu
-          operator: Exists
-          effect: NoSchedule
-        # Custom tolerations required for our user pods
-        - effect: NoSchedule
-          key: hub.jupyter.org/dedicated
-          operator: Equal
-          value: user
-        - effect: NoSchedule
-          key: hub.jupyter.org_dedicated
-          operator: Equal
-          value: user
-      # Mark this pod as a critical add-on; when enabled, the critical add-on
-      # scheduler reserves resources for critical add-on pods so that they can
-      # be rescheduled after a failure.
-      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
-      priorityClassName: "system-node-critical"
-      containers:
-        - image: nvcr.io/nvidia/k8s-device-plugin:v0.11.0
-          name: nvidia-device-plugin-ctr
-          args: ["--fail-on-init-error=false"]
-          securityContext:
-            allowPrivilegeEscalation: false
-            capabilities:
-              drop: ["ALL"]
-          volumeMounts:
-            - name: device-plugin
-              mountPath: /var/lib/kubelet/device-plugins
-      volumes:
-        - name: device-plugin
-          hostPath:
-            path: /var/lib/kubelet/device-plugins
-{{- end -}}
\ No newline at end of file
diff --git a/helm-charts/support/values.schema.yaml b/helm-charts/support/values.schema.yaml
index 1be0681d96..cebe0dcdb4 100644
--- a/helm-charts/support/values.schema.yaml
+++ b/helm-charts/support/values.schema.yaml
@@ -60,7 +60,6 @@ properties:
     required:
       - azure
       - gke
-      - aws
     properties:
      azure:
        type: object
        additionalProperties: false
        required:
          - enabled
        properties:
          enabled:
            type: boolean
-      aws:
-        type: object
-        additionalProperties: false
-        required:
-          - enabled
-        properties:
-          enabled:
-            type: boolean
      gke:
        type: object
        additionalProperties: false
diff --git a/helm-charts/support/values.yaml b/helm-charts/support/values.yaml
index ab53390da0..492189211b 100644
--- a/helm-charts/support/values.yaml
+++ b/helm-charts/support/values.yaml
@@ -105,9 +105,6 @@ nvidiaDevicePlugin:
   gke:
     enabled: false
     version: "stable"
-  # For eksctl / AWS specific daemonset, defaults to false
-  aws:
-    enabled: false
 
# Enables https://github.com/yuvipanda/cryptnono/ to prevent cryptomining
cryptnono: