Cleanup AWS GPU docs & add T4s for uwhackweeks #1787

Merged · 1 commit · Oct 18, 2022
52 changes: 44 additions & 8 deletions config/clusters/uwhackweeks/common.values.yaml
@@ -28,6 +28,10 @@ basehub:
name: ICESat Hackweek
url: https://icesat-2.hackweek.io
singleuser:
extraEnv:
# Temporarily set for *all* pods, including pods without any GPUs,
# to work around https://github.com/2i2c-org/infrastructure/issues/1530
NVIDIA_DRIVER_CAPABILITIES: compute,utility
defaultUrl: /lab
# User image repo: https://github.com/ICESAT-2HackWeek/website2022
image:
@@ -72,18 +76,50 @@ basehub:
mem_guarantee: 115G
node_selector:
node.kubernetes.io/instance-type: m5.8xlarge
- display_name: "Large + GPU: p2.xlarge"
description: "~4CPUs, 60G RAM, 1 NVIDIA K80 GPU"

- display_name: "GPU"
# P2.xlarge has 64G of RAM per GPU while g4dn has 16?!
description: |
~4CPUs, Nvidia K80 or T4 GPU.

K80 comes with 55G of RAM, while T4 comes with about 14G
profile_options:
gpu:
display_name: GPU
choices:
k80:
display_name: NVidia Tesla K80
slug: k80
kubespawner_override:
mem_guarantee: 55G
node_selector:
node.kubernetes.io/instance-type: p2.xlarge
t4:
display_name: NVidia Tesla T4
slug: t4
default: true
kubespawner_override:
mem_guarantee: 14G
node_selector:
node.kubernetes.io/instance-type: g4dn.xlarge
image:
display_name: Image
choices:
tensorflow:
display_name: Pangeo Tensorflow ML Notebook
slug: "tensorflow"
kubespawner_override:
image: "pangeo/ml-notebook:2022.10.13"
pytorch:
display_name: Pangeo PyTorch ML Notebook
default: true
slug: "pytorch"
kubespawner_override:
image: "pangeo/pytorch-notebook:2022.10.13"
kubespawner_override:
mem_limit: null
mem_guarantee: 55G
image: "pangeo/ml-notebook:master"
environment:
NVIDIA_DRIVER_CAPABILITIES: compute,utility
extra_resource_limits:
nvidia.com/gpu: "1"
node_selector:
node.kubernetes.io/instance-type: p2.xlarge
scheduling:
userPlaceholder:
enabled: false
4 changes: 0 additions & 4 deletions config/clusters/uwhackweeks/support.values.yaml
@@ -1,10 +1,6 @@
prometheusIngressAuthSecret:
enabled: true

nvidiaDevicePlugin:
aws:
enabled: true

prometheus:
server:
ingress:
77 changes: 53 additions & 24 deletions docs/howto/features/gpu.md
@@ -24,13 +24,14 @@ series nodes.
5. Select 'Request Quota Increase'.
6. Input the *number of vCPUs* needed. This translates to a total
   number of GPU nodes based on how many vCPUs the node type we want has.
For example, if we are using [P2 nodes](https://aws.amazon.com/ec2/instance-types/p2/)
with NVIDIA K80 GPUs, each `p2.xlarge` node gives us 1 GPU and
For example, if we are using [G4 nodes](https://aws.amazon.com/ec2/instance-types/g4/)
   with NVIDIA T4 GPUs, each `g4dn.xlarge` node gives us 1 GPU and
4 vCPUs, so a quota of 8 vCPUs will allow us to spawn 2 GPU nodes.
We should fine tune this calculation for later, but for now, the
recommendation is to give users a `p2.xlarge` each, so the number
recommendation is to give users a single `g4dn.xlarge` each, so the number
   of vCPUs requested should be `4 * max number of GPU nodes` (see the sketch after this list).
7. Ask for the increase, and wait. This can take *several working days*.
7. Ask for the increase, and wait. This can take *several working days*,
so do it as early as possible!
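
To sanity-check the vCPU arithmetic from step 6, here is a minimal sketch in Python (the figure of 4 vCPUs per node assumes `g4dn.xlarge`; substitute the value for whatever instance type you actually use):

```python
# Rough quota estimate: each g4dn.xlarge node has 1 GPU and 4 vCPUs,
# so the vCPU quota caps how many GPU nodes can run at once.
def vcpu_quota_needed(max_gpu_nodes: int, vcpus_per_node: int = 4) -> int:
    return max_gpu_nodes * vcpus_per_node

# Example: allow up to 10 concurrent single-GPU users
print(vcpu_quota_needed(10))  # 40 vCPUs
```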

#### Setup GPU nodegroup on eksctl

@@ -43,14 +44,14 @@ AWS, and we can configure a node group there to provide us GPUs.

```
{
instanceType: "p2.xlarge",
instanceType: "g4dn.xlarge",
tags+: {
"k8s.io/cluster-autoscaler/node-template/resources/nvidia.com/gpu": "1"
},
}
```

`p2.xlarge` gives us 1 K80 GPU and ~4 CPUs. The `tags` definition
`g4dn.xlarge` gives us 1 Nvidia T4 GPU and ~4 CPUs. The `tags` definition
is necessary to let the autoscaler know that this nodegroup has
1 GPU per node. If you're using a different machine type with
more GPUs, adjust this definition accordingly.
@@ -64,15 +65,12 @@ AWS, and we can configure a node group there to provide us GPUs.
3. Create the nodegroup

```bash
eksctl create nodegroup -f <your-cluster>.eksctl.yaml --install-nvidia-plugin=false
eksctl create nodegroup -f <your-cluster>.eksctl.yaml
```

The `--install-nvidia-plugin=false` is required until
[this bug](https://github.com/weaveworks/eksctl/issues/5277)
is fixed.

This should create the nodegroup with 0 nodes in it, and the
autoscaler should recognize this!
   autoscaler should recognize this! `eksctl` will also set up the
   appropriate driver installer, so you won't have to.

#### Setting up a GPU user profile

@@ -81,29 +79,51 @@ a profile. This should be placed in the hub configuration:

```yaml
jupyterhub:
singleuser:
profileList:
- display_name: "Large + GPU: p2.xlarge"
description: "~4CPUs, 60G RAM, 1 NVIDIA K80 GPU"
extraEnv:
# Temporarily set for *all* pods, including pods without any GPUs,
# to work around https://github.com/2i2c-org/infrastructure/issues/1530
NVIDIA_DRIVER_CAPABILITIES: compute,utility
singleuser:
profileList:
- display_name: Large + GPU
description: 14GB RAM, 4 CPUs, T4 GPU
profile_options:
gpu:
image:
display_name: Image
choices:
tensorflow:
display_name: Pangeo Tensorflow ML Notebook
slug: "tensorflow"
kubespawner_override:
node.kubernetes.io/instance-type: g4dn.xlarge
image: "pangeo/ml-notebook:<tag>"
pytorch:
display_name: Pangeo PyTorch ML Notebook
default: true
slug: "pytorch"
kubespawner_override:
node.kubernetes.io/instance-type: g4dn.xlarge
image: "pangeo/pytorch-notebook:<tag>"
kubespawner_override:
mem_limit: null
mem_guarantee: 55G
image: "pangeo/ml-notebook:<tag>"
environment:
NVIDIA_DRIVER_CAPABILITIES: compute,utility
mem_guarantee: 14G
extra_resource_limits:
nvidia.com/gpu: "1"
node_selector:
node.kubernetes.io/instance-type: p2.xlarge
```

1. If using a `daskhub`, place this under the `basehub` key.
2. The image used should have ML tools (pytorch, cuda, etc)
installed. The recommendation is to use Pangeo's
installed. The recommendation is to provide Pangeo's
[ml-notebook](https://hub.docker.com/r/pangeo/ml-notebook)
for tensorflow and [pytorch-notebook](https://hub.docker.com/r/pangeo/pytorch-notebook)
for pytorch. **Do not** use the `latest` or `master` tags - find
for pytorch. We expose these as options so users can pick what they want
to use.

```{warning}
**Do not** use the `latest` or `master` tags - find
a specific tag listed for the image you want, and use that.
```
3. The [NVIDIA_DRIVER_CAPABILITIES](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/user-guide.html#driver-capabilities)
environment variable tells the GPU driver what kind of libraries
and tools to inject into the container. Without setting this,
@@ -134,6 +154,15 @@ this works!
```
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
```

If on an image with pytorch instead, try this:
```python
import torch

torch.cuda.is_available()
```

This should return `True`.
4. Remember to explicitly shut down your server after testing,
as GPU instances can get expensive!
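
As one more check (a sketch, assuming one of the Pangeo images above), you can confirm that the `NVIDIA_DRIVER_CAPABILITIES` variable from the profile configuration actually reached your container, and ask pytorch which GPU it sees:

```python
import os

# Should print "compute,utility" if the profile's environment override applied
print(os.environ.get("NVIDIA_DRIVER_CAPABILITIES", "<not set>"))

try:
    import torch

    if torch.cuda.is_available():
        # On a g4dn.xlarge node this should report a Tesla T4
        print(torch.cuda.get_device_name(0))
except ImportError:
    # The tensorflow image may not include pytorch; use the tensorflow check above
    pass
```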

2 changes: 1 addition & 1 deletion docs/reference/tools.md
@@ -136,5 +136,5 @@ With just one tool to download and configure, you can control multiple AWS services
`eksctl` is a simple CLI tool for creating and managing clusters on EKS - Amazon's
managed Kubernetes service for EC2. See [the `eksctl` documentation for more information](https://docs.aws.amazon.com/eks/latest/userguide/getting-started-eksctl.html).

Make sure you are using at least version 0.97. You
Make sure you are using at least version 0.115. You
can check the installed version with `eksctl version`
6 changes: 6 additions & 0 deletions eksctl/uwhackweeks.jsonnet
@@ -26,6 +26,12 @@ local notebookNodes = [
"k8s.io/cluster-autoscaler/node-template/resources/nvidia.com/gpu": "1"
},
},
{
instanceType: "g4dn.xlarge", minSize: 0,
tags+: {
"k8s.io/cluster-autoscaler/node-template/resources/nvidia.com/gpu": "1"
},
},
];

// Node definitions for dask worker nodes. Config here is merged
76 changes: 0 additions & 76 deletions helm-charts/support/templates/aws-nvidia-device-plugin.yaml

This file was deleted.

9 changes: 0 additions & 9 deletions helm-charts/support/values.schema.yaml
@@ -60,7 +60,6 @@ properties:
required:
- azure
- gke
- aws
properties:
azure:
type: object
@@ -70,14 +69,6 @@
properties:
enabled:
type: boolean
aws:
type: object
additionalProperties: false
required:
- enabled
properties:
enabled:
type: boolean
gke:
type: object
additionalProperties: false
3 changes: 0 additions & 3 deletions helm-charts/support/values.yaml
@@ -105,9 +105,6 @@ nvidiaDevicePlugin:
gke:
enabled: false
version: "stable"
# For eksctl / AWS specific daemonset, defaults to false
aws:
enabled: false

# Enables https://github.com/yuvipanda/cryptnono/ to prevent cryptomining
cryptnono: