bug 1904569: add remaining d2g-enabled worker pools for translations GPU workers

Also use the newly minted dated image for them; it appears to be working well in mozilla/translations#700.
bhearsum committed Jan 17, 2025
1 parent d4fc0dd commit 9272cac
1 changed file: worker-pools.yml (174 additions, 2 deletions)
@@ -2080,7 +2080,7 @@ pools:
maxCapacity: 128
implementation: generic-worker/worker-runner-linux-multi
regions: [us-central1, us-west1, us-east1]
- image: ubuntu-2404-headless-alpha
+ image: ubuntu-2404-headless
instance_types:
- minCpuPlatform: Intel Skylake
disks:
@@ -2123,14 +2123,186 @@ pools:
maxCapacity: 128
implementation: generic-worker/worker-runner-linux-multi
regions: [us-central1, us-west1, us-east1]
- image: ubuntu-2404-headless-alpha
+ image: ubuntu-2404-headless
instance_types:
- minCpuPlatform: Intel Skylake
disks:
- <<: *persistent-disk
diskSizeGb: 300
# 40 CPUs, 256GB RAM
machine_type: n1-custom-40-262144
guestAccelerators:
- acceleratorCount: 4
acceleratorType: nvidia-tesla-v100
- pool_id: '{pool-group}/b-linux-v100-gpu-d2g-4-300gb-standard'
description: Worker for machine learning tasks that require standard VMs
owner: [email protected]
variants:
- pool-group: translations-1
email_on_error: true
provider_id:
by-chain-of-trust:
trusted: fxci-level3-gcp
default: fxci-level1-gcp
config:
lifecycle:
# low inactivity timeout because these workers are very expensive
queueInactivityTimeout: 1800
worker-config:
genericWorker:
config:
# 2592900s is 30 days plus 900 seconds to account for https://github.com/taskcluster/taskcluster/issues/7423
maxTaskRunTime: 2592900
enableInteractive: true
d2gConfig:
enableD2G: true
allowGPUs: true
containerEngine: docker
headlessTasks: true
ed25519SigningKeyLocation: '/etc/generic-worker/ed25519_key'
minCapacity: 0
maxCapacity: 50
implementation: generic-worker/worker-runner-linux-multi
regions: [us-central1, us-west1]
image: ubuntu-2404-headless
instance_types:
- minCpuPlatform: Intel Skylake
disks:
- <<: *persistent-disk
diskSizeGb: 300
# 40 CPUs, 256GB RAM
machine_type: n1-custom-40-262144
scheduling: standard
guestAccelerators:
- acceleratorCount: 4
acceleratorType: nvidia-tesla-v100
- pool_id: '{pool-group}/b-linux-v100-gpu-d2g-4-1tb-standard'
description: Worker for machine learning tasks that require standard VMs
owner: [email protected]
variants:
- pool-group: translations-1
email_on_error: true
provider_id:
by-chain-of-trust:
trusted: fxci-level3-gcp
default: fxci-level1-gcp
config:
lifecycle:
# low inactivity timeout because these workers are very expensive
queueInactivityTimeout: 1800
worker-config:
genericWorker:
config:
# 2592900s is 30 days plus 900 seconds to account for https://github.com/taskcluster/taskcluster/issues/7423
maxTaskRunTime: 2592900
enableInteractive: true
d2gConfig:
enableD2G: true
allowGPUs: true
containerEngine: docker
headlessTasks: true
ed25519SigningKeyLocation: '/etc/generic-worker/ed25519_key'
minCapacity: 0
maxCapacity: 50
implementation: generic-worker/worker-runner-linux-multi
regions: [us-central1, us-west1]
image: ubuntu-2404-headless
instance_types:
- minCpuPlatform: Intel Skylake
disks:
- <<: *persistent-disk
diskSizeGb: 1024
# 40 CPUs, 256GB RAM
machine_type: n1-custom-40-262144
scheduling: standard
guestAccelerators:
- acceleratorCount: 4
acceleratorType: nvidia-tesla-v100
- pool_id: '{pool-group}/b-linux-v100-gpu-d2g-4-1tb'
description: Worker for machine learning and other high GPU tasks
owner: [email protected]
variants:
- pool-group: translations-1
email_on_error: true
provider_id:
by-chain-of-trust:
trusted: fxci-level3-gcp
default: fxci-level1-gcp
config:
lifecycle:
# low inactivity timeout because these workers are very expensive
queueInactivityTimeout: 1800
worker-config:
genericWorker:
config:
# 2592900s is 30 days plus 900 seconds to account for https://github.com/taskcluster/taskcluster/issues/7423
maxTaskRunTime: 2592900
enableInteractive: true
d2gConfig:
enableD2G: true
allowGPUs: true
containerEngine: docker
headlessTasks: true
ed25519SigningKeyLocation: '/etc/generic-worker/ed25519_key'
minCapacity: 0
# We use 4 GPUs per instance across 4 regions with a limit of 128 GPUs
# per region at any given time. 4 regions * 128 GPUs = 512 total GPUs.
# 512 GPUs / 4 per instance = 128 instances possibly running at once.
maxCapacity: 128
implementation: generic-worker/worker-runner-linux-multi
regions: [us-central1, us-west1, us-east1]
image: ubuntu-2404-headless
instance_types:
- minCpuPlatform: Intel Skylake
disks:
- <<: *persistent-disk
diskSizeGb: 1024
# 40 CPUs, 256GB RAM
machine_type: n1-custom-40-262144
guestAccelerators:
- acceleratorCount: 4
acceleratorType: nvidia-tesla-v100
- pool_id: '{pool-group}/b-linux-v100-gpu-d2g-4-2tb'
description: Worker for machine learning and other high GPU tasks
owner: [email protected]
variants:
- pool-group: translations-1
email_on_error: true
provider_id:
by-chain-of-trust:
trusted: fxci-level3-gcp
default: fxci-level1-gcp
config:
lifecycle:
# low inactivity timeout because these workers are very expensive
queueInactivityTimeout: 1800
worker-config:
genericWorker:
config:
# 2592900s is 30 days plus 900 seconds to account for https://github.com/taskcluster/taskcluster/issues/7423
maxTaskRunTime: 2592900
enableInteractive: true
d2gConfig:
enableD2G: true
allowGPUs: true
containerEngine: docker
headlessTasks: true
ed25519SigningKeyLocation: '/etc/generic-worker/ed25519_key'
minCapacity: 0
# We use 4 GPUs per instance across 4 regions with a limit of 128 GPUs
# per region at any given time. 4 regions * 128 GPUs = 512 total GPUs.
# 512 GPUs / 4 per instance = 128 instances possibly running at once.
maxCapacity: 128
implementation: generic-worker/worker-runner-linux-multi
regions: [us-central1, us-west1, us-east1]
image: ubuntu-2404-headless
instance_types:
- minCpuPlatform: Intel Skylake
disks:
- <<: *persistent-disk
diskSizeGb: 2048
# 40 CPUs, 256GB RAM
machine_type: n1-custom-40-262144
guestAccelerators:
- acceleratorCount: 4
acceleratorType: nvidia-tesla-v100
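
For context on what the d2gConfig settings above enable: tasks aimed at these pools can keep a docker-worker style payload, which D2G translates into a generic-worker Docker invocation, with GPU access exposed to the container because allowGPUs is true. Below is a minimal, hypothetical task fragment illustrating that shape; the pool-group and worker type come from this change, while the image, command, and other values are placeholders rather than anything taken from the translations repository.

# Hypothetical task fragment (illustrative only, not part of this commit)
provisionerId: translations-1                       # pool-group from the variants above
workerType: b-linux-v100-gpu-d2g-4-300gb-standard   # one of the pools added here
payload:
  # docker-worker style payload; D2G rewrites this for generic-worker
  image: example.org/translations/training:latest   # placeholder image
  command:
    - bash
    - -c
    - nvidia-smi && ./train.sh                      # placeholder command
  maxRunTime: 86400                                 # must stay below the pool's maxTaskRunTime (2592900)
  env:
    GPUS: '4'                                       # matches acceleratorCount: 4 in the pool config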
