bug 1904569: add remaining d2g-enabled worker pools for translations GPU workers

Also use the newly minted dated image for them; it appears to be working well in mozilla/translations#700.
bhearsum committed Jan 17, 2025
1 parent d4fc0dd commit 9272cac
1 changed file: worker-pools.yml (174 additions, 2 deletions)
@@ -2080,7 +2080,7 @@ pools:
maxCapacity: 128
implementation: generic-worker/worker-runner-linux-multi
regions: [us-central1, us-west1, us-east1]
- image: ubuntu-2404-headless-alpha
+ image: ubuntu-2404-headless
instance_types:
- minCpuPlatform: Intel Skylake
disks:
@@ -2123,14 +2123,186 @@ pools:
maxCapacity: 128
implementation: generic-worker/worker-runner-linux-multi
regions: [us-central1, us-west1, us-east1]
- image: ubuntu-2404-headless-alpha
+ image: ubuntu-2404-headless
instance_types:
- minCpuPlatform: Intel Skylake
disks:
- <<: *persistent-disk
diskSizeGb: 300
# 40 CPUs, 256GB RAM
machine_type: n1-custom-40-262144
guestAccelerators:
- acceleratorCount: 4
acceleratorType: nvidia-tesla-v100
- pool_id: '{pool-group}/b-linux-v100-gpu-d2g-4-300gb-standard'
description: Worker for machine learning tasks that require standard VMs
owner: [email protected]
variants:
- pool-group: translations-1
email_on_error: true
provider_id:
by-chain-of-trust:
trusted: fxci-level3-gcp
default: fxci-level1-gcp
config:
lifecycle:
# low inactivity timeout because these workers are very expensive
queueInactivityTimeout: 1800
worker-config:
genericWorker:
config:
# 2592900s is 30 days plus 900 seconds to account for https://github.com/taskcluster/taskcluster/issues/7423
maxTaskRunTime: 2592900
enableInteractive: true
d2gConfig:
enableD2G: true
allowGPUs: true
containerEngine: docker
headlessTasks: true
ed25519SigningKeyLocation: '/etc/generic-worker/ed25519_key'
minCapacity: 0
maxCapacity: 50
implementation: generic-worker/worker-runner-linux-multi
regions: [us-central1, us-west1]
image: ubuntu-2404-headless
instance_types:
- minCpuPlatform: Intel Skylake
disks:
- <<: *persistent-disk
diskSizeGb: 300
# 40 CPUs, 256GB RAM
machine_type: n1-custom-40-262144
scheduling: standard
guestAccelerators:
- acceleratorCount: 4
acceleratorType: nvidia-tesla-v100
- pool_id: '{pool-group}/b-linux-v100-gpu-d2g-4-1tb-standard'
description: Worker for machine learning tasks that require standard VMs
owner: [email protected]
variants:
- pool-group: translations-1
email_on_error: true
provider_id:
by-chain-of-trust:
trusted: fxci-level3-gcp
default: fxci-level1-gcp
config:
lifecycle:
# low inactivity timeout because these workers are very expensive
queueInactivityTimeout: 1800
worker-config:
genericWorker:
config:
# 2592900s is 30 days plus 900 seconds to account for https://github.com/taskcluster/taskcluster/issues/7423
maxTaskRunTime: 2592900
enableInteractive: true
d2gConfig:
enableD2G: true
allowGPUs: true
containerEngine: docker
headlessTasks: true
ed25519SigningKeyLocation: '/etc/generic-worker/ed25519_key'
minCapacity: 0
maxCapacity: 50
implementation: generic-worker/worker-runner-linux-multi
regions: [us-central1, us-west1]
image: ubuntu-2404-headless
instance_types:
- minCpuPlatform: Intel Skylake
disks:
- <<: *persistent-disk
diskSizeGb: 1024
# 40 CPUs, 256GB RAM
machine_type: n1-custom-40-262144
scheduling: standard
guestAccelerators:
- acceleratorCount: 4
acceleratorType: nvidia-tesla-v100
- pool_id: '{pool-group}/b-linux-v100-gpu-d2g-4-1tb'
description: Worker for machine learning and other high GPU tasks
owner: [email protected]
variants:
- pool-group: translations-1
email_on_error: true
provider_id:
by-chain-of-trust:
trusted: fxci-level3-gcp
default: fxci-level1-gcp
config:
lifecycle:
# low inactivity timeout because these workers are very expensive
queueInactivityTimeout: 1800
worker-config:
genericWorker:
config:
# 2592900s is 30 days plus 900 seconds to account for https://github.com/taskcluster/taskcluster/issues/7423
maxTaskRunTime: 2592900
enableInteractive: true
d2gConfig:
enableD2G: true
allowGPUs: true
containerEngine: docker
headlessTasks: true
ed25519SigningKeyLocation: '/etc/generic-worker/ed25519_key'
minCapacity: 0
# We use 4 GPUs per instance across 4 regions with a limit of 128 GPUs
# per region at any given time. 4 regions * 128 GPUs = 512 total GPUs.
# 512 GPUs / 4 per instance = 128 instances possibly running at once.
maxCapacity: 128
implementation: generic-worker/worker-runner-linux-multi
regions: [us-central1, us-west1, us-east1]
image: ubuntu-2404-headless
instance_types:
- minCpuPlatform: Intel Skylake
disks:
- <<: *persistent-disk
diskSizeGb: 1024
# 40 CPUs, 256GB RAM
machine_type: n1-custom-40-262144
guestAccelerators:
- acceleratorCount: 4
acceleratorType: nvidia-tesla-v100
- pool_id: '{pool-group}/b-linux-v100-gpu-d2g-4-2tb'
description: Worker for machine learning and other high GPU tasks
owner: [email protected]
variants:
- pool-group: translations-1
email_on_error: true
provider_id:
by-chain-of-trust:
trusted: fxci-level3-gcp
default: fxci-level1-gcp
config:
lifecycle:
# low inactivity timeout because these workers are very expensive
queueInactivityTimeout: 1800
worker-config:
genericWorker:
config:
# 2592900s is 30 days plus 900 seconds to account for https://github.com/taskcluster/taskcluster/issues/7423
maxTaskRunTime: 2592900
enableInteractive: true
d2gConfig:
enableD2G: true
allowGPUs: true
containerEngine: docker
headlessTasks: true
ed25519SigningKeyLocation: '/etc/generic-worker/ed25519_key'
minCapacity: 0
# We use 4 GPUs per instance across 4 regions with a limit of 128 GPUs
# per region at any given time. 4 regions * 128 GPUs = 512 total GPUs.
# 512 GPUs / 4 per instance = 128 instances possibly running at once.
maxCapacity: 128
implementation: generic-worker/worker-runner-linux-multi
regions: [us-central1, us-west1, us-east1]
image: ubuntu-2404-headless
instance_types:
- minCpuPlatform: Intel Skylake
disks:
- <<: *persistent-disk
diskSizeGb: 2048
# 40 CPUs, 256GB RAM
machine_type: n1-custom-40-262144
guestAccelerators:
- acceleratorCount: 4
acceleratorType: nvidia-tesla-v100
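
For context on what the d2gConfig settings above enable: tasks aimed at these pools can keep a docker-worker style payload, which D2G translates into a generic-worker Docker invocation, with GPU access exposed to the container because allowGPUs is true. Below is a minimal, hypothetical task fragment illustrating that shape; the pool-group and worker type come from this change, while the image, command, and other values are placeholders rather than anything taken from the translations repository.

# Hypothetical task fragment (illustrative only, not part of this commit)
provisionerId: translations-1                       # pool-group from the variants above
workerType: b-linux-v100-gpu-d2g-4-300gb-standard   # one of the pools added here
payload:
  # docker-worker style payload; D2G rewrites this for generic-worker
  image: example.org/translations/training:latest   # placeholder image
  command:
    - bash
    - -c
    - nvidia-smi && ./train.sh                      # placeholder command
  maxRunTime: 86400                                 # must stay below the pool's maxTaskRunTime (2592900)
  env:
    GPUS: '4'                                       # matches acceleratorCount: 4 in the pool config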
