-
Notifications
You must be signed in to change notification settings - Fork 15
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
bug 1904569: add remaining d2g-enabled worker pools for translations GPU workers

And also, use the newly minted dated image for them, which appears to be working well in mozilla/translations#700.
- Loading branch information
Showing 1 changed file with 174 additions and 2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2080,7 +2080,7 @@ pools: | |
maxCapacity: 128 | ||
implementation: generic-worker/worker-runner-linux-multi | ||
regions: [us-central1, us-west1, us-east1] | ||
image: ubuntu-2404-headless-alpha | ||
image: ubuntu-2404-headless | ||
instance_types: | ||
- minCpuPlatform: Intel Skylake | ||
disks: | ||
|
@@ -2123,14 +2123,186 @@ pools: | |
maxCapacity: 128 | ||
implementation: generic-worker/worker-runner-linux-multi | ||
regions: [us-central1, us-west1, us-east1] | ||
image: ubuntu-2404-headless-alpha | ||
image: ubuntu-2404-headless | ||
instance_types: | ||
- minCpuPlatform: Intel Skylake | ||
disks: | ||
- <<: *persistent-disk | ||
diskSizeGb: 300 | ||
# 40 CPUs, 256GB RAM | ||
machine_type: n1-custom-40-262144 | ||
guestAccelerators: | ||
- acceleratorCount: 4 | ||
acceleratorType: nvidia-tesla-v100 | ||
- pool_id: '{pool-group}/b-linux-v100-gpu-d2g-4-300gb-standard' | ||
description: Worker for machine learning tasks that require standard VMs | ||
owner: [email protected] | ||
variants: | ||
- pool-group: translations-1 | ||
email_on_error: true | ||
provider_id: | ||
by-chain-of-trust: | ||
trusted: fxci-level3-gcp | ||
default: fxci-level1-gcp | ||
config: | ||
lifecycle: | ||
# low inactivity timeout because these workers are very expensive | ||
queueInactivityTimeout: 1800 | ||
worker-config: | ||
genericWorker: | ||
config: | ||
# 2592900s is 30 days plus 900 seconds to account for https://github.com/taskcluster/taskcluster/issues/7423 | ||
maxTaskRunTime: 2592900 | ||
enableInteractive: true | ||
d2gConfig: | ||
enableD2G: true | ||
allowGPUs: true | ||
containerEngine: docker | ||
headlessTasks: true | ||
ed25519SigningKeyLocation: '/etc/generic-worker/ed25519_key' | ||
minCapacity: 0 | ||
maxCapacity: 50 | ||
implementation: generic-worker/worker-runner-linux-multi | ||
regions: [us-central1, us-west1] | ||
image: ubuntu-2404-headless | ||
instance_types: | ||
- minCpuPlatform: Intel Skylake | ||
disks: | ||
- <<: *persistent-disk | ||
diskSizeGb: 300 | ||
# 40 CPUs, 256GB RAM | ||
machine_type: n1-custom-40-262144 | ||
scheduling: standard | ||
guestAccelerators: | ||
- acceleratorCount: 4 | ||
acceleratorType: nvidia-tesla-v100 | ||
- pool_id: '{pool-group}/b-linux-v100-gpu-d2g-4-1tb-standard' | ||
description: Worker for machine learning tasks that require standard VMs | ||
owner: [email protected] | ||
variants: | ||
- pool-group: translations-1 | ||
email_on_error: true | ||
provider_id: | ||
by-chain-of-trust: | ||
trusted: fxci-level3-gcp | ||
default: fxci-level1-gcp | ||
config: | ||
lifecycle: | ||
# low inactivity timeout because these workers are very expensive | ||
queueInactivityTimeout: 1800 | ||
worker-config: | ||
genericWorker: | ||
config: | ||
# 2592900s is 30 days plus 900 seconds to account for https://github.com/taskcluster/taskcluster/issues/7423 | ||
maxTaskRunTime: 2592900 | ||
enableInteractive: true | ||
d2gConfig: | ||
enableD2G: true | ||
allowGPUs: true | ||
containerEngine: docker | ||
headlessTasks: true | ||
ed25519SigningKeyLocation: '/etc/generic-worker/ed25519_key' | ||
minCapacity: 0 | ||
maxCapacity: 50 | ||
implementation: generic-worker/worker-runner-linux-multi | ||
regions: [us-central1, us-west1] | ||
image: ubuntu-2404-headless | ||
instance_types: | ||
- minCpuPlatform: Intel Skylake | ||
disks: | ||
- <<: *persistent-disk | ||
diskSizeGb: 1024 | ||
# 40 CPUs, 256GB RAM | ||
machine_type: n1-custom-40-262144 | ||
scheduling: standard | ||
guestAccelerators: | ||
- acceleratorCount: 4 | ||
acceleratorType: nvidia-tesla-v100 | ||
- pool_id: '{pool-group}/b-linux-v100-gpu-d2g-4-1tb' | ||
description: Worker for machine learning and other high GPU tasks | ||
owner: [email protected] | ||
variants: | ||
- pool-group: translations-1 | ||
email_on_error: true | ||
provider_id: | ||
by-chain-of-trust: | ||
trusted: fxci-level3-gcp | ||
default: fxci-level1-gcp | ||
config: | ||
lifecycle: | ||
# low inactivity timeout because these workers are very expensive | ||
queueInactivityTimeout: 1800 | ||
worker-config: | ||
genericWorker: | ||
config: | ||
# 2592900s is 30 days plus 900 seconds to account for https://github.com/taskcluster/taskcluster/issues/7423 | ||
maxTaskRunTime: 2592900 | ||
enableInteractive: true | ||
d2gConfig: | ||
enableD2G: true | ||
allowGPUs: true | ||
containerEngine: docker | ||
headlessTasks: true | ||
ed25519SigningKeyLocation: '/etc/generic-worker/ed25519_key' | ||
minCapacity: 0 | ||
# We use 4 GPUs per instance across 4 regions with a limit of 128 | ||
# per region at any given time. 4 regions * 4 GPUs = 512 total GPUs | ||
# 512 GPUs / 4 per instance = 128 instances possibly running at once. | ||
maxCapacity: 128 | ||
implementation: generic-worker/worker-runner-linux-multi | ||
regions: [us-central1, us-west1, us-east1] | ||
image: ubuntu-2404-headless | ||
instance_types: | ||
- minCpuPlatform: Intel Skylake | ||
disks: | ||
- <<: *persistent-disk | ||
diskSizeGb: 1024 | ||
# 40 CPUs, 256GB RAM | ||
machine_type: n1-custom-40-262144 | ||
guestAccelerators: | ||
- acceleratorCount: 4 | ||
acceleratorType: nvidia-tesla-v100 | ||
- pool_id: '{pool-group}/b-linux-v100-gpu-d2g-4-2tb' | ||
description: Worker for machine learning and other high GPU tasks | ||
owner: [email protected] | ||
variants: | ||
- pool-group: translations-1 | ||
email_on_error: true | ||
provider_id: | ||
by-chain-of-trust: | ||
trusted: fxci-level3-gcp | ||
default: fxci-level1-gcp | ||
config: | ||
lifecycle: | ||
# low inactivity timeout because these workers are very expensive | ||
queueInactivityTimeout: 1800 | ||
worker-config: | ||
genericWorker: | ||
config: | ||
# 2592900s is 30 days plus 900 seconds to account for https://github.com/taskcluster/taskcluster/issues/7423 | ||
maxTaskRunTime: 2592900 | ||
enableInteractive: true | ||
d2gConfig: | ||
enableD2G: true | ||
allowGPUs: true | ||
containerEngine: docker | ||
headlessTasks: true | ||
ed25519SigningKeyLocation: '/etc/generic-worker/ed25519_key' | ||
minCapacity: 0 | ||
# We use 4 GPUs per instance across 4 regions with a limit of 128 | ||
# per region at any given time. 4 regions * 4 GPUs = 512 total GPUs | ||
# 512 GPUs / 4 per instance = 128 instances possibly running at once. | ||
maxCapacity: 128 | ||
implementation: generic-worker/worker-runner-linux-multi | ||
regions: [us-central1, us-west1, us-east1] | ||
image: ubuntu-2404-headless | ||
instance_types: | ||
- minCpuPlatform: Intel Skylake | ||
disks: | ||
- <<: *persistent-disk | ||
diskSizeGb: 2048 | ||
# 40 CPUs, 256GB RAM | ||
machine_type: n1-custom-40-262144 | ||
guestAccelerators: | ||
- acceleratorCount: 4 | ||
acceleratorType: nvidia-tesla-v100 | ||
|