From 26cb18614cc7ff91e438907cc67ee16c6d9e11de Mon Sep 17 00:00:00 2001
From: Alexander Zhipa
Date: Tue, 26 Nov 2024 15:37:57 -0500
Subject: [PATCH] feat: add aws_g6e instances (#969)

Co-authored-by: Alexander Jipa
---
Reviewer notes (free-form commentary placed between the `---` marker and the
diff; it is not part of the commit message or of the change itself):

  * torchx/specs/named_resources_aws.py gains eight zero-argument factory
    functions, aws_g6e_xlarge through aws_g6e_48xlarge. Each returns a
    Resource built from cpu, gpu, memMB and a capabilities dict that maps
    K8S_ITYPE to the matching "g6e.<size>" string.
  * Only aws_g6e_24xlarge and aws_g6e_48xlarge also pass a devices dict
    ({EFA_DEVICE: 2} and {EFA_DEVICE: 4}); the other six pass no devices
    argument.
  * The second hunk registers each factory under an "aws_g6e.<size>" key,
    between the existing aws_g5 and aws_trn1 entries of the same dict.
  * The test file imports the eight factories and adds test_aws_g6e, which
    asserts cpu, gpu and memMB for all eight sizes and devices for the
    24xlarge and 48xlarge sizes only.
  * In both files the 16xlarge entry is listed before 12xlarge, i.e. the five
    gpu=1 sizes come first, followed by the multi-GPU sizes.

  NOTE(review): the cpu / gpu / memory / EFA figures presumably mirror AWS's
  published g6e specifications - TODO confirm against the AWS instance-type
  documentation before merging.
  NOTE(review): confirm that leaving EFA_DEVICE off the 8xlarge, 12xlarge and
  16xlarge sizes is intentional; the new test does not assert `devices` for
  them, so either choice would pass.
  NOTE(review): memMB is written as `<n> * GiB`; GiB is defined outside this
  patch - presumably it is expressed in MB so the product is megabytes;
  verify against its definition.

 torchx/specs/named_resources_aws.py           | 82 +++++++++++++++++++
 torchx/specs/test/named_resources_aws_test.py | 52 ++++++++++++
 2 files changed, 134 insertions(+)

diff --git a/torchx/specs/named_resources_aws.py b/torchx/specs/named_resources_aws.py
index cbd69988b..555513a06 100644
--- a/torchx/specs/named_resources_aws.py
+++ b/torchx/specs/named_resources_aws.py
@@ -254,6 +254,80 @@ def aws_g5_48xlarge() -> Resource:
     )
 
 
+def aws_g6e_xlarge() -> Resource:
+    return Resource(
+        cpu=4,
+        gpu=1,
+        memMB=32 * GiB,
+        capabilities={K8S_ITYPE: "g6e.xlarge"},
+    )
+
+
+def aws_g6e_2xlarge() -> Resource:
+    return Resource(
+        cpu=8,
+        gpu=1,
+        memMB=64 * GiB,
+        capabilities={K8S_ITYPE: "g6e.2xlarge"},
+    )
+
+
+def aws_g6e_4xlarge() -> Resource:
+    return Resource(
+        cpu=16,
+        gpu=1,
+        memMB=128 * GiB,
+        capabilities={K8S_ITYPE: "g6e.4xlarge"},
+    )
+
+
+def aws_g6e_8xlarge() -> Resource:
+    return Resource(
+        cpu=32,
+        gpu=1,
+        memMB=256 * GiB,
+        capabilities={K8S_ITYPE: "g6e.8xlarge"},
+    )
+
+
+def aws_g6e_16xlarge() -> Resource:
+    return Resource(
+        cpu=64,
+        gpu=1,
+        memMB=512 * GiB,
+        capabilities={K8S_ITYPE: "g6e.16xlarge"},
+    )
+
+
+def aws_g6e_12xlarge() -> Resource:
+    return Resource(
+        cpu=48,
+        gpu=4,
+        memMB=384 * GiB,
+        capabilities={K8S_ITYPE: "g6e.12xlarge"},
+    )
+
+
+def aws_g6e_24xlarge() -> Resource:
+    return Resource(
+        cpu=96,
+        gpu=4,
+        memMB=768 * GiB,
+        capabilities={K8S_ITYPE: "g6e.24xlarge"},
+        devices={EFA_DEVICE: 2},
+    )
+
+
+def aws_g6e_48xlarge() -> Resource:
+    return Resource(
+        cpu=192,
+        gpu=8,
+        memMB=1536 * GiB,
+        capabilities={K8S_ITYPE: "g6e.48xlarge"},
+        devices={EFA_DEVICE: 4},
+    )
+
+
 def aws_trn1_2xlarge() -> Resource:
     return Resource(
         cpu=8,
@@ -299,6 +373,14 @@ def aws_trn1_32xlarge() -> Resource:
     "aws_g5.12xlarge": aws_g5_12xlarge,
     "aws_g5.24xlarge": aws_g5_24xlarge,
     "aws_g5.48xlarge": aws_g5_48xlarge,
+    "aws_g6e.xlarge": aws_g6e_xlarge,
+    "aws_g6e.2xlarge": aws_g6e_2xlarge,
+    "aws_g6e.4xlarge": aws_g6e_4xlarge,
+    "aws_g6e.8xlarge": aws_g6e_8xlarge,
+    "aws_g6e.16xlarge": aws_g6e_16xlarge,
+    "aws_g6e.12xlarge": aws_g6e_12xlarge,
+    "aws_g6e.24xlarge": aws_g6e_24xlarge,
+    "aws_g6e.48xlarge": aws_g6e_48xlarge,
     "aws_trn1.2xlarge": aws_trn1_2xlarge,
     "aws_trn1.32xlarge": aws_trn1_32xlarge,
 }
diff --git a/torchx/specs/test/named_resources_aws_test.py b/torchx/specs/test/named_resources_aws_test.py
index 064483f8f..3480fa463 100644
--- a/torchx/specs/test/named_resources_aws_test.py
+++ b/torchx/specs/test/named_resources_aws_test.py
@@ -23,6 +23,14 @@
     aws_g5_4xlarge,
     aws_g5_8xlarge,
     aws_g5_xlarge,
+    aws_g6e_12xlarge,
+    aws_g6e_16xlarge,
+    aws_g6e_24xlarge,
+    aws_g6e_2xlarge,
+    aws_g6e_48xlarge,
+    aws_g6e_4xlarge,
+    aws_g6e_8xlarge,
+    aws_g6e_xlarge,
     aws_m5_2xlarge,
     aws_p3_16xlarge,
     aws_p3_2xlarge,
@@ -87,6 +95,50 @@ def test_aws_p5(self) -> None:
         self.assertEqual(2048 * GiB, p5.memMB)
         self.assertEqual({EFA_DEVICE: 32}, p5.devices)
 
+    def test_aws_g6e(self) -> None:
+        g6e = aws_g6e_xlarge()
+        g6e_2 = aws_g6e_2xlarge()
+        g6e_4 = aws_g6e_4xlarge()
+        g6e_8 = aws_g6e_8xlarge()
+        g6e_16 = aws_g6e_16xlarge()
+        g6e_12 = aws_g6e_12xlarge()
+        g6e_24 = aws_g6e_24xlarge()
+        g6e_48 = aws_g6e_48xlarge()
+
+        self.assertEqual(4, g6e.cpu)
+        self.assertEqual(1, g6e.gpu)
+        self.assertEqual(32 * GiB, g6e.memMB)
+
+        self.assertEqual(8, g6e_2.cpu)
+        self.assertEqual(1, g6e_2.gpu)
+        self.assertEqual(64 * GiB, g6e_2.memMB)
+
+        self.assertEqual(16, g6e_4.cpu)
+        self.assertEqual(1, g6e_4.gpu)
+        self.assertEqual(128 * GiB, g6e_4.memMB)
+
+        self.assertEqual(32, g6e_8.cpu)
+        self.assertEqual(1, g6e_8.gpu)
+        self.assertEqual(256 * GiB, g6e_8.memMB)
+
+        self.assertEqual(64, g6e_16.cpu)
+        self.assertEqual(1, g6e_16.gpu)
+        self.assertEqual(512 * GiB, g6e_16.memMB)
+
+        self.assertEqual(48, g6e_12.cpu)
+        self.assertEqual(4, g6e_12.gpu)
+        self.assertEqual(384 * GiB, g6e_12.memMB)
+
+        self.assertEqual(96, g6e_24.cpu)
+        self.assertEqual(4, g6e_24.gpu)
+        self.assertEqual(768 * GiB, g6e_24.memMB)
+        self.assertEqual({EFA_DEVICE: 2}, g6e_24.devices)
+
+        self.assertEqual(192, g6e_48.cpu)
+        self.assertEqual(8, g6e_48.gpu)
+        self.assertEqual(1536 * GiB, g6e_48.memMB)
+        self.assertEqual({EFA_DEVICE: 4}, g6e_48.devices)
+
     def test_aws_g4dn(self) -> None:
         g4d = aws_g4dn_xlarge()
         self.assertEqual(4, g4d.cpu)