Skip to content

Commit

Permalink
feat: account for gpu attributes (#1833)
Browse files Browse the repository at this point in the history
allow wildcards for GPU attributes in deployment messages

Signed-off-by: Artur Troian <[email protected]>
  • Loading branch information
troian authored Jun 7, 2023
1 parent e0fcadd commit 649d42c
Show file tree
Hide file tree
Showing 9 changed files with 313 additions and 57 deletions.
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ module github.com/akash-network/node
go 1.20

require (
github.com/akash-network/akash-api v0.0.16
github.com/akash-network/akash-api v0.0.19
github.com/blang/semver/v4 v4.0.0
github.com/boz/go-lifecycle v0.1.0
github.com/cosmos/cosmos-sdk v0.45.16
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,8 @@ github.com/aead/siphash v1.0.1/go.mod h1:Nywa3cDsYNNK3gaciGTWPwHt0wlpNV15vwmswBA
github.com/afex/hystrix-go v0.0.0-20180502004556-fa1af6a1f4f5/go.mod h1:SkGFH1ia65gfNATL8TAiHDNxPzPdmEL5uirI2Uyuz6c=
github.com/ajg/form v1.5.1/go.mod h1:uL1WgH+h2mgNtvBq0339dVnzXdBETtL2LeUXaIv25UY=
github.com/ajstarks/svgo v0.0.0-20180226025133-644b8db467af/go.mod h1:K08gAheRH3/J6wwsYMMT4xOr94bZjxIelGM0+d/wbFw=
github.com/akash-network/akash-api v0.0.16 h1:6XlJ6a0jROL8JwxXjOVtQbjoOVrWtJL2QoL+6l/f44M=
github.com/akash-network/akash-api v0.0.16/go.mod h1:e1QqkOFwxHKf88I3U5bPOmREdfHHHX2bY27ZZOFnTX4=
github.com/akash-network/akash-api v0.0.19 h1:D99DqD5ocBkepA5CmXJzdB1Jy0zA00pefdRDYdIQyHM=
github.com/akash-network/akash-api v0.0.19/go.mod h1:9/uYusyBcZecBQCgZWUbXRu0i1tyxj4/ze45XB2oLIU=
github.com/akash-network/cometbft v0.34.27-akash h1:V1dApDOr8Ee7BJzYyQ7Z9VBtrAul4+baMeA6C49dje0=
github.com/akash-network/cometbft v0.34.27-akash/go.mod h1:BcCbhKv7ieM0KEddnYXvQZR+pZykTKReJJYf7YC7qhw=
github.com/akash-network/ledger-go v0.14.3 h1:LCEFkTfgGA2xFMN2CtiKvXKE7dh0QSM77PJHCpSkaAo=
Expand Down
7 changes: 6 additions & 1 deletion sdl/_testdata/simple-gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,19 @@ profiles:
units: "100m"
gpu:
units: 1
attributes:
vendor:
nvidia:
- model: a100
memory:
size: "128Mi"
storage:
size: "1Gi"
- size: "1Gi"
placement:
westcoast:
attributes:
region: us-west
blalbla: foo
signedBy:
anyOf:
- 1
Expand Down
4 changes: 4 additions & 0 deletions sdl/full_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@ profiles:
arch: amd64
gpu:
units: 1
attributes:
vendor:
nvidia:
- model: a100
memory:
size: 16Mi
storage:
Expand Down
109 changes: 85 additions & 24 deletions sdl/gpu.go
Original file line number Diff line number Diff line change
@@ -1,49 +1,110 @@
package sdl

import (
"sort"
"fmt"

"gopkg.in/yaml.v3"

types "github.com/akash-network/akash-api/go/node/types/v1beta3"
)

type v2GPUAttributes types.Attributes
// v2GPUNvidia is a single entry of the "nvidia" vendor list in the GPU
// attributes section of an SDL document.
type v2GPUNvidia struct {
// Model is the GPU model name, read from the "model" key.
Model string `yaml:"model"`
// RAM is the optional "ram" key; String only appends it when it is non-nil.
RAM *memoryQuantity `yaml:"ram,omitempty"`
}

// String renders the GPU entry as the trailing part of an attribute key:
// the model name alone, or "<model>/<ram>" when RAM is set. The RAM part is
// produced by memoryQuantity.StringWithSuffix("Gi").
func (sdl *v2GPUNvidia) String() string {
	// Model is already a string, so it is used directly instead of being
	// passed through fmt.Sprintf("%s", ...).
	key := sdl.Model
	if sdl.RAM != nil {
		key += "/" + sdl.RAM.StringWithSuffix("Gi")
	}

	return key
}

// v2GPUsNvidia is the list of NVIDIA GPU entries found under the "nvidia" key.
type v2GPUsNvidia []v2GPUNvidia

// gpuVendor groups the requested GPU models by vendor. Only the "nvidia" key
// is declared here.
type gpuVendor struct {
Nvidia v2GPUsNvidia `yaml:"nvidia,omitempty"`
}

// v2GPUAttributes is the "attributes" section of a GPU resource.
type v2GPUAttributes struct {
// attr is the flattened attribute list that UnmarshalYAML builds from Vendor;
// it is unexported and therefore not read from YAML directly.
attr types.Attributes
// Vendor is the parsed "vendor" key.
Vendor *gpuVendor `yaml:"vendor,omitempty"`
}

// v2ResourceGPU is the GPU section of a compute resource: the number of
// requested units and the attributes describing acceptable GPUs.
type v2ResourceGPU struct {
Units gpuQuantity `yaml:"units"`
Attributes v2GPUAttributes `yaml:"attributes,omitempty"`
}

// UnmarshalYAML decodes the GPU section of a compute resource.
//
// Only the "units" and "attributes" keys are accepted; any other key yields
// an error. When units is greater than zero, the decoded attributes must
// contain at least one entry. The receiver is overwritten only after the
// whole node has been decoded and validated.
func (sdl *v2ResourceGPU) UnmarshalYAML(node *yaml.Node) error {
	res := v2ResourceGPU{}

	// Content is walked as alternating key/value nodes. Requiring i+1 to be in
	// range keeps a node with an odd number of children (e.g. a sequence such
	// as `[units]`) from indexing past the end of the slice and panicking.
	for i := 0; i+1 < len(node.Content); i += 2 {
		key, value := node.Content[i], node.Content[i+1]

		switch key.Value {
		case "units":
			if err := value.Decode(&res.Units); err != nil {
				return err
			}
		case "attributes":
			if err := value.Decode(&res.Attributes); err != nil {
				return err
			}
		default:
			return fmt.Errorf("sdl: unsupported field (%s) for GPU resource", key.Value)
		}
	}

	if res.Units > 0 && len(res.Attributes.attr) == 0 {
		return fmt.Errorf("sdl: GPU attributes must be present if units > 0")
	}

	*sdl = res

	return nil
}

func (sdl *v2GPUAttributes) UnmarshalYAML(node *yaml.Node) error {
var attr v2GPUAttributes
var res v2GPUAttributes

for i := 0; i+1 < len(node.Content); i += 2 {
var value string
if err := node.Content[i+1].Decode(&value); err != nil {
return err
for i := 0; i < len(node.Content); i += 2 {
switch node.Content[i].Value {
case "vendor":
if err := node.Content[i+1].Decode(&res.Vendor); err != nil {
return err
}
default:
return fmt.Errorf("sdl: unsupported attribute (%s) for GPU resource", node.Content[i].Value)
}
// switch node.Content[i].Value {
// case "arch":
// if err := node.Content[i+1].Decode(&value); err != nil {
// return err
// }
// default:
// return errors.Errorf("unsupported cpu attribute \"%s\"", node.Content[i].Value)
// }

attr = append(attr, types.Attribute{
Key: node.Content[i].Value,
Value: value,
}

if res.Vendor == nil {
return fmt.Errorf("sdl: invalid GPU attributes. at least one vendor must be set")
}

res.attr = make(types.Attributes, 0, len(res.Vendor.Nvidia))

for _, model := range res.Vendor.Nvidia {
res.attr = append(res.attr, types.Attribute{
Key: fmt.Sprintf("vendor/nvidia/model/%s", model.String()),
Value: "true",
})
}

// keys are unique in attributes parsed from sdl so don't need to use sort.SliceStable
sort.Slice(attr, func(i, j int) bool {
return attr[i].Key < attr[j].Key
})
if len(res.attr) == 0 {
res.attr = append(res.attr, types.Attribute{
Key: "vendor/nvidia/model/*",
Value: "true",
})
}
res.attr.Sort()

if err := res.attr.Validate(); err != nil {
return fmt.Errorf("sdl: invalid GPU attributes: %w", err)
}

*sdl = attr
*sdl = res

return nil
}
155 changes: 155 additions & 0 deletions sdl/gpu_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
package sdl

import (
"testing"

"github.com/stretchr/testify/require"
"gopkg.in/yaml.v3"
)

// TestV2ResourceGPU_EmptyVendor expects decoding to fail when the "vendor"
// key is present but carries no value.
func TestV2ResourceGPU_EmptyVendor(t *testing.T) {
var stream = `
units: 1
attributes:
vendor:
`
var p v2ResourceGPU

err := yaml.Unmarshal([]byte(stream), &p)
require.Error(t, err)
}

// TestV2ResourceGPU_Wildcard expects an "nvidia" key without any model entries
// to produce the single wildcard attribute "vendor/nvidia/model/*".
func TestV2ResourceGPU_Wildcard(t *testing.T) {
var stream = `
units: 1
attributes:
vendor:
nvidia:
`
var p v2ResourceGPU

err := yaml.Unmarshal([]byte(stream), &p)
require.NoError(t, err)
require.Equal(t, gpuQuantity(1), p.Units)
require.Equal(t, 1, len(p.Attributes.attr))
require.Equal(t, "vendor/nvidia/model/*", p.Attributes.attr[0].Key)
require.Equal(t, "true", p.Attributes.attr[0].Value)
}

// TestV2ResourceGPU_SingleModel expects one model entry without RAM to produce
// exactly one attribute keyed by the model name.
func TestV2ResourceGPU_SingleModel(t *testing.T) {
var stream = `
units: 1
attributes:
vendor:
nvidia:
- model: a100
`
var p v2ResourceGPU

err := yaml.Unmarshal([]byte(stream), &p)
require.NoError(t, err)
require.Equal(t, gpuQuantity(1), p.Units)
require.Equal(t, 1, len(p.Attributes.attr))
require.Equal(t, "vendor/nvidia/model/a100", p.Attributes.attr[0].Key)
require.Equal(t, "true", p.Attributes.attr[0].Value)
}

// TestV2ResourceGPU_SingleModelWithRAM expects the "ram" value to be appended
// to the attribute key after the model name.
func TestV2ResourceGPU_SingleModelWithRAM(t *testing.T) {
var stream = `
units: 1
attributes:
vendor:
nvidia:
- model: a100
ram: 80Gi
`
var p v2ResourceGPU

err := yaml.Unmarshal([]byte(stream), &p)
require.NoError(t, err)
require.Equal(t, gpuQuantity(1), p.Units)
require.Equal(t, 1, len(p.Attributes.attr))
require.Equal(t, "vendor/nvidia/model/a100/80Gi", p.Attributes.attr[0].Key)
require.Equal(t, "true", p.Attributes.attr[0].Value)
}

// TestV2ResourceGPU_InvalidRAMUnit expects decoding to fail for a "ram" value
// of "80G".
func TestV2ResourceGPU_InvalidRAMUnit(t *testing.T) {
var stream = `
units: 1
attributes:
vendor:
nvidia:
- model: a100
ram: 80G
`
var p v2ResourceGPU

err := yaml.Unmarshal([]byte(stream), &p)
require.Error(t, err)
}

// TestV2ResourceGPU_MultipleModels expects two entries of the same model with
// different RAM sizes to yield two attributes, with the 40Gi key ordered
// before the 80Gi key.
func TestV2ResourceGPU_MultipleModels(t *testing.T) {
var stream = `
units: 1
attributes:
vendor:
nvidia:
- model: a100
ram: 80Gi
- model: a100
ram: 40Gi
`
var p v2ResourceGPU

err := yaml.Unmarshal([]byte(stream), &p)
require.NoError(t, err)
require.Equal(t, gpuQuantity(1), p.Units)
require.Equal(t, 2, len(p.Attributes.attr))
require.Equal(t, "vendor/nvidia/model/a100/40Gi", p.Attributes.attr[0].Key)
require.Equal(t, "true", p.Attributes.attr[0].Value)
require.Equal(t, "vendor/nvidia/model/a100/80Gi", p.Attributes.attr[1].Key)
require.Equal(t, "true", p.Attributes.attr[1].Value)
}

// TestV2ResourceGPU_MultipleModels2 expects an entry with RAM and an entry
// without RAM for the same model to yield two attributes, with the bare model
// key ordered first.
func TestV2ResourceGPU_MultipleModels2(t *testing.T) {
var stream = `
units: 1
attributes:
vendor:
nvidia:
- model: a100
ram: 80Gi
- model: a100
`
var p v2ResourceGPU

err := yaml.Unmarshal([]byte(stream), &p)
require.NoError(t, err)
require.Equal(t, gpuQuantity(1), p.Units)
require.Equal(t, 2, len(p.Attributes.attr))
require.Equal(t, "vendor/nvidia/model/a100", p.Attributes.attr[0].Key)
require.Equal(t, "true", p.Attributes.attr[0].Value)
require.Equal(t, "vendor/nvidia/model/a100/80Gi", p.Attributes.attr[1].Key)
require.Equal(t, "true", p.Attributes.attr[1].Value)
}

// TestV2ResourceGPU_MultipleModels3 expects two different models to yield two
// attributes in key order (a40 before a6000), regardless of the order in which
// they were listed.
func TestV2ResourceGPU_MultipleModels3(t *testing.T) {
var stream = `
units: 1
attributes:
vendor:
nvidia:
- model: a6000
- model: a40
`
var p v2ResourceGPU

err := yaml.Unmarshal([]byte(stream), &p)
require.NoError(t, err)
require.Equal(t, gpuQuantity(1), p.Units)
require.Equal(t, 2, len(p.Attributes.attr))
require.Equal(t, "vendor/nvidia/model/a40", p.Attributes.attr[0].Key)
require.Equal(t, "true", p.Attributes.attr[0].Value)
require.Equal(t, "vendor/nvidia/model/a6000", p.Attributes.attr[1].Key)
require.Equal(t, "true", p.Attributes.attr[1].Value)
}
2 changes: 1 addition & 1 deletion sdl/resources.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ func (sdl *v2ComputeResources) toDGroupResourceUnits() types.ResourceUnits {
if sdl.GPU != nil {
units.GPU = &types.GPU{
Units: types.NewResourceValue(uint64(sdl.GPU.Units)),
Attributes: types.Attributes(sdl.GPU.Attributes),
Attributes: sdl.GPU.Attributes.attr,
}
} else {
units.GPU = &types.GPU{
Expand Down
Loading

0 comments on commit 649d42c

Please sign in to comment.