Skip to content

Commit

Permalink
chore: add resource manager name/metadata to resourcepoolv1 proto
Browse files Browse the repository at this point in the history
  • Loading branch information
NicholasBlaskey committed Mar 4, 2024
1 parent a5b425a commit cf08cbf
Show file tree
Hide file tree
Showing 8 changed files with 489 additions and 245 deletions.
8 changes: 8 additions & 0 deletions harness/determined/common/api/bindings.py

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions master/internal/rm/agentrm/agent_resource_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -695,6 +695,8 @@ func (a *ResourceManager) createResourcePoolSummary(
Details: &resourcepoolv1.ResourcePoolDetail{},
SlotType: slotType.Proto(),
Accelerator: accelerator,
ResourceManagerName: a.config.Name,
ResourceManagerMetadata: a.config.Metadata,
}
if pool.Provider != nil {
resp.MinAgents = int32(pool.Provider.MinInstances)
Expand Down
81 changes: 81 additions & 0 deletions master/internal/rm/agentrm/agent_resource_manager_intg_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
package agentrm

import (
"encoding/json"
"testing"

"github.com/determined-ai/determined/master/internal/db"
Expand All @@ -15,6 +16,8 @@ import (
"github.com/determined-ai/determined/master/internal/sproto"
"github.com/determined-ai/determined/master/internal/user"
"github.com/determined-ai/determined/master/pkg/syncx/queue"
"github.com/determined-ai/determined/proto/pkg/jobv1"
"github.com/determined-ai/determined/proto/pkg/resourcepoolv1"
)

func TestAgentRMRoutingTaskRelatedMessages(t *testing.T) {
Expand Down Expand Up @@ -163,3 +166,81 @@ func TestAgentRMRoutingTaskRelatedMessages(t *testing.T) {
_, err = agentRM.fetchAvgQueuedTime("non-existed-pool")
assert.NilError(t, err, "error fetch average queued time for non-existed-pool")
}

// TestGetResourcePools builds an agent ResourceManager with two pools
// ("cpu-pool" as the default aux pool, "gpu-pool" as the default compute pool)
// and checks that GetResourcePools returns summaries matching the expected
// protos, including the resource manager name and metadata taken from the
// agent resource manager config. The comparison is done on the indented JSON
// encoding of both sides.
func TestGetResourcePools(t *testing.T) {
	const (
		rmName      = "testname"
		auxPool     = "cpu-pool"
		computePool = "gpu-pool"
	)
	rmMetadata := map[string]string{"x": "y*y"}

	// Agent RM settings whose name/metadata must surface in every pool summary.
	agentCfg := &config.AgentResourceManagerConfig{
		Name:     rmName,
		Metadata: rmMetadata,
		Scheduler: &config.SchedulerConfig{
			FairShare:     &config.FairShareSchedulerConfig{},
			FittingPolicy: best,
		},
		DefaultAuxResourcePool:     auxPool,
		DefaultComputeResourcePool: computePool,
	}
	resourceCfg := &config.ResourceConfig{
		RootManagerInternal: &config.ResourceManagerConfig{AgentRM: agentCfg},
		RootPoolsInternal: []config.ResourcePoolConfig{
			{PoolName: auxPool},
			{PoolName: computePool},
		},
	}

	// One mock agent per pool: zero slots for the aux pool, four for compute.
	pools := map[string]*resourcePool{
		auxPool: setupResourcePool(
			t, nil, &config.ResourcePoolConfig{PoolName: auxPool},
			nil, nil, []*MockAgent{{ID: "agent1", Slots: 0}},
		),
		computePool: setupResourcePool(
			t, nil, &config.ResourcePoolConfig{PoolName: computePool},
			nil, nil, []*MockAgent{{ID: "agent2", Slots: 4}},
		),
	}

	rm := &ResourceManager{
		config:       resourceCfg.ResourceManagers()[0].ResourceManager.AgentRM,
		poolsConfig:  resourceCfg.ResourceManagers()[0].ResourcePools,
		pools:        pools,
		agentUpdates: queue.New[agentUpdatedEvent](),
	}

	resp, err := rm.GetResourcePools()
	require.NoError(t, err)
	got, err := json.MarshalIndent(resp.ResourcePools, "", " ")
	require.NoError(t, err)

	// wantPool returns the expected summary for one pool; the two pools differ
	// only in their name and in which default-pool flag is set.
	wantPool := func(name string, isAux, isCompute bool) *resourcepoolv1.ResourcePool {
		return &resourcepoolv1.ResourcePool{
			Name:                    name,
			Type:                    resourcepoolv1.ResourcePoolType_RESOURCE_POOL_TYPE_STATIC,
			DefaultAuxPool:          isAux,
			DefaultComputePool:      isCompute,
			SlotsPerAgent:           -1,
			SchedulerType:           resourcepoolv1.SchedulerType_SCHEDULER_TYPE_FAIR_SHARE,
			SchedulerFittingPolicy:  resourcepoolv1.FittingPolicy_FITTING_POLICY_BEST,
			Location:                "on-prem",
			Details:                 &resourcepoolv1.ResourcePoolDetail{},
			Stats:                   &jobv1.QueueStats{},
			ResourceManagerName:     rmName,
			ResourceManagerMetadata: rmMetadata,
		}
	}
	want, err := json.MarshalIndent([]*resourcepoolv1.ResourcePool{
		wantPool(auxPool, true, false),
		wantPool(computePool, false, true),
	}, "", " ")
	require.NoError(t, err)

	require.Equal(t, string(want), string(got))
}
Original file line number Diff line number Diff line change
Expand Up @@ -544,6 +544,8 @@ func (k *ResourceManager) createResourcePoolSummary(
InstanceType: instanceType,
Details: &resourcepoolv1.ResourcePoolDetail{},
Accelerator: accelerator,
ResourceManagerName: k.config.Name,
ResourceManagerMetadata: k.config.Metadata,
}

rp, err := k.poolByName(poolName)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
package kubernetesrm

import (
"encoding/json"
"fmt"
"strconv"
"testing"
Expand All @@ -18,8 +19,12 @@ import (

"github.com/determined-ai/determined/master/internal/config"
"github.com/determined-ai/determined/master/internal/mocks"
"github.com/determined-ai/determined/master/internal/rm/tasklist"
"github.com/determined-ai/determined/master/pkg/device"
"github.com/determined-ai/determined/master/pkg/model"
"github.com/determined-ai/determined/master/pkg/ptrs"
"github.com/determined-ai/determined/proto/pkg/jobv1"
"github.com/determined-ai/determined/proto/pkg/resourcepoolv1"
)

const (
Expand Down Expand Up @@ -439,6 +444,93 @@ func TestGetSlot(t *testing.T) {
}
}

// TestGetResourcePools builds a Kubernetes ResourceManager with two pools
// ("cpu-pool" as the default aux pool, "gpu-pool" as the default compute pool)
// backed by a shared mock pods service, and checks that GetResourcePools
// returns summaries matching the expected protos, including the resource
// manager name and metadata taken from the Kubernetes resource manager config.
// The comparison is done on the indented JSON encoding of both sides.
func TestGetResourcePools(t *testing.T) {
	const (
		rmName      = "testname"
		auxPool     = "cpu-pool"
		computePool = "gpu-pool"
	)
	rmMetadata := map[string]string{"x": "y*y"}

	// Kubernetes RM settings whose name/metadata must surface in every pool summary.
	k8sCfg := &config.KubernetesResourceManagerConfig{
		Name:                       rmName,
		Metadata:                   rmMetadata,
		MaxSlotsPerPod:             ptrs.Ptr(5),
		DefaultAuxResourcePool:     auxPool,
		DefaultComputeResourcePool: computePool,
	}
	resourceCfg := &config.ResourceConfig{
		RootManagerInternal: &config.ResourceManagerConfig{KubernetesRM: k8sCfg},
		RootPoolsInternal: []config.ResourcePoolConfig{
			{PoolName: auxPool},
			{PoolName: computePool},
		},
	}

	// Both pools share one mock pods service created with an empty node map.
	podsSvc := createMockPodsService(make(map[string]*k8sV1.Node), device.CUDA, true)
	pools := map[string]*kubernetesResourcePool{
		auxPool: {
			poolConfig:  &config.ResourcePoolConfig{PoolName: auxPool},
			podsService: podsSvc,
			reqList:     tasklist.New(),
		},
		computePool: {
			poolConfig:  &config.ResourcePoolConfig{PoolName: computePool},
			podsService: podsSvc,
			reqList:     tasklist.New(),
		},
	}

	rm := &ResourceManager{
		config:      resourceCfg.ResourceManagers()[0].ResourceManager.KubernetesRM,
		poolsConfig: resourceCfg.ResourceManagers()[0].ResourcePools,
		taskContainerDefaults: &model.TaskContainerDefaultsConfig{
			Kubernetes: &model.KubernetesTaskContainerDefaults{
				MaxSlotsPerPod: ptrs.Ptr(5),
			},
		},
		pools: pools,
	}

	resp, err := rm.GetResourcePools()
	require.NoError(t, err)
	got, err := json.MarshalIndent(resp.ResourcePools, "", " ")
	require.NoError(t, err)

	// wantPool returns the expected summary for one pool; the two pools differ
	// only in their name and in which default-pool flag is set.
	// NOTE(review): SlotsPerAgent of 5 presumably mirrors the MaxSlotsPerPod
	// value configured above — confirm against createResourcePoolSummary.
	wantPool := func(name string, isAux, isCompute bool) *resourcepoolv1.ResourcePool {
		return &resourcepoolv1.ResourcePool{
			Name:                    name,
			Type:                    resourcepoolv1.ResourcePoolType_RESOURCE_POOL_TYPE_K8S,
			AuxContainerCapacity:    1,
			SlotsPerAgent:           5,
			DefaultAuxPool:          isAux,
			DefaultComputePool:      isCompute,
			SchedulerType:           resourcepoolv1.SchedulerType_SCHEDULER_TYPE_KUBERNETES,
			SchedulerFittingPolicy:  resourcepoolv1.FittingPolicy_FITTING_POLICY_KUBERNETES,
			Location:                "n/a",
			InstanceType:            "n/a",
			Details:                 &resourcepoolv1.ResourcePoolDetail{},
			Stats:                   &jobv1.QueueStats{},
			ResourceManagerName:     rmName,
			ResourceManagerMetadata: rmMetadata,
		}
	}
	want, err := json.MarshalIndent([]*resourcepoolv1.ResourcePool{
		wantPool(auxPool, true, false),
		wantPool(computePool, false, true),
	}, "", " ")
	require.NoError(t, err)

	require.Equal(t, string(want), string(got))
}

func TestROCmPodsService(t *testing.T) {
tests := []struct {
name string
Expand Down
Loading

0 comments on commit cf08cbf

Please sign in to comment.