From 0b0b00305da2ed221c967925614f5409f68095bb Mon Sep 17 00:00:00 2001 From: Patryk Bundyra Date: Wed, 20 Nov 2024 12:32:55 +0100 Subject: [PATCH] TAS: Clean up TAS flavor snapshot (#3592) * [WIP] Clean up TAS flavor snapshot, delete sort name, delete domainID refrences, optimize memory, improve code clarity * Add comments * Fix typo, rename nodes -> domains * gst * Fix bug, add unit test --- pkg/cache/tas_cache_test.go | 237 ++++++++++++++++++++++++++++++ pkg/cache/tas_flavor.go | 3 +- pkg/cache/tas_flavor_snapshot.go | 239 ++++++++++++++++--------------- 3 files changed, 360 insertions(+), 119 deletions(-) diff --git a/pkg/cache/tas_cache_test.go b/pkg/cache/tas_cache_test.go index ee9290fc2b..61e9e8b5d5 100644 --- a/pkg/cache/tas_cache_test.go +++ b/pkg/cache/tas_cache_test.go @@ -41,6 +41,11 @@ func TestFindTopologyAssignment(t *testing.T) { tasHostLabel = "kubernetes.io/hostname" ) + // b1 b2 + // / \ / \ + // r1 r2 r1 r2 + // | / | \ | | + // x1 x2 x3 x4 x5 x6 defaultNodes := []corev1.Node{ { ObjectMeta: metav1.ObjectMeta{ @@ -188,6 +193,190 @@ func TestFindTopologyAssignment(t *testing.T) { tasHostLabel, } + // b1 b2 + // / \ / \ + // r1 r2 r1 r2 + // / \ / \ / \ / \ + // x1 x2 x3 x4 x5 x6 x7 x6 + binaryTreesNodes := []corev1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "b1-r1-x1", + Labels: map[string]string{ + tasBlockLabel: "b1", + tasRackLabel: "r1", + tasHostLabel: "x1", + }, + }, + Status: corev1.NodeStatus{ + Allocatable: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("1"), + corev1.ResourceMemory: resource.MustParse("1Gi"), + }, + Conditions: []corev1.NodeCondition{ + { + Type: corev1.NodeReady, + Status: corev1.ConditionTrue, + }, + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "b1-r1-x2", + Labels: map[string]string{ + tasBlockLabel: "b1", + tasRackLabel: "r1", + tasHostLabel: "x2", + }, + }, + Status: corev1.NodeStatus{ + Allocatable: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("1"), + corev1.ResourceMemory: resource.MustParse("1Gi"), + }, + Conditions: []corev1.NodeCondition{ + { + Type: corev1.NodeReady, + Status: corev1.ConditionTrue, + }, + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "b1-r2-x3", + Labels: map[string]string{ + tasBlockLabel: "b1", + tasRackLabel: "r2", + tasHostLabel: "x3", + }, + }, + Status: corev1.NodeStatus{ + Allocatable: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("1"), + corev1.ResourceMemory: resource.MustParse("1Gi"), + }, + Conditions: []corev1.NodeCondition{ + { + Type: corev1.NodeReady, + Status: corev1.ConditionTrue, + }, + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "b1-r2-x4", + Labels: map[string]string{ + tasBlockLabel: "b1", + tasRackLabel: "r2", + tasHostLabel: "x4", + }, + }, + Status: corev1.NodeStatus{ + Allocatable: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("1"), + corev1.ResourceMemory: resource.MustParse("1Gi"), + }, + Conditions: []corev1.NodeCondition{ + { + Type: corev1.NodeReady, + Status: corev1.ConditionTrue, + }, + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "b2-r1-x5", + Labels: map[string]string{ + tasBlockLabel: "b2", + tasRackLabel: "r1", + tasHostLabel: "x5", + }, + }, + Status: corev1.NodeStatus{ + Allocatable: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("1"), + corev1.ResourceMemory: resource.MustParse("1Gi"), + }, + Conditions: []corev1.NodeCondition{ + { + Type: corev1.NodeReady, + Status: corev1.ConditionTrue, + }, + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "b2-r1-x6", + Labels: map[string]string{ + tasBlockLabel: "b2", + tasRackLabel: "r1", + tasHostLabel: "x6", + }, + }, + Status: corev1.NodeStatus{ + Allocatable: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("1"), + corev1.ResourceMemory: resource.MustParse("1Gi"), + }, + Conditions: []corev1.NodeCondition{ + { + Type: corev1.NodeReady, + Status: corev1.ConditionTrue, + }, + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "b2-r2-x7", + Labels: map[string]string{ + tasBlockLabel: "b2", + tasRackLabel: "r2", + tasHostLabel: "x7", + }, + }, + Status: corev1.NodeStatus{ + Allocatable: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("1"), + corev1.ResourceMemory: resource.MustParse("1Gi"), + }, + Conditions: []corev1.NodeCondition{ + { + Type: corev1.NodeReady, + Status: corev1.ConditionTrue, + }, + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "b2-r2-x8", + Labels: map[string]string{ + tasBlockLabel: "b2", + tasRackLabel: "r2", + tasHostLabel: "x8", + }, + }, + Status: corev1.NodeStatus{ + Allocatable: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("1"), + corev1.ResourceMemory: resource.MustParse("1Gi"), + }, + Conditions: []corev1.NodeCondition{ + { + Type: corev1.NodeReady, + Status: corev1.ConditionTrue, + }, + }, + }, + }, + } + cases := map[string]struct { request kueue.PodSetTopologyRequest levels []string @@ -383,6 +572,54 @@ func TestFindTopologyAssignment(t *testing.T) { }, }, }, + "block required; 4 pods fit into one host each": { + nodes: binaryTreesNodes, + request: kueue.PodSetTopologyRequest{ + Required: ptr.To(tasBlockLabel), + }, + levels: defaultThreeLevels, + requests: resources.Requests{ + corev1.ResourceCPU: 1000, + }, + count: 4, + wantAssignment: &kueue.TopologyAssignment{ + Levels: defaultThreeLevels, + Domains: []kueue.TopologyDomainAssignment{ + { + Count: 1, + Values: []string{ + "b1", + "r1", + "x1", + }, + }, + { + Count: 1, + Values: []string{ + "b1", + "r1", + "x2", + }, + }, + { + Count: 1, + Values: []string{ + "b1", + "r2", + "x3", + }, + }, + { + Count: 1, + Values: []string{ + "b1", + "r2", + "x4", + }, + }, + }, + }, + }, "host required; single Pod fits in the host": { nodes: defaultNodes, request: kueue.PodSetTopologyRequest{ diff --git a/pkg/cache/tas_flavor.go b/pkg/cache/tas_flavor.go index 8555abf53a..3bd939cd17 100644 --- a/pkg/cache/tas_flavor.go +++ b/pkg/cache/tas_flavor.go @@ -117,8 +117,7 @@ func (c *TASFlavorCache) snapshotForNodes(log logr.Logger, nodes []corev1.Node, levelValues := utiltas.LevelValues(c.Levels, node.Labels) capacity := resources.NewRequests(node.Status.Allocatable) domainID := utiltas.DomainID(levelValues) - snapshot.levelValuesPerDomain[domainID] = levelValues - snapshot.addCapacity(domainID, capacity) + snapshot.addLeafDomain(levelValues, capacity, domainID) nodeToDomain[node.Name] = domainID } snapshot.initialize() diff --git a/pkg/cache/tas_flavor_snapshot.go b/pkg/cache/tas_flavor_snapshot.go index 2139b0d2f7..6454e27d62 100644 --- a/pkg/cache/tas_flavor_snapshot.go +++ b/pkg/cache/tas_flavor_snapshot.go @@ -37,24 +37,37 @@ var ( // domain holds the static information about placement of a topology // domain in the hierarchy of topology domains. type domain struct { - // sortName indicates name used for sorting when two domains can fit - // the same number of pods. - // Example for domain corresponding to "rack2" in "block1" the value is - // "rack2 " (where domainID is "block1,rack2"). - sortName string - // id is the globally unique id of the domain id utiltas.TopologyDomainID - // parentID is the global ID of the parent domain - parentID utiltas.TopologyDomainID + // parent points to domain which is a parent in topology structure + parent *domain + + // children points to domains which are children in topology structure + children []*domain + + // state is a temporary state of the topology domains during the + // assignment algorithm. + // + // In the first phase of the algorithm (traversal to the top the topology to + // determine the level to fit the workload) it denotes the number of pods + // which can fit in a given domain. + // + // In the second phase of the algorithm (traversal to the bottom to + // determine the actual assignments) it denotes the number of pods actually + // assigned to the given domain. + state int32 + + // levelValues stores the mapping from domain ID back to the + // ordered list of values + levelValues []string - // childIDs at the global IDs of the child domains - childIDs []utiltas.TopologyDomainID + // freeCapacity stores the free capacity per domain, only for the + // lowest level of topology + freeCapacity resources.Requests } type domainByID map[utiltas.TopologyDomainID]*domain -type statePerDomain map[utiltas.TopologyDomainID]int32 type TASFlavorSnapshot struct { log logr.Logger @@ -67,109 +80,98 @@ type TASFlavorSnapshot struct { // on the Topology object levelKeys []string - // freeCapacityPerLeafDomain stores the free capacity per domain, only for the - // lowest level of topology - freeCapacityPerLeafDomain map[utiltas.TopologyDomainID]resources.Requests + // leaves maps domainID to domains that are at the lowest level of topology structure + leaves domainByID + + // roots maps domainID to domains that are at the highest level of topology structure + roots domainByID - // levelValuesPerDomain stores the mapping from domain ID back to the - // ordered list of values. It stores the information for all levels. - levelValuesPerDomain map[utiltas.TopologyDomainID][]string + // domains maps domainID to every domain available in the topology structure + domains domainByID // domainsPerLevel stores the static tree information domainsPerLevel []domainByID - - // statePerLevel is a temporary state of the topology domains during the - // assignment algorithm. - // - // In the first phase of the algorithm (traversal to the top the topology to - // determine the level to fit the workload) it denotes the number of pods - // which can fit in a given domain. - // - // In the second phase of the algorithm (traversal to the bottom to - // determine the actual assignments) it denotes the number of pods actually - // assigned to the given domain. - state statePerDomain } func newTASFlavorSnapshot(log logr.Logger, topologyName kueue.TopologyReference, levels []string) *TASFlavorSnapshot { + domainsPerLevel := make([]domainByID, len(levels)) + for level := range levels { + domainsPerLevel[level] = make(domainByID) + } + snapshot := &TASFlavorSnapshot{ - log: log, - topologyName: topologyName, - levelKeys: slices.Clone(levels), - freeCapacityPerLeafDomain: make(map[utiltas.TopologyDomainID]resources.Requests), - levelValuesPerDomain: make(map[utiltas.TopologyDomainID][]string), - domainsPerLevel: make([]domainByID, len(levels)), - state: make(statePerDomain), + log: log, + topologyName: topologyName, + levelKeys: slices.Clone(levels), + leaves: make(domainByID), + domains: make(domainByID), + roots: make(domainByID), + domainsPerLevel: domainsPerLevel, } return snapshot } -// initialize prepares the domainsPerLevel tree structure. This structure holds +func (s *TASFlavorSnapshot) addLeafDomain(levelValues []string, capacity resources.Requests, domainID utiltas.TopologyDomainID) { + domain := domain{ + id: domainID, + levelValues: levelValues, + } + if _, found := s.leaves[domainID]; !found { + s.leaves[domainID] = &domain + } + s.addCapacity(domainID, capacity) +} + +// initialize prepares the topology tree structure. This structure holds // for a given the list of topology domains with additional static and dynamic // information. This function initializes the static information which // represents the edges to the parent and child domains. This structure is // reused for multiple workloads during a single scheduling cycle. func (s *TASFlavorSnapshot) initialize() { - levelCount := len(s.levelKeys) - lastLevelIdx := levelCount - 1 - for levelIdx := range s.levelKeys { - s.domainsPerLevel[levelIdx] = make(domainByID) - } - for childID := range s.freeCapacityPerLeafDomain { - childDomain := &domain{ - sortName: s.sortName(lastLevelIdx, childID), - id: childID, - } - s.domainsPerLevel[lastLevelIdx][childID] = childDomain - parentFound := false - var parent *domain - for levelIdx := lastLevelIdx - 1; levelIdx >= 0 && !parentFound; levelIdx-- { - parentValues := s.levelValuesPerDomain[childID][:levelIdx+1] - parentID := utiltas.DomainID(parentValues) - s.levelValuesPerDomain[parentID] = parentValues - parent, parentFound = s.domainsPerLevel[levelIdx][parentID] - if !parentFound { - parent = &domain{ - sortName: s.sortName(levelIdx, parentID), - id: parentID, - } - s.domainsPerLevel[levelIdx][parentID] = parent - } - childDomain.parentID = parentID - parent.childIDs = append(parent.childIDs, childID) - childID = parentID - } + for _, domain := range s.leaves { + s.domains[domain.id] = domain + s.domainsPerLevel[len(domain.levelValues)-1][domain.id] = domain + s.initializeHelper(domain) } } -func (s *TASFlavorSnapshot) sortName(levelIdx int, domainID utiltas.TopologyDomainID) string { - levelValues := s.levelValuesPerDomain[domainID] - if len(levelValues) <= levelIdx { - s.log.Error(errCodeAssumptionsViolated, "invalid invocation of sortName", - "count", len(levelValues), - "levelIdx", levelIdx, - "domainID", domainID, - "levelValuesPerDomain", s.levelValuesPerDomain, - "freeCapacityPerDomain", s.freeCapacityPerLeafDomain) +// initializeHelper is a recursive helper for initialize() method +func (s *TASFlavorSnapshot) initializeHelper(dom *domain) { + if len(dom.levelValues) == 1 { + s.roots[dom.id] = dom + return + } + parentValues := dom.levelValues[:len(dom.levelValues)-1] + parentID := utiltas.DomainID(parentValues) + parent, parentFound := s.domains[parentID] + if !parentFound { + // create parent + parent = &domain{ + id: parentID, + levelValues: parentValues, + } + s.domainsPerLevel[len(parentValues)-1][parentID] = parent + s.domains[parentID] = parent + s.initializeHelper(parent) } - // we prefix with the node label value to make it ordered naturally, but - // append also domain ID as the node label value may not be globally unique. - return s.levelValuesPerDomain[domainID][levelIdx] + " " + string(domainID) + // connect parent and child + dom.parent = parent + parent.children = append(parent.children, dom) } func (s *TASFlavorSnapshot) addCapacity(domainID utiltas.TopologyDomainID, capacity resources.Requests) { s.initializeFreeCapacityPerDomain(domainID) - s.freeCapacityPerLeafDomain[domainID].Add(capacity) + s.leaves[domainID].freeCapacity.Add(capacity) } func (s *TASFlavorSnapshot) addUsage(domainID utiltas.TopologyDomainID, usage resources.Requests) { s.initializeFreeCapacityPerDomain(domainID) - s.freeCapacityPerLeafDomain[domainID].Sub(usage) + s.leaves[domainID].freeCapacity.Sub(usage) } func (s *TASFlavorSnapshot) initializeFreeCapacityPerDomain(domainID utiltas.TopologyDomainID) { - if _, found := s.freeCapacityPerLeafDomain[domainID]; !found { - s.freeCapacityPerLeafDomain[domainID] = resources.Requests{} + if s.leaves[domainID].freeCapacity == nil { + s.leaves[domainID].freeCapacity = resources.Requests{} } } @@ -253,18 +255,18 @@ func (s *TASFlavorSnapshot) findLevelWithFitDomains(levelIdx int, required bool, levelDomains := utilmaps.Values(domains) sortedDomain := s.sortedDomains(levelDomains) topDomain := sortedDomain[0] - if s.state[topDomain.id] < count { + if topDomain.state < count { if required { - return 0, nil, s.notFitMessage(s.state[topDomain.id], count) + return 0, nil, s.notFitMessage(topDomain.state, count) } if levelIdx > 0 { return s.findLevelWithFitDomains(levelIdx-1, required, count) } lastIdx := 0 - remainingCount := count - s.state[sortedDomain[lastIdx].id] - for remainingCount > 0 && lastIdx < len(sortedDomain)-1 && s.state[sortedDomain[lastIdx].id] > 0 { + remainingCount := count - sortedDomain[lastIdx].state + for remainingCount > 0 && lastIdx < len(sortedDomain)-1 && sortedDomain[lastIdx].state > 0 { lastIdx++ - remainingCount -= s.state[sortedDomain[lastIdx].id] + remainingCount -= sortedDomain[lastIdx].state } if remainingCount > 0 { return 0, nil, s.notFitMessage(count-remainingCount, count) @@ -278,19 +280,18 @@ func (s *TASFlavorSnapshot) updateCountsToMinimum(domains []*domain, count int32 result := make([]*domain, 0) remainingCount := count for _, domain := range domains { - if s.state[domain.id] >= remainingCount { - s.state[domain.id] = remainingCount + if domain.state >= remainingCount { + domain.state = remainingCount result = append(result, domain) return result } - remainingCount -= s.state[domain.id] + remainingCount -= domain.state result = append(result, domain) } s.log.Error(errCodeAssumptionsViolated, "unexpected remainingCount", "remainingCount", remainingCount, "count", count, - "levelValuesPerDomain", s.levelValuesPerDomain, - "freeCapacityPerDomain", s.freeCapacityPerLeafDomain) + "leaves", s.leaves) return nil } @@ -301,8 +302,8 @@ func (s *TASFlavorSnapshot) buildAssignment(domains []*domain) *kueue.TopologyAs } for _, domain := range domains { assignment.Domains = append(assignment.Domains, kueue.TopologyDomainAssignment{ - Values: s.levelValuesPerDomain[domain.id], - Count: s.state[domain.id], + Values: domain.levelValues, + Count: domain.state, }) } return &assignment @@ -310,11 +311,8 @@ func (s *TASFlavorSnapshot) buildAssignment(domains []*domain) *kueue.TopologyAs func (s *TASFlavorSnapshot) lowerLevelDomains(levelIdx int, domains []*domain) []*domain { result := make([]*domain, 0, len(domains)) - for _, info := range domains { - for _, childDomainID := range info.childIDs { - childDomain := s.domainsPerLevel[levelIdx+1][childDomainID] - result = append(result, childDomain) - } + for _, domain := range domains { + result = append(result, domain.children...) } return result } @@ -323,12 +321,10 @@ func (s *TASFlavorSnapshot) sortedDomains(domains []*domain) []*domain { result := make([]*domain, len(domains)) copy(result, domains) slices.SortFunc(result, func(a, b *domain) int { - aCount := s.state[a.id] - bCount := s.state[b.id] switch { - case aCount == bCount: - return strings.Compare(a.sortName, b.sortName) - case aCount > bCount: + case a.state == b.state: + return strings.Compare(string(a.id), string(b.id)) + case a.state > b.state: return -1 default: return 1 @@ -338,22 +334,31 @@ func (s *TASFlavorSnapshot) sortedDomains(domains []*domain) []*domain { } func (s *TASFlavorSnapshot) fillInCounts(requests resources.Requests) { - // cleanup the state in case some remaining values are present from computing - // assignments for previous PodSets. - for domainID := range s.state { - s.state[domainID] = 0 + for _, domain := range s.domains { + // cleanup the state in case some remaining values are present from computing + // assignments for previous PodSets. + domain.state = 0 } - for domainID, capacity := range s.freeCapacityPerLeafDomain { - s.state[domainID] = requests.CountIn(capacity) + for _, leaf := range s.leaves { + leaf.state = requests.CountIn(leaf.freeCapacity) } - lastLevelIdx := len(s.domainsPerLevel) - 1 - for levelIdx := lastLevelIdx - 1; levelIdx >= 0; levelIdx-- { - for _, info := range s.domainsPerLevel[levelIdx] { - for _, childDomainID := range info.childIDs { - s.state[info.id] += s.state[childDomainID] - } - } + for _, root := range s.roots { + root.state = s.fillInCountsHelper(root) + } +} + +func (s *TASFlavorSnapshot) fillInCountsHelper(domain *domain) int32 { + // logic for a leaf + if len(domain.children) == 0 { + return domain.state + } + // logic for a parent + childrenCapacity := int32(0) + for _, child := range domain.children { + childrenCapacity += s.fillInCountsHelper(child) } + domain.state = childrenCapacity + return childrenCapacity } func (s *TASFlavorSnapshot) notFitMessage(fitCount, totalCount int32) string {