Skip to content

Commit

Permalink
Introduce extraction of System Architecture from AutoscalerVars in GC…
Browse files Browse the repository at this point in the history
…E cloud-provider
  • Loading branch information
jayantjain93 committed Apr 11, 2022
1 parent 69755bb commit 18c36b5
Show file tree
Hide file tree
Showing 5 changed files with 51 additions and 12 deletions.
4 changes: 2 additions & 2 deletions cluster-autoscaler/cloudprovider/gce/gce_reserved.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ type GceReserved struct{}

// CalculateKernelReserved computes how much memory Linux kernel will reserve.
// TODO(jkaniuk): account for crashkernel reservation on RHEL / CentOS
func (r *GceReserved) CalculateKernelReserved(physicalMemory int64, os OperatingSystem, osDistribution OperatingSystemDistribution, nodeVersion string) int64 {
func (r *GceReserved) CalculateKernelReserved(physicalMemory int64, os OperatingSystem, osDistribution OperatingSystemDistribution, arch SystemArchitecture, nodeVersion string) int64 {
switch os {
case OperatingSystemLinux:
// Account for memory reserved by kernel
Expand Down Expand Up @@ -262,7 +262,7 @@ func EphemeralStorageOnLocalSSDFilesystemOverheadInBytes(diskCount int64, osDist
}

// CalculateOSReservedEphemeralStorage estimates how much ephemeral storage OS will reserve and eviction threshold
func (r *GceReserved) CalculateOSReservedEphemeralStorage(diskSize int64, os OperatingSystem, osDistribution OperatingSystemDistribution, nodeVersion string) int64 {
func (r *GceReserved) CalculateOSReservedEphemeralStorage(diskSize int64, os OperatingSystem, osDistribution OperatingSystemDistribution, arch SystemArchitecture, nodeVersion string) int64 {
switch osDistribution {
case OperatingSystemDistributionCOS:
storage := int64(math.Ceil(0.015635*float64(diskSize))) + int64(math.Ceil(4.148*GiB)) // os partition estimation
Expand Down
2 changes: 1 addition & 1 deletion cluster-autoscaler/cloudprovider/gce/gce_reserved_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ func TestCalculateKernelReservedLinux(t *testing.T) {
for idx, tc := range testCases {
r := &GceReserved{}
t.Run(fmt.Sprintf("%v", idx), func(t *testing.T) {
reserved := r.CalculateKernelReserved(tc.physicalMemory, OperatingSystemLinux, tc.osDistribution, "")
reserved := r.CalculateKernelReserved(tc.physicalMemory, OperatingSystemLinux, tc.osDistribution, "", "")
if tc.osDistribution == OperatingSystemDistributionUbuntu {
assert.Equal(t, tc.reservedMemory+int64(math.Min(correctionConstant*float64(tc.physicalMemory), maximumCorrectionValue)+ubuntuSpecificOffset), reserved)
} else if tc.osDistribution == OperatingSystemDistributionCOS {
Expand Down
4 changes: 2 additions & 2 deletions cluster-autoscaler/cloudprovider/gce/os_reserved.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@ package gce
type OsReservedCalculator interface {
// CalculateKernelReserved computes how much memory OS kernel will reserve.
// NodeVersion parameter is optional. If empty string is passed a result calculated using default node version will be returned.
CalculateKernelReserved(physicalMemory int64, os OperatingSystem, osDistribution OperatingSystemDistribution, nodeVersion string) int64
CalculateKernelReserved(physicalMemory int64, os OperatingSystem, osDistribution OperatingSystemDistribution, arch SystemArchitecture, nodeVersion string) int64

// CalculateOSReservedEphemeralStorage estimates how much ephemeral storage OS will reserve and eviction threshold.
// NodeVersion parameter is optional. If empty string is passed a result calculated using default node version will be returned.
CalculateOSReservedEphemeralStorage(diskSize int64, os OperatingSystem, osDistribution OperatingSystemDistribution, nodeVersion string) int64
CalculateOSReservedEphemeralStorage(diskSize int64, os OperatingSystem, osDistribution OperatingSystemDistribution, arch SystemArchitecture, nodeVersion string) int64
}
49 changes: 44 additions & 5 deletions cluster-autoscaler/cloudprovider/gce/templates.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ func (t *GceTemplateBuilder) getAcceleratorCount(accelerators []*gce.Accelerator
}

// BuildCapacity builds a list of resource capacities given list of hardware.
func (t *GceTemplateBuilder) BuildCapacity(cpu int64, mem int64, accelerators []*gce.AcceleratorConfig, os OperatingSystem, osDistribution OperatingSystemDistribution, ephemeralStorage int64, ephemeralStorageLocalSSDCount int64, pods *int64, version string, r OsReservedCalculator) (apiv1.ResourceList, error) {
func (t *GceTemplateBuilder) BuildCapacity(cpu int64, mem int64, accelerators []*gce.AcceleratorConfig, os OperatingSystem, osDistribution OperatingSystemDistribution, arch SystemArchitecture,
ephemeralStorage int64, ephemeralStorageLocalSSDCount int64, pods *int64, version string, r OsReservedCalculator) (apiv1.ResourceList, error) {
capacity := apiv1.ResourceList{}
if pods == nil {
capacity[apiv1.ResourcePods] = *resource.NewQuantity(110, resource.DecimalSI)
Expand All @@ -68,7 +69,7 @@ func (t *GceTemplateBuilder) BuildCapacity(cpu int64, mem int64, accelerators []
}

capacity[apiv1.ResourceCPU] = *resource.NewQuantity(cpu, resource.DecimalSI)
memTotal := mem - r.CalculateKernelReserved(mem, os, osDistribution, version)
memTotal := mem - r.CalculateKernelReserved(mem, os, osDistribution, arch, version)
capacity[apiv1.ResourceMemory] = *resource.NewQuantity(memTotal, resource.DecimalSI)

if accelerators != nil && len(accelerators) > 0 {
Expand All @@ -80,7 +81,7 @@ func (t *GceTemplateBuilder) BuildCapacity(cpu int64, mem int64, accelerators []
if ephemeralStorageLocalSSDCount > 0 {
storageTotal = ephemeralStorage - EphemeralStorageOnLocalSSDFilesystemOverheadInBytes(ephemeralStorageLocalSSDCount, osDistribution)
} else {
storageTotal = ephemeralStorage - r.CalculateOSReservedEphemeralStorage(ephemeralStorage, os, osDistribution, version)
storageTotal = ephemeralStorage - r.CalculateOSReservedEphemeralStorage(ephemeralStorage, os, osDistribution, arch, version)
}
capacity[apiv1.ResourceEphemeralStorage] = *resource.NewQuantity(int64(math.Max(float64(storageTotal), 0)), resource.DecimalSI)
}
Expand Down Expand Up @@ -174,6 +175,10 @@ func (t *GceTemplateBuilder) BuildNodeFromTemplate(mig Mig, template *gce.Instan
if osDistribution == OperatingSystemDistributionUnknown {
return nil, fmt.Errorf("could not obtain os-distribution from kube-env from template metadata")
}
arch := extractSystemArchitectureFromKubeEnv(kubeEnvValue)
if arch == UnknownArch {
return nil, fmt.Errorf("could not obtain arch from kube-env from template metadata")
}

var ephemeralStorage int64 = -1
ssdCount := ephemeralStorageLocalSSDCount(kubeEnvValue)
Expand All @@ -186,7 +191,7 @@ func (t *GceTemplateBuilder) BuildNodeFromTemplate(mig Mig, template *gce.Instan
return nil, fmt.Errorf("could not fetch ephemeral storage from instance template: %v", err)
}

capacity, err := t.BuildCapacity(cpu, mem, template.Properties.GuestAccelerators, os, osDistribution, ephemeralStorage, ssdCount, pods, mig.Version(), reserved)
capacity, err := t.BuildCapacity(cpu, mem, template.Properties.GuestAccelerators, os, osDistribution, arch, ephemeralStorage, ssdCount, pods, mig.Version(), reserved)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -318,7 +323,7 @@ func BuildGenericLabels(ref GceRef, machineType string, nodeName string, os Oper
}

// TODO: extract it somehow
result[apiv1.LabelArchStable] = cloudprovider.DefaultArch
result[apiv1.LabelArchStable] = string(DefaultArch)
result[apiv1.LabelOSStable] = string(os)

result[apiv1.LabelInstanceTypeStable] = machineType
Expand Down Expand Up @@ -531,6 +536,40 @@ func extractOperatingSystemDistributionFromImageType(imageType string) Operating
}
}

// SystemArchitecture denotes the system (CPU) architecture used by nodes coming from a node group.
type SystemArchitecture string

const (
// UnknownArch is used when the architecture cannot be determined or is not one of the recognized values.
UnknownArch SystemArchitecture = ""
// Amd64 is used when the architecture is x86_64.
Amd64 SystemArchitecture = "amd64"
// Arm64 is used when the architecture is ARM.
Arm64 SystemArchitecture = "arm64"
// DefaultArch is the fallback architecture used when no arch is passed in AUTOSCALER_ENV_VARS.
DefaultArch SystemArchitecture = Amd64
)

// extractSystemArchitectureFromKubeEnv looks up the "arch" autoscaler variable
// in kubeEnv (via extractAutoscalerVarFromKubeEnv) and maps it to a
// SystemArchitecture.
//
// It returns:
//   - UnknownArch if the lookup fails or the value is neither "amd64" nor "arm64";
//   - DefaultArch if the variable is not defined;
//   - Amd64 or Arm64 when the value matches one of them exactly.
func extractSystemArchitectureFromKubeEnv(kubeEnv string) SystemArchitecture {
	arch, found, err := extractAutoscalerVarFromKubeEnv(kubeEnv, "arch")
	if err != nil {
		// No fallback on a lookup error: UnknownArch is returned, so the log
		// message must not claim that a default is being used.
		klog.Errorf("error while obtaining arch from AUTOSCALER_ENV_VARS: %v", err)
		return UnknownArch
	}
	if !found {
		// err is nil on this path; report the default that is actually applied.
		klog.V(4).Infof("no arch defined in AUTOSCALER_ENV_VARS; using default %v", DefaultArch)
		return DefaultArch
	}
	switch parsed := SystemArchitecture(arch); parsed {
	case Arm64, Amd64:
		return parsed
	default:
		// Surface the offending value so a rejected template can be diagnosed.
		klog.Errorf("unrecognized arch %q in AUTOSCALER_ENV_VARS", arch)
		return UnknownArch
	}
}

func extractOperatingSystemDistributionFromKubeEnv(kubeEnv string) OperatingSystemDistribution {
osDistributionValue, found, err := extractAutoscalerVarFromKubeEnv(kubeEnv, "os_distribution")
if err != nil {
Expand Down
4 changes: 2 additions & 2 deletions cluster-autoscaler/cloudprovider/gce/templates_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,7 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
} else if tc.isEphemeralStorageBlocked {
physicalEphemeralStorageGiB = 0
}
capacity, err := tb.BuildCapacity(tc.physicalCpu, tc.physicalMemory, tc.accelerators, OperatingSystemLinux, OperatingSystemDistributionCOS, physicalEphemeralStorageGiB*units.GiB, tc.ephemeralStorageLocalSSDCount, tc.pods, "", &GceReserved{})
capacity, err := tb.BuildCapacity(tc.physicalCpu, tc.physicalMemory, tc.accelerators, OperatingSystemLinux, OperatingSystemDistributionCOS, "", physicalEphemeralStorageGiB*units.GiB, tc.ephemeralStorageLocalSSDCount, tc.pods, "", &GceReserved{})
assert.NoError(t, err)
assertEqualResourceLists(t, "Capacity", capacity, node.Status.Capacity)
if !tc.kubeReserved {
Expand Down Expand Up @@ -532,7 +532,7 @@ func TestBuildCapacityMemory(t *testing.T) {
t.Run(fmt.Sprintf("%v", idx), func(t *testing.T) {
tb := GceTemplateBuilder{}
noAccelerators := make([]*gce.AcceleratorConfig, 0)
buildCapacity, err := tb.BuildCapacity(tc.physicalCpu, tc.physicalMemory, noAccelerators, tc.os, OperatingSystemDistributionCOS, -1, 0, nil, "", &GceReserved{})
buildCapacity, err := tb.BuildCapacity(tc.physicalCpu, tc.physicalMemory, noAccelerators, tc.os, OperatingSystemDistributionCOS, "", -1, 0, nil, "", &GceReserved{})
assert.NoError(t, err)
expectedCapacity, err := makeResourceList2(tc.physicalCpu, tc.expectedCapacityMemory, 0, 110)
assert.NoError(t, err)
Expand Down

0 comments on commit 18c36b5

Please sign in to comment.