From 34555683ebadfc9b103552b0a309b367338ea541 Mon Sep 17 00:00:00 2001
From: adrienjt
Date: Wed, 8 Sep 2021 14:57:52 -0700
Subject: [PATCH] GCE ephemeral storage on local SSDs

If EPH_STORAGE_LOCAL_SSD_COUNT=N is specified in kube-env (in
AUTOSCALER_ENV_VARS), physical ephemeral storage is N*375GiB instead of
the boot disk size, and only the variable part of the OS reservation
(the filesystem overhead, as opposed to the fixed OS partition
estimate) is subtracted to compute capacity.

The existing BLOCK_EPH_STORAGE_BOOT_DISK is ignored if
EPH_STORAGE_LOCAL_SSD_COUNT>0.
---
 .../cloudprovider/gce/reserved.go       |  22 ++-
 .../cloudprovider/gce/templates.go      |  72 ++++++--
 .../cloudprovider/gce/templates_test.go | 159 +++++++++++-------
 3 files changed, 167 insertions(+), 86 deletions(-)

diff --git a/cluster-autoscaler/cloudprovider/gce/reserved.go b/cluster-autoscaler/cloudprovider/gce/reserved.go
index aa69bc84f1ca..7ab83bd47ec3 100644
--- a/cluster-autoscaler/cloudprovider/gce/reserved.go
+++ b/cluster-autoscaler/cloudprovider/gce/reserved.go
@@ -23,7 +23,7 @@ import (
 	"strings"
 
 	"k8s.io/apimachinery/pkg/api/resource"
-	klog "k8s.io/klog/v2"
+	"k8s.io/klog/v2"
 )
 
 // There should be no imports as it is used standalone in e2e tests
@@ -198,19 +198,25 @@ func parsePercentageToRatio(percentString string) (float64, error) {
 }
 
 // CalculateOSReservedEphemeralStorage estimates how much ephemeral storage OS will reserve and eviction threshold
-func CalculateOSReservedEphemeralStorage(diskSize int64, osDistribution OperatingSystemDistribution) int64 {
+func CalculateOSReservedEphemeralStorage(diskSize int64, osDistribution OperatingSystemDistribution, ephemeralStorageOnLocalSSDs bool) (storage int64) {
 	switch osDistribution {
 	case OperatingSystemDistributionCOS, OperatingSystemDistributionCOSContainerd:
-		storage := int64(math.Ceil(0.015635*float64(diskSize))) + int64(math.Ceil(4.148*GiB)) // os partition estimation
-		storage += int64(math.Min(100*MiB, math.Ceil(0.001*float64(diskSize))))               // over-provisioning buffer
+		if !ephemeralStorageOnLocalSSDs {
+			storage += int64(math.Ceil(4.148 * GiB)) // os partition estimation
+		}
+		storage += int64(math.Ceil(0.015635 * float64(diskSize)))               // filesystem overhead
+		storage += int64(math.Min(100*MiB, math.Ceil(0.001*float64(diskSize)))) // over-provisioning buffer
 		return storage
 	case OperatingSystemDistributionUbuntu, OperatingSystemDistributionUbuntuContainerd:
-		storage := int64(math.Ceil(0.03083*float64(diskSize))) + int64(math.Ceil(0.171*GiB)) // os partition estimation
-		storage += int64(math.Min(100*MiB, math.Ceil(0.001*float64(diskSize))))              // over-provisioning buffer
+		if !ephemeralStorageOnLocalSSDs {
+			storage += int64(math.Ceil(0.171 * GiB)) // os partition estimation
+		}
+		storage += int64(math.Ceil(0.03083 * float64(diskSize)))                // filesystem overhead
+		storage += int64(math.Min(100*MiB, math.Ceil(0.001*float64(diskSize)))) // over-provisioning buffer
 		return storage
 	case OperatingSystemDistributionWindowsLTSC, OperatingSystemDistributionWindowsSAC:
-		storage := int64(math.Ceil(0.1133 * GiB)) // os partition estimation
-		storage += int64(math.Ceil(0.010 * GiB))  // over-provisioning buffer
+		storage += int64(math.Ceil(0.1133 * GiB)) // os partition estimation
+		storage += int64(math.Ceil(0.010 * GiB))  // filesystem overhead and over-provisioning buffer
 		return storage
 	default:
 		klog.Errorf("CalculateReservedAndEvictionEphemeralStorage called for unknown os distribution %v", osDistribution)
diff --git a/cluster-autoscaler/cloudprovider/gce/templates.go b/cluster-autoscaler/cloudprovider/gce/templates.go
index 4183034c6b27..2451de01ed79 100644
--- a/cluster-autoscaler/cloudprovider/gce/templates.go
+++ b/cluster-autoscaler/cloudprovider/gce/templates.go
@@ -30,10 +30,11 @@ import (
 	apiv1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/klog/v2"
+
 	"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
 	"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
 	"k8s.io/autoscaler/cluster-autoscaler/utils/units"
-	klog "k8s.io/klog/v2"
 )
 
 // GceTemplateBuilder builds templates for GCE nodes.
@@ -54,7 +55,7 @@ func (t *GceTemplateBuilder) getAcceleratorCount(accelerators []*gce.Accelerator
 }
 
 // BuildCapacity builds a list of resource capacities given list of hardware.
-func (t *GceTemplateBuilder) BuildCapacity(cpu int64, mem int64, accelerators []*gce.AcceleratorConfig, os OperatingSystem, osDistribution OperatingSystemDistribution, ephemeralStorage int64, pods *int64) (apiv1.ResourceList, error) {
+func (t *GceTemplateBuilder) BuildCapacity(cpu int64, mem int64, accelerators []*gce.AcceleratorConfig, os OperatingSystem, osDistribution OperatingSystemDistribution, ephemeralStorage int64, ephemeralStorageOnLocalSSDs bool, pods *int64) (apiv1.ResourceList, error) {
 	capacity := apiv1.ResourceList{}
 	if pods == nil {
 		capacity[apiv1.ResourcePods] = *resource.NewQuantity(110, resource.DecimalSI)
@@ -71,7 +72,7 @@ func (t *GceTemplateBuilder) BuildCapacity(cpu int64, mem int64, accelerators []
 	}
 
 	if ephemeralStorage > 0 {
-		storageTotal := ephemeralStorage - CalculateOSReservedEphemeralStorage(ephemeralStorage, osDistribution)
+		storageTotal := ephemeralStorage - CalculateOSReservedEphemeralStorage(ephemeralStorage, osDistribution, ephemeralStorageOnLocalSSDs)
 		capacity[apiv1.ResourceEphemeralStorage] = *resource.NewQuantity(int64(math.Max(float64(storageTotal), 0)), resource.DecimalSI)
 	}
 
@@ -166,15 +167,17 @@ func (t *GceTemplateBuilder) BuildNodeFromTemplate(mig Mig, template *gce.Instan
 	}
 
 	var ephemeralStorage int64 = -1
-	if !isEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue) {
-		ephemeralStorage, err = getEphemeralStorageFromInstanceTemplateProperties(template.Properties)
-		if err != nil {
-			klog.Errorf("could not fetch ephemeral storage from instance template. %s", err)
-			return nil, err
-		}
+	ssdCount := ephemeralStorageLocalSSDCount(kubeEnvValue)
+	if ssdCount > 0 {
+		ephemeralStorage, err = getLocalSSDEphemeralStorageFromInstanceTemplateProperties(template.Properties, ssdCount)
+	} else if !isBootDiskEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue) {
+		ephemeralStorage, err = getBootDiskEphemeralStorageFromInstanceTemplateProperties(template.Properties)
+	}
+	if err != nil {
+		return nil, fmt.Errorf("could not fetch ephemeral storage from instance template: %v", err)
 	}
 
-	capacity, err := t.BuildCapacity(cpu, mem, template.Properties.GuestAccelerators, os, osDistribution, ephemeralStorage, pods)
+	capacity, err := t.BuildCapacity(cpu, mem, template.Properties.GuestAccelerators, os, osDistribution, ephemeralStorage, ssdCount > 0, pods)
 	if err != nil {
 		return nil, err
 	}
@@ -228,10 +231,53 @@ func (t *GceTemplateBuilder) BuildNodeFromTemplate(mig Mig, template *gce.Instan
 	return &node, nil
 }
 
-// isEphemeralStorageWithInstanceTemplateDisabled will allow bypassing Disk Size of Boot Disk from being
+func ephemeralStorageLocalSSDCount(kubeEnvValue string) int64 {
+	v, found, err := extractAutoscalerVarFromKubeEnv(kubeEnvValue, "EPH_STORAGE_LOCAL_SSD_COUNT")
+	if err != nil {
+		klog.Warningf("cannot extract EPH_STORAGE_LOCAL_SSD_COUNT from kube-env, default to 0: %v", err)
+		return 0
+	}
+
+	if !found {
+		return 0
+	}
+
+	n, err := strconv.Atoi(v)
+	if err != nil {
+		klog.Warningf("cannot parse EPH_STORAGE_LOCAL_SSD_COUNT value, default to 0: %v", err)
+		return 0
+	}
+
+	return int64(n)
+}
+
+func getLocalSSDEphemeralStorageFromInstanceTemplateProperties(instanceProperties *gce.InstanceProperties, ssdCount int64) (ephemeralStorage int64, err error) {
+	if instanceProperties.Disks == nil {
+		return 0, fmt.Errorf("instance properties disks is nil")
+	}
+
+	var count int64
+	for _, disk := range instanceProperties.Disks {
+		if disk != nil && disk.InitializeParams != nil {
+			if disk.Type == "SCRATCH" && disk.InitializeParams.DiskType == "local-ssd" {
+				// we could just multiply ssdCount by a constant 375GiB, because all GCE local SSDs are 375GiB
+				// but this loop verifies that the instance template has at least the requested number of local SSDs
+				// and is forward-compatible with a potentially different size
+				count++
+				if count == ssdCount {
+					return ssdCount * disk.InitializeParams.DiskSizeGb * units.GiB, nil
+				}
+			}
+		}
+	}
+
+	return 0, fmt.Errorf("actual local SSD count is lower than EPH_STORAGE_LOCAL_SSD_COUNT")
+}
+
+// isBootDiskEphemeralStorageWithInstanceTemplateDisabled will allow bypassing Disk Size of Boot Disk from being
 // picked up from Instance Template and used as Ephemeral Storage, in case other type of storage are used
 // as ephemeral storage
-func isEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue string) bool {
+func isBootDiskEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue string) bool {
 	v, found, err := extractAutoscalerVarFromKubeEnv(kubeEnvValue, "BLOCK_EPH_STORAGE_BOOT_DISK")
 	if err == nil && found && v == "true" {
 		return true
@@ -239,7 +285,7 @@ func isEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue string) bool {
 	return false
 }
 
-func getEphemeralStorageFromInstanceTemplateProperties(instanceProperties *gce.InstanceProperties) (ephemeralStorage int64, err error) {
+func getBootDiskEphemeralStorageFromInstanceTemplateProperties(instanceProperties *gce.InstanceProperties) (ephemeralStorage int64, err error) {
 	if instanceProperties.Disks == nil {
 		return 0, fmt.Errorf("unable to get ephemeral storage because instance properties disks is nil")
 	}
diff --git a/cluster-autoscaler/cloudprovider/gce/templates_test.go b/cluster-autoscaler/cloudprovider/gce/templates_test.go
index ea6cb3e52a2e..fecb43e220fb 100644
--- a/cluster-autoscaler/cloudprovider/gce/templates_test.go
+++ b/cluster-autoscaler/cloudprovider/gce/templates_test.go
@@ -34,23 +34,35 @@ import (
 	quota "k8s.io/apiserver/pkg/quota/v1"
 )
 
+// TestBuildNodeFromTemplateSetsResources tests that capacity and allocatable
+// are loaded into the node template status, a few error scenarios, and physical
+// ephemeral storage (an intermediate result), but it doesn't test that capacity
+// and allocatable are computed correctly (the test itself calls
+// GceTemplateBuilder.BuildCapacity, GceTemplateBuilder.CalculateAllocatable,
+// and ParseEvictionHardOrGetDefault to compute expected values); computations
+// are tested separately.
func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
 	var thirtyPodsPerNode int64 = 30
+	var physicalCpu int64 = 8
+	var physicalMemory int64 = 200 * units.MiB
+	var bootDiskSizeGiB int64 = 300
+	var localSSDSizeGiB int64 = 375
 	type testCase struct {
-		scenario                  string
-		kubeEnv                   string
-		accelerators              []*gce.AcceleratorConfig
-		mig                       Mig
-		physicalCpu               int64
-		physicalMemory            int64
-		physicalEphemeralStorage  int64
-		kubeReserved              bool
-		reservedCpu               string
-		reservedMemory            string
-		reservedEphemeralStorage  string
-		isEphemeralStorageBlocked bool
-		expectedErr               bool
-		pods                      *int64
+		scenario string
+		// test inputs
+		kubeEnv               string
+		accelerators          []*gce.AcceleratorConfig
+		attachedLocalSSDCount int64
+		pods                  *int64
+		// dependent inputs, should match kubeEnv, used to compute expected capacity and allocatable, out of test scope
+		ephemeralStorageOnLocalSSD bool
+		kubeReserved               bool
+		reservedCpu                string
+		reservedMemory             string
+		reservedEphemeralStorage   string
+		// test outputs
+		expectedErr                 bool
+		physicalEphemeralStorageGiB int64
 	}
 	testCases := []testCase{
 		{
@@ -64,14 +76,12 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
 				{AcceleratorType: "nvidia-tesla-k80", AcceleratorCount: 3},
 				{AcceleratorType: "nvidia-tesla-p100", AcceleratorCount: 8},
 			},
-			physicalCpu:              8,
-			physicalMemory:           200 * units.MiB,
-			physicalEphemeralStorage: 300,
-			kubeReserved:             true,
-			reservedCpu:              "1000m",
-			reservedMemory:           fmt.Sprintf("%v", 1*units.MiB),
-			reservedEphemeralStorage: "30Gi",
-			expectedErr:              false,
+			kubeReserved:                true,
+			reservedCpu:                 "1000m",
+			reservedMemory:              fmt.Sprintf("%v", 1*units.MiB),
+			reservedEphemeralStorage:    "30Gi",
+			expectedErr:                 false,
+			physicalEphemeralStorageGiB: bootDiskSizeGiB,
 		},
 		{
 			scenario: "no kube-reserved in kube-env",
@@ -79,29 +89,26 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
 				"NODE_LABELS: a=b,c=d,cloud.google.com/gke-nodepool=pool-3,cloud.google.com/gke-preemptible=true\n" +
 				"DNS_SERVER_IP: '10.0.0.10'\n" +
 				"NODE_TAINTS: 'dedicated=ml:NoSchedule,test=dev:PreferNoSchedule,a=b:c'\n",
-			physicalCpu:    8,
-			physicalMemory: 200 * units.MiB,
-			kubeReserved:   false,
-			expectedErr:    false,
+			kubeReserved:                false,
+			expectedErr:                 false,
+			physicalEphemeralStorageGiB: bootDiskSizeGiB,
 		},
 		{
-			scenario:       "no kube-env at all",
-			kubeEnv:        "",
-			physicalCpu:    8,
-			physicalMemory: 200 * units.MiB,
-			kubeReserved:   false,
-			expectedErr:    false,
+			scenario:                    "no kube-env at all",
+			kubeEnv:                     "",
+			kubeReserved:                false,
+			expectedErr:                 false,
+			physicalEphemeralStorageGiB: bootDiskSizeGiB,
 		},
 		{
 			scenario:    "totally messed up kube-env",
 			kubeEnv:     "This kube-env is totally messed up",
 			expectedErr: true,
 		},
 		{
-			scenario:       "max pods per node specified",
-			kubeEnv:        "",
-			physicalCpu:    8,
-			physicalMemory: 200 * units.MiB,
-			pods:           &thirtyPodsPerNode,
-			kubeReserved:   false,
-			expectedErr:    false,
+			scenario:                    "max pods per node specified",
+			kubeEnv:                     "",
+			pods:                        &thirtyPodsPerNode,
+			kubeReserved:                false,
+			expectedErr:                 false,
+			physicalEphemeralStorageGiB: bootDiskSizeGiB,
 		},
 		{
 			scenario: "BLOCK_EPH_STORAGE_BOOT_DISK in kube-env",
@@ -110,15 +117,12 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
 				"NODE_LABELS: a=b,c=d,cloud.google.com/gke-nodepool=pool-3,cloud.google.com/gke-preemptible=true\n" +
 				"DNS_SERVER_IP: '10.0.0.10'\n" +
 				"AUTOSCALER_ENV_VARS: os_distribution=cos;os=linux;kube_reserved=cpu=0,memory=0,ephemeral-storage=0;BLOCK_EPH_STORAGE_BOOT_DISK=true\n" +
 				"NODE_TAINTS: 'dedicated=ml:NoSchedule,test=dev:PreferNoSchedule,a=b:c'\n",
-			physicalCpu:               8,
-			physicalMemory:            200 * units.MiB,
-			physicalEphemeralStorage:  300,
-			reservedCpu:               "0m",
-			reservedMemory:            fmt.Sprintf("%v", 0*units.MiB),
-			reservedEphemeralStorage:  "0Gi",
-			kubeReserved:              true,
-			isEphemeralStorageBlocked: true,
-			expectedErr:               false,
+			reservedCpu:                 "0m",
+			reservedMemory:              fmt.Sprintf("%v", 0*units.MiB),
+			reservedEphemeralStorage:    "0Gi",
+			kubeReserved:                true,
+			expectedErr:                 false,
+			physicalEphemeralStorageGiB: 0,
 		},
 		{
 			scenario: "BLOCK_EPH_STORAGE_BOOT_DISK is false in kube-env",
@@ -127,15 +131,35 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
 				"DNS_SERVER_IP: '10.0.0.10'\n" +
 				"AUTOSCALER_ENV_VARS: os_distribution=cos;os=linux;kube_reserved=cpu=0,memory=0,ephemeral-storage=0;BLOCK_EPH_STORAGE_BOOT_DISK=false\n" +
 				"NODE_TAINTS: 'dedicated=ml:NoSchedule,test=dev:PreferNoSchedule,a=b:c'\n",
-			physicalCpu:               8,
-			physicalMemory:            200 * units.MiB,
-			physicalEphemeralStorage:  300,
-			reservedCpu:               "0m",
-			reservedMemory:            fmt.Sprintf("%v", 0*units.MiB),
-			reservedEphemeralStorage:  "0Gi",
-			kubeReserved:              true,
-			isEphemeralStorageBlocked: false,
-			expectedErr:               false,
+			reservedCpu:                 "0m",
+			reservedMemory:              fmt.Sprintf("%v", 0*units.MiB),
+			reservedEphemeralStorage:    "0Gi",
+			kubeReserved:                true,
+			expectedErr:                 false,
+			physicalEphemeralStorageGiB: bootDiskSizeGiB,
 		},
+		{
+			scenario:                   "more local SSDs requested for ephemeral storage than attached",
+			kubeEnv:                    "AUTOSCALER_ENV_VARS: os_distribution=cos;os=linux;EPH_STORAGE_LOCAL_SSD_COUNT=1\n",
+			ephemeralStorageOnLocalSSD: true,
+			attachedLocalSSDCount:      0,
+			expectedErr:                true,
+		},
+		{
+			scenario:                    "all attached local SSDs requested for ephemeral storage",
+			kubeEnv:                     "AUTOSCALER_ENV_VARS: os_distribution=cos;os=linux;EPH_STORAGE_LOCAL_SSD_COUNT=2\n",
+			ephemeralStorageOnLocalSSD:  true,
+			attachedLocalSSDCount:       2,
+			expectedErr:                 false,
+			physicalEphemeralStorageGiB: localSSDSizeGiB * 2,
+		},
+		{
+			scenario:                    "more local SSDs attached than requested for ephemeral storage",
+			kubeEnv:                     "AUTOSCALER_ENV_VARS: os_distribution=cos;os=linux;EPH_STORAGE_LOCAL_SSD_COUNT=2\n",
+			ephemeralStorageOnLocalSSD:  true,
+			attachedLocalSSDCount:       4,
+			expectedErr:                 false,
+			physicalEphemeralStorageGiB: localSSDSizeGiB * 2,
+		},
 	}
 	for _, tc := range testCases {
@@ -158,16 +182,25 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
 					{
 						Boot: true,
 						InitializeParams: &gce.AttachedDiskInitializeParams{
-							DiskSizeGb: tc.physicalEphemeralStorage,
+							DiskSizeGb: bootDiskSizeGiB,
 						},
 					},
 				},
 			},
 		}
+		for i := int64(0); i < tc.attachedLocalSSDCount; i++ {
+			template.Properties.Disks = append(template.Properties.Disks, &gce.AttachedDisk{
+				Type: "SCRATCH",
+				InitializeParams: &gce.AttachedDiskInitializeParams{
+					DiskType:   "local-ssd",
+					DiskSizeGb: localSSDSizeGiB,
+				},
+			})
+		}
 		if tc.kubeEnv != "" {
 			template.Properties.Metadata.Items = []*gce.MetadataItems{{Key: "kube-env", Value: &tc.kubeEnv}}
 		}
-		node, err := tb.BuildNodeFromTemplate(mig, template, tc.physicalCpu, tc.physicalMemory, tc.pods)
+		node, err := tb.BuildNodeFromTemplate(mig, template, physicalCpu, physicalMemory, tc.pods)
 		if tc.expectedErr {
 			assert.Error(t, err)
 		} else {
@@ -176,11 +209,7 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
 			assert.NotNil(t, node.Status)
 			assert.NotNil(t, node.Status.Capacity)
 			assert.NotNil(t, node.Status.Allocatable)
-			physicalEphemeralStorage := tc.physicalEphemeralStorage
-			if tc.isEphemeralStorageBlocked {
-				physicalEphemeralStorage = 0
-			}
-			capacity, err := tb.BuildCapacity(tc.physicalCpu, tc.physicalMemory, tc.accelerators, OperatingSystemLinux, OperatingSystemDistributionCOS, physicalEphemeralStorage*units.GiB, tc.pods)
+			capacity, err := tb.BuildCapacity(physicalCpu, physicalMemory, tc.accelerators, OperatingSystemLinux, OperatingSystemDistributionCOS, tc.physicalEphemeralStorageGiB*units.GiB, tc.ephemeralStorageOnLocalSSD, tc.pods)
 			assert.NoError(t, err)
 			assertEqualResourceLists(t, "Capacity", capacity, node.Status.Capacity)
 			if !tc.kubeReserved {
@@ -474,7 +503,7 @@ func TestBuildCapacityMemory(t *testing.T) {
 		t.Run(fmt.Sprintf("%v", idx), func(t *testing.T) {
 			tb := GceTemplateBuilder{}
 			noAccelerators := make([]*gce.AcceleratorConfig, 0)
-			buildCapacity, err := tb.BuildCapacity(tc.physicalCpu, tc.physicalMemory, noAccelerators, tc.os, OperatingSystemDistributionCOS, -1, nil)
+			buildCapacity, err := tb.BuildCapacity(tc.physicalCpu, tc.physicalMemory, noAccelerators, tc.os, OperatingSystemDistributionCOS, -1, false, nil)
 			assert.NoError(t, err)
 			expectedCapacity, err := makeResourceList2(tc.physicalCpu, tc.expectedCapacityMemory, 0, 110)
 			assert.NoError(t, err)
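
For reference, the reservation arithmetic this patch changes can be checked
with a small standalone Go program. This is an illustrative sketch, not part
of the patch: the MiB/GiB constants and the reservedCOS helper are local
stand-ins for the constants in reserved.go and the COS branch of
CalculateOSReservedEphemeralStorage.

package main

import (
	"fmt"
	"math"
)

const (
	MiB = 1024 * 1024
	GiB = 1024 * MiB
)

// reservedCOS mirrors the post-patch COS branch: the fixed OS-partition
// estimate is skipped when ephemeral storage lives on local SSDs, leaving
// only the size-dependent terms.
func reservedCOS(diskSize int64, ephemeralStorageOnLocalSSDs bool) (storage int64) {
	if !ephemeralStorageOnLocalSSDs {
		storage += int64(math.Ceil(4.148 * GiB)) // os partition estimation
	}
	storage += int64(math.Ceil(0.015635 * float64(diskSize)))               // filesystem overhead
	storage += int64(math.Min(100*MiB, math.Ceil(0.001*float64(diskSize)))) // over-provisioning buffer
	return storage
}

func main() {
	// EPH_STORAGE_LOCAL_SSD_COUNT=2 => physical ephemeral storage is 2*375GiB.
	physical := int64(2*375) * GiB
	capacity := physical - reservedCOS(physical, true)
	fmt.Printf("physical: %d bytes, capacity: %d bytes\n", physical, capacity)
}

With EPH_STORAGE_LOCAL_SSD_COUNT=2, physical ephemeral storage is 2*375GiB =
750GiB, and the sketch subtracts roughly 11.73GiB of filesystem overhead plus
the 100MiB over-provisioning buffer; the fixed 4.148GiB OS-partition estimate
would apply only in the boot-disk case.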