From 3e19bac810effffc8db97dd90ae6ea3ec5f0912f Mon Sep 17 00:00:00 2001
From: adrienjt
Date: Wed, 8 Sep 2021 14:57:52 -0700
Subject: [PATCH] GCE ephemeral storage on local SSDs

If ephemeral_storage_local_ssd_count=N is specified in kube-env (in
AUTOSCALER_ENV_VARS), physical ephemeral storage is N*375GiB (local SSDs
are 375GiB each) instead of the boot disk size, and the filesystem
overhead subtracted to compute capacity is taken from GKE measurements
instead of the boot disk estimation formula, e.g.:

AUTOSCALER_ENV_VARS: os_distribution=cos;os=linux;ephemeral_storage_local_ssd_count=2

The existing BLOCK_EPH_STORAGE_BOOT_DISK variable is ignored if
ephemeral_storage_local_ssd_count>0.
---
 .../cloudprovider/gce/reserved.go       |  79 +++++++++++--
 .../cloudprovider/gce/templates.go      |  76 ++++++++++--
 .../cloudprovider/gce/templates_test.go | 111 +++++++++++++-----
 3 files changed, 210 insertions(+), 56 deletions(-)

diff --git a/cluster-autoscaler/cloudprovider/gce/reserved.go b/cluster-autoscaler/cloudprovider/gce/reserved.go
index aa69bc84f1ca..c3113ff13e4f 100644
--- a/cluster-autoscaler/cloudprovider/gce/reserved.go
+++ b/cluster-autoscaler/cloudprovider/gce/reserved.go
@@ -23,7 +23,7 @@ import (
 	"strings"
 
 	"k8s.io/apimachinery/pkg/api/resource"
-	klog "k8s.io/klog/v2"
+	"k8s.io/klog/v2"
 )
 
 // There should be no imports as it is used standalone in e2e tests
@@ -197,23 +197,78 @@ func parsePercentageToRatio(percentString string) (float64, error) {
 	return percentVal / 100, nil
 }
 
+var ephemeralStorageOnLocalSSDFilesystemOverheadInKiBByOSAndDiskCount = map[OperatingSystemDistribution]map[int64]int64{
+	OperatingSystemDistributionCOS: {
+		1:  7289472,
+		2:  13725224,
+		3:  20031312,
+		4:  26332924,
+		5:  32634536,
+		6:  38946604,
+		7:  45254008,
+		8:  51556096,
+		16: 52837800,
+		24: 78686620,
+	},
+	OperatingSystemDistributionUbuntu: {
+		1:  7219840,
+		2:  13651496,
+		3:  19953488,
+		4:  26255100,
+		5:  32556712,
+		6:  38860588,
+		7:  45163896,
+		8:  51465984,
+		16: 52747688,
+		24: 78601704,
+	},
+}
+
+// mapActualToMeasuredEphemeralStorageLocalSSDCount returns the next local SSD
+// count for which we measured a filesystem overhead. We measured all possible
+// counts in GKE, but custom Kubernetes on GCE may allow intermediate counts,
+// e.g. attaching a measured number of disks while using only some of them for
+// ephemeral storage. In that case, the difference in overhead between GKE and
+// custom node images may be higher than the difference in overhead between
+// two disk counts, so interpolating wouldn't make the estimate more accurate.
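+//
+// Worked example (illustrative counts, not from the patch): 3 maps to 3
+// (measured directly), 10 maps to 16 (the next measured count), and 20 maps
+// to 24 (the maximum attachable).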
+func mapActualToMeasuredEphemeralStorageLocalSSDCount(count int64) int64 { + if count <= 8 { + return count + } + if count <= 16 { + return 16 + } + return 24 // max attachable +} + // CalculateOSReservedEphemeralStorage estimates how much ephemeral storage OS will reserve and eviction threshold -func CalculateOSReservedEphemeralStorage(diskSize int64, osDistribution OperatingSystemDistribution) int64 { +func CalculateOSReservedEphemeralStorage(diskSize int64, osDistribution OperatingSystemDistribution, ephemeralStorageLocalSSDCount int64) (storage int64) { switch osDistribution { case OperatingSystemDistributionCOS, OperatingSystemDistributionCOSContainerd: - storage := int64(math.Ceil(0.015635*float64(diskSize))) + int64(math.Ceil(4.148*GiB)) // os partition estimation - storage += int64(math.Min(100*MiB, math.Ceil(0.001*float64(diskSize)))) // over-provisioning buffer - return storage + if ephemeralStorageLocalSSDCount > 0 { + storage = 1024 * ephemeralStorageOnLocalSSDFilesystemOverheadInKiBByOSAndDiskCount[OperatingSystemDistributionCOS][mapActualToMeasuredEphemeralStorageLocalSSDCount(ephemeralStorageLocalSSDCount)] + } else { + storage = int64(math.Ceil(4.148*GiB)) + int64(math.Ceil(0.015635*float64(diskSize))) // os partition estimation + storage += int64(math.Min(100*MiB, math.Ceil(0.001*float64(diskSize)))) // over-provisioning buffer + } case OperatingSystemDistributionUbuntu, OperatingSystemDistributionUbuntuContainerd: - storage := int64(math.Ceil(0.03083*float64(diskSize))) + int64(math.Ceil(0.171*GiB)) // os partition estimation - storage += int64(math.Min(100*MiB, math.Ceil(0.001*float64(diskSize)))) // over-provisioning buffer - return storage + if ephemeralStorageLocalSSDCount > 0 { + storage = 1024 * ephemeralStorageOnLocalSSDFilesystemOverheadInKiBByOSAndDiskCount[OperatingSystemDistributionUbuntu][mapActualToMeasuredEphemeralStorageLocalSSDCount(ephemeralStorageLocalSSDCount)] + } else { + storage = int64(math.Ceil(0.171*GiB)) + int64(math.Ceil(0.03083*float64(diskSize))) // os partition estimation + storage += int64(math.Min(100*MiB, math.Ceil(0.001*float64(diskSize)))) // over-provisioning buffer + } case OperatingSystemDistributionWindowsLTSC, OperatingSystemDistributionWindowsSAC: - storage := int64(math.Ceil(0.1133 * GiB)) // os partition estimation - storage += int64(math.Ceil(0.010 * GiB)) // over-provisioning buffer - return storage + if ephemeralStorageLocalSSDCount > 0 { + klog.Errorf("Ephemeral storage backed by local SSDs is not supported for image family %v", osDistribution) + storage = 0 + } else { + storage = int64(math.Ceil(0.1133 * GiB)) // os partition estimation + storage += int64(math.Ceil(0.010 * GiB)) // filesystem overhead and over-provisioning buffer + } default: klog.Errorf("CalculateReservedAndEvictionEphemeralStorage called for unknown os distribution %v", osDistribution) - return 0 + storage = 0 } + return storage } diff --git a/cluster-autoscaler/cloudprovider/gce/templates.go b/cluster-autoscaler/cloudprovider/gce/templates.go index 4183034c6b27..aa51e85cc96e 100644 --- a/cluster-autoscaler/cloudprovider/gce/templates.go +++ b/cluster-autoscaler/cloudprovider/gce/templates.go @@ -30,10 +30,11 @@ import ( apiv1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/klog/v2" + "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" "k8s.io/autoscaler/cluster-autoscaler/utils/gpu" "k8s.io/autoscaler/cluster-autoscaler/utils/units" - klog "k8s.io/klog/v2" ) // GceTemplateBuilder builds 
templates for GCE nodes. @@ -54,7 +55,7 @@ func (t *GceTemplateBuilder) getAcceleratorCount(accelerators []*gce.Accelerator } // BuildCapacity builds a list of resource capacities given list of hardware. -func (t *GceTemplateBuilder) BuildCapacity(cpu int64, mem int64, accelerators []*gce.AcceleratorConfig, os OperatingSystem, osDistribution OperatingSystemDistribution, ephemeralStorage int64, pods *int64) (apiv1.ResourceList, error) { +func (t *GceTemplateBuilder) BuildCapacity(cpu int64, mem int64, accelerators []*gce.AcceleratorConfig, os OperatingSystem, osDistribution OperatingSystemDistribution, ephemeralStorage int64, ephemeralStorageLocalSSDCount int64, pods *int64) (apiv1.ResourceList, error) { capacity := apiv1.ResourceList{} if pods == nil { capacity[apiv1.ResourcePods] = *resource.NewQuantity(110, resource.DecimalSI) @@ -71,7 +72,7 @@ func (t *GceTemplateBuilder) BuildCapacity(cpu int64, mem int64, accelerators [] } if ephemeralStorage > 0 { - storageTotal := ephemeralStorage - CalculateOSReservedEphemeralStorage(ephemeralStorage, osDistribution) + storageTotal := ephemeralStorage - CalculateOSReservedEphemeralStorage(ephemeralStorage, osDistribution, ephemeralStorageLocalSSDCount) capacity[apiv1.ResourceEphemeralStorage] = *resource.NewQuantity(int64(math.Max(float64(storageTotal), 0)), resource.DecimalSI) } @@ -166,15 +167,17 @@ func (t *GceTemplateBuilder) BuildNodeFromTemplate(mig Mig, template *gce.Instan } var ephemeralStorage int64 = -1 - if !isEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue) { - ephemeralStorage, err = getEphemeralStorageFromInstanceTemplateProperties(template.Properties) - if err != nil { - klog.Errorf("could not fetch ephemeral storage from instance template. %s", err) - return nil, err - } + ssdCount := ephemeralStorageLocalSSDCount(kubeEnvValue) + if ssdCount > 0 { + ephemeralStorage, err = getLocalSSDEphemeralStorageFromInstanceTemplateProperties(template.Properties, ssdCount) + } else if !isBootDiskEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue) { + ephemeralStorage, err = getBootDiskEphemeralStorageFromInstanceTemplateProperties(template.Properties) + } + if err != nil { + return nil, fmt.Errorf("could not fetch ephemeral storage from instance template: %v", err) } - capacity, err := t.BuildCapacity(cpu, mem, template.Properties.GuestAccelerators, os, osDistribution, ephemeralStorage, pods) + capacity, err := t.BuildCapacity(cpu, mem, template.Properties.GuestAccelerators, os, osDistribution, ephemeralStorage, ssdCount, pods) if err != nil { return nil, err } @@ -228,10 +231,57 @@ func (t *GceTemplateBuilder) BuildNodeFromTemplate(mig Mig, template *gce.Instan return &node, nil } -// isEphemeralStorageWithInstanceTemplateDisabled will allow bypassing Disk Size of Boot Disk from being +func ephemeralStorageLocalSSDCount(kubeEnvValue string) int64 { + v, found, err := extractAutoscalerVarFromKubeEnv(kubeEnvValue, "ephemeral_storage_local_ssd_count") + if err != nil { + klog.Warningf("cannot extract ephemeral_storage_local_ssd_count from kube-env, default to 0: %v", err) + return 0 + } + + if !found { + return 0 + } + + n, err := strconv.Atoi(v) + if err != nil { + klog.Warningf("cannot parse ephemeral_storage_local_ssd_count value, default to 0: %v", err) + return 0 + } + + return int64(n) +} + +func getLocalSSDEphemeralStorageFromInstanceTemplateProperties(instanceProperties *gce.InstanceProperties, ssdCount int64) (ephemeralStorage int64, err error) { + if instanceProperties.Disks == nil { + return 0, fmt.Errorf("instance 
properties disks is nil")
+	}
+
+	var count int64
+	var diskSizeGiB int64
+	for _, disk := range instanceProperties.Disks {
+		if disk != nil && disk.InitializeParams != nil {
+			if disk.Type == "SCRATCH" && disk.InitializeParams.DiskType == "local-ssd" {
+				count++
+				if diskSizeGiB == 0 {
+					diskSizeGiB = disk.InitializeParams.DiskSizeGb
+				} else if diskSizeGiB != disk.InitializeParams.DiskSizeGb {
+					return 0, fmt.Errorf("local SSDs of different sizes are not supported")
+				}
+			}
+		}
+	}
+
+	if count < ssdCount {
+		return 0, fmt.Errorf("actual local SSD count is lower than ephemeral_storage_local_ssd_count")
+	}
+
+	return ssdCount * diskSizeGiB * units.GiB, nil
+}
+
+// isBootDiskEphemeralStorageWithInstanceTemplateDisabled will allow bypassing Disk Size of Boot Disk from being
 // picked up from Instance Template and used as Ephemeral Storage, in case other type of storage are used
 // as ephemeral storage
-func isEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue string) bool {
+func isBootDiskEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue string) bool {
 	v, found, err := extractAutoscalerVarFromKubeEnv(kubeEnvValue, "BLOCK_EPH_STORAGE_BOOT_DISK")
 	if err == nil && found && v == "true" {
 		return true
@@ -239,7 +289,7 @@ func isEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue string) bool {
 	return false
 }
 
-func getEphemeralStorageFromInstanceTemplateProperties(instanceProperties *gce.InstanceProperties) (ephemeralStorage int64, err error) {
+func getBootDiskEphemeralStorageFromInstanceTemplateProperties(instanceProperties *gce.InstanceProperties) (ephemeralStorage int64, err error) {
 	if instanceProperties.Disks == nil {
 		return 0, fmt.Errorf("unable to get ephemeral storage because instance properties disks is nil")
 	}
diff --git a/cluster-autoscaler/cloudprovider/gce/templates_test.go b/cluster-autoscaler/cloudprovider/gce/templates_test.go
index ea6cb3e52a2e..3cb125c92e19 100644
--- a/cluster-autoscaler/cloudprovider/gce/templates_test.go
+++ b/cluster-autoscaler/cloudprovider/gce/templates_test.go
@@ -34,23 +34,36 @@ import (
 	quota "k8s.io/apiserver/pkg/quota/v1"
 )
 
+// TestBuildNodeFromTemplateSetsResources tests that capacity and allocatable
+// are loaded into the node template status, covers a few error scenarios, and
+// checks physical ephemeral storage (an intermediate result). It doesn't test
+// that capacity and allocatable are computed correctly: the test itself calls
+// GceTemplateBuilder.BuildCapacity, GceTemplateBuilder.CalculateAllocatable,
+// and ParseEvictionHardOrGetDefault to compute the expected values, and those
+// computations are tested separately.
func TestBuildNodeFromTemplateSetsResources(t *testing.T) { var thirtyPodsPerNode int64 = 30 type testCase struct { - scenario string - kubeEnv string - accelerators []*gce.AcceleratorConfig - mig Mig - physicalCpu int64 - physicalMemory int64 - physicalEphemeralStorage int64 - kubeReserved bool - reservedCpu string - reservedMemory string - reservedEphemeralStorage string - isEphemeralStorageBlocked bool - expectedErr bool - pods *int64 + scenario string + // test inputs + kubeEnv string + accelerators []*gce.AcceleratorConfig + attachedLocalSSDCount int64 + pods *int64 + // other test inputs (constant across test cases, because they are test invariants for now) + physicalCpu int64 + physicalMemory int64 + bootDiskSizeGiB int64 + localSSDSizeGiB int64 + // dependent inputs, should match kubeEnv, used to compute expected capacity and allocatable, out of test scope + kubeReserved bool + reservedCpu string + reservedMemory string + reservedEphemeralStorage string + isEphemeralStorageBlocked bool + ephemeralStorageLocalSSDCount int64 + // test outputs + expectedErr bool } testCases := []testCase{ { @@ -66,7 +79,7 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) { }, physicalCpu: 8, physicalMemory: 200 * units.MiB, - physicalEphemeralStorage: 300, + bootDiskSizeGiB: 300, kubeReserved: true, reservedCpu: "1000m", reservedMemory: fmt.Sprintf("%v", 1*units.MiB), @@ -112,7 +125,7 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) { "NODE_TAINTS: 'dedicated=ml:NoSchedule,test=dev:PreferNoSchedule,a=b:c'\n", physicalCpu: 8, physicalMemory: 200 * units.MiB, - physicalEphemeralStorage: 300, + bootDiskSizeGiB: 300, reservedCpu: "0m", reservedMemory: fmt.Sprintf("%v", 0*units.MiB), reservedEphemeralStorage: "0Gi", @@ -127,15 +140,38 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) { "DNS_SERVER_IP: '10.0.0.10'\n" + "AUTOSCALER_ENV_VARS: os_distribution=cos;os=linux;kube_reserved=cpu=0,memory=0,ephemeral-storage=0;BLOCK_EPH_STORAGE_BOOT_DISK=false\n" + "NODE_TAINTS: 'dedicated=ml:NoSchedule,test=dev:PreferNoSchedule,a=b:c'\n", - physicalCpu: 8, - physicalMemory: 200 * units.MiB, - physicalEphemeralStorage: 300, - reservedCpu: "0m", - reservedMemory: fmt.Sprintf("%v", 0*units.MiB), - reservedEphemeralStorage: "0Gi", - kubeReserved: true, - isEphemeralStorageBlocked: false, - expectedErr: false, + reservedCpu: "0m", + reservedMemory: fmt.Sprintf("%v", 0*units.MiB), + reservedEphemeralStorage: "0Gi", + kubeReserved: true, + expectedErr: false, + }, + { + scenario: "more local SSDs requested for ephemeral storage than attached", + kubeEnv: "AUTOSCALER_ENV_VARS: os_distribution=cos;os=linux;ephemeral_storage_local_ssd_count=1\n", + ephemeralStorageLocalSSDCount: 1, + attachedLocalSSDCount: 0, + expectedErr: true, + }, + { + scenario: "all attached local SSDs requested for ephemeral storage", + kubeEnv: "AUTOSCALER_ENV_VARS: os_distribution=cos;os=linux;ephemeral_storage_local_ssd_count=2\n", + physicalCpu: 8, + physicalMemory: 200 * units.MiB, + ephemeralStorageLocalSSDCount: 2, + attachedLocalSSDCount: 2, + localSSDSizeGiB: 375, + expectedErr: false, + }, + { + scenario: "more local SSDs attached than requested for ephemeral storage", + kubeEnv: "AUTOSCALER_ENV_VARS: os_distribution=cos;os=linux;ephemeral_storage_local_ssd_count=2\n", + physicalCpu: 8, + physicalMemory: 200 * units.MiB, + ephemeralStorageLocalSSDCount: 2, + attachedLocalSSDCount: 4, + localSSDSizeGiB: 375, + expectedErr: false, }, } for _, tc := range testCases { @@ -158,12 +194,21 @@ func 
TestBuildNodeFromTemplateSetsResources(t *testing.T) { { Boot: true, InitializeParams: &gce.AttachedDiskInitializeParams{ - DiskSizeGb: tc.physicalEphemeralStorage, + DiskSizeGb: tc.bootDiskSizeGiB, }, }, }, }, } + for i := int64(0); i < tc.attachedLocalSSDCount; i++ { + template.Properties.Disks = append(template.Properties.Disks, &gce.AttachedDisk{ + Type: "SCRATCH", + InitializeParams: &gce.AttachedDiskInitializeParams{ + DiskType: "local-ssd", + DiskSizeGb: tc.localSSDSizeGiB, + }, + }) + } if tc.kubeEnv != "" { template.Properties.Metadata.Items = []*gce.MetadataItems{{Key: "kube-env", Value: &tc.kubeEnv}} } @@ -176,11 +221,15 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) { assert.NotNil(t, node.Status) assert.NotNil(t, node.Status.Capacity) assert.NotNil(t, node.Status.Allocatable) - physicalEphemeralStorage := tc.physicalEphemeralStorage - if tc.isEphemeralStorageBlocked { - physicalEphemeralStorage = 0 + // this logic is a duplicate of logic under test and would best be captured by + // specifying physicalEphemeralStorageGiB in the testCase struct + physicalEphemeralStorageGiB := tc.bootDiskSizeGiB + if tc.ephemeralStorageLocalSSDCount > 0 { + physicalEphemeralStorageGiB = tc.ephemeralStorageLocalSSDCount * tc.localSSDSizeGiB + } else if tc.isEphemeralStorageBlocked { + physicalEphemeralStorageGiB = 0 } - capacity, err := tb.BuildCapacity(tc.physicalCpu, tc.physicalMemory, tc.accelerators, OperatingSystemLinux, OperatingSystemDistributionCOS, physicalEphemeralStorage*units.GiB, tc.pods) + capacity, err := tb.BuildCapacity(tc.physicalCpu, tc.physicalMemory, tc.accelerators, OperatingSystemLinux, OperatingSystemDistributionCOS, physicalEphemeralStorageGiB*units.GiB, tc.ephemeralStorageLocalSSDCount, tc.pods) assert.NoError(t, err) assertEqualResourceLists(t, "Capacity", capacity, node.Status.Capacity) if !tc.kubeReserved { @@ -474,7 +523,7 @@ func TestBuildCapacityMemory(t *testing.T) { t.Run(fmt.Sprintf("%v", idx), func(t *testing.T) { tb := GceTemplateBuilder{} noAccelerators := make([]*gce.AcceleratorConfig, 0) - buildCapacity, err := tb.BuildCapacity(tc.physicalCpu, tc.physicalMemory, noAccelerators, tc.os, OperatingSystemDistributionCOS, -1, nil) + buildCapacity, err := tb.BuildCapacity(tc.physicalCpu, tc.physicalMemory, noAccelerators, tc.os, OperatingSystemDistributionCOS, -1, 0, nil) assert.NoError(t, err) expectedCapacity, err := makeResourceList2(tc.physicalCpu, tc.expectedCapacityMemory, 0, 110) assert.NoError(t, err)
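
Illustration (a standalone sketch, not part of the patch): the program below
recomputes the ephemeral storage capacity that BuildCapacity would report for
a COS node whose kube-env requests two 375GiB local SSDs. Physical storage is
ssdCount*diskSizeGiB*GiB, as in
getLocalSSDEphemeralStorageFromInstanceTemplateProperties, and the reserved
amount is 1024 times the measured per-count overhead in KiB, as in
CalculateOSReservedEphemeralStorage. The GiB constant and the overhead value
are copied from the patch; the disk count and size are assumptions chosen for
the example.

package main

import "fmt"

// GiB is the 1024-based constant, matching reserved.go and utils/units.
const GiB int64 = 1024 * 1024 * 1024

// Measured filesystem overhead in KiB for COS with 2 local SSDs, copied from
// ephemeralStorageOnLocalSSDFilesystemOverheadInKiBByOSAndDiskCount.
const overheadKiB int64 = 13725224

func main() {
	var ssdCount int64 = 2
	var diskSizeGiB int64 = 375 // standard GCE local SSD size

	physical := ssdCount * diskSizeGiB * GiB // 805306368000 bytes
	reserved := 1024 * overheadKiB           // 14054629376 bytes
	capacity := physical - reserved          // 791251738624 bytes

	fmt.Printf("ephemeral storage capacity: %d bytes (~%.1f GiB)\n",
		capacity, float64(capacity)/float64(GiB)) // ~736.9 GiB
}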