From d3dd856a79bc3a21751ab903f89473ea09440684 Mon Sep 17 00:00:00 2001 From: adrienjt Date: Wed, 8 Sep 2021 14:57:52 -0700 Subject: [PATCH] GCE ephemeral storage on local SSDs If ephemeral_storage_local_ssd_count=N is specified in kube-env (in AUTOSCALER_ENV_VARS), physical ephemeral storage is N*375GiB instead of the boot disk size, and capacity is measured from GKE experiments. The existing BLOCK_EPH_STORAGE_BOOT_DISK is ignored if ephemeral_storage_local_ssd_count>0. --- .../cloudprovider/gce/reserved.go | 73 +++++++++- .../cloudprovider/gce/reserved_test.go | 47 +++++++ .../cloudprovider/gce/templates.go | 77 +++++++++-- .../cloudprovider/gce/templates_test.go | 128 +++++++++++++----- 4 files changed, 276 insertions(+), 49 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/gce/reserved.go b/cluster-autoscaler/cloudprovider/gce/reserved.go index aa69bc84f1ca..4d901d3ee689 100644 --- a/cluster-autoscaler/cloudprovider/gce/reserved.go +++ b/cluster-autoscaler/cloudprovider/gce/reserved.go @@ -23,12 +23,14 @@ import ( "strings" "k8s.io/apimachinery/pkg/api/resource" - klog "k8s.io/klog/v2" + "k8s.io/klog/v2" ) // There should be no imports as it is used standalone in e2e tests const ( + // KiB - KibiByte size (2^10) + KiB = 1024 // MiB - MebiByte size (2^20) MiB = 1024 * 1024 // GiB - GibiByte size (2^30) @@ -197,6 +199,75 @@ func parsePercentageToRatio(percentString string) (float64, error) { return percentVal / 100, nil } +// ephemeralStorageOnLocalSSDFilesystemOverheadInKiBByOSAndDiskCount was +// measured by creating 1-node nodepools in a GKE cluster with ephemeral +// storage on N local SSDs, measuring for each node +// N * 375GiB - .status.capacity["ephemeral-storage"] +var ephemeralStorageOnLocalSSDFilesystemOverheadInKiBByOSAndDiskCount = map[OperatingSystemDistribution]map[int64]int64{ + OperatingSystemDistributionCOS: { + 1: 7289472, + 2: 13725224, + 3: 20031312, + 4: 26332924, + 5: 32634536, + 6: 38946604, + 7: 45254008, + 8: 51556096, + 16: 
52837800, + 24: 78686620, + }, + OperatingSystemDistributionUbuntu: { + 1: 7219840, + 2: 13651496, + 3: 19953488, + 4: 26255100, + 5: 32556712, + 6: 38860588, + 7: 45163896, + 8: 51465984, + 16: 52747688, + 24: 78601704, + }, +} + +// EphemeralStorageOnLocalSSDFilesystemOverheadInBytes estimates the difference +// between the total physical capacity of the local SSDs and the ephemeral +// storage filesystem capacity. It uses experimental values measured for all +// possible disk counts in GKE. Custom Kubernetes on GCE may allow intermediate +// counts by attaching a measured count but using only some disks for ephemeral +// storage. In that case, the difference in overhead between GKE and custom node +// images may be higher than the difference in overhead between two disk counts, +// so interpolating wouldn't make much sense. Instead, we use the next count for +// which we measured a filesystem overhead, which is a safer approximation +// (better to reserve more and not scale up than not enough and not schedule). 
+func EphemeralStorageOnLocalSSDFilesystemOverheadInBytes(diskCount int64, osDistribution OperatingSystemDistribution) int64 { + var measuredCount int64 + if diskCount <= 8 { + measuredCount = diskCount + } else if diskCount <= 16 { + measuredCount = 16 + } else { + measuredCount = 24 // max attachable + } + + // the container runtime doesn't affect filesystem overhead + var measuredOS OperatingSystemDistribution + if osDistribution == OperatingSystemDistributionCOSContainerd { + measuredOS = OperatingSystemDistributionCOS + } else if osDistribution == OperatingSystemDistributionUbuntuContainerd { + measuredOS = OperatingSystemDistributionUbuntu + } else { + measuredOS = osDistribution + } + + o, ok := ephemeralStorageOnLocalSSDFilesystemOverheadInKiBByOSAndDiskCount[measuredOS] + if !ok { + klog.Errorf("Ephemeral storage backed by local SSDs is not supported for image family %v", osDistribution) + return 0 + } + return o[measuredCount] * KiB +} + // CalculateOSReservedEphemeralStorage estimates how much ephemeral storage OS will reserve and eviction threshold func CalculateOSReservedEphemeralStorage(diskSize int64, osDistribution OperatingSystemDistribution) int64 { switch osDistribution { diff --git a/cluster-autoscaler/cloudprovider/gce/reserved_test.go b/cluster-autoscaler/cloudprovider/gce/reserved_test.go index 9c9039dd3861..74b3b58b3898 100644 --- a/cluster-autoscaler/cloudprovider/gce/reserved_test.go +++ b/cluster-autoscaler/cloudprovider/gce/reserved_test.go @@ -103,3 +103,50 @@ func TestCalculateKernelReservedLinux(t *testing.T) { }) } } + +func TestEphemeralStorageOnLocalSSDFilesystemOverheadInBytes(t *testing.T) { + type testCase struct { + scenario string + diskCount int64 + osDistribution OperatingSystemDistribution + expected int64 + } + testCases := []testCase{ + { + scenario: "measured disk count and OS (cos)", + diskCount: 1, + osDistribution: OperatingSystemDistributionCOS, + expected: 7289472 * KiB, + }, + { + scenario: "measured disk count but 
OS with different container runtime (cos_containerd)", + diskCount: 1, + osDistribution: OperatingSystemDistributionCOSContainerd, + expected: 7289472 * KiB, // same as COS + }, + { + scenario: "measured disk count and OS (ubuntu)", + diskCount: 1, + osDistribution: OperatingSystemDistributionUbuntu, + expected: 7219840 * KiB, + }, + { + scenario: "measured disk count but OS with different container runtime (ubuntu_containerd)", + diskCount: 1, + osDistribution: OperatingSystemDistributionUbuntuContainerd, + expected: 7219840 * KiB, // same as Ubuntu + }, + { + scenario: "mapped disk count", + diskCount: 10, + osDistribution: OperatingSystemDistributionCOS, + expected: 52837800 * KiB, // value measured for 16 disks + }, + } + for _, tc := range testCases { + t.Run(tc.scenario, func(t *testing.T) { + actual := EphemeralStorageOnLocalSSDFilesystemOverheadInBytes(tc.diskCount, tc.osDistribution) + assert.Equal(t, tc.expected, actual) + }) + } +} diff --git a/cluster-autoscaler/cloudprovider/gce/templates.go b/cluster-autoscaler/cloudprovider/gce/templates.go index 4183034c6b27..8fc904f453d5 100644 --- a/cluster-autoscaler/cloudprovider/gce/templates.go +++ b/cluster-autoscaler/cloudprovider/gce/templates.go @@ -30,15 +30,18 @@ import ( apiv1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/klog/v2" + "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" "k8s.io/autoscaler/cluster-autoscaler/utils/gpu" "k8s.io/autoscaler/cluster-autoscaler/utils/units" - klog "k8s.io/klog/v2" ) // GceTemplateBuilder builds templates for GCE nodes. 
type GceTemplateBuilder struct{} +const LocalSSDDiskSizeInGiB = 375 + // TODO: This should be imported from sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/common/constants.go // This key is applicable to both GCE and GKE const gceCSITopologyKeyZone = "topology.gke.io/zone" @@ -54,7 +57,7 @@ func (t *GceTemplateBuilder) getAcceleratorCount(accelerators []*gce.Accelerator } // BuildCapacity builds a list of resource capacities given list of hardware. -func (t *GceTemplateBuilder) BuildCapacity(cpu int64, mem int64, accelerators []*gce.AcceleratorConfig, os OperatingSystem, osDistribution OperatingSystemDistribution, ephemeralStorage int64, pods *int64) (apiv1.ResourceList, error) { +func (t *GceTemplateBuilder) BuildCapacity(cpu int64, mem int64, accelerators []*gce.AcceleratorConfig, os OperatingSystem, osDistribution OperatingSystemDistribution, ephemeralStorage int64, ephemeralStorageLocalSSDCount int64, pods *int64) (apiv1.ResourceList, error) { capacity := apiv1.ResourceList{} if pods == nil { capacity[apiv1.ResourcePods] = *resource.NewQuantity(110, resource.DecimalSI) @@ -71,7 +74,12 @@ func (t *GceTemplateBuilder) BuildCapacity(cpu int64, mem int64, accelerators [] } if ephemeralStorage > 0 { - storageTotal := ephemeralStorage - CalculateOSReservedEphemeralStorage(ephemeralStorage, osDistribution) + var storageTotal int64 + if ephemeralStorageLocalSSDCount > 0 { + storageTotal = ephemeralStorage - EphemeralStorageOnLocalSSDFilesystemOverheadInBytes(ephemeralStorageLocalSSDCount, osDistribution) + } else { + storageTotal = ephemeralStorage - CalculateOSReservedEphemeralStorage(ephemeralStorage, osDistribution) + } capacity[apiv1.ResourceEphemeralStorage] = *resource.NewQuantity(int64(math.Max(float64(storageTotal), 0)), resource.DecimalSI) } @@ -166,15 +174,17 @@ func (t *GceTemplateBuilder) BuildNodeFromTemplate(mig Mig, template *gce.Instan } var ephemeralStorage int64 = -1 - if !isEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue) { - 
ephemeralStorage, err = getEphemeralStorageFromInstanceTemplateProperties(template.Properties) - if err != nil { - klog.Errorf("could not fetch ephemeral storage from instance template. %s", err) - return nil, err - } + ssdCount := ephemeralStorageLocalSSDCount(kubeEnvValue) + if ssdCount > 0 { + ephemeralStorage, err = getLocalSSDEphemeralStorageFromInstanceTemplateProperties(template.Properties, ssdCount) + } else if !isBootDiskEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue) { + ephemeralStorage, err = getBootDiskEphemeralStorageFromInstanceTemplateProperties(template.Properties) + } + if err != nil { + return nil, fmt.Errorf("could not fetch ephemeral storage from instance template: %v", err) } - capacity, err := t.BuildCapacity(cpu, mem, template.Properties.GuestAccelerators, os, osDistribution, ephemeralStorage, pods) + capacity, err := t.BuildCapacity(cpu, mem, template.Properties.GuestAccelerators, os, osDistribution, ephemeralStorage, ssdCount, pods) if err != nil { return nil, err } @@ -228,10 +238,51 @@ func (t *GceTemplateBuilder) BuildNodeFromTemplate(mig Mig, template *gce.Instan return &node, nil } -// isEphemeralStorageWithInstanceTemplateDisabled will allow bypassing Disk Size of Boot Disk from being +func ephemeralStorageLocalSSDCount(kubeEnvValue string) int64 { + v, found, err := extractAutoscalerVarFromKubeEnv(kubeEnvValue, "ephemeral_storage_local_ssd_count") + if err != nil { + klog.Warningf("cannot extract ephemeral_storage_local_ssd_count from kube-env, default to 0: %v", err) + return 0 + } + + if !found { + return 0 + } + + n, err := strconv.Atoi(v) + if err != nil { + klog.Warningf("cannot parse ephemeral_storage_local_ssd_count value, default to 0: %v", err) + return 0 + } + + return int64(n) +} + +func getLocalSSDEphemeralStorageFromInstanceTemplateProperties(instanceProperties *gce.InstanceProperties, ssdCount int64) (ephemeralStorage int64, err error) { + if instanceProperties.Disks == nil { + return 0, fmt.Errorf("instance 
properties disks is nil") + } + + var count int64 + for _, disk := range instanceProperties.Disks { + if disk != nil && disk.InitializeParams != nil { + if disk.Type == "SCRATCH" && disk.InitializeParams.DiskType == "local-ssd" { + count++ + } + } + } + + if count < ssdCount { + return 0, fmt.Errorf("actual local SSD count is lower than ephemeral_storage_local_ssd_count") + } + + return ssdCount * LocalSSDDiskSizeInGiB * units.GiB, nil +} + +// isBootDiskEphemeralStorageWithInstanceTemplateDisabled will allow bypassing Disk Size of Boot Disk from being // picked up from Instance Template and used as Ephemeral Storage, in case other type of storage are used // as ephemeral storage -func isEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue string) bool { +func isBootDiskEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue string) bool { v, found, err := extractAutoscalerVarFromKubeEnv(kubeEnvValue, "BLOCK_EPH_STORAGE_BOOT_DISK") if err == nil && found && v == "true" { return true @@ -239,7 +290,7 @@ func isEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue string) bool { return false } -func getEphemeralStorageFromInstanceTemplateProperties(instanceProperties *gce.InstanceProperties) (ephemeralStorage int64, err error) { +func getBootDiskEphemeralStorageFromInstanceTemplateProperties(instanceProperties *gce.InstanceProperties) (ephemeralStorage int64, err error) { if instanceProperties.Disks == nil { return 0, fmt.Errorf("unable to get ephemeral storage because instance properties disks is nil") } diff --git a/cluster-autoscaler/cloudprovider/gce/templates_test.go b/cluster-autoscaler/cloudprovider/gce/templates_test.go index ea6cb3e52a2e..496b880ab3c5 100644 --- a/cluster-autoscaler/cloudprovider/gce/templates_test.go +++ b/cluster-autoscaler/cloudprovider/gce/templates_test.go @@ -34,23 +34,35 @@ import ( quota "k8s.io/apiserver/pkg/quota/v1" ) +// TestBuildNodeFromTemplateSetsResources tests that capacity and allocatable +// are loaded into the 
node template status, a few error scenarios, and physical +// ephemeral storage (an intermediate result), but it doesn't test that capacity +// and allocatable are computed correctly, (the test itself calls +// GceTemplateBuilder.BuildCapacity, GceTemplateBuilder.CalculateAllocatable, +// and ParseEvictionHardOrGetDefault to compute expected values); computations +// are tested separately. func TestBuildNodeFromTemplateSetsResources(t *testing.T) { var thirtyPodsPerNode int64 = 30 type testCase struct { - scenario string - kubeEnv string - accelerators []*gce.AcceleratorConfig - mig Mig - physicalCpu int64 - physicalMemory int64 - physicalEphemeralStorage int64 - kubeReserved bool - reservedCpu string - reservedMemory string - reservedEphemeralStorage string - isEphemeralStorageBlocked bool - expectedErr bool - pods *int64 + scenario string + // test inputs + kubeEnv string + accelerators []*gce.AcceleratorConfig + attachedLocalSSDCount int64 + pods *int64 + // other test inputs (constant across test cases, because they are test invariants for now) + physicalCpu int64 + physicalMemory int64 + bootDiskSizeGiB int64 + // dependent inputs, should match kubeEnv, used to compute expected capacity and allocatable, out of test scope + kubeReserved bool + reservedCpu string + reservedMemory string + reservedEphemeralStorage string + isEphemeralStorageBlocked bool + ephemeralStorageLocalSSDCount int64 + // test outputs + expectedErr bool } testCases := []testCase{ { @@ -66,7 +78,7 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) { }, physicalCpu: 8, physicalMemory: 200 * units.MiB, - physicalEphemeralStorage: 300, + bootDiskSizeGiB: 300, kubeReserved: true, reservedCpu: "1000m", reservedMemory: fmt.Sprintf("%v", 1*units.MiB), @@ -112,7 +124,7 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) { "NODE_TAINTS: 'dedicated=ml:NoSchedule,test=dev:PreferNoSchedule,a=b:c'\n", physicalCpu: 8, physicalMemory: 200 * units.MiB, - physicalEphemeralStorage: 300, + 
bootDiskSizeGiB: 300, reservedCpu: "0m", reservedMemory: fmt.Sprintf("%v", 0*units.MiB), reservedEphemeralStorage: "0Gi", @@ -127,15 +139,49 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) { "DNS_SERVER_IP: '10.0.0.10'\n" + "AUTOSCALER_ENV_VARS: os_distribution=cos;os=linux;kube_reserved=cpu=0,memory=0,ephemeral-storage=0;BLOCK_EPH_STORAGE_BOOT_DISK=false\n" + "NODE_TAINTS: 'dedicated=ml:NoSchedule,test=dev:PreferNoSchedule,a=b:c'\n", - physicalCpu: 8, - physicalMemory: 200 * units.MiB, - physicalEphemeralStorage: 300, - reservedCpu: "0m", - reservedMemory: fmt.Sprintf("%v", 0*units.MiB), - reservedEphemeralStorage: "0Gi", - kubeReserved: true, - isEphemeralStorageBlocked: false, - expectedErr: false, + reservedCpu: "0m", + reservedMemory: fmt.Sprintf("%v", 0*units.MiB), + reservedEphemeralStorage: "0Gi", + kubeReserved: true, + expectedErr: false, + }, + { + scenario: "more local SSDs requested for ephemeral storage than attached", + kubeEnv: "AUTOSCALER_ENV_VARS: os_distribution=cos;os=linux;ephemeral_storage_local_ssd_count=1\n", + ephemeralStorageLocalSSDCount: 1, + attachedLocalSSDCount: 0, + expectedErr: true, + }, + { + scenario: "all attached local SSDs requested for ephemeral storage", + kubeEnv: "AUTOSCALER_ENV_VARS: os_distribution=cos;os=linux;ephemeral_storage_local_ssd_count=2\n", + physicalCpu: 8, + physicalMemory: 200 * units.MiB, + ephemeralStorageLocalSSDCount: 2, + attachedLocalSSDCount: 2, + expectedErr: false, + }, + { + scenario: "more local SSDs attached than requested for ephemeral storage", + kubeEnv: "AUTOSCALER_ENV_VARS: os_distribution=cos;os=linux;ephemeral_storage_local_ssd_count=2\n", + physicalCpu: 8, + physicalMemory: 200 * units.MiB, + ephemeralStorageLocalSSDCount: 2, + attachedLocalSSDCount: 4, + expectedErr: false, + }, + { + scenario: "ephemeral storage on local SSDs with kube-reserved", + kubeEnv: "AUTOSCALER_ENV_VARS: 
kube_reserved=cpu=0,memory=0,ephemeral-storage=10Gi;os_distribution=cos;os=linux;ephemeral_storage_local_ssd_count=2\n", + physicalCpu: 8, + physicalMemory: 200 * units.MiB, + ephemeralStorageLocalSSDCount: 2, + kubeReserved: true, + reservedCpu: "0m", + reservedMemory: fmt.Sprintf("%v", 0*units.MiB), + reservedEphemeralStorage: "10Gi", + attachedLocalSSDCount: 4, + expectedErr: false, }, } for _, tc := range testCases { @@ -158,12 +204,20 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) { { Boot: true, InitializeParams: &gce.AttachedDiskInitializeParams{ - DiskSizeGb: tc.physicalEphemeralStorage, + DiskSizeGb: tc.bootDiskSizeGiB, }, }, }, }, } + for i := int64(0); i < tc.attachedLocalSSDCount; i++ { + template.Properties.Disks = append(template.Properties.Disks, &gce.AttachedDisk{ + Type: "SCRATCH", + InitializeParams: &gce.AttachedDiskInitializeParams{ + DiskType: "local-ssd", + }, + }) + } if tc.kubeEnv != "" { template.Properties.Metadata.Items = []*gce.MetadataItems{{Key: "kube-env", Value: &tc.kubeEnv}} } @@ -176,11 +230,15 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) { assert.NotNil(t, node.Status) assert.NotNil(t, node.Status.Capacity) assert.NotNil(t, node.Status.Allocatable) - physicalEphemeralStorage := tc.physicalEphemeralStorage - if tc.isEphemeralStorageBlocked { - physicalEphemeralStorage = 0 + // this logic is a duplicate of logic under test and would best be captured by + // specifying physicalEphemeralStorageGiB in the testCase struct + physicalEphemeralStorageGiB := tc.bootDiskSizeGiB + if tc.ephemeralStorageLocalSSDCount > 0 { + physicalEphemeralStorageGiB = tc.ephemeralStorageLocalSSDCount * LocalSSDDiskSizeInGiB + } else if tc.isEphemeralStorageBlocked { + physicalEphemeralStorageGiB = 0 } - capacity, err := tb.BuildCapacity(tc.physicalCpu, tc.physicalMemory, tc.accelerators, OperatingSystemLinux, OperatingSystemDistributionCOS, physicalEphemeralStorage*units.GiB, tc.pods) + capacity, err := 
tb.BuildCapacity(tc.physicalCpu, tc.physicalMemory, tc.accelerators, OperatingSystemLinux, OperatingSystemDistributionCOS, physicalEphemeralStorageGiB*units.GiB, tc.ephemeralStorageLocalSSDCount, tc.pods) assert.NoError(t, err) assertEqualResourceLists(t, "Capacity", capacity, node.Status.Capacity) if !tc.kubeReserved { @@ -371,17 +429,17 @@ func TestParseEvictionHard(t *testing.T) { testCases := []testCase{{ memory: "200Mi", ephemeralStorage: "15%", - memoryExpected: 200 * 1024 * 1024, + memoryExpected: 200 * MiB, ephemeralStorageRatioExpected: 0.15, }, { memory: "2Gi", ephemeralStorage: "11.5%", - memoryExpected: 2 * 1024 * 1024 * 1024, + memoryExpected: 2 * GiB, ephemeralStorageRatioExpected: 0.115, }, { memory: "", ephemeralStorage: "", // empty string, fallback to default - memoryExpected: 100 * 1024 * 1024, + memoryExpected: 100 * MiB, ephemeralStorageRatioExpected: 0.1, }, { memory: "110292", @@ -391,7 +449,7 @@ func TestParseEvictionHard(t *testing.T) { }, { memory: "abcb12", // unparsable, fallback to default ephemeralStorage: "-11%", // negative percentage, should fallback to default - memoryExpected: 100 * 1024 * 1024, + memoryExpected: 100 * MiB, ephemeralStorageRatioExpected: 0.1, }} for _, tc := range testCases { @@ -474,7 +532,7 @@ func TestBuildCapacityMemory(t *testing.T) { t.Run(fmt.Sprintf("%v", idx), func(t *testing.T) { tb := GceTemplateBuilder{} noAccelerators := make([]*gce.AcceleratorConfig, 0) - buildCapacity, err := tb.BuildCapacity(tc.physicalCpu, tc.physicalMemory, noAccelerators, tc.os, OperatingSystemDistributionCOS, -1, nil) + buildCapacity, err := tb.BuildCapacity(tc.physicalCpu, tc.physicalMemory, noAccelerators, tc.os, OperatingSystemDistributionCOS, -1, 0, nil) assert.NoError(t, err) expectedCapacity, err := makeResourceList2(tc.physicalCpu, tc.expectedCapacityMemory, 0, 110) assert.NoError(t, err)