Skip to content

Commit

Permalink
GCE ephemeral storage on local SSDs
Browse files Browse the repository at this point in the history
If ephemeral_storage_local_ssd_count=N is specified in kube-env (in AUTOSCALER_ENV_VARS), physical ephemeral storage is N*375GiB (N local SSDs) instead of the boot disk size, and the filesystem overhead subtracted to obtain capacity comes from measurements made on GKE.

The existing BLOCK_EPH_STORAGE_BOOT_DISK setting is ignored if ephemeral_storage_local_ssd_count>0.
  • Loading branch information
adrienjt committed Sep 20, 2021
1 parent ecf0241 commit 3e19bac
Show file tree
Hide file tree
Showing 3 changed files with 210 additions and 56 deletions.
79 changes: 67 additions & 12 deletions cluster-autoscaler/cloudprovider/gce/reserved.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ import (
"strings"

"k8s.io/apimachinery/pkg/api/resource"
klog "k8s.io/klog/v2"
"k8s.io/klog/v2"
)

// There should be no imports as it is used standalone in e2e tests
Expand Down Expand Up @@ -197,23 +197,78 @@ func parsePercentageToRatio(percentString string) (float64, error) {
return percentVal / 100, nil
}

// ephemeralStorageOnLocalSSDFilesystemOverheadInKiBByOSAndDiskCount holds the
// filesystem overhead, in KiB, of ephemeral storage backed by local SSDs. The
// outer key is the OS distribution and the inner key is the number of local
// SSDs used for ephemeral storage. Entries exist only for the counts 1-8, 16
// and 24; CalculateOSReservedEphemeralStorage first maps the actual count to
// one of those keys via mapActualToMeasuredEphemeralStorageLocalSSDCount and
// then multiplies the value by 1024. The containerd variants of COS and Ubuntu
// are looked up under the plain COS and Ubuntu keys. Per the comment on the
// mapping function, the values were measured on GKE.
var ephemeralStorageOnLocalSSDFilesystemOverheadInKiBByOSAndDiskCount = map[OperatingSystemDistribution]map[int64]int64{
OperatingSystemDistributionCOS: {
1: 7289472,
2: 13725224,
3: 20031312,
4: 26332924,
5: 32634536,
6: 38946604,
7: 45254008,
8: 51556096,
16: 52837800,
24: 78686620,
},
OperatingSystemDistributionUbuntu: {
1: 7219840,
2: 13651496,
3: 19953488,
4: 26255100,
5: 32556712,
6: 38860588,
7: 45163896,
8: 51465984,
16: 52747688,
24: 78601704,
},
}

// mapActualToMeasuredEphemeralStorageLocalSSDCount rounds a local SSD count up
// to the nearest count for which a filesystem overhead was measured: counts up
// to 8 are returned unchanged, counts from 9 to 16 become 16, and anything
// larger becomes 24.
//
// Every count that GKE allows was measured. Custom Kubernetes on GCE may
// request an intermediate count, in which case the next measured count is
// assumed. No interpolation is attempted, because the overhead difference
// between GKE and custom node images may exceed the difference between two
// adjacent disk counts.
func mapActualToMeasuredEphemeralStorageLocalSSDCount(count int64) int64 {
	switch {
	case count <= 8:
		// Every count in this range has its own measurement.
		return count
	case count <= 16:
		return 16
	default:
		return 24 // max attachable
	}
}

// CalculateOSReservedEphemeralStorage estimates how much ephemeral storage OS will reserve and eviction threshold
func CalculateOSReservedEphemeralStorage(diskSize int64, osDistribution OperatingSystemDistribution) int64 {
func CalculateOSReservedEphemeralStorage(diskSize int64, osDistribution OperatingSystemDistribution, ephemeralStorageLocalSSDCount int64) (storage int64) {
switch osDistribution {
case OperatingSystemDistributionCOS, OperatingSystemDistributionCOSContainerd:
storage := int64(math.Ceil(0.015635*float64(diskSize))) + int64(math.Ceil(4.148*GiB)) // os partition estimation
storage += int64(math.Min(100*MiB, math.Ceil(0.001*float64(diskSize)))) // over-provisioning buffer
return storage
if ephemeralStorageLocalSSDCount > 0 {
storage = 1024 * ephemeralStorageOnLocalSSDFilesystemOverheadInKiBByOSAndDiskCount[OperatingSystemDistributionCOS][mapActualToMeasuredEphemeralStorageLocalSSDCount(ephemeralStorageLocalSSDCount)]
} else {
storage = int64(math.Ceil(4.148*GiB)) + int64(math.Ceil(0.015635*float64(diskSize))) // os partition estimation
storage += int64(math.Min(100*MiB, math.Ceil(0.001*float64(diskSize)))) // over-provisioning buffer
}
case OperatingSystemDistributionUbuntu, OperatingSystemDistributionUbuntuContainerd:
storage := int64(math.Ceil(0.03083*float64(diskSize))) + int64(math.Ceil(0.171*GiB)) // os partition estimation
storage += int64(math.Min(100*MiB, math.Ceil(0.001*float64(diskSize)))) // over-provisioning buffer
return storage
if ephemeralStorageLocalSSDCount > 0 {
storage = 1024 * ephemeralStorageOnLocalSSDFilesystemOverheadInKiBByOSAndDiskCount[OperatingSystemDistributionUbuntu][mapActualToMeasuredEphemeralStorageLocalSSDCount(ephemeralStorageLocalSSDCount)]
} else {
storage = int64(math.Ceil(0.171*GiB)) + int64(math.Ceil(0.03083*float64(diskSize))) // os partition estimation
storage += int64(math.Min(100*MiB, math.Ceil(0.001*float64(diskSize)))) // over-provisioning buffer
}
case OperatingSystemDistributionWindowsLTSC, OperatingSystemDistributionWindowsSAC:
storage := int64(math.Ceil(0.1133 * GiB)) // os partition estimation
storage += int64(math.Ceil(0.010 * GiB)) // over-provisioning buffer
return storage
if ephemeralStorageLocalSSDCount > 0 {
klog.Errorf("Ephemeral storage backed by local SSDs is not supported for image family %v", osDistribution)
storage = 0
} else {
storage = int64(math.Ceil(0.1133 * GiB)) // os partition estimation
storage += int64(math.Ceil(0.010 * GiB)) // filesystem overhead and over-provisioning buffer
}
default:
klog.Errorf("CalculateReservedAndEvictionEphemeralStorage called for unknown os distribution %v", osDistribution)
return 0
storage = 0
}
return storage
}
76 changes: 63 additions & 13 deletions cluster-autoscaler/cloudprovider/gce/templates.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,11 @@ import (
apiv1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/klog/v2"

"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
"k8s.io/autoscaler/cluster-autoscaler/utils/units"
klog "k8s.io/klog/v2"
)

// GceTemplateBuilder builds templates for GCE nodes.
Expand All @@ -54,7 +55,7 @@ func (t *GceTemplateBuilder) getAcceleratorCount(accelerators []*gce.Accelerator
}

// BuildCapacity builds a list of resource capacities given list of hardware.
func (t *GceTemplateBuilder) BuildCapacity(cpu int64, mem int64, accelerators []*gce.AcceleratorConfig, os OperatingSystem, osDistribution OperatingSystemDistribution, ephemeralStorage int64, pods *int64) (apiv1.ResourceList, error) {
func (t *GceTemplateBuilder) BuildCapacity(cpu int64, mem int64, accelerators []*gce.AcceleratorConfig, os OperatingSystem, osDistribution OperatingSystemDistribution, ephemeralStorage int64, ephemeralStorageLocalSSDCount int64, pods *int64) (apiv1.ResourceList, error) {
capacity := apiv1.ResourceList{}
if pods == nil {
capacity[apiv1.ResourcePods] = *resource.NewQuantity(110, resource.DecimalSI)
Expand All @@ -71,7 +72,7 @@ func (t *GceTemplateBuilder) BuildCapacity(cpu int64, mem int64, accelerators []
}

if ephemeralStorage > 0 {
storageTotal := ephemeralStorage - CalculateOSReservedEphemeralStorage(ephemeralStorage, osDistribution)
storageTotal := ephemeralStorage - CalculateOSReservedEphemeralStorage(ephemeralStorage, osDistribution, ephemeralStorageLocalSSDCount)
capacity[apiv1.ResourceEphemeralStorage] = *resource.NewQuantity(int64(math.Max(float64(storageTotal), 0)), resource.DecimalSI)
}

Expand Down Expand Up @@ -166,15 +167,17 @@ func (t *GceTemplateBuilder) BuildNodeFromTemplate(mig Mig, template *gce.Instan
}

var ephemeralStorage int64 = -1
if !isEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue) {
ephemeralStorage, err = getEphemeralStorageFromInstanceTemplateProperties(template.Properties)
if err != nil {
klog.Errorf("could not fetch ephemeral storage from instance template. %s", err)
return nil, err
}
ssdCount := ephemeralStorageLocalSSDCount(kubeEnvValue)
if ssdCount > 0 {
ephemeralStorage, err = getLocalSSDEphemeralStorageFromInstanceTemplateProperties(template.Properties, ssdCount)
} else if !isBootDiskEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue) {
ephemeralStorage, err = getBootDiskEphemeralStorageFromInstanceTemplateProperties(template.Properties)
}
if err != nil {
return nil, fmt.Errorf("could not fetch ephemeral storage from instance template: %v", err)
}

capacity, err := t.BuildCapacity(cpu, mem, template.Properties.GuestAccelerators, os, osDistribution, ephemeralStorage, pods)
capacity, err := t.BuildCapacity(cpu, mem, template.Properties.GuestAccelerators, os, osDistribution, ephemeralStorage, ssdCount, pods)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -228,18 +231,65 @@ func (t *GceTemplateBuilder) BuildNodeFromTemplate(mig Mig, template *gce.Instan
return &node, nil
}

// isEphemeralStorageWithInstanceTemplateDisabled will allow bypassing Disk Size of Boot Disk from being
// ephemeralStorageLocalSSDCount returns the value of the
// ephemeral_storage_local_ssd_count autoscaler variable found in kubeEnvValue.
// It returns 0 when the variable is absent, and also (after logging a warning)
// when it cannot be extracted or is not a valid integer.
func ephemeralStorageLocalSSDCount(kubeEnvValue string) int64 {
	raw, present, extractErr := extractAutoscalerVarFromKubeEnv(kubeEnvValue, "ephemeral_storage_local_ssd_count")
	switch {
	case extractErr != nil:
		klog.Warningf("cannot extract ephemeral_storage_local_ssd_count from kube-env, default to 0: %v", extractErr)
		return 0
	case !present:
		return 0
	}

	if parsed, parseErr := strconv.Atoi(raw); parseErr == nil {
		return int64(parsed)
	} else {
		klog.Warningf("cannot parse ephemeral_storage_local_ssd_count value, default to 0: %v", parseErr)
	}
	return 0
}

// getLocalSSDEphemeralStorageFromInstanceTemplateProperties computes the
// physical ephemeral storage provided by ssdCount local SSDs of an instance
// template, as ssdCount * per-disk size (DiskSizeGb) * units.GiB.
//
// Only disks with Type "SCRATCH" whose InitializeParams.DiskType is
// "local-ssd" are considered. A DiskSizeGb of 0 is treated as "size not
// specified" wherever it appears in the list, so the result no longer depends
// on the order of the disks; all disks that do specify a size must agree.
//
// It returns an error when the disk list is nil, when local SSDs declare
// different sizes, or when fewer local SSDs are attached than ssdCount.
func getLocalSSDEphemeralStorageFromInstanceTemplateProperties(instanceProperties *gce.InstanceProperties, ssdCount int64) (ephemeralStorage int64, err error) {
	if instanceProperties.Disks == nil {
		return 0, fmt.Errorf("instance properties disks is nil")
	}

	var count int64
	var diskSizeGiB int64 // 0 until a local SSD with an explicit size is seen
	for _, disk := range instanceProperties.Disks {
		if disk == nil || disk.InitializeParams == nil {
			continue
		}
		if disk.Type != "SCRATCH" || disk.InitializeParams.DiskType != "local-ssd" {
			continue
		}
		count++

		size := disk.InitializeParams.DiskSizeGb
		if size == 0 {
			// Unspecified size: counts as an attached disk but carries no
			// size information to compare against.
			continue
		}
		if diskSizeGiB == 0 {
			diskSizeGiB = size
		} else if diskSizeGiB != size {
			return 0, fmt.Errorf("local SSDs of different sizes are not supported")
		}
	}

	if count < ssdCount {
		return 0, fmt.Errorf("actual local SSD count is lower than ephemeral_storage_local_ssd_count")
	}

	return ssdCount * diskSizeGiB * units.GiB, nil
}

// isBootDiskEphemeralStorageWithInstanceTemplateDisabled will allow bypassing Disk Size of Boot Disk from being
// picked up from Instance Template and used as Ephemeral Storage, in case other types of storage are used
// as ephemeral storage
func isEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue string) bool {
func isBootDiskEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue string) bool {
v, found, err := extractAutoscalerVarFromKubeEnv(kubeEnvValue, "BLOCK_EPH_STORAGE_BOOT_DISK")
if err == nil && found && v == "true" {
return true
}
return false
}

func getEphemeralStorageFromInstanceTemplateProperties(instanceProperties *gce.InstanceProperties) (ephemeralStorage int64, err error) {
func getBootDiskEphemeralStorageFromInstanceTemplateProperties(instanceProperties *gce.InstanceProperties) (ephemeralStorage int64, err error) {
if instanceProperties.Disks == nil {
return 0, fmt.Errorf("unable to get ephemeral storage because instance properties disks is nil")
}
Expand Down
111 changes: 80 additions & 31 deletions cluster-autoscaler/cloudprovider/gce/templates_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,23 +34,36 @@ import (
quota "k8s.io/apiserver/pkg/quota/v1"
)

// TestBuildNodeFromTemplateSetsResources tests that capacity and allocatable
// are loaded into the node template status, a few error scenarios, and physical
// ephemeral storage (an intermediate result), but it doesn't test that capacity
// and allocatable are computed correctly, (the test itself calls
// GceTemplateBuilder.BuildCapacity, GceTemplateBuilder.CalculateAllocatable,
// and ParseEvictionHardOrGetDefault to compute expected values); computations
// are tested separately.
func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
var thirtyPodsPerNode int64 = 30
type testCase struct {
scenario string
kubeEnv string
accelerators []*gce.AcceleratorConfig
mig Mig
physicalCpu int64
physicalMemory int64
physicalEphemeralStorage int64
kubeReserved bool
reservedCpu string
reservedMemory string
reservedEphemeralStorage string
isEphemeralStorageBlocked bool
expectedErr bool
pods *int64
scenario string
// test inputs
kubeEnv string
accelerators []*gce.AcceleratorConfig
attachedLocalSSDCount int64
pods *int64
// other test inputs (constant across test cases, because they are test invariants for now)
physicalCpu int64
physicalMemory int64
bootDiskSizeGiB int64
localSSDSizeGiB int64
// dependent inputs, should match kubeEnv, used to compute expected capacity and allocatable, out of test scope
kubeReserved bool
reservedCpu string
reservedMemory string
reservedEphemeralStorage string
isEphemeralStorageBlocked bool
ephemeralStorageLocalSSDCount int64
// test outputs
expectedErr bool
}
testCases := []testCase{
{
Expand All @@ -66,7 +79,7 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
},
physicalCpu: 8,
physicalMemory: 200 * units.MiB,
physicalEphemeralStorage: 300,
bootDiskSizeGiB: 300,
kubeReserved: true,
reservedCpu: "1000m",
reservedMemory: fmt.Sprintf("%v", 1*units.MiB),
Expand Down Expand Up @@ -112,7 +125,7 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
"NODE_TAINTS: 'dedicated=ml:NoSchedule,test=dev:PreferNoSchedule,a=b:c'\n",
physicalCpu: 8,
physicalMemory: 200 * units.MiB,
physicalEphemeralStorage: 300,
bootDiskSizeGiB: 300,
reservedCpu: "0m",
reservedMemory: fmt.Sprintf("%v", 0*units.MiB),
reservedEphemeralStorage: "0Gi",
Expand All @@ -127,15 +140,38 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
"DNS_SERVER_IP: '10.0.0.10'\n" +
"AUTOSCALER_ENV_VARS: os_distribution=cos;os=linux;kube_reserved=cpu=0,memory=0,ephemeral-storage=0;BLOCK_EPH_STORAGE_BOOT_DISK=false\n" +
"NODE_TAINTS: 'dedicated=ml:NoSchedule,test=dev:PreferNoSchedule,a=b:c'\n",
physicalCpu: 8,
physicalMemory: 200 * units.MiB,
physicalEphemeralStorage: 300,
reservedCpu: "0m",
reservedMemory: fmt.Sprintf("%v", 0*units.MiB),
reservedEphemeralStorage: "0Gi",
kubeReserved: true,
isEphemeralStorageBlocked: false,
expectedErr: false,
reservedCpu: "0m",
reservedMemory: fmt.Sprintf("%v", 0*units.MiB),
reservedEphemeralStorage: "0Gi",
kubeReserved: true,
expectedErr: false,
},
{
scenario: "more local SSDs requested for ephemeral storage than attached",
kubeEnv: "AUTOSCALER_ENV_VARS: os_distribution=cos;os=linux;ephemeral_storage_local_ssd_count=1\n",
ephemeralStorageLocalSSDCount: 1,
attachedLocalSSDCount: 0,
expectedErr: true,
},
{
scenario: "all attached local SSDs requested for ephemeral storage",
kubeEnv: "AUTOSCALER_ENV_VARS: os_distribution=cos;os=linux;ephemeral_storage_local_ssd_count=2\n",
physicalCpu: 8,
physicalMemory: 200 * units.MiB,
ephemeralStorageLocalSSDCount: 2,
attachedLocalSSDCount: 2,
localSSDSizeGiB: 375,
expectedErr: false,
},
{
scenario: "more local SSDs attached than requested for ephemeral storage",
kubeEnv: "AUTOSCALER_ENV_VARS: os_distribution=cos;os=linux;ephemeral_storage_local_ssd_count=2\n",
physicalCpu: 8,
physicalMemory: 200 * units.MiB,
ephemeralStorageLocalSSDCount: 2,
attachedLocalSSDCount: 4,
localSSDSizeGiB: 375,
expectedErr: false,
},
}
for _, tc := range testCases {
Expand All @@ -158,12 +194,21 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
{
Boot: true,
InitializeParams: &gce.AttachedDiskInitializeParams{
DiskSizeGb: tc.physicalEphemeralStorage,
DiskSizeGb: tc.bootDiskSizeGiB,
},
},
},
},
}
for i := int64(0); i < tc.attachedLocalSSDCount; i++ {
template.Properties.Disks = append(template.Properties.Disks, &gce.AttachedDisk{
Type: "SCRATCH",
InitializeParams: &gce.AttachedDiskInitializeParams{
DiskType: "local-ssd",
DiskSizeGb: tc.localSSDSizeGiB,
},
})
}
if tc.kubeEnv != "" {
template.Properties.Metadata.Items = []*gce.MetadataItems{{Key: "kube-env", Value: &tc.kubeEnv}}
}
Expand All @@ -176,11 +221,15 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
assert.NotNil(t, node.Status)
assert.NotNil(t, node.Status.Capacity)
assert.NotNil(t, node.Status.Allocatable)
physicalEphemeralStorage := tc.physicalEphemeralStorage
if tc.isEphemeralStorageBlocked {
physicalEphemeralStorage = 0
// this logic is a duplicate of logic under test and would best be captured by
// specifying physicalEphemeralStorageGiB in the testCase struct
physicalEphemeralStorageGiB := tc.bootDiskSizeGiB
if tc.ephemeralStorageLocalSSDCount > 0 {
physicalEphemeralStorageGiB = tc.ephemeralStorageLocalSSDCount * tc.localSSDSizeGiB
} else if tc.isEphemeralStorageBlocked {
physicalEphemeralStorageGiB = 0
}
capacity, err := tb.BuildCapacity(tc.physicalCpu, tc.physicalMemory, tc.accelerators, OperatingSystemLinux, OperatingSystemDistributionCOS, physicalEphemeralStorage*units.GiB, tc.pods)
capacity, err := tb.BuildCapacity(tc.physicalCpu, tc.physicalMemory, tc.accelerators, OperatingSystemLinux, OperatingSystemDistributionCOS, physicalEphemeralStorageGiB*units.GiB, tc.ephemeralStorageLocalSSDCount, tc.pods)
assert.NoError(t, err)
assertEqualResourceLists(t, "Capacity", capacity, node.Status.Capacity)
if !tc.kubeReserved {
Expand Down Expand Up @@ -474,7 +523,7 @@ func TestBuildCapacityMemory(t *testing.T) {
t.Run(fmt.Sprintf("%v", idx), func(t *testing.T) {
tb := GceTemplateBuilder{}
noAccelerators := make([]*gce.AcceleratorConfig, 0)
buildCapacity, err := tb.BuildCapacity(tc.physicalCpu, tc.physicalMemory, noAccelerators, tc.os, OperatingSystemDistributionCOS, -1, nil)
buildCapacity, err := tb.BuildCapacity(tc.physicalCpu, tc.physicalMemory, noAccelerators, tc.os, OperatingSystemDistributionCOS, -1, 0, nil)
assert.NoError(t, err)
expectedCapacity, err := makeResourceList2(tc.physicalCpu, tc.expectedCapacityMemory, 0, 110)
assert.NoError(t, err)
Expand Down

0 comments on commit 3e19bac

Please sign in to comment.