Skip to content

Commit

Permalink
GCE ephemeral storage on local SSDs
Browse files Browse the repository at this point in the history
If ephemeral_storage_local_ssd_count=N is specified in kube-env (in AUTOSCALER_ENV_VARS), physical ephemeral storage is N*375GiB instead of the boot disk size; the filesystem overhead subtracted from it to obtain reported capacity was measured in GKE experiments.

The existing BLOCK_EPH_STORAGE_BOOT_DISK is ignored if ephemeral_storage_local_ssd_count>0.
  • Loading branch information
adrienjt committed Sep 22, 2021
1 parent ecf0241 commit 255594e
Show file tree
Hide file tree
Showing 3 changed files with 204 additions and 45 deletions.
64 changes: 63 additions & 1 deletion cluster-autoscaler/cloudprovider/gce/reserved.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ import (
"strings"

"k8s.io/apimachinery/pkg/api/resource"
klog "k8s.io/klog/v2"
"k8s.io/klog/v2"
)

// There should be no imports as it is used standalone in e2e tests
Expand Down Expand Up @@ -197,6 +197,68 @@ func parsePercentageToRatio(percentString string) (float64, error) {
return percentVal / 100, nil
}

// ephemeralStorageOnLocalSSDFilesystemOverheadInKiBByOSAndDiskCount was
// measured by creating 1-node nodepools in a GKE cluster with ephemeral
// storage on N local SSDs, measuring for each node
// N * 375GiB - .status.capacity["ephemeral-storage"]
//
// Outer map key: OS distribution of the node image. Inner map key: number of
// local SSDs backing ephemeral storage (only the counts that were actually
// measured appear; see mapActualToMeasuredEphemeralStorageLocalSSDCount for
// how other counts are mapped onto these). Inner map value: observed
// filesystem overhead in KiB (callers multiply by 1024 to get bytes).
var ephemeralStorageOnLocalSSDFilesystemOverheadInKiBByOSAndDiskCount = map[OperatingSystemDistribution]map[int64]int64{
	OperatingSystemDistributionCOS: {
		1:  7289472,
		2:  13725224,
		3:  20031312,
		4:  26332924,
		5:  32634536,
		6:  38946604,
		7:  45254008,
		8:  51556096,
		16: 52837800,
		24: 78686620,
	},
	OperatingSystemDistributionUbuntu: {
		1:  7219840,
		2:  13651496,
		3:  19953488,
		4:  26255100,
		5:  32556712,
		6:  38860588,
		7:  45163896,
		8:  51465984,
		16: 52747688,
		24: 78601704,
	},
}

// mapActualToMeasuredEphemeralStorageLocalSSDCount returns the next local SSD
// count for which we measured a filesystem overhead. We measured all possible
// counts in GKE, but custom Kubernetes on GCE may allow intermediate counts,
// attaching the measured count, but not using it all for ephemeral storage. In
// that case, the difference in overhead between GKE and custom node images may
// be higher than the difference in overhead between two disk counts, so
// interpolating wouldn't make much sense.
func mapActualToMeasuredEphemeralStorageLocalSSDCount(count int64) int64 {
	switch {
	case count <= 8:
		// Every count from 0 through 8 was measured directly.
		return count
	case count <= 16:
		// 9..16 round up to the measured 16-disk value.
		return 16
	default:
		return 24 // max attachable
	}
}

// EphemeralStorageOnLocalSSDFilesystemOverheadInBytes estimates the difference
// between the total physical capacity of the local SSDs and the ephemeral
// storage filesystem capacity. It uses experimental values measured in GKE,
// which are good-enough approximations for custom Kubernetes on GCE.
// Returns 0 (and logs an error) for OS distributions with no measurements.
func EphemeralStorageOnLocalSSDFilesystemOverheadInBytes(diskCount int64, osDistribution OperatingSystemDistribution) int64 {
	overheadsByCount, ok := ephemeralStorageOnLocalSSDFilesystemOverheadInKiBByOSAndDiskCount[osDistribution]
	if !ok {
		klog.Errorf("Ephemeral storage backed by local SSDs is not supported for image family %v", osDistribution)
		return 0
	}
	// Snap the actual disk count to the nearest measured count, then convert
	// the stored KiB overhead to bytes.
	measuredCount := mapActualToMeasuredEphemeralStorageLocalSSDCount(diskCount)
	return overheadsByCount[measuredCount] * 1024
}

// CalculateOSReservedEphemeralStorage estimates how much ephemeral storage OS will reserve and eviction threshold
func CalculateOSReservedEphemeralStorage(diskSize int64, osDistribution OperatingSystemDistribution) int64 {
switch osDistribution {
Expand Down
77 changes: 64 additions & 13 deletions cluster-autoscaler/cloudprovider/gce/templates.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,11 @@ import (
apiv1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/klog/v2"

"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
"k8s.io/autoscaler/cluster-autoscaler/utils/units"
klog "k8s.io/klog/v2"
)

// GceTemplateBuilder builds templates for GCE nodes.
Expand All @@ -54,7 +55,7 @@ func (t *GceTemplateBuilder) getAcceleratorCount(accelerators []*gce.Accelerator
}

// BuildCapacity builds a list of resource capacities given list of hardware.
func (t *GceTemplateBuilder) BuildCapacity(cpu int64, mem int64, accelerators []*gce.AcceleratorConfig, os OperatingSystem, osDistribution OperatingSystemDistribution, ephemeralStorage int64, pods *int64) (apiv1.ResourceList, error) {
func (t *GceTemplateBuilder) BuildCapacity(cpu int64, mem int64, accelerators []*gce.AcceleratorConfig, os OperatingSystem, osDistribution OperatingSystemDistribution, ephemeralStorage int64, ephemeralStorageLocalSSDCount int64, pods *int64) (apiv1.ResourceList, error) {
capacity := apiv1.ResourceList{}
if pods == nil {
capacity[apiv1.ResourcePods] = *resource.NewQuantity(110, resource.DecimalSI)
Expand All @@ -71,7 +72,12 @@ func (t *GceTemplateBuilder) BuildCapacity(cpu int64, mem int64, accelerators []
}

if ephemeralStorage > 0 {
storageTotal := ephemeralStorage - CalculateOSReservedEphemeralStorage(ephemeralStorage, osDistribution)
var storageTotal int64
if ephemeralStorageLocalSSDCount > 0 {
storageTotal = ephemeralStorage - EphemeralStorageOnLocalSSDFilesystemOverheadInBytes(ephemeralStorageLocalSSDCount, osDistribution)
} else {
storageTotal = ephemeralStorage - CalculateOSReservedEphemeralStorage(ephemeralStorage, osDistribution)
}
capacity[apiv1.ResourceEphemeralStorage] = *resource.NewQuantity(int64(math.Max(float64(storageTotal), 0)), resource.DecimalSI)
}

Expand Down Expand Up @@ -166,15 +172,17 @@ func (t *GceTemplateBuilder) BuildNodeFromTemplate(mig Mig, template *gce.Instan
}

var ephemeralStorage int64 = -1
if !isEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue) {
ephemeralStorage, err = getEphemeralStorageFromInstanceTemplateProperties(template.Properties)
if err != nil {
klog.Errorf("could not fetch ephemeral storage from instance template. %s", err)
return nil, err
}
ssdCount := ephemeralStorageLocalSSDCount(kubeEnvValue)
if ssdCount > 0 {
ephemeralStorage, err = getLocalSSDEphemeralStorageFromInstanceTemplateProperties(template.Properties, ssdCount)
} else if !isBootDiskEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue) {
ephemeralStorage, err = getBootDiskEphemeralStorageFromInstanceTemplateProperties(template.Properties)
}
if err != nil {
return nil, fmt.Errorf("could not fetch ephemeral storage from instance template: %v", err)
}

capacity, err := t.BuildCapacity(cpu, mem, template.Properties.GuestAccelerators, os, osDistribution, ephemeralStorage, pods)
capacity, err := t.BuildCapacity(cpu, mem, template.Properties.GuestAccelerators, os, osDistribution, ephemeralStorage, ssdCount, pods)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -228,18 +236,61 @@ func (t *GceTemplateBuilder) BuildNodeFromTemplate(mig Mig, template *gce.Instan
return &node, nil
}

// isEphemeralStorageWithInstanceTemplateDisabled will allow bypassing Disk Size of Boot Disk from being
// ephemeralStorageLocalSSDCount extracts ephemeral_storage_local_ssd_count
// from the AUTOSCALER_ENV_VARS variable of the given kube-env value. It
// returns 0 (with a warning logged) when the variable is absent, cannot be
// extracted, or cannot be parsed, so callers fall back to boot-disk-backed
// ephemeral storage.
func ephemeralStorageLocalSSDCount(kubeEnvValue string) int64 {
	v, found, err := extractAutoscalerVarFromKubeEnv(kubeEnvValue, "ephemeral_storage_local_ssd_count")
	if err != nil {
		klog.Warningf("cannot extract ephemeral_storage_local_ssd_count from kube-env, default to 0: %v", err)
		return 0
	}

	if !found {
		return 0
	}

	// Parse directly into int64 rather than strconv.Atoi + int64(n): Atoi goes
	// through the platform-dependent int type, which is 32-bit on some builds.
	n, err := strconv.ParseInt(v, 10, 64)
	if err != nil {
		klog.Warningf("cannot parse ephemeral_storage_local_ssd_count value, default to 0: %v", err)
		return 0
	}

	return n
}

// localSSDDiskSizeInGiB is the fixed size of a single GCE local SSD, in GiB.
const localSSDDiskSizeInGiB = 375

// getLocalSSDEphemeralStorageFromInstanceTemplateProperties returns the
// physical size in bytes of the ssdCount local SSDs backing ephemeral
// storage, after verifying that at least that many local SSDs are attached
// in the instance template.
func getLocalSSDEphemeralStorageFromInstanceTemplateProperties(instanceProperties *gce.InstanceProperties, ssdCount int64) (ephemeralStorage int64, err error) {
	if instanceProperties.Disks == nil {
		return 0, fmt.Errorf("instance properties disks is nil")
	}

	// Count the attached local SSDs (scratch disks of the fixed local SSD size).
	var attached int64
	for _, disk := range instanceProperties.Disks {
		if disk == nil || disk.InitializeParams == nil {
			continue
		}
		isLocalSSD := disk.Type == "SCRATCH" &&
			disk.InitializeParams.DiskType == "local-ssd" &&
			disk.InitializeParams.DiskSizeGb == localSSDDiskSizeInGiB
		if isLocalSSD {
			attached++
		}
	}

	// More SSDs may be attached than requested for ephemeral storage, but
	// never fewer.
	if attached < ssdCount {
		return 0, fmt.Errorf("actual local SSD count is lower than ephemeral_storage_local_ssd_count")
	}

	return ssdCount * localSSDDiskSizeInGiB * units.GiB, nil
}

// isBootDiskEphemeralStorageWithInstanceTemplateDisabled will allow bypassing Disk Size of Boot Disk from being
// picked up from Instance Template and used as Ephemeral Storage, in case other type of storage are used
// as ephemeral storage
func isEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue string) bool {
func isBootDiskEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue string) bool {
v, found, err := extractAutoscalerVarFromKubeEnv(kubeEnvValue, "BLOCK_EPH_STORAGE_BOOT_DISK")
if err == nil && found && v == "true" {
return true
}
return false
}

func getEphemeralStorageFromInstanceTemplateProperties(instanceProperties *gce.InstanceProperties) (ephemeralStorage int64, err error) {
func getBootDiskEphemeralStorageFromInstanceTemplateProperties(instanceProperties *gce.InstanceProperties) (ephemeralStorage int64, err error) {
if instanceProperties.Disks == nil {
return 0, fmt.Errorf("unable to get ephemeral storage because instance properties disks is nil")
}
Expand Down
108 changes: 77 additions & 31 deletions cluster-autoscaler/cloudprovider/gce/templates_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,23 +34,35 @@ import (
quota "k8s.io/apiserver/pkg/quota/v1"
)

// TestBuildNodeFromTemplateSetsResources tests that capacity and allocatable
// are loaded into the node template status, a few error scenarios, and physical
// ephemeral storage (an intermediate result), but it doesn't test that capacity
// and allocatable are computed correctly, (the test itself calls
// GceTemplateBuilder.BuildCapacity, GceTemplateBuilder.CalculateAllocatable,
// and ParseEvictionHardOrGetDefault to compute expected values); computations
// are tested separately.
func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
var thirtyPodsPerNode int64 = 30
type testCase struct {
scenario string
kubeEnv string
accelerators []*gce.AcceleratorConfig
mig Mig
physicalCpu int64
physicalMemory int64
physicalEphemeralStorage int64
kubeReserved bool
reservedCpu string
reservedMemory string
reservedEphemeralStorage string
isEphemeralStorageBlocked bool
expectedErr bool
pods *int64
scenario string
// test inputs
kubeEnv string
accelerators []*gce.AcceleratorConfig
attachedLocalSSDCount int64
pods *int64
// other test inputs (constant across test cases, because they are test invariants for now)
physicalCpu int64
physicalMemory int64
bootDiskSizeGiB int64
// dependent inputs, should match kubeEnv, used to compute expected capacity and allocatable, out of test scope
kubeReserved bool
reservedCpu string
reservedMemory string
reservedEphemeralStorage string
isEphemeralStorageBlocked bool
ephemeralStorageLocalSSDCount int64
// test outputs
expectedErr bool
}
testCases := []testCase{
{
Expand All @@ -66,7 +78,7 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
},
physicalCpu: 8,
physicalMemory: 200 * units.MiB,
physicalEphemeralStorage: 300,
bootDiskSizeGiB: 300,
kubeReserved: true,
reservedCpu: "1000m",
reservedMemory: fmt.Sprintf("%v", 1*units.MiB),
Expand Down Expand Up @@ -112,7 +124,7 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
"NODE_TAINTS: 'dedicated=ml:NoSchedule,test=dev:PreferNoSchedule,a=b:c'\n",
physicalCpu: 8,
physicalMemory: 200 * units.MiB,
physicalEphemeralStorage: 300,
bootDiskSizeGiB: 300,
reservedCpu: "0m",
reservedMemory: fmt.Sprintf("%v", 0*units.MiB),
reservedEphemeralStorage: "0Gi",
Expand All @@ -127,15 +139,36 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
"DNS_SERVER_IP: '10.0.0.10'\n" +
"AUTOSCALER_ENV_VARS: os_distribution=cos;os=linux;kube_reserved=cpu=0,memory=0,ephemeral-storage=0;BLOCK_EPH_STORAGE_BOOT_DISK=false\n" +
"NODE_TAINTS: 'dedicated=ml:NoSchedule,test=dev:PreferNoSchedule,a=b:c'\n",
physicalCpu: 8,
physicalMemory: 200 * units.MiB,
physicalEphemeralStorage: 300,
reservedCpu: "0m",
reservedMemory: fmt.Sprintf("%v", 0*units.MiB),
reservedEphemeralStorage: "0Gi",
kubeReserved: true,
isEphemeralStorageBlocked: false,
expectedErr: false,
reservedCpu: "0m",
reservedMemory: fmt.Sprintf("%v", 0*units.MiB),
reservedEphemeralStorage: "0Gi",
kubeReserved: true,
expectedErr: false,
},
{
scenario: "more local SSDs requested for ephemeral storage than attached",
kubeEnv: "AUTOSCALER_ENV_VARS: os_distribution=cos;os=linux;ephemeral_storage_local_ssd_count=1\n",
ephemeralStorageLocalSSDCount: 1,
attachedLocalSSDCount: 0,
expectedErr: true,
},
{
scenario: "all attached local SSDs requested for ephemeral storage",
kubeEnv: "AUTOSCALER_ENV_VARS: os_distribution=cos;os=linux;ephemeral_storage_local_ssd_count=2\n",
physicalCpu: 8,
physicalMemory: 200 * units.MiB,
ephemeralStorageLocalSSDCount: 2,
attachedLocalSSDCount: 2,
expectedErr: false,
},
{
scenario: "more local SSDs attached than requested for ephemeral storage",
kubeEnv: "AUTOSCALER_ENV_VARS: os_distribution=cos;os=linux;ephemeral_storage_local_ssd_count=2\n",
physicalCpu: 8,
physicalMemory: 200 * units.MiB,
ephemeralStorageLocalSSDCount: 2,
attachedLocalSSDCount: 4,
expectedErr: false,
},
}
for _, tc := range testCases {
Expand All @@ -158,12 +191,21 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
{
Boot: true,
InitializeParams: &gce.AttachedDiskInitializeParams{
DiskSizeGb: tc.physicalEphemeralStorage,
DiskSizeGb: tc.bootDiskSizeGiB,
},
},
},
},
}
for i := int64(0); i < tc.attachedLocalSSDCount; i++ {
template.Properties.Disks = append(template.Properties.Disks, &gce.AttachedDisk{
Type: "SCRATCH",
InitializeParams: &gce.AttachedDiskInitializeParams{
DiskType: "local-ssd",
DiskSizeGb: localSSDDiskSizeInGiB,
},
})
}
if tc.kubeEnv != "" {
template.Properties.Metadata.Items = []*gce.MetadataItems{{Key: "kube-env", Value: &tc.kubeEnv}}
}
Expand All @@ -176,11 +218,15 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
assert.NotNil(t, node.Status)
assert.NotNil(t, node.Status.Capacity)
assert.NotNil(t, node.Status.Allocatable)
physicalEphemeralStorage := tc.physicalEphemeralStorage
if tc.isEphemeralStorageBlocked {
physicalEphemeralStorage = 0
// this logic is a duplicate of logic under test and would best be captured by
// specifying physicalEphemeralStorageGiB in the testCase struct
physicalEphemeralStorageGiB := tc.bootDiskSizeGiB
if tc.ephemeralStorageLocalSSDCount > 0 {
physicalEphemeralStorageGiB = tc.ephemeralStorageLocalSSDCount * localSSDDiskSizeInGiB
} else if tc.isEphemeralStorageBlocked {
physicalEphemeralStorageGiB = 0
}
capacity, err := tb.BuildCapacity(tc.physicalCpu, tc.physicalMemory, tc.accelerators, OperatingSystemLinux, OperatingSystemDistributionCOS, physicalEphemeralStorage*units.GiB, tc.pods)
capacity, err := tb.BuildCapacity(tc.physicalCpu, tc.physicalMemory, tc.accelerators, OperatingSystemLinux, OperatingSystemDistributionCOS, physicalEphemeralStorageGiB*units.GiB, tc.ephemeralStorageLocalSSDCount, tc.pods)
assert.NoError(t, err)
assertEqualResourceLists(t, "Capacity", capacity, node.Status.Capacity)
if !tc.kubeReserved {
Expand Down Expand Up @@ -474,7 +520,7 @@ func TestBuildCapacityMemory(t *testing.T) {
t.Run(fmt.Sprintf("%v", idx), func(t *testing.T) {
tb := GceTemplateBuilder{}
noAccelerators := make([]*gce.AcceleratorConfig, 0)
buildCapacity, err := tb.BuildCapacity(tc.physicalCpu, tc.physicalMemory, noAccelerators, tc.os, OperatingSystemDistributionCOS, -1, nil)
buildCapacity, err := tb.BuildCapacity(tc.physicalCpu, tc.physicalMemory, noAccelerators, tc.os, OperatingSystemDistributionCOS, -1, 0, nil)
assert.NoError(t, err)
expectedCapacity, err := makeResourceList2(tc.physicalCpu, tc.expectedCapacityMemory, 0, 110)
assert.NoError(t, err)
Expand Down

0 comments on commit 255594e

Please sign in to comment.