GCE ephemeral storage on local SSDs
If EPH_STORAGE_LOCAL_SSD_COUNT=N is specified in kube-env (in AUTOSCALER_ENV_VARS), physical ephemeral storage is N*375GiB instead of the boot disk size, and only the variable part of the OS reservation (filesystem overhead, as opposed to the fixed OS partition) is subtracted to compute capacity.

The existing BLOCK_EPH_STORAGE_BOOT_DISK flag is ignored when EPH_STORAGE_LOCAL_SSD_COUNT>0.
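For illustration, a minimal sketch of the resulting capacity math for a COS node with EPH_STORAGE_LOCAL_SSD_COUNT=2 (the kube-env value used in the new tests). It mirrors the formulas in reserved.go with ephemeralStorageOnLocalSSDs=true, assuming 375GiB per local SSD as the tests do; it is not the exact kubelet accounting.

// Sketch only: COS node, ephemeral storage on 2 local SSDs (EPH_STORAGE_LOCAL_SSD_COUNT=2).
package main

import (
	"fmt"
	"math"
)

const (
	MiB = 1024 * 1024
	GiB = 1024 * MiB
)

func main() {
	physical := float64(2 * 375 * GiB) // N*375GiB instead of the boot disk size

	// Only the variable part of the OS reservation is subtracted;
	// the fixed ~4.148GiB os partition term is skipped on local SSDs.
	reserved := int64(math.Ceil(0.015635*physical)) + // filesystem overhead
		int64(math.Min(100*MiB, math.Ceil(0.001*physical))) // over-provisioning buffer

	fmt.Printf("capacity = %d bytes (~738GiB)\n", int64(physical)-reserved)
}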
adrienjt committed Sep 8, 2021
1 parent ecf0241 commit 3455568
Showing 3 changed files with 167 additions and 86 deletions.
22 changes: 14 additions & 8 deletions cluster-autoscaler/cloudprovider/gce/reserved.go
@@ -23,7 +23,7 @@ import (
"strings"

"k8s.io/apimachinery/pkg/api/resource"
klog "k8s.io/klog/v2"
"k8s.io/klog/v2"
)

// There should be no imports as it is used standalone in e2e tests
@@ -198,19 +198,25 @@ func parsePercentageToRatio(percentString string) (float64, error) {
}

// CalculateOSReservedEphemeralStorage estimates how much ephemeral storage OS will reserve and eviction threshold
func CalculateOSReservedEphemeralStorage(diskSize int64, osDistribution OperatingSystemDistribution) int64 {
func CalculateOSReservedEphemeralStorage(diskSize int64, osDistribution OperatingSystemDistribution, ephemeralStorageOnLocalSSDs bool) (storage int64) {
switch osDistribution {
case OperatingSystemDistributionCOS, OperatingSystemDistributionCOSContainerd:
storage := int64(math.Ceil(0.015635*float64(diskSize))) + int64(math.Ceil(4.148*GiB)) // os partition estimation
storage += int64(math.Min(100*MiB, math.Ceil(0.001*float64(diskSize)))) // over-provisioning buffer
if !ephemeralStorageOnLocalSSDs {
storage += int64(math.Ceil(4.148 * GiB)) // os partition estimation
}
storage += int64(math.Ceil(0.015635 * float64(diskSize))) // filesystem overhead
storage += int64(math.Min(100*MiB, math.Ceil(0.001*float64(diskSize)))) // over-provisioning buffer
return storage
case OperatingSystemDistributionUbuntu, OperatingSystemDistributionUbuntuContainerd:
storage := int64(math.Ceil(0.03083*float64(diskSize))) + int64(math.Ceil(0.171*GiB)) // os partition estimation
storage += int64(math.Min(100*MiB, math.Ceil(0.001*float64(diskSize)))) // over-provisioning buffer
if !ephemeralStorageOnLocalSSDs {
storage += int64(math.Ceil(0.171 * GiB)) // os partition estimation
}
storage += int64(math.Ceil(0.03083 * float64(diskSize))) // filesystem overhead
storage += int64(math.Min(100*MiB, math.Ceil(0.001*float64(diskSize)))) // over-provisioning buffer
return storage
case OperatingSystemDistributionWindowsLTSC, OperatingSystemDistributionWindowsSAC:
storage := int64(math.Ceil(0.1133 * GiB)) // os partition estimation
storage += int64(math.Ceil(0.010 * GiB)) // over-provisioning buffer
storage += int64(math.Ceil(0.1133 * GiB)) // os partition estimation
storage += int64(math.Ceil(0.010 * GiB)) // filesystem overhead and over-provisioning buffer
return storage
default:
klog.Errorf("CalculateReservedAndEvictionEphemeralStorage called for unknown os distribution %v", osDistribution)
72 changes: 59 additions & 13 deletions cluster-autoscaler/cloudprovider/gce/templates.go
@@ -30,10 +30,11 @@ import (
apiv1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/klog/v2"

"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
"k8s.io/autoscaler/cluster-autoscaler/utils/units"
klog "k8s.io/klog/v2"
)

// GceTemplateBuilder builds templates for GCE nodes.
@@ -54,7 +55,7 @@ func (t *GceTemplateBuilder) getAcceleratorCount(accelerators []*gce.Accelerator
}

// BuildCapacity builds a list of resource capacities given list of hardware.
func (t *GceTemplateBuilder) BuildCapacity(cpu int64, mem int64, accelerators []*gce.AcceleratorConfig, os OperatingSystem, osDistribution OperatingSystemDistribution, ephemeralStorage int64, pods *int64) (apiv1.ResourceList, error) {
func (t *GceTemplateBuilder) BuildCapacity(cpu int64, mem int64, accelerators []*gce.AcceleratorConfig, os OperatingSystem, osDistribution OperatingSystemDistribution, ephemeralStorage int64, ephemeralStorageOnLocalSSDs bool, pods *int64) (apiv1.ResourceList, error) {
capacity := apiv1.ResourceList{}
if pods == nil {
capacity[apiv1.ResourcePods] = *resource.NewQuantity(110, resource.DecimalSI)
@@ -71,7 +72,7 @@ func (t *GceTemplateBuilder) BuildCapacity(cpu int64, mem int64, accelerators []
}

if ephemeralStorage > 0 {
storageTotal := ephemeralStorage - CalculateOSReservedEphemeralStorage(ephemeralStorage, osDistribution)
storageTotal := ephemeralStorage - CalculateOSReservedEphemeralStorage(ephemeralStorage, osDistribution, ephemeralStorageOnLocalSSDs)
capacity[apiv1.ResourceEphemeralStorage] = *resource.NewQuantity(int64(math.Max(float64(storageTotal), 0)), resource.DecimalSI)
}

@@ -166,15 +167,17 @@ func (t *GceTemplateBuilder) BuildNodeFromTemplate(mig Mig, template *gce.Instan
}

var ephemeralStorage int64 = -1
if !isEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue) {
ephemeralStorage, err = getEphemeralStorageFromInstanceTemplateProperties(template.Properties)
if err != nil {
klog.Errorf("could not fetch ephemeral storage from instance template. %s", err)
return nil, err
}
ssdCount := ephemeralStorageLocalSSDCount(kubeEnvValue)
if ssdCount > 0 {
ephemeralStorage, err = getLocalSSDEphemeralStorageFromInstanceTemplateProperties(template.Properties, ssdCount)
} else if !isBootDiskEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue) {
ephemeralStorage, err = getBootDiskEphemeralStorageFromInstanceTemplateProperties(template.Properties)
}
if err != nil {
return nil, fmt.Errorf("could not fetch ephemeral storage from instance template: %v", err)
}

capacity, err := t.BuildCapacity(cpu, mem, template.Properties.GuestAccelerators, os, osDistribution, ephemeralStorage, pods)
capacity, err := t.BuildCapacity(cpu, mem, template.Properties.GuestAccelerators, os, osDistribution, ephemeralStorage, ssdCount > 0, pods)
if err != nil {
return nil, err
}
@@ -228,18 +231,61 @@ func (t *GceTemplateBuilder) BuildNodeFromTemplate(mig Mig, template *gce.Instan
return &node, nil
}

// isEphemeralStorageWithInstanceTemplateDisabled will allow bypassing Disk Size of Boot Disk from being
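// ephemeralStorageLocalSSDCount returns the EPH_STORAGE_LOCAL_SSD_COUNT value from the kube-env AUTOSCALER_ENV_VARS, defaulting to 0 if it is absent or cannot be parsed.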
func ephemeralStorageLocalSSDCount(kubeEnvValue string) int64 {
v, found, err := extractAutoscalerVarFromKubeEnv(kubeEnvValue, "EPH_STORAGE_LOCAL_SSD_COUNT")
if err != nil {
klog.Warningf("cannot extract EPH_STORAGE_LOCAL_SSD_COUNT from kube-env, default to 0: %v", err)
return 0
}

if !found {
return 0
}

n, err := strconv.Atoi(v)
if err != nil {
klog.Warningf("cannot parse EPH_STORAGE_LOCAL_SSD_COUNT value, default to 0: %v", err)
return 0
}

return int64(n)
}

func getLocalSSDEphemeralStorageFromInstanceTemplateProperties(instanceProperties *gce.InstanceProperties, ssdCount int64) (ephemeralStorage int64, err error) {
if instanceProperties.Disks == nil {
return 0, fmt.Errorf("instance properties disks is nil")
}

var count int64
for _, disk := range instanceProperties.Disks {
if disk != nil && disk.InitializeParams != nil {
if disk.Type == "SCRATCH" && disk.InitializeParams.DiskType == "local-ssd" {
// we could just multiply ssdCount by a constant 375GiB, because all GCE local SSDs are 375GiB
// but this loop verifies that the instance template has at least the requested number of local SSDs
// and is forward-compatible with a potentially different size
count++
if count == ssdCount {
return ssdCount * disk.InitializeParams.DiskSizeGb * units.GiB, nil
}
}
}
}

return 0, fmt.Errorf("actual local SSD count is lower than EPH_STORAGE_LOCAL_SSD_COUNT")
}

// isBootDiskEphemeralStorageWithInstanceTemplateDisabled will allow bypassing Disk Size of Boot Disk from being
// picked up from Instance Template and used as Ephemeral Storage, in case other type of storage are used
// as ephemeral storage
func isEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue string) bool {
func isBootDiskEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue string) bool {
v, found, err := extractAutoscalerVarFromKubeEnv(kubeEnvValue, "BLOCK_EPH_STORAGE_BOOT_DISK")
if err == nil && found && v == "true" {
return true
}
return false
}

func getEphemeralStorageFromInstanceTemplateProperties(instanceProperties *gce.InstanceProperties) (ephemeralStorage int64, err error) {
func getBootDiskEphemeralStorageFromInstanceTemplateProperties(instanceProperties *gce.InstanceProperties) (ephemeralStorage int64, err error) {
if instanceProperties.Disks == nil {
return 0, fmt.Errorf("unable to get ephemeral storage because instance properties disks is nil")
}
159 changes: 94 additions & 65 deletions cluster-autoscaler/cloudprovider/gce/templates_test.go
@@ -34,23 +34,35 @@ import (
quota "k8s.io/apiserver/pkg/quota/v1"
)

// TestBuildNodeFromTemplateSetsResources tests that capacity and allocatable
// are loaded into the node template status, a few error scenarios, and physical
// ephemeral storage (an intermediate result). It doesn't test that capacity
// and allocatable are computed correctly: the test itself calls
// GceTemplateBuilder.BuildCapacity, GceTemplateBuilder.CalculateAllocatable,
// and ParseEvictionHardOrGetDefault to compute expected values; those
// computations are tested separately.
func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
var thirtyPodsPerNode int64 = 30
var physicalCpu int64 = 8
var physicalMemory int64 = 200 * units.MiB
var bootDiskSizeGiB int64 = 300
var localSSDSizeGiB int64 = 375
type testCase struct {
scenario string
kubeEnv string
accelerators []*gce.AcceleratorConfig
mig Mig
physicalCpu int64
physicalMemory int64
physicalEphemeralStorage int64
kubeReserved bool
reservedCpu string
reservedMemory string
reservedEphemeralStorage string
isEphemeralStorageBlocked bool
expectedErr bool
pods *int64
scenario string
// test inputs
kubeEnv string
accelerators []*gce.AcceleratorConfig
attachedLocalSSDCount int64
pods *int64
// dependent inputs, should match kubeEnv, used to compute expected capacity and allocatable, out of test scope
ephemeralStorageOnLocalSSD bool
kubeReserved bool
reservedCpu string
reservedMemory string
reservedEphemeralStorage string
// test outputs
expectedErr bool
physicalEphemeralStorageGiB int64
}
testCases := []testCase{
{
@@ -64,44 +76,39 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
{AcceleratorType: "nvidia-tesla-k80", AcceleratorCount: 3},
{AcceleratorType: "nvidia-tesla-p100", AcceleratorCount: 8},
},
physicalCpu: 8,
physicalMemory: 200 * units.MiB,
physicalEphemeralStorage: 300,
kubeReserved: true,
reservedCpu: "1000m",
reservedMemory: fmt.Sprintf("%v", 1*units.MiB),
reservedEphemeralStorage: "30Gi",
expectedErr: false,
kubeReserved: true,
reservedCpu: "1000m",
reservedMemory: fmt.Sprintf("%v", 1*units.MiB),
reservedEphemeralStorage: "30Gi",
expectedErr: false,
physicalEphemeralStorageGiB: bootDiskSizeGiB,
},
{
scenario: "no kube-reserved in kube-env",
kubeEnv: "ENABLE_NODE_PROBLEM_DETECTOR: 'daemonset'\n" +
"NODE_LABELS: a=b,c=d,cloud.google.com/gke-nodepool=pool-3,cloud.google.com/gke-preemptible=true\n" +
"DNS_SERVER_IP: '10.0.0.10'\n" +
"NODE_TAINTS: 'dedicated=ml:NoSchedule,test=dev:PreferNoSchedule,a=b:c'\n",
physicalCpu: 8,
physicalMemory: 200 * units.MiB,
kubeReserved: false,
expectedErr: false,
kubeReserved: false,
expectedErr: false,
physicalEphemeralStorageGiB: bootDiskSizeGiB,
}, {
scenario: "no kube-env at all",
kubeEnv: "",
physicalCpu: 8,
physicalMemory: 200 * units.MiB,
kubeReserved: false,
expectedErr: false,
scenario: "no kube-env at all",
kubeEnv: "",
kubeReserved: false,
expectedErr: false,
physicalEphemeralStorageGiB: bootDiskSizeGiB,
}, {
scenario: "totally messed up kube-env",
kubeEnv: "This kube-env is totally messed up",
expectedErr: true,
}, {
scenario: "max pods per node specified",
kubeEnv: "",
physicalCpu: 8,
physicalMemory: 200 * units.MiB,
pods: &thirtyPodsPerNode,
kubeReserved: false,
expectedErr: false,
scenario: "max pods per node specified",
kubeEnv: "",
pods: &thirtyPodsPerNode,
kubeReserved: false,
expectedErr: false,
physicalEphemeralStorageGiB: bootDiskSizeGiB,
},
{
scenario: "BLOCK_EPH_STORAGE_BOOT_DISK in kube-env",
@@ -110,15 +117,12 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
"DNS_SERVER_IP: '10.0.0.10'\n" +
"AUTOSCALER_ENV_VARS: os_distribution=cos;os=linux;kube_reserved=cpu=0,memory=0,ephemeral-storage=0;BLOCK_EPH_STORAGE_BOOT_DISK=true\n" +
"NODE_TAINTS: 'dedicated=ml:NoSchedule,test=dev:PreferNoSchedule,a=b:c'\n",
physicalCpu: 8,
physicalMemory: 200 * units.MiB,
physicalEphemeralStorage: 300,
reservedCpu: "0m",
reservedMemory: fmt.Sprintf("%v", 0*units.MiB),
reservedEphemeralStorage: "0Gi",
kubeReserved: true,
isEphemeralStorageBlocked: true,
expectedErr: false,
reservedCpu: "0m",
reservedMemory: fmt.Sprintf("%v", 0*units.MiB),
reservedEphemeralStorage: "0Gi",
kubeReserved: true,
expectedErr: false,
physicalEphemeralStorageGiB: 0,
},
{
scenario: "BLOCK_EPH_STORAGE_BOOT_DISK is false in kube-env",
@@ -127,15 +131,35 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
"DNS_SERVER_IP: '10.0.0.10'\n" +
"AUTOSCALER_ENV_VARS: os_distribution=cos;os=linux;kube_reserved=cpu=0,memory=0,ephemeral-storage=0;BLOCK_EPH_STORAGE_BOOT_DISK=false\n" +
"NODE_TAINTS: 'dedicated=ml:NoSchedule,test=dev:PreferNoSchedule,a=b:c'\n",
physicalCpu: 8,
physicalMemory: 200 * units.MiB,
physicalEphemeralStorage: 300,
reservedCpu: "0m",
reservedMemory: fmt.Sprintf("%v", 0*units.MiB),
reservedEphemeralStorage: "0Gi",
kubeReserved: true,
isEphemeralStorageBlocked: false,
expectedErr: false,
reservedCpu: "0m",
reservedMemory: fmt.Sprintf("%v", 0*units.MiB),
reservedEphemeralStorage: "0Gi",
kubeReserved: true,
expectedErr: false,
physicalEphemeralStorageGiB: bootDiskSizeGiB,
},
{
scenario: "more local SSDs requested for ephemeral storage than attached",
kubeEnv: "AUTOSCALER_ENV_VARS: os_distribution=cos;os=linux;EPH_STORAGE_LOCAL_SSD_COUNT=1\n",
ephemeralStorageOnLocalSSD: true,
attachedLocalSSDCount: 0,
expectedErr: true,
},
{
scenario: "all attached local SSDs requested for ephemeral storage",
kubeEnv: "AUTOSCALER_ENV_VARS: os_distribution=cos;os=linux;EPH_STORAGE_LOCAL_SSD_COUNT=2\n",
ephemeralStorageOnLocalSSD: true,
attachedLocalSSDCount: 2,
expectedErr: false,
physicalEphemeralStorageGiB: localSSDSizeGiB * 2,
},
{
scenario: "more local SSDs attached than requested for ephemeral storage",
kubeEnv: "AUTOSCALER_ENV_VARS: os_distribution=cos;os=linux;EPH_STORAGE_LOCAL_SSD_COUNT=2\n",
ephemeralStorageOnLocalSSD: true,
attachedLocalSSDCount: 4,
expectedErr: false,
physicalEphemeralStorageGiB: localSSDSizeGiB * 2,
},
}
for _, tc := range testCases {
@@ -158,16 +182,25 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
{
Boot: true,
InitializeParams: &gce.AttachedDiskInitializeParams{
DiskSizeGb: tc.physicalEphemeralStorage,
DiskSizeGb: bootDiskSizeGiB,
},
},
},
},
}
for i := int64(0); i < tc.attachedLocalSSDCount; i++ {
template.Properties.Disks = append(template.Properties.Disks, &gce.AttachedDisk{
Type: "SCRATCH",
InitializeParams: &gce.AttachedDiskInitializeParams{
DiskType: "local-ssd",
DiskSizeGb: localSSDSizeGiB,
},
})
}
if tc.kubeEnv != "" {
template.Properties.Metadata.Items = []*gce.MetadataItems{{Key: "kube-env", Value: &tc.kubeEnv}}
}
node, err := tb.BuildNodeFromTemplate(mig, template, tc.physicalCpu, tc.physicalMemory, tc.pods)
node, err := tb.BuildNodeFromTemplate(mig, template, physicalCpu, physicalMemory, tc.pods)
if tc.expectedErr {
assert.Error(t, err)
} else {
@@ -176,11 +209,7 @@ func TestBuildNodeFromTemplateSetsResources(t *testing.T) {
assert.NotNil(t, node.Status)
assert.NotNil(t, node.Status.Capacity)
assert.NotNil(t, node.Status.Allocatable)
physicalEphemeralStorage := tc.physicalEphemeralStorage
if tc.isEphemeralStorageBlocked {
physicalEphemeralStorage = 0
}
capacity, err := tb.BuildCapacity(tc.physicalCpu, tc.physicalMemory, tc.accelerators, OperatingSystemLinux, OperatingSystemDistributionCOS, physicalEphemeralStorage*units.GiB, tc.pods)
capacity, err := tb.BuildCapacity(physicalCpu, physicalMemory, tc.accelerators, OperatingSystemLinux, OperatingSystemDistributionCOS, tc.physicalEphemeralStorageGiB*units.GiB, tc.ephemeralStorageOnLocalSSD, tc.pods)
assert.NoError(t, err)
assertEqualResourceLists(t, "Capacity", capacity, node.Status.Capacity)
if !tc.kubeReserved {
@@ -474,7 +503,7 @@ func TestBuildCapacityMemory(t *testing.T) {
t.Run(fmt.Sprintf("%v", idx), func(t *testing.T) {
tb := GceTemplateBuilder{}
noAccelerators := make([]*gce.AcceleratorConfig, 0)
buildCapacity, err := tb.BuildCapacity(tc.physicalCpu, tc.physicalMemory, noAccelerators, tc.os, OperatingSystemDistributionCOS, -1, nil)
buildCapacity, err := tb.BuildCapacity(tc.physicalCpu, tc.physicalMemory, noAccelerators, tc.os, OperatingSystemDistributionCOS, -1, false, nil)
assert.NoError(t, err)
expectedCapacity, err := makeResourceList2(tc.physicalCpu, tc.expectedCapacityMemory, 0, 110)
assert.NoError(t, err)