Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GCE ephemeral storage on local SSDs #4318

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 72 additions & 1 deletion cluster-autoscaler/cloudprovider/gce/reserved.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,14 @@ import (
"strings"

"k8s.io/apimachinery/pkg/api/resource"
klog "k8s.io/klog/v2"
"k8s.io/klog/v2"
)

// There should be no imports as it is used standalone in e2e tests

const (
// KiB - KibiByte size (2^10)
KiB = 1024
// MiB - MebiByte size (2^20)
MiB = 1024 * 1024
// GiB - GibiByte size (2^30)
Expand Down Expand Up @@ -197,6 +199,75 @@ func parsePercentageToRatio(percentString string) (float64, error) {
return percentVal / 100, nil
}

// ephemeralStorageOnLocalSSDFilesystemOverheadInKiBByOSAndDiskCount maps an
// OS distribution, then a local SSD count N, to the filesystem overhead in KiB
// of ephemeral storage backed by N local SSDs.
//
// The values were measured by creating 1-node nodepools in a GKE cluster with
// ephemeral storage on N local SSDs, measuring for each node
// N * 375GiB - .status.capacity["ephemeral-storage"]
var ephemeralStorageOnLocalSSDFilesystemOverheadInKiBByOSAndDiskCount = map[OperatingSystemDistribution]map[int64]int64{
// Overheads for COS, keyed by local SSD count.
OperatingSystemDistributionCOS: {
1: 7289472,
2: 13725224,
3: 20031312,
4: 26332924,
5: 32634536,
6: 38946604,
7: 45254008,
8: 51556096,
// Above 8 disks, only counts 16 and 24 have an entry.
16: 52837800,
24: 78686620,
},
// Overheads for Ubuntu, keyed by local SSD count.
OperatingSystemDistributionUbuntu: {
1: 7219840,
2: 13651496,
3: 19953488,
4: 26255100,
5: 32556712,
6: 38860588,
7: 45163896,
8: 51465984,
// Above 8 disks, only counts 16 and 24 have an entry.
16: 52747688,
24: 78601704,
},
}

// EphemeralStorageOnLocalSSDFilesystemOverheadInBytes returns an estimate, in
// bytes, of how much smaller the ephemeral storage filesystem is than the
// total physical capacity of the local SSDs backing it.
//
// The estimate is looked up in a table of values measured experimentally for
// every disk count possible in GKE. Custom Kubernetes on GCE may use an
// intermediate count (attaching a measured count of disks but not using all of
// them for ephemeral storage). Since the overhead difference between GKE and
// custom node images may exceed the difference between two disk counts,
// interpolation would not be meaningful. The disk count is instead rounded up
// to the next measured count, which is the safer approximation: it is better
// to reserve too much and not scale up than to reserve too little and fail to
// schedule.
//
// The containerd variants of COS and Ubuntu share the measurements of their
// base distribution. For any other distribution without measurements an error
// is logged and 0 is returned. A disk count with no table entry (for example
// 0) also yields 0.
func EphemeralStorageOnLocalSSDFilesystemOverheadInBytes(diskCount int64, osDistribution OperatingSystemDistribution) int64 {
	// Round the requested count up to a count that was measured.
	lookupCount := int64(24) // max attachable
	switch {
	case diskCount <= 8:
		lookupCount = diskCount
	case diskCount <= 16:
		lookupCount = 16
	}

	// the container runtime doesn't affect filesystem overhead
	lookupOS := osDistribution
	switch osDistribution {
	case OperatingSystemDistributionCOSContainerd:
		lookupOS = OperatingSystemDistributionCOS
	case OperatingSystemDistributionUbuntuContainerd:
		lookupOS = OperatingSystemDistributionUbuntu
	}

	overheadInKiBByCount, supported := ephemeralStorageOnLocalSSDFilesystemOverheadInKiBByOSAndDiskCount[lookupOS]
	if !supported {
		klog.Errorf("Ephemeral storage backed by local SSDs is not supported for image family %v", osDistribution)
		return 0
	}
	return overheadInKiBByCount[lookupCount] * KiB
}

// CalculateOSReservedEphemeralStorage estimates the ephemeral storage reserved by the OS and the eviction threshold
func CalculateOSReservedEphemeralStorage(diskSize int64, osDistribution OperatingSystemDistribution) int64 {
switch osDistribution {
Expand Down
47 changes: 47 additions & 0 deletions cluster-autoscaler/cloudprovider/gce/reserved_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,50 @@ func TestCalculateKernelReservedLinux(t *testing.T) {
})
}
}

// TestEphemeralStorageOnLocalSSDFilesystemOverheadInBytes verifies the overhead
// lookup for measured disk counts on each OS distribution, for the containerd
// variants that reuse their base distribution's values, and for a disk count
// that has to be mapped to the next measured count.
func TestEphemeralStorageOnLocalSSDFilesystemOverheadInBytes(t *testing.T) {
	for _, tc := range []struct {
		name   string
		disks  int64
		distro OperatingSystemDistribution
		want   int64
	}{
		{
			name:   "measured disk count and OS (cos)",
			disks:  1,
			distro: OperatingSystemDistributionCOS,
			want:   7289472 * KiB,
		},
		{
			name:   "measured disk count but OS with different container runtime (cos_containerd)",
			disks:  1,
			distro: OperatingSystemDistributionCOSContainerd,
			want:   7289472 * KiB, // same as COS
		},
		{
			name:   "measured disk count and OS (ubuntu)",
			disks:  1,
			distro: OperatingSystemDistributionUbuntu,
			want:   7219840 * KiB,
		},
		{
			name:   "measured disk count but OS with different container runtime (ubuntu_containerd)",
			disks:  1,
			distro: OperatingSystemDistributionUbuntuContainerd,
			want:   7219840 * KiB, // same as Ubuntu
		},
		{
			name:   "mapped disk count",
			disks:  10,
			distro: OperatingSystemDistributionCOS,
			want:   52837800 * KiB, // value measured for 16 disks
		},
	} {
		t.Run(tc.name, func(t *testing.T) {
			got := EphemeralStorageOnLocalSSDFilesystemOverheadInBytes(tc.disks, tc.distro)
			assert.Equal(t, tc.want, got)
		})
	}
}
77 changes: 64 additions & 13 deletions cluster-autoscaler/cloudprovider/gce/templates.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,18 @@ import (
apiv1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/klog/v2"

"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
"k8s.io/autoscaler/cluster-autoscaler/utils/units"
klog "k8s.io/klog/v2"
)

// GceTemplateBuilder builds templates for GCE nodes.
type GceTemplateBuilder struct{}

// LocalSSDDiskSizeInGiB is the size, in GiB, of a single local SSD. It is
// multiplied by the local SSD count to compute the capacity of ephemeral
// storage backed by local SSDs.
const LocalSSDDiskSizeInGiB = 375

// TODO: This should be imported from sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/common/constants.go
// This key is applicable to both GCE and GKE
const gceCSITopologyKeyZone = "topology.gke.io/zone"
Expand All @@ -54,7 +57,7 @@ func (t *GceTemplateBuilder) getAcceleratorCount(accelerators []*gce.Accelerator
}

// BuildCapacity builds a list of resource capacities given list of hardware.
func (t *GceTemplateBuilder) BuildCapacity(cpu int64, mem int64, accelerators []*gce.AcceleratorConfig, os OperatingSystem, osDistribution OperatingSystemDistribution, ephemeralStorage int64, pods *int64) (apiv1.ResourceList, error) {
func (t *GceTemplateBuilder) BuildCapacity(cpu int64, mem int64, accelerators []*gce.AcceleratorConfig, os OperatingSystem, osDistribution OperatingSystemDistribution, ephemeralStorage int64, ephemeralStorageLocalSSDCount int64, pods *int64) (apiv1.ResourceList, error) {
capacity := apiv1.ResourceList{}
if pods == nil {
capacity[apiv1.ResourcePods] = *resource.NewQuantity(110, resource.DecimalSI)
Expand All @@ -71,7 +74,12 @@ func (t *GceTemplateBuilder) BuildCapacity(cpu int64, mem int64, accelerators []
}

if ephemeralStorage > 0 {
storageTotal := ephemeralStorage - CalculateOSReservedEphemeralStorage(ephemeralStorage, osDistribution)
var storageTotal int64
if ephemeralStorageLocalSSDCount > 0 {
storageTotal = ephemeralStorage - EphemeralStorageOnLocalSSDFilesystemOverheadInBytes(ephemeralStorageLocalSSDCount, osDistribution)
} else {
storageTotal = ephemeralStorage - CalculateOSReservedEphemeralStorage(ephemeralStorage, osDistribution)
}
capacity[apiv1.ResourceEphemeralStorage] = *resource.NewQuantity(int64(math.Max(float64(storageTotal), 0)), resource.DecimalSI)
}

Expand Down Expand Up @@ -166,15 +174,17 @@ func (t *GceTemplateBuilder) BuildNodeFromTemplate(mig Mig, template *gce.Instan
}

var ephemeralStorage int64 = -1
if !isEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue) {
ephemeralStorage, err = getEphemeralStorageFromInstanceTemplateProperties(template.Properties)
if err != nil {
klog.Errorf("could not fetch ephemeral storage from instance template. %s", err)
return nil, err
}
ssdCount := ephemeralStorageLocalSSDCount(kubeEnvValue)
if ssdCount > 0 {
ephemeralStorage, err = getLocalSSDEphemeralStorageFromInstanceTemplateProperties(template.Properties, ssdCount)
} else if !isBootDiskEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue) {
ephemeralStorage, err = getBootDiskEphemeralStorageFromInstanceTemplateProperties(template.Properties)
}
if err != nil {
return nil, fmt.Errorf("could not fetch ephemeral storage from instance template: %v", err)
}

capacity, err := t.BuildCapacity(cpu, mem, template.Properties.GuestAccelerators, os, osDistribution, ephemeralStorage, pods)
capacity, err := t.BuildCapacity(cpu, mem, template.Properties.GuestAccelerators, os, osDistribution, ephemeralStorage, ssdCount, pods)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -228,18 +238,59 @@ func (t *GceTemplateBuilder) BuildNodeFromTemplate(mig Mig, template *gce.Instan
return &node, nil
}

// isEphemeralStorageWithInstanceTemplateDisabled will allow bypassing Disk Size of Boot Disk from being
// ephemeralStorageLocalSSDCount reads the ephemeral_storage_local_ssd_count
// autoscaler variable from kube-env and returns it as an int64. It returns 0
// when the variable is absent, and logs a warning and returns 0 when the
// variable cannot be extracted or its value is not an integer.
func ephemeralStorageLocalSSDCount(kubeEnvValue string) int64 {
	raw, present, err := extractAutoscalerVarFromKubeEnv(kubeEnvValue, "ephemeral_storage_local_ssd_count")
	switch {
	case err != nil:
		klog.Warningf("cannot extract ephemeral_storage_local_ssd_count from kube-env, default to 0: %v", err)
		return 0
	case !present:
		// The variable is simply not set; this is not worth a warning.
		return 0
	}

	count, convErr := strconv.Atoi(raw)
	if convErr != nil {
		klog.Warningf("cannot parse ephemeral_storage_local_ssd_count value, default to 0: %v", convErr)
		return 0
	}
	return int64(count)
}

func getLocalSSDEphemeralStorageFromInstanceTemplateProperties(instanceProperties *gce.InstanceProperties, ssdCount int64) (ephemeralStorage int64, err error) {
if instanceProperties.Disks == nil {
return 0, fmt.Errorf("instance properties disks is nil")
}

var count int64
for _, disk := range instanceProperties.Disks {
if disk != nil && disk.InitializeParams != nil {
if disk.Type == "SCRATCH" && disk.InitializeParams.DiskType == "local-ssd" {
count++

This comment was marked as resolved.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That is correct.

}
}
}

if count < ssdCount {
return 0, fmt.Errorf("actual local SSD count is lower than ephemeral_storage_local_ssd_count")
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Apologies for a dumb question, but why do we need both kube-env var and instance property? Wouldn't it be simpler to just rely on init params? Is there any use case for having an instance with more local SSDs than configured in kube-env?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No worries. We use a subset of local SSDs as a persistent volume and the rest for ephemeral storage.


return ssdCount * LocalSSDDiskSizeInGiB * units.GiB, nil
}

// isBootDiskEphemeralStorageWithInstanceTemplateDisabled will allow bypassing Disk Size of Boot Disk from being
// picked up from Instance Template and used as Ephemeral Storage, in case other type of storage are used
// as ephemeral storage
func isEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue string) bool {
func isBootDiskEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue string) bool {
	// Only a successfully extracted value of exactly "true" counts as
	// disabled; an extraction error, a missing variable, or any other value
	// all report false.
	value, present, err := extractAutoscalerVarFromKubeEnv(kubeEnvValue, "BLOCK_EPH_STORAGE_BOOT_DISK")
	return err == nil && present && value == "true"
}

func getEphemeralStorageFromInstanceTemplateProperties(instanceProperties *gce.InstanceProperties) (ephemeralStorage int64, err error) {
func getBootDiskEphemeralStorageFromInstanceTemplateProperties(instanceProperties *gce.InstanceProperties) (ephemeralStorage int64, err error) {
if instanceProperties.Disks == nil {
return 0, fmt.Errorf("unable to get ephemeral storage because instance properties disks is nil")
}
Expand Down
Loading