Skip to content

Commit

Permalink
Merge pull request #4318 from DataDog/gce-ephemeral-storage-local-ssd
Browse files Browse the repository at this point in the history
GCE ephemeral storage on local SSDs
  • Loading branch information
k8s-ci-robot authored Oct 18, 2021
2 parents d218dca + d3dd856 commit 47bc0f2
Show file tree
Hide file tree
Showing 4 changed files with 276 additions and 49 deletions.
73 changes: 72 additions & 1 deletion cluster-autoscaler/cloudprovider/gce/reserved.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,14 @@ import (
"strings"

"k8s.io/apimachinery/pkg/api/resource"
klog "k8s.io/klog/v2"
"k8s.io/klog/v2"
)

// There should be no imports as it is used standalone in e2e tests

const (
// KiB - KibiByte size (2^10)
KiB = 1024
// MiB - MebiByte size (2^20)
MiB = 1024 * 1024
// GiB - GibiByte size (2^30)
Expand Down Expand Up @@ -197,6 +199,75 @@ func parsePercentageToRatio(percentString string) (float64, error) {
return percentVal / 100, nil
}

// ephemeralStorageOnLocalSSDFilesystemOverheadInKiBByOSAndDiskCount maps an OS
// distribution, then a local SSD count, to the filesystem overhead in KiB.
//
// The values were measured by creating 1-node nodepools in a GKE cluster with
// ephemeral storage on N local SSDs, and computing for each node
//
//	N * 375GiB - .status.capacity["ephemeral-storage"]
//
// Only the disk counts 1 through 8, 16 and 24 have an entry; looking up any
// other count yields the map zero value (0).
var ephemeralStorageOnLocalSSDFilesystemOverheadInKiBByOSAndDiskCount = map[OperatingSystemDistribution]map[int64]int64{
OperatingSystemDistributionCOS: {
1: 7289472,
2: 13725224,
3: 20031312,
4: 26332924,
5: 32634536,
6: 38946604,
7: 45254008,
8: 51556096,
16: 52837800,
24: 78686620,
},
OperatingSystemDistributionUbuntu: {
1: 7219840,
2: 13651496,
3: 19953488,
4: 26255100,
5: 32556712,
6: 38860588,
7: 45163896,
8: 51465984,
16: 52747688,
24: 78601704,
},
}

// EphemeralStorageOnLocalSSDFilesystemOverheadInBytes returns an estimate, in
// bytes, of the gap between the total physical capacity of the local SSDs and
// the capacity of the ephemeral storage filesystem built on top of them.
//
// The estimate is looked up in a table of experimental values measured for all
// disk counts possible in GKE. Custom Kubernetes on GCE may allow intermediate
// counts (attaching a measured count but not using all of it for ephemeral
// storage). In that case the overhead difference between GKE and custom node
// images may exceed the difference between two adjacent disk counts, so
// interpolating would not be meaningful. Instead the count is rounded up to
// the next measured one, which is the safer approximation: it is better to
// reserve too much and not scale up than to reserve too little and fail to
// schedule.
//
// The containerd flavours of COS and Ubuntu reuse the measurements of their
// base distribution. For any other distribution without measurements an error
// is logged and 0 is returned.
func EphemeralStorageOnLocalSSDFilesystemOverheadInBytes(diskCount int64, osDistribution OperatingSystemDistribution) int64 {
	// Counts up to 8 are used as-is; larger ones are rounded up to the next
	// measured count.
	lookupCount := diskCount
	switch {
	case diskCount > 16:
		lookupCount = 24 // max attachable
	case diskCount > 8:
		lookupCount = 16
	}

	// the container runtime doesn't affect filesystem overhead
	lookupOS := osDistribution
	switch osDistribution {
	case OperatingSystemDistributionCOSContainerd:
		lookupOS = OperatingSystemDistributionCOS
	case OperatingSystemDistributionUbuntuContainerd:
		lookupOS = OperatingSystemDistributionUbuntu
	}

	overheadInKiBByCount, supported := ephemeralStorageOnLocalSSDFilesystemOverheadInKiBByOSAndDiskCount[lookupOS]
	if !supported {
		klog.Errorf("Ephemeral storage backed by local SSDs is not supported for image family %v", osDistribution)
		return 0
	}
	return overheadInKiBByCount[lookupCount] * KiB
}

// CalculateOSReservedEphemeralStorage estimates how much ephemeral storage OS will reserve and eviction threshold
func CalculateOSReservedEphemeralStorage(diskSize int64, osDistribution OperatingSystemDistribution) int64 {
switch osDistribution {
Expand Down
47 changes: 47 additions & 0 deletions cluster-autoscaler/cloudprovider/gce/reserved_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,50 @@ func TestCalculateKernelReservedLinux(t *testing.T) {
})
}
}

// TestEphemeralStorageOnLocalSSDFilesystemOverheadInBytes verifies the overhead
// lookup for measured disk counts, for the containerd variants of each OS, and
// for a disk count that has to be mapped to the next measured one.
func TestEphemeralStorageOnLocalSSDFilesystemOverheadInBytes(t *testing.T) {
	for _, tc := range []struct {
		scenario       string
		diskCount      int64
		osDistribution OperatingSystemDistribution
		expected       int64
	}{
		{
			scenario:       "measured disk count and OS (cos)",
			diskCount:      1,
			osDistribution: OperatingSystemDistributionCOS,
			expected:       7289472 * KiB,
		},
		{
			scenario:       "measured disk count but OS with different container runtime (cos_containerd)",
			diskCount:      1,
			osDistribution: OperatingSystemDistributionCOSContainerd,
			expected:       7289472 * KiB, // same as COS
		},
		{
			scenario:       "measured disk count and OS (ubuntu)",
			diskCount:      1,
			osDistribution: OperatingSystemDistributionUbuntu,
			expected:       7219840 * KiB,
		},
		{
			scenario:       "measured disk count but OS with different container runtime (ubuntu_containerd)",
			diskCount:      1,
			osDistribution: OperatingSystemDistributionUbuntuContainerd,
			expected:       7219840 * KiB, // same as Ubuntu
		},
		{
			scenario:       "mapped disk count",
			diskCount:      10,
			osDistribution: OperatingSystemDistributionCOS,
			expected:       52837800 * KiB, // value measured for 16 disks
		},
	} {
		t.Run(tc.scenario, func(t *testing.T) {
			got := EphemeralStorageOnLocalSSDFilesystemOverheadInBytes(tc.diskCount, tc.osDistribution)
			assert.Equal(t, tc.expected, got)
		})
	}
}
77 changes: 64 additions & 13 deletions cluster-autoscaler/cloudprovider/gce/templates.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,18 @@ import (
apiv1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/klog/v2"

"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
"k8s.io/autoscaler/cluster-autoscaler/utils/units"
klog "k8s.io/klog/v2"
)

// GceTemplateBuilder builds templates for GCE nodes.
type GceTemplateBuilder struct{}

// LocalSSDDiskSizeInGiB is the size of a single local SSD, in GiB.
const LocalSSDDiskSizeInGiB = 375

// TODO: This should be imported from sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/common/constants.go
// This key is applicable to both GCE and GKE
const gceCSITopologyKeyZone = "topology.gke.io/zone"
Expand All @@ -54,7 +57,7 @@ func (t *GceTemplateBuilder) getAcceleratorCount(accelerators []*gce.Accelerator
}

// BuildCapacity builds a list of resource capacities given list of hardware.
func (t *GceTemplateBuilder) BuildCapacity(cpu int64, mem int64, accelerators []*gce.AcceleratorConfig, os OperatingSystem, osDistribution OperatingSystemDistribution, ephemeralStorage int64, pods *int64) (apiv1.ResourceList, error) {
func (t *GceTemplateBuilder) BuildCapacity(cpu int64, mem int64, accelerators []*gce.AcceleratorConfig, os OperatingSystem, osDistribution OperatingSystemDistribution, ephemeralStorage int64, ephemeralStorageLocalSSDCount int64, pods *int64) (apiv1.ResourceList, error) {
capacity := apiv1.ResourceList{}
if pods == nil {
capacity[apiv1.ResourcePods] = *resource.NewQuantity(110, resource.DecimalSI)
Expand All @@ -71,7 +74,12 @@ func (t *GceTemplateBuilder) BuildCapacity(cpu int64, mem int64, accelerators []
}

if ephemeralStorage > 0 {
storageTotal := ephemeralStorage - CalculateOSReservedEphemeralStorage(ephemeralStorage, osDistribution)
var storageTotal int64
if ephemeralStorageLocalSSDCount > 0 {
storageTotal = ephemeralStorage - EphemeralStorageOnLocalSSDFilesystemOverheadInBytes(ephemeralStorageLocalSSDCount, osDistribution)
} else {
storageTotal = ephemeralStorage - CalculateOSReservedEphemeralStorage(ephemeralStorage, osDistribution)
}
capacity[apiv1.ResourceEphemeralStorage] = *resource.NewQuantity(int64(math.Max(float64(storageTotal), 0)), resource.DecimalSI)
}

Expand Down Expand Up @@ -166,15 +174,17 @@ func (t *GceTemplateBuilder) BuildNodeFromTemplate(mig Mig, template *gce.Instan
}

var ephemeralStorage int64 = -1
if !isEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue) {
ephemeralStorage, err = getEphemeralStorageFromInstanceTemplateProperties(template.Properties)
if err != nil {
klog.Errorf("could not fetch ephemeral storage from instance template. %s", err)
return nil, err
}
ssdCount := ephemeralStorageLocalSSDCount(kubeEnvValue)
if ssdCount > 0 {
ephemeralStorage, err = getLocalSSDEphemeralStorageFromInstanceTemplateProperties(template.Properties, ssdCount)
} else if !isBootDiskEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue) {
ephemeralStorage, err = getBootDiskEphemeralStorageFromInstanceTemplateProperties(template.Properties)
}
if err != nil {
return nil, fmt.Errorf("could not fetch ephemeral storage from instance template: %v", err)
}

capacity, err := t.BuildCapacity(cpu, mem, template.Properties.GuestAccelerators, os, osDistribution, ephemeralStorage, pods)
capacity, err := t.BuildCapacity(cpu, mem, template.Properties.GuestAccelerators, os, osDistribution, ephemeralStorage, ssdCount, pods)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -228,18 +238,59 @@ func (t *GceTemplateBuilder) BuildNodeFromTemplate(mig Mig, template *gce.Instan
return &node, nil
}

// isEphemeralStorageWithInstanceTemplateDisabled will allow bypassing Disk Size of Boot Disk from being
// ephemeralStorageLocalSSDCount reads the ephemeral_storage_local_ssd_count
// autoscaler variable from kube-env and returns it as an int64. It returns 0
// when the variable is not set. When the variable cannot be extracted or is
// not a valid integer, a warning is logged and 0 is returned.
func ephemeralStorageLocalSSDCount(kubeEnvValue string) int64 {
	raw, present, extractErr := extractAutoscalerVarFromKubeEnv(kubeEnvValue, "ephemeral_storage_local_ssd_count")
	switch {
	case extractErr != nil:
		klog.Warningf("cannot extract ephemeral_storage_local_ssd_count from kube-env, default to 0: %v", extractErr)
		return 0
	case !present:
		return 0
	}

	count, parseErr := strconv.Atoi(raw)
	if parseErr != nil {
		klog.Warningf("cannot parse ephemeral_storage_local_ssd_count value, default to 0: %v", parseErr)
		return 0
	}
	return int64(count)
}

// getLocalSSDEphemeralStorageFromInstanceTemplateProperties returns the total
// raw size of ssdCount local SSDs (ssdCount * LocalSSDDiskSizeInGiB GiB).
//
// It counts the disks of the instance template that are of type "SCRATCH" with
// an initialize-params disk type of "local-ssd", and returns an error when the
// template has no disk list or declares fewer such disks than ssdCount.
func getLocalSSDEphemeralStorageFromInstanceTemplateProperties(instanceProperties *gce.InstanceProperties, ssdCount int64) (ephemeralStorage int64, err error) {
	if instanceProperties.Disks == nil {
		return 0, fmt.Errorf("instance properties disks is nil")
	}

	var attachedLocalSSDs int64
	for _, d := range instanceProperties.Disks {
		// Skip entries that carry no initialize params to inspect.
		if d == nil || d.InitializeParams == nil {
			continue
		}
		// Only scratch disks backed by local SSD count.
		if d.Type != "SCRATCH" || d.InitializeParams.DiskType != "local-ssd" {
			continue
		}
		attachedLocalSSDs++
	}

	if ssdCount > attachedLocalSSDs {
		return 0, fmt.Errorf("actual local SSD count is lower than ephemeral_storage_local_ssd_count")
	}
	return ssdCount * LocalSSDDiskSizeInGiB * units.GiB, nil
}

// isBootDiskEphemeralStorageWithInstanceTemplateDisabled reports whether the
// BLOCK_EPH_STORAGE_BOOT_DISK autoscaler variable in kube-env is set to "true".
// Setting it allows bypassing the boot disk size from the instance template as
// the source of ephemeral storage, in case another type of storage is used as
// ephemeral storage. It returns false when the variable is absent, has any
// other value, or cannot be extracted.
func isEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue string) bool {
func isBootDiskEphemeralStorageWithInstanceTemplateDisabled(kubeEnvValue string) bool {
v, found, err := extractAutoscalerVarFromKubeEnv(kubeEnvValue, "BLOCK_EPH_STORAGE_BOOT_DISK")
if err == nil && found && v == "true" {
return true
}
return false
}

func getEphemeralStorageFromInstanceTemplateProperties(instanceProperties *gce.InstanceProperties) (ephemeralStorage int64, err error) {
func getBootDiskEphemeralStorageFromInstanceTemplateProperties(instanceProperties *gce.InstanceProperties) (ephemeralStorage int64, err error) {
if instanceProperties.Disks == nil {
return 0, fmt.Errorf("unable to get ephemeral storage because instance properties disks is nil")
}
Expand Down
Loading

0 comments on commit 47bc0f2

Please sign in to comment.