Skip to content

Commit

Permalink
Add variable for disk size and increase default (awslabs#782)
Browse files Browse the repository at this point in the history
**Which issue is resolved by this Pull Request:**

DLC GPU images take up too much disk space, which leads to disk pressure
errors.

Second revision:

Adding a variable to configure disk size.

**Description of your changes:**
This PR increases the disk size from 50 GB (default) to 75 GB for GPU
nodes.

**Testing:**
- [ ] Unit tests pass
- [x] e2e tests pass
- Details about new tests (If this PR adds a new feature)
- Details about any manual tests performed
- Manually tested with
public.ecr.aws/kubeflow-on-aws/notebook-servers/jupyter-pytorch:2.0.0-gpu-py310-cu118-ubuntu20.04-ec2-v1.0,
no disk pressure errors.

By submitting this pull request, I confirm that my contribution is made
under the terms of the Apache 2.0 license.
  • Loading branch information
ananth102 authored and rakuto committed Sep 27, 2023
1 parent 355157a commit f229d49
Show file tree
Hide file tree
Showing 8 changed files with 56 additions and 0 deletions.
2 changes: 2 additions & 0 deletions deployments/cognito-rds-s3/terraform/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ locals {
min_size = 2
desired_size = 3
max_size = 5
disk_size = var.node_disk_size_cpu
subnet_ids = module.vpc.private_subnets
}

Expand All @@ -40,6 +41,7 @@ locals {
desired_size = 1
max_size = 1
ami_type = "AL2_x86_64_GPU"
disk_size = var.node_disk_size_gpu
subnet_ids = module.vpc.private_subnets
} : null

Expand Down
12 changes: 12 additions & 0 deletions deployments/cognito-rds-s3/terraform/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,18 @@ variable "node_instance_type_gpu_a10g" {
default = null
}

# Disk size for CPU worker nodes; consumed as `disk_size` by the CPU node
# group definition in main.tf. Declared as a number (not a string) so that
# non-numeric input is rejected at the variable boundary. Numeric strings
# such as "50" are still accepted via Terraform's automatic conversion.
variable "node_disk_size_cpu" {
  description = "The disk size (in GiB) of a cpu node."
  type        = number
  default     = 50
}

# Disk size for GPU worker nodes; consumed as `disk_size` by the GPU node
# group definition in main.tf. The default is larger than the CPU default
# because GPU container images are large enough to trigger disk-pressure
# errors otherwise. Declared as a number (not a string) so that non-numeric
# input is rejected at the variable boundary; numeric strings still convert.
variable "node_disk_size_gpu" {
  description = "The disk size (in GiB) of a gpu node."
  type        = number
  default     = 75
}

variable "kf_helm_repo_path" {
description = "Full path to the location of the helm repo for KF"
type = string
Expand Down
2 changes: 2 additions & 0 deletions deployments/cognito/terraform/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ locals {
min_size = 5
desired_size = 5
max_size = 10
disk_size = var.node_disk_size_cpu
subnet_ids = module.vpc.private_subnets
}

Expand All @@ -40,6 +41,7 @@ locals {
desired_size = 3
max_size = 5
ami_type = "AL2_x86_64_GPU"
disk_size = var.node_disk_size_gpu
subnet_ids = module.vpc.private_subnets
} : null

Expand Down
12 changes: 12 additions & 0 deletions deployments/cognito/terraform/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,18 @@ variable "node_instance_type_gpu" {
default = null
}

# Disk size for CPU worker nodes; consumed as `disk_size` by the CPU node
# group definition in main.tf. Declared as a number (not a string) so that
# non-numeric input is rejected at the variable boundary. Numeric strings
# such as "50" are still accepted via Terraform's automatic conversion.
variable "node_disk_size_cpu" {
  description = "The disk size (in GiB) of a cpu node."
  type        = number
  default     = 50
}

# Disk size for GPU worker nodes; consumed as `disk_size` by the GPU node
# group definition in main.tf. The default is larger than the CPU default
# because GPU container images are large enough to trigger disk-pressure
# errors otherwise. Declared as a number (not a string) so that non-numeric
# input is rejected at the variable boundary; numeric strings still convert.
variable "node_disk_size_gpu" {
  description = "The disk size (in GiB) of a gpu node."
  type        = number
  default     = 75
}

variable "cognito_user_pool_name" {
description = "Cognito User Pool name"
type = string
Expand Down
2 changes: 2 additions & 0 deletions deployments/rds-s3/terraform/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ locals {
min_size = 5
desired_size = 5
max_size = 10
disk_size = var.node_disk_size_cpu
subnet_ids = module.vpc.private_subnets
}

Expand All @@ -40,6 +41,7 @@ locals {
desired_size = 3
max_size = 5
ami_type = "AL2_x86_64_GPU"
disk_size = var.node_disk_size_gpu
subnet_ids = module.vpc.private_subnets
} : null

Expand Down
12 changes: 12 additions & 0 deletions deployments/rds-s3/terraform/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,18 @@ variable "node_instance_type_gpu" {
default = null
}

# Disk size for CPU worker nodes; consumed as `disk_size` by the CPU node
# group definition in main.tf. Declared as a number (not a string) so that
# non-numeric input is rejected at the variable boundary. Numeric strings
# such as "50" are still accepted via Terraform's automatic conversion.
variable "node_disk_size_cpu" {
  description = "The disk size (in GiB) of a cpu node."
  type        = number
  default     = 50
}

# Disk size for GPU worker nodes; consumed as `disk_size` by the GPU node
# group definition in main.tf. The default is larger than the CPU default
# because GPU container images are large enough to trigger disk-pressure
# errors otherwise. Declared as a number (not a string) so that non-numeric
# input is rejected at the variable boundary; numeric strings still convert.
variable "node_disk_size_gpu" {
  description = "The disk size (in GiB) of a gpu node."
  type        = number
  default     = 75
}

variable "use_rds" {
type = bool
default = true
Expand Down
2 changes: 2 additions & 0 deletions deployments/vanilla/terraform/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ locals {
min_size = 5
desired_size = 5
max_size = 10
disk_size = var.node_disk_size_cpu
subnet_ids = module.vpc.private_subnets
}

Expand All @@ -40,6 +41,7 @@ locals {
desired_size = 3
max_size = 5
ami_type = "AL2_x86_64_GPU"
disk_size = var.node_disk_size_gpu
subnet_ids = module.vpc.private_subnets
} : null

Expand Down
12 changes: 12 additions & 0 deletions deployments/vanilla/terraform/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,18 @@ variable "node_instance_type_gpu" {
default = null
}

# Disk size for CPU worker nodes; consumed as `disk_size` by the CPU node
# group definition in main.tf. Declared as a number (not a string) so that
# non-numeric input is rejected at the variable boundary. Numeric strings
# such as "50" are still accepted via Terraform's automatic conversion.
variable "node_disk_size_cpu" {
  description = "The disk size (in GiB) of a cpu node."
  type        = number
  default     = 50
}

# Disk size for GPU worker nodes; consumed as `disk_size` by the GPU node
# group definition in main.tf. The default is larger than the CPU default
# because GPU container images are large enough to trigger disk-pressure
# errors otherwise. Declared as a number (not a string) so that non-numeric
# input is rejected at the variable boundary; numeric strings still convert.
variable "node_disk_size_gpu" {
  description = "The disk size (in GiB) of a gpu node."
  type        = number
  default     = 75
}

variable "enable_aws_telemetry" {
description = "Enable AWS telemetry component"
type = bool
Expand Down

0 comments on commit f229d49

Please sign in to comment.