From 46c7673f90b75f6ec08e524774f6656dd7a737f2 Mon Sep 17 00:00:00 2001 From: vsoch Date: Sat, 29 Jul 2023 17:38:01 -0600 Subject: [PATCH 1/2] cleanup efa installer archive before install Currently, the UserData section that runs during cloud init happens before any root volumes are expanded with growpart. Although the best solution would be to ensure the filesystem resize happens before these scripts are run, a quick means to fix the current issue is simply to clean up the efa installer tar.gz, which is very large. I have tested this with hpc7g for a size 2 and size 8 cluster (previously both not working) and can confirm the devices are functioning afterward. Signed-off-by: vsoch --- pkg/nodebootstrap/assets/scripts/efa.al2.sh | 1 + pkg/nodebootstrap/assets/scripts/efa.managed.boothook | 1 + 2 files changed, 2 insertions(+) diff --git a/pkg/nodebootstrap/assets/scripts/efa.al2.sh b/pkg/nodebootstrap/assets/scripts/efa.al2.sh index 8179c983af..f99ae6b962 100644 --- a/pkg/nodebootstrap/assets/scripts/efa.al2.sh +++ b/pkg/nodebootstrap/assets/scripts/efa.al2.sh @@ -7,6 +7,7 @@ set -o nounset yum install -y wget wget -q --timeout=20 https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-latest.tar.gz -O /tmp/aws-efa-installer.tar.gz tar -xf /tmp/aws-efa-installer.tar.gz -C /tmp +rm -rf /tmp/aws-efa-installer.tar.gz cd /tmp/aws-efa-installer ./efa_installer.sh -y -g /opt/amazon/efa/bin/fi_info -p efa diff --git a/pkg/nodebootstrap/assets/scripts/efa.managed.boothook b/pkg/nodebootstrap/assets/scripts/efa.managed.boothook index 32e191cd24..d2863d42c6 100644 --- a/pkg/nodebootstrap/assets/scripts/efa.managed.boothook +++ b/pkg/nodebootstrap/assets/scripts/efa.managed.boothook @@ -2,6 +2,7 @@ cloud-init-per once yum_wget yum install -y wget cloud-init-per once wget_efa wget -q --timeout=20 https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-latest.tar.gz -O /tmp/aws-efa-installer-latest.tar.gz cloud-init-per once tar_efa tar -xf 
/tmp/aws-efa-installer-latest.tar.gz -C /tmp +cloud-init-per once rm_efa_gz rm -rf /tmp/aws-efa-installer-latest.tar.gz pushd /tmp/aws-efa-installer cloud-init-per once install_efa ./efa_installer.sh -y -g pop /tmp/aws-efa-installer From 20790322992f7b9d287284b9b4a412e85cc47022 Mon Sep 17 00:00:00 2001 From: vsoch Date: Wed, 21 Aug 2024 02:09:43 -0600 Subject: [PATCH 2/2] efa-installer: remove archive in 2023 files Problem: the node consistently runs out of disk space when adding efa, resulting in an unusable cluster with scattered nodes where the installer failed. Solution: the installer archive itself is huge, and we can simply remove it and avoid this error. Signed-off-by: vsoch --- pkg/nodebootstrap/assets/scripts/efa.al2023.sh | 1 + pkg/nodebootstrap/assets/scripts/efa.managed.al2023.boothook | 1 + pkg/nodebootstrap/managed_al2_test.go | 2 ++ 3 files changed, 4 insertions(+) diff --git a/pkg/nodebootstrap/assets/scripts/efa.al2023.sh b/pkg/nodebootstrap/assets/scripts/efa.al2023.sh index 3aef0ce36f..b73f630813 100644 --- a/pkg/nodebootstrap/assets/scripts/efa.al2023.sh +++ b/pkg/nodebootstrap/assets/scripts/efa.al2023.sh @@ -7,6 +7,7 @@ set -o nounset dnf install -y wget wget -q --timeout=20 https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-latest.tar.gz -O /tmp/aws-efa-installer.tar.gz tar -xf /tmp/aws-efa-installer.tar.gz -C /tmp +rm -rf /tmp/aws-efa-installer.tar.gz cd /tmp/aws-efa-installer ./efa_installer.sh -y -g /opt/amazon/efa/bin/fi_info -p efa diff --git a/pkg/nodebootstrap/assets/scripts/efa.managed.al2023.boothook b/pkg/nodebootstrap/assets/scripts/efa.managed.al2023.boothook index 5d2a081688..d8440a4520 100644 --- a/pkg/nodebootstrap/assets/scripts/efa.managed.al2023.boothook +++ b/pkg/nodebootstrap/assets/scripts/efa.managed.al2023.boothook @@ -2,6 +2,7 @@ cloud-init-per once dnf_wget dnf install -y wget cloud-init-per once wget_efa wget -q --timeout=20 
https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-latest.tar.gz -O /tmp/aws-efa-installer-latest.tar.gz cloud-init-per once tar_efa tar -xf /tmp/aws-efa-installer-latest.tar.gz -C /tmp +cloud-init-per once rm_efa_gz rm -rf /tmp/aws-efa-installer-latest.tar.gz pushd /tmp/aws-efa-installer cloud-init-per once install_efa ./efa_installer.sh -y -g pop /tmp/aws-efa-installer diff --git a/pkg/nodebootstrap/managed_al2_test.go b/pkg/nodebootstrap/managed_al2_test.go index d463eea253..6b9c08dcd4 100644 --- a/pkg/nodebootstrap/managed_al2_test.go +++ b/pkg/nodebootstrap/managed_al2_test.go @@ -111,6 +111,7 @@ cloud-init-per once yum_wget yum install -y wget cloud-init-per once wget_efa wget -q --timeout=20 https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-latest.tar.gz -O /tmp/aws-efa-installer-latest.tar.gz cloud-init-per once tar_efa tar -xf /tmp/aws-efa-installer-latest.tar.gz -C /tmp +cloud-init-per once rm_efa_gz rm -rf /tmp/aws-efa-installer-latest.tar.gz pushd /tmp/aws-efa-installer cloud-init-per once install_efa ./efa_installer.sh -y -g pop /tmp/aws-efa-installer @@ -143,6 +144,7 @@ cloud-init-per once yum_wget yum install -y wget cloud-init-per once wget_efa wget -q --timeout=20 https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-latest.tar.gz -O /tmp/aws-efa-installer-latest.tar.gz cloud-init-per once tar_efa tar -xf /tmp/aws-efa-installer-latest.tar.gz -C /tmp +cloud-init-per once rm_efa_gz rm -rf /tmp/aws-efa-installer-latest.tar.gz pushd /tmp/aws-efa-installer cloud-init-per once install_efa ./efa_installer.sh -y -g pop /tmp/aws-efa-installer