# Create CentOS Stream 9 AMI with NVIDIA Driver (workflow run #13)
# Workflow file captured from the GitHub Actions run page.

# Manually triggered workflow whose single job launches a temporary EC2
# instance, installs the NVIDIA driver and DCGM on it, and snapshots it
# into an AMI.
name: Create CentOS Stream 9 AMI with NVIDIA Driver
on:
  workflow_dispatch:
# None of the visible steps use the GITHUB_TOKEN (AWS access comes from
# repository secrets), so the token is stripped of all permissions.
permissions: {}
jobs:
  create-ami:
    runs-on: ubuntu-latest
    # Values exposed to dependent jobs; both are read from step outputs.
    outputs:
      ami_id: ${{ steps.create_ami.outputs.ami_id }}
      instance_id: ${{ steps.launch_instance.outputs.instance_id }}
    steps:
- name: Set up AWS credentials
  # v2 of this action targets the retired Node 16 action runtime; v4 accepts
  # the same three inputs used here.
  uses: aws-actions/configure-aws-credentials@v4
  with:
    aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
    aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
    aws-region: ${{ secrets.AWS_REGION }}
- name: Generate SSH key and create AWS key pair
  id: ssh_key
  shell: bash
  run: |
    # Create the key directory in case the runner image does not provide it.
    mkdir -p ~/.ssh
    chmod 700 ~/.ssh
    # Generate a passphrase-less SSH key pair for this run only
    ssh-keygen -t rsa -b 4096 -f ~/.ssh/ec2_key -N ""
    chmod 600 ~/.ssh/ec2_key
    # Derive the key name from the run id and attempt: a wall-clock timestamp
    # with one-second granularity can collide between concurrent runs.
    KEY_NAME="temp-key-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"
    # Import public key to AWS
    aws ec2 import-key-pair \
      --key-name "$KEY_NAME" \
      --public-key-material "fileb://${HOME}/.ssh/ec2_key.pub"
    # Publish the name right after the import so the cleanup step can delete
    # the key pair even if a later command in this step fails.
    echo "key_name=${KEY_NAME}" >> "$GITHUB_OUTPUT"
    # Accept a host key the first time a host is seen (no interactive
    # fingerprint prompt) but refuse a host whose key later changes.
    echo "StrictHostKeyChecking accept-new" >> ~/.ssh/config
    chmod 600 ~/.ssh/config
- name: Launch EC2 instance
  id: launch_instance
  shell: bash
  # Pass expression values through the environment instead of splicing them
  # into the script text.
  env:
    KEY_NAME: ${{ steps.ssh_key.outputs.key_name }}
    SECURITY_GROUP_ID: ${{ secrets.AWS_SECURITY_GROUP_ID }}
  run: |
    # Request a single g4dn.xlarge spot instance from the hard-coded base AMI.
    # SECURITY_GROUP_ID is left unquoted on purpose so word splitting behaves
    # exactly as the previous inline substitution did.
    # shellcheck disable=SC2086
    INSTANCE_ID=$(aws ec2 run-instances \
      --image-id 'ami-01529018e3919dace' \
      --instance-type g4dn.xlarge \
      --key-name "$KEY_NAME" \
      --security-group-ids $SECURITY_GROUP_ID \
      --associate-public-ip-address \
      --instance-market-options 'MarketType=spot' \
      --query 'Instances[0].InstanceId' \
      --output text)
    # Publish the id before anything else can fail; the cleanup step needs
    # it to terminate the instance.
    echo "instance_id=${INSTANCE_ID}" >> "$GITHUB_OUTPUT"
    # Wait for instance to be running
    aws ec2 wait instance-running --instance-ids "$INSTANCE_ID"
    # Get public IP
    PUBLIC_IP=$(aws ec2 describe-instances \
      --instance-ids "$INSTANCE_ID" \
      --query 'Reservations[0].Instances[0].PublicIpAddress' \
      --output text)
    # With --output text a missing address is rendered as the string "None".
    if [ -z "$PUBLIC_IP" ] || [ "$PUBLIC_IP" = "None" ]; then
      echo "Instance ${INSTANCE_ID} has no public IP address" >&2
      exit 1
    fi
    echo "public_ip=${PUBLIC_IP}" >> "$GITHUB_OUTPUT"
- name: Wait for SSH to be available
  id: ssh_check
  shell: bash
  env:
    PUBLIC_IP: ${{ steps.launch_instance.outputs.public_ip }}
  run: |
    # Poll the instance every 10 seconds until an SSH login succeeds, giving
    # up after TIMEOUT seconds. On success the step output status=ready is set.
    START_TIME=$(date +%s)
    TIMEOUT=$((10 * 60)) # 10 minutes timeout
    while true; do
      if [ $(($(date +%s) - START_TIME)) -ge $TIMEOUT ]; then
        echo "Timeout waiting for SSH connection"
        exit 1
      fi
      # ConnectTimeout bounds a single attempt so a silently dropped
      # connection cannot stall the loop far past the overall timeout;
      # BatchMode makes ssh fail instead of waiting for interactive input.
      if ssh -i ~/.ssh/ec2_key \
        -o BatchMode=yes \
        -o ConnectTimeout=10 \
        "ec2-user@${PUBLIC_IP}" "echo 'SSH connection successful'" 2>/dev/null; then
        echo "status=ready" >> "$GITHUB_OUTPUT"
        break
      else
        sleep 10
      fi
    done
- name: Install NVIDIA Driver
  if: steps.ssh_check.outputs.status == 'ready'
  # Use the same explicit shell as the other scripted steps.
  shell: bash
  env:
    PUBLIC_IP: ${{ steps.launch_instance.outputs.public_ip }}
  run: |
    # Everything inside the single-quoted string runs on the instance; the
    # commands are chained with && so the first failure aborts the install.
    # Keepalives stop the session being dropped during long quiet phases.
    #
    # NOTE(review): `dnf update` may install a newer kernel than the running
    # one, while kernel-devel/headers are pinned to `uname -r`. Confirm the
    # driver module is also built for the kernel the AMI will boot into.
    ssh -i ~/.ssh/ec2_key \
      -o ServerAliveInterval=30 \
      -o ServerAliveCountMax=10 \
      "ec2-user@${PUBLIC_IP}" '
      sudo dnf update -y && \
      sudo dnf install -y dnf-plugins-core && \
      sudo dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
      sudo dnf install -y https://dl.fedoraproject.org/pub/epel/epel-next-release-latest-9.noarch.rpm && \
      export KERNEL_VERSION=$(uname -r) && \
      sudo dnf install -y kernel-devel-$KERNEL_VERSION kernel-headers-$KERNEL_VERSION dkms && \
      sudo dnf config-manager --add-repo=https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo && \
      sudo dnf install -y nvidia-driver nvidia-driver-cuda && \
      sudo dnf clean expire-cache && \
      sudo dnf install -y datacenter-gpu-manager && \
      sudo systemctl enable --now nvidia-dcgm && \
      { dcgmi discovery -l || echo "dcgmi discovery not ready yet; the GPU check step polls for it"; }
    '
    # The final dcgmi call is informational only: the service may not be
    # accepting connections immediately after start, and the following
    # "Check GPU and DCGM Installation" step is the actual readiness gate.
- name: Check GPU and DCGM Installation
  id: gpu_check
  shell: bash
  env:
    PUBLIC_IP: ${{ steps.launch_instance.outputs.public_ip }}
  run: |
    # Poll the instance every 10 seconds until both nvidia-smi and dcgmi
    # report successfully. Sets status=ready on success, or status=timeout
    # (and fails the step) after TIMEOUT seconds.
    START_TIME=$(date +%s)
    TIMEOUT=$((30 * 60)) # 30 minutes timeout
    while true; do
      if [ $(($(date +%s) - START_TIME)) -ge $TIMEOUT ]; then
        echo "status=timeout" >> "$GITHUB_OUTPUT"
        exit 1
      fi
      # `shell: bash` runs the script with `-e`, and a plain assignment takes
      # the exit status of its command substitution. Without `|| true` the
      # first unsuccessful probe would abort the step instead of retrying.
      GPU_CHECK=$(ssh -i ~/.ssh/ec2_key \
        -o BatchMode=yes \
        -o ConnectTimeout=10 \
        "ec2-user@${PUBLIC_IP}" "nvidia-smi && dcgmi discovery -l" 2>&1) || true
      # Here-strings instead of `echo | grep -q`: with pipefail, grep exiting
      # early can make the echo side of the pipe fail the whole pipeline.
      if grep -q "NVIDIA-SMI" <<<"$GPU_CHECK" && grep -q "GPU ID" <<<"$GPU_CHECK"; then
        echo "status=ready" >> "$GITHUB_OUTPUT"
        echo "GPU and DCGM installation verified:"
        echo "$GPU_CHECK"
        break
      else
        echo "Waiting for GPU and DCGM setup to complete..."
        sleep 10
      fi
    done
- name: Clean up SSH keys before AMI creation
  if: steps.gpu_check.outputs.status == 'ready'
  shell: bash
  env:
    PUBLIC_IP: ${{ steps.launch_instance.outputs.public_ip }}
  run: |
    # The session logs in as ec2-user, so the temporary public key lives in
    # that account's authorized_keys; "$HOME" is expanded on the instance and
    # therefore targets the login user. The centos and root paths are still
    # removed as before. `sync` flushes the deletions to disk before the
    # image is taken.
    ssh -i ~/.ssh/ec2_key "ec2-user@${PUBLIC_IP}" '
      sudo rm -f "$HOME/.ssh/authorized_keys" && \
      sudo rm -f /home/centos/.ssh/authorized_keys && \
      sudo rm -f /root/.ssh/authorized_keys && \
      sudo sync
    '
- name: Create AMI
  id: create_ami
  if: steps.gpu_check.outputs.status == 'ready'
  shell: bash
  env:
    INSTANCE_ID: ${{ steps.launch_instance.outputs.instance_id }}
  run: |
    # The AMI name keeps its original date-based prefix, with the run id and
    # attempt appended: a date-only name is rejected as a duplicate when the
    # workflow runs twice on the same day.
    AMI_NAME="CentOS-Stream9-NVIDIA-$(date +'%Y-%m-%d')-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"
    # NOTE(review): --no-reboot images a running instance; confirm that is
    # acceptable for a freshly installed kernel module.
    AMI_ID=$(aws ec2 create-image \
      --instance-id "$INSTANCE_ID" \
      --name "$AMI_NAME" \
      --description "CentOS Stream 9 with NVIDIA Driver and DCGM" \
      --no-reboot \
      --query 'ImageId' \
      --output text)
    echo "Requested AMI ${AMI_ID} (${AMI_NAME})"
    # Wait for AMI to be available. A single waiter call gives up after a
    # fixed number of polls, so retry it a few times before failing; the
    # ami_id output is only published once the image is available.
    for ATTEMPT in 1 2 3; do
      if aws ec2 wait image-available --image-ids "$AMI_ID"; then
        echo "ami_id=${AMI_ID}" >> "$GITHUB_OUTPUT"
        exit 0
      fi
      echo "AMI ${AMI_ID} not available yet (wait attempt ${ATTEMPT} of 3)"
    done
    echo "AMI ${AMI_ID} did not become available" >&2
    exit 1
- name: Cleanup AWS Resources
  if: always()
  shell: bash
  env:
    INSTANCE_ID: ${{ steps.launch_instance.outputs.instance_id }}
    KEY_NAME: ${{ steps.ssh_key.outputs.key_name }}
  run: |
    # Every cleanup action is attempted even if an earlier one fails; the
    # step still ends non-zero when any of them failed.
    FAILED=0

    # Prefer the id published by the launch step. If it is missing, fall back
    # to any live instance that was started with this run's key pair.
    INSTANCE_IDS="$INSTANCE_ID"
    if [ -z "$INSTANCE_IDS" ] && [ -n "$KEY_NAME" ]; then
      INSTANCE_IDS=$(aws ec2 describe-instances \
        --filters "Name=key-name,Values=${KEY_NAME}" \
                  "Name=instance-state-name,Values=pending,running,stopping,stopped" \
        --query 'Reservations[].Instances[].InstanceId' \
        --output text) || INSTANCE_IDS=""
      # Text output renders a null result as the string "None".
      if [ "$INSTANCE_IDS" = "None" ]; then
        INSTANCE_IDS=""
      fi
    fi

    # Terminate the instance if it exists
    if [ -n "$INSTANCE_IDS" ]; then
      # Unquoted on purpose: the lookup above may return several ids.
      # shellcheck disable=SC2086
      if aws ec2 terminate-instances --instance-ids $INSTANCE_IDS; then
        # shellcheck disable=SC2086
        aws ec2 wait instance-terminated --instance-ids $INSTANCE_IDS || FAILED=1
      else
        FAILED=1
      fi
    fi

    # Delete the temporary key pair if it exists
    if [ -n "$KEY_NAME" ]; then
      aws ec2 delete-key-pair --key-name "$KEY_NAME" || FAILED=1
    fi

    # Remove local SSH keys
    rm -f ~/.ssh/ec2_key*

    exit "$FAILED"
- name: Output AMI ID
  if: steps.create_ami.outputs.ami_id != ''
  # The image id is handed to the script through the environment.
  env:
    AMI_ID: ${{ steps.create_ami.outputs.ami_id }}
  run: |
    # Print the id of the image produced by the create_ami step.
    echo "New AMI created with ID: ${AMI_ID}"