Create CentOS Stream 9 AMI with NVIDIA Driver

Create CentOS Stream 9 AMI with NVIDIA Driver #13

Summary
Jobs
- create-ami
Run details
- Usage
- Workflow file

Workflow file for this run

.github/workflows/aws_ami_centos9_nvidia_driver.yml at 76c113b

	# Builds a GPU-ready CentOS Stream 9 AMI: boots a temporary EC2 instance,
	# installs the NVIDIA driver and DCGM over SSH, images it, then cleans up.
	name: Create CentOS Stream 9 AMI with NVIDIA Driver

	# Manual trigger only; no push or schedule events are configured.
	on:
	workflow_dispatch:

	jobs:
	create-ami:
	runs-on: ubuntu-latest
	# Job-level outputs, forwarded from the steps with ids create_ami and
	# launch_instance.
	outputs:
	ami_id: ${{ steps.create_ami.outputs.ami_id }}
	instance_id: ${{ steps.launch_instance.outputs.instance_id }}

	steps:
	# Make the AWS credentials and target region available to every later step.
	# Pinned to v4: the v2 release targets the retired Node 16 action runtime.
	# The three inputs used here are unchanged between the two major versions.
	- name: Set up AWS credentials
	  uses: aws-actions/configure-aws-credentials@v4
	  with:
	    aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
	    aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
	    aws-region: ${{ secrets.AWS_REGION }}

	# Create a throwaway RSA key for this run and register its public half in EC2.
	# Step output: key_name (the name of the imported EC2 key pair).
	- name: Generate SSH key and create AWS key pair
	  id: ssh_key
	  shell: bash
	  run: |
	    # 4096-bit RSA key without a passphrase, readable by the runner user only
	    ssh-keygen -t rsa -b 4096 -f ~/.ssh/ec2_key -N ""
	    chmod 600 ~/.ssh/ec2_key

	    # Timestamped name so key pairs from different runs do not clash
	    key_pair_name=$(date +'temp-key-%Y%m%d-%H%M%S')

	    # Register the public key with EC2 under that name
	    aws ec2 import-key-pair \
	      --key-name "${key_pair_name}" \
	      --public-key-material "fileb://~/.ssh/ec2_key.pub"

	    # Accept unknown host keys so the first connection never prompts
	    echo "StrictHostKeyChecking no" >> ~/.ssh/config
	    chmod 600 ~/.ssh/config

	    echo "key_name=${key_pair_name}" >> "$GITHUB_OUTPUT"

	# Start a spot g4dn.xlarge from the base image and publish how to reach it.
	# Step outputs: instance_id, public_ip.
	- name: Launch EC2 instance
	  id: launch_instance
	  shell: bash
	  run: |
	    INSTANCE_ID=$(aws ec2 run-instances \
	      --image-id 'ami-01529018e3919dace' \
	      --instance-type g4dn.xlarge \
	      --key-name ${{ steps.ssh_key.outputs.key_name }} \
	      --security-group-ids ${{ secrets.AWS_SECURITY_GROUP_ID }} \
	      --associate-public-ip-address \
	      --query 'Instances[0].InstanceId' \
	      --instance-market-options 'MarketType=spot' \
	      --output text)

	    # Publish the id before anything else can fail: the always-run cleanup
	    # step only terminates the instance when this output is non-empty.
	    echo "instance_id=${INSTANCE_ID}" >> "$GITHUB_OUTPUT"

	    # Wait for instance to be running
	    aws ec2 wait instance-running --instance-ids "$INSTANCE_ID"

	    # Get public IP
	    PUBLIC_IP=$(aws ec2 describe-instances \
	      --instance-ids "$INSTANCE_ID" \
	      --query 'Reservations[0].Instances[0].PublicIpAddress' \
	      --output text)

	    # With --output text a missing address is printed as "None"; fail here
	    # instead of letting the SSH wait loop run into its timeout.
	    if [ -z "$PUBLIC_IP" ] || [ "$PUBLIC_IP" = "None" ]; then
	      echo "Instance ${INSTANCE_ID} has no public IP address" >&2
	      exit 1
	    fi

	    echo "public_ip=${PUBLIC_IP}" >> "$GITHUB_OUTPUT"

	# Poll the instance until an SSH login succeeds, giving up after 10 minutes.
	# Step output: status=ready once a login has worked.
	- name: Wait for SSH to be available
	  id: ssh_check
	  shell: bash
	  run: |
	    START_TIME=$(date +%s)
	    TIMEOUT=$((10 * 60)) # 10 minutes timeout

	    while true; do
	      if [ $(($(date +%s) - START_TIME)) -ge $TIMEOUT ]; then
	        echo "Timeout waiting for SSH connection"
	        exit 1
	      fi

	      # ConnectTimeout bounds each probe so the deadline above is actually
	      # enforced; BatchMode makes ssh fail instead of asking for a password.
	      if ssh -i ~/.ssh/ec2_key -o BatchMode=yes -o ConnectTimeout=10 \
	        ec2-user@${{ steps.launch_instance.outputs.public_ip }} \
	        "echo 'SSH connection successful'" 2>/dev/null; then
	        echo "status=ready" >> "$GITHUB_OUTPUT"
	        break
	      else
	        sleep 10
	      fi
	    done

	# Provision the instance over SSH: update packages, add the EPEL and NVIDIA
	# CUDA repositories, install the driver and DCGM, then start and enable the
	# nvidia-dcgm service and list the GPUs DCGM can see.
	# The remote commands are chained with &&, so the first failure stops the rest.
	# NOTE(review): kernel-devel/kernel-headers are pinned to `uname -r`, the
	# kernel that is running while this step executes; since `dnf update` runs
	# first, confirm that this is also the kernel the finished AMI will boot.
	- name: Install NVIDIA Driver
	if: steps.ssh_check.outputs.status == 'ready'
	run: \|
	# Everything between the single quotes is executed on the EC2 instance.
	ssh -i ~/.ssh/ec2_key ec2-user@${{ steps.launch_instance.outputs.public_ip }} '
	sudo dnf update -y && \
	sudo dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
	sudo dnf install -y https://dl.fedoraproject.org/pub/epel/epel-next-release-latest-9.noarch.rpm && \
	export KERNEL_VERSION=$(uname -r) && \
	sudo dnf install -y kernel-devel-$KERNEL_VERSION kernel-headers-$KERNEL_VERSION dkms && \
	sudo dnf config-manager --add-repo=https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo && \
	sudo dnf install -y nvidia-driver nvidia-driver-cuda && \
	sudo dnf clean expire-cache && \
	sudo dnf install -y datacenter-gpu-manager && \
	sudo systemctl start nvidia-dcgm && \
	sudo systemctl enable nvidia-dcgm && \
	dcgmi discovery -l
	'

	# Poll the instance until both nvidia-smi and DCGM report a GPU, giving up
	# after 30 minutes. Step output: status=ready on success, status=timeout
	# when the deadline passes.
	- name: Check GPU and DCGM Installation
	  id: gpu_check
	  shell: bash
	  run: |
	    START_TIME=$(date +%s)
	    TIMEOUT=$((30 * 60)) # 30 minutes timeout

	    while true; do
	      if [ $(($(date +%s) - START_TIME)) -ge $TIMEOUT ]; then
	        echo "status=timeout" >> "$GITHUB_OUTPUT"
	        exit 1
	      fi

	      # The script runs under `bash -e`, where a failing command substitution
	      # in a plain assignment ends the script. `|| true` keeps a not-yet-ready
	      # probe from failing the step so the loop can retry.
	      GPU_CHECK=$(ssh -i ~/.ssh/ec2_key ec2-user@${{ steps.launch_instance.outputs.public_ip }} "nvidia-smi && dcgmi discovery -l" 2>&1) || true

	      # Here-strings instead of `echo | grep -q`: no pipeline, so pipefail
	      # cannot turn an early grep exit into a false negative.
	      if grep -q "NVIDIA-SMI" <<<"$GPU_CHECK" && grep -q "GPU ID" <<<"$GPU_CHECK"; then
	        echo "status=ready" >> "$GITHUB_OUTPUT"
	        echo "GPU and DCGM installation verified:"
	        echo "$GPU_CHECK"
	        break
	      else
	        echo "Waiting for GPU and DCGM setup to complete..."
	        sleep 10
	      fi
	    done

	# Remove the temporary public key from the instance so it is not baked into
	# the AMI. This is the last step that uses SSH; later steps use the AWS API.
	- name: Clean up SSH keys before AMI creation
	  if: steps.gpu_check.outputs.status == 'ready'
	  run: |
	    # $HOME is expanded on the instance (the command is single-quoted here),
	    # so it is the home of the account this step logs in as. The centos and
	    # root paths are kept from the original; rm -f ignores missing files.
	    ssh -i ~/.ssh/ec2_key ec2-user@${{ steps.launch_instance.outputs.public_ip }} '
	      sudo rm -f "$HOME/.ssh/authorized_keys" && \
	      sudo rm -f /home/centos/.ssh/authorized_keys && \
	      sudo rm -f /root/.ssh/authorized_keys
	    '

	# Image the provisioned instance and wait until the AMI is usable.
	# Step output: ami_id.
	- name: Create AMI
	  id: create_ami
	  if: steps.gpu_check.outputs.status == 'ready'
	  shell: bash
	  run: |
	    # AMI names must be unique per account and region. A date-only name makes
	    # a second run on the same day fail, so the time of day is appended; the
	    # original "CentOS-Stream9-NVIDIA-<date>" prefix is unchanged.
	    AMI_NAME="CentOS-Stream9-NVIDIA-$(date +'%Y-%m-%d-%H%M%S')"

	    AMI_ID=$(aws ec2 create-image \
	      --instance-id ${{ steps.launch_instance.outputs.instance_id }} \
	      --name "$AMI_NAME" \
	      --description "CentOS Stream 9 with NVIDIA Driver and DCGM" \
	      --no-reboot \
	      --query 'ImageId' \
	      --output text)

	    # Wait for AMI to be available
	    aws ec2 wait image-available --image-ids "$AMI_ID"
	    echo "ami_id=${AMI_ID}" >> "$GITHUB_OUTPUT"

	# Always-run teardown: terminate the build instance, delete the temporary
	# EC2 key pair and remove the local key files.
	# Every action is attempted even if an earlier one fails, so a failed
	# terminate cannot leave the key pair behind. The step still ends with a
	# non-zero status when any AWS call failed.
	- name: Cleanup AWS Resources
	  if: always()
	  shell: bash
	  env:
	    INSTANCE_ID: ${{ steps.launch_instance.outputs.instance_id }}
	    KEY_NAME: ${{ steps.ssh_key.outputs.key_name }}
	  run: |
	    status=0

	    # Terminate the instance if it exists
	    if [ -n "$INSTANCE_ID" ]; then
	      if aws ec2 terminate-instances --instance-ids "$INSTANCE_ID"; then
	        aws ec2 wait instance-terminated --instance-ids "$INSTANCE_ID" || status=1
	      else
	        status=1
	      fi
	    fi

	    # Delete the temporary key pair if it exists
	    if [ -n "$KEY_NAME" ]; then
	      aws ec2 delete-key-pair --key-name "$KEY_NAME" || status=1
	    fi

	    # Remove local SSH keys
	    rm -f ~/.ssh/ec2_key*

	    exit "$status"

	# Print the id of the new image to the job log; skipped when no AMI was made.
	- if: steps.create_ami.outputs.ami_id != ''
	  name: Output AMI ID
	  run: |
	    echo "New AMI created with ID: ${{ steps.create_ami.outputs.ami_id }}"

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Create CentOS Stream 9 AMI with NVIDIA Driver #13

Workflow file

Create CentOS Stream 9 AMI with NVIDIA Driver #13

Jobs

Run details

Workflow file for this run