Skip to content

Commit

Permalink
Improved Installation / Prepare Script (#160)
Browse files Browse the repository at this point in the history
* Slimmed down prepare

* Revert "New syntax for swarm"

This reverts commit fa942ce.

* Update libnvidia-container

* Avoid prompts

* Sequence

* Syntax harmonize

* WSL2 compatible detection

* Case insensitive Microsoft

* Remove set -e

* Tune final message
  • Loading branch information
larsll authored Apr 7, 2024
1 parent 10b8483 commit 2daf796
Showing 1 changed file with 62 additions and 58 deletions.
120 changes: 62 additions & 58 deletions bin/prepare.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
#!/bin/bash
#!/usr/bin/env bash

set -e

trap ctrl_c INT

Expand All @@ -7,81 +9,75 @@ function ctrl_c() {
exit 1
}

# Exported so that every child process started by this script inherits it.
# NOTE(review): presumably set to keep apt/dpkg from prompting - confirm.
export DEBIAN_FRONTEND=noninteractive

# Absolute path of the directory that contains this script.
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"

## Check distribution
# Identifier built from /etc/os-release: ID followed by VERSION_ID, with the
# first dot removed (the result is compared against values such as
# "ubuntu2204" later on). The file is sourced inside the command substitution,
# so none of its variables leak into this script.
distribution=$(
  . /etc/os-release
  os_id_version="${ID:-}${VERSION_ID:-}"
  printf '%s\n' "${os_id_version/./}"
)

## Check if WSL2
# IS_WSL2 becomes "yes" only when /proc/version contains "microsoft" (matched
# in any letter case) and also contains "WSL2". Otherwise it is left untouched.
if grep -qi 'microsoft' /proc/version; then
  if grep -q 'WSL2' /proc/version; then
    IS_WSL2="yes"
  fi
fi

## Remove needrestart in Ubuntu 22.04
# Runs only when the detected distribution is ubuntu2204 and IS_WSL2 is empty
# (i.e. WSL2 was not detected).
# NOTE(review): presumably removed so that it cannot interrupt the unattended
# package upgrade with restart prompts - confirm.
if [[ "${distribution}" == "ubuntu2204" && -z "${IS_WSL2}" ]]; then
sudo apt remove -y needrestart
fi

## Patch system
sudo apt-get update && sudo apt-mark hold grub-pc && sudo DEBIAN_FRONTEND=noninteractive apt-get -y -o \
sudo apt update && sudo apt-mark hold grub-pc && sudo apt -y -o \
DPkg::options::="--force-confdef" -o DPkg::options::="--force-confold" -qq --force-yes upgrade &&
sudo apt-get install --no-install-recommends -y jq
sudo apt install --no-install-recommends -y jq awscli python3-boto3
source $DIR/detect.sh
echo "Detected cloud type ${CLOUD_NAME}"

## Do I have a GPU
GPUS=$(lspci | awk '/NVIDIA/ && ( /VGA/ || /3D controller/ ) ' | wc -l)
#######################################
# Print the number of NVIDIA GPUs found on this machine.
#   Outside WSL2: counts lspci lines that mention NVIDIA together with
#     "VGA" or "3D controller".
#   In WSL2: counts the non-empty lines printed by the WSL-provided
#     nvidia-smi, but only when that binary exists and exits successfully.
# Globals:  IS_WSL2 (read)
# Outputs:  the count on stdout; 0 when nothing is found or detection fails
#######################################
count_nvidia_gpus() {
  local wsl_smi="/usr/lib/wsl/lib/nvidia-smi"
  local listing=""

  if [[ -z "${IS_WSL2:-}" ]]; then
    # A missing or failing lspci is treated as "no devices listed".
    listing=$(lspci) || listing=""
    awk '/NVIDIA/ && ( /VGA/ || /3D controller/ ) { n++ } END { print n + 0 }' <<<"${listing}"
  elif [[ -x "${wsl_smi}" ]] &&
    listing=$("${wsl_smi}" --query-gpu=name --format=csv,noheader); then
    # Run the binary that was checked, by full path, and only trust its
    # output when it exited with status 0.
    awk 'NF { n++ } END { print n + 0 }' <<<"${listing}"
  else
    echo 0
  fi
}

GPUS=$(count_nvidia_gpus)
if [[ "${GPUS}" -eq 0 ]]; then
  ARCH="cpu"
  echo "No NVIDIA GPU detected. Will not install drivers."
else
  ARCH="gpu"
fi

## Do I have an additional disk for Docker images - looking for /dev/sdc (Azure)
# Azure only: when a disk named sdc exists and has no partition yet, it is
# partitioned, formatted as ext4 and mounted at /var/lib/docker.
# In every other case nothing is changed and only a message is printed.

if [[ "${CLOUD_NAME}" == "azure" ]]; then
# First column of the lsblk line(s) that start with "sdc"; empty when absent.
ADDL_DISK=$(lsblk | awk '/^sdc/ {print $1}')
# Rows of the flat listing (lsblk -l) that match the disk name and contain "part".
ADDL_PART=$(lsblk -l | awk -v DISK="$ADDL_DISK" '($0 ~ DISK) && ($0 ~ /part/) {print $1}')

if [ -n "$ADDL_DISK" ] && [ -z "$ADDL_PART" ]; then
echo "Found $ADDL_DISK, preparing it for use"
# Scripted fdisk session: the answers are fed on stdin.
# NOTE(review): presumably creates a new partition table with a single
# partition spanning the disk and writes it - confirm against fdisk's prompts.
echo -e "g\nn\np\n1\n\n\nw\n" | sudo fdisk /dev/$ADDL_DISK
# NOTE(review): presumably gives the new partition device time to appear - confirm.
sleep 1s
# First partition of the disk, e.g. /dev/sdc1.
ADDL_DEVICE=$(echo "/dev/"$ADDL_DISK"1")
sudo mkfs.ext4 $ADDL_DEVICE
sudo mkdir -p /var/lib/docker
# Appended to /etc/fstab so the mount is defined by mount point.
echo "$ADDL_DEVICE /var/lib/docker ext4 rw,user,auto 0 0" | sudo tee -a /etc/fstab
# NOTE(review): no sudo here; presumably relies on the "user" option in the
# fstab entry written just above - confirm.
mount /var/lib/docker
if [ $? -ne 0 ]; then
echo "Error during preparing of additional disk. Exiting."
exit 1
fi
# Only checks that a partition row exists; it does not verify that the
# partition is actually mounted.
elif [ -n "$ADDL_DISK" ] && [ -n "$ADDL_PART" ]; then
echo "Found $ADDL_DISK - $ADDL_PART already mounted. Installing into present drive/directory structure."

else
# Reached only when ADDL_DISK is empty, so the message prints an empty name.
echo "Did not find $ADDL_DISK. Installing into present drive/directory structure."
fi
fi

## Adding Nvidia Drivers
if [[ "${ARCH}" == "gpu" ]]; then
distribution=$(
. /etc/os-release
echo $ID$VERSION_ID | sed 's/\.//'
)
curl -sS https://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_64/3bf863cc.pub | sudo gpg --dearmour -o /etc/apt/trusted.gpg.d/nvidia-cuda.gpg
curl -sS https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2004/x86_64/7fa2af80.pub | sudo gpg --dearmour -o /etc/apt/trusted.gpg.d/nvidia-machine-learning.gpg
echo "deb http://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_64 /" | sudo tee /etc/apt/sources.list.d/cuda.list
echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2004/x86_64 /" | sudo tee /etc/apt/sources.list.d/cuda_learn.list
sudo apt update && sudo apt install -y nvidia-driver-470-server cuda-minimal-build-11-4 --no-install-recommends -o Dpkg::Options::="--force-overwrite"
# Install the NVIDIA server driver, but only for GPU hosts outside WSL2.
# The driver package is chosen from the detected distribution; any other
# distribution aborts the script with exit status 1.
if [[ "${ARCH}" == "gpu" && -z "${IS_WSL2}" ]]; then
  if [[ "${distribution}" == "ubuntu2004" ]]; then
    sudo apt install -y nvidia-driver-525-server --no-install-recommends -o Dpkg::Options::="--force-overwrite"
  elif [[ "${distribution}" == "ubuntu2204" ]]; then
    sudo apt install -y nvidia-driver-535-server --no-install-recommends -o Dpkg::Options::="--force-overwrite"
  else
    echo "Unsupported distribution: $distribution"
    exit 1
  fi
fi

## Adding AWSCli
sudo apt-get install -y --no-install-recommends awscli python3-boto3

## Installing Docker
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/trusted.gpg.d/docker.gpg
sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable"
sudo apt-get update && sudo apt-get install -y --no-install-recommends docker-ce docker-ce-cli docker-buildx-plugin docker-compose-plugin containerd.io
sudo apt install -y --no-install-recommends docker.io docker-buildx docker-compose-v2

## Install Nvidia Docker Container
if [[ "${ARCH}" == "gpu" ]]; then
distribution=$(
. /etc/os-release
echo $ID$VERSION_ID
)
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo gpg --dearmor -o /etc/apt/trusted.gpg.d/nvidia-docker.gpg
curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list

sudo apt-get update && sudo apt-get install -y --no-install-recommends nvidia-docker2 nvidia-container-toolkit nvidia-container-runtime
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg &&
curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list |
sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' |
sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list

sudo apt update && sudo apt install -y --no-install-recommends nvidia-docker2 nvidia-container-runtime
if [ -f "/etc/docker/daemon.json" ]; then
echo "Altering /etc/docker/daemon.json with default-runtime nvidia."
# Render the new configuration completely before writing it back. Reading and
# writing the same file inside one pipeline lets tee truncate it while it is
# still being read, and a jq failure would leave the file empty.
if daemon_json=$(jq 'del(."default-runtime") + {"default-runtime": "nvidia"}' /etc/docker/daemon.json); then
  printf '%s\n' "${daemon_json}" | sudo tee /etc/docker/daemon.json
else
  echo "Could not update /etc/docker/daemon.json; leaving it unchanged." >&2
fi
Expand All @@ -90,21 +86,29 @@ if [[ "${ARCH}" == "gpu" ]]; then
sudo cp $DIR/../defaults/docker-daemon.json /etc/docker/daemon.json
fi
fi
sudo systemctl enable docker
sudo systemctl restart docker

## Enable and start docker
# Outside WSL2 the docker unit is enabled and restarted through systemctl;
# when WSL2 was detected it is restarted through the `service` command instead.
if [[ -z "${IS_WSL2}" ]]; then
  sudo systemctl enable docker
  sudo systemctl restart docker
else
  sudo service docker restart
fi

## Ensure user can run docker
# Append the invoking user to the docker group, keeping existing memberships.
sudo usermod --append --groups docker "$(id -un)"

## Reboot to load driver -- continue install if in cloud-init
CLOUD_INIT=$(pstree -s $BASHPID | awk /cloud-init/ | wc -l)

if [[ "$CLOUD_INIT" -ne 0 ]]; then
#######################################
# Finish the first installation stage.
#   CLOUD_INIT non-zero: register init.sh through runonce.sh, then schedule
#     a reboot.
#   Otherwise: print how to continue manually (log out/in when WSL2 was
#     detected or ARCH is "cpu", reboot in all other cases).
# Globals:  CLOUD_INIT, DIR, CLOUD_NAME, ARCH, IS_WSL2 (all read)
# Outputs:  status message on stdout, errors on stderr
# Returns:  exits the script with status 1 when DIR cannot be entered
#######################################
finish_first_stage() {
  if [[ "${CLOUD_INIT}" -ne 0 ]]; then
    echo "Rebooting in 5 seconds. Will continue with install."
    # runonce.sh and init.sh are addressed relative to DIR, so do not go on
    # (and do not reboot) when the directory cannot be entered.
    if ! cd "${DIR}"; then
      echo "Cannot change to ${DIR}. Exiting." >&2
      exit 1
    fi
    ./runonce.sh "./init.sh -c ${CLOUD_NAME} -a ${ARCH}"
    sleep 5s
    # NOTE(review): "+1" schedules the reboot one minute from now, which does
    # not match the "5 seconds" in the message above - confirm the intent.
    sudo shutdown -r +1
  elif [[ -n "${IS_WSL2}" || "${ARCH}" == "cpu" ]]; then
    echo "First stage done. Log out, then log back in and run init.sh -c ${CLOUD_NAME} -a ${ARCH}"
  else
    echo "First stage done. Please reboot and run init.sh -c ${CLOUD_NAME} -a ${ARCH}"
  fi
}

finish_first_stage

0 comments on commit 2daf796

Please sign in to comment.