Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improved Installation / Prepare Script #160

Merged
merged 13 commits into from
Apr 7, 2024
120 changes: 62 additions & 58 deletions bin/prepare.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
#!/bin/bash
#!/usr/bin/env bash

set -e

trap ctrl_c INT

Expand All @@ -7,81 +9,75 @@ function ctrl_c() {
exit 1
}

export DEBIAN_FRONTEND=noninteractive
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"

## Check distribution
# Build a compact distro tag such as "ubuntu2204" from /etc/os-release.
# The file is sourced inside the command substitution's subshell, so the
# variables it defines do not leak into this script. sed removes the first
# "." from the combined ID + VERSION_ID string.
distribution=$(
  . /etc/os-release
  # Quoted so the value is passed through verbatim (no word splitting/globbing).
  echo "${ID}${VERSION_ID}" | sed 's/\.//'
)

## Check if WSL2
# /proc/version has to mention both "Microsoft" (any letter case) and "WSL2".
# IS_WSL2 is assigned only when both match; otherwise it is left untouched.
if grep -qi Microsoft /proc/version; then
  if grep -q "WSL2" /proc/version; then
    IS_WSL2="yes"
  fi
fi

## Remove needrestart in Ubuntu 22.04
# Applies only to a non-WSL2 host whose distro tag is exactly "ubuntu2204".
if [[ -z "${IS_WSL2}" && "${distribution}" == "ubuntu2204" ]]; then
  sudo apt remove -y needrestart
fi

## Patch system
sudo apt-get update && sudo apt-mark hold grub-pc && sudo DEBIAN_FRONTEND=noninteractive apt-get -y -o \
sudo apt update && sudo apt-mark hold grub-pc && sudo apt -y -o \
DPkg::options::="--force-confdef" -o DPkg::options::="--force-confold" -qq --force-yes upgrade &&
sudo apt-get install --no-install-recommends -y jq
sudo apt install --no-install-recommends -y jq awscli python3-boto3
source $DIR/detect.sh
echo "Detected cloud type ${CLOUD_NAME}"

## Do I have a GPU
GPUS=$(lspci | awk '/NVIDIA/ && ( /VGA/ || /3D controller/ ) ' | wc -l)
# Count NVIDIA GPUs and derive ARCH ("gpu" or "cpu") from the result.
#   - Outside WSL2 the PCI device list from lspci is scanned for NVIDIA
#     VGA / 3D controllers.
#   - Inside WSL2 the count comes from nvidia-smi, and only when
#     /usr/lib/wsl/lib/nvidia-smi exists.
GPUS=0
if [[ -z "${IS_WSL2}" ]]; then
  # wc -l always prints a number, so an lspci that finds nothing yields 0.
  GPUS=$(lspci | awk '/NVIDIA/ && ( /VGA/ || /3D controller/ )' | wc -l)
elif [[ -f /usr/lib/wsl/lib/nvidia-smi ]]; then
  # Count lines only when nvidia-smi succeeds and prints something, so the
  # output of a failing nvidia-smi is never mistaken for a list of GPUs.
  if gpu_list=$(nvidia-smi --query-gpu=name --format=csv,noheader) && [[ -n "${gpu_list}" ]]; then
    GPUS=$(wc -l <<<"${gpu_list}")
  fi
fi

# GPUS is always numeric at this point, so the count alone decides ARCH.
if ((GPUS == 0)); then
  ARCH="cpu"
  echo "No NVIDIA GPU detected. Will not install drivers."
else
  ARCH="gpu"
fi

## Do I have an additional disk for Docker images - looking for /dev/sdc (Azure)
#
# Runs on Azure only (CLOUD_NAME == "azure"):
#   - sdc present without any partition: create one partition, format it as
#     ext4, append it to /etc/fstab and mount it at /var/lib/docker.
#   - sdc present with a partition: leave it alone.
#   - no sdc: nothing is changed.
if [[ "${CLOUD_NAME}" == "azure" ]]; then
  ADDL_DISK=$(lsblk | awk '/^sdc/ {print $1}')
  ADDL_PART=$(lsblk -l | awk -v DISK="$ADDL_DISK" '($0 ~ DISK) && ($0 ~ /part/) {print $1}')

  if [[ -n "${ADDL_DISK}" && -z "${ADDL_PART}" ]]; then
    echo "Found $ADDL_DISK, preparing it for use"
    # The answers to fdisk's interactive prompts are fed in on stdin.
    echo -e "g\nn\np\n1\n\n\nw\n" | sudo fdisk "/dev/${ADDL_DISK}"
    sleep 1s
    ADDL_DEVICE="/dev/${ADDL_DISK}1"
    sudo mkfs.ext4 "${ADDL_DEVICE}"
    sudo mkdir -p /var/lib/docker
    echo "$ADDL_DEVICE /var/lib/docker ext4 rw,user,auto 0 0" | sudo tee -a /etc/fstab
    # Test mount directly: the script runs under `set -e`, so a separate
    # `$?` check after a failed mount would never be reached.
    if ! mount /var/lib/docker; then
      echo "Error during preparing of additional disk. Exiting."
      exit 1
    fi
  elif [[ -n "${ADDL_DISK}" && -n "${ADDL_PART}" ]]; then
    echo "Found $ADDL_DISK - $ADDL_PART already mounted. Installing into present drive/directory structure."
  else
    # ADDL_DISK is empty in this branch, so name the disk that was searched for.
    echo "Did not find sdc. Installing into present drive/directory structure."
  fi
fi

## Adding Nvidia Drivers
if [[ "${ARCH}" == "gpu" ]]; then
distribution=$(
. /etc/os-release
echo $ID$VERSION_ID | sed 's/\.//'
)
curl -sS https://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_64/3bf863cc.pub | sudo gpg --dearmour -o /etc/apt/trusted.gpg.d/nvidia-cuda.gpg
curl -sS https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2004/x86_64/7fa2af80.pub | sudo gpg --dearmour -o /etc/apt/trusted.gpg.d/nvidia-machine-learning.gpg
echo "deb http://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_64 /" | sudo tee /etc/apt/sources.list.d/cuda.list
echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2004/x86_64 /" | sudo tee /etc/apt/sources.list.d/cuda_learn.list
sudo apt update && sudo apt install -y nvidia-driver-470-server cuda-minimal-build-11-4 --no-install-recommends -o Dpkg::Options::="--force-overwrite"
# Install the NVIDIA server driver on GPU hosts that are not WSL2.
# The driver package is chosen from the distro tag; any other distribution
# aborts the script with exit status 1 before anything is installed.
if [[ "${ARCH}" == "gpu" && -z "${IS_WSL2}" ]]; then
  case $distribution in
    ubuntu2004) nvidia_driver_pkg="nvidia-driver-525-server" ;;
    ubuntu2204) nvidia_driver_pkg="nvidia-driver-535-server" ;;
    *)
      echo "Unsupported distribution: $distribution"
      exit 1
      ;;
  esac
  sudo apt install -y "${nvidia_driver_pkg}" --no-install-recommends -o Dpkg::Options::="--force-overwrite"
fi

## Adding AWSCli
sudo apt-get install -y --no-install-recommends awscli python3-boto3

## Installing Docker
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/trusted.gpg.d/docker.gpg
sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable"
sudo apt-get update && sudo apt-get install -y --no-install-recommends docker-ce docker-ce-cli docker-buildx-plugin docker-compose-plugin containerd.io
sudo apt install -y --no-install-recommends docker.io docker-buildx docker-compose-v2

## Install Nvidia Docker Container
if [[ "${ARCH}" == "gpu" ]]; then
distribution=$(
. /etc/os-release
echo $ID$VERSION_ID
)
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo gpg --dearmor -o /etc/apt/trusted.gpg.d/nvidia-docker.gpg
curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list

sudo apt-get update && sudo apt-get install -y --no-install-recommends nvidia-docker2 nvidia-container-toolkit nvidia-container-runtime
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg &&
curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list |
sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' |
sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list

sudo apt update && sudo apt install -y --no-install-recommends nvidia-docker2 nvidia-container-runtime
if [ -f "/etc/docker/daemon.json" ]; then
echo "Altering /etc/docker/daemon.json with default-rutime nvidia."
cat /etc/docker/daemon.json | jq 'del(."default-runtime") + {"default-runtime": "nvidia"}' | sudo tee /etc/docker/daemon.json
Expand All @@ -90,21 +86,29 @@ if [[ "${ARCH}" == "gpu" ]]; then
sudo cp $DIR/../defaults/docker-daemon.json /etc/docker/daemon.json
fi
fi
sudo systemctl enable docker
sudo systemctl restart docker

## Enable and start docker
# Outside WSL2 docker is enabled and restarted through systemctl; inside WSL2
# it is only restarted, using the `service` wrapper.
if [[ -z "${IS_WSL2}" ]]; then
  sudo systemctl enable docker
  sudo systemctl restart docker
else
  sudo service docker restart
fi

## Ensure user can run docker
sudo usermod -a -G docker $(id -un)

## Reboot to load driver -- continue install if in cloud-init
CLOUD_INIT=$(pstree -s $BASHPID | awk /cloud-init/ | wc -l)

if [[ "$CLOUD_INIT" -ne 0 ]]; then
# Final step: either reboot automatically or tell the user what to do next.
#   - CLOUD_INIT non-zero: hand "./init.sh -c <cloud> -a <arch>" to runonce.sh
#     and schedule a reboot (NOTE(review): presumably runonce.sh runs the
#     command after the reboot - confirm in runonce.sh).
#   - WSL2 or CPU-only host: ask the user to log out and back in.
#   - otherwise: ask the user to reboot.
if [[ "${CLOUD_INIT}" -ne 0 ]]; then
  echo "Rebooting in 5 seconds. Will continue with install."
  # Quoted so an install path containing whitespace still resolves.
  cd "${DIR}"
  ./runonce.sh "./init.sh -c ${CLOUD_NAME} -a ${ARCH}"
  sleep 5s
  sudo shutdown -r +1
elif [[ -n "${IS_WSL2}" || "${ARCH}" == "cpu" ]]; then
  echo "First stage done. Log out, then log back in and run init.sh -c ${CLOUD_NAME} -a ${ARCH}"
else
  echo "First stage done. Please reboot and run init.sh -c ${CLOUD_NAME} -a ${ARCH}"
fi