From c151aad060bb2d22c105b6a5a792f79d66f99e42 Mon Sep 17 00:00:00 2001 From: Jack Francis Date: Wed, 16 May 2018 11:27:56 -0700 Subject: [PATCH] k8s provision only in CSE (#2881) --- parts/k8s/kubernetesagentcustomdata.yml | 33 +- parts/k8s/kubernetescustomscript.sh | 445 ++++++++++------------- parts/k8s/kubernetesmastercustomdata.yml | 40 +- parts/k8s/kubernetesmastervars.t | 10 +- parts/k8s/kubernetesprovisionsource.sh | 98 ++++- pkg/acsengine/engine.go | 7 +- 6 files changed, 304 insertions(+), 329 deletions(-) diff --git a/parts/k8s/kubernetesagentcustomdata.yml b/parts/k8s/kubernetesagentcustomdata.yml index d54196f33f..f31a7ab48d 100644 --- a/parts/k8s/kubernetesagentcustomdata.yml +++ b/parts/k8s/kubernetesagentcustomdata.yml @@ -174,38 +174,13 @@ coreos: ExecStart=/opt/azure/containers/provision-setup.sh {{else}} runcmd: -- echo `date`,`hostname`, startruncmd>>/opt/m -# the first arg is the number of retries, the second arg is the wait duration between two retries and the rest of the args are the cmd to run - set -x - . 
/opt/azure/containers/provision_source.sh -- retrycmd_if_failure 20 1 5 apt-mark hold walinuxagent{{GetKubernetesAgentPreprovisionYaml .}} -- echo `date`,`hostname`, preaptupdate>>/opt/m -- apt_get_update -- echo `date`,`hostname`, postaptupdate>>/opt/m -- retrycmd_if_failure 5 10 120 apt-get install -y apt-transport-https ca-certificates nfs-common iptables iproute2 socat util-linux mount ebtables ethtool init-system-helpers -- echo `date`,`hostname`, aptinstall>>/opt/m -- retrycmd_if_failure 10 1 3 systemctl enable rpcbind rpc-statd -- systemctl_restart 20 1 10 rpcbind -- systemctl_restart 20 1 10 rpc-statd -- echo `date`,`hostname`, predockerinstall>>/opt/m -- retrycmd_if_failure_no_stats 180 1 5 curl -fsSL https://aptdocker.azureedge.net/gpg > /tmp/aptdocker.gpg -- retrycmd_if_failure 10 1 5 apt-key add /tmp/aptdocker.gpg -- echo "deb {{WrapAsVariable "dockerEngineDownloadRepo"}} ubuntu-xenial main" | sudo tee /etc/apt/sources.list.d/docker.list -- "echo \"Package: docker-engine\nPin: version {{WrapAsVariable "dockerEngineVersion"}}\nPin-Priority: 550\n\" > /etc/apt/preferences.d/docker.pref" -- apt_get_update -- retrycmd_if_failure 20 1 120 apt-get install -y ebtables docker-engine -- echo "ExecStartPost=/sbin/iptables -P FORWARD ACCEPT" >> /etc/systemd/system/docker.service.d/exec_start.conf -- touch /opt/azure/containers/dockerinstall.complete -- echo `date`,`hostname`, postdockerinstall>>/opt/m -- mkdir -p /etc/kubernetes/manifests -- usermod -aG docker {{WrapAsVariable "username"}} +- timeout 10 apt-mark hold walinuxagent{{GetKubernetesAgentPreprovisionYaml .}} +- timeout 10 apt-mark unhold walinuxagent {{if IsNSeriesSKU .}} +- . 
/opt/azure/containers/provision_source.sh {{GetGPUDriversInstallScript .}} {{end}} -- echo `date`,`hostname`, PRE-APT-SYSTEMD-DAILY>>/opt/m -- retrycmd_if_failure 20 1 10 /usr/lib/apt/apt.systemd.daily -- echo `date`,`hostname`, POST-APT-SYSTEMD-DAILY>>/opt/m -- retrycmd_if_failure 20 5 5 apt-mark unhold walinuxagent -- mkdir -p /opt/azure/containers && touch /opt/azure/containers/runcmd.complete -- echo `date`,`hostname`, endruncmd>>/opt/m +- mkdir -p /var/log/azure && touch /var/log/azure/cloud-init.complete {{end}} diff --git a/parts/k8s/kubernetescustomscript.sh b/parts/k8s/kubernetescustomscript.sh index fd7225fbd2..dca566c6fe 100644 --- a/parts/k8s/kubernetescustomscript.sh +++ b/parts/k8s/kubernetescustomscript.sh @@ -1,19 +1,31 @@ #!/bin/bash - -# This script runs on every Kubernetes VM -# Exit codes represent the following: -# | exit code number | meaning | -# | 20 | Timeout waiting for docker install to finish | -# | 3 | Service could not be enabled by systemctl | -# | 4 | Service could not be started by systemctl | -# | 5 | Timeout waiting for cloud-init runcmd to complete | -# | 6 | Timeout waiting for a file | -# | 10 | Etcd data dir not found | -# | 11 | Timeout waiting for etcd to be accessible | -# | 30 | Timeout waiting for k8s cluster to be healthy| - set -x +echo `date`,`hostname`, startscript>>/opt/m source /opt/azure/containers/provision_source.sh +# TODO standardize/generalize CSE exit codes +ERR_SYSTEMCTL_ENABLE_FAIL=3 # Service could not be enabled by systemctl +ERR_SYSTEMCTL_START_FAIL=4 # Service could not be started by systemctl +ERR_CLOUD_INIT_TIMEOUT=5 # Timeout waiting for cloud-init runcmd to complete +ERR_FILE_WATCH_TIMEOUT=6 # Timeout waiting for a file +ERR_HOLD_WALINUXAGENT=7 # Unable to place walinuxagent apt package on hold during install +ERR_RELEASE_HOLD_WALINUXAGENT=8 # Unable to release hold on walinuxagent apt package after install +ERR_APT_INSTALL_TIMEOUT=9 # Timeout installing required apt packages 
+ERR_ETCD_DATA_DIR_NOT_FOUND=10 # Etcd data dir not found +ERR_ETCD_RUNNING_TIMEOUT=11 # Timeout waiting for etcd to be accessible +ERR_ETCD_DOWNLOAD_TIMEOUT=12 # Timeout waiting for etcd to download +ERR_ETCD_VOL_MOUNT_FAIL=13 # Unable to mount etcd disk volume +ERR_ETCD_START_TIMEOUT=14 # Unable to start etcd runtime +ERR_ETCD_CONFIG_FAIL=15 # Unable to configure etcd cluster +ERR_DOCKER_INSTALL_TIMEOUT=20 # Timeout waiting for docker install +ERR_DOCKER_DOWNLOAD_TIMEOUT=21 # Timeout waiting for docker download(s) +ERR_DOCKER_KEY_DOWNLOAD_TIMEOUT=22 # Timeout waiting to download docker repo key +ERR_DOCKER_APT_KEY_TIMEOUT=23 # Timeout waiting for docker apt-key +ERR_K8S_RUNNING_TIMEOUT=30 # Timeout waiting for k8s cluster to be healthy +ERR_K8S_DOWNLOAD_TIMEOUT=31 # Timeout waiting for Kubernetes download(s) +ERR_KUBECTL_NOT_FOUND=32 # kubectl client binary not found on local disk +ERR_CNI_DOWNLOAD_TIMEOUT=41 # Timeout waiting for CNI download(s) +ERR_APT_DAILY_TIMEOUT=98 # Timeout waiting for apt daily updates +ERR_APT_UPDATE_TIMEOUT=99 # Timeout waiting for apt-get update to complete OS=$(cat /etc/*-release | grep ^ID= | tr -d 'ID="' | awk '{print toupper($0)}') UBUNTU_OS_NAME="UBUNTU" @@ -21,6 +33,7 @@ RHEL_OS_NAME="RHEL" COREOS_OS_NAME="COREOS" KUBECTL=/usr/local/bin/kubectl DOCKER=/usr/bin/docker +CNI_BIN_DIR=/opt/cni/bin set +x ETCD_PEER_CERT=$(echo ${ETCD_PEER_CERTIFICATES} | cut -d'[' -f 2 | cut -d']' -f 1 | cut -d',' -f $((${MASTER_INDEX}+1))) @@ -32,79 +45,22 @@ if [[ $OS == $COREOS_OS_NAME ]]; then KUBECTL=/opt/kubectl fi -ensureRunCommandCompleted() -{ - echo "waiting for runcmd to finish" - wait_for_file 900 1 /opt/azure/containers/runcmd.complete - if [ ! -f /opt/azure/containers/runcmd.complete ]; then - echo "Timeout waiting for cloud-init runcmd to complete" - exit 5 - fi -} - -ensureDockerInstallCompleted() -{ - echo "waiting for docker install to finish" - wait_for_file 3600 1 /opt/azure/containers/dockerinstall.complete - if [ ! 
-f /opt/azure/containers/dockerinstall.complete ]; then - echo "Timeout waiting for docker install to finish" - exit 20 - fi -} - -configAddons() { - if [[ "${CLUSTER_AUTOSCALER_ADDON}" = True ]]; then - configClusterAutoscalerAddon - fi - echo `date`,`hostname`, configAddonsDone>>/opt/m -} - -configClusterAutoscalerAddon() { - echo `date`,`hostname`, configClusterAutoscalerAddonStart>>/opt/m - - if [[ "${USE_MANAGED_IDENTITY_EXTENSION}" == true ]]; then - echo `date`,`hostname`, configClusterAutoscalerAddonManagedIdentityStart>>/opt/m - CLUSTER_AUTOSCALER_MSI_VOLUME_MOUNT="- mountPath: /var/lib/waagent/\n\ name: waagent\n\ readOnly: true" - CLUSTER_AUTOSCALER_MSI_VOLUME="- hostPath:\n\ path: /var/lib/waagent/\n\ name: waagent" - CLUSTER_AUTOSCALER_MSI_HOST_NETWORK="hostNetwork: true" - - sed -i "s||${CLUSTER_AUTOSCALER_MSI_VOLUME_MOUNT}|g" "/etc/kubernetes/addons/cluster-autoscaler-deployment.yaml" - sed -i "s||${CLUSTER_AUTOSCALER_MSI_VOLUME}|g" "/etc/kubernetes/addons/cluster-autoscaler-deployment.yaml" - sed -i "s||$(echo "${CLUSTER_AUTOSCALER_MSI_HOST_NETWORK}")|g" "/etc/kubernetes/addons/cluster-autoscaler-deployment.yaml" - echo `date`,`hostname`, configClusterAutoscalerAddonManagedIdentityDone>>/opt/m - elif [[ "${USE_MANAGED_IDENTITY_EXTENSION}" == false ]]; then - sed -i "s||""|g" "/etc/kubernetes/addons/cluster-autoscaler-deployment.yaml" - sed -i "s||""|g" "/etc/kubernetes/addons/cluster-autoscaler-deployment.yaml" - sed -i "s||""|g" "/etc/kubernetes/addons/cluster-autoscaler-deployment.yaml" - fi - - sed -i "s||$(echo $SERVICE_PRINCIPAL_CLIENT_ID | base64)|g" "/etc/kubernetes/addons/cluster-autoscaler-deployment.yaml" - sed -i "s||$(echo $SERVICE_PRINCIPAL_CLIENT_SECRET | base64)|g" "/etc/kubernetes/addons/cluster-autoscaler-deployment.yaml" - sed -i "s||$(echo $SUBSCRIPTION_ID | base64)|g" "/etc/kubernetes/addons/cluster-autoscaler-deployment.yaml" - sed -i "s||$(echo $TENANT_ID | base64)|g" "/etc/kubernetes/addons/cluster-autoscaler-deployment.yaml" 
- sed -i "s||$(echo $RESOURCE_GROUP | base64)|g" "/etc/kubernetes/addons/cluster-autoscaler-deployment.yaml" - sed -i "s||$(echo $VM_TYPE | base64)|g" "/etc/kubernetes/addons/cluster-autoscaler-deployment.yaml" - sed -i "s||$(echo $PRIMARY_SCALE_SET)|g" "/etc/kubernetes/addons/cluster-autoscaler-deployment.yaml" - echo `date`,`hostname`, configClusterAutoscalerAddonDone>>/opt/m -} - -echo `date`,`hostname`, startscript>>/opt/m - if [ -f /var/run/reboot-required ]; then REBOOTREQUIRED=true else REBOOTREQUIRED=false fi -if [[ ! -z "${MASTER_NODE}" ]]; then - echo "executing master node provision operations" +function waitForCloudInit() { + wait_for_file 900 1 /var/log/azure/cloud-init.complete || exit $ERR_CLOUD_INIT_TIMEOUT +} +function installEtcd() { useradd -U "etcd" usermod -p "$(head -c 32 /dev/urandom | base64)" "etcd" passwd -u "etcd" id "etcd" - echo `date`,`hostname`, beginGettingEtcdCerts>>/opt/m APISERVER_PRIVATE_KEY_PATH="/etc/kubernetes/certs/apiserver.key" touch "${APISERVER_PRIVATE_KEY_PATH}" chmod 0600 "${APISERVER_PRIVATE_KEY_PATH}" @@ -156,34 +112,58 @@ if [[ ! -z "${MASTER_NODE}" ]]; then echo "${ETCD_PEER_CERT}" | base64 --decode > "${ETCD_PEER_CERTIFICATE_PATH}" set -x - echo `date`,`hostname`, endGettingEtcdCerts>>/opt/m - mkdir -p /opt/azure/containers && touch /opt/azure/containers/certs.ready + /opt/azure/containers/setup-etcd.sh > /opt/azure/containers/setup-etcd.log 2>&1 + RET=$? 
+ if [ $RET -ne 0 ]; then + exit $RET + fi - echo `date`,`hostname`, configAddonsStart>>/opt/m - configAddons -else - echo "skipping master node provision operations, this is an agent node" -fi + /opt/azure/containers/mountetcd.sh || exit $ERR_ETCD_VOL_MOUNT_FAIL + systemctl_restart 10 5 30 etcd || exit $ERR_ETCD_START_TIMEOUT + MEMBER="$(sudo etcdctl member list | grep -E ${MASTER_VM_NAME} | cut -d':' -f 1)" + retrycmd_if_failure 10 1 5 sudo etcdctl member update $MEMBER ${ETCD_PEER_URL} || exit $ERR_ETCD_CONFIG_FAIL +} -KUBELET_PRIVATE_KEY_PATH="/etc/kubernetes/certs/client.key" -touch "${KUBELET_PRIVATE_KEY_PATH}" -chmod 0600 "${KUBELET_PRIVATE_KEY_PATH}" -chown root:root "${KUBELET_PRIVATE_KEY_PATH}" +function installDocker() { + apt_get_install 20 30 120 apt-transport-https ca-certificates iptables iproute2 socat util-linux mount ebtables ethtool init-system-helpers || exit $ERR_APT_INSTALL_TIMEOUT + retrycmd_if_failure_no_stats 20 1 5 curl -fsSL https://aptdocker.azureedge.net/gpg > /tmp/aptdocker.gpg || exit $ERR_DOCKER_KEY_DOWNLOAD_TIMEOUT + retrycmd_if_failure 10 5 10 apt-key add /tmp/aptdocker.gpg || exit $ERR_DOCKER_APT_KEY_TIMEOUT + echo "deb ${DOCKER_REPO} ubuntu-xenial main" | sudo tee /etc/apt/sources.list.d/docker.list + printf "Package: docker-engine\nPin: version ${DOCKER_ENGINE_VERSION}\nPin-Priority: 550\n" > /etc/apt/preferences.d/docker.pref + apt_get_update || exit $ERR_APT_UPDATE_TIMEOUT + apt_get_install 20 30 120 ebtables docker-engine || exit $ERR_DOCKER_INSTALL_TIMEOUT + echo "ExecStartPost=/sbin/iptables -P FORWARD ACCEPT" >> /etc/systemd/system/docker.service.d/exec_start.conf + usermod -aG docker ${ADMINUSER} +} -APISERVER_PUBLIC_KEY_PATH="/etc/kubernetes/certs/apiserver.crt" -touch "${APISERVER_PUBLIC_KEY_PATH}" -chmod 0644 "${APISERVER_PUBLIC_KEY_PATH}" -chown root:root "${APISERVER_PUBLIC_KEY_PATH}" +function runAptDaily() { + retrycmd_if_failure 20 30 60 /usr/lib/apt/apt.systemd.daily || exit $ERR_APT_DAILY_TIMEOUT +} 
-AZURE_JSON_PATH="/etc/kubernetes/azure.json" -touch "${AZURE_JSON_PATH}" -chmod 0600 "${AZURE_JSON_PATH}" -chown root:root "${AZURE_JSON_PATH}" +function generateAggregatedAPICerts() { + wait_for_file 1 1 /etc/kubernetes/generate-proxy-certs.sh && /etc/kubernetes/generate-proxy-certs.sh +} -set +x -echo "${KUBELET_PRIVATE_KEY}" | base64 --decode > "${KUBELET_PRIVATE_KEY_PATH}" -echo "${APISERVER_PUBLIC_KEY}" | base64 --decode > "${APISERVER_PUBLIC_KEY_PATH}" -cat << EOF > "${AZURE_JSON_PATH}" +function configureK8s() { + KUBELET_PRIVATE_KEY_PATH="/etc/kubernetes/certs/client.key" + touch "${KUBELET_PRIVATE_KEY_PATH}" + chmod 0600 "${KUBELET_PRIVATE_KEY_PATH}" + chown root:root "${KUBELET_PRIVATE_KEY_PATH}" + + APISERVER_PUBLIC_KEY_PATH="/etc/kubernetes/certs/apiserver.crt" + touch "${APISERVER_PUBLIC_KEY_PATH}" + chmod 0644 "${APISERVER_PUBLIC_KEY_PATH}" + chown root:root "${APISERVER_PUBLIC_KEY_PATH}" + + AZURE_JSON_PATH="/etc/kubernetes/azure.json" + touch "${AZURE_JSON_PATH}" + chmod 0600 "${AZURE_JSON_PATH}" + chown root:root "${AZURE_JSON_PATH}" + + set +x + echo "${KUBELET_PRIVATE_KEY}" | base64 --decode > "${KUBELET_PRIVATE_KEY_PATH}" + echo "${APISERVER_PUBLIC_KEY}" | base64 --decode > "${APISERVER_PUBLIC_KEY_PATH}" + cat << EOF > "${AZURE_JSON_PATH}" { "cloud":"${TARGET_ENVIRONMENT}", "tenantId": "${TENANT_ID}", @@ -215,19 +195,8 @@ cat << EOF > "${AZURE_JSON_PATH}" "providerKeyVersion": "" } EOF - -set -x - -function ensureFilepath() { - if $REBOOTREQUIRED; then - return - fi - wait_for_file 600 1 $1 - if [ ! 
-f $1 ]; then - echo "Timeout waiting for $1" - exit 6 - fi - + set -x + generateAggregatedAPICerts } function setKubeletOpts () { @@ -235,10 +204,9 @@ function setKubeletOpts () { } function installCNI() { - CNI_BIN_DIR=/opt/cni/bin mkdir -p $CNI_BIN_DIR CONTAINERNETWORKING_CNI_TGZ_TMP=/tmp/containernetworking_cni.tgz - retrycmd_get_tarball 60 1 $CONTAINERNETWORKING_CNI_TGZ_TMP ${CNI_PLUGINS_URL} + retrycmd_get_tarball 60 5 $CONTAINERNETWORKING_CNI_TGZ_TMP ${CNI_PLUGINS_URL} || exit $ERR_CNI_DOWNLOAD_TIMEOUT tar -xzf $CONTAINERNETWORKING_CNI_TGZ_TMP -C $CNI_BIN_DIR chown -R root:root $CNI_BIN_DIR chmod -R 755 $CNI_BIN_DIR @@ -249,10 +217,9 @@ function configAzureCNI() { mkdir -p $CNI_CONFIG_DIR chown -R root:root $CNI_CONFIG_DIR chmod 755 $CNI_CONFIG_DIR - CNI_BIN_DIR=/opt/cni/bin mkdir -p $CNI_BIN_DIR AZURE_CNI_TGZ_TMP=/tmp/azure_cni.tgz - retrycmd_get_tarball 60 1 $AZURE_CNI_TGZ_TMP ${VNET_CNI_PLUGINS_URL} + retrycmd_get_tarball 60 5 $AZURE_CNI_TGZ_TMP ${VNET_CNI_PLUGINS_URL} || exit $ERR_CNI_DOWNLOAD_TIMEOUT tar -xzf $AZURE_CNI_TGZ_TMP -C $CNI_BIN_DIR installCNI mv $CNI_BIN_DIR/10-azure.conflist $CNI_CONFIG_DIR/ @@ -270,10 +237,28 @@ function configNetworkPlugin() { fi } +function systemctlEnableAndStart() { + systemctl_restart 20 1 10 $1 + RESTART_STATUS=$? + systemctl status $1 --no-pager -l > /var/log/azure/$1-status.log + if [ $RESTART_STATUS -ne 0 ]; then + echo "$1 could not be started" + exit $ERR_SYSTEMCTL_START_FAIL + fi + retrycmd_if_failure 10 1 3 systemctl enable $1 + if [ $? -ne 0 ]; then + echo "$1 could not be enabled by systemctl" + exit $ERR_SYSTEMCTL_ENABLE_FAIL + fi +} + function installClearContainersRuntime() { # Add Clear Containers repository key echo "Adding Clear Containers repository key..." 
- curl -sSL --retry 5 --retry-delay 10 --retry-max-time 30 "https://download.opensuse.org/repositories/home:clearcontainers:clear-containers-3/xUbuntu_16.04/Release.key" | apt-key add - + CC_RELEASE_KEY_TMP=/tmp/clear-containers-release.key + CC_URL=https://download.opensuse.org/repositories/home:clearcontainers:clear-containers-3/xUbuntu_16.04/Release.key + retrycmd_if_failure_no_stats 20 1 5 curl -fsSL $CC_URL > $CC_RELEASE_KEY_TMP || exit $ERR_APT_INSTALL_TIMEOUT + retrycmd_if_failure 10 5 10 apt-key add $CC_RELEASE_KEY_TMP || exit $ERR_APT_INSTALL_TIMEOUT # Add Clear Container repository echo "Adding Clear Containers repository..." @@ -281,13 +266,17 @@ function installClearContainersRuntime() { # Install Clear Containers runtime echo "Installing Clear Containers runtime..." - apt-get update && apt-get install --no-install-recommends -y \ - cc-runtime + apt_get_update + apt_get_install 20 30 120 cc-runtime # Install the systemd service and socket files. local repo_uri="https://raw.githubusercontent.com/clearcontainers/proxy/3.0.23" - curl -sSL --retry 5 --retry-delay 10 --retry-max-time 30 "${repo_uri}/cc-proxy.service.in" | sed 's#@libexecdir@#/usr/libexec#' > /etc/systemd/system/cc-proxy.service - curl -sSL --retry 5 --retry-delay 10 --retry-max-time 30 "${repo_uri}/cc-proxy.socket.in" | sed 's#@localstatedir@#/var#' > /etc/systemd/system/cc-proxy.socket + CC_SERVICE_IN_TMP=/tmp/cc-proxy.service.in + CC_SOCKET_IN_TMP=/tmp/cc-proxy.socket.in + retrycmd_if_failure_no_stats 20 1 5 curl -fsSL "${repo_uri}/cc-proxy.service.in" > $CC_SERVICE_IN_TMP + retrycmd_if_failure_no_stats 20 1 5 curl -fsSL "${repo_uri}/cc-proxy.socket.in" > $CC_SOCKET_IN_TMP + cat $CC_SERVICE_IN_TMP | sed 's#@libexecdir@#/usr/libexec#' > /etc/systemd/system/cc-proxy.service + cat $CC_SOCKET_IN_TMP | sed 's#@localstatedir@#/var#' > /etc/systemd/system/cc-proxy.socket # Enable and start Clear Containers proxy service echo "Enabling and starting Clear Containers proxy service..." 
@@ -296,15 +285,15 @@ function installClearContainersRuntime() { function installContainerd() { CRI_CONTAINERD_VERSION="1.1.0" - local CONTAINERD_DOWNLOAD_URL="https://storage.googleapis.com/cri-containerd-release/cri-containerd-${CRI_CONTAINERD_VERSION}.linux-amd64.tar.gz" + CONTAINERD_DOWNLOAD_URL="https://storage.googleapis.com/cri-containerd-release/cri-containerd-${CRI_CONTAINERD_VERSION}.linux-amd64.tar.gz" CONTAINERD_TGZ_TMP=/tmp/containerd.tar.gz - retrycmd_get_tarball 60 1 "$CONTAINERD_TGZ_TMP" "$CONTAINERD_DOWNLOAD_URL" + retrycmd_get_tarball 60 5 "$CONTAINERD_TGZ_TMP" "$CONTAINERD_DOWNLOAD_URL" tar -xzf "$CONTAINERD_TGZ_TMP" -C / rm -f "$CONTAINERD_TGZ_TMP" echo "Successfully installed cri-containerd..." - setupContainerd; + setupContainerd } function setupContainerd() { @@ -337,38 +326,8 @@ function ensureContainerd() { fi } -function systemctlEnableAndStart() { - retrycmd_if_failure 10 1 3 systemctl daemon-reload - systemctl enable $1 - systemctl is-enabled $1 - enabled=$? - for i in {1..900}; do - if [ $enabled -ne 0 ]; then - systemctl enable $1 - systemctl is-enabled $1 - enabled=$? - else - echo "$1 took $i seconds to be enabled by systemctl" - break - fi - sleep 1 - done - if [ $enabled -ne 0 ] - then - echo "$1 could not be enabled by systemctl" - exit 3 - fi - systemctl_restart 100 1 10 $1 - retrycmd_if_failure 10 1 3 systemctl status $1 --no-pager -l > /var/log/azure/$1-status.log - systemctl is-failed $1 - if [ $? 
-eq 0 ] - then - echo "$1 could not be started" - exit 4 - fi -} - function ensureDocker() { + wait_for_file 600 1 $DOCKER || exit $ERR_FILE_WATCH_TIMEOUT systemctlEnableAndStart docker } function ensureKMS() { @@ -380,7 +339,7 @@ function ensureKubelet() { } function extractHyperkube(){ - retrycmd_if_failure 100 1 60 docker pull $HYPERKUBE_URL + retrycmd_if_failure 100 1 60 docker pull $HYPERKUBE_URL || exit $ERR_K8S_DOWNLOAD_TIMEOUT systemctlEnableAndStart hyperkube-extract } @@ -392,84 +351,24 @@ function ensureJournal(){ echo "ForwardToSyslog=no" >> /etc/systemd/journald.conf } -function ensureK8s() { - if $REBOOTREQUIRED; then - return - fi - k8sHealthy=1 - nodesActive=1 - nodesReady=1 - wait_for_file 600 1 $KUBECTL - for i in {1..600}; do - $KUBECTL 2>/dev/null cluster-info - if [ "$?" = "0" ] - then - echo "k8s cluster is healthy, took $i seconds" - k8sHealthy=0 - break - fi - sleep 1 - done - if [ $k8sHealthy -ne 0 ] - then - echo "k8s cluster is not healthy after $i seconds" - exit 30 - fi - ensurePodSecurityPolicy -} - -function ensureEtcd() { - etcdIsRunning=1 - for i in {1..600}; do - curl --cacert /etc/kubernetes/certs/ca.crt --cert /etc/kubernetes/certs/etcdclient.crt --key /etc/kubernetes/certs/etcdclient.key --max-time 60 https://127.0.0.1:2379/v2/machines; - if [ $? -eq 0 ] - then - etcdIsRunning=0 - echo "Etcd setup successfully, took $i seconds" - break - fi - sleep 1 - done - if [ $etcdIsRunning -ne 0 ] - then - echo "Etcd not accessible after $i seconds" - exit 11 +function ensurePodSecurityPolicy() { + POD_SECURITY_POLICY_FILE="/etc/kubernetes/manifests/pod-security-policy.yaml" + if [ -f $POD_SECURITY_POLICY_FILE ]; then + $KUBECTL create -f $POD_SECURITY_POLICY_FILE fi } -function ensureEtcdDataDir() { - mount | grep /dev/sdc1 | grep /var/lib/etcddisk - if [ "$?" 
= "0" ] - then - echo "Etcd is running with data dir at: /var/lib/etcddisk" +function ensureK8sControlPlane() { + if $REBOOTREQUIRED; then return - else - echo "/var/lib/etcddisk was not found at /dev/sdc1. Trying to mount all devices." - s = 5 - for i in {1..60}; do - sudo mount -a && mount | grep /dev/sdc1 | grep /var/lib/etcddisk; - if [ "$?" = "0" ] - then - (( t = ${i} * ${s} )) - echo "/var/lib/etcddisk mounted at: /dev/sdc1, took $t seconds" - return - fi - sleep $s - done fi - - echo "Etcd data dir was not found at: /var/lib/etcddisk" - exit 10 + wait_for_file 600 1 $KUBECTL || exit $ERR_FILE_WATCH_TIMEOUT + retrycmd_if_failure 100 1 20 $KUBECTL 2>/dev/null cluster-info || exit $ERR_K8S_RUNNING_TIMEOUT + ensurePodSecurityPolicy } -function ensurePodSecurityPolicy(){ - if $REBOOTREQUIRED; then - return - fi - POD_SECURITY_POLICY_FILE="/etc/kubernetes/manifests/pod-security-policy.yaml" - if [ -f $POD_SECURITY_POLICY_FILE ]; then - $KUBECTL create -f $POD_SECURITY_POLICY_FILE - fi +function ensureEtcd() { + retrycmd_if_failure 100 1 10 curl --cacert /etc/kubernetes/certs/ca.crt --cert /etc/kubernetes/certs/etcdclient.crt --key /etc/kubernetes/certs/etcdclient.key --retry 5 --retry-delay 10 --retry-max-time 10 --max-time 60 ${ETCD_CLIENT_URL}/v2/machines || exit $ERR_ETCD_RUNNING_TIMEOUT } function writeKubeConfig() { @@ -509,30 +408,73 @@ users: set -x } -if [[ "$CONTAINER_RUNTIME" == "clear-containers" ]]; then - # If the container runtime is "clear-containers" we need to ensure the - # run command is completed _before_ we start installing all the dependencies - # for clear-containers to make sure there is not a dpkg lock. 
- ensureRunCommandCompleted - echo `date`,`hostname`, RunCmdCompleted>>/opt/m -fi +function configClusterAutoscalerAddon() { + if [[ "${USE_MANAGED_IDENTITY_EXTENSION}" == true ]]; then + CLUSTER_AUTOSCALER_MSI_VOLUME_MOUNT="- mountPath: /var/lib/waagent/\n\ name: waagent\n\ readOnly: true" + CLUSTER_AUTOSCALER_MSI_VOLUME="- hostPath:\n\ path: /var/lib/waagent/\n\ name: waagent" + CLUSTER_AUTOSCALER_MSI_HOST_NETWORK="hostNetwork: true" + + sed -i "s||${CLUSTER_AUTOSCALER_MSI_VOLUME_MOUNT}|g" "/etc/kubernetes/addons/cluster-autoscaler-deployment.yaml" + sed -i "s||${CLUSTER_AUTOSCALER_MSI_VOLUME}|g" "/etc/kubernetes/addons/cluster-autoscaler-deployment.yaml" + sed -i "s||$(echo "${CLUSTER_AUTOSCALER_MSI_HOST_NETWORK}")|g" "/etc/kubernetes/addons/cluster-autoscaler-deployment.yaml" + elif [[ "${USE_MANAGED_IDENTITY_EXTENSION}" == false ]]; then + sed -i "s||""|g" "/etc/kubernetes/addons/cluster-autoscaler-deployment.yaml" + sed -i "s||""|g" "/etc/kubernetes/addons/cluster-autoscaler-deployment.yaml" + sed -i "s||""|g" "/etc/kubernetes/addons/cluster-autoscaler-deployment.yaml" + fi + + sed -i "s||$(echo $SERVICE_PRINCIPAL_CLIENT_ID | base64)|g" "/etc/kubernetes/addons/cluster-autoscaler-deployment.yaml" + sed -i "s||$(echo $SERVICE_PRINCIPAL_CLIENT_SECRET | base64)|g" "/etc/kubernetes/addons/cluster-autoscaler-deployment.yaml" + sed -i "s||$(echo $SUBSCRIPTION_ID | base64)|g" "/etc/kubernetes/addons/cluster-autoscaler-deployment.yaml" + sed -i "s||$(echo $TENANT_ID | base64)|g" "/etc/kubernetes/addons/cluster-autoscaler-deployment.yaml" + sed -i "s||$(echo $RESOURCE_GROUP | base64)|g" "/etc/kubernetes/addons/cluster-autoscaler-deployment.yaml" + sed -i "s||$(echo $VM_TYPE | base64)|g" "/etc/kubernetes/addons/cluster-autoscaler-deployment.yaml" + sed -i "s||$(echo $PRIMARY_SCALE_SET)|g" "/etc/kubernetes/addons/cluster-autoscaler-deployment.yaml" +} + +function configAddons() { + if [[ "${CLUSTER_AUTOSCALER_ADDON}" = True ]]; then + configClusterAutoscalerAddon + fi +} 
if [[ $OS == $UBUNTU_OS_NAME ]]; then + echo `date`,`hostname`, apt-get_update_begin>>/opt/m + apt_get_update || exit $ERR_APT_INSTALL_TIMEOUT + echo `date`,`hostname`, apt-get_update_end>>/opt/m # make sure walinuxagent doesn't get updated in the middle of running this script - retrycmd_if_failure 20 5 5 apt-mark hold walinuxagent + retrycmd_if_failure 20 5 30 apt-mark hold walinuxagent || exit $ERR_HOLD_WALINUXAGENT + +fi + +waitForCloudInit + +if [[ ! -z "${MASTER_NODE}" ]]; then + echo "executing master node provision operations" + installEtcd +else + echo "skipping master node provision operations, this is an agent node" fi -echo `date`,`hostname`, EnsureDockerStart>>/opt/m -ensureDockerInstallCompleted +installDocker +runAptDaily +configureK8s ensureDocker -echo `date`,`hostname`, configNetworkPluginStart>>/opt/m configNetworkPlugin -echo `date`,`hostname`, configAddonsStart>>/opt/m -configAddons + +if [[ ! -z "${MASTER_NODE}" ]]; then + echo `date`,`hostname`, configAddonsStart>>/opt/m + configAddons + echo `date`,`hostname`, configAddonsDone>>/opt/m +fi + +echo `date`,`hostname`, extractHyperkubeStart>>/opt/m +extractHyperkube +echo `date`,`hostname`, extractHyperkubeDone>>/opt/m + if [[ "$CONTAINER_RUNTIME" == "clear-containers" ]]; then # Ensure we can nest virtualization if grep -q vmx /proc/cpuinfo; then - echo `date`,`hostname`, installClearContainersRuntimeStart>>/opt/m installClearContainersRuntime fi fi @@ -542,27 +484,18 @@ if [[ "$CONTAINER_RUNTIME" == "clear-containers" ]] || [[ "$CONTAINER_RUNTIME" = fi echo `date`,`hostname`, ensureContainerdStart>>/opt/m ensureContainerd -echo `date`,`hostname`, extractHyperkubeStart>>/opt/m -extractHyperkube + if [[ ! -z "${MASTER_NODE}" && ! 
-z "${EnableEncryptionWithExternalKms}" ]]; then - echo `date`,`hostname`, ensureKMSStart>>/opt/m ensureKMS fi -echo `date`,`hostname`, ensureKubeletStart>>/opt/m + ensureKubelet -echo `date`,`hostname`, ensureJournalStart>>/opt/m ensureJournal -echo `date`,`hostname`, ensureJournalDone>>/opt/m -ensureRunCommandCompleted -echo `date`,`hostname`, RunCmdCompleted>>/opt/m if [[ ! -z "${MASTER_NODE}" ]]; then writeKubeConfig - ensureFilepath $KUBECTL - ensureFilepath $DOCKER - ensureEtcdDataDir ensureEtcd - ensureK8s + ensureK8sControlPlane fi if [[ $OS == $UBUNTU_OS_NAME ]]; then @@ -570,18 +503,16 @@ if [[ $OS == $UBUNTU_OS_NAME ]]; then echo 2dd1ce17-079e-403c-b352-a1921ee207ee > /sys/bus/vmbus/drivers/hv_util/unbind sed -i "13i\echo 2dd1ce17-079e-403c-b352-a1921ee207ee > /sys/bus/vmbus/drivers/hv_util/unbind\n" /etc/rc.local - retrycmd_if_failure 20 5 5 apt-mark unhold walinuxagent + retrycmd_if_failure 20 5 30 apt-mark unhold walinuxagent || exit $ERR_RELEASE_HOLD_WALINUXAGENT fi echo "Install complete successfully" +mkdir -p /opt/azure/containers && touch /opt/azure/containers/provision.complete +ps auxfww > /opt/azure/provision-ps.log & + if $REBOOTREQUIRED; then # wait 1 minute to restart node, so that the custom script extension can complete echo 'reboot required, rebooting node in 1 minute' /bin/bash -c "shutdown -r 1 &" fi - -echo `date`,`hostname`, endscript>>/opt/m - -mkdir -p /opt/azure/containers && touch /opt/azure/containers/provision.complete -ps auxfww > /opt/azure/provision-ps.log & diff --git a/parts/k8s/kubernetesmastercustomdata.yml b/parts/k8s/kubernetesmastercustomdata.yml index f16cf9d4d6..f0bb6496c0 100644 --- a/parts/k8s/kubernetesmastercustomdata.yml +++ b/parts/k8s/kubernetesmastercustomdata.yml @@ -348,11 +348,20 @@ MASTER_ARTIFACTS_CONFIG_PLACEHOLDER #!/bin/bash set -x source /opt/azure/containers/provision_source.sh + # TODO standardize/generalize CSE exit codes + ERR_ETCD_DOWNLOAD_TIMEOUT=12 + ERR_SYSTEMCTL_ENABLE_FAIL=3 
ETCD_VER=v{{WrapAsVariable "etcdVersion"}} DOWNLOAD_URL={{WrapAsVariable "etcdDownloadURLBase"}} retrycmd_get_tarball 60 1 /tmp/etcd-${ETCD_VER}-linux-amd64.tar.gz ${DOWNLOAD_URL}/etcd-${ETCD_VER}-linux-amd64.tar.gz + if [ $? -ne 0 ]; then + exit $ERR_ETCD_DOWNLOAD_TIMEOUT + fi tar xzvf /tmp/etcd-${ETCD_VER}-linux-amd64.tar.gz -C /usr/bin/ --strip-components=1 retrycmd_if_failure 10 1 3 systemctl enable etcd.service + if [ $? -ne 0 ]; then + exit $ERR_SYSTEMCTL_ENABLE_FAIL + fi sudo sed -i "1iETCDCTL_ENDPOINTS=https://127.0.0.1:2379" /etc/environment sudo sed -i "1iETCDCTL_CA_FILE={{WrapAsVariable "etcdCaFilepath"}}" /etc/environment sudo sed -i "1iETCDCTL_KEY_FILE={{WrapAsVariable "etcdClientKeyFilepath"}}" /etc/environment @@ -397,33 +406,8 @@ coreos: ExecStart=/opt/azure/containers/provision-setup.sh {{else}} runcmd: -# the first arg is the number of retries, the second arg is the wait duration between two retries and the rest of the args are the cmd to run - set -x -- . /opt/azure/containers/provision_source.sh -- wait_for_file 1800 1 /opt/azure/containers/certs.ready -- /opt/azure/containers/setup-etcd.sh > /opt/azure/containers/setup-etcd.log 2>&1 -- retrycmd_if_failure 20 5 5 apt-mark hold walinuxagent {{GetKubernetesMasterPreprovisionYaml}} -- /opt/azure/containers/mountetcd.sh -- systemctl_restart 10 1 5 etcd -- MEMBER="$(sudo etcdctl member list | grep -E {{WrapAsVerbatim "variables('masterVMNames')[copyIndex(variables('masterOffset'))]"}} | cut -d{{WrapAsVariable "singleQuote"}}:{{WrapAsVariable "singleQuote"}} -f 1)" -- retrycmd_if_failure 10 1 5 sudo etcdctl member update ${MEMBER} {{WrapAsVerbatim "variables('masterEtcdPeerURLs')[copyIndex(variables('masterOffset'))]"}} -- retrycmd_if_failure 5 1 10 curl --cacert /etc/kubernetes/certs/ca.crt --cert /etc/kubernetes/certs/etcdclient.crt --key /etc/kubernetes/certs/etcdclient.key --retry 5 --retry-delay 10 --retry-max-time 10 --max-time 60 "{{WrapAsVerbatim 
"variables('masterEtcdClientURLs')[copyIndex(variables('masterOffset'))]"}}"/v2/machines -- apt_get_update -- retrycmd_if_failure 5 1 120 apt-get install -y apt-transport-https ca-certificates iptables iproute2 socat util-linux mount ebtables ethtool init-system-helpers -- retrycmd_if_failure_no_stats 180 1 5 curl -fsSL https://aptdocker.azureedge.net/gpg > /tmp/aptdocker.gpg -- retrycmd_if_failure 10 1 5 apt-key add /tmp/aptdocker.gpg -- echo "deb {{WrapAsVariable "dockerEngineDownloadRepo"}} ubuntu-xenial main" | sudo tee /etc/apt/sources.list.d/docker.list -- "echo \"Package: docker-engine\nPin: version {{WrapAsVariable "dockerEngineVersion"}}\nPin-Priority: 550\n\" > /etc/apt/preferences.d/docker.pref" -- apt_get_update -- retrycmd_if_failure 20 1 120 apt-get install -y ebtables docker-engine -- echo "ExecStartPost=/sbin/iptables -P FORWARD ACCEPT" >> /etc/systemd/system/docker.service.d/exec_start.conf -- touch /opt/azure/containers/dockerinstall.complete -- mkdir -p /etc/kubernetes/manifests -- usermod -aG docker {{WrapAsVariable "username"}} -- retrycmd_if_failure 20 1 10 /usr/lib/apt/apt.systemd.daily -{{if EnableAggregatedAPIs}} -- bash /etc/kubernetes/generate-proxy-certs.sh -{{end}} -- retrycmd_if_failure 20 1 5 apt-mark unhold walinuxagent -- touch /opt/azure/containers/runcmd.complete +- timeout 10 apt-mark hold walinuxagent{{GetKubernetesMasterPreprovisionYaml}} +- timeout 10 apt-mark unhold walinuxagent +- mkdir -p /var/log/azure && touch /var/log/azure/cloud-init.complete {{end}} diff --git a/parts/k8s/kubernetesmastervars.t b/parts/k8s/kubernetesmastervars.t index 09e152744a..d7aacd41af 100644 --- a/parts/k8s/kubernetesmastervars.t +++ b/parts/k8s/kubernetesmastervars.t @@ -237,23 +237,23 @@ "mountetcdScript": "{{GetKubernetesB64Mountetcd}}", {{if not IsOpenShift}} {{if not IsHostedMaster}} - "provisionScriptParametersMaster": "[concat('MASTER_NODE=true CLUSTER_AUTOSCALER_ADDON=',variables('kubernetesClusterAutoscalerEnabled'),' 
APISERVER_PRIVATE_KEY=',variables('apiServerPrivateKey'),' CA_CERTIFICATE=',variables('caCertificate'),' CA_PRIVATE_KEY=',variables('caPrivateKey'),' MASTER_FQDN=',variables('masterFqdnPrefix'),' KUBECONFIG_CERTIFICATE=',variables('kubeConfigCertificate'),' KUBECONFIG_KEY=',variables('kubeConfigPrivateKey'),' ETCD_SERVER_CERTIFICATE=',variables('etcdServerCertificate'),' ETCD_CLIENT_CERTIFICATE=',variables('etcdClientCertificate'),' ETCD_SERVER_PRIVATE_KEY=',variables('etcdServerPrivateKey'),' ETCD_CLIENT_PRIVATE_KEY=',variables('etcdClientPrivateKey'),' ETCD_PEER_CERTIFICATES=',string(variables('etcdPeerCertificates')),' ETCD_PEER_PRIVATE_KEYS=',string(variables('etcdPeerPrivateKeys')),' ADMINUSER=',variables('username'))]", + "provisionScriptParametersMaster": "[concat('MASTER_VM_NAME=',variables('masterVMNames')[variables('masterOffset')],' ETCD_PEER_URL=',variables('masterEtcdPeerURLs')[variables('masterOffset')],' ETCD_CLIENT_URL=',variables('masterEtcdClientURLs')[variables('masterOffset')],' MASTER_NODE=true CLUSTER_AUTOSCALER_ADDON=',variables('kubernetesClusterAutoscalerEnabled'),' APISERVER_PRIVATE_KEY=',variables('apiServerPrivateKey'),' CA_CERTIFICATE=',variables('caCertificate'),' CA_PRIVATE_KEY=',variables('caPrivateKey'),' MASTER_FQDN=',variables('masterFqdnPrefix'),' KUBECONFIG_CERTIFICATE=',variables('kubeConfigCertificate'),' KUBECONFIG_KEY=',variables('kubeConfigPrivateKey'),' ETCD_SERVER_CERTIFICATE=',variables('etcdServerCertificate'),' ETCD_CLIENT_CERTIFICATE=',variables('etcdClientCertificate'),' ETCD_SERVER_PRIVATE_KEY=',variables('etcdServerPrivateKey'),' ETCD_CLIENT_PRIVATE_KEY=',variables('etcdClientPrivateKey'),' ETCD_PEER_CERTIFICATES=',string(variables('etcdPeerCertificates')),' ETCD_PEER_PRIVATE_KEYS=',string(variables('etcdPeerPrivateKeys')))]", {{if EnableEncryptionWithExternalKms}} {{ if not UseManagedIdentity}} "servicePrincipalObjectId": "[parameters('servicePrincipalObjectId')]", {{end}} - "provisionScriptParametersCommon": 
"[concat('TENANT_ID=',variables('tenantID'),' HYPERKUBE_URL=',variables('kubernetesHyperkubeSpec'),' APISERVER_PUBLIC_KEY=',variables('apiserverCertificate'),' SUBSCRIPTION_ID=',variables('subscriptionId'),' RESOURCE_GROUP=',variables('resourceGroup'),' LOCATION=',variables('location'),' VM_TYPE=',variables('vmType'),' SUBNET=',variables('subnetName'),' NETWORK_SECURITY_GROUP=',variables('nsgName'),' VIRTUAL_NETWORK=',variables('virtualNetworkName'),' VIRTUAL_NETWORK_RESOURCE_GROUP=',variables('virtualNetworkResourceGroupName'),' ROUTE_TABLE=',variables('routeTableName'),' PRIMARY_AVAILABILITY_SET=',variables('primaryAvailabilitySetName'),' PRIMARY_SCALE_SET=',variables('primaryScaleSetName'),' SERVICE_PRINCIPAL_CLIENT_ID=',variables('servicePrincipalClientId'),' SERVICE_PRINCIPAL_CLIENT_SECRET=',variables('singleQuote'),variables('servicePrincipalClientSecret'),variables('singleQuote'),' KUBELET_PRIVATE_KEY=',variables('clientPrivateKey'),' TARGET_ENVIRONMENT=',variables('targetEnvironment'),' NETWORK_PLUGIN=',variables('networkPlugin'),' FQDNSuffix=',variables('fqdnEndpointSuffix'),' VNET_CNI_PLUGINS_URL=',variables('vnetCniLinuxPluginsURL'),' CNI_PLUGINS_URL=',variables('cniPluginsURL'),' CLOUDPROVIDER_BACKOFF=',variables('cloudProviderBackoff'),' CLOUDPROVIDER_BACKOFF_RETRIES=',variables('cloudProviderBackoffRetries'),' CLOUDPROVIDER_BACKOFF_EXPONENT=',variables('cloudProviderBackoffExponent'),' CLOUDPROVIDER_BACKOFF_DURATION=',variables('cloudProviderBackoffDuration'),' CLOUDPROVIDER_BACKOFF_JITTER=',variables('cloudProviderBackoffJitter'),' CLOUDPROVIDER_RATELIMIT=',variables('cloudProviderRatelimit'),' CLOUDPROVIDER_RATELIMIT_QPS=',variables('cloudProviderRatelimitQPS'),' CLOUDPROVIDER_RATELIMIT_BUCKET=',variables('cloudProviderRatelimitBucket'),' USE_MANAGED_IDENTITY_EXTENSION=',variables('useManagedIdentityExtension'),' USE_INSTANCE_METADATA=',variables('useInstanceMetadata'),' CONTAINER_RUNTIME=',variables('containerRuntime'),' 
KUBECONFIG_SERVER=',variables('kubeconfigServer'),' KMS_PROVIDER_VAULT_NAME=',variables('clusterKeyVaultName'), ' EnableEncryptionWithExternalKms=true')]", + "provisionScriptParametersCommon": "[concat('ADMINUSER=',variables('username'),' DOCKER_ENGINE_VERSION=',variables('dockerEngineVersion'),' DOCKER_REPO=',variables('dockerEngineDownloadRepo'),' TENANT_ID=',variables('tenantID'),' HYPERKUBE_URL=',variables('kubernetesHyperkubeSpec'),' APISERVER_PUBLIC_KEY=',variables('apiserverCertificate'),' SUBSCRIPTION_ID=',variables('subscriptionId'),' RESOURCE_GROUP=',variables('resourceGroup'),' LOCATION=',variables('location'),' VM_TYPE=',variables('vmType'),' SUBNET=',variables('subnetName'),' NETWORK_SECURITY_GROUP=',variables('nsgName'),' VIRTUAL_NETWORK=',variables('virtualNetworkName'),' VIRTUAL_NETWORK_RESOURCE_GROUP=',variables('virtualNetworkResourceGroupName'),' ROUTE_TABLE=',variables('routeTableName'),' PRIMARY_AVAILABILITY_SET=',variables('primaryAvailabilitySetName'),' PRIMARY_SCALE_SET=',variables('primaryScaleSetName'),' SERVICE_PRINCIPAL_CLIENT_ID=',variables('servicePrincipalClientId'),' SERVICE_PRINCIPAL_CLIENT_SECRET=',variables('singleQuote'),variables('servicePrincipalClientSecret'),variables('singleQuote'),' KUBELET_PRIVATE_KEY=',variables('clientPrivateKey'),' TARGET_ENVIRONMENT=',variables('targetEnvironment'),' NETWORK_PLUGIN=',variables('networkPlugin'),' FQDNSuffix=',variables('fqdnEndpointSuffix'),' VNET_CNI_PLUGINS_URL=',variables('vnetCniLinuxPluginsURL'),' CNI_PLUGINS_URL=',variables('cniPluginsURL'),' CLOUDPROVIDER_BACKOFF=',variables('cloudProviderBackoff'),' CLOUDPROVIDER_BACKOFF_RETRIES=',variables('cloudProviderBackoffRetries'),' CLOUDPROVIDER_BACKOFF_EXPONENT=',variables('cloudProviderBackoffExponent'),' CLOUDPROVIDER_BACKOFF_DURATION=',variables('cloudProviderBackoffDuration'),' CLOUDPROVIDER_BACKOFF_JITTER=',variables('cloudProviderBackoffJitter'),' CLOUDPROVIDER_RATELIMIT=',variables('cloudProviderRatelimit'),' 
CLOUDPROVIDER_RATELIMIT_QPS=',variables('cloudProviderRatelimitQPS'),' CLOUDPROVIDER_RATELIMIT_BUCKET=',variables('cloudProviderRatelimitBucket'),' USE_MANAGED_IDENTITY_EXTENSION=',variables('useManagedIdentityExtension'),' USE_INSTANCE_METADATA=',variables('useInstanceMetadata'),' CONTAINER_RUNTIME=',variables('containerRuntime'),' KUBECONFIG_SERVER=',variables('kubeconfigServer'),' KMS_PROVIDER_VAULT_NAME=',variables('clusterKeyVaultName'), ' EnableEncryptionWithExternalKms=true')]", {{else}} - "provisionScriptParametersCommon": "[concat('TENANT_ID=',variables('tenantID'),' HYPERKUBE_URL=',variables('kubernetesHyperkubeSpec'),' APISERVER_PUBLIC_KEY=',variables('apiserverCertificate'),' SUBSCRIPTION_ID=',variables('subscriptionId'),' RESOURCE_GROUP=',variables('resourceGroup'),' LOCATION=',variables('location'),' VM_TYPE=',variables('vmType'),' SUBNET=',variables('subnetName'),' NETWORK_SECURITY_GROUP=',variables('nsgName'),' VIRTUAL_NETWORK=',variables('virtualNetworkName'),' VIRTUAL_NETWORK_RESOURCE_GROUP=',variables('virtualNetworkResourceGroupName'),' ROUTE_TABLE=',variables('routeTableName'),' PRIMARY_AVAILABILITY_SET=',variables('primaryAvailabilitySetName'),' PRIMARY_SCALE_SET=',variables('primaryScaleSetName'),' SERVICE_PRINCIPAL_CLIENT_ID=',variables('servicePrincipalClientId'),' SERVICE_PRINCIPAL_CLIENT_SECRET=',variables('singleQuote'),variables('servicePrincipalClientSecret'),variables('singleQuote'),' KUBELET_PRIVATE_KEY=',variables('clientPrivateKey'),' TARGET_ENVIRONMENT=',variables('targetEnvironment'),' NETWORK_PLUGIN=',variables('networkPlugin'),' FQDNSuffix=',variables('fqdnEndpointSuffix'),' VNET_CNI_PLUGINS_URL=',variables('vnetCniLinuxPluginsURL'),' CNI_PLUGINS_URL=',variables('cniPluginsURL'),' CLOUDPROVIDER_BACKOFF=',variables('cloudProviderBackoff'),' CLOUDPROVIDER_BACKOFF_RETRIES=',variables('cloudProviderBackoffRetries'),' CLOUDPROVIDER_BACKOFF_EXPONENT=',variables('cloudProviderBackoffExponent'),' 
CLOUDPROVIDER_BACKOFF_DURATION=',variables('cloudProviderBackoffDuration'),' CLOUDPROVIDER_BACKOFF_JITTER=',variables('cloudProviderBackoffJitter'),' CLOUDPROVIDER_RATELIMIT=',variables('cloudProviderRatelimit'),' CLOUDPROVIDER_RATELIMIT_QPS=',variables('cloudProviderRatelimitQPS'),' CLOUDPROVIDER_RATELIMIT_BUCKET=',variables('cloudProviderRatelimitBucket'),' USE_MANAGED_IDENTITY_EXTENSION=',variables('useManagedIdentityExtension'),' USE_INSTANCE_METADATA=',variables('useInstanceMetadata'),' CONTAINER_RUNTIME=',variables('containerRuntime'),' KUBECONFIG_SERVER=',variables('kubeconfigServer'))]", + "provisionScriptParametersCommon": "[concat('ADMINUSER=',variables('username'),' DOCKER_ENGINE_VERSION=',variables('dockerEngineVersion'),' DOCKER_REPO=',variables('dockerEngineDownloadRepo'),' TENANT_ID=',variables('tenantID'),' HYPERKUBE_URL=',variables('kubernetesHyperkubeSpec'),' APISERVER_PUBLIC_KEY=',variables('apiserverCertificate'),' SUBSCRIPTION_ID=',variables('subscriptionId'),' RESOURCE_GROUP=',variables('resourceGroup'),' LOCATION=',variables('location'),' VM_TYPE=',variables('vmType'),' SUBNET=',variables('subnetName'),' NETWORK_SECURITY_GROUP=',variables('nsgName'),' VIRTUAL_NETWORK=',variables('virtualNetworkName'),' VIRTUAL_NETWORK_RESOURCE_GROUP=',variables('virtualNetworkResourceGroupName'),' ROUTE_TABLE=',variables('routeTableName'),' PRIMARY_AVAILABILITY_SET=',variables('primaryAvailabilitySetName'),' PRIMARY_SCALE_SET=',variables('primaryScaleSetName'),' SERVICE_PRINCIPAL_CLIENT_ID=',variables('servicePrincipalClientId'),' SERVICE_PRINCIPAL_CLIENT_SECRET=',variables('singleQuote'),variables('servicePrincipalClientSecret'),variables('singleQuote'),' KUBELET_PRIVATE_KEY=',variables('clientPrivateKey'),' TARGET_ENVIRONMENT=',variables('targetEnvironment'),' NETWORK_PLUGIN=',variables('networkPlugin'),' FQDNSuffix=',variables('fqdnEndpointSuffix'),' VNET_CNI_PLUGINS_URL=',variables('vnetCniLinuxPluginsURL'),' CNI_PLUGINS_URL=',variables('cniPluginsURL'),' 
CLOUDPROVIDER_BACKOFF=',variables('cloudProviderBackoff'),' CLOUDPROVIDER_BACKOFF_RETRIES=',variables('cloudProviderBackoffRetries'),' CLOUDPROVIDER_BACKOFF_EXPONENT=',variables('cloudProviderBackoffExponent'),' CLOUDPROVIDER_BACKOFF_DURATION=',variables('cloudProviderBackoffDuration'),' CLOUDPROVIDER_BACKOFF_JITTER=',variables('cloudProviderBackoffJitter'),' CLOUDPROVIDER_RATELIMIT=',variables('cloudProviderRatelimit'),' CLOUDPROVIDER_RATELIMIT_QPS=',variables('cloudProviderRatelimitQPS'),' CLOUDPROVIDER_RATELIMIT_BUCKET=',variables('cloudProviderRatelimitBucket'),' USE_MANAGED_IDENTITY_EXTENSION=',variables('useManagedIdentityExtension'),' USE_INSTANCE_METADATA=',variables('useInstanceMetadata'),' CONTAINER_RUNTIME=',variables('containerRuntime'),' KUBECONFIG_SERVER=',variables('kubeconfigServer'))]", {{end}} {{else}} {{if EnableEncryptionWithExternalKms}} {{ if not UseManagedIdentity}} "servicePrincipalObjectId": "[parameters('servicePrincipalObjectId')]", {{end}} - "provisionScriptParametersCommon": "[concat('TENANT_ID=',variables('tenantID'),' HYPERKUBE_URL=',variables('kubernetesHyperkubeSpec'),' APISERVER_PUBLIC_KEY=',variables('apiserverCertificate'),' SUBSCRIPTION_ID=',variables('subscriptionId'),' RESOURCE_GROUP=',variables('resourceGroup'),' LOCATION=',variables('location'),' VM_TYPE=',variables('vmType'),' SUBNET=',variables('subnetName'),' NETWORK_SECURITY_GROUP=',variables('nsgName'),' VIRTUAL_NETWORK=',variables('virtualNetworkName'),' VIRTUAL_NETWORK_RESOURCE_GROUP=',variables('virtualNetworkResourceGroupName'),' ROUTE_TABLE=',variables('routeTableName'),' PRIMARY_AVAILABILITY_SET=',variables('primaryAvailabilitySetName'),' PRIMARY_SCALE_SET=',variables('primaryScaleSetName'),' SERVICE_PRINCIPAL_CLIENT_ID=',variables('servicePrincipalClientId'),' SERVICE_PRINCIPAL_CLIENT_SECRET=',variables('singleQuote'),variables('servicePrincipalClientSecret'),variables('singleQuote'),' KUBELET_PRIVATE_KEY=',variables('clientPrivateKey'),' 
TARGET_ENVIRONMENT=',variables('targetEnvironment'),' NETWORK_PLUGIN=',variables('networkPlugin'),' FQDNSuffix=',variables('fqdnEndpointSuffix'),' VNET_CNI_PLUGINS_URL=',variables('vnetCniLinuxPluginsURL'),' CNI_PLUGINS_URL=',variables('cniPluginsURL'),' CLOUDPROVIDER_BACKOFF=',variables('cloudProviderBackoff'),' CLOUDPROVIDER_BACKOFF_RETRIES=',variables('cloudProviderBackoffRetries'),' CLOUDPROVIDER_BACKOFF_EXPONENT=',variables('cloudProviderBackoffExponent'),' CLOUDPROVIDER_BACKOFF_DURATION=',variables('cloudProviderBackoffDuration'),' CLOUDPROVIDER_BACKOFF_JITTER=',variables('cloudProviderBackoffJitter'),' CLOUDPROVIDER_RATELIMIT=',variables('cloudProviderRatelimit'),' CLOUDPROVIDER_RATELIMIT_QPS=',variables('cloudProviderRatelimitQPS'),' CLOUDPROVIDER_RATELIMIT_BUCKET=',variables('cloudProviderRatelimitBucket'),' USE_MANAGED_IDENTITY_EXTENSION=',variables('useManagedIdentityExtension'),' USE_INSTANCE_METADATA=',variables('useInstanceMetadata'),' CONTAINER_RUNTIME=',variables('containerRuntime'),' KMS_PROVIDER_VAULT_NAME=',variables('clusterKeyVaultName'), ' EnableEncryptionWithExternalKms=true')]", + "provisionScriptParametersCommon": "[concat('ADMINUSER=',variables('username'),' DOCKER_ENGINE_VERSION=',variables('dockerEngineVersion'),' DOCKER_REPO=',variables('dockerEngineDownloadRepo'),' TENANT_ID=',variables('tenantID'),' HYPERKUBE_URL=',variables('kubernetesHyperkubeSpec'),' APISERVER_PUBLIC_KEY=',variables('apiserverCertificate'),' SUBSCRIPTION_ID=',variables('subscriptionId'),' RESOURCE_GROUP=',variables('resourceGroup'),' LOCATION=',variables('location'),' VM_TYPE=',variables('vmType'),' SUBNET=',variables('subnetName'),' NETWORK_SECURITY_GROUP=',variables('nsgName'),' VIRTUAL_NETWORK=',variables('virtualNetworkName'),' VIRTUAL_NETWORK_RESOURCE_GROUP=',variables('virtualNetworkResourceGroupName'),' ROUTE_TABLE=',variables('routeTableName'),' PRIMARY_AVAILABILITY_SET=',variables('primaryAvailabilitySetName'),' 
PRIMARY_SCALE_SET=',variables('primaryScaleSetName'),' SERVICE_PRINCIPAL_CLIENT_ID=',variables('servicePrincipalClientId'),' SERVICE_PRINCIPAL_CLIENT_SECRET=',variables('singleQuote'),variables('servicePrincipalClientSecret'),variables('singleQuote'),' KUBELET_PRIVATE_KEY=',variables('clientPrivateKey'),' TARGET_ENVIRONMENT=',variables('targetEnvironment'),' NETWORK_PLUGIN=',variables('networkPlugin'),' FQDNSuffix=',variables('fqdnEndpointSuffix'),' VNET_CNI_PLUGINS_URL=',variables('vnetCniLinuxPluginsURL'),' CNI_PLUGINS_URL=',variables('cniPluginsURL'),' CLOUDPROVIDER_BACKOFF=',variables('cloudProviderBackoff'),' CLOUDPROVIDER_BACKOFF_RETRIES=',variables('cloudProviderBackoffRetries'),' CLOUDPROVIDER_BACKOFF_EXPONENT=',variables('cloudProviderBackoffExponent'),' CLOUDPROVIDER_BACKOFF_DURATION=',variables('cloudProviderBackoffDuration'),' CLOUDPROVIDER_BACKOFF_JITTER=',variables('cloudProviderBackoffJitter'),' CLOUDPROVIDER_RATELIMIT=',variables('cloudProviderRatelimit'),' CLOUDPROVIDER_RATELIMIT_QPS=',variables('cloudProviderRatelimitQPS'),' CLOUDPROVIDER_RATELIMIT_BUCKET=',variables('cloudProviderRatelimitBucket'),' USE_MANAGED_IDENTITY_EXTENSION=',variables('useManagedIdentityExtension'),' USE_INSTANCE_METADATA=',variables('useInstanceMetadata'),' CONTAINER_RUNTIME=',variables('containerRuntime'),' KMS_PROVIDER_VAULT_NAME=',variables('clusterKeyVaultName'), ' EnableEncryptionWithExternalKms=true')]", {{else}} - "provisionScriptParametersCommon": "[concat('TENANT_ID=',variables('tenantID'),' HYPERKUBE_URL=',variables('kubernetesHyperkubeSpec'),' APISERVER_PUBLIC_KEY=',variables('apiserverCertificate'),' SUBSCRIPTION_ID=',variables('subscriptionId'),' RESOURCE_GROUP=',variables('resourceGroup'),' LOCATION=',variables('location'),' VM_TYPE=',variables('vmType'),' SUBNET=',variables('subnetName'),' NETWORK_SECURITY_GROUP=',variables('nsgName'),' VIRTUAL_NETWORK=',variables('virtualNetworkName'),' 
VIRTUAL_NETWORK_RESOURCE_GROUP=',variables('virtualNetworkResourceGroupName'),' ROUTE_TABLE=',variables('routeTableName'),' PRIMARY_AVAILABILITY_SET=',variables('primaryAvailabilitySetName'),' PRIMARY_SCALE_SET=',variables('primaryScaleSetName'),' SERVICE_PRINCIPAL_CLIENT_ID=',variables('servicePrincipalClientId'),' SERVICE_PRINCIPAL_CLIENT_SECRET=',variables('singleQuote'),variables('servicePrincipalClientSecret'),variables('singleQuote'),' KUBELET_PRIVATE_KEY=',variables('clientPrivateKey'),' TARGET_ENVIRONMENT=',variables('targetEnvironment'),' NETWORK_PLUGIN=',variables('networkPlugin'),' FQDNSuffix=',variables('fqdnEndpointSuffix'),' VNET_CNI_PLUGINS_URL=',variables('vnetCniLinuxPluginsURL'),' CNI_PLUGINS_URL=',variables('cniPluginsURL'),' CLOUDPROVIDER_BACKOFF=',variables('cloudProviderBackoff'),' CLOUDPROVIDER_BACKOFF_RETRIES=',variables('cloudProviderBackoffRetries'),' CLOUDPROVIDER_BACKOFF_EXPONENT=',variables('cloudProviderBackoffExponent'),' CLOUDPROVIDER_BACKOFF_DURATION=',variables('cloudProviderBackoffDuration'),' CLOUDPROVIDER_BACKOFF_JITTER=',variables('cloudProviderBackoffJitter'),' CLOUDPROVIDER_RATELIMIT=',variables('cloudProviderRatelimit'),' CLOUDPROVIDER_RATELIMIT_QPS=',variables('cloudProviderRatelimitQPS'),' CLOUDPROVIDER_RATELIMIT_BUCKET=',variables('cloudProviderRatelimitBucket'),' USE_MANAGED_IDENTITY_EXTENSION=',variables('useManagedIdentityExtension'),' USE_INSTANCE_METADATA=',variables('useInstanceMetadata'),' CONTAINER_RUNTIME=',variables('containerRuntime'))]", + "provisionScriptParametersCommon": "[concat('ADMINUSER=',variables('username'),' DOCKER_ENGINE_VERSION=',variables('dockerEngineVersion'),' DOCKER_REPO=',variables('dockerEngineDownloadRepo'),' TENANT_ID=',variables('tenantID'),' HYPERKUBE_URL=',variables('kubernetesHyperkubeSpec'),' APISERVER_PUBLIC_KEY=',variables('apiserverCertificate'),' SUBSCRIPTION_ID=',variables('subscriptionId'),' RESOURCE_GROUP=',variables('resourceGroup'),' LOCATION=',variables('location'),' 
VM_TYPE=',variables('vmType'),' SUBNET=',variables('subnetName'),' NETWORK_SECURITY_GROUP=',variables('nsgName'),' VIRTUAL_NETWORK=',variables('virtualNetworkName'),' VIRTUAL_NETWORK_RESOURCE_GROUP=',variables('virtualNetworkResourceGroupName'),' ROUTE_TABLE=',variables('routeTableName'),' PRIMARY_AVAILABILITY_SET=',variables('primaryAvailabilitySetName'),' PRIMARY_SCALE_SET=',variables('primaryScaleSetName'),' SERVICE_PRINCIPAL_CLIENT_ID=',variables('servicePrincipalClientId'),' SERVICE_PRINCIPAL_CLIENT_SECRET=',variables('singleQuote'),variables('servicePrincipalClientSecret'),variables('singleQuote'),' KUBELET_PRIVATE_KEY=',variables('clientPrivateKey'),' TARGET_ENVIRONMENT=',variables('targetEnvironment'),' NETWORK_PLUGIN=',variables('networkPlugin'),' FQDNSuffix=',variables('fqdnEndpointSuffix'),' VNET_CNI_PLUGINS_URL=',variables('vnetCniLinuxPluginsURL'),' CNI_PLUGINS_URL=',variables('cniPluginsURL'),' CLOUDPROVIDER_BACKOFF=',variables('cloudProviderBackoff'),' CLOUDPROVIDER_BACKOFF_RETRIES=',variables('cloudProviderBackoffRetries'),' CLOUDPROVIDER_BACKOFF_EXPONENT=',variables('cloudProviderBackoffExponent'),' CLOUDPROVIDER_BACKOFF_DURATION=',variables('cloudProviderBackoffDuration'),' CLOUDPROVIDER_BACKOFF_JITTER=',variables('cloudProviderBackoffJitter'),' CLOUDPROVIDER_RATELIMIT=',variables('cloudProviderRatelimit'),' CLOUDPROVIDER_RATELIMIT_QPS=',variables('cloudProviderRatelimitQPS'),' CLOUDPROVIDER_RATELIMIT_BUCKET=',variables('cloudProviderRatelimitBucket'),' USE_MANAGED_IDENTITY_EXTENSION=',variables('useManagedIdentityExtension'),' USE_INSTANCE_METADATA=',variables('useInstanceMetadata'),' CONTAINER_RUNTIME=',variables('containerRuntime'))]", {{end}} {{end}} {{end}} diff --git a/parts/k8s/kubernetesprovisionsource.sh b/parts/k8s/kubernetesprovisionsource.sh index 980a490df7..0a038cf851 100644 --- a/parts/k8s/kubernetesprovisionsource.sh +++ b/parts/k8s/kubernetesprovisionsource.sh @@ -1,8 +1,94 @@ #!/bin/sh -retrycmd_if_failure() { retries=$1; 
wait=$2; timeout=$3; shift && shift && shift; for i in $(seq 1 $retries); do timeout $timeout ${@}; [ $? -eq 0 ] && break || sleep $wait; done; echo Executed \"$@\" $i times; } -retrycmd_if_failure_no_stats() { retries=$1; wait=$2; timeout=$3; shift && shift && shift; for i in $(seq 1 $retries); do timeout $timeout ${@}; [ $? -eq 0 ] && break || sleep $wait; done; } -retrycmd_get_tarball() { retries=$1; wait=$2; tarball=$3; url=$4; for i in $(seq 1 $retries); do tar -tzf $tarball; [ $? -eq 0 ] && break || retrycmd_if_failure_no_stats $retries 1 10 curl -fsSL $url -o $tarball; sleep $wait; done; } -wait_for_file() { retries=$1; wait=$2; filepath=$3; for i in $(seq 1 $retries); do if [ -f $filepath ]; then break; fi; sleep $wait; done } -apt_get_update() { for i in $(seq 1 100); do apt-get update 2>&1 | grep -x "[WE]:.*"; [ $? -ne 0 ] && break || sleep 1; done; echo Executed apt-get update $i times; } -systemctl_restart() { retries=$1; wait=$2; timeout=$3 svcname=$4; for i in $(seq 1 $retries); do timeout $timeout systemctl daemon-reload && systemctl restart $svcname && systemctl daemon-reload; [ $? -eq 0 ] && break || sleep $wait; done; } \ No newline at end of file +retrycmd_if_failure() { + retries=$1; wait_sleep=$2; timeout=$3; shift && shift && shift + for i in $(seq 1 $retries); do + timeout $timeout ${@} + [ $? -eq 0 ] && break || \ + if [ $i -eq $retries ]; then + echo Executed \"$@\" $i times; + return 1 + else + sleep $wait_sleep + fi + done + echo Executed \"$@\" $i times; +} +retrycmd_if_failure_no_stats() { + retries=$1; wait_sleep=$2; timeout=$3; shift && shift && shift + for i in $(seq 1 $retries); do + timeout $timeout ${@} + [ $? -eq 0 ] && break || \ + if [ $i -eq $retries ]; then + return 1 + else + sleep $wait_sleep + fi + done +} +retrycmd_get_tarball() { + tar_retries=$1; wait_sleep=$2; tarball=$3; url=$4 + echo "${tar_retries} retries" + for i in $(seq 1 $tar_retries); do + tar -tzf $tarball + [ $? 
-eq 0 ] && break || \ + if [ $i -eq $tar_retries ]; then + return 1 + else + retrycmd_if_failure_no_stats 1 1 30 curl -fsSL $url -o $tarball + sleep $wait_sleep + fi + done +} +wait_for_file() { + retries=$1; wait_sleep=$2; filepath=$3 + for i in $(seq 1 $retries); do + if [ -f $filepath ]; then + break + fi + if [ $i -eq $retries ]; then + return 1 + else + sleep $wait_sleep + fi + done +} +apt_get_update() { + retries=10 + for i in $(seq 1 $retries); do + timeout 30 dpkg --configure -a + timeout 120 apt-get update 2>&1 | grep -x "[WE]:.*" + [ $? -ne 0 ] && break || \ + if [ $i -eq $retries ]; then + return 1 + else sleep 30 + fi + done + echo Executed apt-get update $i times +} +apt_get_install() { + retries=$1; wait_sleep=$2; timeout=$3; shift && shift && shift + for i in $(seq 1 $retries); do + timeout 30 dpkg --configure -a + timeout $timeout apt-get install --no-install-recommends -y ${@} + [ $? -eq 0 ] && break || \ + if [ $i -eq $retries ]; then + return 1 + else + sleep $wait_sleep + fi + done + echo Executed apt-get install --no-install-recommends -y \"$@\" $i times; +} +systemctl_restart() { + retries=$1; wait_sleep=$2; timeout=$3 svcname=$4 + for i in $(seq 1 $retries); do + timeout $timeout systemctl daemon-reload && systemctl restart $svcname && systemctl daemon-reload + [ $? 
-eq 0 ] && break || \ + if [ $i -eq $retries ]; then + return 1 + else + sleep $wait_sleep + fi + done +} \ No newline at end of file diff --git a/pkg/acsengine/engine.go b/pkg/acsengine/engine.go index b156d44d3c..26928a41c0 100644 --- a/pkg/acsengine/engine.go +++ b/pkg/acsengine/engine.go @@ -2197,7 +2197,7 @@ func getGPUDriversInstallScript(profile *api.AgentPoolProfile) string { - sh -c "echo \"blacklist nouveau\" >> /etc/modprobe.d/blacklist.conf" - update-initramfs -u - apt_get_update -- retrycmd_if_failure 5 10 120 apt-get install -y linux-headers-$(uname -r) gcc make +- retrycmd_if_failure 5 5 300 apt-get install -y linux-headers-$(uname -r) gcc make - mkdir -p %s - cd %s`, dest, dest) @@ -2207,7 +2207,7 @@ func getGPUDriversInstallScript(profile *api.AgentPoolProfile) string { Instead we use Overlayfs to move the newly installed libraries under /usr/local/nvidia/lib64 */ installScript += fmt.Sprintf(` -- retrycmd_if_failure 5 10 30 curl -fLS https://us.download.nvidia.com/tesla/%s/NVIDIA-Linux-x86_64-%s.run -o nvidia-drivers-%s +- retrycmd_if_failure 5 10 60 curl -fLS https://us.download.nvidia.com/tesla/%s/NVIDIA-Linux-x86_64-%s.run -o nvidia-drivers-%s - mkdir -p lib64 overlay-workdir - mount -t overlay -o lowerdir=/usr/lib/x86_64-linux-gnu,upperdir=lib64,workdir=overlay-workdir none /usr/lib/x86_64-linux-gnu`, dv, dv, dv) @@ -2221,8 +2221,7 @@ func getGPUDriversInstallScript(profile *api.AgentPoolProfile) string { - ldconfig - umount /usr/lib/x86_64-linux-gnu - nvidia-modprobe -u -c0 -- %s/bin/nvidia-smi -- systemctl_restart 10 1 10 kubelet`, dv, dest, dest, fmt.Sprintf("%s/lib64", dest), dest) +- %s/bin/nvidia-smi`, dv, dest, dest, fmt.Sprintf("%s/lib64", dest), dest) /* If a new GPU sku becomes available, add a key to this map, but only provide an installation script if you have a confirmation that we have an agreement with NVIDIA for this specific gpu. Otherwise use the warning message.