Skip to content
This repository has been archived by the owner on Jan 11, 2023. It is now read-only.

add restarts to nvidia drivers download in cloud-init #2316

Merged
merged 10 commits into from
Feb 22, 2018
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions examples/e2e-tests/kubernetes/gpu-enabled/definition.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
{
"apiVersion": "vlabs",
"properties": {
"orchestratorProfile": {
"orchestratorType": "Kubernetes",
"orchestratorRelease": "1.8"
},
"masterProfile": {
"count": 1,
"dnsPrefix": "",
"vmSize": "Standard_D2_v2"
},
"agentPoolProfiles": [
{
"name": "linuxpool1",
"count": 3,
"vmSize": "Standard_NC6",
"availabilityProfile": "AvailabilitySet"
}
],
"linuxProfile": {
"adminUsername": "azureuser",
"ssh": {
"publicKeys": [
{
"keyData": ""
}
]
}
},
"servicePrincipalProfile": {
"clientId": "",
"secret": ""
},
"certificateProfile": {}
}
}
4 changes: 3 additions & 1 deletion parts/k8s/kubernetesagentcustomdata.yml
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,9 @@ runcmd:
- systemctl restart docker
- mkdir -p /etc/kubernetes/manifests
- usermod -aG docker {{WrapAsVariable "username"}}
{{GetGPUDriversInstallScript .}}
{{if IsNSeriesSKU .}}
{{GetGPUDriversInstallScript .}}
{{end}}
- echo `date`,`hostname`, PRE-APT-SYSTEMD-DAILY>>/opt/m
- /usr/lib/apt/apt.systemd.daily
- echo `date`,`hostname`, POST-APT-SYSTEMD-DAILY>>/opt/m
Expand Down
26 changes: 17 additions & 9 deletions pkg/acsengine/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -1248,6 +1248,9 @@ func (t *TemplateGenerator) getTemplateFuncMap(cs *api.ContainerService) templat
}
return false
},
"IsNSeriesSKU": func(profile *api.AgentPoolProfile) bool {
return isNSeriesSKU(profile)
},
"GetGPUDriversInstallScript": func(profile *api.AgentPoolProfile) string {
return getGPUDriversInstallScript(profile)
},
Expand Down Expand Up @@ -1745,6 +1748,10 @@ func getPackageGUID(orchestratorType string, orchestratorVersion string, masterC
return ""
}

// isNSeriesSKU reports whether the agent pool's VM size names an N-series
// SKU. The check is a case-sensitive substring match for "Standard_N"
// anywhere in profile.VMSize.
func isNSeriesSKU(profile *api.AgentPoolProfile) bool {
	const nSeriesMarker = "Standard_N"
	vmSize := profile.VMSize
	return strings.Index(vmSize, nSeriesMarker) >= 0
}

func getGPUDriversInstallScript(profile *api.AgentPoolProfile) string {

// latest version of the drivers. Later this parameter could be bubbled up so that users can choose specific driver versions.
Expand All @@ -1758,7 +1765,8 @@ func getGPUDriversInstallScript(profile *api.AgentPoolProfile) string {
installScript := fmt.Sprintf(`- rmmod nouveau
- sh -c "echo \"blacklist nouveau\" >> /etc/modprobe.d/blacklist.conf"
- update-initramfs -u
- sudo apt-get update && sudo apt-get install -y linux-headers-$(uname -r) gcc make
- apt_get_update
- retrycmd_if_failure 5 10 apt-get install -y linux-headers-$(uname -r) gcc make
- mkdir -p %s
- cd %s`, dest, dest)

Expand All @@ -1768,22 +1776,22 @@ func getGPUDriversInstallScript(profile *api.AgentPoolProfile) string {
Instead we use Overlayfs to move the newly installed libraries under /usr/local/nvidia/lib64
*/
installScript += fmt.Sprintf(`
- curl -fLS https://us.download.nvidia.com/tesla/%s/NVIDIA-Linux-x86_64-%s.run -o nvidia-drivers-%s
- retrycmd_if_failure 5 10 curl -fLS https://us.download.nvidia.com/tesla/%s/NVIDIA-Linux-x86_64-%s.run -o nvidia-drivers-%s
- mkdir -p lib64 overlay-workdir
- sudo mount -t overlay -o lowerdir=/usr/lib/x86_64-linux-gnu,upperdir=lib64,workdir=overlay-workdir none /usr/lib/x86_64-linux-gnu`, dv, dv, dv)
- mount -t overlay -o lowerdir=/usr/lib/x86_64-linux-gnu,upperdir=lib64,workdir=overlay-workdir none /usr/lib/x86_64-linux-gnu`, dv, dv, dv)

/*
Install the drivers and update /etc/ld.so.conf.d/nvidia.conf which will make the libraries discoverable through $LD_LIBRARY_PATH.
Run nvidia-smi to test the installation, unmount overlayfs and restart kubelet (GPUs are only discovered when kubelet starts)
*/
installScript += fmt.Sprintf(`
- sudo sh nvidia-drivers-%s --silent --accept-license --no-drm --utility-prefix="%s" --opengl-prefix="%s"
- sh nvidia-drivers-%s --silent --accept-license --no-drm --utility-prefix="%s" --opengl-prefix="%s"
- echo "%s" > /etc/ld.so.conf.d/nvidia.conf
- sudo ldconfig
- sudo umount /usr/lib/x86_64-linux-gnu
- sudo nvidia-modprobe -u -c0
- sudo %s/bin/nvidia-smi
- sudo systemctl restart kubelet`, dv, dest, dest, fmt.Sprintf("%s/lib64", dest), dest)
- ldconfig
- umount /usr/lib/x86_64-linux-gnu
- nvidia-modprobe -u -c0
- %s/bin/nvidia-smi
- retrycmd_if_failure 5 10 systemctl restart kubelet`, dv, dest, dest, fmt.Sprintf("%s/lib64", dest), dest)

// We don't have an agreement in place with NVIDIA to provide the drivers on every SKU. For these VMs we simply log a warning message.
na := getGPUDriversNotInstalledWarningMessage(profile.VMSize)
Expand Down
181 changes: 181 additions & 0 deletions pkg/acsengine/engine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -371,3 +371,184 @@ func TestGetGPUDriversInstallScript(t *testing.T) {
t.Fatalf("VMSize without GPU should not receive a script, expected empty string, received %v", s)
}
}
func TestIsNSeriesSKU(t *testing.T) {
// VMSize with GPU
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: should simply read "VMSize with GPU" instead. We only have agreement for NC6, NC12, NC24, NV6, NV12 and NV24 so far. This is very minor, but I don't want anyone to get confused on this agreement stuff and distributes drivers without NVIDIA approval by mistake.

validSkus := []string{
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Assuming you got this list from somewhere, I have no idea which are which so I'm gonna trust you on this one

"Standard_NC12",
"Standard_NC12s_v2",
"Standard_NC12s_v3",
"Standard_NC24",
"Standard_NC24r",
"Standard_NC24rs_v2",
"Standard_NC24rs_v3",
"Standard_NC24s_v2",
"Standard_NC24s_v3",
"Standard_NC6",
"Standard_NC6s_v2",
"Standard_NC6s_v3",
"Standard_ND12s",
"Standard_ND24rs",
"Standard_ND24s",
"Standard_ND6s",
"Standard_NV12",
"Standard_NV24",
"Standard_NV6",
}

invalidSkus := []string{
"Standard_A10",
"Standard_A11",
"Standard_A2",
"Standard_A2_v2",
"Standard_A2m_v2",
"Standard_A3",
"Standard_A4",
"Standard_A4_v2",
"Standard_A4m_v2",
"Standard_A5",
"Standard_A6",
"Standard_A7",
"Standard_A8",
"Standard_A8_v2",
"Standard_A8m_v2",
"Standard_A9",
"Standard_B2ms",
"Standard_B4ms",
"Standard_B8ms",
"Standard_D11",
"Standard_D11_v2",
"Standard_D11_v2_Promo",
"Standard_D12",
"Standard_D12_v2",
"Standard_D12_v2_Promo",
"Standard_D13",
"Standard_D13_v2",
"Standard_D13_v2_Promo",
"Standard_D14",
"Standard_D14_v2",
"Standard_D14_v2_Promo",
"Standard_D15_v2",
"Standard_D16_v3",
"Standard_D16s_v3",
"Standard_D2",
"Standard_D2_v2",
"Standard_D2_v2_Promo",
"Standard_D2_v3",
"Standard_D2s_v3",
"Standard_D3",
"Standard_D32_v3",
"Standard_D32s_v3",
"Standard_D3_v2",
"Standard_D3_v2_Promo",
"Standard_D4",
"Standard_D4_v2",
"Standard_D4_v2_Promo",
"Standard_D4_v3",
"Standard_D4s_v3",
"Standard_D5_v2",
"Standard_D5_v2_Promo",
"Standard_D64_v3",
"Standard_D64s_v3",
"Standard_D8_v3",
"Standard_D8s_v3",
"Standard_DS11",
"Standard_DS11_v2",
"Standard_DS11_v2_Promo",
"Standard_DS12",
"Standard_DS12_v2",
"Standard_DS12_v2_Promo",
"Standard_DS13",
"Standard_DS13-2_v2",
"Standard_DS13-4_v2",
"Standard_DS13_v2",
"Standard_DS13_v2_Promo",
"Standard_DS14",
"Standard_DS14-4_v2",
"Standard_DS14-8_v2",
"Standard_DS14_v2",
"Standard_DS14_v2_Promo",
"Standard_DS15_v2",
"Standard_DS3",
"Standard_DS3_v2",
"Standard_DS3_v2_Promo",
"Standard_DS4",
"Standard_DS4_v2",
"Standard_DS4_v2_Promo",
"Standard_DS5_v2",
"Standard_DS5_v2_Promo",
"Standard_E16_v3",
"Standard_E16s_v3",
"Standard_E2_v3",
"Standard_E2s_v3",
"Standard_E32-16s_v3",
"Standard_E32-8s_v3",
"Standard_E32_v3",
"Standard_E32s_v3",
"Standard_E4_v3",
"Standard_E4s_v3",
"Standard_E64-16s_v3",
"Standard_E64-32s_v3",
"Standard_E64_v3",
"Standard_E64s_v3",
"Standard_E8_v3",
"Standard_E8s_v3",
"Standard_F16",
"Standard_F16s",
"Standard_F16s_v2",
"Standard_F2",
"Standard_F2s_v2",
"Standard_F32s_v2",
"Standard_F4",
"Standard_F4s",
"Standard_F4s_v2",
"Standard_F64s_v2",
"Standard_F72s_v2",
"Standard_F8",
"Standard_F8s",
"Standard_F8s_v2",
"Standard_G1",
"Standard_G2",
"Standard_G3",
"Standard_G4",
"Standard_G5",
"Standard_GS1",
"Standard_GS2",
"Standard_GS3",
"Standard_GS4",
"Standard_GS4-4",
"Standard_GS4-8",
"Standard_GS5",
"Standard_GS5-16",
"Standard_GS5-8",
"Standard_H16",
"Standard_H16m",
"Standard_H16mr",
"Standard_H16r",
"Standard_H8",
"Standard_H8m",
"Standard_L16s",
"Standard_L32s",
"Standard_L4s",
"Standard_L8s",
"Standard_M128-32ms",
"Standard_M128-64ms",
"Standard_M128ms",
"Standard_M128s",
"Standard_M64-16ms",
"Standard_M64-32ms",
"Standard_M64ms",
"Standard_M64s",
}

for _, sku := range validSkus {
if !isNSeriesSKU(&api.AgentPoolProfile{VMSize: sku}) {
t.Fatalf("Expected isNSeriesSKU(%s) to be true", sku)
}
}

for _, sku := range invalidSkus {
if isNSeriesSKU(&api.AgentPoolProfile{VMSize: sku}) {
t.Fatalf("Expected isNSeriesSKU(%s) to be false", sku)
}
}
}
11 changes: 11 additions & 0 deletions test/e2e/engine/template.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"io/ioutil"
"log"
"path/filepath"
"strings"

"github.com/Azure/acs-engine/pkg/api"
"github.com/Azure/acs-engine/pkg/api/vlabs"
Expand Down Expand Up @@ -158,6 +159,16 @@ func (e *Engine) HasWindowsAgents() bool {
return false
}

// HasGPUNodes reports whether at least one agent pool in the expanded
// definition uses a VM size whose name contains "Standard_N".
func (e *Engine) HasGPUNodes() bool {
	const gpuSizeMarker = "Standard_N"
	for _, pool := range e.ExpandedDefinition.Properties.AgentPoolProfiles {
		if !strings.Contains(pool.VMSize, gpuSizeMarker) {
			continue
		}
		// One matching pool is enough; no need to inspect the rest.
		return true
	}
	return false
}

// HasAddon will return true if an addon is enabled
func (e *Engine) HasAddon(name string) (bool, api.KubernetesAddon) {
for _, addon := range e.ExpandedDefinition.Properties.OrchestratorProfile.KubernetesConfig.Addons {
Expand Down
Loading