Skip to content
This repository has been archived by the owner on Jan 11, 2023. It is now read-only.

Commit

Permalink
add restarts to nvidia drivers download in cloud-init (#2316)
Browse files Browse the repository at this point in the history
* add restarts to nvidia drivers download

and only create cloud-init string if necessary

* add tests

* add v1.8 gpu-enabled api model for e2e testing

* trying Standard_NC6

* e2e

* lint

* updated comment

* bad match string, less freq checks, - unused func

* more general success determination, typo

* more typo
  • Loading branch information
jackfrancis authored Feb 22, 2018
1 parent 47f2a4f commit 65668e0
Show file tree
Hide file tree
Showing 8 changed files with 465 additions and 10 deletions.
37 changes: 37 additions & 0 deletions examples/e2e-tests/kubernetes/gpu-enabled/definition.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
{
"apiVersion": "vlabs",
"properties": {
"orchestratorProfile": {
"orchestratorType": "Kubernetes",
"orchestratorRelease": "1.8"
},
"masterProfile": {
"count": 1,
"dnsPrefix": "",
"vmSize": "Standard_D2_v2"
},
"agentPoolProfiles": [
{
"name": "linuxpool1",
"count": 3,
"vmSize": "Standard_NC6",
"availabilityProfile": "AvailabilitySet"
}
],
"linuxProfile": {
"adminUsername": "azureuser",
"ssh": {
"publicKeys": [
{
"keyData": ""
}
]
}
},
"servicePrincipalProfile": {
"clientId": "",
"secret": ""
},
"certificateProfile": {}
}
}
4 changes: 3 additions & 1 deletion parts/k8s/kubernetesagentcustomdata.yml
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,9 @@ runcmd:
- systemctl restart docker
- mkdir -p /etc/kubernetes/manifests
- usermod -aG docker {{WrapAsVariable "username"}}
{{GetGPUDriversInstallScript .}}
{{if IsNSeriesSKU .}}
{{GetGPUDriversInstallScript .}}
{{end}}
- echo `date`,`hostname`, PRE-APT-SYSTEMD-DAILY>>/opt/m
- /usr/lib/apt/apt.systemd.daily
- echo `date`,`hostname`, POST-APT-SYSTEMD-DAILY>>/opt/m
Expand Down
26 changes: 17 additions & 9 deletions pkg/acsengine/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -1190,6 +1190,9 @@ func (t *TemplateGenerator) getTemplateFuncMap(cs *api.ContainerService) templat
}
return false
},
"IsNSeriesSKU": func(profile *api.AgentPoolProfile) bool {
return isNSeriesSKU(profile)
},
"GetGPUDriversInstallScript": func(profile *api.AgentPoolProfile) string {
return getGPUDriversInstallScript(profile)
},
Expand Down Expand Up @@ -1687,6 +1690,10 @@ func getPackageGUID(orchestratorType string, orchestratorVersion string, masterC
return ""
}

// isNSeriesSKU reports whether the VMSize of the given agent pool profile
// contains the "Standard_N" marker. Callers use it to decide whether the GPU
// driver installation steps should be rendered for the pool.
func isNSeriesSKU(profile *api.AgentPoolProfile) bool {
	const nSeriesMarker = "Standard_N"
	vmSize := profile.VMSize
	return strings.Contains(vmSize, nSeriesMarker)
}

func getGPUDriversInstallScript(profile *api.AgentPoolProfile) string {

// latest version of the drivers. Later this parameter could be bubbled up so that users can choose specific driver versions.
Expand All @@ -1700,7 +1707,8 @@ func getGPUDriversInstallScript(profile *api.AgentPoolProfile) string {
installScript := fmt.Sprintf(`- rmmod nouveau
- sh -c "echo \"blacklist nouveau\" >> /etc/modprobe.d/blacklist.conf"
- update-initramfs -u
- sudo apt-get update && sudo apt-get install -y linux-headers-$(uname -r) gcc make
- apt_get_update
- retrycmd_if_failure 5 10 apt-get install -y linux-headers-$(uname -r) gcc make
- mkdir -p %s
- cd %s`, dest, dest)

Expand All @@ -1710,22 +1718,22 @@ func getGPUDriversInstallScript(profile *api.AgentPoolProfile) string {
Instead we use Overlayfs to move the newly installed libraries under /usr/local/nvidia/lib64
*/
installScript += fmt.Sprintf(`
- curl -fLS https://us.download.nvidia.com/tesla/%s/NVIDIA-Linux-x86_64-%s.run -o nvidia-drivers-%s
- retrycmd_if_failure 5 10 curl -fLS https://us.download.nvidia.com/tesla/%s/NVIDIA-Linux-x86_64-%s.run -o nvidia-drivers-%s
- mkdir -p lib64 overlay-workdir
- sudo mount -t overlay -o lowerdir=/usr/lib/x86_64-linux-gnu,upperdir=lib64,workdir=overlay-workdir none /usr/lib/x86_64-linux-gnu`, dv, dv, dv)
- mount -t overlay -o lowerdir=/usr/lib/x86_64-linux-gnu,upperdir=lib64,workdir=overlay-workdir none /usr/lib/x86_64-linux-gnu`, dv, dv, dv)

/*
Install the drivers and update /etc/ld.so.conf.d/nvidia.conf which will make the libraries discoverable through $LD_LIBRARY_PATH.
Run nvidia-smi to test the installation, unmount overlayfs and restart kubelet (GPUs are only discovered when kubelet starts)
*/
installScript += fmt.Sprintf(`
- sudo sh nvidia-drivers-%s --silent --accept-license --no-drm --utility-prefix="%s" --opengl-prefix="%s"
- sh nvidia-drivers-%s --silent --accept-license --no-drm --utility-prefix="%s" --opengl-prefix="%s"
- echo "%s" > /etc/ld.so.conf.d/nvidia.conf
- sudo ldconfig
- sudo umount /usr/lib/x86_64-linux-gnu
- sudo nvidia-modprobe -u -c0
- sudo %s/bin/nvidia-smi
- sudo systemctl restart kubelet`, dv, dest, dest, fmt.Sprintf("%s/lib64", dest), dest)
- ldconfig
- umount /usr/lib/x86_64-linux-gnu
- nvidia-modprobe -u -c0
- %s/bin/nvidia-smi
- retrycmd_if_failure 5 10 systemctl restart kubelet`, dv, dest, dest, fmt.Sprintf("%s/lib64", dest), dest)

// We don't have an agreement in place with NVIDIA to provide the drivers on every sku. For these VMs we simply log a warning message.
na := getGPUDriversNotInstalledWarningMessage(profile.VMSize)
Expand Down
181 changes: 181 additions & 0 deletions pkg/acsengine/engine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -371,3 +371,184 @@ func TestGetGPUDriversInstallScript(t *testing.T) {
t.Fatalf("VMSize without GPU should not receive a script, expected empty string, received %v", s)
}
}
func TestIsNSeriesSKU(t *testing.T) {
// VMSize with GPU
validSkus := []string{
"Standard_NC12",
"Standard_NC12s_v2",
"Standard_NC12s_v3",
"Standard_NC24",
"Standard_NC24r",
"Standard_NC24rs_v2",
"Standard_NC24rs_v3",
"Standard_NC24s_v2",
"Standard_NC24s_v3",
"Standard_NC6",
"Standard_NC6s_v2",
"Standard_NC6s_v3",
"Standard_ND12s",
"Standard_ND24rs",
"Standard_ND24s",
"Standard_ND6s",
"Standard_NV12",
"Standard_NV24",
"Standard_NV6",
}

invalidSkus := []string{
"Standard_A10",
"Standard_A11",
"Standard_A2",
"Standard_A2_v2",
"Standard_A2m_v2",
"Standard_A3",
"Standard_A4",
"Standard_A4_v2",
"Standard_A4m_v2",
"Standard_A5",
"Standard_A6",
"Standard_A7",
"Standard_A8",
"Standard_A8_v2",
"Standard_A8m_v2",
"Standard_A9",
"Standard_B2ms",
"Standard_B4ms",
"Standard_B8ms",
"Standard_D11",
"Standard_D11_v2",
"Standard_D11_v2_Promo",
"Standard_D12",
"Standard_D12_v2",
"Standard_D12_v2_Promo",
"Standard_D13",
"Standard_D13_v2",
"Standard_D13_v2_Promo",
"Standard_D14",
"Standard_D14_v2",
"Standard_D14_v2_Promo",
"Standard_D15_v2",
"Standard_D16_v3",
"Standard_D16s_v3",
"Standard_D2",
"Standard_D2_v2",
"Standard_D2_v2_Promo",
"Standard_D2_v3",
"Standard_D2s_v3",
"Standard_D3",
"Standard_D32_v3",
"Standard_D32s_v3",
"Standard_D3_v2",
"Standard_D3_v2_Promo",
"Standard_D4",
"Standard_D4_v2",
"Standard_D4_v2_Promo",
"Standard_D4_v3",
"Standard_D4s_v3",
"Standard_D5_v2",
"Standard_D5_v2_Promo",
"Standard_D64_v3",
"Standard_D64s_v3",
"Standard_D8_v3",
"Standard_D8s_v3",
"Standard_DS11",
"Standard_DS11_v2",
"Standard_DS11_v2_Promo",
"Standard_DS12",
"Standard_DS12_v2",
"Standard_DS12_v2_Promo",
"Standard_DS13",
"Standard_DS13-2_v2",
"Standard_DS13-4_v2",
"Standard_DS13_v2",
"Standard_DS13_v2_Promo",
"Standard_DS14",
"Standard_DS14-4_v2",
"Standard_DS14-8_v2",
"Standard_DS14_v2",
"Standard_DS14_v2_Promo",
"Standard_DS15_v2",
"Standard_DS3",
"Standard_DS3_v2",
"Standard_DS3_v2_Promo",
"Standard_DS4",
"Standard_DS4_v2",
"Standard_DS4_v2_Promo",
"Standard_DS5_v2",
"Standard_DS5_v2_Promo",
"Standard_E16_v3",
"Standard_E16s_v3",
"Standard_E2_v3",
"Standard_E2s_v3",
"Standard_E32-16s_v3",
"Standard_E32-8s_v3",
"Standard_E32_v3",
"Standard_E32s_v3",
"Standard_E4_v3",
"Standard_E4s_v3",
"Standard_E64-16s_v3",
"Standard_E64-32s_v3",
"Standard_E64_v3",
"Standard_E64s_v3",
"Standard_E8_v3",
"Standard_E8s_v3",
"Standard_F16",
"Standard_F16s",
"Standard_F16s_v2",
"Standard_F2",
"Standard_F2s_v2",
"Standard_F32s_v2",
"Standard_F4",
"Standard_F4s",
"Standard_F4s_v2",
"Standard_F64s_v2",
"Standard_F72s_v2",
"Standard_F8",
"Standard_F8s",
"Standard_F8s_v2",
"Standard_G1",
"Standard_G2",
"Standard_G3",
"Standard_G4",
"Standard_G5",
"Standard_GS1",
"Standard_GS2",
"Standard_GS3",
"Standard_GS4",
"Standard_GS4-4",
"Standard_GS4-8",
"Standard_GS5",
"Standard_GS5-16",
"Standard_GS5-8",
"Standard_H16",
"Standard_H16m",
"Standard_H16mr",
"Standard_H16r",
"Standard_H8",
"Standard_H8m",
"Standard_L16s",
"Standard_L32s",
"Standard_L4s",
"Standard_L8s",
"Standard_M128-32ms",
"Standard_M128-64ms",
"Standard_M128ms",
"Standard_M128s",
"Standard_M64-16ms",
"Standard_M64-32ms",
"Standard_M64ms",
"Standard_M64s",
}

for _, sku := range validSkus {
if !isNSeriesSKU(&api.AgentPoolProfile{VMSize: sku}) {
t.Fatalf("Expected isNSeriesSKU(%s) to be true", sku)
}
}

for _, sku := range invalidSkus {
if isNSeriesSKU(&api.AgentPoolProfile{VMSize: sku}) {
t.Fatalf("Expected isNSeriesSKU(%s) to be false", sku)
}
}
}
11 changes: 11 additions & 0 deletions test/e2e/engine/template.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"io/ioutil"
"log"
"path/filepath"
"strings"

"github.com/Azure/acs-engine/pkg/api"
"github.com/Azure/acs-engine/pkg/api/vlabs"
Expand Down Expand Up @@ -158,6 +159,16 @@ func (e *Engine) HasWindowsAgents() bool {
return false
}

// HasGPUNodes reports whether at least one agent pool profile in the expanded
// definition has a VMSize containing "Standard_N"; it returns false when no
// pool matches.
func (e *Engine) HasGPUNodes() bool {
	pools := e.ExpandedDefinition.Properties.AgentPoolProfiles
	for _, pool := range pools {
		if !strings.Contains(pool.VMSize, "Standard_N") {
			continue
		}
		return true
	}
	return false
}

// HasAddon will return true if an addon is enabled
func (e *Engine) HasAddon(name string) (bool, api.KubernetesAddon) {
for _, addon := range e.ExpandedDefinition.Properties.OrchestratorProfile.KubernetesConfig.Addons {
Expand Down
Loading

0 comments on commit 65668e0

Please sign in to comment.