From 65668e0b327e64079cb283e36f8f3bb73a83bca7 Mon Sep 17 00:00:00 2001 From: Jack Francis Date: Thu, 22 Feb 2018 10:42:54 -0800 Subject: [PATCH] add restarts to nvidia drivers download in cloud-init (#2316) * add restarts to nvidia drivers download and only create cloud-init string if necessary * add tests * add v1.8 gpu-enabled api model for e2e testing * trying Standard_NC6 * e2e * lint * updated comment * bad match string, less freq checks, - unused func * more general success determination, typo * more typo --- .../kubernetes/gpu-enabled/definition.json | 37 ++++ parts/k8s/kubernetesagentcustomdata.yml | 4 +- pkg/acsengine/engine.go | 26 ++- pkg/acsengine/engine_test.go | 181 ++++++++++++++++++ test/e2e/engine/template.go | 11 ++ test/e2e/kubernetes/job/job.go | 173 +++++++++++++++++ test/e2e/kubernetes/kubernetes_test.go | 18 ++ test/e2e/kubernetes/workloads/nvidia-smi.yaml | 25 +++ 8 files changed, 465 insertions(+), 10 deletions(-) create mode 100644 examples/e2e-tests/kubernetes/gpu-enabled/definition.json create mode 100644 test/e2e/kubernetes/job/job.go create mode 100644 test/e2e/kubernetes/workloads/nvidia-smi.yaml diff --git a/examples/e2e-tests/kubernetes/gpu-enabled/definition.json b/examples/e2e-tests/kubernetes/gpu-enabled/definition.json new file mode 100644 index 0000000000..14b7594f12 --- /dev/null +++ b/examples/e2e-tests/kubernetes/gpu-enabled/definition.json @@ -0,0 +1,37 @@ +{ + "apiVersion": "vlabs", + "properties": { + "orchestratorProfile": { + "orchestratorType": "Kubernetes", + "orchestratorRelease": "1.8" + }, + "masterProfile": { + "count": 1, + "dnsPrefix": "", + "vmSize": "Standard_D2_v2" + }, + "agentPoolProfiles": [ + { + "name": "linuxpool1", + "count": 3, + "vmSize": "Standard_NC6", + "availabilityProfile": "AvailabilitySet" + } + ], + "linuxProfile": { + "adminUsername": "azureuser", + "ssh": { + "publicKeys": [ + { + "keyData": "" + } + ] + } + }, + "servicePrincipalProfile": { + "clientId": "", + "secret": "" + }, + "certificateProfile": {} + } + } \ No newline at end of file diff --git a/parts/k8s/kubernetesagentcustomdata.yml b/parts/k8s/kubernetesagentcustomdata.yml index 379f016f84..e8af42f35e 100644 --- a/parts/k8s/kubernetesagentcustomdata.yml +++ b/parts/k8s/kubernetesagentcustomdata.yml @@ -197,7 +197,9 @@ runcmd: - systemctl restart docker - mkdir -p /etc/kubernetes/manifests - usermod -aG docker {{WrapAsVariable "username"}} -{{GetGPUDriversInstallScript .}} +{{if IsNSeriesSKU .}} + {{GetGPUDriversInstallScript .}} +{{end}} - echo `date`,`hostname`, PRE-APT-SYSTEMD-DAILY>>/opt/m - /usr/lib/apt/apt.systemd.daily - echo `date`,`hostname`, POST-APT-SYSTEMD-DAILY>>/opt/m diff --git a/pkg/acsengine/engine.go b/pkg/acsengine/engine.go index 1c6eaade37..6bb5cc5d4f 100644 --- a/pkg/acsengine/engine.go +++ b/pkg/acsengine/engine.go @@ -1190,6 +1190,9 @@ func (t *TemplateGenerator) getTemplateFuncMap(cs *api.ContainerService) templat } return false }, + "IsNSeriesSKU": func(profile *api.AgentPoolProfile) bool { + return isNSeriesSKU(profile) + }, "GetGPUDriversInstallScript": func(profile *api.AgentPoolProfile) string { return getGPUDriversInstallScript(profile) }, @@ -1687,6 +1690,10 @@ func getPackageGUID(orchestratorType string, orchestratorVersion string, masterC return "" } +func isNSeriesSKU(profile *api.AgentPoolProfile) bool { + return strings.Contains(profile.VMSize, "Standard_N") +} + func getGPUDriversInstallScript(profile *api.AgentPoolProfile) string { // latest version of the drivers. Later this parameter could be bubbled up so that users can choose specific driver versions. @@ -1700,7 +1707,8 @@ func getGPUDriversInstallScript(profile *api.AgentPoolProfile) string { installScript := fmt.Sprintf(`- rmmod nouveau - sh -c "echo \"blacklist nouveau\" >> /etc/modprobe.d/blacklist.conf" - update-initramfs -u -- sudo apt-get update && sudo apt-get install -y linux-headers-$(uname -r) gcc make +- apt_get_update +- retrycmd_if_failure 5 10 apt-get install -y linux-headers-$(uname -r) gcc make - mkdir -p %s - cd %s`, dest, dest) @@ -1710,22 +1718,22 @@ func getGPUDriversInstallScript(profile *api.AgentPoolProfile) string { Instead we use Overlayfs to move the newly installed libraries under /usr/local/nvidia/lib64 */ installScript += fmt.Sprintf(` -- curl -fLS https://us.download.nvidia.com/tesla/%s/NVIDIA-Linux-x86_64-%s.run -o nvidia-drivers-%s +- retrycmd_if_failure 5 10 curl -fLS https://us.download.nvidia.com/tesla/%s/NVIDIA-Linux-x86_64-%s.run -o nvidia-drivers-%s - mkdir -p lib64 overlay-workdir -- sudo mount -t overlay -o lowerdir=/usr/lib/x86_64-linux-gnu,upperdir=lib64,workdir=overlay-workdir none /usr/lib/x86_64-linux-gnu`, dv, dv, dv) +- mount -t overlay -o lowerdir=/usr/lib/x86_64-linux-gnu,upperdir=lib64,workdir=overlay-workdir none /usr/lib/x86_64-linux-gnu`, dv, dv, dv) /* Install the drivers and update /etc/ld.so.conf.d/nvidia.conf which will make the libraries discoverable through $LD_LIBRARY_PATH. Run nvidia-smi to test the installation, unmount overlayfs and restard kubelet (GPUs are only discovered when kubelet starts) */ installScript += fmt.Sprintf(` -- sudo sh nvidia-drivers-%s --silent --accept-license --no-drm --utility-prefix="%s" --opengl-prefix="%s" +- sh nvidia-drivers-%s --silent --accept-license --no-drm --utility-prefix="%s" --opengl-prefix="%s" - echo "%s" > /etc/ld.so.conf.d/nvidia.conf -- sudo ldconfig -- sudo umount /usr/lib/x86_64-linux-gnu -- sudo nvidia-modprobe -u -c0 -- sudo %s/bin/nvidia-smi -- sudo systemctl restart kubelet`, dv, dest, dest, fmt.Sprintf("%s/lib64", dest), dest) +- ldconfig +- umount /usr/lib/x86_64-linux-gnu +- nvidia-modprobe -u -c0 +- %s/bin/nvidia-smi +- retrycmd_if_failure 5 10 systemctl restart kubelet`, dv, dest, dest, fmt.Sprintf("%s/lib64", dest), dest) // We don't have an agreement in place with NVIDIA to provide the drivers on every sku. For this VMs we simply log a warning message. na := getGPUDriversNotInstalledWarningMessage(profile.VMSize) diff --git a/pkg/acsengine/engine_test.go b/pkg/acsengine/engine_test.go index b91fa6af5e..ebeddb5cbd 100644 --- a/pkg/acsengine/engine_test.go +++ b/pkg/acsengine/engine_test.go @@ -371,3 +371,184 @@ func TestGetGPUDriversInstallScript(t *testing.T) { t.Fatalf("VMSize without GPU should not receive a script, expected empty string, received %v", s) } } +func TestIsNSeriesSKU(t *testing.T) { + // VMSize with GPU + validSkus := []string{ + "Standard_NC12", + "Standard_NC12s_v2", + "Standard_NC12s_v3", + "Standard_NC24", + "Standard_NC24r", + "Standard_NC24rs_v2", + "Standard_NC24rs_v3", + "Standard_NC24s_v2", + "Standard_NC24s_v3", + "Standard_NC6", + "Standard_NC6s_v2", + "Standard_NC6s_v3", + "Standard_ND12s", + "Standard_ND24rs", + "Standard_ND24s", + "Standard_ND6s", + "Standard_NV12", + "Standard_NV24", + "Standard_NV6", + } + + invalidSkus := []string{ + "Standard_A10", + "Standard_A11", + "Standard_A2", + "Standard_A2_v2", + "Standard_A2m_v2", + "Standard_A3", + "Standard_A4", + "Standard_A4_v2", + "Standard_A4m_v2", + "Standard_A5", + "Standard_A6", + "Standard_A7", + "Standard_A8", + "Standard_A8_v2", + "Standard_A8m_v2", + "Standard_A9", + "Standard_B2ms", + "Standard_B4ms", + "Standard_B8ms", + "Standard_D11", + "Standard_D11_v2", + "Standard_D11_v2_Promo", + "Standard_D12", + "Standard_D12_v2", + "Standard_D12_v2_Promo", + "Standard_D13", + "Standard_D13_v2", + "Standard_D13_v2_Promo", + "Standard_D14", + "Standard_D14_v2", + "Standard_D14_v2_Promo", + "Standard_D15_v2", + "Standard_D16_v3", + "Standard_D16s_v3", + "Standard_D2", + "Standard_D2_v2", + "Standard_D2_v2_Promo", + "Standard_D2_v3", + "Standard_D2s_v3", + "Standard_D3", + "Standard_D32_v3", + "Standard_D32s_v3", + "Standard_D3_v2", + "Standard_D3_v2_Promo", + "Standard_D4", + "Standard_D4_v2", + "Standard_D4_v2_Promo", + "Standard_D4_v3", + "Standard_D4s_v3", + "Standard_D5_v2", + "Standard_D5_v2_Promo", + "Standard_D64_v3", + "Standard_D64s_v3", + "Standard_D8_v3", + "Standard_D8s_v3", + "Standard_DS11", + "Standard_DS11_v2", + "Standard_DS11_v2_Promo", + "Standard_DS12", + "Standard_DS12_v2", + "Standard_DS12_v2_Promo", + "Standard_DS13", + "Standard_DS13-2_v2", + "Standard_DS13-4_v2", + "Standard_DS13_v2", + "Standard_DS13_v2_Promo", + "Standard_DS14", + "Standard_DS14-4_v2", + "Standard_DS14-8_v2", + "Standard_DS14_v2", + "Standard_DS14_v2_Promo", + "Standard_DS15_v2", + "Standard_DS3", + "Standard_DS3_v2", + "Standard_DS3_v2_Promo", + "Standard_DS4", + "Standard_DS4_v2", + "Standard_DS4_v2_Promo", + "Standard_DS5_v2", + "Standard_DS5_v2_Promo", + "Standard_E16_v3", + "Standard_E16s_v3", + "Standard_E2_v3", + "Standard_E2s_v3", + "Standard_E32-16s_v3", + "Standard_E32-8s_v3", + "Standard_E32_v3", + "Standard_E32s_v3", + "Standard_E4_v3", + "Standard_E4s_v3", + "Standard_E64-16s_v3", + "Standard_E64-32s_v3", + "Standard_E64_v3", + "Standard_E64s_v3", + "Standard_E8_v3", + "Standard_E8s_v3", + "Standard_F16", + "Standard_F16s", + "Standard_F16s_v2", + "Standard_F2", + "Standard_F2s_v2", + "Standard_F32s_v2", + "Standard_F4", + "Standard_F4s", + "Standard_F4s_v2", + "Standard_F64s_v2", + "Standard_F72s_v2", + "Standard_F8", + "Standard_F8s", + "Standard_F8s_v2", + "Standard_G1", + "Standard_G2", + "Standard_G3", + "Standard_G4", + "Standard_G5", + "Standard_GS1", + "Standard_GS2", + "Standard_GS3", + "Standard_GS4", + "Standard_GS4-4", + "Standard_GS4-8", + "Standard_GS5", + "Standard_GS5-16", + "Standard_GS5-8", + "Standard_H16", + "Standard_H16m", + "Standard_H16mr", + "Standard_H16r", + "Standard_H8", + "Standard_H8m", + "Standard_L16s", + "Standard_L32s", + "Standard_L4s", + "Standard_L8s", + "Standard_M128-32ms", + "Standard_M128-64ms", + "Standard_M128ms", + "Standard_M128s", + "Standard_M64-16ms", + "Standard_M64-32ms", + "Standard_M64ms", + "Standard_M64s", + } + + for _, sku := range validSkus { + if !isNSeriesSKU(&api.AgentPoolProfile{VMSize: sku}) { + t.Fatalf("Expected isNSeriesSKU(%s) to be true", sku) + } + } + + for _, sku := range invalidSkus { + if isNSeriesSKU(&api.AgentPoolProfile{VMSize: sku}) { + t.Fatalf("Expected isNSeriesSKU(%s) to be false", sku) + } + } +} diff --git a/test/e2e/engine/template.go b/test/e2e/engine/template.go index 237ea8ad87..0e13576f78 100644 --- a/test/e2e/engine/template.go +++ b/test/e2e/engine/template.go @@ -6,6 +6,7 @@ import ( "io/ioutil" "log" "path/filepath" + "strings" "github.com/Azure/acs-engine/pkg/api" "github.com/Azure/acs-engine/pkg/api/vlabs" @@ -158,6 +159,16 @@ func (e *Engine) HasWindowsAgents() bool { return false } +// HasGPUNodes will return true if the VM SKU is GPU-enabled +func (e *Engine) HasGPUNodes() bool { + for _, ap := range e.ExpandedDefinition.Properties.AgentPoolProfiles { + if strings.Contains(ap.VMSize, "Standard_N") { + return true + } + } + return false +} + // HasAddon will return true if an addon is enabled func (e *Engine) HasAddon(name string) (bool, api.KubernetesAddon) { for _, addon := range e.ExpandedDefinition.Properties.OrchestratorProfile.KubernetesConfig.Addons { diff --git a/test/e2e/kubernetes/job/job.go b/test/e2e/kubernetes/job/job.go new file mode 100644 index 0000000000..26af9d4f4c --- /dev/null +++ b/test/e2e/kubernetes/job/job.go @@ -0,0 +1,173 @@ +package job + +import ( + "context" + "encoding/json" + "fmt" + "log" + "os/exec" + "regexp" + "time" + + "github.com/Azure/acs-engine/test/e2e/kubernetes/pod" + "github.com/Azure/acs-engine/test/e2e/kubernetes/util" +) + +// List is a container that holds all jobs returned from doing a kubectl get jobs +type List struct { + Jobs []Job `json:"items"` +} + +// Job is used to parse data from kubectl get jobs +type Job struct { + Metadata pod.Metadata `json:"metadata"` + Spec Spec `json:"spec"` + Status Status `json:"status"` +} + +// Spec holds job spec metadata +type Spec struct { + Completions int `json:"completions"` + Parallelism int `json:"parallelism"` +} + +// Status holds job status information +type Status struct { + Active int `json:"active"` + Succeeded int `json:"succeeded"` +} + +// CreateJobFromFile will create a Job from file with a name +func CreateJobFromFile(filename, name, namespace string) (*Job, error) { + cmd := exec.Command("kubectl", "create", "-f", filename) + util.PrintCommand(cmd) + out, err := cmd.CombinedOutput() + if err != nil { + log.Printf("Error trying to create Job %s:%s\n", name, string(out)) + return nil, err + } + job, err := Get(name, namespace) + if err != nil { + log.Printf("Error while trying to fetch Job %s:%s\n", name, err) + return nil, err + } + return job, nil +} + +// GetAll will return all jobs in a given namespace +func GetAll(namespace string) (*List, error) { + cmd := exec.Command("kubectl", "get", "jobs", "-n", namespace, "-o", "json") + util.PrintCommand(cmd) + out, err := cmd.CombinedOutput() + if err != nil { + return nil, err + } + jl := List{} + err = json.Unmarshal(out, &jl) + if err != nil { + log.Printf("Error unmarshalling jobs json:%s\n", err) + return nil, err + } + return &jl, nil +} + +// Get will return a job with a given name and namespace +func Get(jobName, namespace string) (*Job, error) { + cmd := exec.Command("kubectl", "get", "jobs", jobName, "-n", namespace, "-o", "json") + util.PrintCommand(cmd) + out, err := cmd.CombinedOutput() + if err != nil { + return nil, err + } + j := Job{} + err = json.Unmarshal(out, &j) + if err != nil { + log.Printf("Error unmarshalling jobs json:%s\n", err) + return nil, err + } + return &j, nil +} + +// AreAllJobsCompleted will return true if all jobs with a common prefix in a given namespace are in a Completed State +func AreAllJobsCompleted(jobPrefix, namespace string) (bool, error) { + jl, err := GetAll(namespace) + if err != nil { + return false, err + } + + var status []bool + for _, job := range jl.Jobs { + matched, err := regexp.MatchString(jobPrefix, job.Metadata.Name) + if err != nil { + log.Printf("Error trying to match job name:%s\n", err) + return false, err + } + if matched { + if job.Status.Active > 0 { + status = append(status, false) + } else if job.Status.Succeeded == job.Spec.Completions { + status = append(status, true) + } + } + } + + if len(status) == 0 { + return false, nil + } + + for _, s := range status { + if s == false { + return false, nil + } + } + + return true, nil +} + +// WaitOnReady is used when you dont have a handle on a job but want to wait until its in a Succeeded state. +func WaitOnReady(jobPrefix, namespace string, sleep, duration time.Duration) (bool, error) { + readyCh := make(chan bool, 1) + errCh := make(chan error) + ctx, cancel := context.WithTimeout(context.Background(), duration) + defer cancel() + go func() { + for { + select { + case <-ctx.Done(): + errCh <- fmt.Errorf("Timeout exceeded (%s) while waiting for Jobs (%s) to complete in namespace (%s)", duration.String(), jobPrefix, namespace) + default: + ready, _ := AreAllJobsCompleted(jobPrefix, namespace) + if ready == true { + readyCh <- true + } else { + time.Sleep(sleep) + } + } + } + }() + for { + select { + case err := <-errCh: + return false, err + case ready := <-readyCh: + return ready, nil + } + } +} + +// WaitOnReady will call the static method WaitOnReady passing in p.Metadata.Name and p.Metadata.Namespace +func (j *Job) WaitOnReady(sleep, duration time.Duration) (bool, error) { + return WaitOnReady(j.Metadata.Name, j.Metadata.Namespace, sleep, duration) +} + +// Delete will delete a Job in a given namespace +func (j *Job) Delete() error { + cmd := exec.Command("kubectl", "delete", "job", "-n", j.Metadata.Namespace, j.Metadata.Name) + util.PrintCommand(cmd) + out, err := cmd.CombinedOutput() + if err != nil { + log.Printf("Error while trying to delete Job %s in namespace %s:%s\n", j.Metadata.Namespace, j.Metadata.Name, string(out)) + return err + } + return nil +} diff --git a/test/e2e/kubernetes/kubernetes_test.go b/test/e2e/kubernetes/kubernetes_test.go index dae0efaa4d..845a927b18 100644 --- a/test/e2e/kubernetes/kubernetes_test.go +++ b/test/e2e/kubernetes/kubernetes_test.go @@ -14,6 +14,7 @@ import ( "github.com/Azure/acs-engine/test/e2e/config" "github.com/Azure/acs-engine/test/e2e/engine" "github.com/Azure/acs-engine/test/e2e/kubernetes/deployment" + "github.com/Azure/acs-engine/test/e2e/kubernetes/job" "github.com/Azure/acs-engine/test/e2e/kubernetes/node" "github.com/Azure/acs-engine/test/e2e/kubernetes/pod" "github.com/Azure/acs-engine/test/e2e/kubernetes/service" @@ -378,6 +379,23 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu }) }) + Describe("with a GPU-enabled agent pool", func() { + It("should be able to run a nvidia-gpu job", func() { + if eng.HasGPUNodes() { + j, err := job.CreateJobFromFile(filepath.Join(WorkloadDir, "nvidia-smi.yaml"), "nvidia-smi", "default") + Expect(err).NotTo(HaveOccurred()) + ready, err := j.WaitOnReady(30*time.Second, cfg.Timeout) + delErr := j.Delete() + if delErr != nil { + fmt.Printf("could not delete job %s\n", j.Metadata.Name) + fmt.Println(delErr) + } + Expect(err).NotTo(HaveOccurred()) + Expect(ready).To(Equal(true)) + } + }) + }) + Describe("with a windows agent pool", func() { // TODO stabilize this test /*It("should be able to deploy an iis webserver", func() { diff --git a/test/e2e/kubernetes/workloads/nvidia-smi.yaml b/test/e2e/kubernetes/workloads/nvidia-smi.yaml new file mode 100644 index 0000000000..579e13b350 --- /dev/null +++ b/test/e2e/kubernetes/workloads/nvidia-smi.yaml @@ -0,0 +1,25 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: nvidia-smi +spec: + template: + metadata: + name: nvidia-smi + spec: + restartPolicy: Never + containers: + - name: nvidia-smi + image: nvidia/cuda + command: + - nvidia-smi + resources: + limits: + alpha.kubernetes.io/nvidia-gpu: 1 + volumeMounts: + - name: nvidia + mountPath: /usr/local/nvidia/ + volumes: + - name: nvidia + hostPath: + path: /usr/local/nvidia \ No newline at end of file