Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Install neuron plugin for inf1 instances #2162

Merged
merged 12 commits into from
May 20, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions integration/matchers/kubernetes.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
package matchers

import (
"fmt"

"github.com/onsi/gomega"
"github.com/onsi/gomega/format"
"github.com/onsi/gomega/types"
apierrors "k8s.io/apimachinery/pkg/api/errors"
)

// BeNotFoundError returns a matcher that succeeds when actual is a non-nil
// error representing a missing Kubernetes resource.
func BeNotFoundError() types.GomegaMatcher {
	return new(notFoundMatcher)
}

// notFoundMatcher is the stateless matcher type handed out by BeNotFoundError.
type notFoundMatcher struct{}

// Match reports whether actual is an error for which apierrors.IsNotFound
// returns true. The outcome of gomega.HaveOccurred().Match is returned as-is
// when that matcher fails or does not match; only a matched error is passed
// on to the NotFound check.
func (matcher *notFoundMatcher) Match(actual interface{}) (success bool, err error) {
	occurred, occurredErr := gomega.HaveOccurred().Match(actual)
	switch {
	case occurredErr != nil, !occurred:
		return occurred, occurredErr
	}
	// HaveOccurred matched, so the assertion to error below is reached only
	// after that check has passed.
	apiErr := actual.(error)
	return apierrors.IsNotFound(apiErr), nil
}

// FailureMessage builds the text reported when a NotFound API error was
// expected; it embeds format.Object's rendering of actual.
func (matcher *notFoundMatcher) FailureMessage(actual interface{}) (message string) {
	rendered := format.Object(actual, 1)
	message = fmt.Sprintf("Expected a NotFound API error to have occurred. Got:\n%s", rendered)
	return message
}

// NegatedFailureMessage builds the text reported when a NotFound API error
// was matched although the assertion required that none occur.
//
// When actual is an error, the message contains format.Object's rendering of
// it followed by its indented Error() text. When actual is nil or not an
// error, the Error() line is omitted instead of panicking on the type
// assertion, so that producing a failure message can never itself crash the
// test run.
func (matcher *notFoundMatcher) NegatedFailureMessage(actual interface{}) (message string) {
	actualErr, isErr := actual.(error)
	if !isErr {
		// Nothing to call Error() on; report only the rendered value.
		return fmt.Sprintf("Unexpected NotFound API error:\n%s\n%s", format.Object(actual, 1), "occurred")
	}
	return fmt.Sprintf("Unexpected NotFound API error:\n%s\n%s\n%s", format.Object(actual, 1), format.IndentString(actualErr.Error(), 1), "occurred")
}
206 changes: 206 additions & 0 deletions integration/tests/inferentia/inferentia_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
// +build integration

package inferentia

import (
"fmt"
"io/ioutil"
"os"
"testing"

. "github.com/weaveworks/eksctl/integration/matchers"
. "github.com/weaveworks/eksctl/integration/runner"
"github.com/weaveworks/eksctl/integration/tests"
api "github.com/weaveworks/eksctl/pkg/apis/eksctl.io/v1alpha5"
"github.com/weaveworks/eksctl/pkg/eks"
"github.com/weaveworks/eksctl/pkg/testutils"
"github.com/weaveworks/eksctl/pkg/utils/file"

. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
"github.com/onsi/gomega/gexec"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// defaultCluster is the cluster name used by the "by default" specs; init
// sets it from params.ClusterName.
var defaultCluster string
// noInstallCluster is the cluster name used by the
// "--install-neuron-plugin=false" specs; init sets it via
// params.NewClusterName("inf1-no-plugin").
var noInstallCluster string
// params holds the shared integration-test parameters created in init.
var params *tests.Params

// init prepares the package-level test state: it creates params and derives
// the two cluster names used by the specs in this file.
func init() {
// Call testing.Init() prior to tests.NewParams(), as otherwise -test.* will not be recognised. See also: https://golang.org/doc/go1.13#testing
testing.Init()
// NOTE(review): "inf1" is presumably a name prefix for generated cluster names — confirm against tests.NewParams.
params = tests.NewParams("inf1")
// The default-install specs use the cluster name that params already carries.
defaultCluster = params.ClusterName
// The no-plugin specs get a second, separately generated cluster name.
noInstallCluster = params.NewClusterName("inf1-no-plugin")
}

// TestSuite is the `go test` entry point for this package; it hands t to
// testutils.RegisterAndRun.
// NOTE(review): RegisterAndRun presumably registers the Gomega fail handler
// and runs the Ginkgo specs — confirm in pkg/testutils.
func TestSuite(t *testing.T) {
testutils.RegisterAndRun(t)
}

// Integration specs for clusters with inf1 (Inferentia) nodes. They check
// that the neuron device plugin DaemonSet is present after a default cluster
// creation, absent when the cluster is created with
// --install-neuron-plugin=false, and present again once a nodegroup is added
// to that second cluster with default settings.
var _ = Describe("(Integration) Inferentia nodes", func() {
	const (
		initNG = "inf1-ng-0"
		newNG  = "inf1-ng-1"

		// Location of the DaemonSet looked up by the plugin specs.
		pluginNamespace = "kube-system"
		pluginDaemonSet = "neuron-device-plugin-daemonset"
	)

	// reuseExistingCluster covers the SkipCreate path for clusterName: it logs
	// that an existing cluster is used and, when no file exists at
	// params.KubeconfigPath, runs `eksctl utils write-kubeconfig` to create it.
	reuseExistingCluster := func(clusterName string) {
		fmt.Fprintf(GinkgoWriter, "will use existing cluster %s", clusterName)
		if file.Exists(params.KubeconfigPath) {
			return
		}
		// Generate the Kubernetes configuration that eksctl create
		// would have generated otherwise:
		cmd := params.EksctlUtilsCmd.WithArgs(
			"write-kubeconfig",
			"--verbose", "4",
			"--cluster", clusterName,
			"--kubeconfig", params.KubeconfigPath,
		)
		// Offset 1 attributes a failure to the calling spec, not this helper.
		ExpectWithOffset(1, cmd).To(RunSuccessfully())
	}

	// getPluginDaemonSet builds a client for clusterName in params.Region and
	// returns the error (nil on success) from fetching the neuron device
	// plugin DaemonSet. Failures while refreshing the cluster status or
	// building the client set fail the calling spec directly.
	getPluginDaemonSet := func(clusterName string) error {
		cfg := &api.ClusterConfig{
			Metadata: &api.ClusterMeta{
				Name:   clusterName,
				Region: params.Region,
			},
		}
		ctl := eks.New(&api.ProviderConfig{Region: params.Region}, cfg)
		err := ctl.RefreshClusterStatus(cfg)
		ExpectWithOffset(1, err).ShouldNot(HaveOccurred())

		clientSet, err := ctl.NewStdClientSet(cfg)
		ExpectWithOffset(1, err).ShouldNot(HaveOccurred())

		_, err = clientSet.AppsV1().DaemonSets(pluginNamespace).Get(pluginDaemonSet, metav1.GetOptions{})
		return err
	}

	BeforeSuite(func() {
		params.KubeconfigTemp = false
		if params.KubeconfigPath != "" {
			return
		}

		// No kubeconfig path was supplied: reserve a temporary file in the
		// working directory and remember that AfterSuite must remove it.
		wd, err := os.Getwd()
		Expect(err).ShouldNot(HaveOccurred())
		f, err := ioutil.TempFile(wd, "kubeconfig-")
		Expect(err).ShouldNot(HaveOccurred())
		// Only the path is used from here on, so release the descriptor
		// instead of holding it open for the whole suite.
		Expect(f.Close()).ShouldNot(HaveOccurred())

		params.KubeconfigPath = f.Name()
		params.KubeconfigTemp = true
	})

	AfterSuite(func() {
		params.DeleteClusters()
		gexec.KillAndWait()
		// File cleanup is best-effort; removal errors are deliberately not
		// checked so that they cannot mask the outcome of the specs.
		if params.KubeconfigTemp {
			os.Remove(params.KubeconfigPath)
		}
		os.RemoveAll(params.TestDirectory)
	})

	When("creating a cluster with inf1 nodes", func() {
		Context("by default", func() {
			It("should not return an error", func() {
				if params.SkipCreate {
					reuseExistingCluster(defaultCluster)
					return
				}

				fmt.Fprintf(GinkgoWriter, "Using kubeconfig: %s\n", params.KubeconfigPath)

				cmd := params.EksctlCreateCmd.WithArgs(
					"cluster",
					"--verbose", "4",
					"--name", defaultCluster,
					"--tags", "alpha.eksctl.io/description=eksctl integration test",
					"--nodegroup-name", initNG,
					"--node-labels", "ng-name="+initNG,
					"--nodes", "1",
					"--node-type", "inf1.xlarge",
					"--version", params.Version,
					"--kubeconfig", params.KubeconfigPath,
				)
				Expect(cmd).To(RunSuccessfully())
			})

			It("should have installed the neuron device plugin", func() {
				Expect(getPluginDaemonSet(defaultCluster)).ShouldNot(HaveOccurred())
			})
		})

		Context("with --install-neuron-plugin=false", func() {
			It("should not return an error", func() {
				if params.SkipCreate {
					reuseExistingCluster(noInstallCluster)
					return
				}

				fmt.Fprintf(GinkgoWriter, "Using kubeconfig: %s\n", params.KubeconfigPath)

				cmd := params.EksctlCreateCmd.WithArgs(
					"cluster",
					"--verbose", "4",
					"--name", noInstallCluster,
					"--tags", "alpha.eksctl.io/description=eksctl integration test",
					"--install-neuron-plugin=false",
					"--nodegroup-name", initNG,
					"--node-labels", "ng-name="+initNG,
					"--nodes", "1",
					"--node-type", "inf1.xlarge",
					"--version", params.Version,
					"--kubeconfig", params.KubeconfigPath,
				)
				Expect(cmd).To(RunSuccessfully())
			})

			It("should not have installed the neuron device plugin", func() {
				Expect(getPluginDaemonSet(noInstallCluster)).Should(BeNotFoundError())
			})

			When("adding a nodegroup by default", func() {
				It("should install without error", func() {
					// --nodes is passed once; the original argument list
					// repeated the same flag and value.
					cmd := params.EksctlCreateCmd.WithArgs(
						"nodegroup",
						"--cluster", noInstallCluster,
						"--verbose", "4",
						"--name", newNG,
						"--tags", "alpha.eksctl.io/description=eksctl integration test",
						"--node-labels", "ng-name="+newNG,
						"--nodes", "1",
						"--node-type", "inf1.xlarge",
						"--version", params.Version,
					)
					Expect(cmd).To(RunSuccessfully())
				})

				It("should install the neuron device plugin", func() {
					Expect(getPluginDaemonSet(noInstallCluster)).ShouldNot(HaveOccurred())
				})
			})
		})
	})
})
23 changes: 23 additions & 0 deletions pkg/addons/assets.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

45 changes: 45 additions & 0 deletions pkg/addons/assets/neuron-device-plugin.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# DaemonSet manifest for the neuron device plugin, created in the kube-system
# namespace under the name neuron-device-plugin-daemonset.
# https://kubernetes.io/docs/concepts/extend-kubernetes/compute-storage-net/device-plugins/
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: neuron-device-plugin-daemonset
namespace: kube-system
spec:
# The selector below must keep matching the pod template's "name" label.
selector:
matchLabels:
name: neuron-device-plugin-ds
updateStrategy:
type: RollingUpdate
template:
metadata:
annotations:
scheduler.alpha.kubernetes.io/critical-pod: ""
labels:
name: neuron-device-plugin-ds
spec:
# Tolerate the CriticalAddonsOnly taint and any NoSchedule taint keyed
# aws.amazon.com/neuron.
tolerations:
- key: CriticalAddonsOnly
operator: Exists
- key: aws.amazon.com/neuron
operator: Exists
effect: NoSchedule
# Mark this pod as a critical add-on; when enabled, the critical add-on
# scheduler reserves resources for critical add-on pods so that they can
# be rescheduled after a failure.
# See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
priorityClassName: "system-node-critical"
containers:
# NOTE(review): the three %s placeholders are presumably substituted
# (registry account, region, domain suffix) before this manifest is
# applied — confirm against the code that loads this asset.
- image: "%s.dkr.ecr.%s.%s/neuron-device-plugin:1.0.8550.0"
imagePullPolicy: IfNotPresent
name: k8s-neuron-device-plugin-ctr
# The container may not escalate privileges and drops all capabilities.
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
# The host's /var/lib/kubelet/device-plugins directory is mounted at the
# same path inside the container through the hostPath volume below.
volumeMounts:
- name: device-plugin
mountPath: /var/lib/kubelet/device-plugins
volumes:
- name: device-plugin
hostPath:
path: /var/lib/kubelet/device-plugins
Loading