This repository has been archived by the owner on Jan 11, 2023. It is now read-only.

E2E test - 50 nodes #2260

Merged
12 commits merged on Feb 14, 2018
39 changes: 39 additions & 0 deletions examples/e2e-tests/kubernetes/node-count/50-nodes/definition.json
@@ -0,0 +1,39 @@
{
"apiVersion": "vlabs",
"properties": {
"orchestratorProfile": {
"orchestratorType": "Kubernetes",
"orchestratorRelease": "1.8"
},
"masterProfile": {
"count": 5,
"dnsPrefix": "",
"vmSize": "Standard_D2_v2",
"OSDiskSizeGB": 200
},
"agentPoolProfiles": [
{
"name": "agentpool1",
"count": 50,
Contributor

In the docs (https://github.com/Azure/acs-engine/blob/master/docs/kubernetes-large-clusters.md) we recommend smaller pools (e.g., a count of 20) over larger pools (e.g., a count of 100): produce your desired total node count with many pools rather than as few as possible. Would it make sense to change this to several smaller pools, since people are likely to look at the example files and copy them for their own implementations?
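
For illustration only (this pool split is not part of the PR, and the names and counts are arbitrary), the same 50-node total could instead be expressed as several smaller pools:

"agentPoolProfiles": [
  {
    "name": "agentpool1",
    "count": 17,
    "vmSize": "Standard_D2_v2",
    "osType": "Linux",
    "storageProfile": "ManagedDisks",
    "availabilityProfile": "AvailabilitySet"
  },
  {
    "name": "agentpool2",
    "count": 17,
    "vmSize": "Standard_D2_v2",
    "osType": "Linux",
    "storageProfile": "ManagedDisks",
    "availabilityProfile": "AvailabilitySet"
  },
  {
    "name": "agentpool3",
    "count": 16,
    "vmSize": "Standard_D2_v2",
    "osType": "Linux",
    "storageProfile": "ManagedDisks",
    "availabilityProfile": "AvailabilitySet"
  }
]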

Member Author

@amanohar @khenidak Is the above guidance still valid? (i.e., don't use node pools > 20 count)

Contributor

I admit this answer is not as clear as I would love it to be.

On one hand, 50 nodes spread across 4+ pools will hit the ARM template limit (800 objects). On the other, I really don't see a relationship between the number of nodes per pool and scale; as long as we don't put more than 100 nodes in a single availability set (or scale set) we are fine.

Many pools are better because users get extra load balancers (extra public IPs) and a smaller blast radius in case of failures. They are, however, harder to manage.

"vmSize": "Standard_D2_v2",
"osType": "Linux",
"storageProfile": "ManagedDisks",
"availabilityProfile": "AvailabilitySet"
}
],
"linuxProfile": {
"adminUsername": "azureuser",
"ssh": {
"publicKeys": [
{
"keyData": ""
}
]
}
},
"servicePrincipalProfile": {
"clientId": "",
"secret": ""
}
}
}
6 changes: 3 additions & 3 deletions test/e2e/engine/template.go
@@ -169,13 +169,13 @@ func (e *Engine) HasDashboard() bool {
}

// HasTiller will return true if tiller addon is enabled
func (e *Engine) HasTiller() bool {
func (e *Engine) HasTiller() (bool, api.KubernetesAddon) {
for _, addon := range e.ExpandedDefinition.Properties.OrchestratorProfile.KubernetesConfig.Addons {
if addon.Name == "tiller" {
return *addon.Enabled
return *addon.Enabled, addon
}
}
return false
return false, api.KubernetesAddon{}
}

// HasACIConnector will return true if aci-connector addon is enabled
25 changes: 11 additions & 14 deletions test/e2e/kubernetes/kubernetes_test.go
@@ -125,23 +125,20 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu
})

It("should have tiller running", func() {
if eng.HasTiller() {
if hasTiller, tillerAddon := eng.HasTiller(); hasTiller {
running, err := pod.WaitOnReady("tiller", "kube-system", 3, 30*time.Second, cfg.Timeout)
Expect(err).NotTo(HaveOccurred())
Expect(running).To(Equal(true))
} else {
Skip("tiller disabled for this cluster, will not test")
}
})

It("should have a tiller max-history of 5", func() {
if eng.HasTiller() {
pods, err := pod.GetAllByPrefix("tiller-deploy", "kube-system")
Expect(err).NotTo(HaveOccurred())
// There is only one tiller pod and one container in that pod.
actualTillerMaxHistory, err := pods[0].Spec.Containers[0].GetEnvironmentVariable("TILLER_HISTORY_MAX")
Expect(err).NotTo(HaveOccurred())
Expect(actualTillerMaxHistory).To(Equal("5"))
if tillerAddon.Config != nil {
Contributor

Would this ever be nil? What happens if it is?

Member Author

The current default configuration always enforces a non-nil addon config:

https://github.com/Azure/acs-engine/blob/master/pkg/acsengine/defaults.go#L752

So the check above is just good hygiene.
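
As a side note on the nil question: even without the guard, reading from a nil map in Go returns the zero value rather than panicking, so the check mainly keeps the assertion meaningful. A minimal, self-contained illustration (not part of the PR):

package main

import "fmt"

func main() {
	// Analogous to an addon whose Config was never populated.
	var config map[string]string // nil map

	// Reading from a nil map is safe and yields the zero value ("" here);
	// only writing to a nil map panics.
	fmt.Printf("%q\n", config["max-history"]) // prints ""

	// Without the nil check, the test would compare TILLER_HISTORY_MAX
	// against "" instead of skipping the assertion.
}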

By("Ensuring that the correct max-history has been applied")
maxHistory := tillerAddon.Config["max-history"]
pods, err := pod.GetAllByPrefix("tiller-deploy", "kube-system")
Expect(err).NotTo(HaveOccurred())
// There is only one tiller pod and one container in that pod.
actualTillerMaxHistory, err := pods[0].Spec.Containers[0].GetEnvironmentVariable("TILLER_HISTORY_MAX")
Expect(err).NotTo(HaveOccurred())
Expect(actualTillerMaxHistory).To(Equal(maxHistory))
}
} else {
Skip("tiller disabled for this cluster, will not test")
}
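
For context, the max-history value the test now reads comes from the tiller addon's config map in the cluster definition. A sketch of what that might look like in the api model (field names assumed from the vlabs addon schema, not taken from this PR):

"orchestratorProfile": {
  "orchestratorType": "Kubernetes",
  "kubernetesConfig": {
    "addons": [
      {
        "name": "tiller",
        "enabled": true,
        "config": {
          "max-history": "5"
        }
      }
    ]
  }
}
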
6 changes: 4 additions & 2 deletions test/e2e/remote/ssh.go
@@ -30,7 +30,8 @@ type Connection struct {
func NewConnection(host, port, user, keyPath string) (*Connection, error) {
conn, err := net.Dial("unix", os.Getenv("SSH_AUTH_SOCK"))
if err != nil {
log.Fatal(err)
log.Printf("unable to establish net connection $SSH_AUTH_SOCK has value %s\n", os.Getenv("SSH_AUTH_SOCK"))
return nil, err
}
defer conn.Close()
ag := agent.NewClient(conn)
@@ -51,7 +52,8 @@ func NewConnection(host, port, user, keyPath string) (*Connection, error) {
ag.Add(addKey)
signers, err := ag.Signers()
if err != nil {
log.Fatal(err)
log.Println("unable to add key to agent")
return nil, err
}
auths := []ssh.AuthMethod{ssh.PublicKeys(signers...)}

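
The switch from log.Fatal to returning the error matters because log.Fatal calls os.Exit and would kill the entire test process. A hypothetical caller (illustrative only; only remote.NewConnection's signature is taken from this diff) can now handle the failure itself:

package runner

import (
	"fmt"

	"github.com/Azure/acs-engine/test/e2e/remote"
)

// fetchLogs is a hypothetical helper showing error handling at the call site.
func fetchLogs(host, user, keyPath string) error {
	conn, err := remote.NewConnection(host, "22", user, keyPath)
	if err != nil {
		// Retry or report the failure instead of the whole process exiting,
		// as log.Fatal would have done inside NewConnection.
		return fmt.Errorf("unable to open SSH connection to %s: %v", host, err)
	}
	_ = conn // copy files, run commands, etc.
	return nil
}
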
38 changes: 22 additions & 16 deletions test/e2e/runner/cli_provisioner.go
@@ -32,6 +32,8 @@ type CLIProvisioner struct {
Point *metrics.Point
ResourceGroups []string
Engine *engine.Engine
Masters []azure.VM
Agents []azure.VM
}

// BuildCLIProvisioner will return a ProvisionerConfig object which is used to run a provision
@@ -155,6 +157,22 @@ func (cli *CLIProvisioner) provision() error {
return fmt.Errorf("Error while trying to create deployment:%s", err)
}

// Store the hosts for future introspection
hosts, err := cli.Account.GetHosts(cli.Config.Name)
if err != nil {
return err
}
var masters, agents []azure.VM
for _, host := range hosts {
if strings.Contains(host.Name, "master") {
masters = append(masters, host)
} else if strings.Contains(host.Name, "agent") {
agents = append(agents, host)
}
}
cli.Masters = masters
cli.Agents = agents

return nil
}
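
Caching the hosts on the struct means later steps can reuse them without another round trip to Azure. A hypothetical helper (not in this PR) illustrates the pattern:

// masterNames is illustrative only; azure.VM's Name field is used exactly as
// in the classification loop above.
func (cli *CLIProvisioner) masterNames() []string {
	names := make([]string, 0, len(cli.Masters))
	for _, m := range cli.Masters {
		names = append(names, m.Name)
	}
	return names
}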

@@ -205,18 +223,6 @@ func (cli *CLIProvisioner) waitForNodes() error {

// FetchProvisioningMetrics gets provisioning files from all hosts in a cluster
func (cli *CLIProvisioner) FetchProvisioningMetrics(path string, cfg *config.Config, acct *azure.Account) error {
var masters, agents []string
hosts, err := acct.GetHosts("")
if err != nil {
return err
}
for _, host := range hosts {
if strings.Contains(host.Name, "master") {
masters = append(masters, host.Name)
} else if strings.Contains(host.Name, "agent") {
agents = append(agents, host.Name)
}
}
agentFiles := []string{"/var/log/azure/cluster-provision.log", "/var/log/cloud-init.log",
"/var/log/cloud-init-output.log", "/var/log/syslog", "/var/log/azure/custom-script/handler.log",
"/opt/m", "/opt/azure/containers/kubelet.sh", "/opt/azure/containers/provision.sh",
@@ -228,18 +234,18 @@
if err != nil {
return err
}
for _, master := range masters {
for _, master := range cli.Masters {
for _, fp := range masterFiles {
err := conn.CopyRemote(master, fp)
err := conn.CopyRemote(master.Name, fp)
if err != nil {
log.Printf("Error reading file from path (%s):%s", path, err)
}
}
}

for _, agent := range agents {
for _, agent := range cli.Agents {
for _, fp := range agentFiles {
err := conn.CopyRemote(agent, fp)
err := conn.CopyRemote(agent.Name, fp)
if err != nil {
log.Printf("Error reading file from path (%s):%s", path, err)
}