Skip to content

Commit

Permalink
fix(k8s): make sure Service Endpoints are ready at end of status checks
Browse files Browse the repository at this point in the history
This step is put in to give the cluster a moment to update its network
routing. For example, when a Deployment passes its health check,
Kubernetes doesn't instantly route Service traffic to it (particularly if
the master is under high load). We need to account for this so that
dependent tasks, tests and services can reliably run after the resource
status check resolves.
  • Loading branch information
edvald committed Jul 9, 2019
1 parent fbe36c4 commit 4678f40
Show file tree
Hide file tree
Showing 3 changed files with 82 additions and 5 deletions.
4 changes: 2 additions & 2 deletions garden-service/src/plugins/kubernetes/status/pod.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@ import chalk from "chalk"

export const podLogLines = 20

export async function checkPodStatus(
export function checkPodStatus(
resource: KubernetesServerResource, pods: KubernetesServerResource<V1Pod>[],
): Promise<ResourceStatus> {
): ResourceStatus {
for (const pod of pods) {
// TODO: detect unhealthy state (currently we just time out)
const ready = some(pod.status!.conditions!.map(c => c.type === "ready"))
Expand Down
70 changes: 70 additions & 0 deletions garden-service/src/plugins/kubernetes/status/service.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
/*
* Copyright (C) 2018 Garden Technologies, Inc. <[email protected]>
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/

import * as Bluebird from "bluebird"
import { flatten } from "lodash"
import { KubeApi } from "../api"
import { KubernetesServerResource } from "../types"
import { TimeoutError } from "../../../exceptions"
import { getPods } from "../util"
import { sleep } from "../../../util/util"
import { LogEntry } from "../../../logger/log-entry"
import { checkPodStatus } from "./pod"

// Maximum time, in milliseconds, to wait for Service Endpoints to resolve before giving up.
// There's something strange going on if this takes more than 10 seconds to resolve
const timeout = 10000

/**
 * Wait until the Endpoints of every Service in `resources` route to all of that Service's ready Pods.
 *
 * Only resources with `apiVersion: v1` and `kind: Service` are considered, and Services without a selector
 * are skipped. For each Service, the Pods matching its selector are fetched once, and the Service's Endpoints
 * are then polled once per second until every ready Pod is listed as an address.
 *
 * Note: This assumes that the Service and Pod/workload statuses have previously been cleared as ready.
 *
 * @param api        Kubernetes API client used to look up Pods and read the Endpoints
 * @param log        log entry that is updated with a warning while waiting
 * @param namespace  namespace to use for Services that don't specify one in their metadata
 * @param resources  the resources to check (non-Service resources are ignored)
 * @throws TimeoutError if the Endpoints haven't resolved within `timeout` ms of this function being called
 */
export async function waitForServiceEndpoints(
  api: KubeApi, log: LogEntry, namespace: string, resources: KubernetesServerResource[],
) {
  const services = resources.filter(r => r.apiVersion === "v1" && r.kind === "Service")
  // The timeout is shared between all Services, since they are checked concurrently.
  const start = new Date().getTime()

  return Bluebird.map(services, async (service) => {
    const selector = service.spec && service.spec.selector

    if (!selector) {
      // Services without a selector have manually managed Endpoints, so there are no Pods to match against.
      return
    }

    const serviceName = service.metadata.name
    const serviceNamespace = service.metadata.namespace || namespace

    const pods = await getPods(api, serviceNamespace, selector)
    const readyPodNames = pods
      .filter(p => checkPodStatus(p, [p]).state === "ready")
      .map(p => p.metadata.name)

    while (true) {
      const endpoints = await api.core.readNamespacedEndpoints(serviceName, serviceNamespace)

      // `subsets` is omitted entirely while no address is available, so it must not be assumed to exist.
      const addresses = flatten((endpoints.subsets || []).map(subset => subset.addresses || []))

      // Collect the names into a Set, because the same Pod may be listed in more than one subset
      // and addresses are not guaranteed to carry a targetRef.
      const routedPodNames = new Set<string>()
      for (const { targetRef } of addresses) {
        if (targetRef && targetRef.kind === "Pod" && targetRef.name) {
          routedPodNames.add(targetRef.name)
        }
      }

      if (readyPodNames.every(name => routedPodNames.has(name))) {
        // All endpoints routing nicely!
        break
      }

      if (new Date().getTime() - start > timeout) {
        throw new TimeoutError(
          `Timed out waiting for Service '${serviceName}' Endpoints to resolve to correct Pods`,
          { service, pods },
        )
      }

      log.setState({ symbol: "warning", msg: `Waiting for Service '${serviceName}' Endpoints to resolve...` })
      await sleep(1000)
    }
  })
}
13 changes: 10 additions & 3 deletions garden-service/src/plugins/kubernetes/status/status.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import dedent = require("dedent")
import { getPods } from "../util"
import { checkWorkloadStatus } from "./workload"
import { checkPodStatus } from "./pod"
import { waitForServiceEndpoints } from "./service"

export interface ResourceStatus {
state: ServiceState
Expand Down Expand Up @@ -156,7 +157,7 @@ interface WaitParams {
/**
* Wait until the rollout is complete for each of the given Kubernetes objects
*/
export async function waitForResources({ ctx, provider, serviceName, resources: objects, log }: WaitParams) {
export async function waitForResources({ ctx, provider, serviceName, resources, log }: WaitParams) {
let loops = 0
let lastMessage: string | undefined
const startTime = new Date().getTime()
Expand All @@ -171,10 +172,10 @@ export async function waitForResources({ ctx, provider, serviceName, resources:
const namespace = await getAppNamespace(ctx, log, provider)

while (true) {
await sleep(2000 + 1000 * loops)
await sleep(2000 + 500 * loops)
loops += 1

const statuses = await checkResourceStatuses(api, namespace, objects, log)
const statuses = await checkResourceStatuses(api, namespace, resources, log)

for (const status of statuses) {
if (status.state === "unhealthy") {
Expand All @@ -201,6 +202,12 @@ export async function waitForResources({ ctx, provider, serviceName, resources:
}

if (combineStates(statuses.map(s => s.state)) === "ready") {
// If applicable, wait until Services properly point to each Pod in the resource list.
// This step is put in to give the cluster a moment to update its network routing.
// For example, when a Deployment passes its health check, Kubernetes doesn't instantly route Service traffic
// to it. We need to account for this so that dependent tasks, tests and services can reliably run after this
// routine resolves.
await waitForServiceEndpoints(api, statusLine, namespace, resources)
break
}

Expand Down

0 comments on commit 4678f40

Please sign in to comment.