Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Client driver health checks for Docker #3856

Merged
merged 24 commits into from
Mar 21, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
1570972
add concept of health checks to fingerprinters and nodes
chelseakomlo Jan 25, 2018
a340bad
allow nomad to schedule based on the status of a client driver health…
chelseakomlo Feb 22, 2018
9ec5a93
fix scheduler driver name; create node structs file
chelseakomlo Feb 27, 2018
7c34605
fix up gofmt
chelseakomlo Feb 28, 2018
9dfb5c6
go style; update comments
chelseakomlo Feb 28, 2018
70bebd1
fix up scheduler mocks
chelseakomlo Mar 1, 2018
fd25db9
updating comments; locking concurrent node access
chelseakomlo Mar 1, 2018
865b7e0
fix up racy tests
chelseakomlo Mar 3, 2018
240fee4
fix up codereview feedback
chelseakomlo Mar 6, 2018
521fbd4
refresh driver information for non-health checking drivers periodically
chelseakomlo Mar 7, 2018
8a0ed4e
improve tests
chelseakomlo Mar 7, 2018
3ad03d9
notes from walk through
dadgar Mar 7, 2018
8aefd29
Code review feedback
chelseakomlo Mar 9, 2018
115d54c
fix up health check logic comparison; add node events to client drive…
chelseakomlo Mar 14, 2018
8597da0
simplify logic
chelseakomlo Mar 15, 2018
7b2ed01
remove unused function
chelseakomlo Mar 15, 2018
06a306e
improve comments; update watchDriver
chelseakomlo Mar 19, 2018
cba0a4d
function rename and re-arrange functions in fingerprint_manager
chelseakomlo Mar 19, 2018
9c14330
fix issue when updating node events
chelseakomlo Mar 20, 2018
ffe9292
Only run health check if driver is detected
dadgar Mar 20, 2018
b59bea9
Docker driver doesn't return errors but injects into the DriverInfo
dadgar Mar 20, 2018
127b2c6
set driver to unhealthy once if it cannot be detected in periodic check
chelseakomlo Mar 21, 2018
eb3a53e
always set initial health status for every driver
chelseakomlo Mar 21, 2018
89ffc96
fix up scheduling test
chelseakomlo Mar 21, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions api/nodes.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"fmt"
"sort"
"strconv"
"time"
)

// Nodes is used to query node-related API endpoints
Expand Down Expand Up @@ -96,6 +97,15 @@ func (n *Nodes) GcAlloc(allocID string, q *QueryOptions) error {
return err
}

// DriverInfo is used to deserialize a DriverInfo entry
type DriverInfo struct {
Attributes map[string]string
Detected bool
Healthy bool
HealthDescription string
UpdateTime time.Time
}

// Node is used to deserialize a node entry.
type Node struct {
ID string
Expand All @@ -114,6 +124,7 @@ type Node struct {
StatusDescription string
StatusUpdatedAt int64
Events []*NodeEvent
Drivers map[string]*DriverInfo
CreateIndex uint64
ModifyIndex uint64
}
Expand Down
83 changes: 81 additions & 2 deletions client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,8 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulServic
}

fingerprintManager := NewFingerprintManager(c.GetConfig, c.config.Node,
c.shutdownCh, c.updateNodeFromFingerprint, c.logger)
c.shutdownCh, c.updateNodeFromFingerprint, c.updateNodeFromDriver,
c.logger)

// Fingerprint the node and scan for drivers
if err := fingerprintManager.Run(); err != nil {
Expand Down Expand Up @@ -894,6 +895,9 @@ func (c *Client) setupNode() error {
if node.Links == nil {
node.Links = make(map[string]string)
}
if node.Drivers == nil {
node.Drivers = make(map[string]*structs.DriverInfo)
}
if node.Meta == nil {
node.Meta = make(map[string]string)
}
Expand Down Expand Up @@ -1006,6 +1010,81 @@ func (c *Client) updateNodeFromFingerprint(response *cstructs.FingerprintRespons
if nodeHasChanged {
c.updateNode()
}

return c.config.Node
}

// updateNodeFromDriver receives either a fingerprint of the driver or its
// health and merges this into a single DriverInfo object
func (c *Client) updateNodeFromDriver(name string, fingerprint, health *structs.DriverInfo) *structs.Node {
c.configLock.Lock()
defer c.configLock.Unlock()

var hasChanged bool

if fingerprint != nil {
if c.config.Node.Drivers[name] == nil {
// If the driver info has not yet been set, do that here
hasChanged = true
c.config.Node.Drivers[name] = fingerprint
} else {
// The driver info has already been set, fix it up
if c.config.Node.Drivers[name].Detected != fingerprint.Detected {
hasChanged = true
c.config.Node.Drivers[name].Detected = fingerprint.Detected
}

for attrName, newVal := range fingerprint.Attributes {
oldVal := c.config.Node.Drivers[name].Attributes[attrName]
if oldVal == newVal {
continue
}

hasChanged = true
if newVal == "" {
delete(c.config.Node.Attributes, attrName)
} else {
c.config.Node.Attributes[attrName] = newVal
}
}
}
}

if health != nil {
if c.config.Node.Drivers[name] == nil {
hasChanged = true
c.config.Node.Drivers[name] = health
} else {
oldVal := c.config.Node.Drivers[name]
if health.HealthCheckEquals(oldVal) {
// Make sure we accurately reflect the last time a health check has been
// performed for the driver.
oldVal.UpdateTime = health.UpdateTime
} else {
hasChanged = true

// Only emit an event if the health status has changed, not if we are
// simply updating a node on startup
if health.Healthy != oldVal.Healthy && oldVal.HealthDescription != "" {
event := &structs.NodeEvent{
Subsystem: "Driver",
Message: health.HealthDescription,
Timestamp: time.Now().Unix(),
}
c.triggerNodeEvent(event)
}

// Update the node with the latest information
c.config.Node.Drivers[name].MergeHealthCheck(health)
}
}
}

if hasChanged {
c.config.Node.Drivers[name].UpdateTime = time.Now()
c.updateNode()
}

return c.config.Node
}

Expand Down Expand Up @@ -1190,7 +1269,7 @@ func (c *Client) watchNodeEvents() {
timer.Reset(c.retryIntv(nodeUpdateRetryIntv))
} else {
// Reset the events since we successfully sent them.
batchEvents = nil
batchEvents = []*structs.NodeEvent{}
}
case <-c.shutdownCh:
return
Expand Down
76 changes: 56 additions & 20 deletions client/client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -142,27 +142,63 @@ func TestClient_Fingerprint_Periodic(t *testing.T) {
node := c1.config.Node
mockDriverName := "driver.mock_driver"

// Ensure the mock driver is registered on the client
testutil.WaitForResult(func() (bool, error) {
mockDriverStatus := node.Attributes[mockDriverName]
if mockDriverStatus == "" {
return false, fmt.Errorf("mock driver attribute should be set on the client")
}
return true, nil
}, func(err error) {
t.Fatalf("err: %v", err)
})
{
// Ensure the mock driver is registered on the client
testutil.WaitForResult(func() (bool, error) {
c1.configLock.Lock()
defer c1.configLock.Unlock()
mockDriverStatus := node.Attributes[mockDriverName]
mockDriverInfo := node.Drivers["mock_driver"]
if mockDriverStatus == "" {
return false, fmt.Errorf("mock driver attribute should be set on the client")
}

// Ensure that the client fingerprinter eventually removes this attribute
testutil.WaitForResult(func() (bool, error) {
mockDriverStatus := node.Attributes[mockDriverName]
if mockDriverStatus != "" {
return false, fmt.Errorf("mock driver attribute should not be set on the client")
}
return true, nil
}, func(err error) {
t.Fatalf("err: %v", err)
})
// assert that the Driver information for the node is also set correctly
if mockDriverInfo == nil {
return false, fmt.Errorf("mock driver is nil when it should be set on node Drivers")
}
if !mockDriverInfo.Detected {
return false, fmt.Errorf("mock driver should be set as detected")
}
if !mockDriverInfo.Healthy {
return false, fmt.Errorf("mock driver should be set as healthy")
}
if mockDriverInfo.HealthDescription == "" {
return false, fmt.Errorf("mock driver description should not be empty")
}
return true, nil
}, func(err error) {
t.Fatalf("err: %v", err)
})
}

{
testutil.WaitForResult(func() (bool, error) {
c1.configLock.Lock()
defer c1.configLock.Unlock()
mockDriverStatus := node.Attributes[mockDriverName]
mockDriverInfo := node.Drivers["mock_driver"]
if mockDriverStatus != "" {
return false, fmt.Errorf("mock driver attribute should not be set on the client")
}
// assert that the Driver information for the node is also set correctly
if mockDriverInfo == nil {
return false, fmt.Errorf("mock driver is nil when it should be set on node Drivers")
}
if mockDriverInfo.Detected {
return false, fmt.Errorf("mock driver should be set as detected")
}
if mockDriverInfo.Healthy {
return false, fmt.Errorf("mock driver should be set as healthy")
}
if mockDriverInfo.HealthDescription == "" {
return false, fmt.Errorf("mock driver description should not be empty")
}
return true, nil
}, func(err error) {
t.Fatalf("err: %v", err)
})
}
}

// TestClient_MixedTLS asserts that when a server is running with TLS enabled
Expand Down
41 changes: 40 additions & 1 deletion client/driver/docker.go
Original file line number Diff line number Diff line change
Expand Up @@ -492,7 +492,6 @@ func (d *DockerDriver) Fingerprint(req *cstructs.FingerprintRequest, resp *cstru
d.logger.Printf("[INFO] driver.docker: failed to initialize client: %s", err)
}
d.fingerprintSuccess = helper.BoolToPtr(false)
resp.RemoveAttribute(dockerDriverAttr)
return nil
}

Expand Down Expand Up @@ -552,6 +551,46 @@ func (d *DockerDriver) Fingerprint(req *cstructs.FingerprintRequest, resp *cstru
return nil
}

// HealthCheck implements the interface for the HealthCheck interface. This
// performs a health check on the docker driver, asserting whether the docker
// driver is responsive to a `docker ps` command.
func (d *DockerDriver) HealthCheck(req *cstructs.HealthCheckRequest, resp *cstructs.HealthCheckResponse) error {
dinfo := &structs.DriverInfo{
UpdateTime: time.Now(),
}

client, _, err := d.dockerClients()
if err != nil {
d.logger.Printf("[WARN] driver.docker: failed to retrieve Docker client in the process of a docker health check: %v", err)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is part of the reason I feel like we skip the health check unless it is detected, otherwise you will mark it as unhealthy and log even if docker just isn't there.

dinfo.HealthDescription = fmt.Sprintf("Failed retrieving Docker client: %v", err)
resp.AddDriverInfo("docker", dinfo)
return nil
}

_, err = client.ListContainers(docker.ListContainersOptions{All: false})
if err != nil {
d.logger.Printf("[WARN] driver.docker: failed to list Docker containers in the process of a Docker health check: %v", err)
dinfo.HealthDescription = fmt.Sprintf("Failed to list Docker containers: %v", err)
resp.AddDriverInfo("docker", dinfo)
return nil
}

d.logger.Printf("[TRACE] driver.docker: docker driver is available and is responsive to `docker ps`")
dinfo.Healthy = true
dinfo.HealthDescription = "Docker driver is available and responsive"
resp.AddDriverInfo("docker", dinfo)
return nil
}

// GetHealthChecks implements the interface for the HealthCheck interface. This
// sets whether the driver is eligible for periodic health checks and the
// interval at which to do them.
func (d *DockerDriver) GetHealthCheckInterval(req *cstructs.HealthCheckIntervalRequest, resp *cstructs.HealthCheckIntervalResponse) error {
resp.Eligible = true
resp.Period = 1 * time.Minute
return nil
}

// Validate is used to validate the driver configuration
func (d *DockerDriver) Validate(config map[string]interface{}) error {
fd := &fields.FieldData{
Expand Down
41 changes: 41 additions & 0 deletions client/driver/docker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"github.com/hashicorp/nomad/client/allocdir"
"github.com/hashicorp/nomad/client/config"
"github.com/hashicorp/nomad/client/driver/env"
"github.com/hashicorp/nomad/client/fingerprint"
cstructs "github.com/hashicorp/nomad/client/structs"
"github.com/hashicorp/nomad/client/testutil"
"github.com/hashicorp/nomad/helper/uuid"
Expand Down Expand Up @@ -164,6 +165,7 @@ func TestDockerDriver_Fingerprint(t *testing.T) {
if !tu.IsTravis() {
t.Parallel()
}

ctx := testDockerDriverContexts(t, &structs.Task{Name: "foo", Driver: "docker", Resources: basicResources})
//ctx.DriverCtx.config.Options = map[string]string{"docker.cleanup.image": "false"}
defer ctx.AllocDir.Destroy()
Expand Down Expand Up @@ -227,6 +229,7 @@ func TestDockerDriver_Fingerprint_Bridge(t *testing.T) {

request := &cstructs.FingerprintRequest{Config: conf, Node: conf.Node}
var response cstructs.FingerprintResponse

err = dd.Fingerprint(request, &response)
if err != nil {
t.Fatalf("error fingerprinting docker: %v", err)
Expand All @@ -251,6 +254,44 @@ func TestDockerDriver_Fingerprint_Bridge(t *testing.T) {
t.Logf("docker bridge ip: %q", attributes["driver.docker.bridge_ip"])
}

func TestDockerDriver_Check_DockerHealthStatus(t *testing.T) {
if !tu.IsTravis() {
t.Parallel()
}
if !testutil.DockerIsConnected(t) {
t.Skip("requires Docker")
}
if runtime.GOOS != "linux" {
t.Skip("expect only on linux")
}

require := require.New(t)

expectedAddr, err := sockaddr.GetInterfaceIP("docker0")
if err != nil {
t.Fatalf("unable to get ip for docker0: %v", err)
}
if expectedAddr == "" {
t.Fatalf("unable to get ip for docker bridge")
}

conf := testConfig(t)
conf.Node = mock.Node()
dd := NewDockerDriver(NewDriverContext("", "", conf, conf.Node, testLogger(), nil))

request := &cstructs.HealthCheckRequest{}
var response cstructs.HealthCheckResponse

dc, ok := dd.(fingerprint.HealthCheck)
require.True(ok)
err = dc.HealthCheck(request, &response)
require.Nil(err)

driverInfo := response.Drivers["docker"]
require.NotNil(driverInfo)
require.True(driverInfo.Healthy)
}

func TestDockerDriver_StartOpen_Wait(t *testing.T) {
if !tu.IsTravis() {
t.Parallel()
Expand Down
Loading