Skip to content

Commit

Permalink
Add an API method for determining the best status
Browse files Browse the repository at this point in the history
Given a list of HealthChecks, this determines the "best" status for the
collective group. This is useful for nodes and services, which may have
multiple checks associated with them.
  • Loading branch information
sethvargo committed Nov 29, 2016
1 parent 916f3c8 commit 4179aac
Show file tree
Hide file tree
Showing 6 changed files with 213 additions and 17 deletions.
57 changes: 57 additions & 0 deletions api/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ package api

import (
"fmt"
"log"
"strings"
)

const (
Expand All @@ -11,6 +13,15 @@ const (
HealthPassing = "passing"
HealthWarning = "warning"
HealthCritical = "critical"
HealthMaint = "maintenance"
)

const (
// NodeMaint is the special key set by a node in maintenance mode.
NodeMaint = "_node_maintenance"

// ServiceMaintPrefix is the prefix for a service in maintenance mode.
ServiceMaintPrefix = "_service_maintenance:"
)

// HealthCheck is used to represent a single check
Expand All @@ -25,6 +36,52 @@ type HealthCheck struct {
ServiceName string
}

// HealthChecks is a collection of HealthCheck structs.
type HealthChecks []*HealthCheck

// AggregatedStatus returns the "best" status for the list of health checks.
// Because a given entry may have many service and node-level health checks
// attached, this function determines the best representative of the status as
// as single string using the following heuristic:
//
// maintenance > critical > warning > passing
//
func (c HealthChecks) AggregatedStatus() string {
var passing, warning, critical, maintenance bool
for _, check := range c {
id := string(check.CheckID)
if id == NodeMaint || strings.HasPrefix(id, ServiceMaintPrefix) {
maintenance = true
continue
}

switch check.Status {
case HealthPassing:
passing = true
case HealthWarning:
warning = true
case HealthCritical:
critical = true
default:
log.Printf("[WARN] unknown status %q", check.Status)
return ""
}
}

switch {
case maintenance:
return HealthMaint
case critical:
return HealthCritical
case warning:
return HealthWarning
case passing:
return HealthPassing
default:
return HealthPassing
}
}

// ServiceEntry is used for the health service endpoint
type ServiceEntry struct {
Node *Node
Expand Down
133 changes: 133 additions & 0 deletions api/health_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,139 @@ func TestHealth_Node(t *testing.T) {
})
}

func TestHealthChecks_AggregatedStatus(t *testing.T) {
t.Parallel()

cases := []struct {
name string
checks HealthChecks
exp string
}{
{
"empty",
nil,
HealthPassing,
},
{
"passing",
HealthChecks{
&HealthCheck{
Status: HealthPassing,
},
},
HealthPassing,
},
{
"warning",
HealthChecks{
&HealthCheck{
Status: HealthWarning,
},
},
HealthWarning,
},
{
"critical",
HealthChecks{
&HealthCheck{
Status: HealthCritical,
},
},
HealthCritical,
},
{
"node_maintenance",
HealthChecks{
&HealthCheck{
CheckID: NodeMaint,
},
},
HealthMaint,
},
{
"service_maintenance",
HealthChecks{
&HealthCheck{
CheckID: ServiceMaintPrefix + "service",
},
},
HealthMaint,
},
{
"unknown",
HealthChecks{
&HealthCheck{
Status: "nope-nope-noper",
},
},
"",
},
{
"maintenance_over_critical",
HealthChecks{
&HealthCheck{
CheckID: NodeMaint,
},
&HealthCheck{
Status: HealthCritical,
},
},
HealthMaint,
},
{
"critical_over_warning",
HealthChecks{
&HealthCheck{
Status: HealthCritical,
},
&HealthCheck{
Status: HealthWarning,
},
},
HealthCritical,
},
{
"warning_over_passing",
HealthChecks{
&HealthCheck{
Status: HealthWarning,
},
&HealthCheck{
Status: HealthPassing,
},
},
HealthWarning,
},
{
"lots",
HealthChecks{
&HealthCheck{
Status: HealthPassing,
},
&HealthCheck{
Status: HealthPassing,
},
&HealthCheck{
Status: HealthPassing,
},
&HealthCheck{
Status: HealthWarning,
},
},
HealthWarning,
},
}

for i, tc := range cases {
t.Run(fmt.Sprintf("%d_%s", i, tc.name), func(t *testing.T) {
act := tc.checks.AggregatedStatus()
if tc.exp != act {
t.Errorf("\nexp: %#v\nact: %#v", tc.exp, act)
}
})
}
}

func TestHealth_Checks(t *testing.T) {
t.Parallel()
c, s := makeClient(t)
Expand Down
14 changes: 5 additions & 9 deletions command/agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,6 @@ const (
checksDir = "checks"
checkStateDir = "checks/state"

// The ID of the faux health checks for maintenance mode
serviceMaintCheckPrefix = "_service_maintenance"
nodeMaintCheckID = "_node_maintenance"

// Default reasons for node/service maintenance mode
defaultNodeMaintReason = "Maintenance mode is enabled for this node, " +
"but no reason was provided. This is a default message."
Expand Down Expand Up @@ -1532,7 +1528,7 @@ func (a *Agent) restoreCheckState(snap map[types.CheckID]*structs.HealthCheck) {

// serviceMaintCheckID returns the ID of a given service's maintenance check
func serviceMaintCheckID(serviceID string) types.CheckID {
return types.CheckID(fmt.Sprintf("%s:%s", serviceMaintCheckPrefix, serviceID))
return types.CheckID(structs.ServiceMaintPrefix + serviceID)
}

// EnableServiceMaintenance will register a false health check against the given
Expand Down Expand Up @@ -1593,7 +1589,7 @@ func (a *Agent) DisableServiceMaintenance(serviceID string) error {
// EnableNodeMaintenance places a node into maintenance mode.
func (a *Agent) EnableNodeMaintenance(reason, token string) {
// Ensure node maintenance is not already enabled
if _, ok := a.state.Checks()[nodeMaintCheckID]; ok {
if _, ok := a.state.Checks()[structs.NodeMaint]; ok {
return
}

Expand All @@ -1605,7 +1601,7 @@ func (a *Agent) EnableNodeMaintenance(reason, token string) {
// Create and register the node maintenance check
check := &structs.HealthCheck{
Node: a.config.NodeName,
CheckID: nodeMaintCheckID,
CheckID: structs.NodeMaint,
Name: "Node Maintenance Mode",
Notes: reason,
Status: structs.HealthCritical,
Expand All @@ -1616,10 +1612,10 @@ func (a *Agent) EnableNodeMaintenance(reason, token string) {

// DisableNodeMaintenance removes a node from maintenance mode
func (a *Agent) DisableNodeMaintenance() {
if _, ok := a.state.Checks()[nodeMaintCheckID]; !ok {
if _, ok := a.state.Checks()[structs.NodeMaint]; !ok {
return
}
a.RemoveCheck(nodeMaintCheckID, true)
a.RemoveCheck(structs.NodeMaint, true)
a.logger.Printf("[INFO] agent: Node left maintenance mode")
}

Expand Down
6 changes: 3 additions & 3 deletions command/agent/agent_endpoint_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -926,13 +926,13 @@ func TestHTTPAgent_EnableNodeMaintenance(t *testing.T) {
}

// Ensure the maintenance check was registered
check, ok := srv.agent.state.Checks()[nodeMaintCheckID]
check, ok := srv.agent.state.Checks()[structs.NodeMaint]
if !ok {
t.Fatalf("should have registered maintenance check")
}

// Check that the token was used
if token := srv.agent.state.CheckToken(nodeMaintCheckID); token != "mytoken" {
if token := srv.agent.state.CheckToken(structs.NodeMaint); token != "mytoken" {
t.Fatalf("expected 'mytoken', got '%s'", token)
}

Expand Down Expand Up @@ -962,7 +962,7 @@ func TestHTTPAgent_DisableNodeMaintenance(t *testing.T) {
}

// Ensure the maintenance check was removed
if _, ok := srv.agent.state.Checks()[nodeMaintCheckID]; ok {
if _, ok := srv.agent.state.Checks()[structs.NodeMaint]; ok {
t.Fatalf("should have removed maintenance check")
}
}
Expand Down
8 changes: 4 additions & 4 deletions command/agent/agent_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1577,13 +1577,13 @@ func TestAgent_NodeMaintenanceMode(t *testing.T) {
agent.EnableNodeMaintenance("broken", "mytoken")

// Make sure the critical health check was added
check, ok := agent.state.Checks()[nodeMaintCheckID]
check, ok := agent.state.Checks()[structs.NodeMaint]
if !ok {
t.Fatalf("should have registered critical node check")
}

// Check that the token was used to register the check
if token := agent.state.CheckToken(nodeMaintCheckID); token != "mytoken" {
if token := agent.state.CheckToken(structs.NodeMaint); token != "mytoken" {
t.Fatalf("expected 'mytoken', got: '%s'", token)
}

Expand All @@ -1596,15 +1596,15 @@ func TestAgent_NodeMaintenanceMode(t *testing.T) {
agent.DisableNodeMaintenance()

// Ensure the check was deregistered
if _, ok := agent.state.Checks()[nodeMaintCheckID]; ok {
if _, ok := agent.state.Checks()[structs.NodeMaint]; ok {
t.Fatalf("should have deregistered critical node check")
}

// Enter maintenance mode without passing a reason
agent.EnableNodeMaintenance("", "")

// Make sure the check was registered with the default note
check, ok = agent.state.Checks()[nodeMaintCheckID]
check, ok = agent.state.Checks()[structs.NodeMaint]
if !ok {
t.Fatalf("should have registered critical node check")
}
Expand Down
12 changes: 11 additions & 1 deletion consul/structs/structs.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,15 @@ const (
HealthPassing = "passing"
HealthWarning = "warning"
HealthCritical = "critical"
HealthMaint = "maintenance"
)

const (
// NodeMaint is the special key set by a node in maintenance mode.
NodeMaint = "_node_maintenance"

// ServiceMaintPrefix is the prefix for a service in maintenance mode.
ServiceMaintPrefix = "_service_maintenance:"
)

func ValidStatus(s string) bool {
Expand Down Expand Up @@ -412,6 +421,7 @@ func (c *HealthCheck) Clone() *HealthCheck {
return clone
}

// HealthChecks is a collection of HealthCheck structs.
type HealthChecks []*HealthCheck

// CheckServiceNode is used to provide the node, its service
Expand Down Expand Up @@ -460,7 +470,7 @@ type NodeInfo struct {
Address string
TaggedAddresses map[string]string
Services []*NodeService
Checks []*HealthCheck
Checks HealthChecks
}

// NodeDump is used to dump all the nodes with all their
Expand Down

0 comments on commit 4179aac

Please sign in to comment.