Skip to content

Commit

Permalink
cli: Add prune flag for nomad server force-leave command (#18463)
Browse files Browse the repository at this point in the history
This feature helps operators remove a failed/left node from the Serf layer immediately,
without waiting 24 hours for the node to be reaped.

* Update CLI with prune flag
* Update API /v1/agent/force-leave with prune query string parameter
* Update CLI and API doc
* Add unit test
  • Loading branch information
nvanthao authored Sep 15, 2023
1 parent d2dd64f commit 1339599
Show file tree
Hide file tree
Showing 8 changed files with 132 additions and 8 deletions.
3 changes: 3 additions & 0 deletions .changelog/18463.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:feature
cli: Add `-prune` flag to `nomad server force-leave` command
```
22 changes: 21 additions & 1 deletion api/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,12 @@ type KeyringRequest struct {
Key string
}

// ForceLeaveOpts holds the optional settings for force-leaving a node
// (see ForceLeaveWithOptions).
type ForceLeaveOpts struct {
	// Prune, when true, requests removal of the node from the list of
	// members.
	Prune bool
}

// Agent returns a new agent which can be used to query
// the agent-specific endpoints.
func (c *Client) Agent() *Agent {
Expand Down Expand Up @@ -163,7 +169,21 @@ func (a *Agent) MembersOpts(opts *QueryOptions) (*ServerMembers, error) {

// ForceLeave is used to eject an existing node from the cluster. The node
// name is passed as the "node" query parameter of /v1/agent/force-leave via
// the client's put helper.
// NOTE(review): this view interleaves the pre-change line (raw string
// concatenation of node) with its replacement (url.Values encoding); only the
// url.Values form is expected to remain — confirm against the file.
func (a *Agent) ForceLeave(node string) error {
_, err := a.client.put("/v1/agent/force-leave?node="+node, nil, nil, nil)
v := url.Values{}
v.Add("node", node)
_, err := a.client.put("/v1/agent/force-leave?"+v.Encode(), nil, nil, nil)
return err
}

// ForceLeaveWithOptions ejects an existing node from the cluster, applying the
// supplied options. The node name is always sent as the "node" query
// parameter; "prune=1" is added only when opts.Prune is set.
func (a *Agent) ForceLeaveWithOptions(node string, opts ForceLeaveOpts) error {
	params := url.Values{"node": []string{node}}
	if opts.Prune {
		params.Set("prune", "1")
	}

	endpoint := "/v1/agent/force-leave?" + params.Encode()
	_, err := a.client.put(endpoint, nil, nil, nil)
	return err
}

Expand Down
48 changes: 48 additions & 0 deletions api/agent_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ func TestAgent_ForceLeave(t *testing.T) {
must.One(t, n)

membersBefore, err := a.MembersOpts(&QueryOptions{})
must.NoError(t, err)
must.Eq(t, membersBefore.Members[1].Status, "alive")

err = a.ForceLeave(membersBefore.Members[1].Name)
Expand All @@ -152,6 +153,53 @@ func TestAgent_ForceLeave(t *testing.T) {
wait.Timeout(3*time.Second),
wait.Gap(100*time.Millisecond),
))

}

// TestAgent_ForceLeavePrune joins a second server, stops it, force-leaves it
// with the Prune option, and waits for the member count to change.
func TestAgent_ForceLeavePrune(t *testing.T) {
	testutil.Parallel(t)

	client, srv := makeClient(t, nil, nil)
	defer srv.Stop()
	agent := client.Agent()

	// Second server with a known node name and BootstrapExpect set to 0.
	const peerName = "foo"
	_, peer := makeClient(t, nil, func(cfg *testutil.TestServerConfig) {
		cfg.NodeName = peerName
		cfg.Server.BootstrapExpect = 0
	})

	joined, err := agent.Join(peer.SerfAddr)
	must.NoError(t, err)
	must.One(t, joined)

	before, err := agent.MembersOpts(&QueryOptions{})
	must.NoError(t, err)

	peer.Stop()

	// The member is addressed by its node name plus the ".global" suffix.
	err = agent.ForceLeaveWithOptions(peerName+".global", ForceLeaveOpts{Prune: true})
	must.NoError(t, err)

	// pruned succeeds once the member count differs from the count taken
	// before the force-leave.
	pruned := func() error {
		after, err := agent.MembersOpts(&QueryOptions{})
		if err != nil {
			return err
		}
		if len(after.Members) != len(before.Members) {
			return nil
		}
		return fmt.Errorf("node did not get pruned")
	}
	must.Wait(t, wait.InitialSuccess(
		wait.ErrorFunc(pruned),
		wait.Timeout(5*time.Second),
		wait.Gap(100*time.Millisecond),
	))
}

func (a *AgentMember) String() string {
Expand Down
11 changes: 10 additions & 1 deletion command/agent/agent_endpoint.go
Original file line number Diff line number Diff line change
Expand Up @@ -332,8 +332,17 @@ func (s *HTTPServer) AgentForceLeaveRequest(resp http.ResponseWriter, req *http.
return nil, CodedError(400, "missing node to force leave")
}

prune, err := parseBool(req, "prune")
if err != nil {
return nil, CodedError(400, "invalid prune value")
}

// Attempt remove
err := srv.RemoveFailedNode(node)
if prune != nil && *prune {
err = srv.RemoveFailedNodePrune(node)
} else {
err = srv.RemoveFailedNode(node)
}
return nil, err
}

Expand Down
26 changes: 22 additions & 4 deletions command/server_force_leave.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"fmt"
"strings"

"github.com/hashicorp/nomad/api"
"github.com/posener/complete"
)

Expand All @@ -21,14 +22,22 @@ Usage: nomad server force-leave [options] <node>
Forces an server to enter the "left" state. This can be used to
eject nodes which have failed and will not rejoin the cluster.
Note that if the member is actually still alive, it will
eventually rejoin the cluster again.
eventually rejoin the cluster again. The failed or left server will
be garbage collected after 24h.
If ACLs are enabled, this option requires a token with the 'agent:write'
capability.
General Options:
` + generalOptionsUsage(usageOptsDefault|usageOptsNoNamespace)
` + generalOptionsUsage(usageOptsDefault|usageOptsNoNamespace) + `
Server Force-Leave Options:
-prune
Removes failed or left server from the Serf member list immediately.
If member is actually still alive, it will eventually rejoin the cluster again.
`
return strings.TrimSpace(helpText)
}

Expand All @@ -37,7 +46,10 @@ func (c *ServerForceLeaveCommand) Synopsis() string {
}

// AutocompleteFlags returns the flags offered for shell autocompletion.
// NOTE(review): this view interleaves the pre-change return (client flag set
// only) with its replacement, which merges in "-prune" using
// complete.PredictNothing; only the merged form is expected to remain —
// confirm against the file.
func (c *ServerForceLeaveCommand) AutocompleteFlags() complete.Flags {
return c.Meta.AutocompleteFlags(FlagSetClient)
return mergeAutocompleteFlags(c.Meta.AutocompleteFlags(FlagSetClient),
complete.Flags{
"-prune": complete.PredictNothing,
})
}

func (c *ServerForceLeaveCommand) AutocompleteArgs() complete.Predictor {
Expand All @@ -47,8 +59,11 @@ func (c *ServerForceLeaveCommand) AutocompleteArgs() complete.Predictor {
// Name returns the name of this command, "server force-leave".
func (c *ServerForceLeaveCommand) Name() string {
	const name = "server force-leave"
	return name
}

func (c *ServerForceLeaveCommand) Run(args []string) int {
var prune bool
flags := c.Meta.FlagSet(c.Name(), FlagSetClient)
flags.Usage = func() { c.Ui.Output(c.Help()) }
flags.BoolVar(&prune, "prune", false, "Remove server completely from list of members")

if err := flags.Parse(args); err != nil {
return 1
}
Expand All @@ -70,7 +85,10 @@ func (c *ServerForceLeaveCommand) Run(args []string) int {
}

// Call force-leave on the node
if err := client.Agent().ForceLeave(node); err != nil {
forceLeaveOpts := api.ForceLeaveOpts{
Prune: prune,
}
if err := client.Agent().ForceLeaveWithOptions(node, forceLeaveOpts); err != nil {
c.Ui.Error(fmt.Sprintf("Error force-leaving server %s: %s", node, err))
return 1
}
Expand Down
5 changes: 5 additions & 0 deletions nomad/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -1866,6 +1866,11 @@ func (s *Server) RemoveFailedNode(node string) error {
return s.serf.RemoveFailedNode(node)
}

// RemoveFailedNodePrune asks Serf to remove the named failed node from the
// list of members immediately, returning whatever error Serf reports.
func (s *Server) RemoveFailedNodePrune(node string) error {
	err := s.serf.RemoveFailedNodePrune(node)
	return err
}

// KeyManager returns the Serf keyring manager
func (s *Server) KeyManager() *serf.KeyManager {
return s.serf.KeyManager()
Expand Down
6 changes: 5 additions & 1 deletion website/content/api-docs/agent.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -441,13 +441,17 @@ The table below shows this endpoint's support for
### Parameters

- `node` `(string: <required>)` - Specifies the name of the node to force leave.
- `prune` `(boolean: <optional>)` - Removes a failed or left server from the Serf
member list immediately. If the member is actually still alive, it will eventually
rejoin the cluster.


### Sample Request

```shell-session
$ curl \
--request POST \
https://localhost:4646/v1/agent/force-leave?node=client-ab2e23dc
https://localhost:4646/v1/agent/force-leave?node=client-ab2e23dc&prune=true
```

## Health
Expand Down
19 changes: 18 additions & 1 deletion website/content/docs/commands/server/force-leave.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@ description: >

The `server force-leave` command forces a server to enter the "left" state.
This can be used to eject server nodes which have failed and will not rejoin
the cluster. Note that if the server is actually still alive, it will
the cluster. The failed or left server will be garbage collected after `24h`.

~> Note that if the server is actually still alive, it will
eventually rejoin the cluster again.

## Usage
Expand All @@ -22,13 +24,21 @@ nomad server force-leave [options] <node>
This command expects only one argument - the node which should be forced
to enter the "left" state.

Additionally, by specifying the `-prune` flag, a failed or left node can be forcibly removed
from the list of members immediately.

If ACLs are enabled, this option requires a token with the `agent:write`
capability.

## General Options

@include 'general_options_no_namespace.mdx'

## Server Force-Leave Options

- `-prune`: Removes a failed or left server from the Serf member list immediately.
If the member is actually still alive, it will eventually rejoin the cluster.

## Examples

Force-leave the server "node1":
Expand All @@ -37,3 +47,10 @@ Force-leave the server "node1":
$ nomad server force-leave node1
```

Force-leave the server "node1" and prune it:

```shell-session
$ nomad server force-leave -prune node1
```

0 comments on commit 1339599

Please sign in to comment.