Skip to content

Commit

Permalink
Node heartbeats (#3709)
Browse files Browse the repository at this point in the history
Implements heartbeats for compute nodes, sending heartbeat messages to
the requester node over NATS PubSub. The server, upon receiving a
heartbeat, updates the map of nodes to include the current server-side
timestamp.

Compute nodes using the heartbeat client will continuously send
heartbeat messages every n seconds.

The heartbeat server receiving these heartbeats maintains a priority
queue, which dequeues the oldest items (lowest timestamp) first. Every 5
seconds, any item older than a specific timestamp is dequeued, and its
state is set either to unhealthy (if it is the first missed heartbeat) or
to unknown (if it is the second). The default thresholds are:

* 30s since heartbeat - unhealthy
* 60s since heartbeat - unknown (node may be live but disconnected)

The next heartbeat sent by an unhealthy or unknown node will make it
healthy again and ready to receive work.

The current state of the node is added to the nodeinfo during a
Get/GetByPrefix/List call to the node info store. This means that the
liveness is dynamic and not persisted to the kvstore for node info.
  • Loading branch information
rossjones authored and aronchick committed Apr 27, 2024
1 parent a8339a8 commit e951da1
Show file tree
Hide file tree
Showing 43 changed files with 1,392 additions and 128 deletions.
2 changes: 1 addition & 1 deletion .cspell/custom-dictionary.txt
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ multierror
multiformats
Muxed
mypy
nats
NATS
nbconvert
nemt
NOAA
Expand Down
12 changes: 11 additions & 1 deletion cmd/cli/node/columns.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,19 @@ var alwaysColumns = []output.TableColumn[*models.NodeInfo]{
Value: func(ni *models.NodeInfo) string { return ni.NodeType.String() },
},
{
ColumnConfig: table.ColumnConfig{Name: "status"},
ColumnConfig: table.ColumnConfig{Name: "approval"},
Value: func(ni *models.NodeInfo) string { return ni.Approval.String() },
},
{
ColumnConfig: table.ColumnConfig{Name: "status"},
Value: func(ni *models.NodeInfo) string {
if ni.ComputeNodeInfo != nil {
return ni.State.String()
}

return "" // nothing for requester nodes
},
},
}

var toggleColumns = map[string][]output.TableColumn[*models.NodeInfo]{
Expand Down
31 changes: 22 additions & 9 deletions cmd/cli/node/list.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,17 @@ import (

var defaultColumnGroups = []string{"labels", "capacity"}
var orderByFields = []string{"id", "type", "available_cpu", "available_memory", "available_disk", "available_gpu", "status"}
var filterStatusValues = []string{"approved", "pending", "rejected"}
var filterApprovalValues = []string{"approved", "pending", "rejected"}
var filterStatusValues = []string{"connected", "disconnected"}

// ListOptions is a struct to support node command
type ListOptions struct {
output.OutputOptions
cliflags.ListOptions
ColumnGroups []string
Labels string
FilterByStatus string
ColumnGroups []string
Labels string
FilterByApproval string
FilterByStatus string
}

// NewListOptions returns initialized Options
Expand All @@ -42,22 +44,24 @@ func NewListCmd() *cobra.Command {
Use: "list",
Short: "List info of network nodes. ",
Args: cobra.NoArgs,
Run: o.run,
RunE: o.run,
}
nodeCmd.Flags().StringSliceVar(&o.ColumnGroups, "show", o.ColumnGroups,
fmt.Sprintf("What column groups to show. Zero or more of: %q", maps.Keys(toggleColumns)))
nodeCmd.Flags().StringVar(&o.Labels, "labels", o.Labels,
"Filter nodes by labels. See https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/ for more information.")
nodeCmd.Flags().AddFlagSet(cliflags.ListFlags(&o.ListOptions))
nodeCmd.Flags().AddFlagSet(cliflags.OutputFormatFlags(&o.OutputOptions))
nodeCmd.Flags().StringVar(&o.FilterByApproval, "filter-approval", o.FilterByApproval,
fmt.Sprintf("Filter nodes by approval. One of: %q", filterApprovalValues))
nodeCmd.Flags().StringVar(&o.FilterByStatus, "filter-status", o.FilterByStatus,
fmt.Sprintf("Filter nodes by status. One of: %q", filterStatusValues))

return nodeCmd
}

// Run executes node command
func (o *ListOptions) run(cmd *cobra.Command, _ []string) {
func (o *ListOptions) run(cmd *cobra.Command, _ []string) error {
ctx := cmd.Context()

var err error
Expand All @@ -69,15 +73,22 @@ func (o *ListOptions) run(cmd *cobra.Command, _ []string) {
}
}

if o.FilterByApproval != "" {
if !slices.Contains(filterApprovalValues, o.FilterByApproval) {
return fmt.Errorf("cannot use '%s' as filter-approval value, should be one of: %q", o.FilterByApproval, filterApprovalValues)
}
}

if o.FilterByStatus != "" {
if !slices.Contains(filterStatusValues, o.FilterByStatus) {
util.Fatal(cmd, fmt.Errorf("cannot use '%s' as filter status value, should be one of: %q", o.FilterByStatus, filterStatusValues), 1)
return fmt.Errorf("cannot use '%s' as filter-status value, should be one of: %q", o.FilterByStatus, filterStatusValues)
}
}

response, err := util.GetAPIClientV2(cmd).Nodes().List(ctx, &apimodels.ListNodesRequest{
Labels: labelRequirements,
FilterByStatus: o.FilterByStatus,
Labels: labelRequirements,
FilterByApproval: o.FilterByApproval,
FilterByStatus: o.FilterByStatus,
BaseListRequest: apimodels.BaseListRequest{
Limit: o.Limit,
NextToken: o.NextToken,
Expand All @@ -97,4 +108,6 @@ func (o *ListOptions) run(cmd *cobra.Command, _ []string) {
if err = output.Output(cmd, columns, o.OutputOptions, response.Nodes); err != nil {
util.Fatal(cmd, fmt.Errorf("failed to output: %w", err), 1)
}

return nil
}
67 changes: 67 additions & 0 deletions docs/docs/dev/cli-reference/cli/node/approve/index.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
---
sidebar_label: approve
---

# Command: `node approve`

The `bacalhau node approve` command offers administrators the ability to approve the cluster membership for a node using its name.

## Description:

Using the `approve` sub-command under the `bacalhau node` umbrella, users can allow a node in the pending state to join the cluster and receive work. This feature is crucial for system administrators to manage the cluster.

## Usage:

```bash
bacalhau node approve [id] [flags]
```

## Flags:

- `[id]`:

- The unique identifier of the node you wish to approve.

- `-h`, `--help`:

- Displays the help documentation for the `approve` command.

- `-m message`:

- A message to be attached to the approval action.

## Global Flags:

- `--api-host string`:

- Specifies the host for client-server communication through REST. This flag is overridden if the `BACALHAU_API_HOST` environment variable is set.
- Default: `"bootstrap.production.bacalhau.org"`

- `--api-port int`:

- Designates the port for REST-based communication between client and server. This flag is overlooked if the `BACALHAU_API_PORT` environment variable is defined.
- Default: `1234`

- `--log-mode logging-mode`:

- Determines the log format preference.
- Options: `'default','station','json','combined','event'`
- Default: `'default'`

- `--repo string`:
- Points to the bacalhau repository's path.
- Default: `"$HOME/.bacalhau"`

## Examples:

1. Approve a Node with ID `nodeID123`:

```bash
bacalhau node approve nodeID123
```

2. Approve a Node with an audit message:

```bash
bacalhau node approve nodeID123 -m "okay"
```
67 changes: 67 additions & 0 deletions docs/docs/dev/cli-reference/cli/node/delete/index.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
---
sidebar_label: delete
---

# Command: `node delete`

The `bacalhau node delete` command offers administrators the ability to remove a node from the cluster using its name.

## Description:

Using the `delete` sub-command, administrators can remove a node from the list of available compute nodes in the cluster. This feature is necessary for the management of the infrastructure.

## Usage:

```bash
bacalhau node delete [id] [flags]
```

## Flags:

- `[id]`:

- The unique identifier of the node you wish to delete.

- `-h`, `--help`:

- Displays the help documentation for the `delete` command.

- `-m message`:

- A message to be attached to the deletion action.

## Global Flags:

- `--api-host string`:

- Specifies the host for client-server communication through REST. This flag is overridden if the `BACALHAU_API_HOST` environment variable is set.
- Default: `"bootstrap.production.bacalhau.org"`

- `--api-port int`:

- Designates the port for REST-based communication between client and server. This flag is overlooked if the `BACALHAU_API_PORT` environment variable is defined.
- Default: `1234`

- `--log-mode logging-mode`:

- Determines the log format preference.
- Options: `'default','station','json','combined','event'`
- Default: `'default'`

- `--repo string`:
- Points to the bacalhau repository's path.
- Default: `"$HOME/.bacalhau"`

## Examples:

1. Delete the Node with ID `nodeID123`:

```bash
bacalhau node delete nodeID123
```

2. Delete a Node with an audit message:

```bash
bacalhau node delete nodeID123 -m "bad actor"
```
77 changes: 52 additions & 25 deletions docs/docs/dev/cli-reference/cli/node/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,48 +12,75 @@ bacalhau node [command]

## Available Commands

1. **[approve](./approve)**:

- Description: Approves a single node to join the cluster.
- Usage:

```bash
bacalhau node approve
```

1. **[delete](./delete)**:

- Description: Deletes a node from the cluster using its ID.
- Usage:
```bash
bacalhau node delete
```

1. **[describe](./describe)**:
- Description: Retrieves detailed information of a node using its ID.
- Usage:
```bash
bacalhau node describe
```

2. **[list](./list)**:
- Description: Lists the details of all nodes present in the network.
- Usage:
```bash
bacalhau node list
```

- Description: Retrieves detailed information of a node using its ID.
- Usage:
```bash
bacalhau node describe
```

1. **[list](./list)**:

- Description: Lists the details of all nodes present in the network.
- Usage:
```bash
bacalhau node list
```

1. **[reject](./reject)**:

- Description: Reject a specific node's request to join the cluster.
- Usage:
```bash
bacalhau node reject
```
For comprehensive details on any of the sub-commands, run:
```bash
bacalhau node [command] --help
```
## Flags
- `-h`, `--help`:
- Description: Shows the help information for the `node` command.
- Description: Shows the help information for the `node` command.
## Global Flags
- `--api-host string`:
- Description: Specifies the host for RESTful communication between the client and server. The flag will be ignored if the `BACALHAU_API_HOST` environment variable is set.
- Default: `bootstrap.production.bacalhau.org`
- Description: Specifies the host for RESTful communication between the client and server. The flag will be ignored if the `BACALHAU_API_HOST` environment variable is set.
- Default: `bootstrap.production.bacalhau.org`
- `--api-port int`:
- Description: Designates the port for RESTful communication. The flag will be bypassed if the `BACALHAU_API_PORT` environment variable is active.
- Default: `1234`
- `--log-mode logging-mode`:
- Description: Chooses the preferred log format. Available choices are: `default`, `station`, `json`, `combined`, and `event`.
- Default: `default`
- Description: Designates the port for RESTful communication. The flag will be bypassed if the `BACALHAU_API_PORT` environment variable is active.
- Default: `1234`
- `--repo string`:
- Description: Specifies the path to the bacalhau repository.
- Default: `$HOME/.bacalhau`
- `--log-mode logging-mode`:
---
- Description: Chooses the preferred log format. Available choices are: `default`, `station`, `json`, `combined`, and `event`.
- Default: `default`
This should provide an organized and structured overview of the `node` command and its functionalities!
- `--repo string`:
- Description: Specifies the path to the bacalhau repository.
- Default: `$HOME/.bacalhau`
Loading

0 comments on commit e951da1

Please sign in to comment.