Skip to content

Commit

Permalink
[tmpnet] Set an explicit instance label for logs and metrics (#3650)
Browse files Browse the repository at this point in the history
Co-authored-by: Maru Newby <[email protected]>
  • Loading branch information
marun and maru-ava authored Jan 27, 2025
1 parent d31320e commit e84f72e
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 33 deletions.
17 changes: 0 additions & 17 deletions tests/fixture/tmpnet/network.go
Original file line number Diff line number Diff line change
Expand Up @@ -510,23 +510,6 @@ func (n *Network) StartNode(ctx context.Context, log logging.Logger, node *Node)

// Restart a single node.
func (n *Network) RestartNode(ctx context.Context, log logging.Logger, node *Node) error {
// Ensure the node reuses the same API port across restarts to ensure
// consistent labeling of metrics. Otherwise prometheus's automatic
// addition of the `instance` label (host:port) results in
// segmentation of results for a given node every time the port
// changes on restart. This segmentation causes graphs on the grafana
// dashboards to display multiple series per graph for a given node,
// one for each port that the node used.
//
// There is a non-zero chance of the port being allocatted to a
// different process and the node subsequently being unable to start,
// but the alternative is having to update the grafana dashboards
// query-by-query to ensure that node metrics ignore the instance
// label.
if err := node.SaveAPIPort(); err != nil {
return err
}

if err := node.Stop(ctx); err != nil {
return fmt.Errorf("failed to stop node %s: %w", node.NodeID, err)
}
Expand Down
22 changes: 6 additions & 16 deletions tests/fixture/tmpnet/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ import (
"errors"
"fmt"
"io"
"net"
"net/http"
"net/netip"
"os"
Expand Down Expand Up @@ -366,19 +365,10 @@ func (n *Node) EnsureNodeID() error {
return nil
}

// Saves the currently allocated API port to the node's configuration
// for use across restarts. Reusing the port ensures consistent
// labeling of metrics.
func (n *Node) SaveAPIPort() error {
hostPort := strings.TrimPrefix(n.URI, "http://")
if len(hostPort) == 0 {
// Without an API URI there is nothing to save
return nil
}
_, port, err := net.SplitHostPort(hostPort)
if err != nil {
return err
}
n.Flags[config.HTTPPortKey] = port
return nil
// GetUniqueID returns a globally unique identifier for the node.
func (n *Node) GetUniqueID() string {
nodeIDString := n.NodeID.String()
startIndex := len(ids.NodeIDPrefix)
endIndex := startIndex + 8 // 8 characters should be enough to identify a node in the context of its network
return n.NetworkUUID + "-" + strings.ToLower(nodeIDString[startIndex:endIndex])
}
4 changes: 4 additions & 0 deletions tests/fixture/tmpnet/node_process.go
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,10 @@ func (p *NodeProcess) getProcess() (*os.Process, error) {
func (p *NodeProcess) writeMonitoringConfig() error {
// Ensure labeling that uniquely identifies the node and its network
commonLabels := FlagsMap{
// Explicitly setting an instance label avoids the default
// behavior of using the node's URI since the URI isn't
// guaranteed stable (e.g. port may change after restart).
"instance": p.node.GetUniqueID(),
"network_uuid": p.node.NetworkUUID,
"node_id": p.node.NodeID,
"is_ephemeral_node": strconv.FormatBool(p.node.IsEphemeral),
Expand Down

0 comments on commit e84f72e

Please sign in to comment.