Skip to content

Commit

Permalink
Add basic signal metrics (#2107)
Browse files Browse the repository at this point in the history
  • Loading branch information
lixmal authored Jun 12, 2024
1 parent 94e5054 commit 85b8f36
Show file tree
Hide file tree
Showing 13 changed files with 443 additions and 39 deletions.
8 changes: 7 additions & 1 deletion client/cmd/testutil.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ import (
"testing"
"time"

"github.com/stretchr/testify/require"
"go.opentelemetry.io/otel"

"github.com/netbirdio/netbird/management/server/activity"

"github.com/netbirdio/netbird/util"
Expand Down Expand Up @@ -53,7 +56,10 @@ func startSignal(t *testing.T) (*grpc.Server, net.Listener) {
t.Fatal(err)
}
s := grpc.NewServer()
sigProto.RegisterSignalExchangeServer(s, sig.NewServer())
srv, err := sig.NewServer(otel.Meter(""))
require.NoError(t, err)

sigProto.RegisterSignalExchangeServer(s, srv)
go func() {
if err := s.Serve(lis); err != nil {
panic(err)
Expand Down
11 changes: 8 additions & 3 deletions client/internal/engine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
log "github.com/sirupsen/logrus"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.opentelemetry.io/otel"
"golang.zx2c4.com/wireguard/wgctrl/wgtypes"
"google.golang.org/grpc"
"google.golang.org/grpc/keepalive"
Expand Down Expand Up @@ -810,7 +811,7 @@ func TestEngine_MultiplePeers(t *testing.T) {
ctx, cancel := context.WithCancel(CtxInitState(context.Background()))
defer cancel()

sigServer, signalAddr, err := startSignal()
sigServer, signalAddr, err := startSignal(t)
if err != nil {
t.Fatal(err)
return
Expand Down Expand Up @@ -1013,15 +1014,19 @@ func createEngine(ctx context.Context, cancel context.CancelFunc, setupKey strin
return e, err
}

func startSignal() (*grpc.Server, string, error) {
func startSignal(t *testing.T) (*grpc.Server, string, error) {
t.Helper()

s := grpc.NewServer(grpc.KeepaliveEnforcementPolicy(kaep), grpc.KeepaliveParams(kasp))

lis, err := net.Listen("tcp", "localhost:0")
if err != nil {
log.Fatalf("failed to listen: %v", err)
}

proto.RegisterSignalExchangeServer(s, signalServer.NewServer())
srv, err := signalServer.NewServer(otel.Meter(""))
require.NoError(t, err)
proto.RegisterSignalExchangeServer(s, srv)

go func() {
if err = s.Serve(lis); err != nil {
Expand Down
12 changes: 9 additions & 3 deletions client/server/server_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ import (
"time"

"github.com/netbirdio/management-integrations/integrations"
"github.com/stretchr/testify/require"
"go.opentelemetry.io/otel"

log "github.com/sirupsen/logrus"
"google.golang.org/grpc"
Expand Down Expand Up @@ -39,7 +41,7 @@ var (
// we will use a management server started via to simulate the server and capture the number of retries
func TestConnectWithRetryRuns(t *testing.T) {
// start the signal server
_, signalAddr, err := startSignal()
_, signalAddr, err := startSignal(t)
if err != nil {
t.Fatalf("failed to start signal server: %v", err)
}
Expand Down Expand Up @@ -141,15 +143,19 @@ func startManagement(t *testing.T, signalAddr string, counter *int) (*grpc.Serve
return s, lis.Addr().String(), nil
}

func startSignal() (*grpc.Server, string, error) {
func startSignal(t *testing.T) (*grpc.Server, string, error) {
t.Helper()

s := grpc.NewServer(grpc.KeepaliveEnforcementPolicy(kaep), grpc.KeepaliveParams(kasp))

lis, err := net.Listen("tcp", "localhost:0")
if err != nil {
log.Fatalf("failed to listen: %v", err)
}

proto.RegisterSignalExchangeServer(s, signalServer.NewServer())
srv, err := signalServer.NewServer(otel.Meter(""))
require.NoError(t, err)
proto.RegisterSignalExchangeServer(s, srv)

go func() {
if err = s.Serve(lis); err != nil {
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ require (
github.com/things-go/go-socks5 v0.0.4
github.com/yusufpapurcu/wmi v1.2.4
github.com/zcalusic/sysinfo v1.0.2
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.49.0
go.opentelemetry.io/otel v1.26.0
go.opentelemetry.io/otel/exporters/prometheus v0.48.0
go.opentelemetry.io/otel/metric v1.26.0
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -502,6 +502,8 @@ github.com/zcalusic/sysinfo v1.0.2 h1:nwTTo2a+WQ0NXwo0BGRojOJvJ/5XKvQih+2RrtWqfx
github.com/zcalusic/sysinfo v1.0.2/go.mod h1:kluzTYflRWo6/tXVMJPdEjShsbPpsFRyy+p1mBQPC30=
go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0=
go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo=
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.49.0 h1:4Pp6oUg3+e/6M4C0A/3kJ2VYa++dsWVTtGgLVj5xtHg=
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.49.0/go.mod h1:Mjt1i1INqiaoZOMGR1RIUJN+i3ChKoFRqzrRQhlkbs0=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.51.0 h1:Xs2Ncz0gNihqu9iosIZ5SkBbWo5T8JhhLJFMQL1qmLI=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.51.0/go.mod h1:vy+2G/6NvVMpwGX/NyLqcC41fxepnuKHk16E6IZUcJc=
go.opentelemetry.io/otel v1.26.0 h1:LQwgL5s/1W7YiiRwxf03QGnWLb2HW4pLiAhaA5cZXBs=
Expand Down
78 changes: 69 additions & 9 deletions signal/README.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
# netbird Signal Server

This is a netbird signal-exchange server and client library to exchange connection information between netbird peers
This is a netbird signal-exchange server and client library to exchange
connection information between netbird peers

## Command Options
The CLI accepts the command **management** with the following options:

The CLI accepts the following options:

```shell
start Netbird Signal Server daemon

Expand All @@ -20,24 +23,38 @@ Global Flags:
--log-file string sets Netbird log path. If console is specified the the log will be output to stdout (default "/var/log/netbird/signal.log")
--log-level string (default "info")
```
## Running the Signal service (Docker)
We have packed the Signal server into docker image. You can pull the image from Docker Hub and execute it with the following commands:
We have packaged the Signal server into a Docker image. You can pull the image
from Docker Hub and run it with the following commands:
````shell
docker pull netbirdio/signal:latest
docker run -d --name netbird-signal -p 10000:10000 netbirdio/signal:latest
````
The default log-level is set to INFO, if you need you can change it using by updating the docker cmd as followed:
The default log level is INFO. If needed, you can change it by
updating the docker command as follows:
````shell
docker run -d --name netbird-signal -p 10000:10000 netbirdio/signal:latest --log-level DEBUG
````
### Run with TLS (Let's Encrypt).
By specifying the **--letsencrypt-domain** the daemon will handle SSL certificate request and configuration.

In the following example ```10000``` is the signal service **default** port, and ```443``` will be used as port for Let's Encrypt challenge and HTTP API.
> The server where you are running a container has to have a public IP (for Let's Encrypt certificate challenge).
When **--letsencrypt-domain** is specified, the daemon handles the SSL
certificate request and configuration.

Replace <YOUR-DOMAIN> with your server's public domain (e.g. mydomain.com or subdomain sub.mydomain.com).
In the following example, ```10000``` is the signal service **default** port,
and ```443``` will be used as the port for the
Let's Encrypt challenge and the HTTP API.
> The server where you are running a container has to have a public IP (for
> Let's Encrypt certificate challenge).

Replace `<YOUR-DOMAIN>` with your server's public domain (e.g. mydomain.com or
subdomain sub.mydomain.com).
```bash
# create a volume
Expand All @@ -50,14 +67,57 @@ docker run -d --name netbird-signal \
netbirdio/signal:latest \
--letsencrypt-domain <YOUR-DOMAIN>
```
## Metrics
The Signal Server exposes the following metrics in Prometheus format:
### Application Metrics
- **active_peers**: A Gauge metric that tracks the number of active peers
connected to the server.
- **peer_connection_duration_seconds**: A Histogram metric that measures the
duration a peer was connected in seconds.
- **registrations_total**: A Counter metric that counts the total number of peer
registrations.
- **deregistrations_total**: A Counter metric that counts the total number of
peer deregistrations.
- **registration_failures_total**: A Counter metric that counts the total number
of failed peer registrations. Possible
labels:
- `error`: The type of error that caused the registration failure (
e.g., `missing_id`, `missing_meta`, `failed_header`).
- **registration_delay_milliseconds**: A Histogram metric that measures the time
it took to register a peer in
milliseconds.
- **messages_forwarded_total**: A Counter metric that counts the total number of
messages forwarded between peers.
- **message_forward_failures_total**: A Counter metric that counts the total
number of failed message forwards between
peers. Possible labels:
- `type`: The type of failure (
e.g., `error`, `not_connected`, `not_registered`).
- **message_forward_latency_milliseconds**: A Histogram metric that measures the
latency of message forwarding between
peers in milliseconds.
### Endpoint
The metrics are exposed in Prometheus format on the `/metrics` endpoint. By
default, the server listens on port `9090`,
so the full endpoint would be:
> http://<server_ip>:9090/metrics
## For development purposes:
The project uses the gRPC library and defines the service in a protobuf file located in:
```proto/signalexchange.proto```
```proto/signalexchange.proto```
To build the project you have to do the following things.
Install the Go gRPC tools:
```bash
#!/bin/bash
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.26
Expand Down
7 changes: 6 additions & 1 deletion signal/client/client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
log "github.com/sirupsen/logrus"
"go.opentelemetry.io/otel"
"golang.zx2c4.com/wireguard/wgctrl/wgtypes"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"
Expand Down Expand Up @@ -198,7 +199,11 @@ func startSignal() (*grpc.Server, net.Listener) {
panic(err)
}
s := grpc.NewServer()
sigProto.RegisterSignalExchangeServer(s, server.NewServer())
srv, err := server.NewServer(otel.Meter(""))
if err != nil {
panic(err)
}
sigProto.RegisterSignalExchangeServer(s, srv)
go func() {
if err := s.Serve(lis); err != nil {
log.Fatalf("failed to serve: %v", err)
Expand Down
37 changes: 35 additions & 2 deletions signal/cmd/run.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package cmd

import (
"context"
"errors"
"flag"
"fmt"
Expand All @@ -13,8 +14,11 @@ import (
"strings"
"time"

"go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc"
"golang.org/x/crypto/acme/autocert"

"github.com/netbirdio/netbird/signal/metrics"

"github.com/netbirdio/netbird/encryption"
"github.com/netbirdio/netbird/signal/proto"
"github.com/netbirdio/netbird/signal/server"
Expand All @@ -28,6 +32,10 @@ import (
"google.golang.org/grpc/keepalive"
)

const (
metricsPort = 9090
)

var (
signalPort int
signalLetsencryptDomain string
Expand Down Expand Up @@ -95,9 +103,26 @@ var (
opts = append(opts, grpc.Creds(transportCredentials))
}

opts = append(opts, signalKaep, signalKasp)
metricsServer := metrics.NewServer(metricsPort, "")
if err != nil {
return fmt.Errorf("setup metrics: %v", err)
}

opts = append(opts, signalKaep, signalKasp, grpc.StatsHandler(otelgrpc.NewServerHandler()))
grpcServer := grpc.NewServer(opts...)
proto.RegisterSignalExchangeServer(grpcServer, server.NewServer())

go func() {
log.Infof("running metrics server: %s%s", metricsServer.Addr, metricsServer.Endpoint)
if err := metricsServer.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) {
log.Fatalf("Failed to start metrics server: %v", err)
}
}()

srv, err := server.NewServer(metricsServer.Meter)
if err != nil {
return fmt.Errorf("creating signal server: %v", err)
}
proto.RegisterSignalExchangeServer(grpcServer, srv)

var compatListener net.Listener
if signalPort != 10000 {
Expand Down Expand Up @@ -150,6 +175,14 @@ var (
_ = compatListener.Close()
log.Infof("stopped gRPC backward compatibility server")
}

ctx, cancel := context.WithTimeout(cmd.Context(), 5*time.Second)
defer cancel()
if err := metricsServer.Shutdown(ctx); err != nil {
log.Errorf("Failed to stop metrics server: %v", err)
}
log.Infof("stopped metrics server")

log.Infof("stopped Signal Service")

return nil
Expand Down
Loading

0 comments on commit 85b8f36

Please sign in to comment.