Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add basic signal metrics #2107

Merged
merged 8 commits into from
Jun 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion client/cmd/testutil.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ import (
"testing"
"time"

"github.com/stretchr/testify/require"
"go.opentelemetry.io/otel"

"github.com/netbirdio/netbird/management/server/activity"

"github.com/netbirdio/netbird/util"
Expand Down Expand Up @@ -53,7 +56,10 @@ func startSignal(t *testing.T) (*grpc.Server, net.Listener) {
t.Fatal(err)
}
s := grpc.NewServer()
sigProto.RegisterSignalExchangeServer(s, sig.NewServer())
srv, err := sig.NewServer(otel.Meter(""))
require.NoError(t, err)

sigProto.RegisterSignalExchangeServer(s, srv)
go func() {
if err := s.Serve(lis); err != nil {
panic(err)
Expand Down
11 changes: 8 additions & 3 deletions client/internal/engine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
log "github.com/sirupsen/logrus"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.opentelemetry.io/otel"
"golang.zx2c4.com/wireguard/wgctrl/wgtypes"
"google.golang.org/grpc"
"google.golang.org/grpc/keepalive"
Expand Down Expand Up @@ -810,7 +811,7 @@ func TestEngine_MultiplePeers(t *testing.T) {
ctx, cancel := context.WithCancel(CtxInitState(context.Background()))
defer cancel()

sigServer, signalAddr, err := startSignal()
sigServer, signalAddr, err := startSignal(t)
if err != nil {
t.Fatal(err)
return
Expand Down Expand Up @@ -1013,15 +1014,19 @@ func createEngine(ctx context.Context, cancel context.CancelFunc, setupKey strin
return e, err
}

func startSignal() (*grpc.Server, string, error) {
func startSignal(t *testing.T) (*grpc.Server, string, error) {
t.Helper()

s := grpc.NewServer(grpc.KeepaliveEnforcementPolicy(kaep), grpc.KeepaliveParams(kasp))

lis, err := net.Listen("tcp", "localhost:0")
if err != nil {
log.Fatalf("failed to listen: %v", err)
}

proto.RegisterSignalExchangeServer(s, signalServer.NewServer())
srv, err := signalServer.NewServer(otel.Meter(""))
require.NoError(t, err)
proto.RegisterSignalExchangeServer(s, srv)

go func() {
if err = s.Serve(lis); err != nil {
Expand Down
12 changes: 9 additions & 3 deletions client/server/server_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ import (
"time"

"github.com/netbirdio/management-integrations/integrations"
"github.com/stretchr/testify/require"
"go.opentelemetry.io/otel"

log "github.com/sirupsen/logrus"
"google.golang.org/grpc"
Expand Down Expand Up @@ -39,7 +41,7 @@ var (
// we will use a management server started via to simulate the server and capture the number of retries
func TestConnectWithRetryRuns(t *testing.T) {
// start the signal server
_, signalAddr, err := startSignal()
_, signalAddr, err := startSignal(t)
if err != nil {
t.Fatalf("failed to start signal server: %v", err)
}
Expand Down Expand Up @@ -141,15 +143,19 @@ func startManagement(t *testing.T, signalAddr string, counter *int) (*grpc.Serve
return s, lis.Addr().String(), nil
}

func startSignal() (*grpc.Server, string, error) {
func startSignal(t *testing.T) (*grpc.Server, string, error) {
t.Helper()

s := grpc.NewServer(grpc.KeepaliveEnforcementPolicy(kaep), grpc.KeepaliveParams(kasp))

lis, err := net.Listen("tcp", "localhost:0")
if err != nil {
log.Fatalf("failed to listen: %v", err)
}

proto.RegisterSignalExchangeServer(s, signalServer.NewServer())
srv, err := signalServer.NewServer(otel.Meter(""))
require.NoError(t, err)
proto.RegisterSignalExchangeServer(s, srv)

go func() {
if err = s.Serve(lis); err != nil {
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ require (
github.com/things-go/go-socks5 v0.0.4
github.com/yusufpapurcu/wmi v1.2.4
github.com/zcalusic/sysinfo v1.0.2
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.49.0
go.opentelemetry.io/otel v1.26.0
go.opentelemetry.io/otel/exporters/prometheus v0.48.0
go.opentelemetry.io/otel/metric v1.26.0
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -502,6 +502,8 @@ github.com/zcalusic/sysinfo v1.0.2 h1:nwTTo2a+WQ0NXwo0BGRojOJvJ/5XKvQih+2RrtWqfx
github.com/zcalusic/sysinfo v1.0.2/go.mod h1:kluzTYflRWo6/tXVMJPdEjShsbPpsFRyy+p1mBQPC30=
go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0=
go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo=
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.49.0 h1:4Pp6oUg3+e/6M4C0A/3kJ2VYa++dsWVTtGgLVj5xtHg=
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.49.0/go.mod h1:Mjt1i1INqiaoZOMGR1RIUJN+i3ChKoFRqzrRQhlkbs0=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.51.0 h1:Xs2Ncz0gNihqu9iosIZ5SkBbWo5T8JhhLJFMQL1qmLI=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.51.0/go.mod h1:vy+2G/6NvVMpwGX/NyLqcC41fxepnuKHk16E6IZUcJc=
go.opentelemetry.io/otel v1.26.0 h1:LQwgL5s/1W7YiiRwxf03QGnWLb2HW4pLiAhaA5cZXBs=
Expand Down
78 changes: 69 additions & 9 deletions signal/README.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
# netbird Signal Server

This is a netbird signal-exchange server and client library to exchange connection information between netbird peers
This is a netbird signal-exchange server and client library to exchange
connection information between netbird peers

## Command Options
The CLI accepts the command **management** with the following options:

The CLI accepts the the following options:

```shell
start Netbird Signal Server daemon

Expand All @@ -20,24 +23,38 @@ Global Flags:
--log-file string sets Netbird log path. If console is specified the the log will be output to stdout (default "/var/log/netbird/signal.log")
--log-level string (default "info")
```

## Running the Signal service (Docker)

We have packed the Signal server into docker image. You can pull the image from Docker Hub and execute it with the following commands:
We have packed the Signal server into docker image. You can pull the image from
Docker Hub and execute it with the
following commands:

````shell
docker pull netbirdio/signal:latest
docker run -d --name netbird-signal -p 10000:10000 netbirdio/signal:latest
````
The default log-level is set to INFO, if you need you can change it using by updating the docker cmd as followed:

The default log-level is set to INFO, if you need you can change it using by
updating the docker cmd as followed:

````shell
docker run -d --name netbird-signal -p 10000:10000 netbirdio/signal:latest --log-level DEBUG
````

### Run with TLS (Let's Encrypt).
By specifying the **--letsencrypt-domain** the daemon will handle SSL certificate request and configuration.

In the following example ```10000``` is the signal service **default** port, and ```443``` will be used as port for Let's Encrypt challenge and HTTP API.
> The server where you are running a container has to have a public IP (for Let's Encrypt certificate challenge).
By specifying the **--letsencrypt-domain** the daemon will handle SSL
certificate request and configuration.

Replace <YOUR-DOMAIN> with your server's public domain (e.g. mydomain.com or subdomain sub.mydomain.com).
In the following example ```10000``` is the signal service **default** port,
and ```443``` will be used as port for
Let's Encrypt challenge and HTTP API.
> The server where you are running a container has to have a public IP (for
> Let's Encrypt certificate challenge).

Replace `<YOUR-DOMAIN>` with your server's public domain (e.g. mydomain.com or
subdomain sub.mydomain.com).

```bash
# create a volume
Expand All @@ -50,14 +67,57 @@ docker run -d --name netbird-signal \
netbirdio/signal:latest \
--letsencrypt-domain <YOUR-DOMAIN>
```

## Metrics

The Signal Server exposes the following metrics in Prometheus format:

### Application Metrics

- **active_peers**: A Gauge metric that tracks the number of active peers
connected to the server.
- **peer_connection_duration_seconds**: A Histogram metric that measures the
duration a peer was connected in seconds.
- **registrations_total**: A Counter metric that counts the total number of peer
registrations.
- **deregistrations_total**: A Counter metric that counts the total number of
peer deregistrations.
- **registration_failures_total**: A Counter metric that counts the total number
of failed peer registrations. Possible
labels:
- `error`: The type of error that caused the registration failure (
e.g., `missing_id`, `missing_meta`, `failed_header`).
- **registration_delay_milliseconds**: A Histogram metric that measures the time
it took to register a peer in
milliseconds.
- **messages_forwarded_total**: A Counter metric that counts the total number of
messages forwarded between peers.
- **message_forward_failures_total**: A Counter metric that counts the total
number of failed message forwards between
peers. Possible labels:
- `type`: The type of failure (
e.g., `error`, `not_connected`, `not_registered`).
- **message_forward_latency_milliseconds**: A Histogram metric that measures the
latency of message forwarding between
peers in milliseconds.

### Endpoint

The metrics are exposed in Prometheus format on the `/metrics` endpoint. By
default, the server listens on port `9090`,
so the full endpoint would be:

> http://<server_ip>:9090/metrics

## For development purposes:

The project uses gRpc library and defines service in protobuf file located in:
```proto/signalexchange.proto```
```proto/signalexchange.proto```

To build the project you have to do the following things.

Install golang gRpc tools:

```bash
#!/bin/bash
go install google.golang.org/protobuf/cmd/[email protected]
Expand Down
7 changes: 6 additions & 1 deletion signal/client/client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
log "github.com/sirupsen/logrus"
"go.opentelemetry.io/otel"
"golang.zx2c4.com/wireguard/wgctrl/wgtypes"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"
Expand Down Expand Up @@ -198,7 +199,11 @@ func startSignal() (*grpc.Server, net.Listener) {
panic(err)
}
s := grpc.NewServer()
sigProto.RegisterSignalExchangeServer(s, server.NewServer())
srv, err := server.NewServer(otel.Meter(""))
if err != nil {
panic(err)
}
sigProto.RegisterSignalExchangeServer(s, srv)
go func() {
if err := s.Serve(lis); err != nil {
log.Fatalf("failed to serve: %v", err)
Expand Down
37 changes: 35 additions & 2 deletions signal/cmd/run.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package cmd

import (
"context"
"errors"
"flag"
"fmt"
Expand All @@ -13,8 +14,11 @@ import (
"strings"
"time"

"go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc"
"golang.org/x/crypto/acme/autocert"

"github.com/netbirdio/netbird/signal/metrics"

"github.com/netbirdio/netbird/encryption"
"github.com/netbirdio/netbird/signal/proto"
"github.com/netbirdio/netbird/signal/server"
Expand All @@ -28,6 +32,10 @@ import (
"google.golang.org/grpc/keepalive"
)

const (
metricsPort = 9090
)

var (
signalPort int
signalLetsencryptDomain string
Expand Down Expand Up @@ -95,9 +103,26 @@ var (
opts = append(opts, grpc.Creds(transportCredentials))
}

opts = append(opts, signalKaep, signalKasp)
metricsServer := metrics.NewServer(metricsPort, "")
if err != nil {
return fmt.Errorf("setup metrics: %v", err)
}

opts = append(opts, signalKaep, signalKasp, grpc.StatsHandler(otelgrpc.NewServerHandler()))
grpcServer := grpc.NewServer(opts...)
proto.RegisterSignalExchangeServer(grpcServer, server.NewServer())

go func() {
log.Infof("running metrics server: %s%s", metricsServer.Addr, metricsServer.Endpoint)
if err := metricsServer.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) {
log.Fatalf("Failed to start metrics server: %v", err)
}
}()

srv, err := server.NewServer(metricsServer.Meter)
if err != nil {
return fmt.Errorf("creating signal server: %v", err)
}
proto.RegisterSignalExchangeServer(grpcServer, srv)

var compatListener net.Listener
if signalPort != 10000 {
Expand Down Expand Up @@ -150,6 +175,14 @@ var (
_ = compatListener.Close()
log.Infof("stopped gRPC backward compatibility server")
}

ctx, cancel := context.WithTimeout(cmd.Context(), 5*time.Second)
defer cancel()
if err := metricsServer.Shutdown(ctx); err != nil {
log.Errorf("Failed to stop metrics server: %v", err)
}
log.Infof("stopped metrics server")

log.Infof("stopped Signal Service")

return nil
Expand Down
Loading
Loading