From f992631f3aacd3b446a56070bff2e527d4d398b0 Mon Sep 17 00:00:00 2001
From: Denis Rechkunov
Date: Tue, 23 Aug 2022 18:50:17 +0200
Subject: [PATCH] Add persistence acknowledgement support for shipper (#32708)

The shipper output can now receive acknowledgement of persisted events
from the shipper server. Also added UUID verification in case the client
reconnects to a different server.
---
 NOTICE.txt                                  |  4 +-
 go.mod                                      |  2 +-
 go.sum                                      |  4 +-
 libbeat/outputs/shipper/api/shipper_mock.go | 16 +++-
 libbeat/outputs/shipper/config.go           | 18 +++--
 libbeat/outputs/shipper/shipper.go          | 61 +++++++++++---
 libbeat/outputs/shipper/shipper_test.go     | 88 +++++++++++++++------
 7 files changed, 146 insertions(+), 47 deletions(-)

diff --git a/NOTICE.txt b/NOTICE.txt
index e23ebb81db4..cdd08c68d43 100644
--- a/NOTICE.txt
+++ b/NOTICE.txt
@@ -10069,11 +10069,11 @@ Contents of probable licence file $GOMODCACHE/github.com/elastic/elastic-agent-l
 --------------------------------------------------------------------------------
 Dependency : github.com/elastic/elastic-agent-shipper-client
-Version: v0.2.0
+Version: v0.4.0
 Licence type (autodetected): Elastic
 --------------------------------------------------------------------------------
 
-Contents of probable licence file $GOMODCACHE/github.com/elastic/elastic-agent-shipper-client@v0.2.0/LICENSE.txt:
+Contents of probable licence file $GOMODCACHE/github.com/elastic/elastic-agent-shipper-client@v0.4.0/LICENSE.txt:
 
 Elastic License 2.0
 
diff --git a/go.mod b/go.mod
index 97a4c2a4a84..06059cd0f42 100644
--- a/go.mod
+++ b/go.mod
@@ -203,7 +203,7 @@ require (
 	github.com/elastic/bayeux v1.0.5
 	github.com/elastic/elastic-agent-autodiscover v0.2.1
 	github.com/elastic/elastic-agent-libs v0.2.11
-	github.com/elastic/elastic-agent-shipper-client v0.2.0
+	github.com/elastic/elastic-agent-shipper-client v0.4.0
 	github.com/elastic/elastic-agent-system-metrics v0.4.4
 	github.com/elastic/go-elasticsearch/v8 v8.2.0
 	github.com/pierrec/lz4/v4 v4.1.15
diff --git a/go.sum b/go.sum
index 7d4451b5818..ef59438d866 100644
--- a/go.sum
+++ b/go.sum
@@ -603,8 +603,8 @@ github.com/elastic/elastic-agent-libs v0.2.5/go.mod h1:chO3rtcLyGlKi9S0iGVZhYCzD
 github.com/elastic/elastic-agent-libs v0.2.7/go.mod h1:chO3rtcLyGlKi9S0iGVZhYCzDfdDsAQYBc+ui588AFE=
 github.com/elastic/elastic-agent-libs v0.2.11 h1:ZeYn35Kxt+IdtMPmE01TaDeaahCg/z7MkGPVWUo6Lp4=
 github.com/elastic/elastic-agent-libs v0.2.11/go.mod h1:chO3rtcLyGlKi9S0iGVZhYCzDfdDsAQYBc+ui588AFE=
-github.com/elastic/elastic-agent-shipper-client v0.2.0 h1:p+5ep48YCOe+3nICeWmiLwQV11yDLad2n4NunI66Shg=
-github.com/elastic/elastic-agent-shipper-client v0.2.0/go.mod h1:OyI2W+Mv3JxlkEF3OeT7K0dbuxvwew8ke2Cf4HpLa9Q=
+github.com/elastic/elastic-agent-shipper-client v0.4.0 h1:nsTJF9oo4RHLl+zxFUZqNHaE86C6Ba5aImfegcEf6Sk=
+github.com/elastic/elastic-agent-shipper-client v0.4.0/go.mod h1:OyI2W+Mv3JxlkEF3OeT7K0dbuxvwew8ke2Cf4HpLa9Q=
 github.com/elastic/elastic-agent-system-metrics v0.4.4 h1:Br3S+TlBhijrLysOvbHscFhgQ00X/trDT5VEnOau0E0=
 github.com/elastic/elastic-agent-system-metrics v0.4.4/go.mod h1:tF/f9Off38nfzTZHIVQ++FkXrDm9keFhFpJ+3pQ00iI=
 github.com/elastic/elastic-transport-go/v8 v8.1.0 h1:NeqEz1ty4RQz+TVbUrpSU7pZ48XkzGWQj02k5koahIE=
diff --git a/libbeat/outputs/shipper/api/shipper_mock.go b/libbeat/outputs/shipper/api/shipper_mock.go
index fce9db750cd..7b9c3534e5d 100644
--- 
a/libbeat/outputs/shipper/api/shipper_mock.go +++ b/libbeat/outputs/shipper/api/shipper_mock.go @@ -22,17 +22,22 @@ import ( pb "github.com/elastic/elastic-agent-shipper-client/pkg/proto" "github.com/elastic/elastic-agent-shipper-client/pkg/proto/messages" + + "github.com/gofrs/uuid" ) func NewProducerMock(cap int) *ProducerMock { + id, _ := uuid.NewV4() return &ProducerMock{ - Q: make([]*messages.Event, 0, cap), + UUID: id.String(), + Q: make([]*messages.Event, 0, cap), } } type ProducerMock struct { pb.UnimplementedProducerServer Q []*messages.Event + UUID string Error error } @@ -52,5 +57,14 @@ func (p *ProducerMock) PublishEvents(ctx context.Context, r *messages.PublishReq resp.AcceptedCount++ } + resp.AcceptedIndex = uint64(len(p.Q)) + return resp, nil } + +func (p *ProducerMock) PersistedIndex(req *messages.PersistedIndexRequest, producer pb.Producer_PersistedIndexServer) error { + return producer.Send(&messages.PersistedIndexReply{ + Uuid: p.UUID, + PersistedIndex: uint64(len(p.Q)), + }) +} diff --git a/libbeat/outputs/shipper/config.go b/libbeat/outputs/shipper/config.go index 7e0a5d10386..43bef1a63c0 100644 --- a/libbeat/outputs/shipper/config.go +++ b/libbeat/outputs/shipper/config.go @@ -34,21 +34,27 @@ type Config struct { // TLS/SSL configurationf or secure connection TLS *tlscommon.Config `config:"ssl"` // Timeout of a single batch publishing request - Timeout time.Duration `config:"timeout" validate:"min=1"` + Timeout time.Duration `config:"timeout" validate:"min=1"` // MaxRetries is how many times the same batch is attempted to be sent - MaxRetries int `config:"max_retries" validate:"min=-1,nonzero"` + MaxRetries int `config:"max_retries" validate:"min=-1,nonzero"` // BulkMaxSize max amount of events in a single batch BulkMaxSize int `config:"bulk_max_size"` + // AckPollingInterval is an interval for polling the shipper server for persistence acknowledgement. + // The default/minimum 5 ms polling interval means we could publish at most 1000 ms / 5 ms = 200 batches/sec. + // With the default `bulk_max_size` of 50 events this would limit + // the default throughput per worker to 200 * 50 = 10000 events/sec. 
+ AckPollingInterval time.Duration `config:"ack_polling_interval" validate:"min=5ms"` // Backoff strategy for the shipper output Backoff backoffConfig `config:"backoff"` } func defaultConfig() Config { return Config{ - TLS: nil, - Timeout: 30 * time.Second, - MaxRetries: 3, - BulkMaxSize: 50, + TLS: nil, + Timeout: 30 * time.Second, + MaxRetries: 3, + BulkMaxSize: 50, + AckPollingInterval: 5 * time.Millisecond, Backoff: backoffConfig{ Init: 1 * time.Second, Max: 60 * time.Second, diff --git a/libbeat/outputs/shipper/shipper.go b/libbeat/outputs/shipper/shipper.go index eceedff9cd7..9a9aab65857 100644 --- a/libbeat/outputs/shipper/shipper.go +++ b/libbeat/outputs/shipper/shipper.go @@ -39,6 +39,7 @@ import ( "google.golang.org/grpc/credentials" "google.golang.org/grpc/credentials/insecure" "google.golang.org/grpc/status" + "google.golang.org/protobuf/types/known/durationpb" "google.golang.org/protobuf/types/known/timestamppb" ) @@ -49,6 +50,7 @@ type shipper struct { client sc.ProducerClient timeout time.Duration config Config + serverID string } func init() { @@ -109,11 +111,24 @@ func (c *shipper) Connect() error { if err != nil { return fmt.Errorf("shipper connection failed with: %w", err) } - c.log.Debugf("connect to %s established.", c.config.Server) c.conn = conn c.client = sc.NewProducerClient(conn) + indexClient, err := c.client.PersistedIndex(ctx, &messages.PersistedIndexRequest{ + PollingInterval: durationpb.New(0), // no need to subscribe, we need it only once + }) + if err != nil { + return fmt.Errorf("failed to connect to the server: %w", err) + } + serverInfo, err := indexClient.Recv() + if err != nil { + return fmt.Errorf("failed to fetch server information: %w", err) + } + c.serverID = serverInfo.GetUuid() + + c.log.Debugf("connection to %s (%s) established.", c.config.Server, c.serverID) + return nil } @@ -153,29 +168,55 @@ func (c *shipper) Publish(ctx context.Context, batch publisher.Batch) error { ctx, cancel := context.WithTimeout(ctx, c.timeout) defer cancel() - resp, err := c.client.PublishEvents(ctx, &messages.PublishRequest{ + publishReply, err := c.client.PublishEvents(ctx, &messages.PublishRequest{ + Uuid: c.serverID, Events: convertedEvents, }) - if status.Code(err) != codes.OK || resp == nil { + if status.Code(err) != codes.OK { batch.Cancelled() // does not decrease the TTL st.Cancelled(len(events)) // we cancel the whole batch not just non-dropped events return fmt.Errorf("failed to publish the batch to the shipper, none of the %d events were accepted: %w", len(convertedEvents), err) } // with a correct server implementation should never happen, this error is not recoverable - if int(resp.AcceptedCount) > len(nonDroppedEvents) { + if int(publishReply.AcceptedCount) > len(nonDroppedEvents) { return fmt.Errorf( "server returned unexpected results, expected maximum accepted items %d, got %d", len(nonDroppedEvents), - resp.AcceptedCount, + publishReply.AcceptedCount, ) } - // the server is supposed to retain the order of the initial events in the response - // judging by the size of the result list we can determine what part of the initial - // list was accepted and we can send the rest of the list for a retry - retries := nonDroppedEvents[resp.AcceptedCount:] + // so we explicitly close the stream + ackCtx, cancel := context.WithCancel(ctx) + defer cancel() + ackClient, err := c.client.PersistedIndex(ackCtx, &messages.PersistedIndexRequest{ + PollingInterval: durationpb.New(c.config.AckPollingInterval), + }) + if err != nil { + return fmt.Errorf("acknowledgement 
failed due to the connectivity error: %w", err) + } + + indexReply, err := ackClient.Recv() + if err != nil { + return fmt.Errorf("acknowledgement failed due to the connectivity error: %w", err) + } + + if indexReply.GetUuid() != c.serverID { + batch.Cancelled() + st.Cancelled(len(events)) + return fmt.Errorf("acknowledgement failed due to a connection to a different server %s, expected %s", indexReply.Uuid, c.serverID) + } + + for indexReply.PersistedIndex < publishReply.AcceptedIndex { + indexReply, err = ackClient.Recv() + if err != nil { + return fmt.Errorf("acknowledgement failed due to the connectivity error: %w", err) + } + } + + retries := nonDroppedEvents[publishReply.AcceptedCount:] if len(retries) == 0 { batch.ACK() st.Acked(len(nonDroppedEvents)) @@ -183,7 +224,7 @@ func (c *shipper) Publish(ctx context.Context, batch publisher.Batch) error { } else { batch.RetryEvents(retries) // decreases TTL unless guaranteed delivery st.Failed(len(retries)) - c.log.Debugf("%d events have been accepted, %d sent for retry, %d dropped", resp.AcceptedCount, len(retries), droppedCount) + c.log.Debugf("%d events have been accepted, %d sent for retry, %d dropped", publishReply.AcceptedCount, len(retries), droppedCount) } return nil diff --git a/libbeat/outputs/shipper/shipper_test.go b/libbeat/outputs/shipper/shipper_test.go index 08451006433..9cdd869e971 100644 --- a/libbeat/outputs/shipper/shipper_test.go +++ b/libbeat/outputs/shipper/shipper_test.go @@ -305,21 +305,11 @@ func TestPublish(t *testing.T) { }) require.NoError(t, err) - group, err := makeShipper( - nil, - beat.Info{Beat: "libbeat", IndexPrefix: "testbeat"}, - outputs.NewNilObserver(), - cfg, - ) - require.NoError(t, err) - require.Len(t, group.Clients, 1) + client := createShipperClient(t, cfg) batch := outest.NewBatch(tc.events...) - err = group.Clients[0].(outputs.Connectable).Connect() - require.NoError(t, err) - - err = group.Clients[0].Publish(ctx, batch) + err = client.Publish(ctx, batch) if tc.expError != "" { require.Error(t, err) require.Contains(t, err.Error(), tc.expError) @@ -348,19 +338,7 @@ func TestPublish(t *testing.T) { }) require.NoError(t, err) - group, err := makeShipper( - nil, - beat.Info{Beat: "libbeat", IndexPrefix: "testbeat"}, - outputs.NewNilObserver(), - cfg, - ) - require.NoError(t, err) - require.Len(t, group.Clients, 1) - - client := group.Clients[0].(outputs.NetworkClient) - - err = client.Connect() - require.NoError(t, err) + client := createShipperClient(t, cfg) // Should successfully publish with the server running batch := outest.NewBatch(events...) @@ -407,6 +385,48 @@ func TestPublish(t *testing.T) { require.NoError(t, err) require.Equal(t, expSignals, batch.Signals) }) + + t.Run("cancel the batch when a different server responds", func(t *testing.T) { + ctx, cancel := context.WithTimeout(ctx, 10*time.Second) + defer cancel() + + addr, stop := runServer(t, 5, nil, "localhost:0") + defer stop() + + cfg, err := config.NewConfigFrom(map[string]interface{}{ + "server": addr, + "timeout": 5, // 5 sec + "backoff": map[string]interface{}{ + "init": "10ms", + "max": "5s", + }, + }) + require.NoError(t, err) + + client := createShipperClient(t, cfg) + + // Should successfully publish without an ID change + batch := outest.NewBatch(events...) + err = client.Publish(ctx, batch) + require.NoError(t, err) + + // Replace the server (would change the ID) + stop() + + _, stop = runServer(t, 5, nil, addr) + defer stop() + + batch = outest.NewBatch(events...) 
+ err = client.Publish(ctx, batch) + require.Error(t, err) + + require.Eventually(t, func() bool { + // the mock server does not validate incoming IDs on `Publish`, so the error should come from + // the acknowledgement request + return strings.Contains(err.Error(), "acknowledgement failed due to a connection to a different server") + }, 100*time.Millisecond, 10*time.Millisecond) + }) + } // BenchmarkToShipperEvent is used to detect performance regression when the conversion function is changed. @@ -485,6 +505,24 @@ func runServer(t *testing.T, qSize int, err error, listenAddr string) (actualAdd return actualAddr, stop } +func createShipperClient(t *testing.T, cfg *config.C) outputs.NetworkClient { + group, err := makeShipper( + nil, + beat.Info{Beat: "libbeat", IndexPrefix: "testbeat"}, + outputs.NewNilObserver(), + cfg, + ) + require.NoError(t, err) + require.Len(t, group.Clients, 1) + + client := group.Clients[0].(outputs.NetworkClient) + + err = client.Connect() + require.NoError(t, err) + + return client +} + func protoStruct(t *testing.T, values map[string]interface{}) *messages.Struct { s, err := helpers.NewStruct(values) require.NoError(t, err)
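
For reference, the throughput bound described in the new AckPollingInterval comment in
libbeat/outputs/shipper/config.go (1000 ms / 5 ms = 200 batches/sec, 200 * 50 = 10000 events/sec
per worker) can be restated as a small back-of-envelope calculation. The standalone program below
is only an illustrative sketch and is not part of this patch; maxEventsPerSecond is a hypothetical
helper that mirrors the arithmetic in that comment under the default settings.

package main

import (
	"fmt"
	"time"
)

// maxEventsPerSecond is a hypothetical helper (not part of the patch) that estimates the
// per-worker publishing ceiling implied by acknowledgement polling: each published batch
// waits at least one polling interval for the persisted index to catch up, and a batch
// carries at most bulkMaxSize events.
func maxEventsPerSecond(ackPollingInterval time.Duration, bulkMaxSize int) float64 {
	batchesPerSecond := float64(time.Second) / float64(ackPollingInterval)
	return batchesPerSecond * float64(bulkMaxSize)
}

func main() {
	// Defaults from defaultConfig(): 5 ms polling interval and bulk_max_size of 50.
	fmt.Printf("%.0f events/sec\n", maxEventsPerSecond(5*time.Millisecond, 50)) // prints "10000 events/sec"
}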