From a4019eccd50ece5ba4ee9f600880e67e86f02708 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Wed, 10 Apr 2024 18:26:22 -0400 Subject: [PATCH 01/99] cleanup --- x-pack/libbeat/common/aws/credentials.go | 33 ++++--------------- x-pack/libbeat/common/aws/credentials_test.go | 2 +- 2 files changed, 8 insertions(+), 27 deletions(-) diff --git a/x-pack/libbeat/common/aws/credentials.go b/x-pack/libbeat/common/aws/credentials.go index f6efde3e2b20..981547cb1dc7 100644 --- a/x-pack/libbeat/common/aws/credentials.go +++ b/x-pack/libbeat/common/aws/credentials.go @@ -56,7 +56,7 @@ type ConfigAWS struct { // InitializeAWSConfig function creates the awssdk.Config object from the provided config func InitializeAWSConfig(beatsConfig ConfigAWS) (awssdk.Config, error) { - awsConfig, _ := GetAWSCredentials(beatsConfig) + awsConfig, _ := getAWSCredentials(beatsConfig) if awsConfig.Region == "" { if beatsConfig.DefaultRegion != "" { awsConfig.Region = beatsConfig.DefaultRegion @@ -92,12 +92,12 @@ func InitializeAWSConfig(beatsConfig ConfigAWS) (awssdk.Config, error) { return awsConfig, nil } -// GetAWSCredentials function gets aws credentials from the config. +// getAWSCredentials function gets aws credentials from the config. // If access keys given, use them as credentials. // If access keys are not given, then load from AWS config file. If credential_profile_name is not // given, default profile will be used. // If role_arn is given, assume the IAM role either with access keys or default profile. -func GetAWSCredentials(beatsConfig ConfigAWS) (awssdk.Config, error) { +func getAWSCredentials(beatsConfig ConfigAWS) (awssdk.Config, error) { // Check if accessKeyID or secretAccessKey or sessionToken is given from configuration if beatsConfig.AccessKeyID != "" || beatsConfig.SecretAccessKey != "" || beatsConfig.SessionToken != "" { return getConfigForKeys(beatsConfig), nil @@ -110,17 +110,10 @@ func GetAWSCredentials(beatsConfig ConfigAWS) (awssdk.Config, error) { // Provided config must contain an accessKeyID, secretAccessKey and sessionToken to generate a valid CredentialsProfile func getConfigForKeys(beatsConfig ConfigAWS) awssdk.Config { config := awssdk.NewConfig() - awsCredentials := awssdk.Credentials{ - AccessKeyID: beatsConfig.AccessKeyID, - SecretAccessKey: beatsConfig.SecretAccessKey, - } - - if beatsConfig.SessionToken != "" { - awsCredentials.SessionToken = beatsConfig.SessionToken - } - - addStaticCredentialsProviderToAwsConfig(beatsConfig, config) - + config.Credentials = credentials.NewStaticCredentialsProvider( + beatsConfig.AccessKeyID, + beatsConfig.SecretAccessKey, + beatsConfig.SessionToken) return *config } @@ -172,15 +165,3 @@ func addAssumeRoleProviderToAwsConfig(config ConfigAWS, awsConfig *awssdk.Config } }) } - -// addStaticCredentialsProviderToAwsConfig adds a static credentials provider to the current AWS config by using the keys stored in Beats config -func addStaticCredentialsProviderToAwsConfig(beatsConfig ConfigAWS, awsConfig *awssdk.Config) { - logger := logp.NewLogger("addStaticCredentialsProviderToAwsConfig") - logger.Debug("Switching credentials provider to StaticCredentialsProvider") - staticCredentialsProvider := credentials.NewStaticCredentialsProvider( - beatsConfig.AccessKeyID, - beatsConfig.SecretAccessKey, - beatsConfig.SessionToken) - - awsConfig.Credentials = staticCredentialsProvider -} diff --git a/x-pack/libbeat/common/aws/credentials_test.go b/x-pack/libbeat/common/aws/credentials_test.go index 43bbc642bc53..9f125c6301f4 100644 --- 
a/x-pack/libbeat/common/aws/credentials_test.go +++ b/x-pack/libbeat/common/aws/credentials_test.go @@ -41,7 +41,7 @@ func TestGetAWSCredentials(t *testing.T) { SecretAccessKey: "abc", SessionToken: "fake-session-token", } - awsConfig, err := GetAWSCredentials(inputConfig) + awsConfig, err := getAWSCredentials(inputConfig) assert.NoError(t, err) retrievedAWSConfig, err := awsConfig.Credentials.Retrieve(context.Background()) From a3d37571a998625c350b60652983439b9039b0f2 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Wed, 10 Apr 2024 21:59:21 -0400 Subject: [PATCH 02/99] cleanups --- x-pack/filebeat/input/awss3/input.go | 44 +++++++---------------- x-pack/filebeat/input/awss3/input_test.go | 4 +-- x-pack/libbeat/common/aws/semaphore.go | 19 ++++------ 3 files changed, 20 insertions(+), 47 deletions(-) diff --git a/x-pack/filebeat/input/awss3/input.go b/x-pack/filebeat/input/awss3/input.go index 5fc1c1f0491c..e80f6445005b 100644 --- a/x-pack/filebeat/input/awss3/input.go +++ b/x-pack/filebeat/input/awss3/input.go @@ -127,16 +127,17 @@ func (in *s3Input) Run(inputContext v2.Context, pipeline beat.Pipeline) error { defer cancelInputCtx() if in.config.QueueURL != "" { - regionName, err := getRegionFromQueueURL(in.config.QueueURL, in.config.AWSConfig.Endpoint, in.config.RegionName) - if err != nil && in.config.RegionName == "" { + configRegion := in.config.RegionName + urlRegion, err := getRegionFromQueueURL(in.config.QueueURL, in.config.AWSConfig.Endpoint) + if err != nil && configRegion == "" { + // Only report an error if we don't have a configured region + // to fall back on. return fmt.Errorf("failed to get AWS region from queue_url: %w", err) + } else if configRegion != "" && configRegion != urlRegion { + inputContext.Logger.Warnf("configured region disagrees with queue_url region (%q != %q): using %q", configRegion, urlRegion, urlRegion) } - var warn regionMismatchError - if errors.As(err, &warn) { - // Warn of mismatch, but go ahead with configured region name. - inputContext.Logger.Warnf("%v: using %q", err, regionName) - } - in.awsConfig.Region = regionName + + in.awsConfig.Region = urlRegion // Create SQS receiver and S3 notification processor. 
receiver, err := in.createSQSReceiver(inputContext, pipeline) @@ -319,7 +320,7 @@ func (in *s3Input) createS3Lister(ctx v2.Context, cancelCtx context.Context, cli var errBadQueueURL = errors.New("QueueURL is not in format: https://sqs.{REGION_ENDPOINT}.{ENDPOINT}/{ACCOUNT_NUMBER}/{QUEUE_NAME} or https://{VPC_ENDPOINT}.sqs.{REGION_ENDPOINT}.vpce.{ENDPOINT}/{ACCOUNT_NUMBER}/{QUEUE_NAME}") -func getRegionFromQueueURL(queueURL string, endpoint, defaultRegion string) (region string, err error) { +func getRegionFromQueueURL(queueURL, endpoint string) (string, error) { // get region from queueURL // Example for sqs queue: https://sqs.us-east-1.amazonaws.com/12345678912/test-s3-logs // Example for vpce: https://vpce-test.sqs.us-east-1.vpce.amazonaws.com/12345678912/sqs-queue @@ -332,11 +333,7 @@ func getRegionFromQueueURL(queueURL string, endpoint, defaultRegion string) (reg // check for sqs queue url if len(queueHostSplit) == 3 && queueHostSplit[0] == "sqs" { if queueHostSplit[2] == endpoint || (endpoint == "" && strings.HasPrefix(queueHostSplit[2], "amazonaws.")) { - region = queueHostSplit[1] - if defaultRegion != "" && region != defaultRegion { - return defaultRegion, regionMismatchError{queueURLRegion: region, defaultRegion: defaultRegion} - } - return region, nil + return queueHostSplit[1], nil } } @@ -344,30 +341,13 @@ func getRegionFromQueueURL(queueURL string, endpoint, defaultRegion string) (reg queueHostSplitVPC := strings.SplitN(u.Host, ".", 5) if len(queueHostSplitVPC) == 5 && queueHostSplitVPC[1] == "sqs" { if queueHostSplitVPC[4] == endpoint || (endpoint == "" && strings.HasPrefix(queueHostSplitVPC[4], "amazonaws.")) { - region = queueHostSplitVPC[2] - if defaultRegion != "" && region != defaultRegion { - return defaultRegion, regionMismatchError{queueURLRegion: region, defaultRegion: defaultRegion} - } - return region, nil + return queueHostSplitVPC[2], nil } } - - if defaultRegion != "" { - return defaultRegion, nil - } } return "", errBadQueueURL } -type regionMismatchError struct { - queueURLRegion string - defaultRegion string -} - -func (e regionMismatchError) Error() string { - return fmt.Sprintf("configured region disagrees with queue_url region: %q != %q", e.queueURLRegion, e.defaultRegion) -} - func getRegionForBucket(ctx context.Context, s3Client *s3.Client, bucketName string) (string, error) { getBucketLocationOutput, err := s3Client.GetBucketLocation(ctx, &s3.GetBucketLocationInput{ Bucket: awssdk.String(bucketName), diff --git a/x-pack/filebeat/input/awss3/input_test.go b/x-pack/filebeat/input/awss3/input_test.go index abc9f5c9a6a6..0a3053f7f1b9 100644 --- a/x-pack/filebeat/input/awss3/input_test.go +++ b/x-pack/filebeat/input/awss3/input_test.go @@ -54,7 +54,6 @@ func TestGetRegionFromQueueURL(t *testing.T) { name string queueURL string endpoint string - deflt string want string wantErr error }{ @@ -77,7 +76,6 @@ func TestGetRegionFromQueueURL(t *testing.T) { { name: "vpce_endpoint", queueURL: "https://vpce-test.sqs.us-east-2.vpce.amazonaws.com/12345678912/sqs-queue", - deflt: "", want: "us-east-2", }, { @@ -90,7 +88,7 @@ func TestGetRegionFromQueueURL(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - got, err := getRegionFromQueueURL(test.queueURL, test.endpoint, test.deflt) + got, err := getRegionFromQueueURL(test.queueURL, test.endpoint) if !sameError(err, test.wantErr) { t.Errorf("unexpected error: got:%v want:%v", err, test.wantErr) } diff --git a/x-pack/libbeat/common/aws/semaphore.go b/x-pack/libbeat/common/aws/semaphore.go 
index 28343bcbd32e..1e7af456b28c 100644 --- a/x-pack/libbeat/common/aws/semaphore.go +++ b/x-pack/libbeat/common/aws/semaphore.go @@ -10,19 +10,14 @@ import ( ) type Sem struct { - mutex *sync.Mutex cond sync.Cond available int } func NewSem(n int) *Sem { - var m sync.Mutex return &Sem{ available: n, - mutex: &m, - cond: sync.Cond{ - L: &m, - }, + cond: sync.Cond{L: &sync.Mutex{}}, } } @@ -46,8 +41,8 @@ func (s *Sem) Acquire(n int) int { return 0 } - s.mutex.Lock() - defer s.mutex.Unlock() + s.cond.L.Lock() + defer s.cond.L.Unlock() if s.available == 0 { s.cond.Wait() @@ -68,16 +63,16 @@ func (s *Sem) Release(n int) { return } - s.mutex.Lock() - defer s.mutex.Unlock() + s.cond.L.Lock() + defer s.cond.L.Unlock() s.available += n s.cond.Signal() } func (s *Sem) Available() int { - s.mutex.Lock() - defer s.mutex.Unlock() + s.cond.L.Lock() + defer s.cond.L.Unlock() return s.available } From 597e0a580078b2ab5052ac512bb621a5c5ca67e6 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Thu, 11 Apr 2024 10:25:13 -0400 Subject: [PATCH 03/99] break input sources up into separate helper functions --- x-pack/filebeat/input/awss3/input.go | 114 ++++++++++++++++----------- 1 file changed, 70 insertions(+), 44 deletions(-) diff --git a/x-pack/filebeat/input/awss3/input.go b/x-pack/filebeat/input/awss3/input.go index e80f6445005b..31431c3adcc7 100644 --- a/x-pack/filebeat/input/awss3/input.go +++ b/x-pack/filebeat/input/awss3/input.go @@ -127,57 +127,16 @@ func (in *s3Input) Run(inputContext v2.Context, pipeline beat.Pipeline) error { defer cancelInputCtx() if in.config.QueueURL != "" { - configRegion := in.config.RegionName - urlRegion, err := getRegionFromQueueURL(in.config.QueueURL, in.config.AWSConfig.Endpoint) - if err != nil && configRegion == "" { - // Only report an error if we don't have a configured region - // to fall back on. - return fmt.Errorf("failed to get AWS region from queue_url: %w", err) - } else if configRegion != "" && configRegion != urlRegion { - inputContext.Logger.Warnf("configured region disagrees with queue_url region (%q != %q): using %q", configRegion, urlRegion, urlRegion) - } - - in.awsConfig.Region = urlRegion - - // Create SQS receiver and S3 notification processor. - receiver, err := in.createSQSReceiver(inputContext, pipeline) + err = in.runQueueReader(ctx, inputContext, pipeline) if err != nil { - return fmt.Errorf("failed to initialize sqs receiver: %w", err) - } - defer receiver.metrics.Close() - - // Poll metrics periodically in the background - go pollSqsWaitingMetric(ctx, receiver) - - if err := receiver.Receive(ctx); err != nil { + // possibly this should be unconditional? return err } } if in.config.BucketARN != "" || in.config.NonAWSBucketName != "" { - // Create client for publishing events and receive notification of their ACKs. - client, err := pipeline.ConnectWith(beat.ClientConfig{ - CloseRef: inputContext.Cancelation, - EventListener: awscommon.NewEventACKHandler(), - Processing: beat.ProcessingConfig{ - // This input only produces events with basic types so normalization - // is not required. - EventNormalization: boolPtr(false), - }, - }) + err = in.runS3Poller(ctx, inputContext, pipeline, persistentStore, states) if err != nil { - return fmt.Errorf("failed to create pipeline client: %w", err) - } - defer client.Close() - - // Create S3 receiver and S3 notification processor. 
- poller, err := in.createS3Lister(inputContext, ctx, client, persistentStore, states) - if err != nil { - return fmt.Errorf("failed to initialize s3 poller: %w", err) - } - defer poller.metrics.Close() - - if err := poller.Poll(ctx); err != nil { return err } } @@ -185,6 +144,70 @@ func (in *s3Input) Run(inputContext v2.Context, pipeline beat.Pipeline) error { return nil } +func (in *s3Input) runS3Poller( + ctx context.Context, + inputContext v2.Context, + pipeline beat.Pipeline, + persistentStore *statestore.Store, + states *states +) error { + // Create client for publishing events and receive notification of their ACKs. + client, err := pipeline.ConnectWith(beat.ClientConfig{ + CloseRef: inputContext.Cancelation, + EventListener: awscommon.NewEventACKHandler(), + Processing: beat.ProcessingConfig{ + // This input only produces events with basic types so normalization + // is not required. + EventNormalization: boolPtr(false), + }, + }) + if err != nil { + return fmt.Errorf("failed to create pipeline client: %w", err) + } + defer client.Close() + + // Create S3 receiver and S3 notification processor. + poller, err := in.createS3Lister(inputContext, ctx, client, persistentStore, states) + if err != nil { + return fmt.Errorf("failed to initialize s3 poller: %w", err) + } + defer poller.metrics.Close() + + if err := poller.Poll(ctx); err != nil { + return err + } +} + +func (in *s3Input) runQueueReader( + ctx context.Context, + inputContext v2.Context, + pipeline beat.Pipeline, +) error { + configRegion := in.config.RegionName + urlRegion, err := getRegionFromQueueURL(in.config.QueueURL, in.config.AWSConfig.Endpoint) + if err != nil && configRegion == "" { + // Only report an error if we don't have a configured region + // to fall back on. + return fmt.Errorf("failed to get AWS region from queue_url: %w", err) + } else if configRegion != "" && configRegion != urlRegion { + inputContext.Logger.Warnf("configured region disagrees with queue_url region (%q != %q): using %q", configRegion, urlRegion, urlRegion) + } + + in.awsConfig.Region = urlRegion + + // Create SQS receiver and S3 notification processor. 
+ receiver, err := in.createSQSReceiver(inputContext, pipeline) + if err != nil { + return fmt.Errorf("failed to initialize sqs receiver: %w", err) + } + defer receiver.metrics.Close() + + // Poll metrics periodically in the background + go pollSqsWaitingMetric(ctx, receiver) + + return receiver.Receive(ctx) +} + func (in *s3Input) createSQSReceiver(ctx v2.Context, pipeline beat.Pipeline) (*sqsReader, error) { sqsAPI := &awsSQSAPI{ client: sqs.NewFromConfig(in.awsConfig, func(o *sqs.Options) { @@ -227,8 +250,11 @@ func (in *s3Input) createSQSReceiver(ctx v2.Context, pipeline beat.Pipeline) (*s return nil, err } in.metrics = newInputMetrics(ctx.ID, nil, in.config.MaxNumberOfMessages) + s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), in.metrics, s3API, fileSelectors, in.config.BackupConfig, in.config.MaxNumberOfMessages) + sqsMessageHandler := newSQSS3EventProcessor(log.Named("sqs_s3_event"), in.metrics, sqsAPI, script, in.config.VisibilityTimeout, in.config.SQSMaxReceiveCount, pipeline, s3EventHandlerFactory, in.config.MaxNumberOfMessages) + sqsReader := newSQSReader(log.Named("sqs"), in.metrics, sqsAPI, in.config.MaxNumberOfMessages, sqsMessageHandler) return sqsReader, nil From 0df748a27ebf875ef90006cdefc54c9cb78c91be Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Thu, 11 Apr 2024 14:20:56 -0400 Subject: [PATCH 04/99] finish helper function split --- x-pack/filebeat/input/awss3/input.go | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/x-pack/filebeat/input/awss3/input.go b/x-pack/filebeat/input/awss3/input.go index 31431c3adcc7..d044f98c0637 100644 --- a/x-pack/filebeat/input/awss3/input.go +++ b/x-pack/filebeat/input/awss3/input.go @@ -149,7 +149,7 @@ func (in *s3Input) runS3Poller( inputContext v2.Context, pipeline beat.Pipeline, persistentStore *statestore.Store, - states *states + states *states, ) error { // Create client for publishing events and receive notification of their ACKs. client, err := pipeline.ConnectWith(beat.ClientConfig{ @@ -173,9 +173,7 @@ func (in *s3Input) runS3Poller( } defer poller.metrics.Close() - if err := poller.Poll(ctx); err != nil { - return err - } + return poller.Poll(ctx) } func (in *s3Input) runQueueReader( From 4b709003aafd1f49026e21eede45a5666e6d7fd9 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Thu, 11 Apr 2024 17:26:22 -0400 Subject: [PATCH 05/99] rewrite the sqsReader main loop --- x-pack/filebeat/input/awss3/sqs.go | 156 ++++++++++++++++++++--------- 1 file changed, 108 insertions(+), 48 deletions(-) diff --git a/x-pack/filebeat/input/awss3/sqs.go b/x-pack/filebeat/input/awss3/sqs.go index dd454a3bfb92..b686e6b65461 100644 --- a/x-pack/filebeat/input/awss3/sqs.go +++ b/x-pack/filebeat/input/awss3/sqs.go @@ -13,9 +13,8 @@ import ( "github.com/aws/aws-sdk-go-v2/service/sqs/types" - awscommon "github.com/elastic/beats/v7/x-pack/libbeat/common/aws" + "github.com/elastic/beats/v7/libbeat/common/atomic" "github.com/elastic/elastic-agent-libs/logp" - "github.com/elastic/go-concert/timed" ) const ( @@ -25,11 +24,26 @@ const ( type sqsReader struct { maxMessagesInflight int - workerSem *awscommon.Sem + activeMessages atomic.Int sqs sqsAPI msgHandler sqsProcessor log *logp.Logger metrics *inputMetrics + + // The main loop sends incoming messages to workChan, and the worker + // goroutines read from it. 
+ workChan chan types.Message + + // workerWg is used to wait on worker goroutines during shutdown + workerWg sync.WaitGroup + + // This channel is used by wakeUpMainLoop() to signal to the main + // loop that a worker is ready for more data + wakeUpChan chan struct{} + + // If retryTimer is set, there was an error receiving SQS messages, + // and the run loop will not try again until the timer expires. + retryTimer *time.Timer } func newSQSReader(log *logp.Logger, metrics *inputMetrics, sqs sqsAPI, maxMessagesInflight int, msgHandler sqsProcessor) *sqsReader { @@ -39,69 +53,115 @@ func newSQSReader(log *logp.Logger, metrics *inputMetrics, sqs sqsAPI, maxMessag } return &sqsReader{ maxMessagesInflight: maxMessagesInflight, - workerSem: awscommon.NewSem(maxMessagesInflight), sqs: sqs, msgHandler: msgHandler, log: log, metrics: metrics, + workChan: make(chan types.Message), + + // wakeUpChan is buffered so we can always trigger it without blocking, + // even if the main loop is in the middle of other work + wakeUpChan: make(chan struct{}, 1), } } -func (r *sqsReader) Receive(ctx context.Context) error { - // This loop tries to keep the workers busy as much as possible while - // honoring the max message cap as opposed to a simpler loop that receives - // N messages, waits for them all to finish, then requests N more messages. - var workerWg sync.WaitGroup - for ctx.Err() == nil { - // Determine how many SQS workers are available. - workers, err := r.workerSem.AcquireContext(r.maxMessagesInflight, ctx) - if err != nil { - break +func (r *sqsReader) wakeUpMainLoop() { + select { + case r.wakeUpChan <- struct{}{}: + default: + } +} + +func (r *sqsReader) sqsWorkerLoop(ctx context.Context) { + for msg := range r.workChan { + start := time.Now() + + id := r.metrics.beginSQSWorker() + if err := r.msgHandler.ProcessSQS(ctx, &msg); err != nil { + r.log.Warnw("Failed processing SQS message.", + "error", err, + "message_id", *msg.MessageId, + "elapsed_time_ns", time.Since(start)) } + r.metrics.endSQSWorker(id) + r.activeMessages.Dec() + // Notify the main loop that we're ready for more data, in case it's asleep + r.wakeUpMainLoop() + } +} - // Receive (at most) as many SQS messages as there are workers. - msgs, err := r.sqs.ReceiveMessage(ctx, workers) - if err != nil { - r.workerSem.Release(workers) +func (r *sqsReader) getMessageBatch(ctx context.Context) []types.Message { + // We read enough messages to bring activeMessages up to the total + // worker count + receiveCount := r.maxMessagesInflight - r.activeMessages.Load() + if receiveCount > 0 { + msgs, err := r.sqs.ReceiveMessage(ctx, receiveCount) + if err != nil && ctx.Err() == nil { + r.log.Warnw("SQS ReceiveMessage returned an error. Will retry after a short delay.", "error", err) + r.retryTimer = time.NewTimer(sqsRetryDelay) + } + r.activeMessages.Add(len(msgs)) + r.log.Debugf("Received %v SQS messages.", len(msgs)) + r.metrics.sqsMessagesReceivedTotal.Add(uint64(len(msgs))) + return msgs + } + return nil +} - if ctx.Err() == nil { - r.log.Warnw("SQS ReceiveMessage returned an error. Will retry after a short delay.", "error", err) +func (r *sqsReader) startWorkers(ctx context.Context) { + // Start the worker goroutines that will process messages from workChan + // until the input shuts down. + for i := 0; i < r.maxMessagesInflight; i++ { + r.workerWg.Add(1) + go func() { + defer r.workerWg.Done() + r.sqsWorkerLoop(ctx) + }() + } +} - // Throttle retries. 
- _ = timed.Wait(ctx, sqsRetryDelay) - } - continue +func (r *sqsReader) Receive(ctx context.Context) error { + var msgs []types.Message + for ctx.Err() == nil { + // If we don't have any messages, and we aren't in a retry delay, + // try to read some + if len(msgs) == 0 && r.retryTimer == nil { + msgs = r.getMessageBatch(ctx) } - // Release unused workers. - r.workerSem.Release(workers - len(msgs)) + // Unblock the local work channel only if there are messages to send + var workChan chan types.Message + var nextMessage types.Message + if len(msgs) > 0 { + workChan = r.workChan + nextMessage = msgs[0] + } - // Process each SQS message asynchronously with a goroutine. - r.log.Debugf("Received %v SQS messages.", len(msgs)) - r.metrics.sqsMessagesReceivedTotal.Add(uint64(len(msgs))) - workerWg.Add(len(msgs)) - - for _, msg := range msgs { - go func(msg types.Message, start time.Time) { - id := r.metrics.beginSQSWorker() - defer func() { - r.metrics.endSQSWorker(id) - workerWg.Done() - r.workerSem.Release(1) - }() - - if err := r.msgHandler.ProcessSQS(ctx, &msg); err != nil { - r.log.Warnw("Failed processing SQS message.", - "error", err, - "message_id", *msg.MessageId, - "elapsed_time_ns", time.Since(start)) - } - }(msg, time.Now()) + // Unblock the retry channel only if there's an active retry timer + var retryChan <-chan time.Time + if r.retryTimer != nil { + retryChan = r.retryTimer.C + } + + select { + case <-ctx.Done(): + case workChan <- nextMessage: + msgs = msgs[1:] + case <-retryChan: + // The retry interval has elapsed, clear the timer so we can request + // new messages again + r.retryTimer = nil + case <-r.wakeUpChan: + // No need to do anything, this is just to unblock us when a worker is + // ready for more data } } + // Close the work channel to signal to the workers that we're done + close(r.workChan) + // Wait for all workers to finish. - workerWg.Wait() + r.workerWg.Wait() if errors.Is(ctx.Err(), context.Canceled) { // A canceled context is a normal shutdown. 
From 90d9e24eac0a3c34fb594e97fc7357a676f8c9d9 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Fri, 12 Apr 2024 13:26:22 -0400 Subject: [PATCH 06/99] simplify sqsReader loop --- x-pack/filebeat/input/awss3/input.go | 3 +- .../input/awss3/input_benchmark_test.go | 6 +- x-pack/filebeat/input/awss3/sqs.go | 107 ++++++------------ x-pack/filebeat/input/awss3/sqs_test.go | 7 +- 4 files changed, 37 insertions(+), 86 deletions(-) diff --git a/x-pack/filebeat/input/awss3/input.go b/x-pack/filebeat/input/awss3/input.go index d044f98c0637..3b27c9f011c5 100644 --- a/x-pack/filebeat/input/awss3/input.go +++ b/x-pack/filebeat/input/awss3/input.go @@ -203,7 +203,8 @@ func (in *s3Input) runQueueReader( // Poll metrics periodically in the background go pollSqsWaitingMetric(ctx, receiver) - return receiver.Receive(ctx) + receiver.Receive(ctx) + return nil } func (in *s3Input) createSQSReceiver(ctx v2.Context, pipeline beat.Pipeline) (*sqsReader, error) { diff --git a/x-pack/filebeat/input/awss3/input_benchmark_test.go b/x-pack/filebeat/input/awss3/input_benchmark_test.go index e05e5b461ca6..895845a83613 100644 --- a/x-pack/filebeat/input/awss3/input_benchmark_test.go +++ b/x-pack/filebeat/input/awss3/input_benchmark_test.go @@ -233,11 +233,7 @@ func benchmarkInputSQS(t *testing.T, maxMessagesInflight int) testing.BenchmarkR b.ResetTimer() start := time.Now() - if err := sqsReader.Receive(ctx); err != nil { - if !errors.Is(err, context.DeadlineExceeded) { - t.Fatal(err) - } - } + sqsReader.Receive(ctx) b.StopTimer() elapsed := time.Since(start) diff --git a/x-pack/filebeat/input/awss3/sqs.go b/x-pack/filebeat/input/awss3/sqs.go index b686e6b65461..823d66ee0168 100644 --- a/x-pack/filebeat/input/awss3/sqs.go +++ b/x-pack/filebeat/input/awss3/sqs.go @@ -6,7 +6,6 @@ package awss3 import ( "context" - "errors" "strconv" "sync" "time" @@ -36,14 +35,6 @@ type sqsReader struct { // workerWg is used to wait on worker goroutines during shutdown workerWg sync.WaitGroup - - // This channel is used by wakeUpMainLoop() to signal to the main - // loop that a worker is ready for more data - wakeUpChan chan struct{} - - // If retryTimer is set, there was an error receiving SQS messages, - // and the run loop will not try again until the timer expires. - retryTimer *time.Timer } func newSQSReader(log *logp.Logger, metrics *inputMetrics, sqs sqsAPI, maxMessagesInflight int, msgHandler sqsProcessor) *sqsReader { @@ -58,17 +49,6 @@ func newSQSReader(log *logp.Logger, metrics *inputMetrics, sqs sqsAPI, maxMessag log: log, metrics: metrics, workChan: make(chan types.Message), - - // wakeUpChan is buffered so we can always trigger it without blocking, - // even if the main loop is in the middle of other work - wakeUpChan: make(chan struct{}, 1), - } -} - -func (r *sqsReader) wakeUpMainLoop() { - select { - case r.wakeUpChan <- struct{}{}: - default: } } @@ -85,27 +65,32 @@ func (r *sqsReader) sqsWorkerLoop(ctx context.Context) { } r.metrics.endSQSWorker(id) r.activeMessages.Dec() - // Notify the main loop that we're ready for more data, in case it's asleep - r.wakeUpMainLoop() } } func (r *sqsReader) getMessageBatch(ctx context.Context) []types.Message { // We read enough messages to bring activeMessages up to the total - // worker count - receiveCount := r.maxMessagesInflight - r.activeMessages.Load() - if receiveCount > 0 { - msgs, err := r.sqs.ReceiveMessage(ctx, receiveCount) - if err != nil && ctx.Err() == nil { - r.log.Warnw("SQS ReceiveMessage returned an error. 
Will retry after a short delay.", "error", err) - r.retryTimer = time.NewTimer(sqsRetryDelay) + // worker count (plus one, to unblock us when workers are ready for + // more messages) + receiveCount := r.maxMessagesInflight + 1 - r.activeMessages.Load() + if receiveCount <= 0 { + return nil + } + msgs, err := r.sqs.ReceiveMessage(ctx, receiveCount) + for err != nil && ctx.Err() == nil { + r.log.Warnw("SQS ReceiveMessage returned an error. Will retry after a short delay.", "error", err) + // Wait for the retry delay, but stop early if the context is cancelled. + select { + case <-ctx.Done(): + return nil + case <-time.After(sqsRetryDelay): } - r.activeMessages.Add(len(msgs)) - r.log.Debugf("Received %v SQS messages.", len(msgs)) - r.metrics.sqsMessagesReceivedTotal.Add(uint64(len(msgs))) - return msgs + msgs, err = r.sqs.ReceiveMessage(ctx, receiveCount) } - return nil + r.activeMessages.Add(len(msgs)) + r.log.Debugf("Received %v SQS messages.", len(msgs)) + r.metrics.sqsMessagesReceivedTotal.Add(uint64(len(msgs))) + return msgs } func (r *sqsReader) startWorkers(ctx context.Context) { @@ -120,54 +105,26 @@ func (r *sqsReader) startWorkers(ctx context.Context) { } } -func (r *sqsReader) Receive(ctx context.Context) error { - var msgs []types.Message - for ctx.Err() == nil { - // If we don't have any messages, and we aren't in a retry delay, - // try to read some - if len(msgs) == 0 && r.retryTimer == nil { - msgs = r.getMessageBatch(ctx) - } +// The main loop of the reader, that fetches messages from SQS +// and forwards them to workers via workChan. +func (r *sqsReader) Receive(ctx context.Context) { + r.startWorkers(ctx) - // Unblock the local work channel only if there are messages to send - var workChan chan types.Message - var nextMessage types.Message - if len(msgs) > 0 { - workChan = r.workChan - nextMessage = msgs[0] - } - - // Unblock the retry channel only if there's an active retry timer - var retryChan <-chan time.Time - if r.retryTimer != nil { - retryChan = r.retryTimer.C - } + for ctx.Err() == nil { + msgs := r.getMessageBatch(ctx) - select { - case <-ctx.Done(): - case workChan <- nextMessage: - msgs = msgs[1:] - case <-retryChan: - // The retry interval has elapsed, clear the timer so we can request - // new messages again - r.retryTimer = nil - case <-r.wakeUpChan: - // No need to do anything, this is just to unblock us when a worker is - // ready for more data + for _, msg := range msgs { + select { + case <-ctx.Done(): + case r.workChan <- msg: + } } } - // Close the work channel to signal to the workers that we're done + // Close the work channel to signal to the workers that we're done, + // then wait for them to finish. close(r.workChan) - - // Wait for all workers to finish. r.workerWg.Wait() - - if errors.Is(ctx.Err(), context.Canceled) { - // A canceled context is a normal shutdown. - return nil - } - return ctx.Err() } func (r *sqsReader) GetApproximateMessageCount(ctx context.Context) (int, error) { diff --git a/x-pack/filebeat/input/awss3/sqs_test.go b/x-pack/filebeat/input/awss3/sqs_test.go index 5eda5d1885e2..2ab261173d16 100644 --- a/x-pack/filebeat/input/awss3/sqs_test.go +++ b/x-pack/filebeat/input/awss3/sqs_test.go @@ -16,7 +16,6 @@ import ( "github.com/gofrs/uuid" "github.com/golang/mock/gomock" "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" "github.com/elastic/elastic-agent-libs/logp" ) @@ -73,8 +72,7 @@ func TestSQSReceiver(t *testing.T) { // Execute sqsReader and verify calls/state. 
receiver := newSQSReader(logp.NewLogger(inputName), nil, mockAPI, maxMessages, mockMsgHandler) - require.NoError(t, receiver.Receive(ctx)) - assert.Equal(t, maxMessages, receiver.workerSem.Available()) + receiver.Receive(ctx) }) t.Run("retry after ReceiveMessage error", func(t *testing.T) { @@ -106,8 +104,7 @@ func TestSQSReceiver(t *testing.T) { // Execute SQSReceiver and verify calls/state. receiver := newSQSReader(logp.NewLogger(inputName), nil, mockAPI, maxMessages, mockMsgHandler) - require.NoError(t, receiver.Receive(ctx)) - assert.Equal(t, maxMessages, receiver.workerSem.Available()) + receiver.Receive(ctx) }) } From 5f94e9b934f538082d257e6694ebff95792c2bf4 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Fri, 12 Apr 2024 13:35:35 -0400 Subject: [PATCH 07/99] adjust variable names --- x-pack/filebeat/input/awss3/sqs.go | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/x-pack/filebeat/input/awss3/sqs.go b/x-pack/filebeat/input/awss3/sqs.go index 823d66ee0168..6774ccdb8c22 100644 --- a/x-pack/filebeat/input/awss3/sqs.go +++ b/x-pack/filebeat/input/awss3/sqs.go @@ -68,15 +68,15 @@ func (r *sqsReader) sqsWorkerLoop(ctx context.Context) { } } -func (r *sqsReader) getMessageBatch(ctx context.Context) []types.Message { - // We read enough messages to bring activeMessages up to the total - // worker count (plus one, to unblock us when workers are ready for - // more messages) - receiveCount := r.maxMessagesInflight + 1 - r.activeMessages.Load() - if receiveCount <= 0 { +func (r *sqsReader) readMessages(ctx context.Context) []types.Message { + // We try to read enough messages to bring activeMessages up to the + // total worker count (plus one, to unblock us when workers are ready + // for more messages) + readCount := r.maxMessagesInflight + 1 - r.activeMessages.Load() + if readCount <= 0 { return nil } - msgs, err := r.sqs.ReceiveMessage(ctx, receiveCount) + msgs, err := r.sqs.ReceiveMessage(ctx, readCount) for err != nil && ctx.Err() == nil { r.log.Warnw("SQS ReceiveMessage returned an error. Will retry after a short delay.", "error", err) // Wait for the retry delay, but stop early if the context is cancelled. 
@@ -85,7 +85,7 @@ func (r *sqsReader) getMessageBatch(ctx context.Context) []types.Message { return nil case <-time.After(sqsRetryDelay): } - msgs, err = r.sqs.ReceiveMessage(ctx, receiveCount) + msgs, err = r.sqs.ReceiveMessage(ctx, readCount) } r.activeMessages.Add(len(msgs)) r.log.Debugf("Received %v SQS messages.", len(msgs)) @@ -111,7 +111,7 @@ func (r *sqsReader) Receive(ctx context.Context) { r.startWorkers(ctx) for ctx.Err() == nil { - msgs := r.getMessageBatch(ctx) + msgs := r.readMessages(ctx) for _, msg := range msgs { select { From b79726167e703bd01bedf6445206fa9d75cfcdfe Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Fri, 12 Apr 2024 14:04:34 -0400 Subject: [PATCH 08/99] remove unused parameter --- x-pack/filebeat/input/awss3/input.go | 4 ++-- .../filebeat/input/awss3/input_benchmark_test.go | 4 ++-- x-pack/filebeat/input/awss3/s3_objects.go | 2 +- x-pack/filebeat/input/awss3/s3_objects_test.go | 14 +++++++------- x-pack/filebeat/input/awss3/s3_test.go | 4 ++-- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/x-pack/filebeat/input/awss3/input.go b/x-pack/filebeat/input/awss3/input.go index 3b27c9f011c5..454c49f7dbcb 100644 --- a/x-pack/filebeat/input/awss3/input.go +++ b/x-pack/filebeat/input/awss3/input.go @@ -250,7 +250,7 @@ func (in *s3Input) createSQSReceiver(ctx v2.Context, pipeline beat.Pipeline) (*s } in.metrics = newInputMetrics(ctx.ID, nil, in.config.MaxNumberOfMessages) - s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), in.metrics, s3API, fileSelectors, in.config.BackupConfig, in.config.MaxNumberOfMessages) + s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), in.metrics, s3API, fileSelectors, in.config.BackupConfig) sqsMessageHandler := newSQSS3EventProcessor(log.Named("sqs_s3_event"), in.metrics, sqsAPI, script, in.config.VisibilityTimeout, in.config.SQSMaxReceiveCount, pipeline, s3EventHandlerFactory, in.config.MaxNumberOfMessages) @@ -325,7 +325,7 @@ func (in *s3Input) createS3Lister(ctx v2.Context, cancelCtx context.Context, cli fileSelectors = []fileSelectorConfig{{ReaderConfig: in.config.ReaderConfig}} } in.metrics = newInputMetrics(ctx.ID, nil, in.config.MaxNumberOfMessages) - s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), in.metrics, s3API, fileSelectors, in.config.BackupConfig, in.config.MaxNumberOfMessages) + s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), in.metrics, s3API, fileSelectors, in.config.BackupConfig) s3Poller := newS3Poller(log.Named("s3_poller"), in.metrics, s3API, diff --git a/x-pack/filebeat/input/awss3/input_benchmark_test.go b/x-pack/filebeat/input/awss3/input_benchmark_test.go index 895845a83613..9e1d288b8db2 100644 --- a/x-pack/filebeat/input/awss3/input_benchmark_test.go +++ b/x-pack/filebeat/input/awss3/input_benchmark_test.go @@ -217,7 +217,7 @@ func benchmarkInputSQS(t *testing.T, maxMessagesInflight int) testing.BenchmarkR pipeline := &fakePipeline{} conf := makeBenchmarkConfig(t) - s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), metrics, s3API, conf.FileSelectors, backupConfig{}, maxMessagesInflight) + s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), metrics, s3API, conf.FileSelectors, backupConfig{}) sqsMessageHandler := newSQSS3EventProcessor(log.Named("sqs_s3_event"), metrics, sqsAPI, nil, time.Minute, 5, pipeline, s3EventHandlerFactory, maxMessagesInflight) sqsReader := newSQSReader(log.Named("sqs"), metrics, sqsAPI, maxMessagesInflight, sqsMessageHandler) @@ -344,7 +344,7 @@ func 
benchmarkInputS3(t *testing.T, numberOfWorkers int) testing.BenchmarkResult return } - s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), metrics, s3API, config.FileSelectors, backupConfig{}, numberOfWorkers) + s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), metrics, s3API, config.FileSelectors, backupConfig{}) s3Poller := newS3Poller(logp.NewLogger(inputName), metrics, s3API, client, s3EventHandlerFactory, newStates(inputCtx), store, "bucket", listPrefix, "region", "provider", numberOfWorkers, time.Second) if err := s3Poller.Poll(ctx); err != nil { diff --git a/x-pack/filebeat/input/awss3/s3_objects.go b/x-pack/filebeat/input/awss3/s3_objects.go index 32911778336b..ef8d1994ac4a 100644 --- a/x-pack/filebeat/input/awss3/s3_objects.go +++ b/x-pack/filebeat/input/awss3/s3_objects.go @@ -43,7 +43,7 @@ type s3ObjectProcessorFactory struct { backupConfig backupConfig } -func newS3ObjectProcessorFactory(log *logp.Logger, metrics *inputMetrics, s3 s3API, sel []fileSelectorConfig, backupConfig backupConfig, maxWorkers int) *s3ObjectProcessorFactory { +func newS3ObjectProcessorFactory(log *logp.Logger, metrics *inputMetrics, s3 s3API, sel []fileSelectorConfig, backupConfig backupConfig) *s3ObjectProcessorFactory { if metrics == nil { // Metrics are optional. Initialize a stub. metrics = newInputMetrics("", nil, 0) diff --git a/x-pack/filebeat/input/awss3/s3_objects_test.go b/x-pack/filebeat/input/awss3/s3_objects_test.go index 6732c12e0579..0aa7470b172e 100644 --- a/x-pack/filebeat/input/awss3/s3_objects_test.go +++ b/x-pack/filebeat/input/awss3/s3_objects_test.go @@ -153,7 +153,7 @@ func TestS3ObjectProcessor(t *testing.T) { GetObject(gomock.Any(), gomock.Eq(s3Event.S3.Bucket.Name), gomock.Eq(s3Event.S3.Object.Key)). Return(nil, errFakeConnectivityFailure) - s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockS3API, nil, backupConfig{}, 1) + s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockS3API, nil, backupConfig{}) ack := awscommon.NewEventACKTracker(ctx) err := s3ObjProc.Create(ctx, logp.NewLogger(inputName), mockPublisher, ack, s3Event).ProcessS3Object() require.Error(t, err) @@ -175,7 +175,7 @@ func TestS3ObjectProcessor(t *testing.T) { GetObject(gomock.Any(), gomock.Eq(s3Event.S3.Bucket.Name), gomock.Eq(s3Event.S3.Object.Key)). 
Return(nil, nil) - s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockS3API, nil, backupConfig{}, 1) + s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockS3API, nil, backupConfig{}) ack := awscommon.NewEventACKTracker(ctx) err := s3ObjProc.Create(ctx, logp.NewLogger(inputName), mockPublisher, ack, s3Event).ProcessS3Object() require.Error(t, err) @@ -202,7 +202,7 @@ func TestS3ObjectProcessor(t *testing.T) { Times(2), ) - s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockS3API, nil, backupConfig{}, 1) + s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockS3API, nil, backupConfig{}) ack := awscommon.NewEventACKTracker(ctx) err := s3ObjProc.Create(ctx, logp.NewLogger(inputName), mockPublisher, ack, s3Event).ProcessS3Object() require.NoError(t, err) @@ -228,7 +228,7 @@ func TestS3ObjectProcessor(t *testing.T) { Return(nil, nil), ) - s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockS3API, nil, backupCfg, 1) + s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockS3API, nil, backupCfg) ack := awscommon.NewEventACKTracker(ctx) err := s3ObjProc.Create(ctx, logp.NewLogger(inputName), mockPublisher, ack, s3Event).FinalizeS3Object() require.NoError(t, err) @@ -258,7 +258,7 @@ func TestS3ObjectProcessor(t *testing.T) { Return(nil, nil), ) - s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockS3API, nil, backupCfg, 1) + s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockS3API, nil, backupCfg) ack := awscommon.NewEventACKTracker(ctx) err := s3ObjProc.Create(ctx, logp.NewLogger(inputName), mockPublisher, ack, s3Event).FinalizeS3Object() require.NoError(t, err) @@ -285,7 +285,7 @@ func TestS3ObjectProcessor(t *testing.T) { Return(nil, nil), ) - s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockS3API, nil, backupCfg, 1) + s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockS3API, nil, backupCfg) ack := awscommon.NewEventACKTracker(ctx) err := s3ObjProc.Create(ctx, logp.NewLogger(inputName), mockPublisher, ack, s3Event).FinalizeS3Object() require.NoError(t, err) @@ -331,7 +331,7 @@ func _testProcessS3Object(t testing.TB, file, contentType string, numEvents int, Times(numEvents), ) - s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockS3API, selectors, backupConfig{}, 1) + s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockS3API, selectors, backupConfig{}) ack := awscommon.NewEventACKTracker(ctx) err := s3ObjProc.Create(ctx, logp.NewLogger(inputName), mockPublisher, ack, s3Event).ProcessS3Object() diff --git a/x-pack/filebeat/input/awss3/s3_test.go b/x-pack/filebeat/input/awss3/s3_test.go index b94ba7cfb09b..b76f6dc1fb96 100644 --- a/x-pack/filebeat/input/awss3/s3_test.go +++ b/x-pack/filebeat/input/awss3/s3_test.go @@ -133,7 +133,7 @@ func TestS3Poller(t *testing.T) { GetObject(gomock.Any(), gomock.Eq(bucket), gomock.Eq("2024-02-08T08:35:00+00:02.json.gz")). 
Return(nil, errFakeConnectivityFailure) - s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockAPI, nil, backupConfig{}, numberOfWorkers) + s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockAPI, nil, backupConfig{}) receiver := newS3Poller(logp.NewLogger(inputName), nil, mockAPI, mockPublisher, s3ObjProc, newStates(inputCtx), store, bucket, "key", "region", "provider", numberOfWorkers, pollInterval) require.Error(t, context.DeadlineExceeded, receiver.Poll(ctx)) assert.Equal(t, numberOfWorkers, receiver.workerSem.Available()) @@ -256,7 +256,7 @@ func TestS3Poller(t *testing.T) { GetObject(gomock.Any(), gomock.Eq(bucket), gomock.Eq("key5")). Return(nil, errFakeConnectivityFailure) - s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockAPI, nil, backupConfig{}, numberOfWorkers) + s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockAPI, nil, backupConfig{}) receiver := newS3Poller(logp.NewLogger(inputName), nil, mockAPI, mockPublisher, s3ObjProc, newStates(inputCtx), store, bucket, "key", "region", "provider", numberOfWorkers, pollInterval) require.Error(t, context.DeadlineExceeded, receiver.Poll(ctx)) assert.Equal(t, numberOfWorkers, receiver.workerSem.Available()) From 88f39808c5c3e1074e3562f1f249385fbc912c02 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Fri, 12 Apr 2024 14:08:54 -0400 Subject: [PATCH 09/99] createS3Lister -> createS3Poller --- x-pack/filebeat/input/awss3/input.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/x-pack/filebeat/input/awss3/input.go b/x-pack/filebeat/input/awss3/input.go index 454c49f7dbcb..cf27303ea0cc 100644 --- a/x-pack/filebeat/input/awss3/input.go +++ b/x-pack/filebeat/input/awss3/input.go @@ -167,7 +167,7 @@ func (in *s3Input) runS3Poller( defer client.Close() // Create S3 receiver and S3 notification processor. 
- poller, err := in.createS3Lister(inputContext, ctx, client, persistentStore, states) + poller, err := in.createS3Poller(inputContext, ctx, client, persistentStore, states) if err != nil { return fmt.Errorf("failed to initialize s3 poller: %w", err) } @@ -267,7 +267,7 @@ func (n nonAWSBucketResolver) ResolveEndpoint(region string, options s3.Endpoint return awssdk.Endpoint{URL: n.endpoint, SigningRegion: region, HostnameImmutable: true, Source: awssdk.EndpointSourceCustom}, nil } -func (in *s3Input) createS3Lister(ctx v2.Context, cancelCtx context.Context, client beat.Client, persistentStore *statestore.Store, states *states) (*s3Poller, error) { +func (in *s3Input) createS3Poller(ctx v2.Context, cancelCtx context.Context, client beat.Client, persistentStore *statestore.Store, states *states) (*s3Poller, error) { var bucketName string var bucketID string if in.config.NonAWSBucketName != "" { From 9f32df6cdd68e74e33679cdca25d63cc7039d3ed Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Fri, 12 Apr 2024 14:25:40 -0400 Subject: [PATCH 10/99] remove unused error checks --- x-pack/filebeat/input/awss3/input.go | 3 ++- .../input/awss3/input_benchmark_test.go | 6 +----- x-pack/filebeat/input/awss3/s3.go | 19 ++----------------- x-pack/filebeat/input/awss3/s3_test.go | 6 ++---- 4 files changed, 7 insertions(+), 27 deletions(-) diff --git a/x-pack/filebeat/input/awss3/input.go b/x-pack/filebeat/input/awss3/input.go index cf27303ea0cc..30bbc5279e29 100644 --- a/x-pack/filebeat/input/awss3/input.go +++ b/x-pack/filebeat/input/awss3/input.go @@ -173,7 +173,8 @@ func (in *s3Input) runS3Poller( } defer poller.metrics.Close() - return poller.Poll(ctx) + poller.Poll(ctx) + return nil } func (in *s3Input) runQueueReader( diff --git a/x-pack/filebeat/input/awss3/input_benchmark_test.go b/x-pack/filebeat/input/awss3/input_benchmark_test.go index 9e1d288b8db2..83e55b1051ba 100644 --- a/x-pack/filebeat/input/awss3/input_benchmark_test.go +++ b/x-pack/filebeat/input/awss3/input_benchmark_test.go @@ -347,11 +347,7 @@ func benchmarkInputS3(t *testing.T, numberOfWorkers int) testing.BenchmarkResult s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), metrics, s3API, config.FileSelectors, backupConfig{}) s3Poller := newS3Poller(logp.NewLogger(inputName), metrics, s3API, client, s3EventHandlerFactory, newStates(inputCtx), store, "bucket", listPrefix, "region", "provider", numberOfWorkers, time.Second) - if err := s3Poller.Poll(ctx); err != nil { - if !errors.Is(err, context.DeadlineExceeded) { - errChan <- err - } - } + s3Poller.Poll(ctx) }(i, wg) } diff --git a/x-pack/filebeat/input/awss3/s3.go b/x-pack/filebeat/input/awss3/s3.go index 5aa8d31e95de..81842a9ba524 100644 --- a/x-pack/filebeat/input/awss3/s3.go +++ b/x-pack/filebeat/input/awss3/s3.go @@ -6,7 +6,6 @@ package awss3 import ( "context" - "errors" "fmt" "sync" "time" @@ -343,7 +342,7 @@ func (p *s3Poller) Purge(ctx context.Context) { } } -func (p *s3Poller) Poll(ctx context.Context) error { +func (p *s3Poller) Poll(ctx context.Context) { // This loop tries to keep the workers busy as much as possible while // honoring the number in config opposed to a simpler loop that does one // listing, sequentially processes every object and then does another listing @@ -384,23 +383,9 @@ func (p *s3Poller) Poll(ctx context.Context) error { }() } - err = timed.Wait(ctx, p.bucketPollInterval) - if err != nil { - if errors.Is(err, context.Canceled) { - // A canceled context is a normal shutdown. 
- return nil - } - - return err - } + timed.Wait(ctx, p.bucketPollInterval) } // Wait for all workers to finish. workerWg.Wait() - - if errors.Is(ctx.Err(), context.Canceled) { - // A canceled context is a normal shutdown. - return nil - } - return ctx.Err() } diff --git a/x-pack/filebeat/input/awss3/s3_test.go b/x-pack/filebeat/input/awss3/s3_test.go index b76f6dc1fb96..2c0281474b5d 100644 --- a/x-pack/filebeat/input/awss3/s3_test.go +++ b/x-pack/filebeat/input/awss3/s3_test.go @@ -14,7 +14,6 @@ import ( "github.com/aws/aws-sdk-go-v2/service/s3/types" "github.com/golang/mock/gomock" "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" "github.com/elastic/beats/v7/libbeat/statestore" "github.com/elastic/beats/v7/libbeat/statestore/storetest" @@ -135,7 +134,7 @@ func TestS3Poller(t *testing.T) { s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockAPI, nil, backupConfig{}) receiver := newS3Poller(logp.NewLogger(inputName), nil, mockAPI, mockPublisher, s3ObjProc, newStates(inputCtx), store, bucket, "key", "region", "provider", numberOfWorkers, pollInterval) - require.Error(t, context.DeadlineExceeded, receiver.Poll(ctx)) + receiver.Poll(ctx) assert.Equal(t, numberOfWorkers, receiver.workerSem.Available()) }) @@ -258,7 +257,6 @@ func TestS3Poller(t *testing.T) { s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockAPI, nil, backupConfig{}) receiver := newS3Poller(logp.NewLogger(inputName), nil, mockAPI, mockPublisher, s3ObjProc, newStates(inputCtx), store, bucket, "key", "region", "provider", numberOfWorkers, pollInterval) - require.Error(t, context.DeadlineExceeded, receiver.Poll(ctx)) - assert.Equal(t, numberOfWorkers, receiver.workerSem.Available()) + receiver.Poll(ctx) }) } From 48ec82a03ba74dfae951f591272eb8aa1a22846e Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Fri, 12 Apr 2024 15:11:02 -0400 Subject: [PATCH 11/99] cleanup --- x-pack/filebeat/input/awss3/input.go | 11 ++--------- x-pack/filebeat/input/awss3/s3.go | 2 +- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/x-pack/filebeat/input/awss3/input.go b/x-pack/filebeat/input/awss3/input.go index 30bbc5279e29..732dd7444427 100644 --- a/x-pack/filebeat/input/awss3/input.go +++ b/x-pack/filebeat/input/awss3/input.go @@ -127,18 +127,11 @@ func (in *s3Input) Run(inputContext v2.Context, pipeline beat.Pipeline) error { defer cancelInputCtx() if in.config.QueueURL != "" { - err = in.runQueueReader(ctx, inputContext, pipeline) - if err != nil { - // possibly this should be unconditional? - return err - } + return in.runQueueReader(ctx, inputContext, pipeline) } if in.config.BucketARN != "" || in.config.NonAWSBucketName != "" { - err = in.runS3Poller(ctx, inputContext, pipeline, persistentStore, states) - if err != nil { - return err - } + return in.runS3Poller(ctx, inputContext, pipeline, persistentStore, states) } return nil diff --git a/x-pack/filebeat/input/awss3/s3.go b/x-pack/filebeat/input/awss3/s3.go index 81842a9ba524..9c73e3d0925f 100644 --- a/x-pack/filebeat/input/awss3/s3.go +++ b/x-pack/filebeat/input/awss3/s3.go @@ -383,7 +383,7 @@ func (p *s3Poller) Poll(ctx context.Context) { }() } - timed.Wait(ctx, p.bucketPollInterval) + _ = timed.Wait(ctx, p.bucketPollInterval) } // Wait for all workers to finish. 
From 58e084a0d27a1f3e8da9e570e7ac62b33f24f747 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Fri, 12 Apr 2024 15:28:37 -0400 Subject: [PATCH 12/99] make a wrapper for v2.Canceler that doesn't use an extra goroutine --- filebeat/input/v2/input.go | 19 +++++++++++++++++++ x-pack/filebeat/input/awss3/input.go | 12 +----------- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/filebeat/input/v2/input.go b/filebeat/input/v2/input.go index f816e285eb32..3667802cc806 100644 --- a/filebeat/input/v2/input.go +++ b/filebeat/input/v2/input.go @@ -18,6 +18,9 @@ package v2 import ( + "context" + "time" + "github.com/elastic/beats/v7/libbeat/beat" conf "github.com/elastic/elastic-agent-libs/config" "github.com/elastic/elastic-agent-libs/logp" @@ -111,3 +114,19 @@ type Canceler interface { Done() <-chan struct{} Err() error } + +func GoContextFromCanceler(c Canceler) context.Context { + return cancelerCtx{c} +} + +type cancelerCtx struct { + Canceler +} + +func (c cancelerCtx) Deadline() (deadline time.Time, ok bool) { + return time.Time{}, false +} + +func (c cancelerCtx) Value(_ any) any { + return nil +} diff --git a/x-pack/filebeat/input/awss3/input.go b/x-pack/filebeat/input/awss3/input.go index 732dd7444427..153e70f474d8 100644 --- a/x-pack/filebeat/input/awss3/input.go +++ b/x-pack/filebeat/input/awss3/input.go @@ -114,17 +114,7 @@ func (in *s3Input) Run(inputContext v2.Context, pipeline beat.Pipeline) error { return fmt.Errorf("can not start persistent store: %w", err) } - // Wrap input Context's cancellation Done channel a context.Context. This - // goroutine stops with the parent closes the Done channel. - ctx, cancelInputCtx := context.WithCancel(context.Background()) - go func() { - defer cancelInputCtx() - select { - case <-inputContext.Cancelation.Done(): - case <-ctx.Done(): - } - }() - defer cancelInputCtx() + ctx := v2.GoContextFromCanceler(inputContext.Cancelation) if in.config.QueueURL != "" { return in.runQueueReader(ctx, inputContext, pipeline) From 1974f8fb9ecd0067e31f4be29eba39fda2f23d7a Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Fri, 12 Apr 2024 15:45:06 -0400 Subject: [PATCH 13/99] remove unused parameter --- x-pack/filebeat/input/awss3/input.go | 7 ++----- .../filebeat/input/awss3/input_benchmark_test.go | 2 +- x-pack/filebeat/input/awss3/sqs_s3_event.go | 1 - x-pack/filebeat/input/awss3/sqs_s3_event_test.go | 16 ++++++++-------- 4 files changed, 11 insertions(+), 15 deletions(-) diff --git a/x-pack/filebeat/input/awss3/input.go b/x-pack/filebeat/input/awss3/input.go index 153e70f474d8..494636c6d669 100644 --- a/x-pack/filebeat/input/awss3/input.go +++ b/x-pack/filebeat/input/awss3/input.go @@ -99,13 +99,12 @@ func (in *s3Input) Test(ctx v2.TestContext) error { } func (in *s3Input) Run(inputContext v2.Context, pipeline beat.Pipeline) error { - var err error + ctx := v2.GoContextFromCanceler(inputContext.Cancelation) persistentStore, err := in.store.Access() if err != nil { return fmt.Errorf("can not access persistent store: %w", err) } - defer persistentStore.Close() states := newStates(inputContext) @@ -114,8 +113,6 @@ func (in *s3Input) Run(inputContext v2.Context, pipeline beat.Pipeline) error { return fmt.Errorf("can not start persistent store: %w", err) } - ctx := v2.GoContextFromCanceler(inputContext.Cancelation) - if in.config.QueueURL != "" { return in.runQueueReader(ctx, inputContext, pipeline) } @@ -236,7 +233,7 @@ func (in *s3Input) createSQSReceiver(ctx v2.Context, pipeline beat.Pipeline) (*s s3EventHandlerFactory := 
newS3ObjectProcessorFactory(log.Named("s3"), in.metrics, s3API, fileSelectors, in.config.BackupConfig) - sqsMessageHandler := newSQSS3EventProcessor(log.Named("sqs_s3_event"), in.metrics, sqsAPI, script, in.config.VisibilityTimeout, in.config.SQSMaxReceiveCount, pipeline, s3EventHandlerFactory, in.config.MaxNumberOfMessages) + sqsMessageHandler := newSQSS3EventProcessor(log.Named("sqs_s3_event"), in.metrics, sqsAPI, script, in.config.VisibilityTimeout, in.config.SQSMaxReceiveCount, pipeline, s3EventHandlerFactory) sqsReader := newSQSReader(log.Named("sqs"), in.metrics, sqsAPI, in.config.MaxNumberOfMessages, sqsMessageHandler) diff --git a/x-pack/filebeat/input/awss3/input_benchmark_test.go b/x-pack/filebeat/input/awss3/input_benchmark_test.go index 83e55b1051ba..9a97de8a19a3 100644 --- a/x-pack/filebeat/input/awss3/input_benchmark_test.go +++ b/x-pack/filebeat/input/awss3/input_benchmark_test.go @@ -218,7 +218,7 @@ func benchmarkInputSQS(t *testing.T, maxMessagesInflight int) testing.BenchmarkR conf := makeBenchmarkConfig(t) s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), metrics, s3API, conf.FileSelectors, backupConfig{}) - sqsMessageHandler := newSQSS3EventProcessor(log.Named("sqs_s3_event"), metrics, sqsAPI, nil, time.Minute, 5, pipeline, s3EventHandlerFactory, maxMessagesInflight) + sqsMessageHandler := newSQSS3EventProcessor(log.Named("sqs_s3_event"), metrics, sqsAPI, nil, time.Minute, 5, pipeline, s3EventHandlerFactory) sqsReader := newSQSReader(log.Named("sqs"), metrics, sqsAPI, maxMessagesInflight, sqsMessageHandler) ctx, cancel := context.WithCancel(context.Background()) diff --git a/x-pack/filebeat/input/awss3/sqs_s3_event.go b/x-pack/filebeat/input/awss3/sqs_s3_event.go index 7f95cf564c09..db893e443ac3 100644 --- a/x-pack/filebeat/input/awss3/sqs_s3_event.go +++ b/x-pack/filebeat/input/awss3/sqs_s3_event.go @@ -104,7 +104,6 @@ func newSQSS3EventProcessor( maxReceiveCount int, pipeline beat.Pipeline, s3 s3ObjectHandlerFactory, - maxWorkers int, ) *sqsS3EventProcessor { if metrics == nil { // Metrics are optional. Initialize a stub. 
diff --git a/x-pack/filebeat/input/awss3/sqs_s3_event_test.go b/x-pack/filebeat/input/awss3/sqs_s3_event_test.go index 5ecd72fc4c91..65552525136d 100644 --- a/x-pack/filebeat/input/awss3/sqs_s3_event_test.go +++ b/x-pack/filebeat/input/awss3/sqs_s3_event_test.go @@ -50,7 +50,7 @@ func TestSQSS3EventProcessor(t *testing.T) { mockAPI.EXPECT().DeleteMessage(gomock.Any(), gomock.Eq(&msg)).Return(nil), ) - p := newSQSS3EventProcessor(logp.NewLogger(inputName), nil, mockAPI, nil, time.Minute, 5, mockBeatPipeline, mockS3HandlerFactory, 5) + p := newSQSS3EventProcessor(logp.NewLogger(inputName), nil, mockAPI, nil, time.Minute, 5, mockBeatPipeline, mockS3HandlerFactory) require.NoError(t, p.ProcessSQS(ctx, &msg)) }) @@ -73,7 +73,7 @@ func TestSQSS3EventProcessor(t *testing.T) { mockAPI.EXPECT().DeleteMessage(gomock.Any(), gomock.Eq(&invalidBodyMsg)).Return(nil), ) - p := newSQSS3EventProcessor(logp.NewLogger(inputName), nil, mockAPI, nil, time.Minute, 5, mockBeatPipeline, mockS3HandlerFactory, 5) + p := newSQSS3EventProcessor(logp.NewLogger(inputName), nil, mockAPI, nil, time.Minute, 5, mockBeatPipeline, mockS3HandlerFactory) err := p.ProcessSQS(ctx, &invalidBodyMsg) require.Error(t, err) t.Log(err) @@ -95,7 +95,7 @@ func TestSQSS3EventProcessor(t *testing.T) { mockAPI.EXPECT().DeleteMessage(gomock.Any(), gomock.Eq(&emptyRecordsMsg)).Return(nil), ) - p := newSQSS3EventProcessor(logp.NewLogger(inputName), nil, mockAPI, nil, time.Minute, 5, mockBeatPipeline, mockS3HandlerFactory, 5) + p := newSQSS3EventProcessor(logp.NewLogger(inputName), nil, mockAPI, nil, time.Minute, 5, mockBeatPipeline, mockS3HandlerFactory) require.NoError(t, p.ProcessSQS(ctx, &emptyRecordsMsg)) }) @@ -127,7 +127,7 @@ func TestSQSS3EventProcessor(t *testing.T) { mockS3Handler.EXPECT().FinalizeS3Object().Return(nil), ) - p := newSQSS3EventProcessor(logp.NewLogger(inputName), nil, mockAPI, nil, visibilityTimeout, 5, mockBeatPipeline, mockS3HandlerFactory, 5) + p := newSQSS3EventProcessor(logp.NewLogger(inputName), nil, mockAPI, nil, visibilityTimeout, 5, mockBeatPipeline, mockS3HandlerFactory) require.NoError(t, p.ProcessSQS(ctx, &msg)) }) @@ -150,7 +150,7 @@ func TestSQSS3EventProcessor(t *testing.T) { mockClient.EXPECT().Close(), ) - p := newSQSS3EventProcessor(logp.NewLogger(inputName), nil, mockAPI, nil, time.Minute, 5, mockBeatPipeline, mockS3HandlerFactory, 5) + p := newSQSS3EventProcessor(logp.NewLogger(inputName), nil, mockAPI, nil, time.Minute, 5, mockBeatPipeline, mockS3HandlerFactory) err := p.ProcessSQS(ctx, &msg) t.Log(err) require.Error(t, err) @@ -181,7 +181,7 @@ func TestSQSS3EventProcessor(t *testing.T) { mockAPI.EXPECT().DeleteMessage(gomock.Any(), gomock.Eq(&msg)).Return(nil), ) - p := newSQSS3EventProcessor(logp.NewLogger(inputName), nil, mockAPI, nil, time.Minute, 5, mockBeatPipeline, mockS3HandlerFactory, 5) + p := newSQSS3EventProcessor(logp.NewLogger(inputName), nil, mockAPI, nil, time.Minute, 5, mockBeatPipeline, mockS3HandlerFactory) err := p.ProcessSQS(ctx, &msg) t.Log(err) require.Error(t, err) @@ -227,7 +227,7 @@ func TestSqsProcessor_keepalive(t *testing.T) { mockAPI.EXPECT().ChangeMessageVisibility(gomock.Any(), gomock.Eq(&msg), gomock.Eq(visibilityTimeout)). 
Times(1).Return(tc.Err) - p := newSQSS3EventProcessor(logp.NewLogger(inputName), nil, mockAPI, nil, visibilityTimeout, 5, mockBeatPipeline, mockS3HandlerFactory, 5) + p := newSQSS3EventProcessor(logp.NewLogger(inputName), nil, mockAPI, nil, visibilityTimeout, 5, mockBeatPipeline, mockS3HandlerFactory) var wg sync.WaitGroup wg.Add(1) p.keepalive(ctx, p.log, &wg, &msg) @@ -239,7 +239,7 @@ func TestSqsProcessor_keepalive(t *testing.T) { func TestSqsProcessor_getS3Notifications(t *testing.T) { logp.TestingSetup() - p := newSQSS3EventProcessor(logp.NewLogger(inputName), nil, nil, nil, time.Minute, 5, nil, nil, 5) + p := newSQSS3EventProcessor(logp.NewLogger(inputName), nil, nil, nil, time.Minute, 5, nil, nil) t.Run("s3 key is url unescaped", func(t *testing.T) { msg := newSQSMessage(newS3Event("Happy+Face.jpg")) From 646374c65464960baf2b8bf1aa3efac435e49a2a Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Fri, 12 Apr 2024 16:28:51 -0400 Subject: [PATCH 14/99] cleanup --- x-pack/filebeat/input/awss3/sqs.go | 51 ++++++++++++++++-------------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/x-pack/filebeat/input/awss3/sqs.go b/x-pack/filebeat/input/awss3/sqs.go index 6774ccdb8c22..0b005e484116 100644 --- a/x-pack/filebeat/input/awss3/sqs.go +++ b/x-pack/filebeat/input/awss3/sqs.go @@ -52,7 +52,32 @@ func newSQSReader(log *logp.Logger, metrics *inputMetrics, sqs sqsAPI, maxMessag } } -func (r *sqsReader) sqsWorkerLoop(ctx context.Context) { +// The main loop of the reader, that fetches messages from SQS +// and forwards them to workers via workChan. +func (r *sqsReader) Receive(ctx context.Context) { + r.startWorkers(ctx) + r.readerLoop(ctx) + + // Close the work channel to signal to the workers that we're done, + // then wait for them to finish. + close(r.workChan) + r.workerWg.Wait() +} + +func (r *sqsReader) readerLoop(ctx context.Context) { + for ctx.Err() == nil { + msgs := r.readMessages(ctx) + + for _, msg := range msgs { + select { + case <-ctx.Done(): + case r.workChan <- msg: + } + } + } +} + +func (r *sqsReader) workerLoop(ctx context.Context) { for msg := range r.workChan { start := time.Now() @@ -100,33 +125,11 @@ func (r *sqsReader) startWorkers(ctx context.Context) { r.workerWg.Add(1) go func() { defer r.workerWg.Done() - r.sqsWorkerLoop(ctx) + r.workerLoop(ctx) }() } } -// The main loop of the reader, that fetches messages from SQS -// and forwards them to workers via workChan. -func (r *sqsReader) Receive(ctx context.Context) { - r.startWorkers(ctx) - - for ctx.Err() == nil { - msgs := r.readMessages(ctx) - - for _, msg := range msgs { - select { - case <-ctx.Done(): - case r.workChan <- msg: - } - } - } - - // Close the work channel to signal to the workers that we're done, - // then wait for them to finish. 
- close(r.workChan) - r.workerWg.Wait() -} - func (r *sqsReader) GetApproximateMessageCount(ctx context.Context) (int, error) { attributes, err := r.sqs.GetQueueAttributes(ctx, []types.QueueAttributeName{sqsApproximateNumberOfMessages}) if err == nil { From a43cae69c702b1e7c9549e44f8192182114d879e Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Fri, 12 Apr 2024 19:54:25 -0400 Subject: [PATCH 15/99] remove redundant helper --- x-pack/filebeat/input/awss3/states.go | 38 +++++++-------------------- 1 file changed, 10 insertions(+), 28 deletions(-) diff --git a/x-pack/filebeat/input/awss3/states.go b/x-pack/filebeat/input/awss3/states.go index 449219a867f5..5def8d9500b7 100644 --- a/x-pack/filebeat/input/awss3/states.go +++ b/x-pack/filebeat/input/awss3/states.go @@ -91,8 +91,7 @@ func (s *states) Delete(id string) { s.Lock() defer s.Unlock() - index := s.findPrevious(id) - if index >= 0 { + if index, exists := s.idx[id]; exists { last := len(s.states) - 1 s.states[last], s.states[index] = s.states[index], s.states[last] s.states = s.states[:last] @@ -155,9 +154,8 @@ func (s *states) Update(newState state, listingID string) { defer s.Unlock() id := newState.ID - index := s.findPrevious(id) - if index >= 0 { + if index, exists := s.idx[id]; exists { s.states[index] = newState } else { // No existing state found, add new one @@ -205,11 +203,10 @@ func (s *states) FindPrevious(newState state) state { s.RLock() defer s.RUnlock() id := newState.ID - i := s.findPrevious(id) - if i < 0 { - return state{} + if i, exists := s.idx[id]; exists { + return s.states[i] } - return s.states[i] + return state{} } // FindPreviousByID lookups a registered state, that matching the id. @@ -217,33 +214,18 @@ func (s *states) FindPrevious(newState state) state { func (s *states) FindPreviousByID(id string) state { s.RLock() defer s.RUnlock() - i := s.findPrevious(id) - if i < 0 { - return state{} + if i, exists := s.idx[id]; exists { + return s.states[i] } - return s.states[i] + return state{} } func (s *states) IsNew(state state) bool { s.RLock() defer s.RUnlock() - id := state.ID - i := s.findPrevious(id) - - if i < 0 { - return true - } - return !s.states[i].IsEqual(&state) -} - -// findPrevious returns the previous state for the file. -// In case no previous state exists, index -1 is returned -func (s *states) findPrevious(id string) int { - if i, exists := s.idx[id]; exists { - return i - } - return -1 + i, exists := s.idx[state.ID] + return !exists || !s.states[i].IsEqual(&state) } // GetStates creates copy of the file states. From f46ef06338e016ad729d511074bf79701e33b186 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Fri, 12 Apr 2024 19:55:10 -0400 Subject: [PATCH 16/99] adjust variable names --- x-pack/filebeat/input/awss3/states.go | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/x-pack/filebeat/input/awss3/states.go b/x-pack/filebeat/input/awss3/states.go index 5def8d9500b7..7ade64669295 100644 --- a/x-pack/filebeat/input/awss3/states.go +++ b/x-pack/filebeat/input/awss3/states.go @@ -39,8 +39,8 @@ type states struct { // states store states []state - // idx maps state IDs to state indexes for fast lookup and modifications. - idx map[string]int + // indexForStateID maps state IDs to state indexes for fast lookup and modifications. 
+ indexForStateID map[string]int listingIDs map[string]struct{} listingInfo *sync.Map @@ -52,7 +52,7 @@ func newStates(ctx v2.Context) *states { return &states{ log: ctx.Logger.Named("states"), states: nil, - idx: map[string]int{}, + indexForStateID: map[string]int{}, listingInfo: new(sync.Map), listingIDs: map[string]struct{}{}, statesByListingID: map[string][]state{}, @@ -91,14 +91,14 @@ func (s *states) Delete(id string) { s.Lock() defer s.Unlock() - if index, exists := s.idx[id]; exists { + if index, exists := s.indexForStateID[id]; exists { last := len(s.states) - 1 s.states[last], s.states[index] = s.states[index], s.states[last] s.states = s.states[:last] - s.idx = map[string]int{} + s.indexForStateID = map[string]int{} for i, state := range s.states { - s.idx[state.ID] = i + s.indexForStateID[state.ID] = i } } } @@ -155,11 +155,11 @@ func (s *states) Update(newState state, listingID string) { id := newState.ID - if index, exists := s.idx[id]; exists { + if index, exists := s.indexForStateID[id]; exists { s.states[index] = newState } else { // No existing state found, add new one - s.idx[id] = len(s.states) + s.indexForStateID[id] = len(s.states) s.states = append(s.states, newState) s.log.Debug("New state added for ", newState.ID) } @@ -203,7 +203,7 @@ func (s *states) FindPrevious(newState state) state { s.RLock() defer s.RUnlock() id := newState.ID - if i, exists := s.idx[id]; exists { + if i, exists := s.indexForStateID[id]; exists { return s.states[i] } return state{} @@ -214,7 +214,7 @@ func (s *states) FindPrevious(newState state) state { func (s *states) FindPreviousByID(id string) state { s.RLock() defer s.RUnlock() - if i, exists := s.idx[id]; exists { + if i, exists := s.indexForStateID[id]; exists { return s.states[i] } return state{} @@ -224,7 +224,7 @@ func (s *states) IsNew(state state) bool { s.RLock() defer s.RUnlock() - i, exists := s.idx[state.ID] + i, exists := s.indexForStateID[state.ID] return !exists || !s.states[i].IsEqual(&state) } From d9be04b468f76b203d46efc38eb318be6fb30d97 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Fri, 12 Apr 2024 20:23:18 -0400 Subject: [PATCH 17/99] remove extra index indirection in state lookup --- x-pack/filebeat/input/awss3/states.go | 54 +++++++-------------------- 1 file changed, 13 insertions(+), 41 deletions(-) diff --git a/x-pack/filebeat/input/awss3/states.go b/x-pack/filebeat/input/awss3/states.go index 7ade64669295..d80588d2f48a 100644 --- a/x-pack/filebeat/input/awss3/states.go +++ b/x-pack/filebeat/input/awss3/states.go @@ -36,11 +36,8 @@ type states struct { log *logp.Logger - // states store - states []state - - // indexForStateID maps state IDs to state indexes for fast lookup and modifications. 
- indexForStateID map[string]int + // states store, keyed by ID + states map[string]state listingIDs map[string]struct{} listingInfo *sync.Map @@ -51,8 +48,7 @@ type states struct { func newStates(ctx v2.Context) *states { return &states{ log: ctx.Logger.Named("states"), - states: nil, - indexForStateID: map[string]int{}, + states: map[string]state{}, listingInfo: new(sync.Map), listingIDs: map[string]struct{}{}, statesByListingID: map[string][]state{}, @@ -91,16 +87,7 @@ func (s *states) Delete(id string) { s.Lock() defer s.Unlock() - if index, exists := s.indexForStateID[id]; exists { - last := len(s.states) - 1 - s.states[last], s.states[index] = s.states[index], s.states[last] - s.states = s.states[:last] - - s.indexForStateID = map[string]int{} - for i, state := range s.states { - s.indexForStateID[state.ID] = i - } - } + delete(s.states, id) } // IsListingFullyStored check if listing if fully stored @@ -153,16 +140,7 @@ func (s *states) Update(newState state, listingID string) { s.Lock() defer s.Unlock() - id := newState.ID - - if index, exists := s.indexForStateID[id]; exists { - s.states[index] = newState - } else { - // No existing state found, add new one - s.indexForStateID[id] = len(s.states) - s.states = append(s.states, newState) - s.log.Debug("New state added for ", newState.ID) - } + s.states[newState.ID] = newState if listingID == "" || !newState.IsProcessed() { return @@ -202,11 +180,7 @@ func (s *states) Update(newState state, listingID string) { func (s *states) FindPrevious(newState state) state { s.RLock() defer s.RUnlock() - id := newState.ID - if i, exists := s.indexForStateID[id]; exists { - return s.states[i] - } - return state{} + return s.states[newState.ID] } // FindPreviousByID lookups a registered state, that matching the id. @@ -214,18 +188,14 @@ func (s *states) FindPrevious(newState state) state { func (s *states) FindPreviousByID(id string) state { s.RLock() defer s.RUnlock() - if i, exists := s.indexForStateID[id]; exists { - return s.states[i] - } - return state{} + return s.states[id] } func (s *states) IsNew(state state) bool { s.RLock() defer s.RUnlock() - - i, exists := s.indexForStateID[state.ID] - return !exists || !s.states[i].IsEqual(&state) + oldState, exists := s.states[state.ID] + return !exists || !oldState.IsEqual(&state) } // GetStates creates copy of the file states. 
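
The net effect of this refactor is that the registry collapses to a single map keyed by state ID, guarded by the registry's RWMutex, so Update, Delete and the Find* lookups each reduce to one map operation. Below is a minimal, self-contained sketch of that shape; the names are illustrative only, the state record is a stand-in (the real one carries S3 object metadata plus the Stored/Error flags seen above), and the per-listing bookkeeping the real states type still performs is omitted.

package main

import (
	"fmt"
	"sync"
)

// state is a stand-in for the awss3 state record used in this sketch.
type state struct {
	ID     string
	Stored bool
}

// registry mirrors the simplified store: one map keyed by state ID behind a
// RWMutex, so update, delete and lookup are each a single map operation.
type registry struct {
	mu     sync.RWMutex
	states map[string]state
}

func newRegistry() *registry {
	return &registry{states: map[string]state{}}
}

func (r *registry) Update(s state) {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.states[s.ID] = s
}

func (r *registry) Delete(id string) {
	r.mu.Lock()
	defer r.mu.Unlock()
	delete(r.states, id)
}

// FindByID returns the zero state when the ID is unknown, matching the
// post-patch behavior of FindPreviousByID.
func (r *registry) FindByID(id string) state {
	r.mu.RLock()
	defer r.mu.RUnlock()
	return r.states[id]
}

func main() {
	r := newRegistry()
	r.Update(state{ID: "bucket/object", Stored: true})
	fmt.Println(r.FindByID("bucket/object")) // {bucket/object true}
	r.Delete("bucket/object")
	fmt.Println(r.FindByID("bucket/object")) // zero value after deletion
}
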
@@ -233,8 +203,10 @@ func (s *states) GetStates() []state { s.RLock() defer s.RUnlock() - newStates := make([]state, len(s.states)) - copy(newStates, s.states) + newStates := make([]state, 0, len(s.states)) + for _, state := range s.states { + newStates = append(newStates, state) + } return newStates } From 5e1fbccaa4030a699eafe3d7c8fc03c8a2da7f8f Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Fri, 12 Apr 2024 20:42:50 -0400 Subject: [PATCH 18/99] remove redundant sync.Map --- x-pack/filebeat/input/awss3/states.go | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/x-pack/filebeat/input/awss3/states.go b/x-pack/filebeat/input/awss3/states.go index d80588d2f48a..7fb2bc246fc0 100644 --- a/x-pack/filebeat/input/awss3/states.go +++ b/x-pack/filebeat/input/awss3/states.go @@ -40,7 +40,7 @@ type states struct { states map[string]state listingIDs map[string]struct{} - listingInfo *sync.Map + listingInfo map[string]*listingInfo statesByListingID map[string][]state } @@ -49,7 +49,7 @@ func newStates(ctx v2.Context) *states { return &states{ log: ctx.Logger.Named("states"), states: map[string]state{}, - listingInfo: new(sync.Map), + listingInfo: map[string]*listingInfo{}, listingIDs: map[string]struct{}{}, statesByListingID: map[string][]state{}, } @@ -93,12 +93,10 @@ func (s *states) Delete(id string) { // IsListingFullyStored check if listing if fully stored // After first time the condition is met it will always return false func (s *states) IsListingFullyStored(listingID string) bool { - info, ok := s.listingInfo.Load(listingID) - if !ok { - return false - } - listingInfo, ok := info.(*listingInfo) - if !ok { + s.RLock() + listingInfo := s.listingInfo[listingID] + s.RUnlock() + if listingInfo == nil { return false } @@ -123,7 +121,7 @@ func (s *states) AddListing(listingID string, listingInfo *listingInfo) { s.Lock() defer s.Unlock() s.listingIDs[listingID] = struct{}{} - s.listingInfo.Store(listingID, listingInfo) + s.listingInfo[listingID] = listingInfo } // DeleteListing delete listing info @@ -132,7 +130,7 @@ func (s *states) DeleteListing(listingID string) { defer s.Unlock() delete(s.listingIDs, listingID) delete(s.statesByListingID, listingID) - s.listingInfo.Delete(listingID) + delete(s.listingInfo, listingID) } // Update updates a state. If previous state didn't exist, new one is created @@ -147,11 +145,7 @@ func (s *states) Update(newState state, listingID string) { } // here we increase the number of stored object - info, ok := s.listingInfo.Load(listingID) - if !ok { - return - } - listingInfo, ok := info.(*listingInfo) + listingInfo, ok := s.listingInfo[listingID] if !ok { return } From c16a22f1483b905ee158f6f38bbf6ef46203902c Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Fri, 12 Apr 2024 20:50:58 -0400 Subject: [PATCH 19/99] merge redundant state maps --- x-pack/filebeat/input/awss3/states.go | 31 +++++++++++---------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/x-pack/filebeat/input/awss3/states.go b/x-pack/filebeat/input/awss3/states.go index 7fb2bc246fc0..44afb17f15f0 100644 --- a/x-pack/filebeat/input/awss3/states.go +++ b/x-pack/filebeat/input/awss3/states.go @@ -27,6 +27,7 @@ type listingInfo struct { storedObjects int errorObjects int finalCheck bool + states []state } // states handles list of s3 object state. 
One must use newStates to instantiate a @@ -39,19 +40,17 @@ type states struct { // states store, keyed by ID states map[string]state - listingIDs map[string]struct{} - listingInfo map[string]*listingInfo - statesByListingID map[string][]state + listingIDs map[string]struct{} + listingInfo map[string]*listingInfo } // newStates generates a new states registry. func newStates(ctx v2.Context) *states { return &states{ - log: ctx.Logger.Named("states"), - states: map[string]state{}, - listingInfo: map[string]*listingInfo{}, - listingIDs: map[string]struct{}{}, - statesByListingID: map[string][]state{}, + log: ctx.Logger.Named("states"), + states: map[string]state{}, + listingInfo: map[string]*listingInfo{}, + listingIDs: map[string]struct{}{}, } } @@ -129,7 +128,6 @@ func (s *states) DeleteListing(listingID string) { s.Lock() defer s.Unlock() delete(s.listingIDs, listingID) - delete(s.statesByListingID, listingID) delete(s.listingInfo, listingID) } @@ -155,18 +153,12 @@ func (s *states) Update(newState state, listingID string) { if newState.Stored { listingInfo.storedObjects++ } - if newState.Error { listingInfo.errorObjects++ } + listingInfo.states = append(listingInfo.states, newState) listingInfo.mu.Unlock() - - if _, ok := s.statesByListingID[listingID]; !ok { - s.statesByListingID[listingID] = make([]state, 0) - } - - s.statesByListingID[listingID] = append(s.statesByListingID[listingID], newState) } // FindPrevious lookups a registered state, that matching the new state. @@ -222,12 +214,13 @@ func (s *states) GetStatesByListingID(listingID string) []state { s.RLock() defer s.RUnlock() - if _, ok := s.statesByListingID[listingID]; !ok { + listingInfo, ok := s.listingInfo[listingID] + if !ok { return nil } - newStates := make([]state, len(s.statesByListingID[listingID])) - copy(newStates, s.statesByListingID[listingID]) + newStates := make([]state, len(listingInfo.states)) + copy(newStates, listingInfo.states) return newStates } From f07915a30032560c1661a2890cdf512a0de416ae Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Fri, 12 Apr 2024 21:04:40 -0400 Subject: [PATCH 20/99] remove redundant state map --- x-pack/filebeat/input/awss3/states.go | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/x-pack/filebeat/input/awss3/states.go b/x-pack/filebeat/input/awss3/states.go index 44afb17f15f0..6bbb1046f192 100644 --- a/x-pack/filebeat/input/awss3/states.go +++ b/x-pack/filebeat/input/awss3/states.go @@ -40,7 +40,6 @@ type states struct { // states store, keyed by ID states map[string]state - listingIDs map[string]struct{} listingInfo map[string]*listingInfo } @@ -50,7 +49,6 @@ func newStates(ctx v2.Context) *states { log: ctx.Logger.Named("states"), states: map[string]state{}, listingInfo: map[string]*listingInfo{}, - listingIDs: map[string]struct{}{}, } } @@ -119,7 +117,6 @@ func (s *states) IsListingFullyStored(listingID string) bool { func (s *states) AddListing(listingID string, listingInfo *listingInfo) { s.Lock() defer s.Unlock() - s.listingIDs[listingID] = struct{}{} s.listingInfo[listingID] = listingInfo } @@ -127,7 +124,6 @@ func (s *states) AddListing(listingID string, listingInfo *listingInfo) { func (s *states) DeleteListing(listingID string) { s.Lock() defer s.Unlock() - delete(s.listingIDs, listingID) delete(s.listingInfo, listingID) } @@ -201,8 +197,8 @@ func (s *states) GetStates() []state { func (s *states) GetListingIDs() []string { s.RLock() defer s.RUnlock() - listingIDs := make([]string, 0, len(s.listingIDs)) - for listingID := range 
s.listingIDs { + listingIDs := make([]string, 0, len(s.listingInfo)) + for listingID := range s.listingInfo { listingIDs = append(listingIDs, listingID) } @@ -212,13 +208,16 @@ func (s *states) GetListingIDs() []string { // GetStatesByListingID return a copy of the states by listing ID func (s *states) GetStatesByListingID(listingID string) []state { s.RLock() - defer s.RUnlock() - listingInfo, ok := s.listingInfo[listingID] + s.RUnlock() + if !ok { return nil } + listingInfo.mu.Lock() + defer listingInfo.mu.Unlock() + newStates := make([]state, len(listingInfo.states)) copy(newStates, listingInfo.states) return newStates From 0f483a326f64fc7ed9357661206c179fca73a006 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Fri, 12 Apr 2024 22:17:21 -0400 Subject: [PATCH 21/99] simplify s3Poller worker handling --- x-pack/filebeat/input/awss3/s3.go | 58 +++++++------------------------ 1 file changed, 13 insertions(+), 45 deletions(-) diff --git a/x-pack/filebeat/input/awss3/s3.go b/x-pack/filebeat/input/awss3/s3.go index 9c73e3d0925f..fc578b739a75 100644 --- a/x-pack/filebeat/input/awss3/s3.go +++ b/x-pack/filebeat/input/awss3/s3.go @@ -11,7 +11,6 @@ import ( "time" "github.com/gofrs/uuid" - "go.uber.org/multierr" "github.com/elastic/beats/v7/libbeat/beat" "github.com/elastic/beats/v7/libbeat/statestore" @@ -131,9 +130,7 @@ func (p *s3Poller) createS3ObjectProcessor(ctx context.Context, state state) (s3 return p.s3ObjectHandler.Create(ctx, p.log, p.client, acker, event), event } -func (p *s3Poller) ProcessObject(s3ObjectPayloadChan <-chan *s3ObjectPayload) error { - var errs []error - +func (p *s3Poller) ProcessObject(s3ObjectPayloadChan <-chan *s3ObjectPayload) { for s3ObjectPayload := range s3ObjectPayloadChan { // Process S3 object (download, parse, create events). err := s3ObjectPayload.s3ObjectHandler.ProcessS3Object() @@ -145,7 +142,7 @@ func (p *s3Poller) ProcessObject(s3ObjectPayloadChan <-chan *s3ObjectPayload) er if err != nil { event := s3ObjectPayload.s3ObjectEvent - errs = append(errs, + p.log.Warnw("processing S3 listing", "error", fmt.Errorf( fmt.Sprintf("failed processing S3 event for object key %q in bucket %q: %%w", event.S3.Object.Key, event.S3.Bucket.Name), @@ -160,13 +157,9 @@ func (p *s3Poller) ProcessObject(s3ObjectPayloadChan <-chan *s3ObjectPayload) er // Metrics p.metrics.s3ObjectsAckedTotal.Inc() } - - return multierr.Combine(errs...) } func (p *s3Poller) GetS3Objects(ctx context.Context, s3ObjectPayloadChan chan<- *s3ObjectPayload) { - defer close(s3ObjectPayloadChan) - bucketName := getBucketNameFromARN(p.bucket) circuitBreaker := 0 @@ -343,49 +336,24 @@ func (p *s3Poller) Purge(ctx context.Context) { } func (p *s3Poller) Poll(ctx context.Context) { - // This loop tries to keep the workers busy as much as possible while - // honoring the number in config opposed to a simpler loop that does one - // listing, sequentially processes every object and then does another listing - workerWg := new(sync.WaitGroup) - for ctx.Err() == nil { - // Determine how many S3 workers are available. 
- workers, err := p.workerSem.AcquireContext(p.numberOfWorkers, ctx) - if err != nil { - break - } + var workerWg sync.WaitGroup + s3ObjectPayloadChan := make(chan *s3ObjectPayload) - if workers == 0 { - continue - } - - s3ObjectPayloadChan := make(chan *s3ObjectPayload) - - workerWg.Add(1) + for i := 0; i < p.numberOfWorkers; i++ { go func() { - defer func() { - workerWg.Done() - }() - - p.GetS3Objects(ctx, s3ObjectPayloadChan) - p.Purge(ctx) + defer workerWg.Done() + p.ProcessObject(s3ObjectPayloadChan) }() + } - workerWg.Add(workers) - for i := 0; i < workers; i++ { - go func() { - defer func() { - workerWg.Done() - p.workerSem.Release(1) - }() - if err := p.ProcessObject(s3ObjectPayloadChan); err != nil { - p.log.Warnw("Failed processing S3 listing.", "error", err) - } - }() - } + for ctx.Err() == nil { + p.GetS3Objects(ctx, s3ObjectPayloadChan) + go p.Purge(ctx) _ = timed.Wait(ctx, p.bucketPollInterval) } - // Wait for all workers to finish. + // Close the workers' input channel and wait for all of them to finish. + close(s3ObjectPayloadChan) workerWg.Wait() } From a8cb6bda22d2cd8db2cca1933b2995e3cc96d2b8 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Fri, 12 Apr 2024 22:31:36 -0400 Subject: [PATCH 22/99] simplify waitgroup handling / unused errors --- x-pack/filebeat/input/awscloudwatch/input.go | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/x-pack/filebeat/input/awscloudwatch/input.go b/x-pack/filebeat/input/awscloudwatch/input.go index 4ee9daa05ad0..4bcc633bef43 100644 --- a/x-pack/filebeat/input/awscloudwatch/input.go +++ b/x-pack/filebeat/input/awscloudwatch/input.go @@ -6,7 +6,6 @@ package awscloudwatch import ( "context" - "errors" "fmt" "strings" "sync" @@ -143,10 +142,11 @@ func (in *cloudwatchInput) Run(inputContext v2.Context, pipeline beat.Pipeline) in.config.LogStreamPrefix) logProcessor := newLogProcessor(log.Named("log_processor"), in.metrics, client, ctx) cwPoller.metrics.logGroupsTotal.Add(uint64(len(logGroupNames))) - return in.Receive(svc, cwPoller, ctx, logProcessor, logGroupNames) + in.Receive(svc, cwPoller, ctx, logProcessor, logGroupNames) + return nil } -func (in *cloudwatchInput) Receive(svc *cloudwatchlogs.Client, cwPoller *cloudwatchPoller, ctx context.Context, logProcessor *logProcessor, logGroupNames []string) error { +func (in *cloudwatchInput) Receive(svc *cloudwatchlogs.Client, cwPoller *cloudwatchPoller, ctx context.Context, logProcessor *logProcessor, logGroupNames []string) { // This loop tries to keep the workers busy as much as possible while // honoring the number in config opposed to a simpler loop that does one // listing, sequentially processes every object and then does another listing @@ -173,7 +173,6 @@ func (in *cloudwatchInput) Receive(svc *cloudwatchlogs.Client, cwPoller *cloudwa continue } - workerWg.Add(availableWorkers) logGroupNamesLength := len(logGroupNames) runningGoroutines := 0 @@ -187,13 +186,11 @@ func (in *cloudwatchInput) Receive(svc *cloudwatchlogs.Client, cwPoller *cloudwa if lastLogGroupOffset >= logGroupNamesLength { // release unused workers cwPoller.workerSem.Release(availableWorkers - runningGoroutines) - for j := 0; j < availableWorkers-runningGoroutines; j++ { - workerWg.Done() - } lastLogGroupOffset = 0 } lg := logGroupNames[i] + workerWg.Add(1) go func(logGroup string, startTime int64, endTime int64) { defer func() { cwPoller.log.Infof("aws-cloudwatch input worker for log group '%v' has stopped.", logGroup) @@ -208,11 +205,6 @@ func (in *cloudwatchInput) Receive(svc 
*cloudwatchlogs.Client, cwPoller *cloudwa // Wait for all workers to finish. workerWg.Wait() - if errors.Is(ctx.Err(), context.Canceled) { - // A canceled context is a normal shutdown. - return nil - } - return ctx.Err() } func parseARN(logGroupARN string) (string, string, error) { From 78a7db4fe2bdf478256720e807fd4e15bb82ad3e Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Fri, 12 Apr 2024 22:43:26 -0400 Subject: [PATCH 23/99] clean up context handling --- x-pack/filebeat/input/awscloudwatch/input.go | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/x-pack/filebeat/input/awscloudwatch/input.go b/x-pack/filebeat/input/awscloudwatch/input.go index 4bcc633bef43..3a0de4f84748 100644 --- a/x-pack/filebeat/input/awscloudwatch/input.go +++ b/x-pack/filebeat/input/awscloudwatch/input.go @@ -95,19 +95,7 @@ func (in *cloudwatchInput) Test(ctx v2.TestContext) error { } func (in *cloudwatchInput) Run(inputContext v2.Context, pipeline beat.Pipeline) error { - var err error - - // Wrap input Context's cancellation Done channel a context.Context. This - // goroutine stops with the parent closes the Done channel. - ctx, cancelInputCtx := context.WithCancel(context.Background()) - go func() { - defer cancelInputCtx() - select { - case <-inputContext.Cancelation.Done(): - case <-ctx.Done(): - } - }() - defer cancelInputCtx() + ctx := v2.GoContextFromCanceler(inputContext.Cancelation) // Create client for publishing events and receive notification of their ACKs. client, err := pipeline.ConnectWith(beat.ClientConfig{ From edc1bd3da72a8670b54a9804a06eaf9004dc11a2 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Sat, 13 Apr 2024 08:19:17 -0400 Subject: [PATCH 24/99] adjust delay timer --- x-pack/filebeat/input/awscloudwatch/input.go | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/x-pack/filebeat/input/awscloudwatch/input.go b/x-pack/filebeat/input/awscloudwatch/input.go index 3a0de4f84748..56e51521a0a3 100644 --- a/x-pack/filebeat/input/awscloudwatch/input.go +++ b/x-pack/filebeat/input/awscloudwatch/input.go @@ -138,17 +138,9 @@ func (in *cloudwatchInput) Receive(svc *cloudwatchlogs.Client, cwPoller *cloudwa // This loop tries to keep the workers busy as much as possible while // honoring the number in config opposed to a simpler loop that does one // listing, sequentially processes every object and then does another listing - start := true workerWg := new(sync.WaitGroup) lastLogGroupOffset := 0 for ctx.Err() == nil { - if !start { - cwPoller.log.Debugf("sleeping for %v before checking new logs", in.config.ScanFrequency) - time.Sleep(in.config.ScanFrequency) - cwPoller.log.Debug("done sleeping") - } - start = false - currentTime := time.Now() cwPoller.startTime, cwPoller.endTime = getStartPosition(in.config.StartPosition, currentTime, cwPoller.endTime, in.config.ScanFrequency, in.config.Latency) cwPoller.log.Debugf("start_position = %s, startTime = %v, endTime = %v", in.config.StartPosition, time.Unix(cwPoller.startTime/1000, 0), time.Unix(cwPoller.endTime/1000, 0)) @@ -189,6 +181,10 @@ func (in *cloudwatchInput) Receive(svc *cloudwatchlogs.Client, cwPoller *cloudwa cwPoller.run(svc, logGroup, startTime, endTime, logProcessor) }(lg, cwPoller.startTime, cwPoller.endTime) } + + cwPoller.log.Debugf("sleeping for %v before checking new logs", in.config.ScanFrequency) + time.Sleep(in.config.ScanFrequency) + cwPoller.log.Debug("done sleeping") } // Wait for all workers to finish. 
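
Stepping back: the sqsReader changes in PATCH 14 and the s3Poller.Poll rewrite in PATCH 21 converge on the same worker model — a fixed pool of goroutines drains a shared work channel, the producer loop feeds it until the context is cancelled, and shutdown is signaled by closing the channel and waiting for the pool. The following is a self-contained sketch of that pattern with illustrative names; it is not the Beats code, and the string "work items" stand in for SQS messages and S3 listings.

package main

import (
	"context"
	"fmt"
	"sync"
	"time"
)

// runPool sketches the shared worker model: a fixed pool of goroutines
// drains one work channel, the producer loop feeds it until the context is
// cancelled, and shutdown closes the channel and waits for the workers.
func runPool(ctx context.Context, workers int, produce func() []string, process func(string)) {
	workChan := make(chan string) // unbuffered: the producer blocks until a worker is free
	var wg sync.WaitGroup

	for i := 0; i < workers; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for item := range workChan {
				process(item)
			}
		}()
	}

	for ctx.Err() == nil {
		for _, item := range produce() {
			select {
			case <-ctx.Done(): // shutting down: drop the remaining items
			case workChan <- item:
			}
		}
	}

	// No more work is coming: closing the channel stops the workers once
	// they finish their current item, then wait for all of them.
	close(workChan)
	wg.Wait()
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
	defer cancel()

	runPool(ctx, 2,
		func() []string {
			time.Sleep(10 * time.Millisecond) // stand-in for polling SQS or listing S3
			return []string{"item-a", "item-b"}
		},
		func(item string) { fmt.Println("processed", item) })
}

Because the sketch's channel is unbuffered, a busy pool naturally applies backpressure to the producer, which is presumably why the custom semaphore helper removed later in this series is no longer needed.
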
From 1497be48c520b76f95c1eb3e4f38235f35cb44bc Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Sat, 13 Apr 2024 08:25:00 -0400 Subject: [PATCH 25/99] remove unused struct fields --- x-pack/filebeat/input/awscloudwatch/cloudwatch.go | 4 ---- x-pack/filebeat/input/awscloudwatch/input.go | 7 ++++--- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/x-pack/filebeat/input/awscloudwatch/cloudwatch.go b/x-pack/filebeat/input/awscloudwatch/cloudwatch.go index ca54721bd279..262b1d3c9c71 100644 --- a/x-pack/filebeat/input/awscloudwatch/cloudwatch.go +++ b/x-pack/filebeat/input/awscloudwatch/cloudwatch.go @@ -24,8 +24,6 @@ type cloudwatchPoller struct { region string logStreams []*string logStreamPrefix string - startTime int64 - endTime int64 workerSem *awscommon.Sem log *logp.Logger metrics *inputMetrics @@ -46,8 +44,6 @@ func newCloudwatchPoller(log *logp.Logger, metrics *inputMetrics, region: awsRegion, logStreams: logStreams, logStreamPrefix: logStreamPrefix, - startTime: int64(0), - endTime: int64(0), workerSem: awscommon.NewSem(numberOfWorkers), log: log, metrics: metrics, diff --git a/x-pack/filebeat/input/awscloudwatch/input.go b/x-pack/filebeat/input/awscloudwatch/input.go index 56e51521a0a3..33ddd5232970 100644 --- a/x-pack/filebeat/input/awscloudwatch/input.go +++ b/x-pack/filebeat/input/awscloudwatch/input.go @@ -140,10 +140,11 @@ func (in *cloudwatchInput) Receive(svc *cloudwatchlogs.Client, cwPoller *cloudwa // listing, sequentially processes every object and then does another listing workerWg := new(sync.WaitGroup) lastLogGroupOffset := 0 + var startTime, endTime int64 for ctx.Err() == nil { currentTime := time.Now() - cwPoller.startTime, cwPoller.endTime = getStartPosition(in.config.StartPosition, currentTime, cwPoller.endTime, in.config.ScanFrequency, in.config.Latency) - cwPoller.log.Debugf("start_position = %s, startTime = %v, endTime = %v", in.config.StartPosition, time.Unix(cwPoller.startTime/1000, 0), time.Unix(cwPoller.endTime/1000, 0)) + startTime, endTime = getStartPosition(in.config.StartPosition, currentTime, endTime, in.config.ScanFrequency, in.config.Latency) + cwPoller.log.Debugf("start_position = %s, startTime = %v, endTime = %v", in.config.StartPosition, time.Unix(startTime/1000, 0), time.Unix(endTime/1000, 0)) availableWorkers, err := cwPoller.workerSem.AcquireContext(in.config.NumberOfWorkers, ctx) if err != nil { break @@ -179,7 +180,7 @@ func (in *cloudwatchInput) Receive(svc *cloudwatchlogs.Client, cwPoller *cloudwa }() cwPoller.log.Infof("aws-cloudwatch input worker for log group: '%v' has started", logGroup) cwPoller.run(svc, logGroup, startTime, endTime, logProcessor) - }(lg, cwPoller.startTime, cwPoller.endTime) + }(lg, startTime, endTime) } cwPoller.log.Debugf("sleeping for %v before checking new logs", in.config.ScanFrequency) From a3e0dc823eb165545b3d5c05ac2ee1fbbdf7d9f1 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Sat, 13 Apr 2024 09:30:15 -0400 Subject: [PATCH 26/99] cleanup --- x-pack/filebeat/input/awscloudwatch/input.go | 25 ++++++++------------ x-pack/filebeat/input/awss3/s3.go | 2 ++ 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/x-pack/filebeat/input/awscloudwatch/input.go b/x-pack/filebeat/input/awscloudwatch/input.go index 33ddd5232970..e3787f15236f 100644 --- a/x-pack/filebeat/input/awscloudwatch/input.go +++ b/x-pack/filebeat/input/awscloudwatch/input.go @@ -139,11 +139,10 @@ func (in *cloudwatchInput) Receive(svc *cloudwatchlogs.Client, cwPoller *cloudwa // honoring the number in config opposed to a simpler 
loop that does one // listing, sequentially processes every object and then does another listing workerWg := new(sync.WaitGroup) - lastLogGroupOffset := 0 + nextLogGroupIndex := 0 var startTime, endTime int64 for ctx.Err() == nil { - currentTime := time.Now() - startTime, endTime = getStartPosition(in.config.StartPosition, currentTime, endTime, in.config.ScanFrequency, in.config.Latency) + startTime, endTime = in.getStartPosition(time.Now(), endTime) cwPoller.log.Debugf("start_position = %s, startTime = %v, endTime = %v", in.config.StartPosition, time.Unix(startTime/1000, 0), time.Unix(endTime/1000, 0)) availableWorkers, err := cwPoller.workerSem.AcquireContext(in.config.NumberOfWorkers, ctx) if err != nil { @@ -154,20 +153,19 @@ func (in *cloudwatchInput) Receive(svc *cloudwatchlogs.Client, cwPoller *cloudwa continue } - logGroupNamesLength := len(logGroupNames) runningGoroutines := 0 - for i := lastLogGroupOffset; i < logGroupNamesLength; i++ { + for i := nextLogGroupIndex; i < len(logGroupNames); i++ { if runningGoroutines >= availableWorkers { break } runningGoroutines++ - lastLogGroupOffset = i + 1 - if lastLogGroupOffset >= logGroupNamesLength { + nextLogGroupIndex = i + 1 + if nextLogGroupIndex >= len(logGroupNames) { // release unused workers cwPoller.workerSem.Release(availableWorkers - runningGoroutines) - lastLogGroupOffset = 0 + nextLogGroupIndex = 0 } lg := logGroupNames[i] @@ -234,13 +232,10 @@ func getLogGroupNames(svc *cloudwatchlogs.Client, logGroupNamePrefix string, log return logGroupNames, nil } -func getStartPosition(startPosition string, currentTime time.Time, endTime int64, scanFrequency time.Duration, latency time.Duration) (int64, int64) { - if latency != 0 { - // add latency if config is not 0 - currentTime = currentTime.Add(latency * -1) - } +func (in *cloudwatchInput) getStartPosition(currentTime time.Time, endTime int64) (int64, int64) { + currentTime = currentTime.Add(-in.config.Latency) - switch startPosition { + switch in.config.StartPosition { case "beginning": if endTime != int64(0) { return endTime, currentTime.UnixNano() / int64(time.Millisecond) @@ -250,7 +245,7 @@ func getStartPosition(startPosition string, currentTime time.Time, endTime int64 if endTime != int64(0) { return endTime, currentTime.UnixNano() / int64(time.Millisecond) } - return currentTime.Add(-scanFrequency).UnixNano() / int64(time.Millisecond), currentTime.UnixNano() / int64(time.Millisecond) + return currentTime.Add(-in.config.ScanFrequency).UnixNano() / int64(time.Millisecond), currentTime.UnixNano() / int64(time.Millisecond) } return 0, 0 } diff --git a/x-pack/filebeat/input/awss3/s3.go b/x-pack/filebeat/input/awss3/s3.go index fc578b739a75..7c4c8931b0c8 100644 --- a/x-pack/filebeat/input/awss3/s3.go +++ b/x-pack/filebeat/input/awss3/s3.go @@ -348,6 +348,8 @@ func (p *s3Poller) Poll(ctx context.Context) { for ctx.Err() == nil { p.GetS3Objects(ctx, s3ObjectPayloadChan) + // We purge in a goroutine since it will need to wait for acknowledgments + // for all objects in each listing go p.Purge(ctx) _ = timed.Wait(ctx, p.bucketPollInterval) From 219e8570e5401b661214758e352bdb5553926d14 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Mon, 15 Apr 2024 11:23:12 -0400 Subject: [PATCH 27/99] Refactor cloudwatch worker task allocation --- .../input/awscloudwatch/cloudwatch.go | 134 ++++++++++++++---- x-pack/filebeat/input/awscloudwatch/input.go | 101 +------------ .../input/awscloudwatch/input_test.go | 103 -------------- 3 files changed, 108 insertions(+), 230 deletions(-) diff --git 
a/x-pack/filebeat/input/awscloudwatch/cloudwatch.go b/x-pack/filebeat/input/awscloudwatch/cloudwatch.go index ca54721bd279..d85480891a0a 100644 --- a/x-pack/filebeat/input/awscloudwatch/cloudwatch.go +++ b/x-pack/filebeat/input/awscloudwatch/cloudwatch.go @@ -14,61 +14,69 @@ import ( awssdk "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/service/cloudwatchlogs" - awscommon "github.com/elastic/beats/v7/x-pack/libbeat/common/aws" "github.com/elastic/elastic-agent-libs/logp" ) type cloudwatchPoller struct { - numberOfWorkers int - apiSleep time.Duration + config config region string - logStreams []*string - logStreamPrefix string - startTime int64 - endTime int64 - workerSem *awscommon.Sem log *logp.Logger metrics *inputMetrics workersListingMap *sync.Map workersProcessingMap *sync.Map + + // When a worker is ready for its next task, it should + // send to workRequestChan and then read from workResponseChan. + // The worker can cancel the request based on other context + // cancellations, but if the write succeeds it _must_ read from + // workResponseChan to avoid deadlocking the main loop. + workRequestChan chan struct{} + workResponseChan chan workResponse + + workerWg sync.WaitGroup +} + +type workResponse struct { + logGroup string + startTime, endTime time.Time } func newCloudwatchPoller(log *logp.Logger, metrics *inputMetrics, - awsRegion string, apiSleep time.Duration, - numberOfWorkers int, logStreams []*string, logStreamPrefix string) *cloudwatchPoller { + awsRegion string, config config) *cloudwatchPoller { if metrics == nil { metrics = newInputMetrics("", nil) } return &cloudwatchPoller{ - numberOfWorkers: numberOfWorkers, - apiSleep: apiSleep, - region: awsRegion, - logStreams: logStreams, - logStreamPrefix: logStreamPrefix, - startTime: int64(0), - endTime: int64(0), - workerSem: awscommon.NewSem(numberOfWorkers), log: log, metrics: metrics, + region: awsRegion, + config: config, workersListingMap: new(sync.Map), workersProcessingMap: new(sync.Map), + // workRequestChan is unbuffered to guarantee that + // the worker and main loop agree whether a request + // was sent. workerResponseChan is buffered so the + // main loop doesn't have to block on the workers + // while distributing new data. 
+ workRequestChan: make(chan struct{}), + workResponseChan: make(chan workResponse, 10), } } -func (p *cloudwatchPoller) run(svc *cloudwatchlogs.Client, logGroup string, startTime int64, endTime int64, logProcessor *logProcessor) { +func (p *cloudwatchPoller) run(svc *cloudwatchlogs.Client, logGroup string, startTime, endTime time.Time, logProcessor *logProcessor) { err := p.getLogEventsFromCloudWatch(svc, logGroup, startTime, endTime, logProcessor) if err != nil { var errRequestCanceled *awssdk.RequestCanceledError if errors.As(err, &errRequestCanceled) { - p.log.Error("getLogEventsFromCloudWatch failed with RequestCanceledError: ", err) + p.log.Error("getLogEventsFromCloudWatch failed with RequestCanceledError: ", errRequestCanceled) } p.log.Error("getLogEventsFromCloudWatch failed: ", err) } } // getLogEventsFromCloudWatch uses FilterLogEvents API to collect logs from CloudWatch -func (p *cloudwatchPoller) getLogEventsFromCloudWatch(svc *cloudwatchlogs.Client, logGroup string, startTime int64, endTime int64, logProcessor *logProcessor) error { +func (p *cloudwatchPoller) getLogEventsFromCloudWatch(svc *cloudwatchlogs.Client, logGroup string, startTime, endTime time.Time, logProcessor *logProcessor) error { // construct FilterLogEventsInput filterLogEventsInput := p.constructFilterLogEventsInput(startTime, endTime, logGroup) paginator := cloudwatchlogs.NewFilterLogEventsPaginator(svc, filterLogEventsInput) @@ -83,8 +91,8 @@ func (p *cloudwatchPoller) getLogEventsFromCloudWatch(svc *cloudwatchlogs.Client p.metrics.logEventsReceivedTotal.Add(uint64(len(logEvents))) // This sleep is to avoid hitting the FilterLogEvents API limit(5 transactions per second (TPS)/account/Region). - p.log.Debugf("sleeping for %v before making FilterLogEvents API call again", p.apiSleep) - time.Sleep(p.apiSleep) + p.log.Debugf("sleeping for %v before making FilterLogEvents API call again", p.config.APISleep) + time.Sleep(p.config.APISleep) p.log.Debug("done sleeping") p.log.Debugf("Processing #%v events", len(logEvents)) @@ -93,21 +101,87 @@ func (p *cloudwatchPoller) getLogEventsFromCloudWatch(svc *cloudwatchlogs.Client return nil } -func (p *cloudwatchPoller) constructFilterLogEventsInput(startTime int64, endTime int64, logGroup string) *cloudwatchlogs.FilterLogEventsInput { +func (p *cloudwatchPoller) constructFilterLogEventsInput(startTime, endTime time.Time, logGroup string) *cloudwatchlogs.FilterLogEventsInput { filterLogEventsInput := &cloudwatchlogs.FilterLogEventsInput{ LogGroupName: awssdk.String(logGroup), - StartTime: awssdk.Int64(startTime), - EndTime: awssdk.Int64(endTime), + StartTime: awssdk.Int64(startTime.UnixNano() / int64(time.Millisecond)), + EndTime: awssdk.Int64(endTime.UnixNano() / int64(time.Millisecond)), } - if len(p.logStreams) > 0 { - for _, stream := range p.logStreams { + if len(p.config.LogStreams) > 0 { + for _, stream := range p.config.LogStreams { filterLogEventsInput.LogStreamNames = append(filterLogEventsInput.LogStreamNames, *stream) } } - if p.logStreamPrefix != "" { - filterLogEventsInput.LogStreamNamePrefix = awssdk.String(p.logStreamPrefix) + if p.config.LogStreamPrefix != "" { + filterLogEventsInput.LogStreamNamePrefix = awssdk.String(p.config.LogStreamPrefix) } return filterLogEventsInput } + +func (p *cloudwatchPoller) startWorkers( + ctx context.Context, + svc *cloudwatchlogs.Client, + logProcessor *logProcessor, +) { + for i := 0; i < p.config.NumberOfWorkers; i++ { + p.workerWg.Add(1) + go func() { + defer p.workerWg.Done() + for { + var work workResponse + select 
{ + case <-ctx.Done(): + return + case p.workRequestChan <- struct{}{}: + work = <-p.workResponseChan + } + + p.log.Infof("aws-cloudwatch input worker for log group: '%v' has started", work.logGroup) + p.run(svc, work.logGroup, work.startTime, work.endTime, logProcessor) + p.log.Infof("aws-cloudwatch input worker for log group '%v' has stopped.", work.logGroup) + } + }() + } +} + +// receive implements the main run loop that distributes tasks to the worker +// goroutines. It accepts a "clock" callback (which on a live input should +// equal time.Now) to allow deterministic unit tests. +func (p *cloudwatchPoller) receive(ctx context.Context, logGroupNames []string, clock func() time.Time) { + defer p.workerWg.Wait() + // startTime and endTime are the bounds of the current scanning interval. + // If we're starting at the end of the logs, advance the start time to the + // most recent scan window + var startTime time.Time + endTime := clock().Add(-p.config.Latency) + if p.config.StartPosition == "end" { + startTime = endTime.Add(-p.config.ScanFrequency) + } + for ctx.Err() == nil { + for _, lg := range logGroupNames { + select { + case <-ctx.Done(): + return + case <-p.workRequestChan: + p.workResponseChan <- workResponse{ + logGroup: lg, + startTime: startTime, + endTime: endTime, + } + } + } + + // Delay for ScanFrequency after finishing a time span + p.log.Debugf("sleeping for %v before checking new logs", p.config.ScanFrequency) + select { + case <-time.After(p.config.ScanFrequency): + case <-ctx.Done(): + } + p.log.Debug("done sleeping") + + // Advance to the next time span + startTime, endTime = endTime, clock().Add(-p.config.Latency) + } +} diff --git a/x-pack/filebeat/input/awscloudwatch/input.go b/x-pack/filebeat/input/awscloudwatch/input.go index 4ee9daa05ad0..75f22e6625af 100644 --- a/x-pack/filebeat/input/awscloudwatch/input.go +++ b/x-pack/filebeat/input/awscloudwatch/input.go @@ -6,10 +6,8 @@ package awscloudwatch import ( "context" - "errors" "fmt" "strings" - "sync" "time" awssdk "github.com/aws/aws-sdk-go-v2/aws" @@ -137,82 +135,12 @@ func (in *cloudwatchInput) Run(inputContext v2.Context, pipeline beat.Pipeline) log.Named("cloudwatch_poller"), in.metrics, in.awsConfig.Region, - in.config.APISleep, - in.config.NumberOfWorkers, - in.config.LogStreams, - in.config.LogStreamPrefix) + in.config) logProcessor := newLogProcessor(log.Named("log_processor"), in.metrics, client, ctx) cwPoller.metrics.logGroupsTotal.Add(uint64(len(logGroupNames))) - return in.Receive(svc, cwPoller, ctx, logProcessor, logGroupNames) -} - -func (in *cloudwatchInput) Receive(svc *cloudwatchlogs.Client, cwPoller *cloudwatchPoller, ctx context.Context, logProcessor *logProcessor, logGroupNames []string) error { - // This loop tries to keep the workers busy as much as possible while - // honoring the number in config opposed to a simpler loop that does one - // listing, sequentially processes every object and then does another listing - start := true - workerWg := new(sync.WaitGroup) - lastLogGroupOffset := 0 - for ctx.Err() == nil { - if !start { - cwPoller.log.Debugf("sleeping for %v before checking new logs", in.config.ScanFrequency) - time.Sleep(in.config.ScanFrequency) - cwPoller.log.Debug("done sleeping") - } - start = false - - currentTime := time.Now() - cwPoller.startTime, cwPoller.endTime = getStartPosition(in.config.StartPosition, currentTime, cwPoller.endTime, in.config.ScanFrequency, in.config.Latency) - cwPoller.log.Debugf("start_position = %s, startTime = %v, endTime = %v", 
in.config.StartPosition, time.Unix(cwPoller.startTime/1000, 0), time.Unix(cwPoller.endTime/1000, 0)) - availableWorkers, err := cwPoller.workerSem.AcquireContext(in.config.NumberOfWorkers, ctx) - if err != nil { - break - } - - if availableWorkers == 0 { - continue - } - - workerWg.Add(availableWorkers) - logGroupNamesLength := len(logGroupNames) - runningGoroutines := 0 - - for i := lastLogGroupOffset; i < logGroupNamesLength; i++ { - if runningGoroutines >= availableWorkers { - break - } - - runningGoroutines++ - lastLogGroupOffset = i + 1 - if lastLogGroupOffset >= logGroupNamesLength { - // release unused workers - cwPoller.workerSem.Release(availableWorkers - runningGoroutines) - for j := 0; j < availableWorkers-runningGoroutines; j++ { - workerWg.Done() - } - lastLogGroupOffset = 0 - } - - lg := logGroupNames[i] - go func(logGroup string, startTime int64, endTime int64) { - defer func() { - cwPoller.log.Infof("aws-cloudwatch input worker for log group '%v' has stopped.", logGroup) - workerWg.Done() - cwPoller.workerSem.Release(1) - }() - cwPoller.log.Infof("aws-cloudwatch input worker for log group: '%v' has started", logGroup) - cwPoller.run(svc, logGroup, startTime, endTime, logProcessor) - }(lg, cwPoller.startTime, cwPoller.endTime) - } - } - - // Wait for all workers to finish. - workerWg.Wait() - if errors.Is(ctx.Err(), context.Canceled) { - // A canceled context is a normal shutdown. - return nil - } - return ctx.Err() + cwPoller.startWorkers(ctx, svc, logProcessor) + cwPoller.receive(ctx, logGroupNames, time.Now) + return nil } func parseARN(logGroupARN string) (string, string, error) { @@ -256,24 +184,3 @@ func getLogGroupNames(svc *cloudwatchlogs.Client, logGroupNamePrefix string, log } return logGroupNames, nil } - -func getStartPosition(startPosition string, currentTime time.Time, endTime int64, scanFrequency time.Duration, latency time.Duration) (int64, int64) { - if latency != 0 { - // add latency if config is not 0 - currentTime = currentTime.Add(latency * -1) - } - - switch startPosition { - case "beginning": - if endTime != int64(0) { - return endTime, currentTime.UnixNano() / int64(time.Millisecond) - } - return 0, currentTime.UnixNano() / int64(time.Millisecond) - case "end": - if endTime != int64(0) { - return endTime, currentTime.UnixNano() / int64(time.Millisecond) - } - return currentTime.Add(-scanFrequency).UnixNano() / int64(time.Millisecond), currentTime.UnixNano() / int64(time.Millisecond) - } - return 0, 0 -} diff --git a/x-pack/filebeat/input/awscloudwatch/input_test.go b/x-pack/filebeat/input/awscloudwatch/input_test.go index c51c6a072f4f..4f9754c6a130 100644 --- a/x-pack/filebeat/input/awscloudwatch/input_test.go +++ b/x-pack/filebeat/input/awscloudwatch/input_test.go @@ -15,109 +15,6 @@ import ( "github.com/elastic/elastic-agent-libs/mapstr" ) -func TestGetStartPosition(t *testing.T) { - currentTime := time.Date(2020, time.June, 1, 0, 0, 0, 0, time.UTC) - cases := []struct { - title string - startPosition string - prevEndTime int64 - scanFrequency time.Duration - latency time.Duration - expectedStartTime int64 - expectedEndTime int64 - }{ - { - "startPosition=beginning", - "beginning", - int64(0), - 30 * time.Second, - 0, - int64(0), - int64(1590969600000), - }, - { - "startPosition=end", - "end", - int64(0), - 30 * time.Second, - 0, - int64(1590969570000), - int64(1590969600000), - }, - { - "startPosition=typo", - "typo", - int64(0), - 30 * time.Second, - 0, - int64(0), - int64(0), - }, - { - "startPosition=beginning with prevEndTime", - 
"beginning", - int64(1590000000000), - 30 * time.Second, - 0, - int64(1590000000000), - int64(1590969600000), - }, - { - "startPosition=end with prevEndTime", - "end", - int64(1590000000000), - 30 * time.Second, - 0, - int64(1590000000000), - int64(1590969600000), - }, - { - "startPosition=beginning with latency", - "beginning", - int64(0), - 30 * time.Second, - 10 * time.Minute, - int64(0), - int64(1590969000000), - }, - { - "startPosition=beginning with prevEndTime and latency", - "beginning", - int64(1590000000000), - 30 * time.Second, - 10 * time.Minute, - int64(1590000000000), - int64(1590969000000), - }, - { - "startPosition=end with latency", - "end", - int64(0), - 30 * time.Second, - 10 * time.Minute, - int64(1590968970000), - int64(1590969000000), - }, - { - "startPosition=end with prevEndTime and latency", - "end", - int64(1590000000000), - 30 * time.Second, - 10 * time.Minute, - int64(1590000000000), - int64(1590969000000), - }, - } - - for _, c := range cases { - t.Run(c.title, func(t *testing.T) { - startTime, endTime := getStartPosition(c.startPosition, currentTime, c.prevEndTime, c.scanFrequency, c.latency) - assert.Equal(t, c.expectedStartTime, startTime) - assert.Equal(t, c.expectedEndTime, endTime) - }) - } -} - func TestCreateEvent(t *testing.T) { logEvent := &types.FilteredLogEvent{ EventId: awssdk.String("id-1"), From 977a0d3bf389ecff565e8d2e1288691def7d258e Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Mon, 15 Apr 2024 14:38:12 -0400 Subject: [PATCH 28/99] add unit tests for cloudwatchPoller.receive --- .../input/awscloudwatch/cloudwatch_test.go | 206 ++++++++++++++++++ 1 file changed, 206 insertions(+) create mode 100644 x-pack/filebeat/input/awscloudwatch/cloudwatch_test.go diff --git a/x-pack/filebeat/input/awscloudwatch/cloudwatch_test.go b/x-pack/filebeat/input/awscloudwatch/cloudwatch_test.go new file mode 100644 index 000000000000..e2af3c8d022c --- /dev/null +++ b/x-pack/filebeat/input/awscloudwatch/cloudwatch_test.go @@ -0,0 +1,206 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. + +package awscloudwatch + +import ( + "context" + "testing" + "time" + + "github.com/elastic/elastic-agent-libs/logp" + "github.com/stretchr/testify/assert" +) + +type clock struct { + time time.Time +} + +func (c *clock) now() time.Time { + return c.time +} + +type receiveTestStep struct { + expected []workResponse + nextTime time.Time +} + +type receiveTestCase struct { + name string + logGroups []string + configOverrides func(*config) + startTime time.Time + steps []receiveTestStep +} + +func TestReceive(t *testing.T) { + // We use a mocked clock so scan frequency can be any positive value. 
+ const defaultScanFrequency = time.Microsecond + t0 := time.Time{} + t1 := t0.Add(time.Hour) + t2 := t1.Add(time.Minute) + t3 := t2.Add(time.Hour) + testCases := []receiveTestCase{ + { + name: "Default config with one log group", + logGroups: []string{"a"}, + startTime: t1, + steps: []receiveTestStep{ + { + expected: []workResponse{ + {logGroup: "a", startTime: t0, endTime: t1}, + }, + nextTime: t2, + }, + { + expected: []workResponse{ + {logGroup: "a", startTime: t1, endTime: t2}, + }, + nextTime: t3, + }, + { + expected: []workResponse{ + {logGroup: "a", startTime: t2, endTime: t3}, + }, + }, + }, + }, + { + name: "Default config with two log groups", + logGroups: []string{"a", "b"}, + startTime: t1, + steps: []receiveTestStep{ + { + expected: []workResponse{ + {logGroup: "a", startTime: t0, endTime: t1}, + }, + nextTime: t2, + }, + { + expected: []workResponse{ + // start/end times for the second log group should be the same + // even though the clock has changed. + {logGroup: "b", startTime: t0, endTime: t1}, + }, + }, + { + expected: []workResponse{ + {logGroup: "a", startTime: t1, endTime: t2}, + {logGroup: "b", startTime: t1, endTime: t2}, + }, + nextTime: t3, + }, + { + expected: []workResponse{ + {logGroup: "a", startTime: t2, endTime: t3}, + {logGroup: "b", startTime: t2, endTime: t3}, + }, + }, + }, + }, + { + name: "One log group with start_position: end", + logGroups: []string{"a"}, + startTime: t1, + configOverrides: func(c *config) { + c.StartPosition = "end" + }, + steps: []receiveTestStep{ + { + expected: []workResponse{ + {logGroup: "a", startTime: t1.Add(-defaultScanFrequency), endTime: t1}, + }, + nextTime: t2, + }, + { + expected: []workResponse{ + {logGroup: "a", startTime: t1, endTime: t2}, + }, + }, + }, + }, + { + name: "Two log group with start_position: end and latency", + logGroups: []string{"a", "b"}, + startTime: t1, + configOverrides: func(c *config) { + c.StartPosition = "end" + c.Latency = time.Second + }, + steps: []receiveTestStep{ + { + expected: []workResponse{ + {logGroup: "a", startTime: t1.Add(-defaultScanFrequency - time.Second), endTime: t1.Add(-time.Second)}, + {logGroup: "b", startTime: t1.Add(-defaultScanFrequency - time.Second), endTime: t1.Add(-time.Second)}, + }, + nextTime: t2, + }, + { + expected: []workResponse{ + {logGroup: "a", startTime: t1.Add(-time.Second), endTime: t2.Add(-time.Second)}, + {logGroup: "b", startTime: t1.Add(-time.Second), endTime: t2.Add(-time.Second)}, + }, + }, + }, + }, + { + name: "Three log groups with latency", + logGroups: []string{"a", "b", "c"}, + startTime: t1, + configOverrides: func(c *config) { + c.Latency = time.Second + }, + steps: []receiveTestStep{ + { + expected: []workResponse{ + {logGroup: "a", startTime: t0, endTime: t1.Add(-time.Second)}, + {logGroup: "b", startTime: t0, endTime: t1.Add(-time.Second)}, + {logGroup: "c", startTime: t0, endTime: t1.Add(-time.Second)}, + }, + nextTime: t2, + }, + { + expected: []workResponse{ + {logGroup: "a", startTime: t1.Add(-time.Second), endTime: t2.Add(-time.Second)}, + {logGroup: "b", startTime: t1.Add(-time.Second), endTime: t2.Add(-time.Second)}, + {logGroup: "c", startTime: t1.Add(-time.Second), endTime: t2.Add(-time.Second)}, + }, + }, + }, + }, + } + clock := &clock{} + for stepIndex, test := range testCases { + ctx, cancel := context.WithCancel(context.Background()) + p := &cloudwatchPoller{ + workRequestChan: make(chan struct{}), + // Unlike the live cwPoller, we make workResponseChan unbuffered, + // so we can guarantee that clock updates happen when 
cwPoller has already + // decided on its output + workResponseChan: make(chan workResponse), + log: logp.NewLogger("test"), + } + + p.config = defaultConfig() + p.config.ScanFrequency = defaultScanFrequency + if test.configOverrides != nil { + test.configOverrides(&p.config) + } + clock.time = test.startTime + go p.receive(ctx, test.logGroups, clock.now) + for _, step := range test.steps { + for i, expected := range step.expected { + p.workRequestChan <- struct{}{} + if i+1 == len(step.expected) && !step.nextTime.Equal(time.Time{}) { + // On the last request of the step, we advance the clock if a + // time is set + clock.time = step.nextTime + } + response := <-p.workResponseChan + assert.Equalf(t, expected, response, "%v: step %v response %v doesn't match", test.name, stepIndex, i) + } + } + cancel() + } +} From 6cf55068404bdc58e32c63ea439aec17e5dcd446 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Mon, 15 Apr 2024 14:46:00 -0400 Subject: [PATCH 29/99] update changelog --- CHANGELOG.next.asciidoc | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.next.asciidoc b/CHANGELOG.next.asciidoc index 060d44d907a4..9445a43ccca9 100644 --- a/CHANGELOG.next.asciidoc +++ b/CHANGELOG.next.asciidoc @@ -129,6 +129,7 @@ https://github.com/elastic/beats/compare/v8.8.1\...main[Check the HEAD diff] - Fix panic when more than 32767 pipeline clients are active. {issue}38197[38197] {pull}38556[38556] - Fix filestream's registry GC: registry entries are now removed from the in-memory and disk store when they're older than the set TTL {issue}36761[36761] {pull}38488[38488] - [threatintel] MISP splitting fix for empty responses {issue}38739[38739] {pull}38917[38917] +- Fix a bug in cloudwatch task allocation that could skip some logs {issue}38918[38918] {pull}38953[38953] *Heartbeat* From ff2457149f1edc86384fb47db63b962f3cc2d2de Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Mon, 15 Apr 2024 14:54:12 -0400 Subject: [PATCH 30/99] make check --- x-pack/filebeat/input/awscloudwatch/cloudwatch_test.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/x-pack/filebeat/input/awscloudwatch/cloudwatch_test.go b/x-pack/filebeat/input/awscloudwatch/cloudwatch_test.go index e2af3c8d022c..2f8198c021dd 100644 --- a/x-pack/filebeat/input/awscloudwatch/cloudwatch_test.go +++ b/x-pack/filebeat/input/awscloudwatch/cloudwatch_test.go @@ -9,8 +9,9 @@ import ( "testing" "time" - "github.com/elastic/elastic-agent-libs/logp" "github.com/stretchr/testify/assert" + + "github.com/elastic/elastic-agent-libs/logp" ) type clock struct { From 2a6abb80e3310ec36a47addc64510180c23a5468 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Mon, 15 Apr 2024 15:32:07 -0400 Subject: [PATCH 31/99] Remove unused custom semaphore helper --- x-pack/filebeat/input/awss3/s3.go | 2 - x-pack/filebeat/input/awss3/s3_test.go | 2 - x-pack/libbeat/common/aws/semaphore.go | 78 --------------------- x-pack/libbeat/common/aws/semaphore_test.go | 33 --------- 4 files changed, 115 deletions(-) delete mode 100644 x-pack/libbeat/common/aws/semaphore.go delete mode 100644 x-pack/libbeat/common/aws/semaphore_test.go diff --git a/x-pack/filebeat/input/awss3/s3.go b/x-pack/filebeat/input/awss3/s3.go index 7c4c8931b0c8..ddc8c3a97ac4 100644 --- a/x-pack/filebeat/input/awss3/s3.go +++ b/x-pack/filebeat/input/awss3/s3.go @@ -46,7 +46,6 @@ type s3Poller struct { region string provider string bucketPollInterval time.Duration - workerSem *awscommon.Sem s3 s3API log *logp.Logger metrics *inputMetrics @@ -83,7 +82,6 @@ 
func newS3Poller(log *logp.Logger, region: awsRegion, provider: provider, bucketPollInterval: bucketPollInterval, - workerSem: awscommon.NewSem(numberOfWorkers), s3: s3, log: log, metrics: metrics, diff --git a/x-pack/filebeat/input/awss3/s3_test.go b/x-pack/filebeat/input/awss3/s3_test.go index 2c0281474b5d..ca63b13cc16d 100644 --- a/x-pack/filebeat/input/awss3/s3_test.go +++ b/x-pack/filebeat/input/awss3/s3_test.go @@ -13,7 +13,6 @@ import ( "github.com/aws/aws-sdk-go-v2/service/s3" "github.com/aws/aws-sdk-go-v2/service/s3/types" "github.com/golang/mock/gomock" - "github.com/stretchr/testify/assert" "github.com/elastic/beats/v7/libbeat/statestore" "github.com/elastic/beats/v7/libbeat/statestore/storetest" @@ -135,7 +134,6 @@ func TestS3Poller(t *testing.T) { s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockAPI, nil, backupConfig{}) receiver := newS3Poller(logp.NewLogger(inputName), nil, mockAPI, mockPublisher, s3ObjProc, newStates(inputCtx), store, bucket, "key", "region", "provider", numberOfWorkers, pollInterval) receiver.Poll(ctx) - assert.Equal(t, numberOfWorkers, receiver.workerSem.Available()) }) t.Run("retry after Poll error", func(t *testing.T) { diff --git a/x-pack/libbeat/common/aws/semaphore.go b/x-pack/libbeat/common/aws/semaphore.go deleted file mode 100644 index 1e7af456b28c..000000000000 --- a/x-pack/libbeat/common/aws/semaphore.go +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one -// or more contributor license agreements. Licensed under the Elastic License; -// you may not use this file except in compliance with the Elastic License. - -package aws - -import ( - "context" - "sync" -) - -type Sem struct { - cond sync.Cond - available int -} - -func NewSem(n int) *Sem { - return &Sem{ - available: n, - cond: sync.Cond{L: &sync.Mutex{}}, - } -} - -func (s *Sem) AcquireContext(n int, ctx context.Context) (int, error) { - acquireC := make(chan int, 1) - go func() { - defer close(acquireC) - acquireC <- s.Acquire(n) - }() - - select { - case <-ctx.Done(): - return 0, ctx.Err() - case n := <-acquireC: - return n, nil - } -} - -func (s *Sem) Acquire(n int) int { - if n <= 0 { - return 0 - } - - s.cond.L.Lock() - defer s.cond.L.Unlock() - - if s.available == 0 { - s.cond.Wait() - } - - if n >= s.available { - rtn := s.available - s.available = 0 - return rtn - } - - s.available -= n - return n -} - -func (s *Sem) Release(n int) { - if n <= 0 { - return - } - - s.cond.L.Lock() - defer s.cond.L.Unlock() - - s.available += n - s.cond.Signal() -} - -func (s *Sem) Available() int { - s.cond.L.Lock() - defer s.cond.L.Unlock() - - return s.available -} diff --git a/x-pack/libbeat/common/aws/semaphore_test.go b/x-pack/libbeat/common/aws/semaphore_test.go deleted file mode 100644 index f91831ef8a0b..000000000000 --- a/x-pack/libbeat/common/aws/semaphore_test.go +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one -// or more contributor license agreements. Licensed under the Elastic License; -// you may not use this file except in compliance with the Elastic License. - -package aws - -import ( - "sync" - "testing" - - "github.com/stretchr/testify/assert" -) - -func TestSemaphore(t *testing.T) { - s := NewSem(5) - - assert.Equal(t, s.Acquire(5), 5) - - var wg sync.WaitGroup - wg.Add(1) - go func() { - defer wg.Done() - // Asks for 2, and blocks because 0 are available. 
- // It unblocks and returns 1 when Release(1) is called. - assert.Equal(t, s.Acquire(2), 1) - }() - - // None are available until Release(). - assert.Equal(t, s.Available(), 0) - - s.Release(1) - wg.Wait() -} From 12a2a3c999b393d1975c0ea015a0368c3b8b68ec Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Mon, 15 Apr 2024 16:40:33 -0400 Subject: [PATCH 32/99] cleanups in input.go --- x-pack/filebeat/input/awss3/input.go | 137 +++++++++++----------- x-pack/filebeat/input/awss3/input_test.go | 4 +- 2 files changed, 69 insertions(+), 72 deletions(-) diff --git a/x-pack/filebeat/input/awss3/input.go b/x-pack/filebeat/input/awss3/input.go index 855403e5dc46..61985d8adcc9 100644 --- a/x-pack/filebeat/input/awss3/input.go +++ b/x-pack/filebeat/input/awss3/input.go @@ -127,59 +127,76 @@ func (in *s3Input) Run(inputContext v2.Context, pipeline beat.Pipeline) error { defer cancelInputCtx() if in.config.QueueURL != "" { - regionName, err := getRegionFromQueueURL(in.config.QueueURL, in.config.AWSConfig.Endpoint, in.config.RegionName) - if err != nil && in.config.RegionName == "" { - return fmt.Errorf("failed to get AWS region from queue_url: %w", err) - } - var warn regionMismatchError - if errors.As(err, &warn) { - // Warn of mismatch, but go ahead with configured region name. - inputContext.Logger.Warnf("%v: using %q", err, regionName) - } - in.awsConfig.Region = regionName + return in.runQueueReader(ctx, inputContext, pipeline) + } - // Create SQS receiver and S3 notification processor. - receiver, err := in.createSQSReceiver(inputContext, pipeline) - if err != nil { - return fmt.Errorf("failed to initialize sqs receiver: %w", err) - } - defer receiver.metrics.Close() + if in.config.BucketARN != "" || in.config.NonAWSBucketName != "" { + return in.runS3Poller(ctx, inputContext, pipeline, persistentStore, states) + } - // Poll metrics periodically in the background - go pollSqsWaitingMetric(ctx, receiver) + return nil +} - if err := receiver.Receive(ctx); err != nil { - return err - } +func (in *s3Input) runQueueReader( + ctx context.Context, + inputContext v2.Context, + pipeline beat.Pipeline, +) error { + configRegion := in.config.RegionName + urlRegion, err := getRegionFromQueueURL(in.config.QueueURL, in.config.AWSConfig.Endpoint) + if err != nil && configRegion == "" { + // Only report an error if we don't have a configured region + // to fall back on. + return fmt.Errorf("failed to get AWS region from queue_url: %w", err) + } else if configRegion != "" && configRegion != urlRegion { + inputContext.Logger.Warnf("configured region disagrees with queue_url region (%q != %q): using %q", configRegion, urlRegion, urlRegion) } - if in.config.BucketARN != "" || in.config.NonAWSBucketName != "" { - // Create client for publishing events and receive notification of their ACKs. - client, err := pipeline.ConnectWith(beat.ClientConfig{ - EventListener: awscommon.NewEventACKHandler(), - Processing: beat.ProcessingConfig{ - // This input only produces events with basic types so normalization - // is not required. - EventNormalization: boolPtr(false), - }, - }) - if err != nil { - return fmt.Errorf("failed to create pipeline client: %w", err) - } - defer client.Close() + in.awsConfig.Region = urlRegion - // Create S3 receiver and S3 notification processor. - poller, err := in.createS3Lister(inputContext, ctx, client, persistentStore, states) - if err != nil { - return fmt.Errorf("failed to initialize s3 poller: %w", err) - } - defer poller.metrics.Close() + // Create SQS receiver and S3 notification processor. 
+ receiver, err := in.createSQSReceiver(inputContext, pipeline) + if err != nil { + return fmt.Errorf("failed to initialize sqs receiver: %w", err) + } + defer receiver.metrics.Close() - if err := poller.Poll(ctx); err != nil { - return err - } + // Poll metrics periodically in the background + go pollSqsWaitingMetric(ctx, receiver) + + receiver.Receive(ctx) + return nil +} + +func (in *s3Input) runS3Poller( + ctx context.Context, + inputContext v2.Context, + pipeline beat.Pipeline, + persistentStore *statestore.Store, + states *states, +) error { + // Create client for publishing events and receive notification of their ACKs. + client, err := pipeline.ConnectWith(beat.ClientConfig{ + EventListener: awscommon.NewEventACKHandler(), + Processing: beat.ProcessingConfig{ + // This input only produces events with basic types so normalization + // is not required. + EventNormalization: boolPtr(false), + }, + }) + if err != nil { + return fmt.Errorf("failed to create pipeline client: %w", err) } + defer client.Close() + // Create S3 receiver and S3 notification processor. + poller, err := in.createS3Poller(inputContext, ctx, client, persistentStore, states) + if err != nil { + return fmt.Errorf("failed to initialize s3 poller: %w", err) + } + defer poller.metrics.Close() + + poller.Poll(ctx) return nil } @@ -225,8 +242,11 @@ func (in *s3Input) createSQSReceiver(ctx v2.Context, pipeline beat.Pipeline) (*s return nil, err } in.metrics = newInputMetrics(ctx.ID, nil, in.config.MaxNumberOfMessages) + s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), in.metrics, s3API, fileSelectors, in.config.BackupConfig, in.config.MaxNumberOfMessages) + sqsMessageHandler := newSQSS3EventProcessor(log.Named("sqs_s3_event"), in.metrics, sqsAPI, script, in.config.VisibilityTimeout, in.config.SQSMaxReceiveCount, pipeline, s3EventHandlerFactory, in.config.MaxNumberOfMessages) + sqsReader := newSQSReader(log.Named("sqs"), in.metrics, sqsAPI, in.config.MaxNumberOfMessages, sqsMessageHandler) return sqsReader, nil @@ -240,7 +260,7 @@ func (n nonAWSBucketResolver) ResolveEndpoint(region string, options s3.Endpoint return awssdk.Endpoint{URL: n.endpoint, SigningRegion: region, HostnameImmutable: true, Source: awssdk.EndpointSourceCustom}, nil } -func (in *s3Input) createS3Lister(ctx v2.Context, cancelCtx context.Context, client beat.Client, persistentStore *statestore.Store, states *states) (*s3Poller, error) { +func (in *s3Input) createS3Poller(ctx v2.Context, cancelCtx context.Context, client beat.Client, persistentStore *statestore.Store, states *states) (*s3Poller, error) { var bucketName string var bucketID string if in.config.NonAWSBucketName != "" { @@ -318,7 +338,7 @@ func (in *s3Input) createS3Lister(ctx v2.Context, cancelCtx context.Context, cli var errBadQueueURL = errors.New("QueueURL is not in format: https://sqs.{REGION_ENDPOINT}.{ENDPOINT}/{ACCOUNT_NUMBER}/{QUEUE_NAME} or https://{VPC_ENDPOINT}.sqs.{REGION_ENDPOINT}.vpce.{ENDPOINT}/{ACCOUNT_NUMBER}/{QUEUE_NAME}") -func getRegionFromQueueURL(queueURL string, endpoint, defaultRegion string) (region string, err error) { +func getRegionFromQueueURL(queueURL, endpoint string) (string, error) { // get region from queueURL // Example for sqs queue: https://sqs.us-east-1.amazonaws.com/12345678912/test-s3-logs // Example for vpce: https://vpce-test.sqs.us-east-1.vpce.amazonaws.com/12345678912/sqs-queue @@ -331,11 +351,7 @@ func getRegionFromQueueURL(queueURL string, endpoint, defaultRegion string) (reg // check for sqs queue url if 
len(queueHostSplit) == 3 && queueHostSplit[0] == "sqs" { if queueHostSplit[2] == endpoint || (endpoint == "" && strings.HasPrefix(queueHostSplit[2], "amazonaws.")) { - region = queueHostSplit[1] - if defaultRegion != "" && region != defaultRegion { - return defaultRegion, regionMismatchError{queueURLRegion: region, defaultRegion: defaultRegion} - } - return region, nil + return queueHostSplit[1], nil } } @@ -343,30 +359,13 @@ func getRegionFromQueueURL(queueURL string, endpoint, defaultRegion string) (reg queueHostSplitVPC := strings.SplitN(u.Host, ".", 5) if len(queueHostSplitVPC) == 5 && queueHostSplitVPC[1] == "sqs" { if queueHostSplitVPC[4] == endpoint || (endpoint == "" && strings.HasPrefix(queueHostSplitVPC[4], "amazonaws.")) { - region = queueHostSplitVPC[2] - if defaultRegion != "" && region != defaultRegion { - return defaultRegion, regionMismatchError{queueURLRegion: region, defaultRegion: defaultRegion} - } - return region, nil + return queueHostSplitVPC[2], nil } } - - if defaultRegion != "" { - return defaultRegion, nil - } } return "", errBadQueueURL } -type regionMismatchError struct { - queueURLRegion string - defaultRegion string -} - -func (e regionMismatchError) Error() string { - return fmt.Sprintf("configured region disagrees with queue_url region: %q != %q", e.queueURLRegion, e.defaultRegion) -} - func getRegionForBucket(ctx context.Context, s3Client *s3.Client, bucketName string) (string, error) { getBucketLocationOutput, err := s3Client.GetBucketLocation(ctx, &s3.GetBucketLocationInput{ Bucket: awssdk.String(bucketName), diff --git a/x-pack/filebeat/input/awss3/input_test.go b/x-pack/filebeat/input/awss3/input_test.go index abc9f5c9a6a6..0a3053f7f1b9 100644 --- a/x-pack/filebeat/input/awss3/input_test.go +++ b/x-pack/filebeat/input/awss3/input_test.go @@ -54,7 +54,6 @@ func TestGetRegionFromQueueURL(t *testing.T) { name string queueURL string endpoint string - deflt string want string wantErr error }{ @@ -77,7 +76,6 @@ func TestGetRegionFromQueueURL(t *testing.T) { { name: "vpce_endpoint", queueURL: "https://vpce-test.sqs.us-east-2.vpce.amazonaws.com/12345678912/sqs-queue", - deflt: "", want: "us-east-2", }, { @@ -90,7 +88,7 @@ func TestGetRegionFromQueueURL(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - got, err := getRegionFromQueueURL(test.queueURL, test.endpoint, test.deflt) + got, err := getRegionFromQueueURL(test.queueURL, test.endpoint) if !sameError(err, test.wantErr) { t.Errorf("unexpected error: got:%v want:%v", err, test.wantErr) } From dd29fa0ac393a4aea7907251c6ea8746b1db983f Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Mon, 15 Apr 2024 16:58:47 -0400 Subject: [PATCH 33/99] revert unintentional return value change --- x-pack/filebeat/input/awss3/input.go | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/x-pack/filebeat/input/awss3/input.go b/x-pack/filebeat/input/awss3/input.go index 61985d8adcc9..9f5609cb06cd 100644 --- a/x-pack/filebeat/input/awss3/input.go +++ b/x-pack/filebeat/input/awss3/input.go @@ -164,8 +164,7 @@ func (in *s3Input) runQueueReader( // Poll metrics periodically in the background go pollSqsWaitingMetric(ctx, receiver) - receiver.Receive(ctx) - return nil + return receiver.Receive(ctx) } func (in *s3Input) runS3Poller( @@ -196,8 +195,7 @@ func (in *s3Input) runS3Poller( } defer poller.metrics.Close() - poller.Poll(ctx) - return nil + return poller.Poll(ctx) } func (in *s3Input) createSQSReceiver(ctx v2.Context, pipeline beat.Pipeline) (*sqsReader, error) { From 
4956db9c06c3b6518c335b7e952208533db7cde6 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Mon, 22 Apr 2024 10:27:09 -0400 Subject: [PATCH 34/99] Concurrency / error handling fixes in awss3 --- x-pack/filebeat/input/awss3/input.go | 40 +-- x-pack/filebeat/input/awss3/s3.go | 309 ++++-------------- x-pack/filebeat/input/awss3/s3_objects.go | 15 +- x-pack/filebeat/input/awss3/state.go | 54 +--- x-pack/filebeat/input/awss3/states.go | 352 +++------------------ x-pack/filebeat/input/awss3/states_test.go | 2 +- 6 files changed, 154 insertions(+), 618 deletions(-) diff --git a/x-pack/filebeat/input/awss3/input.go b/x-pack/filebeat/input/awss3/input.go index 855403e5dc46..2c0372fe5616 100644 --- a/x-pack/filebeat/input/awss3/input.go +++ b/x-pack/filebeat/input/awss3/input.go @@ -13,6 +13,7 @@ import ( "time" awssdk "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/aws/retry" "github.com/aws/aws-sdk-go-v2/service/s3" "github.com/aws/aws-sdk-go-v2/service/sqs" "github.com/aws/smithy-go" @@ -21,7 +22,6 @@ import ( v2 "github.com/elastic/beats/v7/filebeat/input/v2" "github.com/elastic/beats/v7/libbeat/beat" "github.com/elastic/beats/v7/libbeat/feature" - "github.com/elastic/beats/v7/libbeat/statestore" awscommon "github.com/elastic/beats/v7/x-pack/libbeat/common/aws" conf "github.com/elastic/elastic-agent-libs/config" "github.com/elastic/go-concert/unison" @@ -99,21 +99,6 @@ func (in *s3Input) Test(ctx v2.TestContext) error { } func (in *s3Input) Run(inputContext v2.Context, pipeline beat.Pipeline) error { - var err error - - persistentStore, err := in.store.Access() - if err != nil { - return fmt.Errorf("can not access persistent store: %w", err) - } - - defer persistentStore.Close() - - states := newStates(inputContext) - err = states.readStatesFrom(persistentStore) - if err != nil { - return fmt.Errorf("can not start persistent store: %w", err) - } - // Wrap input Context's cancellation Done channel a context.Context. This // goroutine stops with the parent closes the Done channel. ctx, cancelInputCtx := context.WithCancel(context.Background()) @@ -168,8 +153,20 @@ func (in *s3Input) Run(inputContext v2.Context, pipeline beat.Pipeline) error { } defer client.Close() + // Connect to the registry and create our states lookup + persistentStore, err := in.store.Access() + if err != nil { + return fmt.Errorf("can not access persistent store: %w", err) + } + defer persistentStore.Close() + + states, err := newStates(inputContext, persistentStore) + if err != nil { + return fmt.Errorf("can not start persistent store: %w", err) + } + // Create S3 receiver and S3 notification processor. 
- poller, err := in.createS3Lister(inputContext, ctx, client, persistentStore, states) + poller, err := in.createS3Lister(inputContext, ctx, client, states) if err != nil { return fmt.Errorf("failed to initialize s3 poller: %w", err) } @@ -240,7 +237,7 @@ func (n nonAWSBucketResolver) ResolveEndpoint(region string, options s3.Endpoint return awssdk.Endpoint{URL: n.endpoint, SigningRegion: region, HostnameImmutable: true, Source: awssdk.EndpointSourceCustom}, nil } -func (in *s3Input) createS3Lister(ctx v2.Context, cancelCtx context.Context, client beat.Client, persistentStore *statestore.Store, states *states) (*s3Poller, error) { +func (in *s3Input) createS3Lister(ctx v2.Context, cancelCtx context.Context, client beat.Client, states *states) (*s3Poller, error) { var bucketName string var bucketID string if in.config.NonAWSBucketName != "" { @@ -260,6 +257,12 @@ func (in *s3Input) createS3Lister(ctx v2.Context, cancelCtx context.Context, cli o.EndpointOptions.UseFIPSEndpoint = awssdk.FIPSEndpointStateEnabled } o.UsePathStyle = in.config.PathStyle + + o.Retryer = retry.NewStandard(func(so *retry.StandardOptions) { + so.MaxAttempts = 5 + // Recover quickly when requests start working again + so.NoRetryIncrement = 100 + }) }) regionName, err := getRegionForBucket(cancelCtx, s3Client, bucketName) if err != nil { @@ -305,7 +308,6 @@ func (in *s3Input) createS3Lister(ctx v2.Context, cancelCtx context.Context, cli client, s3EventHandlerFactory, states, - persistentStore, bucketID, in.config.BucketListPrefix, in.awsConfig.Region, diff --git a/x-pack/filebeat/input/awss3/s3.go b/x-pack/filebeat/input/awss3/s3.go index 5aa8d31e95de..b321bb245665 100644 --- a/x-pack/filebeat/input/awss3/s3.go +++ b/x-pack/filebeat/input/awss3/s3.go @@ -11,34 +11,24 @@ import ( "sync" "time" - "github.com/gofrs/uuid" - "go.uber.org/multierr" - + "github.com/aws/aws-sdk-go-v2/aws/ratelimit" "github.com/elastic/beats/v7/libbeat/beat" - "github.com/elastic/beats/v7/libbeat/statestore" + "github.com/elastic/beats/v7/libbeat/common/backoff" awscommon "github.com/elastic/beats/v7/x-pack/libbeat/common/aws" "github.com/elastic/elastic-agent-libs/logp" "github.com/elastic/go-concert/timed" ) -const maxCircuitBreaker = 5 +const maxCircuitBreaker = 10 type commitWriteState struct { time.Time } -type s3ObjectInfo struct { - name string - key string - etag string - lastModified time.Time - listingID string -} - type s3ObjectPayload struct { s3ObjectHandler s3ObjectHandler - s3ObjectInfo s3ObjectInfo s3ObjectEvent s3EventV2 + objectState state } type s3Poller struct { @@ -55,8 +45,6 @@ type s3Poller struct { client beat.Client s3ObjectHandler s3ObjectHandlerFactory states *states - store *statestore.Store - workersListingMap *sync.Map workersProcessingMap *sync.Map } @@ -66,7 +54,6 @@ func newS3Poller(log *logp.Logger, client beat.Client, s3ObjectHandler s3ObjectHandlerFactory, states *states, - store *statestore.Store, bucket string, listPrefix string, awsRegion string, @@ -92,34 +79,11 @@ func newS3Poller(log *logp.Logger, client: client, s3ObjectHandler: s3ObjectHandler, states: states, - store: store, - workersListingMap: new(sync.Map), workersProcessingMap: new(sync.Map), } } -func (p *s3Poller) handlePurgingLock(info s3ObjectInfo, isStored bool) { - id := stateID(info.name, info.key, info.etag, info.lastModified) - previousState := p.states.FindPreviousByID(id) - if !previousState.IsEmpty() { - if isStored { - previousState.MarkAsStored() - } else { - 
previousState.MarkAsError() - } - - p.states.Update(previousState, info.listingID) - } - - // Manage locks for purging. - if p.states.IsListingFullyStored(info.listingID) { - // locked on processing we unlock when all the object were ACKed - lock, _ := p.workersListingMap.Load(info.listingID) - lock.(*sync.Mutex).Unlock() - } -} - -func (p *s3Poller) createS3ObjectProcessor(ctx context.Context, state state) (s3ObjectHandler, s3EventV2) { +func (p *s3Poller) createS3ObjectProcessor(ctx context.Context, state state) s3ObjectHandler { event := s3EventV2{} event.AWSRegion = p.region event.Provider = p.provider @@ -129,275 +93,130 @@ func (p *s3Poller) createS3ObjectProcessor(ctx context.Context, state state) (s3 acker := awscommon.NewEventACKTracker(ctx) - return p.s3ObjectHandler.Create(ctx, p.log, p.client, acker, event), event + return p.s3ObjectHandler.Create(ctx, p.log, p.client, acker, event) } -func (p *s3Poller) ProcessObject(s3ObjectPayloadChan <-chan *s3ObjectPayload) error { - var errs []error +func (p *s3Poller) workerLoop(ctx context.Context, s3ObjectPayloadChan <-chan *s3ObjectPayload) { + rateLimitWaiter := backoff.NewEqualJitterBackoff(ctx.Done(), 1, 120) for s3ObjectPayload := range s3ObjectPayloadChan { - // Process S3 object (download, parse, create events). - err := s3ObjectPayload.s3ObjectHandler.ProcessS3Object() + objHandler := s3ObjectPayload.s3ObjectHandler + state := s3ObjectPayload.objectState - // Wait for all events to be ACKed before proceeding. - s3ObjectPayload.s3ObjectHandler.Wait() + // Process S3 object (download, parse, create events). + err := objHandler.ProcessS3Object() + if errors.Is(err, s3DownloadError) { + // Download errors are ephemeral. Add a backoff delay, then skip to the + // next iteration so we don't mark the object as permanently failed. + rateLimitWaiter.Wait() + continue + } + // Reset the rate limit delay on results that aren't download errors. + rateLimitWaiter.Reset() - info := s3ObjectPayload.s3ObjectInfo + // Wait for downloaded objects to be ACKed. + objHandler.Wait() if err != nil { - event := s3ObjectPayload.s3ObjectEvent - errs = append(errs, - fmt.Errorf( - fmt.Sprintf("failed processing S3 event for object key %q in bucket %q: %%w", - event.S3.Object.Key, event.S3.Bucket.Name), - err)) - - p.handlePurgingLock(info, false) - continue + p.log.Errorf("failed processing S3 event for object key %q in bucket %q: %v", + state.Key, state.Bucket, err.Error()) + + // Non-retryable error. + state.Failed = true + } else { + state.Stored = true } - p.handlePurgingLock(info, true) + // Persist the result + p.states.AddState(state) // Metrics p.metrics.s3ObjectsAckedTotal.Inc() } - - return multierr.Combine(errs...) 
} -func (p *s3Poller) GetS3Objects(ctx context.Context, s3ObjectPayloadChan chan<- *s3ObjectPayload) { +func (p *s3Poller) readerLoop(ctx context.Context, s3ObjectPayloadChan chan<- *s3ObjectPayload) { defer close(s3ObjectPayloadChan) bucketName := getBucketNameFromARN(p.bucket) + errorBackoff := backoff.NewEqualJitterBackoff(ctx.Done(), 1, 120) circuitBreaker := 0 paginator := p.s3.ListObjectsPaginator(bucketName, p.listPrefix) for paginator.HasMorePages() { page, err := paginator.NextPage(ctx) + if err != nil { if !paginator.HasMorePages() { break } p.log.Warnw("Error when paginating listing.", "error", err) - circuitBreaker++ - if circuitBreaker >= maxCircuitBreaker { - p.log.Warnw(fmt.Sprintf("%d consecutive error when paginating listing, breaking the circuit.", circuitBreaker), "error", err) - break + // QuotaExceededError is client-side rate limiting in the AWS sdk, + // don't include it in the circuit breaker count + if !errors.As(err, &ratelimit.QuotaExceededError{}) { + circuitBreaker++ + if circuitBreaker >= maxCircuitBreaker { + p.log.Warnw(fmt.Sprintf("%d consecutive error when paginating listing, breaking the circuit.", circuitBreaker), "error", err) + break + } } + // add a backoff delay and try again + errorBackoff.Wait() continue } + // Reset the circuit breaker and the error backoff if a read is successful + circuitBreaker = 0 + errorBackoff.Reset() - listingID, err := uuid.NewV4() - if err != nil { - p.log.Warnw("Error generating UUID for listing page.", "error", err) - continue - } - - // lock for the listing page and state in workersListingMap - // this map is shared with the storedOp and will be unlocked there - lock := new(sync.Mutex) - lock.Lock() - p.workersListingMap.Store(listingID.String(), lock) - - totProcessableObjects := 0 totListedObjects := len(page.Contents) - s3ObjectPayloadChanByPage := make(chan *s3ObjectPayload, totListedObjects) // Metrics p.metrics.s3ObjectsListedTotal.Add(uint64(totListedObjects)) for _, object := range page.Contents { - state := newState(bucketName, *object.Key, *object.ETag, p.listPrefix, *object.LastModified) - if p.states.MustSkip(state, p.store) { + state := newState(bucketName, *object.Key, *object.ETag, *object.LastModified) + if p.states.AlreadyProcessed(state) { p.log.Debugw("skipping state.", "state", state) continue } - // we have no previous state or the previous state - // is not stored: refresh the state - previousState := p.states.FindPrevious(state) - if previousState.IsEmpty() || !previousState.IsProcessed() { - p.states.Update(state, "") - } - - s3Processor, event := p.createS3ObjectProcessor(ctx, state) + s3Processor := p.createS3ObjectProcessor(ctx, state) if s3Processor == nil { p.log.Debugw("empty s3 processor.", "state", state) continue } - totProcessableObjects++ - - s3ObjectPayloadChanByPage <- &s3ObjectPayload{ + s3ObjectPayloadChan <- &s3ObjectPayload{ s3ObjectHandler: s3Processor, - s3ObjectInfo: s3ObjectInfo{ - name: bucketName, - key: *object.Key, - etag: *object.ETag, - lastModified: *object.LastModified, - listingID: listingID.String(), - }, - s3ObjectEvent: event, - } - } - - if totProcessableObjects == 0 { - p.log.Debugw("0 processable objects on bucket pagination.", "bucket", p.bucket, "listPrefix", p.listPrefix, "listingID", listingID) - // nothing to be ACKed, unlock here - p.states.DeleteListing(listingID.String()) - lock.Unlock() - } else { - listingInfo := &listingInfo{totObjects: totProcessableObjects} - p.states.AddListing(listingID.String(), listingInfo) - - // Metrics - 
p.metrics.s3ObjectsProcessedTotal.Add(uint64(totProcessableObjects)) - } - - close(s3ObjectPayloadChanByPage) - for s3ObjectPayload := range s3ObjectPayloadChanByPage { - s3ObjectPayloadChan <- s3ObjectPayload - } - } -} - -func (p *s3Poller) Purge(ctx context.Context) { - listingIDs := p.states.GetListingIDs() - p.log.Debugw("purging listing.", "listingIDs", listingIDs) - for _, listingID := range listingIDs { - // we lock here in order to process the purge only after - // full listing page is ACKed by all the workers - lock, loaded := p.workersListingMap.Load(listingID) - if !loaded { - // purge calls can overlap, GetListingIDs can return - // an outdated snapshot with listing already purged - p.states.DeleteListing(listingID) - p.log.Debugw("deleting already purged listing from states.", "listingID", listingID) - continue - } - - lock.(*sync.Mutex).Lock() - - states := map[string]*state{} - latestStoredTimeByBucketAndListPrefix := make(map[string]time.Time, 0) - - listingStates := p.states.GetStatesByListingID(listingID) - for i, state := range listingStates { - // it is not stored, keep - if !state.IsProcessed() { - p.log.Debugw("state not stored or with error, skip purge", "state", state) - continue - } - - var latestStoredTime time.Time - states[state.ID] = &listingStates[i] - latestStoredTime, ok := latestStoredTimeByBucketAndListPrefix[state.Bucket+state.ListPrefix] - if !ok { - var commitWriteState commitWriteState - err := p.store.Get(awsS3WriteCommitPrefix+state.Bucket+state.ListPrefix, &commitWriteState) - if err == nil { - // we have no entry in the map, and we have no entry in the store - // set zero time - latestStoredTime = time.Time{} - p.log.Debugw("last stored time is zero time", "bucket", state.Bucket, "listPrefix", state.ListPrefix) - } else { - latestStoredTime = commitWriteState.Time - p.log.Debugw("last stored time is commitWriteState", "commitWriteState", commitWriteState, "bucket", state.Bucket, "listPrefix", state.ListPrefix) - } - } else { - p.log.Debugw("last stored time from memory", "latestStoredTime", latestStoredTime, "bucket", state.Bucket, "listPrefix", state.ListPrefix) - } - - if state.LastModified.After(latestStoredTime) { - p.log.Debugw("last stored time updated", "state.LastModified", state.LastModified, "bucket", state.Bucket, "listPrefix", state.ListPrefix) - latestStoredTimeByBucketAndListPrefix[state.Bucket+state.ListPrefix] = state.LastModified - } - } - - for key := range states { - p.states.Delete(key) - } - - if err := p.states.writeStates(p.store); err != nil { - p.log.Errorw("Failed to write states to the registry", "error", err) - } - - for bucketAndListPrefix, latestStoredTime := range latestStoredTimeByBucketAndListPrefix { - if err := p.store.Set(awsS3WriteCommitPrefix+bucketAndListPrefix, commitWriteState{latestStoredTime}); err != nil { - p.log.Errorw("Failed to write commit time to the registry", "error", err) + objectState: state, } - } - // purge is done, we can unlock and clean - lock.(*sync.Mutex).Unlock() - p.workersListingMap.Delete(listingID) - p.states.DeleteListing(listingID) - - // Listing is removed from all states, we can finalize now - for _, state := range states { - processor, _ := p.createS3ObjectProcessor(ctx, *state) - if err := processor.FinalizeS3Object(); err != nil { - p.log.Errorw("Failed to finalize S3 object", "key", state.Key, "error", err) - } + p.metrics.s3ObjectsProcessedTotal.Inc() } } } func (p *s3Poller) Poll(ctx context.Context) error { - // This loop tries to keep the workers busy as much as possible 
while - // honoring the number in config opposed to a simpler loop that does one - // listing, sequentially processes every object and then does another listing - workerWg := new(sync.WaitGroup) for ctx.Err() == nil { - // Determine how many S3 workers are available. - workers, err := p.workerSem.AcquireContext(p.numberOfWorkers, ctx) - if err != nil { - break - } - - if workers == 0 { - continue - } + var workerWg sync.WaitGroup + workChan := make(chan *s3ObjectPayload) - s3ObjectPayloadChan := make(chan *s3ObjectPayload) - - workerWg.Add(1) - go func() { - defer func() { - workerWg.Done() - }() - - p.GetS3Objects(ctx, s3ObjectPayloadChan) - p.Purge(ctx) - }() - - workerWg.Add(workers) - for i := 0; i < workers; i++ { + // Start the worker goroutines to listen on the work channel + for i := 0; i < p.numberOfWorkers; i++ { + workerWg.Add(1) go func() { - defer func() { - workerWg.Done() - p.workerSem.Release(1) - }() - if err := p.ProcessObject(s3ObjectPayloadChan); err != nil { - p.log.Warnw("Failed processing S3 listing.", "error", err) - } + defer workerWg.Done() + p.workerLoop(ctx, workChan) }() } - err = timed.Wait(ctx, p.bucketPollInterval) - if err != nil { - if errors.Is(err, context.Canceled) { - // A canceled context is a normal shutdown. - return nil - } + // Start reading data and wait for its processing to be done + p.readerLoop(ctx, workChan) + workerWg.Wait() - return err - } + _ = timed.Wait(ctx, p.bucketPollInterval) } - // Wait for all workers to finish. - workerWg.Wait() - if errors.Is(ctx.Err(), context.Canceled) { // A canceled context is a normal shutdown. return nil diff --git a/x-pack/filebeat/input/awss3/s3_objects.go b/x-pack/filebeat/input/awss3/s3_objects.go index 32911778336b..50e4c2fa245d 100644 --- a/x-pack/filebeat/input/awss3/s3_objects.go +++ b/x-pack/filebeat/input/awss3/s3_objects.go @@ -43,6 +43,11 @@ type s3ObjectProcessorFactory struct { backupConfig backupConfig } +// s3DownloadError reports problems downloading an S3 object. Download errors +// should never treated as permanent, they are just an indication to apply a +// retry backoff until the connection is healthy again. +var s3DownloadError = errors.New("S3 download failure") + func newS3ObjectProcessorFactory(log *logp.Logger, metrics *inputMetrics, s3 s3API, sel []fileSelectorConfig, backupConfig backupConfig, maxWorkers int) *s3ObjectProcessorFactory { if metrics == nil { // Metrics are optional. Initialize a stub. @@ -135,8 +140,9 @@ func (p *s3ObjectProcessor) ProcessS3Object() error { // Request object (download). contentType, meta, body, err := p.download() if err != nil { - return fmt.Errorf("failed to get s3 object (elapsed_time_ns=%d): %w", - time.Since(start).Nanoseconds(), err) + // Wrap downloadError in the result so the caller knows it's not a + // permanent failure. 
+ return fmt.Errorf("%v: %v", s3DownloadError, err) } defer body.Close() p.s3Metadata = meta @@ -434,10 +440,7 @@ func (p *s3ObjectProcessor) FinalizeS3Object() error { if bucketName == "" { return nil } - backupKey := p.s3Obj.S3.Object.Key - if p.backupConfig.BackupToBucketPrefix != "" { - backupKey = fmt.Sprintf("%s%s", p.backupConfig.BackupToBucketPrefix, backupKey) - } + backupKey := p.s3Obj.S3.Object.Key + p.backupConfig.BackupToBucketPrefix _, err := p.s3.CopyObject(p.ctx, p.s3Obj.S3.Bucket.Name, bucketName, p.s3Obj.S3.Object.Key, backupKey) if err != nil { return fmt.Errorf("failed to copy object to backup bucket: %w", err) diff --git a/x-pack/filebeat/input/awss3/state.go b/x-pack/filebeat/input/awss3/state.go index 97fb8d538cd6..fb4cafd7019e 100644 --- a/x-pack/filebeat/input/awss3/state.go +++ b/x-pack/filebeat/input/awss3/state.go @@ -11,58 +11,44 @@ import ( // state is used to communicate the publishing state of a s3 object type state struct { - // ID is used to identify the state in the store, and it is composed by - // Bucket + Key + Etag + LastModified.String(): changing this value or how it is - // composed will break backward compatibilities with entries already in the store. - ID string `json:"id" struct:"id"` Bucket string `json:"bucket" struct:"bucket"` Key string `json:"key" struct:"key"` Etag string `json:"etag" struct:"etag"` LastModified time.Time `json:"last_modified" struct:"last_modified"` - // ListPrefix is used for unique of the key in the store for awsS3WriteCommitPrefix - ListPrefix string `json:"list_prefix" struct:"list_prefix"` - // A state has Stored = true when all events are ACKed. Stored bool `json:"stored" struct:"stored"` - // A state has Error = true when ProcessS3Object returned an error - Error bool `json:"error" struct:"error"` + + // Failed is true when ProcessS3Object returned an error other than + // s3DownloadError. + // Before 8.14, this field was called "error". However, that field was + // set for many ephemeral reasons including client-side rate limiting + // (see https://github.com/elastic/beats/issues/39114). Now that we + // don't treat download errors as permanent, the field name was changed + // so that users upgrading from old versions aren't prevented from + // retrying old download failures. + Failed bool `json:"failed" struct:"failed"` } +// ID is used to identify the state in the store, and it is composed by +// Bucket + Key + Etag + LastModified.String(): changing this value or how it is +// composed will break backward compatibilities with entries already in the store. 
func stateID(bucket, key, etag string, lastModified time.Time) string { return bucket + key + etag + lastModified.String() } // newState creates a new s3 object state -func newState(bucket, key, etag, listPrefix string, lastModified time.Time) state { - s := state{ +func newState(bucket, key, etag string, lastModified time.Time) state { + return state{ Bucket: bucket, Key: key, LastModified: lastModified, Etag: etag, - ListPrefix: listPrefix, - Stored: false, - Error: false, } - - s.ID = stateID(s.Bucket, s.Key, s.Etag, s.LastModified) - - return s -} - -// MarkAsStored set the stored flag to true -func (s *state) MarkAsStored() { - s.Stored = true } -// MarkAsError set the error flag to true -func (s *state) MarkAsError() { - s.Error = true -} - -// IsProcessed checks if the state is either Stored or Error -func (s *state) IsProcessed() bool { - return s.Stored || s.Error +func (s *state) ID() string { + return stateID(s.Bucket, s.Key, s.Etag, s.LastModified) } // IsEqual checks if the two states point to the same s3 object. @@ -70,12 +56,6 @@ func (s *state) IsEqual(c *state) bool { return s.Bucket == c.Bucket && s.Key == c.Key && s.Etag == c.Etag && s.LastModified.Equal(c.LastModified) } -// IsEmpty checks if the state is empty -func (s *state) IsEmpty() bool { - c := state{} - return s.Bucket == c.Bucket && s.Key == c.Key && s.Etag == c.Etag && s.LastModified.Equal(c.LastModified) -} - // String returns string representation of the struct func (s *state) String() string { return fmt.Sprintf( diff --git a/x-pack/filebeat/input/awss3/states.go b/x-pack/filebeat/input/awss3/states.go index 449219a867f5..3ecfbbec899c 100644 --- a/x-pack/filebeat/input/awss3/states.go +++ b/x-pack/filebeat/input/awss3/states.go @@ -15,278 +15,60 @@ import ( "github.com/elastic/beats/v7/libbeat/statestore" ) -const ( - awsS3ObjectStatePrefix = "filebeat::aws-s3::state::" - awsS3WriteCommitPrefix = "filebeat::aws-s3::writeCommit::" -) - -type listingInfo struct { - totObjects int - - mu sync.Mutex - storedObjects int - errorObjects int - finalCheck bool -} +const awsS3ObjectStatePrefix = "filebeat::aws-s3::state::" // states handles list of s3 object state. One must use newStates to instantiate a // file states registry. Using the zero-value is not safe. type states struct { - sync.RWMutex + // This mutex must be held to access states or store + sync.Mutex log *logp.Logger - // states store - states []state + // known states, indexed by state ID + states map[string]state - // idx maps state IDs to state indexes for fast lookup and modifications. - idx map[string]int - - listingIDs map[string]struct{} - listingInfo *sync.Map - statesByListingID map[string][]state + // The store used to persist state changes to the registry + store *statestore.Store } // newStates generates a new states registry. -func newStates(ctx v2.Context) *states { - return &states{ - log: ctx.Logger.Named("states"), - states: nil, - idx: map[string]int{}, - listingInfo: new(sync.Map), - listingIDs: map[string]struct{}{}, - statesByListingID: map[string][]state{}, - } -} - -func (s *states) MustSkip(state state, store *statestore.Store) bool { - if !s.IsNew(state) { - s.log.Debugw("not new state in must skip", "state", state) - return true - } - - previousState := s.FindPrevious(state) - - // status is forgotten. 
if there is no previous state and - // the state.LastModified is before the last cleanStore - // write commit we can remove - var commitWriteState commitWriteState - err := store.Get(awsS3WriteCommitPrefix+state.Bucket+state.ListPrefix, &commitWriteState) - if err == nil && previousState.IsEmpty() && - (state.LastModified.Before(commitWriteState.Time) || state.LastModified.Equal(commitWriteState.Time)) { - s.log.Debugw("state.LastModified older than writeCommitState in must skip", "state", state, "commitWriteState", commitWriteState) - return true - } - - // the previous state is stored or has error: let's skip - if !previousState.IsEmpty() && previousState.IsProcessed() { - s.log.Debugw("previous state is stored or has error", "state", state) - return true +func newStates(ctx v2.Context, store *statestore.Store) (*states, error) { + states := &states{ + log: ctx.Logger.Named("states"), + states: map[string]state{}, + store: store, } - - return false + return states, states.loadFromRegistry() } -func (s *states) Delete(id string) { +func (s *states) AlreadyProcessed(state state) bool { s.Lock() defer s.Unlock() - - index := s.findPrevious(id) - if index >= 0 { - last := len(s.states) - 1 - s.states[last], s.states[index] = s.states[index], s.states[last] - s.states = s.states[:last] - - s.idx = map[string]int{} - for i, state := range s.states { - s.idx[state.ID] = i - } - } + // Our in-memory table only stores completed objects + _, ok := s.states[state.ID()] + return ok } -// IsListingFullyStored check if listing if fully stored -// After first time the condition is met it will always return false -func (s *states) IsListingFullyStored(listingID string) bool { - info, ok := s.listingInfo.Load(listingID) - if !ok { - return false - } - listingInfo, ok := info.(*listingInfo) - if !ok { - return false - } - - listingInfo.mu.Lock() - defer listingInfo.mu.Unlock() - if listingInfo.finalCheck { - return false - } - - listingInfo.finalCheck = (listingInfo.storedObjects + listingInfo.errorObjects) == listingInfo.totObjects - - if (listingInfo.storedObjects + listingInfo.errorObjects) > listingInfo.totObjects { - s.log.Warnf("unexepected mixmatch between storedObjects (%d), errorObjects (%d) and totObjects (%d)", - listingInfo.storedObjects, listingInfo.errorObjects, listingInfo.totObjects) - } - - return listingInfo.finalCheck -} - -// AddListing add listing info -func (s *states) AddListing(listingID string, listingInfo *listingInfo) { +func (s *states) AddState(state state) { s.Lock() defer s.Unlock() - s.listingIDs[listingID] = struct{}{} - s.listingInfo.Store(listingID, listingInfo) -} - -// DeleteListing delete listing info -func (s *states) DeleteListing(listingID string) { - s.Lock() - defer s.Unlock() - delete(s.listingIDs, listingID) - delete(s.statesByListingID, listingID) - s.listingInfo.Delete(listingID) -} - -// Update updates a state. 
If previous state didn't exist, new one is created -func (s *states) Update(newState state, listingID string) { - s.Lock() - defer s.Unlock() - - id := newState.ID - index := s.findPrevious(id) - - if index >= 0 { - s.states[index] = newState - } else { - // No existing state found, add new one - s.idx[id] = len(s.states) - s.states = append(s.states, newState) - s.log.Debug("New state added for ", newState.ID) - } - - if listingID == "" || !newState.IsProcessed() { - return - } - - // here we increase the number of stored object - info, ok := s.listingInfo.Load(listingID) - if !ok { - return - } - listingInfo, ok := info.(*listingInfo) - if !ok { - return - } - - listingInfo.mu.Lock() - - if newState.Stored { - listingInfo.storedObjects++ - } - - if newState.Error { - listingInfo.errorObjects++ - } - - listingInfo.mu.Unlock() - - if _, ok := s.statesByListingID[listingID]; !ok { - s.statesByListingID[listingID] = make([]state, 0) - } - - s.statesByListingID[listingID] = append(s.statesByListingID[listingID], newState) -} - -// FindPrevious lookups a registered state, that matching the new state. -// Returns a zero-state if no match is found. -func (s *states) FindPrevious(newState state) state { - s.RLock() - defer s.RUnlock() - id := newState.ID - i := s.findPrevious(id) - if i < 0 { - return state{} - } - return s.states[i] -} - -// FindPreviousByID lookups a registered state, that matching the id. -// Returns a zero-state if no match is found. -func (s *states) FindPreviousByID(id string) state { - s.RLock() - defer s.RUnlock() - i := s.findPrevious(id) - if i < 0 { - return state{} - } - return s.states[i] -} - -func (s *states) IsNew(state state) bool { - s.RLock() - defer s.RUnlock() - id := state.ID - i := s.findPrevious(id) - - if i < 0 { - return true - } - - return !s.states[i].IsEqual(&state) -} - -// findPrevious returns the previous state for the file. -// In case no previous state exists, index -1 is returned -func (s *states) findPrevious(id string) int { - if i, exists := s.idx[id]; exists { - return i - } - return -1 -} -// GetStates creates copy of the file states. 
-func (s *states) GetStates() []state { - s.RLock() - defer s.RUnlock() - - newStates := make([]state, len(s.states)) - copy(newStates, s.states) - - return newStates -} + id := state.ID() + // Update in-memory copy + s.states[id] = state -// GetListingIDs return a of the listing IDs -func (s *states) GetListingIDs() []string { - s.RLock() - defer s.RUnlock() - listingIDs := make([]string, 0, len(s.listingIDs)) - for listingID := range s.listingIDs { - listingIDs = append(listingIDs, listingID) + // Persist to the registry + key := awsS3ObjectStatePrefix + id + if err := s.store.Set(key, state); err != nil { + s.log.Errorw("Failed to write states to the registry", "error", err) } - - return listingIDs -} - -// GetStatesByListingID return a copy of the states by listing ID -func (s *states) GetStatesByListingID(listingID string) []state { - s.RLock() - defer s.RUnlock() - - if _, ok := s.statesByListingID[listingID]; !ok { - return nil - } - - newStates := make([]state, len(s.statesByListingID[listingID])) - copy(newStates, s.statesByListingID[listingID]) - return newStates } -func (s *states) readStatesFrom(store *statestore.Store) error { - var states []state +func (s *states) loadFromRegistry() error { + var states map[string]state - err := store.Each(func(key string, dec statestore.ValueDecoder) (bool, error) { + err := s.store.Each(func(key string, dec statestore.ValueDecoder) (bool, error) { if !strings.HasPrefix(key, awsS3ObjectStatePrefix) { return true, nil } @@ -294,78 +76,28 @@ func (s *states) readStatesFrom(store *statestore.Store) error { // try to decode. Ignore faulty/incompatible values. var st state if err := dec.Decode(&st); err != nil { - // XXX: Do we want to log here? In case we start to store other - // state types in the registry, then this operation will likely fail - // quite often, producing some false-positives in the logs... - return false, err + // Skip this key but continue iteration + s.log.Warnf("invalid S3 state loading object key %v", key) + return true, nil + } + if !st.Stored && !st.Failed { + // This is from an older version where state could be stored in the + // registry even if the object wasn't processed, or if it encountered + // ephemeral download errors. We don't add these to the in-memory cache, + // so if we see them during a bucket scan we will still retry them. + return true, nil } - st.ID = key[len(awsS3ObjectStatePrefix):] - states = append(states, st) + states[st.ID()] = st return true, nil }) if err != nil { return err } - states = fixStates(states) - - for _, state := range states { - s.Update(state, "") - } - - return nil -} - -// fixStates cleans up the registry states when updating from an older version -// of filebeat potentially writing invalid entries. -func fixStates(states []state) []state { - if len(states) == 0 { - return states - } - - // we use a map of states here, so to identify and merge duplicate entries. - idx := map[string]*state{} - for i := range states { - state := &states[i] - - old, exists := idx[state.ID] - if !exists { - idx[state.ID] = state - } else { - mergeStates(old, state) // overwrite the entry in 'old' - } - } - - if len(idx) == len(states) { - return states - } - - i := 0 - newStates := make([]state, len(idx)) - for _, state := range idx { - newStates[i] = *state - i++ - } - return newStates -} - -// mergeStates merges 2 states by trying to determine the 'newer' state. -// The st state is overwritten with the updated fields. -func mergeStates(st, other *state) { - // update file meta-data. 
As these are updated concurrently by the - // inputs, select the newer state based on the update timestamp. - if st.LastModified.Before(other.LastModified) { - st.LastModified = other.LastModified - } -} + s.Lock() + s.states = states + s.Unlock() -func (s *states) writeStates(store *statestore.Store) error { - for _, state := range s.GetStates() { - key := awsS3ObjectStatePrefix + state.ID - if err := store.Set(key, state); err != nil { - return err - } - } return nil } diff --git a/x-pack/filebeat/input/awss3/states_test.go b/x-pack/filebeat/input/awss3/states_test.go index 39dc4cf82e63..2eb436f145cd 100644 --- a/x-pack/filebeat/input/awss3/states_test.go +++ b/x-pack/filebeat/input/awss3/states_test.go @@ -325,7 +325,7 @@ func TestStatesDelete(t *testing.T) { test := test t.Run(name, func(t *testing.T) { states := test.states() - states.Delete(test.deleteID) + states.DeleteState(test.deleteID) assert.Equal(t, test.expected, states.GetStates()) }) } From fc641e16fa221d242d6abc2f64bb5523a1ab9d98 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Mon, 22 Apr 2024 11:00:32 -0400 Subject: [PATCH 35/99] give the registry accessor its own mutex --- x-pack/filebeat/input/awss3/states.go | 33 +++++++++++++++------------ 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/x-pack/filebeat/input/awss3/states.go b/x-pack/filebeat/input/awss3/states.go index 3ecfbbec899c..e70b6a8e23a2 100644 --- a/x-pack/filebeat/input/awss3/states.go +++ b/x-pack/filebeat/input/awss3/states.go @@ -20,16 +20,17 @@ const awsS3ObjectStatePrefix = "filebeat::aws-s3::state::" // states handles list of s3 object state. One must use newStates to instantiate a // file states registry. Using the zero-value is not safe. type states struct { - // This mutex must be held to access states or store - sync.Mutex - log *logp.Logger - // known states, indexed by state ID - states map[string]state + // Completed S3 object states, indexed by state ID. + // statesLock must be held to access states. + states map[string]state + statesLock sync.Mutex - // The store used to persist state changes to the registry - store *statestore.Store + // The store used to persist state changes to the registry. + // storeLock must be held to access store. + store *statestore.Store + storeLock sync.Mutex } // newStates generates a new states registry. 
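As a reading aid for the registry API taking shape in this patch and the previous one, here is a brief sketch of the intended lookup-then-persist flow. It assumes the states type, the state fields, and the AlreadyProcessed/AddState methods from this diff; the scanOne helper and its process callback are hypothetical, added only for illustration and not part of the patch.

// scanOne is a hypothetical helper showing how a bucket scan consults the
// registry: skip objects that already completed, otherwise record the outcome.
func scanOne(s *states, obj state, process func(state) error) {
    if s.AlreadyProcessed(obj) {
        return // finished in an earlier scan; nothing to do
    }
    if err := process(obj); err != nil {
        obj.Failed = true // permanent failure; later scans will not retry it
    } else {
        obj.Stored = true
    }
    s.AddState(obj) // updates the in-memory map and persists to the registry store
}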
@@ -43,31 +44,34 @@ func newStates(ctx v2.Context, store *statestore.Store) (*states, error) { } func (s *states) AlreadyProcessed(state state) bool { - s.Lock() - defer s.Unlock() + s.statesLock.Lock() + defer s.statesLock.Unlock() // Our in-memory table only stores completed objects _, ok := s.states[state.ID()] return ok } func (s *states) AddState(state state) { - s.Lock() - defer s.Unlock() id := state.ID() // Update in-memory copy + s.statesLock.Lock() s.states[id] = state + s.statesLock.Unlock() // Persist to the registry + s.storeLock.Lock() key := awsS3ObjectStatePrefix + id if err := s.store.Set(key, state); err != nil { s.log.Errorw("Failed to write states to the registry", "error", err) } + s.storeLock.Unlock() } func (s *states) loadFromRegistry() error { - var states map[string]state + states := map[string]state{} + s.storeLock.Lock() err := s.store.Each(func(key string, dec statestore.ValueDecoder) (bool, error) { if !strings.HasPrefix(key, awsS3ObjectStatePrefix) { return true, nil @@ -91,13 +95,14 @@ func (s *states) loadFromRegistry() error { states[st.ID()] = st return true, nil }) + s.storeLock.Unlock() if err != nil { return err } - s.Lock() + s.statesLock.Lock() s.states = states - s.Unlock() + s.statesLock.Unlock() return nil } From 4a9cb60ac836252e486e8a7619ba86fc98ebeb60 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Tue, 23 Apr 2024 14:55:06 -0400 Subject: [PATCH 36/99] update tests --- .../input/awss3/input_benchmark_test.go | 10 +- x-pack/filebeat/input/awss3/s3.go | 21 +- x-pack/filebeat/input/awss3/s3_objects.go | 8 +- .../filebeat/input/awss3/s3_objects_test.go | 2 +- x-pack/filebeat/input/awss3/s3_test.go | 20 +- x-pack/filebeat/input/awss3/state.go | 12 - x-pack/filebeat/input/awss3/state_test.go | 2 +- x-pack/filebeat/input/awss3/states.go | 2 +- x-pack/filebeat/input/awss3/states_test.go | 306 ++++-------------- 9 files changed, 85 insertions(+), 298 deletions(-) diff --git a/x-pack/filebeat/input/awss3/input_benchmark_test.go b/x-pack/filebeat/input/awss3/input_benchmark_test.go index e05e5b461ca6..af18f5559a38 100644 --- a/x-pack/filebeat/input/awss3/input_benchmark_test.go +++ b/x-pack/filebeat/input/awss3/input_benchmark_test.go @@ -18,6 +18,7 @@ import ( "github.com/elastic/beats/v7/libbeat/statestore" "github.com/elastic/beats/v7/libbeat/statestore/storetest" + "github.com/stretchr/testify/assert" "github.com/elastic/beats/v7/libbeat/beat" @@ -342,14 +343,11 @@ func benchmarkInputS3(t *testing.T, numberOfWorkers int) testing.BenchmarkResult return } - err = store.Set(awsS3WriteCommitPrefix+"bucket"+listPrefix, &commitWriteState{time.Time{}}) - if err != nil { - errChan <- err - return - } + states, err := newStates(inputCtx, store) + assert.NoError(t, err, "states creation should succeed") s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), metrics, s3API, config.FileSelectors, backupConfig{}, numberOfWorkers) - s3Poller := newS3Poller(logp.NewLogger(inputName), metrics, s3API, client, s3EventHandlerFactory, newStates(inputCtx), store, "bucket", listPrefix, "region", "provider", numberOfWorkers, time.Second) + s3Poller := newS3Poller(logp.NewLogger(inputName), metrics, s3API, client, s3EventHandlerFactory, states, "bucket", listPrefix, "region", "provider", numberOfWorkers, time.Second) if err := s3Poller.Poll(ctx); err != nil { if !errors.Is(err, context.DeadlineExceeded) { diff --git a/x-pack/filebeat/input/awss3/s3.go b/x-pack/filebeat/input/awss3/s3.go index b321bb245665..2af2edd12806 100644 
--- a/x-pack/filebeat/input/awss3/s3.go +++ b/x-pack/filebeat/input/awss3/s3.go @@ -19,15 +19,12 @@ import ( "github.com/elastic/go-concert/timed" ) -const maxCircuitBreaker = 10 - -type commitWriteState struct { - time.Time -} +// var instead of const so it can be reduced during unit tests (instead of waiting +// through 10 minutes of retry backoff) +var readerLoopMaxCircuitBreaker = 10 type s3ObjectPayload struct { s3ObjectHandler s3ObjectHandler - s3ObjectEvent s3EventV2 objectState state } @@ -38,7 +35,6 @@ type s3Poller struct { region string provider string bucketPollInterval time.Duration - workerSem *awscommon.Sem s3 s3API log *logp.Logger metrics *inputMetrics @@ -72,7 +68,6 @@ func newS3Poller(log *logp.Logger, region: awsRegion, provider: provider, bucketPollInterval: bucketPollInterval, - workerSem: awscommon.NewSem(numberOfWorkers), s3: s3, log: log, metrics: metrics, @@ -105,7 +100,7 @@ func (p *s3Poller) workerLoop(ctx context.Context, s3ObjectPayloadChan <-chan *s // Process S3 object (download, parse, create events). err := objHandler.ProcessS3Object() - if errors.Is(err, s3DownloadError) { + if errors.Is(err, errS3DownloadFailed) { // Download errors are ephemeral. Add a backoff delay, then skip to the // next iteration so we don't mark the object as permanently failed. rateLimitWaiter.Wait() @@ -147,16 +142,12 @@ func (p *s3Poller) readerLoop(ctx context.Context, s3ObjectPayloadChan chan<- *s page, err := paginator.NextPage(ctx) if err != nil { - if !paginator.HasMorePages() { - break - } - p.log.Warnw("Error when paginating listing.", "error", err) // QuotaExceededError is client-side rate limiting in the AWS sdk, // don't include it in the circuit breaker count if !errors.As(err, &ratelimit.QuotaExceededError{}) { circuitBreaker++ - if circuitBreaker >= maxCircuitBreaker { + if circuitBreaker >= readerLoopMaxCircuitBreaker { p.log.Warnw(fmt.Sprintf("%d consecutive error when paginating listing, breaking the circuit.", circuitBreaker), "error", err) break } @@ -175,7 +166,7 @@ func (p *s3Poller) readerLoop(ctx context.Context, s3ObjectPayloadChan chan<- *s p.metrics.s3ObjectsListedTotal.Add(uint64(totListedObjects)) for _, object := range page.Contents { state := newState(bucketName, *object.Key, *object.ETag, *object.LastModified) - if p.states.AlreadyProcessed(state) { + if p.states.IsProcessed(state) { p.log.Debugw("skipping state.", "state", state) continue } diff --git a/x-pack/filebeat/input/awss3/s3_objects.go b/x-pack/filebeat/input/awss3/s3_objects.go index 50e4c2fa245d..21dfa2243e7b 100644 --- a/x-pack/filebeat/input/awss3/s3_objects.go +++ b/x-pack/filebeat/input/awss3/s3_objects.go @@ -43,10 +43,10 @@ type s3ObjectProcessorFactory struct { backupConfig backupConfig } -// s3DownloadError reports problems downloading an S3 object. Download errors +// errS3DownloadFailed reports problems downloading an S3 object. Download errors // should never treated as permanent, they are just an indication to apply a // retry backoff until the connection is healthy again. -var s3DownloadError = errors.New("S3 download failure") +var errS3DownloadFailed = errors.New("S3 download failure") func newS3ObjectProcessorFactory(log *logp.Logger, metrics *inputMetrics, s3 s3API, sel []fileSelectorConfig, backupConfig backupConfig, maxWorkers int) *s3ObjectProcessorFactory { if metrics == nil { @@ -142,7 +142,7 @@ func (p *s3ObjectProcessor) ProcessS3Object() error { if err != nil { // Wrap downloadError in the result so the caller knows it's not a // permanent failure. 
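For readers following the error-handling change above: the pattern relies on wrapping the sentinel with %w so that errors.Is can recognize it through fmt.Errorf. A minimal, self-contained sketch of that pattern follows; errDownloadFailed and fetch are illustrative stand-ins, not identifiers from this patch.

package main

import (
    "errors"
    "fmt"
)

// errDownloadFailed is an illustrative stand-in for a retryable-download sentinel.
var errDownloadFailed = errors.New("download failure")

// fetch simulates a download that fails for an ephemeral reason.
func fetch() error {
    // Wrapping with %w preserves the sentinel so callers can test for it.
    return fmt.Errorf("%w: connection reset by peer", errDownloadFailed)
}

func main() {
    if err := fetch(); errors.Is(err, errDownloadFailed) {
        fmt.Println("ephemeral download failure: back off and retry later")
    } else if err != nil {
        fmt.Println("permanent failure:", err)
    }
}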
- return fmt.Errorf("%v: %v", s3DownloadError, err) + return fmt.Errorf("%w: %w", errS3DownloadFailed, err) } defer body.Close() p.s3Metadata = meta @@ -440,7 +440,7 @@ func (p *s3ObjectProcessor) FinalizeS3Object() error { if bucketName == "" { return nil } - backupKey := p.s3Obj.S3.Object.Key + p.backupConfig.BackupToBucketPrefix + backupKey := p.backupConfig.BackupToBucketPrefix + p.s3Obj.S3.Object.Key _, err := p.s3.CopyObject(p.ctx, p.s3Obj.S3.Bucket.Name, bucketName, p.s3Obj.S3.Object.Key, backupKey) if err != nil { return fmt.Errorf("failed to copy object to backup bucket: %w", err) diff --git a/x-pack/filebeat/input/awss3/s3_objects_test.go b/x-pack/filebeat/input/awss3/s3_objects_test.go index 6732c12e0579..5a3e70adb509 100644 --- a/x-pack/filebeat/input/awss3/s3_objects_test.go +++ b/x-pack/filebeat/input/awss3/s3_objects_test.go @@ -157,7 +157,7 @@ func TestS3ObjectProcessor(t *testing.T) { ack := awscommon.NewEventACKTracker(ctx) err := s3ObjProc.Create(ctx, logp.NewLogger(inputName), mockPublisher, ack, s3Event).ProcessS3Object() require.Error(t, err) - assert.True(t, errors.Is(err, errFakeConnectivityFailure), "expected errFakeConnectivityFailure error") + assert.True(t, errors.Is(err, errS3DownloadFailed), "expected errS3DownloadFailed") }) t.Run("no error empty result in download", func(t *testing.T) { diff --git a/x-pack/filebeat/input/awss3/s3_test.go b/x-pack/filebeat/input/awss3/s3_test.go index b94ba7cfb09b..be1d65b796eb 100644 --- a/x-pack/filebeat/input/awss3/s3_test.go +++ b/x-pack/filebeat/input/awss3/s3_test.go @@ -13,7 +13,6 @@ import ( "github.com/aws/aws-sdk-go-v2/service/s3" "github.com/aws/aws-sdk-go-v2/service/s3/types" "github.com/golang/mock/gomock" - "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "github.com/elastic/beats/v7/libbeat/statestore" @@ -134,12 +133,16 @@ func TestS3Poller(t *testing.T) { Return(nil, errFakeConnectivityFailure) s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockAPI, nil, backupConfig{}, numberOfWorkers) - receiver := newS3Poller(logp.NewLogger(inputName), nil, mockAPI, mockPublisher, s3ObjProc, newStates(inputCtx), store, bucket, "key", "region", "provider", numberOfWorkers, pollInterval) + states, err := newStates(inputCtx, store) + require.NoError(t, err, "states creation must succeed") + receiver := newS3Poller(logp.NewLogger(inputName), nil, mockAPI, mockPublisher, s3ObjProc, states, bucket, "key", "region", "provider", numberOfWorkers, pollInterval) require.Error(t, context.DeadlineExceeded, receiver.Poll(ctx)) - assert.Equal(t, numberOfWorkers, receiver.workerSem.Available()) }) - t.Run("retry after Poll error", func(t *testing.T) { + t.Run("restart bucket scan after paging errors", func(t *testing.T) { + // Change the restart limit to 2 consecutive errors, so the test doesn't + // take too long to run + readerLoopMaxCircuitBreaker = 2 storeReg := statestore.NewRegistry(storetest.NewMemoryStoreBackend()) store, err := storeReg.Get("test") if err != nil { @@ -176,13 +179,13 @@ func TestS3Poller(t *testing.T) { // Initial Next gets an error. mockPagerFirst.EXPECT(). HasMorePages(). - Times(10). + Times(2). DoAndReturn(func() bool { return true }) mockPagerFirst.EXPECT(). NextPage(gomock.Any()). - Times(5). + Times(2). 
DoAndReturn(func(_ context.Context, optFns ...func(*s3.Options)) (*s3.ListObjectsV2Output, error) { return nil, errFakeConnectivityFailure }) @@ -257,8 +260,9 @@ func TestS3Poller(t *testing.T) { Return(nil, errFakeConnectivityFailure) s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockAPI, nil, backupConfig{}, numberOfWorkers) - receiver := newS3Poller(logp.NewLogger(inputName), nil, mockAPI, mockPublisher, s3ObjProc, newStates(inputCtx), store, bucket, "key", "region", "provider", numberOfWorkers, pollInterval) + states, err := newStates(inputCtx, store) + require.NoError(t, err, "states creation must succeed") + receiver := newS3Poller(logp.NewLogger(inputName), nil, mockAPI, mockPublisher, s3ObjProc, states, bucket, "key", "region", "provider", numberOfWorkers, pollInterval) require.Error(t, context.DeadlineExceeded, receiver.Poll(ctx)) - assert.Equal(t, numberOfWorkers, receiver.workerSem.Available()) }) } diff --git a/x-pack/filebeat/input/awss3/state.go b/x-pack/filebeat/input/awss3/state.go index fb4cafd7019e..4b7e09f9e7fa 100644 --- a/x-pack/filebeat/input/awss3/state.go +++ b/x-pack/filebeat/input/awss3/state.go @@ -5,7 +5,6 @@ package awss3 import ( - "fmt" "time" ) @@ -55,14 +54,3 @@ func (s *state) ID() string { func (s *state) IsEqual(c *state) bool { return s.Bucket == c.Bucket && s.Key == c.Key && s.Etag == c.Etag && s.LastModified.Equal(c.LastModified) } - -// String returns string representation of the struct -func (s *state) String() string { - return fmt.Sprintf( - "{ID: %v, Bucket: %v, Key: %v, Etag: %v, LastModified: %v}", - s.ID, - s.Bucket, - s.Key, - s.Etag, - s.LastModified) -} diff --git a/x-pack/filebeat/input/awss3/state_test.go b/x-pack/filebeat/input/awss3/state_test.go index 24a5e9d81b4e..375a44ce79e2 100644 --- a/x-pack/filebeat/input/awss3/state_test.go +++ b/x-pack/filebeat/input/awss3/state_test.go @@ -61,7 +61,7 @@ func TestStateIsEqual(t *testing.T) { Key: "/key/to/this/file/1", Etag: "etag", LastModified: lastModifed, - Error: true, + Failed: true, }, { Bucket: "bucket a", diff --git a/x-pack/filebeat/input/awss3/states.go b/x-pack/filebeat/input/awss3/states.go index e70b6a8e23a2..0227dd76cf08 100644 --- a/x-pack/filebeat/input/awss3/states.go +++ b/x-pack/filebeat/input/awss3/states.go @@ -43,7 +43,7 @@ func newStates(ctx v2.Context, store *statestore.Store) (*states, error) { return states, states.loadFromRegistry() } -func (s *states) AlreadyProcessed(state state) bool { +func (s *states) IsProcessed(state state) bool { s.statesLock.Lock() defer s.statesLock.Unlock() // Our in-memory table only stores completed objects diff --git a/x-pack/filebeat/input/awss3/states_test.go b/x-pack/filebeat/input/awss3/states_test.go index 2eb436f145cd..2f8bbf58fdfb 100644 --- a/x-pack/filebeat/input/awss3/states_test.go +++ b/x-pack/filebeat/input/awss3/states_test.go @@ -14,6 +14,7 @@ import ( "github.com/elastic/beats/v7/libbeat/statestore/storetest" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" v2 "github.com/elastic/beats/v7/filebeat/input/v2" "github.com/elastic/elastic-agent-libs/logp" @@ -46,287 +47,92 @@ var inputCtx = v2.Context{ Cancelation: context.Background(), } -func TestStatesIsNewAndMustSkip(t *testing.T) { +func TestStatesAddStateAndIsProcessed(t *testing.T) { type stateTestCase struct { - states func() *states - state state - mustBeNew bool - persistentStoreKV map[string]interface{} - expectedMustSkip bool - expectedIsNew bool + // An initialization 
callback to invoke on the (initially empty) states. + statesEdit func(states *states) + + // The state to call IsProcessed on and the expected result + state state + expectedIsProcessed bool + + // If true, the test will run statesEdit, then create a new states + // object from the same persistent store before calling IsProcessed + // (to test persistence between restarts). + shouldReload bool } lastModified := time.Date(2022, time.June, 30, 14, 13, 00, 0, time.UTC) + testState1 := newState("bucket", "key", "etag", lastModified) + testState2 := newState("bucket1", "key1", "etag1", lastModified) tests := map[string]stateTestCase{ "with empty states": { - states: func() *states { - return newStates(inputCtx) - }, - state: newState("bucket", "key", "etag", "listPrefix", lastModified), - expectedMustSkip: false, - expectedIsNew: true, + state: testState1, + expectedIsProcessed: false, }, "not existing state": { - states: func() *states { - states := newStates(inputCtx) - states.Update(newState("bucket", "key", "etag", "listPrefix", lastModified), "") - return states + statesEdit: func(states *states) { + states.AddState(testState2) }, - state: newState("bucket1", "key1", "etag1", "listPrefix1", lastModified), - expectedMustSkip: false, - expectedIsNew: true, + state: testState1, + expectedIsProcessed: false, }, "existing state": { - states: func() *states { - states := newStates(inputCtx) - states.Update(newState("bucket", "key", "etag", "listPrefix", lastModified), "") - return states - }, - state: newState("bucket", "key", "etag", "listPrefix", lastModified), - expectedMustSkip: true, - expectedIsNew: false, - }, - "with different etag": { - states: func() *states { - states := newStates(inputCtx) - states.Update(newState("bucket", "key", "etag1", "listPrefix", lastModified), "") - return states - }, - state: newState("bucket", "key", "etag2", "listPrefix", lastModified), - expectedMustSkip: false, - expectedIsNew: true, - }, - "with different lastmodified": { - states: func() *states { - states := newStates(inputCtx) - states.Update(newState("bucket", "key", "etag", "listPrefix", lastModified), "") - return states - }, - state: newState("bucket", "key", "etag", "listPrefix", lastModified.Add(1*time.Second)), - expectedMustSkip: false, - expectedIsNew: true, - }, - "with stored state": { - states: func() *states { - states := newStates(inputCtx) - aState := newState("bucket", "key", "etag", "listPrefix", lastModified) - aState.Stored = true - states.Update(aState, "") - return states + statesEdit: func(states *states) { + states.AddState(testState1) }, - state: newState("bucket", "key", "etag", "listPrefix", lastModified), - mustBeNew: true, - expectedMustSkip: true, - expectedIsNew: true, + state: testState1, + expectedIsProcessed: true, }, - "with error state": { - states: func() *states { - states := newStates(inputCtx) - aState := newState("bucket", "key", "etag", "listPrefix", lastModified) - aState.Error = true - states.Update(aState, "") - return states + "existing stored state is persisted": { + statesEdit: func(states *states) { + state := testState1 + state.Stored = true + states.AddState(state) }, - state: newState("bucket", "key", "etag", "listPrefix", lastModified), - mustBeNew: true, - expectedMustSkip: true, - expectedIsNew: true, + state: testState1, + shouldReload: true, + expectedIsProcessed: true, }, - "before commit write": { - states: func() *states { - return newStates(inputCtx) + "existing failed state is persisted": { + statesEdit: func(states *states) { + state := 
testState1 + state.Failed = true + states.AddState(state) }, - persistentStoreKV: map[string]interface{}{ - awsS3WriteCommitPrefix + "bucket" + "listPrefix": &commitWriteState{lastModified}, - }, - state: newState("bucket", "key", "etag", "listPrefix", lastModified.Add(-1*time.Second)), - expectedMustSkip: true, - expectedIsNew: true, + state: testState1, + shouldReload: true, + expectedIsProcessed: true, }, - "same commit write": { - states: func() *states { - return newStates(inputCtx) - }, - persistentStoreKV: map[string]interface{}{ - awsS3WriteCommitPrefix + "bucket" + "listPrefix": &commitWriteState{lastModified}, + "existing unprocessed state is not persisted": { + statesEdit: func(states *states) { + states.AddState(testState1) }, - state: newState("bucket", "key", "etag", "listPrefix", lastModified), - expectedMustSkip: true, - expectedIsNew: true, - }, - "after commit write": { - states: func() *states { - return newStates(inputCtx) - }, - persistentStoreKV: map[string]interface{}{ - awsS3WriteCommitPrefix + "bucket" + "listPrefix": &commitWriteState{lastModified}, - }, - state: newState("bucket", "key", "etag", "listPrefix", lastModified.Add(time.Second)), - expectedMustSkip: false, - expectedIsNew: true, + state: testState1, + shouldReload: true, + expectedIsProcessed: false, }, } for name, test := range tests { test := test t.Run(name, func(t *testing.T) { - states := test.states() store := openTestStatestore() persistentStore, err := store.Access() if err != nil { t.Fatalf("unexpected err: %v", err) } - for key, value := range test.persistentStoreKV { - _ = persistentStore.Set(key, value) + states, err := newStates(inputCtx, persistentStore) + require.NoError(t, err, "states creation must succeed") + if test.statesEdit != nil { + test.statesEdit(states) } - - if test.mustBeNew { - test.state.LastModified = test.state.LastModified.Add(1 * time.Second) + if test.shouldReload { + states, err = newStates(inputCtx, persistentStore) + require.NoError(t, err, "states creation must succeed") } - isNew := states.IsNew(test.state) - assert.Equal(t, test.expectedIsNew, isNew) - - mustSkip := states.MustSkip(test.state, persistentStore) - assert.Equal(t, test.expectedMustSkip, mustSkip) - }) - } -} - -func TestStatesDelete(t *testing.T) { - type stateTestCase struct { - states func() *states - deleteID string - expected []state - } - - lastModified := time.Date(2021, time.July, 22, 18, 38, 00, 0, time.UTC) - tests := map[string]stateTestCase{ - "delete empty states": { - states: func() *states { - return newStates(inputCtx) - }, - deleteID: "an id", - expected: []state{}, - }, - "delete not existing state": { - states: func() *states { - states := newStates(inputCtx) - states.Update(newState("bucket", "key", "etag", "listPrefix", lastModified), "") - return states - }, - deleteID: "an id", - expected: []state{ - { - ID: stateID("bucket", "key", "etag", lastModified), - Bucket: "bucket", - Key: "key", - Etag: "etag", - ListPrefix: "listPrefix", - LastModified: lastModified, - }, - }, - }, - "delete only one existing": { - states: func() *states { - states := newStates(inputCtx) - states.Update(newState("bucket", "key", "etag", "listPrefix", lastModified), "") - return states - }, - deleteID: stateID("bucket", "key", "etag", lastModified), - expected: []state{}, - }, - "delete first": { - states: func() *states { - states := newStates(inputCtx) - states.Update(newState("bucket", "key1", "etag1", "listPrefix", lastModified), "") - states.Update(newState("bucket", "key2", "etag2", 
"listPrefix", lastModified), "") - states.Update(newState("bucket", "key3", "etag3", "listPrefix", lastModified), "") - return states - }, - deleteID: "bucketkey1etag1" + lastModified.String(), - expected: []state{ - { - ID: stateID("bucket", "key3", "etag3", lastModified), - Bucket: "bucket", - Key: "key3", - Etag: "etag3", - ListPrefix: "listPrefix", - LastModified: lastModified, - }, - { - ID: stateID("bucket", "key2", "etag2", lastModified), - Bucket: "bucket", - Key: "key2", - Etag: "etag2", - ListPrefix: "listPrefix", - LastModified: lastModified, - }, - }, - }, - "delete last": { - states: func() *states { - states := newStates(inputCtx) - states.Update(newState("bucket", "key1", "etag1", "listPrefix", lastModified), "") - states.Update(newState("bucket", "key2", "etag2", "listPrefix", lastModified), "") - states.Update(newState("bucket", "key3", "etag3", "listPrefix", lastModified), "") - return states - }, - deleteID: "bucketkey3etag3" + lastModified.String(), - expected: []state{ - { - ID: stateID("bucket", "key1", "etag1", lastModified), - Bucket: "bucket", - Key: "key1", - Etag: "etag1", - ListPrefix: "listPrefix", - LastModified: lastModified, - }, - { - ID: stateID("bucket", "key2", "etag2", lastModified), - Bucket: "bucket", - Key: "key2", - Etag: "etag2", - ListPrefix: "listPrefix", - LastModified: lastModified, - }, - }, - }, - "delete any": { - states: func() *states { - states := newStates(inputCtx) - states.Update(newState("bucket", "key1", "etag1", "listPrefix", lastModified), "") - states.Update(newState("bucket", "key2", "etag2", "listPrefix", lastModified), "") - states.Update(newState("bucket", "key3", "etag3", "listPrefix", lastModified), "") - return states - }, - deleteID: "bucketkey2etag2" + lastModified.String(), - expected: []state{ - { - ID: stateID("bucket", "key1", "etag1", lastModified), - Bucket: "bucket", - Key: "key1", - Etag: "etag1", - ListPrefix: "listPrefix", - LastModified: lastModified, - }, - { - ID: stateID("bucket", "key3", "etag3", lastModified), - Bucket: "bucket", - Key: "key3", - Etag: "etag3", - ListPrefix: "listPrefix", - LastModified: lastModified, - }, - }, - }, - } - - for name, test := range tests { - test := test - t.Run(name, func(t *testing.T) { - states := test.states() - states.DeleteState(test.deleteID) - assert.Equal(t, test.expected, states.GetStates()) + isProcessed := states.IsProcessed(test.state) + assert.Equal(t, test.expectedIsProcessed, isProcessed) }) } } From 3d93d22419506fa201550ef07d9db574ef3c6e5d Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Tue, 23 Apr 2024 15:15:56 -0400 Subject: [PATCH 37/99] make check --- x-pack/filebeat/input/awss3/input_benchmark_test.go | 3 ++- x-pack/filebeat/input/awss3/s3.go | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/x-pack/filebeat/input/awss3/input_benchmark_test.go b/x-pack/filebeat/input/awss3/input_benchmark_test.go index af18f5559a38..c176e9ef24a4 100644 --- a/x-pack/filebeat/input/awss3/input_benchmark_test.go +++ b/x-pack/filebeat/input/awss3/input_benchmark_test.go @@ -16,9 +16,10 @@ import ( "testing" "time" + "github.com/stretchr/testify/assert" + "github.com/elastic/beats/v7/libbeat/statestore" "github.com/elastic/beats/v7/libbeat/statestore/storetest" - "github.com/stretchr/testify/assert" "github.com/elastic/beats/v7/libbeat/beat" diff --git a/x-pack/filebeat/input/awss3/s3.go b/x-pack/filebeat/input/awss3/s3.go index 2af2edd12806..8909f78bb39d 100644 --- a/x-pack/filebeat/input/awss3/s3.go +++ 
b/x-pack/filebeat/input/awss3/s3.go @@ -12,6 +12,7 @@ import ( "time" "github.com/aws/aws-sdk-go-v2/aws/ratelimit" + "github.com/elastic/beats/v7/libbeat/beat" "github.com/elastic/beats/v7/libbeat/common/backoff" awscommon "github.com/elastic/beats/v7/x-pack/libbeat/common/aws" From 7d6369ff35098f1c9b38bbc48e4f07f676efd282 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Tue, 23 Apr 2024 15:27:55 -0400 Subject: [PATCH 38/99] lint --- x-pack/filebeat/input/awss3/input_benchmark_test.go | 3 +-- x-pack/filebeat/input/awss3/s3_objects_test.go | 7 ++++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/x-pack/filebeat/input/awss3/input_benchmark_test.go b/x-pack/filebeat/input/awss3/input_benchmark_test.go index c176e9ef24a4..5d22d1411687 100644 --- a/x-pack/filebeat/input/awss3/input_benchmark_test.go +++ b/x-pack/filebeat/input/awss3/input_benchmark_test.go @@ -8,7 +8,6 @@ import ( "context" "errors" "fmt" - "io/ioutil" "os" "path/filepath" "runtime" @@ -134,7 +133,7 @@ type constantS3 struct { var _ s3API = (*constantS3)(nil) func newConstantS3(t testing.TB) *constantS3 { - data, err := ioutil.ReadFile(cloudtrailTestFile) + data, err := os.ReadFile(cloudtrailTestFile) if err != nil { t.Fatal(err) } diff --git a/x-pack/filebeat/input/awss3/s3_objects_test.go b/x-pack/filebeat/input/awss3/s3_objects_test.go index 5a3e70adb509..28e8f4f42a52 100644 --- a/x-pack/filebeat/input/awss3/s3_objects_test.go +++ b/x-pack/filebeat/input/awss3/s3_objects_test.go @@ -8,7 +8,8 @@ import ( "bytes" "context" "errors" - "io/ioutil" + "io" + "os" "path/filepath" "strings" "testing" @@ -27,7 +28,7 @@ import ( ) func newS3Object(t testing.TB, filename, contentType string) (s3EventV2, *s3.GetObjectOutput) { - data, err := ioutil.ReadFile(filename) + data, err := os.ReadFile(filename) if err != nil { t.Fatal(err) } @@ -39,7 +40,7 @@ func newS3GetObjectResponse(filename string, data []byte, contentType string) *s r := bytes.NewReader(data) getObjectOutput := s3.GetObjectOutput{} getObjectOutput.ContentLength = int64(r.Len()) - getObjectOutput.Body = ioutil.NopCloser(r) + getObjectOutput.Body = io.NopCloser(r) if contentType != "" { getObjectOutput.ContentType = &contentType } From b4b5b281e54754278cc5bf39df80eaf4b2a22729 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Tue, 23 Apr 2024 15:40:07 -0400 Subject: [PATCH 39/99] lint --- x-pack/filebeat/input/awss3/states.go | 1 + 1 file changed, 1 insertion(+) diff --git a/x-pack/filebeat/input/awss3/states.go b/x-pack/filebeat/input/awss3/states.go index 0227dd76cf08..edbbcc73793e 100644 --- a/x-pack/filebeat/input/awss3/states.go +++ b/x-pack/filebeat/input/awss3/states.go @@ -82,6 +82,7 @@ func (s *states) loadFromRegistry() error { if err := dec.Decode(&st); err != nil { // Skip this key but continue iteration s.log.Warnf("invalid S3 state loading object key %v", key) + //nolint:nilerr // One bad object shouldn't stop iteration return true, nil } if !st.Stored && !st.Failed { From 942ae03b69f1d56b71b6bea0aea45eb6400f0e01 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Fri, 26 Apr 2024 11:44:07 -0400 Subject: [PATCH 40/99] cleaning up context use --- x-pack/filebeat/input/awss3/input.go | 2 +- x-pack/filebeat/input/awss3/input_benchmark_test.go | 2 +- x-pack/filebeat/input/awss3/s3_test.go | 4 ++-- x-pack/filebeat/input/awss3/states.go | 6 ++---- x-pack/filebeat/input/awss3/states_test.go | 4 ++-- 5 files changed, 8 insertions(+), 10 deletions(-) diff --git a/x-pack/filebeat/input/awss3/input.go 
b/x-pack/filebeat/input/awss3/input.go index 1fe18be5a2c1..139f8d5f7b88 100644 --- a/x-pack/filebeat/input/awss3/input.go +++ b/x-pack/filebeat/input/awss3/input.go @@ -139,7 +139,7 @@ func (in *s3Input) runS3Poller( } defer persistentStore.Close() - states, err := newStates(inputContext, persistentStore) + states, err := newStates(inputContext.Logger, persistentStore) if err != nil { return fmt.Errorf("can not start persistent store: %w", err) } diff --git a/x-pack/filebeat/input/awss3/input_benchmark_test.go b/x-pack/filebeat/input/awss3/input_benchmark_test.go index a2fb0c38cd41..dd421f1a590d 100644 --- a/x-pack/filebeat/input/awss3/input_benchmark_test.go +++ b/x-pack/filebeat/input/awss3/input_benchmark_test.go @@ -339,7 +339,7 @@ func benchmarkInputS3(t *testing.T, numberOfWorkers int) testing.BenchmarkResult return } - states, err := newStates(inputCtx, store) + states, err := newStates(inputCtx.Logger, store) assert.NoError(t, err, "states creation should succeed") s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), metrics, s3API, config.FileSelectors, backupConfig{}) diff --git a/x-pack/filebeat/input/awss3/s3_test.go b/x-pack/filebeat/input/awss3/s3_test.go index 24ce31b012f3..893eec5cc7de 100644 --- a/x-pack/filebeat/input/awss3/s3_test.go +++ b/x-pack/filebeat/input/awss3/s3_test.go @@ -133,7 +133,7 @@ func TestS3Poller(t *testing.T) { Return(nil, errFakeConnectivityFailure) s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockAPI, nil, backupConfig{}) - states, err := newStates(inputCtx, store) + states, err := newStates(inputCtx.Logger, store) require.NoError(t, err, "states creation must succeed") receiver := newS3Poller(logp.NewLogger(inputName), nil, mockAPI, mockPublisher, s3ObjProc, states, bucket, "key", "region", "provider", numberOfWorkers, pollInterval) receiver.Poll(ctx) @@ -260,7 +260,7 @@ func TestS3Poller(t *testing.T) { Return(nil, errFakeConnectivityFailure) s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockAPI, nil, backupConfig{}) - states, err := newStates(inputCtx, store) + states, err := newStates(inputCtx.Logger, store) require.NoError(t, err, "states creation must succeed") receiver := newS3Poller(logp.NewLogger(inputName), nil, mockAPI, mockPublisher, s3ObjProc, states, bucket, "key", "region", "provider", numberOfWorkers, pollInterval) receiver.Poll(ctx) diff --git a/x-pack/filebeat/input/awss3/states.go b/x-pack/filebeat/input/awss3/states.go index edbbcc73793e..3289f2c984bf 100644 --- a/x-pack/filebeat/input/awss3/states.go +++ b/x-pack/filebeat/input/awss3/states.go @@ -8,8 +8,6 @@ import ( "strings" "sync" - v2 "github.com/elastic/beats/v7/filebeat/input/v2" - "github.com/elastic/elastic-agent-libs/logp" "github.com/elastic/beats/v7/libbeat/statestore" @@ -34,9 +32,9 @@ type states struct { } // newStates generates a new states registry. 
-func newStates(ctx v2.Context, store *statestore.Store) (*states, error) { +func newStates(log *logp.Logger, store *statestore.Store) (*states, error) { states := &states{ - log: ctx.Logger.Named("states"), + log: log.Named("states"), states: map[string]state{}, store: store, } diff --git a/x-pack/filebeat/input/awss3/states_test.go b/x-pack/filebeat/input/awss3/states_test.go index 2f8bbf58fdfb..eea943248ffd 100644 --- a/x-pack/filebeat/input/awss3/states_test.go +++ b/x-pack/filebeat/input/awss3/states_test.go @@ -121,13 +121,13 @@ func TestStatesAddStateAndIsProcessed(t *testing.T) { if err != nil { t.Fatalf("unexpected err: %v", err) } - states, err := newStates(inputCtx, persistentStore) + states, err := newStates(inputCtx.Logger, persistentStore) require.NoError(t, err, "states creation must succeed") if test.statesEdit != nil { test.statesEdit(states) } if test.shouldReload { - states, err = newStates(inputCtx, persistentStore) + states, err = newStates(inputCtx.Logger, persistentStore) require.NoError(t, err, "states creation must succeed") } From 2c084bbf374cc0ad68d1a415113a51a1bdf95506 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Fri, 26 Apr 2024 13:09:27 -0400 Subject: [PATCH 41/99] splitting S3 and SQS into distinct inputs internally --- x-pack/filebeat/input/awss3/input.go | 415 ++------------------------- x-pack/filebeat/input/awss3/s3.go | 232 +++++++++++++++ x-pack/filebeat/input/awss3/sqs.go | 186 ++++++++++++ 3 files changed, 439 insertions(+), 394 deletions(-) diff --git a/x-pack/filebeat/input/awss3/input.go b/x-pack/filebeat/input/awss3/input.go index 01a681106d94..c1920e2814d3 100644 --- a/x-pack/filebeat/input/awss3/input.go +++ b/x-pack/filebeat/input/awss3/input.go @@ -5,22 +5,12 @@ package awss3 import ( - "context" - "errors" "fmt" - "net/url" - "strings" - "time" awssdk "github.com/aws/aws-sdk-go-v2/aws" - "github.com/aws/aws-sdk-go-v2/aws/retry" - "github.com/aws/aws-sdk-go-v2/service/s3" - "github.com/aws/aws-sdk-go-v2/service/sqs" - "github.com/aws/smithy-go" "github.com/elastic/beats/v7/filebeat/beater" v2 "github.com/elastic/beats/v7/filebeat/input/v2" - "github.com/elastic/beats/v7/libbeat/beat" "github.com/elastic/beats/v7/libbeat/feature" awscommon "github.com/elastic/beats/v7/x-pack/libbeat/common/aws" conf "github.com/elastic/elastic-agent-libs/config" @@ -46,6 +36,14 @@ type s3InputManager struct { store beater.StateStore } +// s3Input is a input for reading logs from S3 when triggered by an SQS message. +type s3Input struct { + config config + awsConfig awssdk.Config + store beater.StateStore + metrics *inputMetrics +} + func (im *s3InputManager) Init(grp unison.Group, mode v2.Mode) error { return nil } @@ -59,15 +57,7 @@ func (im *s3InputManager) Create(cfg *conf.C) (v2.Input, error) { return newInput(config, im.store) } -// s3Input is a input for reading logs from S3 when triggered by an SQS message. 
-type s3Input struct { - config config - awsConfig awssdk.Config - store beater.StateStore - metrics *inputMetrics -} - -func newInput(config config, store beater.StateStore) (*s3Input, error) { +func newInput(config config, store beater.StateStore) (v2.Input, error) { awsConfig, err := awscommon.InitializeAWSConfig(config.AWSConfig) if config.AWSConfig.Endpoint != "" { @@ -85,386 +75,23 @@ func newInput(config config, store beater.StateStore) (*s3Input, error) { return nil, fmt.Errorf("failed to initialize AWS credentials: %w", err) } - return &s3Input{ - config: config, - awsConfig: awsConfig, - store: store, - }, nil -} - -func (in *s3Input) Name() string { return inputName } - -func (in *s3Input) Test(ctx v2.TestContext) error { - return nil -} - -func (in *s3Input) Run(inputContext v2.Context, pipeline beat.Pipeline) error { - ctx := v2.GoContextFromCanceler(inputContext.Cancelation) - - if in.config.QueueURL != "" { - return in.runQueueReader(ctx, inputContext, pipeline) - } - - if in.config.BucketARN != "" || in.config.NonAWSBucketName != "" { - return in.runS3Poller(ctx, inputContext, pipeline) - } - - return nil -} - -func (in *s3Input) runQueueReader( - ctx context.Context, - inputContext v2.Context, - pipeline beat.Pipeline, -) error { - configRegion := in.config.RegionName - urlRegion, err := getRegionFromQueueURL(in.config.QueueURL, in.config.AWSConfig.Endpoint) - if err != nil && configRegion == "" { - // Only report an error if we don't have a configured region - // to fall back on. - return fmt.Errorf("failed to get AWS region from queue_url: %w", err) - } else if configRegion != "" && configRegion != urlRegion { - inputContext.Logger.Warnf("configured region disagrees with queue_url region (%q != %q): using %q", configRegion, urlRegion, urlRegion) + if config.QueueURL != "" { + return newSQSReaderInput(config, awsConfig, store) + //return in.runQueueReader(ctx, inputContext, pipeline) } - in.awsConfig.Region = urlRegion - - // Create SQS receiver and S3 notification processor. - receiver, err := in.createSQSReceiver(inputContext, pipeline) - if err != nil { - return fmt.Errorf("failed to initialize sqs receiver: %w", err) + if config.BucketARN != "" || config.NonAWSBucketName != "" { + return newS3PollerInput(config, awsConfig, store) + //return in.runS3Poller(ctx, inputContext, pipeline) } - defer receiver.metrics.Close() - // Poll metrics periodically in the background - go pollSqsWaitingMetric(ctx, receiver) - - receiver.Receive(ctx) - return nil -} - -func (in *s3Input) runS3Poller( - ctx context.Context, - inputContext v2.Context, - pipeline beat.Pipeline, -) error { - - // Create client for publishing events and receive notification of their ACKs. - client, err := pipeline.ConnectWith(beat.ClientConfig{ - EventListener: awscommon.NewEventACKHandler(), - Processing: beat.ProcessingConfig{ - // This input only produces events with basic types so normalization - // is not required. - EventNormalization: boolPtr(false), - }, - }) - if err != nil { - return fmt.Errorf("failed to create pipeline client: %w", err) - } - defer client.Close() - - // Connect to the registry and create our states lookup - persistentStore, err := in.store.Access() - if err != nil { - return fmt.Errorf("can not access persistent store: %w", err) - } - defer persistentStore.Close() - - states, err := newStates(inputContext.Logger, persistentStore) - if err != nil { - return fmt.Errorf("can not start persistent store: %w", err) - } - - // Create S3 receiver and S3 notification processor. 
- poller, err := in.createS3Poller(inputContext, ctx, client, states) - if err != nil { - return fmt.Errorf("failed to initialize s3 poller: %w", err) - } - defer poller.metrics.Close() - - poller.Poll(ctx) - return nil -} - -func (in *s3Input) createSQSReceiver(ctx v2.Context, pipeline beat.Pipeline) (*sqsReader, error) { - sqsAPI := &awsSQSAPI{ - client: sqs.NewFromConfig(in.awsConfig, func(o *sqs.Options) { - if in.config.AWSConfig.FIPSEnabled { - o.EndpointOptions.UseFIPSEndpoint = awssdk.FIPSEndpointStateEnabled - } - }), - queueURL: in.config.QueueURL, - apiTimeout: in.config.APITimeout, - visibilityTimeout: in.config.VisibilityTimeout, - longPollWaitTime: in.config.SQSWaitTime, - } - - s3API := &awsS3API{ - client: s3.NewFromConfig(in.awsConfig, func(o *s3.Options) { - if in.config.AWSConfig.FIPSEnabled { - o.EndpointOptions.UseFIPSEndpoint = awssdk.FIPSEndpointStateEnabled - } - o.UsePathStyle = in.config.PathStyle - }), - } - - log := ctx.Logger.With("queue_url", in.config.QueueURL) - log.Infof("AWS api_timeout is set to %v.", in.config.APITimeout) - log.Infof("AWS region is set to %v.", in.awsConfig.Region) - log.Infof("AWS SQS visibility_timeout is set to %v.", in.config.VisibilityTimeout) - log.Infof("AWS SQS max_number_of_messages is set to %v.", in.config.MaxNumberOfMessages) - - if in.config.BackupConfig.GetBucketName() != "" { - log.Warnf("You have the backup_to_bucket functionality activated with SQS. Please make sure to set appropriate destination buckets" + - "or prefixes to avoid an infinite loop.") - } - - fileSelectors := in.config.FileSelectors - if len(in.config.FileSelectors) == 0 { - fileSelectors = []fileSelectorConfig{{ReaderConfig: in.config.ReaderConfig}} - } - script, err := newScriptFromConfig(log.Named("sqs_script"), in.config.SQSScript) - if err != nil { - return nil, err - } - in.metrics = newInputMetrics(ctx.ID, nil, in.config.MaxNumberOfMessages) - - s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), in.metrics, s3API, fileSelectors, in.config.BackupConfig) - - sqsMessageHandler := newSQSS3EventProcessor(log.Named("sqs_s3_event"), in.metrics, sqsAPI, script, in.config.VisibilityTimeout, in.config.SQSMaxReceiveCount, pipeline, s3EventHandlerFactory) - - sqsReader := newSQSReader(log.Named("sqs"), in.metrics, sqsAPI, in.config.MaxNumberOfMessages, sqsMessageHandler) - - return sqsReader, nil -} - -type nonAWSBucketResolver struct { - endpoint string -} - -func (n nonAWSBucketResolver) ResolveEndpoint(region string, options s3.EndpointResolverOptions) (awssdk.Endpoint, error) { - return awssdk.Endpoint{URL: n.endpoint, SigningRegion: region, HostnameImmutable: true, Source: awssdk.EndpointSourceCustom}, nil -} - -func (in *s3Input) createS3Poller(ctx v2.Context, cancelCtx context.Context, client beat.Client, states *states) (*s3Poller, error) { - var bucketName string - var bucketID string - if in.config.NonAWSBucketName != "" { - bucketName = in.config.NonAWSBucketName - bucketID = bucketName - } else if in.config.BucketARN != "" { - bucketName = getBucketNameFromARN(in.config.BucketARN) - bucketID = in.config.BucketARN - } - - s3Client := s3.NewFromConfig(in.awsConfig, func(o *s3.Options) { - if in.config.NonAWSBucketName != "" { - o.EndpointResolver = nonAWSBucketResolver{endpoint: in.config.AWSConfig.Endpoint} - } - - if in.config.AWSConfig.FIPSEnabled { - o.EndpointOptions.UseFIPSEndpoint = awssdk.FIPSEndpointStateEnabled - } - o.UsePathStyle = in.config.PathStyle - - o.Retryer = retry.NewStandard(func(so *retry.StandardOptions) { - 
so.MaxAttempts = 5 - // Recover quickly when requests start working again - so.NoRetryIncrement = 100 - }) - }) - regionName, err := getRegionForBucket(cancelCtx, s3Client, bucketName) - if err != nil { - return nil, fmt.Errorf("failed to get AWS region for bucket: %w", err) - } - - originalAwsConfigRegion := in.awsConfig.Region - - in.awsConfig.Region = regionName - - if regionName != originalAwsConfigRegion { - s3Client = s3.NewFromConfig(in.awsConfig, func(o *s3.Options) { - if in.config.NonAWSBucketName != "" { - o.EndpointResolver = nonAWSBucketResolver{endpoint: in.config.AWSConfig.Endpoint} - } - - if in.config.AWSConfig.FIPSEnabled { - o.EndpointOptions.UseFIPSEndpoint = awssdk.FIPSEndpointStateEnabled - } - o.UsePathStyle = in.config.PathStyle - }) - } - - s3API := &awsS3API{ - client: s3Client, - } - - log := ctx.Logger.With("bucket", bucketID) - log.Infof("number_of_workers is set to %v.", in.config.NumberOfWorkers) - log.Infof("bucket_list_interval is set to %v.", in.config.BucketListInterval) - log.Infof("bucket_list_prefix is set to %v.", in.config.BucketListPrefix) - log.Infof("AWS region is set to %v.", in.awsConfig.Region) - - fileSelectors := in.config.FileSelectors - if len(in.config.FileSelectors) == 0 { - fileSelectors = []fileSelectorConfig{{ReaderConfig: in.config.ReaderConfig}} - } - in.metrics = newInputMetrics(ctx.ID, nil, in.config.MaxNumberOfMessages) - s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), in.metrics, s3API, fileSelectors, in.config.BackupConfig) - s3Poller := newS3Poller(log.Named("s3_poller"), - in.metrics, - s3API, - client, - s3EventHandlerFactory, - states, - bucketID, - in.config.BucketListPrefix, - in.awsConfig.Region, - getProviderFromDomain(in.config.AWSConfig.Endpoint, in.config.ProviderOverride), - in.config.NumberOfWorkers, - in.config.BucketListInterval) - - return s3Poller, nil -} - -var errBadQueueURL = errors.New("QueueURL is not in format: https://sqs.{REGION_ENDPOINT}.{ENDPOINT}/{ACCOUNT_NUMBER}/{QUEUE_NAME} or https://{VPC_ENDPOINT}.sqs.{REGION_ENDPOINT}.vpce.{ENDPOINT}/{ACCOUNT_NUMBER}/{QUEUE_NAME}") - -func getRegionFromQueueURL(queueURL, endpoint string) (string, error) { - // get region from queueURL - // Example for sqs queue: https://sqs.us-east-1.amazonaws.com/12345678912/test-s3-logs - // Example for vpce: https://vpce-test.sqs.us-east-1.vpce.amazonaws.com/12345678912/sqs-queue - u, err := url.Parse(queueURL) - if err != nil { - return "", fmt.Errorf(queueURL + " is not a valid URL") - } - if (u.Scheme == "https" || u.Scheme == "http") && u.Host != "" { - queueHostSplit := strings.SplitN(u.Host, ".", 3) - // check for sqs queue url - if len(queueHostSplit) == 3 && queueHostSplit[0] == "sqs" { - if queueHostSplit[2] == endpoint || (endpoint == "" && strings.HasPrefix(queueHostSplit[2], "amazonaws.")) { - return queueHostSplit[1], nil - } - } - - // check for vpce url - queueHostSplitVPC := strings.SplitN(u.Host, ".", 5) - if len(queueHostSplitVPC) == 5 && queueHostSplitVPC[1] == "sqs" { - if queueHostSplitVPC[4] == endpoint || (endpoint == "" && strings.HasPrefix(queueHostSplitVPC[4], "amazonaws.")) { - return queueHostSplitVPC[2], nil - } - } - } - return "", errBadQueueURL -} - -func getRegionForBucket(ctx context.Context, s3Client *s3.Client, bucketName string) (string, error) { - getBucketLocationOutput, err := s3Client.GetBucketLocation(ctx, &s3.GetBucketLocationInput{ - Bucket: awssdk.String(bucketName), - }) - - if err != nil { - return "", err - } - - // Region us-east-1 have a LocationConstraint 
of null. - if len(getBucketLocationOutput.LocationConstraint) == 0 { - return "us-east-1", nil - } - - return string(getBucketLocationOutput.LocationConstraint), nil -} - -func getBucketNameFromARN(bucketARN string) string { - bucketMetadata := strings.Split(bucketARN, ":") - bucketName := bucketMetadata[len(bucketMetadata)-1] - return bucketName -} - -func getProviderFromDomain(endpoint string, ProviderOverride string) string { - if ProviderOverride != "" { - return ProviderOverride - } - if endpoint == "" { - return "aws" - } - // List of popular S3 SaaS providers - providers := map[string]string{ - "amazonaws.com": "aws", - "c2s.sgov.gov": "aws", - "c2s.ic.gov": "aws", - "amazonaws.com.cn": "aws", - "backblazeb2.com": "backblaze", - "cloudflarestorage.com": "cloudflare", - "wasabisys.com": "wasabi", - "digitaloceanspaces.com": "digitalocean", - "dream.io": "dreamhost", - "scw.cloud": "scaleway", - "googleapis.com": "gcp", - "cloud.it": "arubacloud", - "linodeobjects.com": "linode", - "vultrobjects.com": "vultr", - "appdomain.cloud": "ibm", - "aliyuncs.com": "alibaba", - "oraclecloud.com": "oracle", - "exo.io": "exoscale", - "upcloudobjects.com": "upcloud", - "ilandcloud.com": "iland", - "zadarazios.com": "zadara", - } - - parsedEndpoint, _ := url.Parse(endpoint) - for key, provider := range providers { - // support endpoint with and without scheme (http(s)://abc.xyz, abc.xyz) - constraint := parsedEndpoint.Hostname() - if len(parsedEndpoint.Scheme) == 0 { - constraint = parsedEndpoint.Path - } - if strings.HasSuffix(constraint, key) { - return provider - } - } - return "unknown" -} - -func pollSqsWaitingMetric(ctx context.Context, receiver *sqsReader) { - // Run GetApproximateMessageCount before start of timer to set initial count for sqs waiting metric - // This is to avoid misleading values in metric when sqs messages are processed before the ticker channel kicks in - if shouldReturn := updateMessageCount(receiver, ctx); shouldReturn { - return - } - - t := time.NewTicker(time.Minute) - defer t.Stop() - for { - select { - case <-ctx.Done(): - return - case <-t.C: - if shouldReturn := updateMessageCount(receiver, ctx); shouldReturn { - return - } - } - } -} - -// updateMessageCount runs GetApproximateMessageCount for the given context and updates the receiver metric with the count returning false on no error -// If there is an error, the metric is reinitialized to -1 and true is returned -func updateMessageCount(receiver *sqsReader, ctx context.Context) bool { - count, err := receiver.GetApproximateMessageCount(ctx) - - var apiError smithy.APIError - if errors.As(err, &apiError) { - switch apiError.ErrorCode() { - case sqsAccessDeniedErrorCode: - // stop polling if auth error is encountered - // Set it back to -1 because there is a permission error - receiver.metrics.sqsMessagesWaiting.Set(int64(-1)) - return true - } - } + return nil, fmt.Errorf("configuration has no SQS queue URL and no S3 bucket ARN") - receiver.metrics.sqsMessagesWaiting.Set(int64(count)) - return false + // return &s3Input{ + // config: config, + // awsConfig: awsConfig, + // store: store, + // }, nil } // boolPtr returns a pointer to b. 
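The s3.go diff below moves the bucket helpers out of input.go. As a quick standalone sketch (not part of the patch), the convention those helpers rely on is that the bucket name is simply the last colon-separated component of the bucket ARN, which is why getBucketNameFromARN in the moved code only needs strings.Split:

package main

import (
	"fmt"
	"strings"
)

// bucketNameFromARN mirrors the behaviour of the getBucketNameFromARN helper
// in the patch: the bucket name is the final ":"-separated field of the ARN.
func bucketNameFromARN(bucketARN string) string {
	parts := strings.Split(bucketARN, ":")
	return parts[len(parts)-1]
}

func main() {
	fmt.Println(bucketNameFromARN("arn:aws:s3:::my-log-bucket")) // my-log-bucket
}
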
diff --git a/x-pack/filebeat/input/awss3/s3.go b/x-pack/filebeat/input/awss3/s3.go index 1c3331ce28a9..cdba12ca86d9 100644 --- a/x-pack/filebeat/input/awss3/s3.go +++ b/x-pack/filebeat/input/awss3/s3.go @@ -8,11 +8,18 @@ import ( "context" "errors" "fmt" + "net/url" + "strings" "sync" "time" + awssdk "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/aws/ratelimit" + "github.com/aws/aws-sdk-go-v2/aws/retry" + "github.com/aws/aws-sdk-go-v2/service/s3" + "github.com/elastic/beats/v7/filebeat/beater" + v2 "github.com/elastic/beats/v7/filebeat/input/v2" "github.com/elastic/beats/v7/libbeat/beat" "github.com/elastic/beats/v7/libbeat/common/backoff" awscommon "github.com/elastic/beats/v7/x-pack/libbeat/common/aws" @@ -29,6 +36,12 @@ type s3ObjectPayload struct { objectState state } +type s3PollerInput struct { + config config + awsConfig awssdk.Config + store beater.StateStore +} + type s3Poller struct { numberOfWorkers int bucket string @@ -45,6 +58,148 @@ type s3Poller struct { workersProcessingMap *sync.Map } +func (in *s3PollerInput) Name() string { return inputName } + +func (in *s3PollerInput) Test(ctx v2.TestContext) error { + return nil +} + +func newS3PollerInput( + config config, + awsConfig awssdk.Config, + store beater.StateStore, +) (v2.Input, error) { + return &s3PollerInput{ + config: config, + awsConfig: awsConfig, + store: store, + }, nil +} + +func (in *s3PollerInput) Run( + inputContext v2.Context, + pipeline beat.Pipeline, +) error { + ctx := v2.GoContextFromCanceler(inputContext.Cancelation) + + // Create client for publishing events and receive notification of their ACKs. + client, err := pipeline.ConnectWith(beat.ClientConfig{ + EventListener: awscommon.NewEventACKHandler(), + Processing: beat.ProcessingConfig{ + // This input only produces events with basic types so normalization + // is not required. + EventNormalization: boolPtr(false), + }, + }) + if err != nil { + return fmt.Errorf("failed to create pipeline client: %w", err) + } + defer client.Close() + + // Connect to the registry and create our states lookup + persistentStore, err := in.store.Access() + if err != nil { + return fmt.Errorf("can not access persistent store: %w", err) + } + defer persistentStore.Close() + + states, err := newStates(inputContext.Logger, persistentStore) + if err != nil { + return fmt.Errorf("can not start persistent store: %w", err) + } + + // Create S3 receiver and S3 notification processor. 
+ poller, err := in.createS3Poller(inputContext, ctx, client, states) + if err != nil { + return fmt.Errorf("failed to initialize s3 poller: %w", err) + } + defer poller.metrics.Close() + + poller.Poll(ctx) + return nil +} + +func (in *s3PollerInput) createS3Poller(ctx v2.Context, cancelCtx context.Context, client beat.Client, states *states) (*s3Poller, error) { + var bucketName string + var bucketID string + if in.config.NonAWSBucketName != "" { + bucketName = in.config.NonAWSBucketName + bucketID = bucketName + } else if in.config.BucketARN != "" { + bucketName = getBucketNameFromARN(in.config.BucketARN) + bucketID = in.config.BucketARN + } + + s3Client := s3.NewFromConfig(in.awsConfig, func(o *s3.Options) { + if in.config.NonAWSBucketName != "" { + o.EndpointResolver = nonAWSBucketResolver{endpoint: in.config.AWSConfig.Endpoint} + } + + if in.config.AWSConfig.FIPSEnabled { + o.EndpointOptions.UseFIPSEndpoint = awssdk.FIPSEndpointStateEnabled + } + o.UsePathStyle = in.config.PathStyle + + o.Retryer = retry.NewStandard(func(so *retry.StandardOptions) { + so.MaxAttempts = 5 + // Recover quickly when requests start working again + so.NoRetryIncrement = 100 + }) + }) + regionName, err := getRegionForBucket(cancelCtx, s3Client, bucketName) + if err != nil { + return nil, fmt.Errorf("failed to get AWS region for bucket: %w", err) + } + + originalAwsConfigRegion := in.awsConfig.Region + + in.awsConfig.Region = regionName + + if regionName != originalAwsConfigRegion { + s3Client = s3.NewFromConfig(in.awsConfig, func(o *s3.Options) { + if in.config.NonAWSBucketName != "" { + o.EndpointResolver = nonAWSBucketResolver{endpoint: in.config.AWSConfig.Endpoint} + } + + if in.config.AWSConfig.FIPSEnabled { + o.EndpointOptions.UseFIPSEndpoint = awssdk.FIPSEndpointStateEnabled + } + o.UsePathStyle = in.config.PathStyle + }) + } + + s3API := &awsS3API{ + client: s3Client, + } + + log := ctx.Logger.With("bucket", bucketID) + log.Infof("number_of_workers is set to %v.", in.config.NumberOfWorkers) + log.Infof("bucket_list_interval is set to %v.", in.config.BucketListInterval) + log.Infof("bucket_list_prefix is set to %v.", in.config.BucketListPrefix) + log.Infof("AWS region is set to %v.", in.awsConfig.Region) + + fileSelectors := in.config.FileSelectors + if len(in.config.FileSelectors) == 0 { + fileSelectors = []fileSelectorConfig{{ReaderConfig: in.config.ReaderConfig}} + } + metrics := newInputMetrics(ctx.ID, nil, in.config.MaxNumberOfMessages) + s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), metrics, s3API, fileSelectors, in.config.BackupConfig) + s3Poller := newS3Poller(log.Named("s3_poller"), + metrics, + s3API, + client, + s3EventHandlerFactory, + states, + bucketID, + in.config.BucketListPrefix, + in.awsConfig.Region, + getProviderFromDomain(in.config.AWSConfig.Endpoint, in.config.ProviderOverride), + in.config.NumberOfWorkers, + in.config.BucketListInterval) + + return s3Poller, nil +} + func newS3Poller(log *logp.Logger, metrics *inputMetrics, s3 s3API, @@ -209,3 +364,80 @@ func (p *s3Poller) Poll(ctx context.Context) { _ = timed.Wait(ctx, p.bucketPollInterval) } } + +func getRegionForBucket(ctx context.Context, s3Client *s3.Client, bucketName string) (string, error) { + getBucketLocationOutput, err := s3Client.GetBucketLocation(ctx, &s3.GetBucketLocationInput{ + Bucket: awssdk.String(bucketName), + }) + + if err != nil { + return "", err + } + + // Region us-east-1 have a LocationConstraint of null. 
+ if len(getBucketLocationOutput.LocationConstraint) == 0 { + return "us-east-1", nil + } + + return string(getBucketLocationOutput.LocationConstraint), nil +} + +func getBucketNameFromARN(bucketARN string) string { + bucketMetadata := strings.Split(bucketARN, ":") + bucketName := bucketMetadata[len(bucketMetadata)-1] + return bucketName +} + +func getProviderFromDomain(endpoint string, ProviderOverride string) string { + if ProviderOverride != "" { + return ProviderOverride + } + if endpoint == "" { + return "aws" + } + // List of popular S3 SaaS providers + providers := map[string]string{ + "amazonaws.com": "aws", + "c2s.sgov.gov": "aws", + "c2s.ic.gov": "aws", + "amazonaws.com.cn": "aws", + "backblazeb2.com": "backblaze", + "cloudflarestorage.com": "cloudflare", + "wasabisys.com": "wasabi", + "digitaloceanspaces.com": "digitalocean", + "dream.io": "dreamhost", + "scw.cloud": "scaleway", + "googleapis.com": "gcp", + "cloud.it": "arubacloud", + "linodeobjects.com": "linode", + "vultrobjects.com": "vultr", + "appdomain.cloud": "ibm", + "aliyuncs.com": "alibaba", + "oraclecloud.com": "oracle", + "exo.io": "exoscale", + "upcloudobjects.com": "upcloud", + "ilandcloud.com": "iland", + "zadarazios.com": "zadara", + } + + parsedEndpoint, _ := url.Parse(endpoint) + for key, provider := range providers { + // support endpoint with and without scheme (http(s)://abc.xyz, abc.xyz) + constraint := parsedEndpoint.Hostname() + if len(parsedEndpoint.Scheme) == 0 { + constraint = parsedEndpoint.Path + } + if strings.HasSuffix(constraint, key) { + return provider + } + } + return "unknown" +} + +type nonAWSBucketResolver struct { + endpoint string +} + +func (n nonAWSBucketResolver) ResolveEndpoint(region string, options s3.EndpointResolverOptions) (awssdk.Endpoint, error) { + return awssdk.Endpoint{URL: n.endpoint, SigningRegion: region, HostnameImmutable: true, Source: awssdk.EndpointSourceCustom}, nil +} diff --git a/x-pack/filebeat/input/awss3/sqs.go b/x-pack/filebeat/input/awss3/sqs.go index 0b005e484116..5f7c73e1bd17 100644 --- a/x-pack/filebeat/input/awss3/sqs.go +++ b/x-pack/filebeat/input/awss3/sqs.go @@ -6,12 +6,23 @@ package awss3 import ( "context" + "errors" + "fmt" + "net/url" "strconv" + "strings" "sync" "time" + awssdk "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/service/s3" + "github.com/aws/aws-sdk-go-v2/service/sqs" "github.com/aws/aws-sdk-go-v2/service/sqs/types" + "github.com/aws/smithy-go" + "github.com/elastic/beats/v7/filebeat/beater" + v2 "github.com/elastic/beats/v7/filebeat/input/v2" + "github.com/elastic/beats/v7/libbeat/beat" "github.com/elastic/beats/v7/libbeat/common/atomic" "github.com/elastic/elastic-agent-libs/logp" ) @@ -21,6 +32,11 @@ const ( sqsApproximateNumberOfMessages = "ApproximateNumberOfMessages" ) +type sqsReaderInput struct { + config config + awsConfig awssdk.Config +} + type sqsReader struct { maxMessagesInflight int activeMessages atomic.Int @@ -37,6 +53,105 @@ type sqsReader struct { workerWg sync.WaitGroup } +func newSQSReaderInput(config config, + awsConfig awssdk.Config, + store beater.StateStore, +) (v2.Input, error) { + return &sqsReaderInput{ + config: config, + awsConfig: awsConfig, + }, nil +} + +func (in *sqsReaderInput) Name() string { return inputName } + +func (in *sqsReaderInput) Test(ctx v2.TestContext) error { + return nil +} + +func (in *sqsReaderInput) Run( + inputContext v2.Context, + pipeline beat.Pipeline, +) error { + ctx := 
v2.GoContextFromCanceler(inputContext.Cancelation) + configRegion := in.config.RegionName + urlRegion, err := getRegionFromQueueURL(in.config.QueueURL, in.config.AWSConfig.Endpoint) + if err != nil && configRegion == "" { + // Only report an error if we don't have a configured region + // to fall back on. + return fmt.Errorf("failed to get AWS region from queue_url: %w", err) + } else if configRegion != "" && configRegion != urlRegion { + inputContext.Logger.Warnf("configured region disagrees with queue_url region (%q != %q): using %q", configRegion, urlRegion, urlRegion) + } + + in.awsConfig.Region = urlRegion + + // Create SQS receiver and S3 notification processor. + receiver, err := in.createSQSReceiver(inputContext, pipeline) + if err != nil { + return fmt.Errorf("failed to initialize sqs receiver: %w", err) + } + defer receiver.metrics.Close() + + // Poll metrics periodically in the background + go pollSqsWaitingMetric(ctx, receiver) + + receiver.Receive(ctx) + return nil +} + +func (in *sqsReaderInput) createSQSReceiver(ctx v2.Context, pipeline beat.Pipeline) (*sqsReader, error) { + sqsAPI := &awsSQSAPI{ + client: sqs.NewFromConfig(in.awsConfig, func(o *sqs.Options) { + if in.config.AWSConfig.FIPSEnabled { + o.EndpointOptions.UseFIPSEndpoint = awssdk.FIPSEndpointStateEnabled + } + }), + queueURL: in.config.QueueURL, + apiTimeout: in.config.APITimeout, + visibilityTimeout: in.config.VisibilityTimeout, + longPollWaitTime: in.config.SQSWaitTime, + } + + s3API := &awsS3API{ + client: s3.NewFromConfig(in.awsConfig, func(o *s3.Options) { + if in.config.AWSConfig.FIPSEnabled { + o.EndpointOptions.UseFIPSEndpoint = awssdk.FIPSEndpointStateEnabled + } + o.UsePathStyle = in.config.PathStyle + }), + } + + log := ctx.Logger.With("queue_url", in.config.QueueURL) + log.Infof("AWS api_timeout is set to %v.", in.config.APITimeout) + log.Infof("AWS region is set to %v.", in.awsConfig.Region) + log.Infof("AWS SQS visibility_timeout is set to %v.", in.config.VisibilityTimeout) + log.Infof("AWS SQS max_number_of_messages is set to %v.", in.config.MaxNumberOfMessages) + + if in.config.BackupConfig.GetBucketName() != "" { + log.Warnf("You have the backup_to_bucket functionality activated with SQS. Please make sure to set appropriate destination buckets" + + "or prefixes to avoid an infinite loop.") + } + + fileSelectors := in.config.FileSelectors + if len(in.config.FileSelectors) == 0 { + fileSelectors = []fileSelectorConfig{{ReaderConfig: in.config.ReaderConfig}} + } + script, err := newScriptFromConfig(log.Named("sqs_script"), in.config.SQSScript) + if err != nil { + return nil, err + } + metrics := newInputMetrics(ctx.ID, nil, in.config.MaxNumberOfMessages) + + s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), metrics, s3API, fileSelectors, in.config.BackupConfig) + + sqsMessageHandler := newSQSS3EventProcessor(log.Named("sqs_s3_event"), metrics, sqsAPI, script, in.config.VisibilityTimeout, in.config.SQSMaxReceiveCount, pipeline, s3EventHandlerFactory) + + sqsReader := newSQSReader(log.Named("sqs"), metrics, sqsAPI, in.config.MaxNumberOfMessages, sqsMessageHandler) + + return sqsReader, nil +} + func newSQSReader(log *logp.Logger, metrics *inputMetrics, sqs sqsAPI, maxMessagesInflight int, msgHandler sqsProcessor) *sqsReader { if metrics == nil { // Metrics are optional. Initialize a stub. 
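The next hunk relocates getRegionFromQueueURL and the SQS waiting-metric poller into sqs.go. As a rough, self-contained sketch (not part of the patch, covering only the plain sqs.<region>.amazonaws.com case and not VPC or custom endpoints), the region extraction amounts to taking the second dot-separated label of the queue URL's host:

package main

import (
	"fmt"
	"net/url"
	"strings"
)

func main() {
	// Example queue URL; the account ID and queue name here are made up.
	queueURL := "https://sqs.us-east-1.amazonaws.com/123456789012/test-s3-logs"

	u, err := url.Parse(queueURL)
	if err != nil {
		panic(err)
	}
	// Host has the form "sqs.<region>.<endpoint>", so the region is the
	// second label. The real helper also validates the scheme, the endpoint
	// suffix, and the VPC-endpoint form before trusting this value.
	parts := strings.SplitN(u.Host, ".", 3)
	if len(parts) == 3 && parts[0] == "sqs" {
		fmt.Println("region:", parts[1]) // region: us-east-1
	}
}
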
@@ -141,3 +256,74 @@ func (r *sqsReader) GetApproximateMessageCount(ctx context.Context) (int, error) } return -1, err } + +var errBadQueueURL = errors.New("QueueURL is not in format: https://sqs.{REGION_ENDPOINT}.{ENDPOINT}/{ACCOUNT_NUMBER}/{QUEUE_NAME} or https://{VPC_ENDPOINT}.sqs.{REGION_ENDPOINT}.vpce.{ENDPOINT}/{ACCOUNT_NUMBER}/{QUEUE_NAME}") + +func getRegionFromQueueURL(queueURL, endpoint string) (string, error) { + // get region from queueURL + // Example for sqs queue: https://sqs.us-east-1.amazonaws.com/12345678912/test-s3-logs + // Example for vpce: https://vpce-test.sqs.us-east-1.vpce.amazonaws.com/12345678912/sqs-queue + u, err := url.Parse(queueURL) + if err != nil { + return "", fmt.Errorf(queueURL + " is not a valid URL") + } + if (u.Scheme == "https" || u.Scheme == "http") && u.Host != "" { + queueHostSplit := strings.SplitN(u.Host, ".", 3) + // check for sqs queue url + if len(queueHostSplit) == 3 && queueHostSplit[0] == "sqs" { + if queueHostSplit[2] == endpoint || (endpoint == "" && strings.HasPrefix(queueHostSplit[2], "amazonaws.")) { + return queueHostSplit[1], nil + } + } + + // check for vpce url + queueHostSplitVPC := strings.SplitN(u.Host, ".", 5) + if len(queueHostSplitVPC) == 5 && queueHostSplitVPC[1] == "sqs" { + if queueHostSplitVPC[4] == endpoint || (endpoint == "" && strings.HasPrefix(queueHostSplitVPC[4], "amazonaws.")) { + return queueHostSplitVPC[2], nil + } + } + } + return "", errBadQueueURL +} + +func pollSqsWaitingMetric(ctx context.Context, receiver *sqsReader) { + // Run GetApproximateMessageCount before start of timer to set initial count for sqs waiting metric + // This is to avoid misleading values in metric when sqs messages are processed before the ticker channel kicks in + if shouldReturn := updateMessageCount(receiver, ctx); shouldReturn { + return + } + + t := time.NewTicker(time.Minute) + defer t.Stop() + for { + select { + case <-ctx.Done(): + return + case <-t.C: + if shouldReturn := updateMessageCount(receiver, ctx); shouldReturn { + return + } + } + } +} + +// updateMessageCount runs GetApproximateMessageCount for the given context and updates the receiver metric with the count returning false on no error +// If there is an error, the metric is reinitialized to -1 and true is returned +func updateMessageCount(receiver *sqsReader, ctx context.Context) bool { + count, err := receiver.GetApproximateMessageCount(ctx) + + var apiError smithy.APIError + if errors.As(err, &apiError) { + switch apiError.ErrorCode() { + case sqsAccessDeniedErrorCode: + // stop polling if auth error is encountered + // Set it back to -1 because there is a permission error + receiver.metrics.sqsMessagesWaiting.Set(int64(-1)) + return true + } + } + + receiver.metrics.sqsMessagesWaiting.Set(int64(count)) + return false +} From dbe4691b18c621e5c8c9c3d2b6141a024495610a Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Mon, 29 Apr 2024 09:56:59 -0400 Subject: [PATCH 42/99] splitting awss3 into two input objects --- x-pack/filebeat/input/awss3/config.go | 47 +++++ x-pack/filebeat/input/awss3/input.go | 40 +--- .../input/awss3/input_benchmark_test.go | 2 +- x-pack/filebeat/input/awss3/s3.go | 178 +++++++----------- x-pack/filebeat/input/awss3/sqs.go | 7 +- x-pack/filebeat/input/awss3/states.go | 77 +++----- 6 files changed, 154 insertions(+), 197 deletions(-) diff --git a/x-pack/filebeat/input/awss3/config.go b/x-pack/filebeat/input/awss3/config.go index bf29b641f6d4..771d09f76ec6 100644 --- a/x-pack/filebeat/input/awss3/config.go +++ 
b/x-pack/filebeat/input/awss3/config.go @@ -9,6 +9,9 @@ import ( "fmt" "time" + awssdk "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/aws/retry" + "github.com/aws/aws-sdk-go-v2/service/s3" "github.com/dustin/go-humanize" "github.com/elastic/beats/v7/libbeat/common/cfgtype" @@ -222,3 +225,47 @@ func (rc *readerConfig) InitDefaults() { rc.MaxBytes = 10 * humanize.MiByte rc.LineTerminator = readfile.AutoLineTerminator } + +func (c config) getBucketName() string { + if c.NonAWSBucketName != "" { + return c.NonAWSBucketName + } + if c.BucketARN != "" { + return getBucketNameFromARN(c.BucketARN) + } + return "" +} + +func (c config) getBucketARN() string { + if c.NonAWSBucketName != "" { + return c.NonAWSBucketName + } + if c.BucketARN != "" { + return c.BucketARN + } + return "" +} + +func (c config) s3OptionsFn(o *s3.Options) { + if c.NonAWSBucketName != "" { + o.EndpointResolver = nonAWSBucketResolver{endpoint: c.AWSConfig.Endpoint} + } + + if c.AWSConfig.FIPSEnabled { + o.EndpointOptions.UseFIPSEndpoint = awssdk.FIPSEndpointStateEnabled + } + o.UsePathStyle = c.PathStyle + + o.Retryer = retry.NewStandard(func(so *retry.StandardOptions) { + so.MaxAttempts = 5 + // Recover quickly when requests start working again + so.NoRetryIncrement = 100 + }) +} + +func (c config) getFileSelectors() []fileSelectorConfig { + if len(c.FileSelectors) == 0 { + return []fileSelectorConfig{{ReaderConfig: c.ReaderConfig}} + } + return c.FileSelectors +} diff --git a/x-pack/filebeat/input/awss3/input.go b/x-pack/filebeat/input/awss3/input.go index c1920e2814d3..fb5f23881452 100644 --- a/x-pack/filebeat/input/awss3/input.go +++ b/x-pack/filebeat/input/awss3/input.go @@ -17,10 +17,7 @@ import ( "github.com/elastic/go-concert/unison" ) -const ( - inputName = "aws-s3" - sqsAccessDeniedErrorCode = "AccessDeniedException" -) +const inputName = "aws-s3" func Plugin(store beater.StateStore) v2.Plugin { return v2.Plugin{ @@ -36,14 +33,6 @@ type s3InputManager struct { store beater.StateStore } -// s3Input is a input for reading logs from S3 when triggered by an SQS message. 
-type s3Input struct { - config config - awsConfig awssdk.Config - store beater.StateStore - metrics *inputMetrics -} - func (im *s3InputManager) Init(grp unison.Group, mode v2.Mode) error { return nil } @@ -54,11 +43,10 @@ func (im *s3InputManager) Create(cfg *conf.C) (v2.Input, error) { return nil, err } - return newInput(config, im.store) -} - -func newInput(config config, store beater.StateStore) (v2.Input, error) { awsConfig, err := awscommon.InitializeAWSConfig(config.AWSConfig) + if err != nil { + return nil, fmt.Errorf("initializing AWS config: %w", err) + } if config.AWSConfig.Endpoint != "" { // Add a custom endpointResolver to the awsConfig so that all the requests are routed to this endpoint @@ -71,27 +59,19 @@ func newInput(config config, store beater.StateStore) (v2.Input, error) { }) } - if err != nil { - return nil, fmt.Errorf("failed to initialize AWS credentials: %w", err) - } - if config.QueueURL != "" { - return newSQSReaderInput(config, awsConfig, store) - //return in.runQueueReader(ctx, inputContext, pipeline) + return newSQSReaderInput(config, awsConfig) } if config.BucketARN != "" || config.NonAWSBucketName != "" { - return newS3PollerInput(config, awsConfig, store) - //return in.runS3Poller(ctx, inputContext, pipeline) + persistentStore, err := im.store.Access() + if err != nil { + return nil, fmt.Errorf("can not access persistent store: %w", err) + } + return newS3PollerInput(config, awsConfig, persistentStore) } return nil, fmt.Errorf("configuration has no SQS queue URL and no S3 bucket ARN") - - // return &s3Input{ - // config: config, - // awsConfig: awsConfig, - // store: store, - // }, nil } // boolPtr returns a pointer to b. diff --git a/x-pack/filebeat/input/awss3/input_benchmark_test.go b/x-pack/filebeat/input/awss3/input_benchmark_test.go index dd421f1a590d..de17583a4194 100644 --- a/x-pack/filebeat/input/awss3/input_benchmark_test.go +++ b/x-pack/filebeat/input/awss3/input_benchmark_test.go @@ -343,7 +343,7 @@ func benchmarkInputS3(t *testing.T, numberOfWorkers int) testing.BenchmarkResult assert.NoError(t, err, "states creation should succeed") s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), metrics, s3API, config.FileSelectors, backupConfig{}) - s3Poller := newS3Poller(logp.NewLogger(inputName), metrics, s3API, client, s3EventHandlerFactory, states, "bucket", listPrefix, "region", "provider", numberOfWorkers, time.Second) + s3Poller := newS3Poller(logp.NewLogger(inputName), config, nil, metrics, s3API, client, s3EventHandlerFactory, states, "bucket", listPrefix, "region", "provider", time.Second) s3Poller.Poll(ctx) }(i, wg) diff --git a/x-pack/filebeat/input/awss3/s3.go b/x-pack/filebeat/input/awss3/s3.go index cdba12ca86d9..9b32e9ba725d 100644 --- a/x-pack/filebeat/input/awss3/s3.go +++ b/x-pack/filebeat/input/awss3/s3.go @@ -11,17 +11,15 @@ import ( "net/url" "strings" "sync" - "time" awssdk "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/aws/ratelimit" - "github.com/aws/aws-sdk-go-v2/aws/retry" "github.com/aws/aws-sdk-go-v2/service/s3" - "github.com/elastic/beats/v7/filebeat/beater" v2 "github.com/elastic/beats/v7/filebeat/input/v2" "github.com/elastic/beats/v7/libbeat/beat" "github.com/elastic/beats/v7/libbeat/common/backoff" + "github.com/elastic/beats/v7/libbeat/statestore" awscommon "github.com/elastic/beats/v7/x-pack/libbeat/common/aws" "github.com/elastic/elastic-agent-libs/logp" 
"github.com/elastic/go-concert/timed" @@ -39,23 +37,19 @@ type s3ObjectPayload struct { type s3PollerInput struct { config config awsConfig awssdk.Config - store beater.StateStore + store *statestore.Store } type s3Poller struct { - numberOfWorkers int - bucket string - listPrefix string - region string - provider string - bucketPollInterval time.Duration - s3 s3API - log *logp.Logger - metrics *inputMetrics - client beat.Client - s3ObjectHandler s3ObjectHandlerFactory - states *states - workersProcessingMap *sync.Map + log *logp.Logger + config config + awsConfig awssdk.Config + provider string + s3 s3API + metrics *inputMetrics + client beat.Client + s3ObjectHandler s3ObjectHandlerFactory + states *states } func (in *s3PollerInput) Name() string { return inputName } @@ -67,8 +61,9 @@ func (in *s3PollerInput) Test(ctx v2.TestContext) error { func newS3PollerInput( config config, awsConfig awssdk.Config, - store beater.StateStore, + store *statestore.Store, ) (v2.Input, error) { + return &s3PollerInput{ config: config, awsConfig: awsConfig, @@ -82,6 +77,13 @@ func (in *s3PollerInput) Run( ) error { ctx := v2.GoContextFromCanceler(inputContext.Cancelation) + defer in.store.Close() + + states, err := newStates(inputContext.Logger, in.store) + if err != nil { + return fmt.Errorf("can not start persistent store: %w", err) + } + // Create client for publishing events and receive notification of their ACKs. client, err := pipeline.ConnectWith(beat.ClientConfig{ EventListener: awscommon.NewEventACKHandler(), @@ -96,20 +98,8 @@ func (in *s3PollerInput) Run( } defer client.Close() - // Connect to the registry and create our states lookup - persistentStore, err := in.store.Access() - if err != nil { - return fmt.Errorf("can not access persistent store: %w", err) - } - defer persistentStore.Close() - - states, err := newStates(inputContext.Logger, persistentStore) - if err != nil { - return fmt.Errorf("can not start persistent store: %w", err) - } - // Create S3 receiver and S3 notification processor. 
- poller, err := in.createS3Poller(inputContext, ctx, client, states) + poller, err := in.createS3Poller(inputContext.Logger, inputContext.ID, ctx, client, states) if err != nil { return fmt.Errorf("failed to initialize s3 poller: %w", err) } @@ -119,127 +109,82 @@ func (in *s3PollerInput) Run( return nil } -func (in *s3PollerInput) createS3Poller(ctx v2.Context, cancelCtx context.Context, client beat.Client, states *states) (*s3Poller, error) { - var bucketName string - var bucketID string - if in.config.NonAWSBucketName != "" { - bucketName = in.config.NonAWSBucketName - bucketID = bucketName - } else if in.config.BucketARN != "" { - bucketName = getBucketNameFromARN(in.config.BucketARN) - bucketID = in.config.BucketARN - } - - s3Client := s3.NewFromConfig(in.awsConfig, func(o *s3.Options) { - if in.config.NonAWSBucketName != "" { - o.EndpointResolver = nonAWSBucketResolver{endpoint: in.config.AWSConfig.Endpoint} - } - - if in.config.AWSConfig.FIPSEnabled { - o.EndpointOptions.UseFIPSEndpoint = awssdk.FIPSEndpointStateEnabled - } - o.UsePathStyle = in.config.PathStyle - - o.Retryer = retry.NewStandard(func(so *retry.StandardOptions) { - so.MaxAttempts = 5 - // Recover quickly when requests start working again - so.NoRetryIncrement = 100 - }) - }) - regionName, err := getRegionForBucket(cancelCtx, s3Client, bucketName) +func (in *s3PollerInput) createS3API(ctx context.Context) (*awsS3API, error) { + s3Client := s3.NewFromConfig(in.awsConfig, in.config.s3OptionsFn) + regionName, err := getRegionForBucket(ctx, s3Client, in.config.getBucketName()) if err != nil { return nil, fmt.Errorf("failed to get AWS region for bucket: %w", err) } - - originalAwsConfigRegion := in.awsConfig.Region - - in.awsConfig.Region = regionName - - if regionName != originalAwsConfigRegion { - s3Client = s3.NewFromConfig(in.awsConfig, func(o *s3.Options) { - if in.config.NonAWSBucketName != "" { - o.EndpointResolver = nonAWSBucketResolver{endpoint: in.config.AWSConfig.Endpoint} - } - - if in.config.AWSConfig.FIPSEnabled { - o.EndpointOptions.UseFIPSEndpoint = awssdk.FIPSEndpointStateEnabled - } - o.UsePathStyle = in.config.PathStyle - }) + // Can this really happen? 
+ if regionName != in.awsConfig.Region { + in.awsConfig.Region = regionName + s3Client = s3.NewFromConfig(in.awsConfig, in.config.s3OptionsFn) } - s3API := &awsS3API{ + return &awsS3API{ client: s3Client, + }, nil +} + +func (in *s3PollerInput) createS3Poller(log *logp.Logger, inputID string, cancelCtx context.Context, client beat.Client, states *states) (*s3Poller, error) { + s3API, err := in.createS3API(cancelCtx) + if err != nil { + return nil, err } - log := ctx.Logger.With("bucket", bucketID) + log = log.With("bucket", in.config.getBucketARN()) log.Infof("number_of_workers is set to %v.", in.config.NumberOfWorkers) log.Infof("bucket_list_interval is set to %v.", in.config.BucketListInterval) log.Infof("bucket_list_prefix is set to %v.", in.config.BucketListPrefix) log.Infof("AWS region is set to %v.", in.awsConfig.Region) - fileSelectors := in.config.FileSelectors - if len(in.config.FileSelectors) == 0 { - fileSelectors = []fileSelectorConfig{{ReaderConfig: in.config.ReaderConfig}} - } - metrics := newInputMetrics(ctx.ID, nil, in.config.MaxNumberOfMessages) - s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), metrics, s3API, fileSelectors, in.config.BackupConfig) + metrics := newInputMetrics(inputID, nil, in.config.MaxNumberOfMessages) + s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), metrics, s3API, in.config.getFileSelectors(), in.config.BackupConfig) s3Poller := newS3Poller(log.Named("s3_poller"), + in.config, in.awsConfig, metrics, s3API, client, s3EventHandlerFactory, states, - bucketID, - in.config.BucketListPrefix, - in.awsConfig.Region, - getProviderFromDomain(in.config.AWSConfig.Endpoint, in.config.ProviderOverride), - in.config.NumberOfWorkers, - in.config.BucketListInterval) + getProviderFromDomain(in.config.AWSConfig.Endpoint, in.config.ProviderOverride)) return s3Poller, nil } func newS3Poller(log *logp.Logger, + config config, + awsConfig awssdk.Config, metrics *inputMetrics, s3 s3API, client beat.Client, s3ObjectHandler s3ObjectHandlerFactory, states *states, - bucket string, - listPrefix string, - awsRegion string, provider string, - numberOfWorkers int, - bucketPollInterval time.Duration, ) *s3Poller { if metrics == nil { // Metrics are optional. Initialize a stub. 
metrics = newInputMetrics("", nil, 0) } return &s3Poller{ - numberOfWorkers: numberOfWorkers, - bucket: bucket, - listPrefix: listPrefix, - region: awsRegion, - provider: provider, - bucketPollInterval: bucketPollInterval, - s3: s3, - log: log, - metrics: metrics, - client: client, - s3ObjectHandler: s3ObjectHandler, - states: states, - workersProcessingMap: new(sync.Map), + config: config, + awsConfig: awsConfig, + provider: provider, + s3: s3, + log: log, + metrics: metrics, + client: client, + s3ObjectHandler: s3ObjectHandler, + states: states, } } func (p *s3Poller) createS3ObjectProcessor(ctx context.Context, state state) s3ObjectHandler { event := s3EventV2{} - event.AWSRegion = p.region + event.AWSRegion = p.awsConfig.Region event.Provider = p.provider event.S3.Bucket.Name = state.Bucket - event.S3.Bucket.ARN = p.bucket + event.S3.Bucket.ARN = p.config.getBucketARN() event.S3.Object.Key = state.Key acker := awscommon.NewEventACKTracker(ctx) @@ -278,8 +223,11 @@ func (p *s3Poller) workerLoop(ctx context.Context, s3ObjectPayloadChan <-chan *s state.Stored = true } - // Persist the result - p.states.AddState(state) + // Persist the result, report any errors + err = p.states.AddState(state) + if err != nil { + p.log.Errorf("saving completed object state: %v", err.Error()) + } // Metrics p.metrics.s3ObjectsAckedTotal.Inc() @@ -289,11 +237,11 @@ func (p *s3Poller) workerLoop(ctx context.Context, s3ObjectPayloadChan <-chan *s func (p *s3Poller) readerLoop(ctx context.Context, s3ObjectPayloadChan chan<- *s3ObjectPayload) { defer close(s3ObjectPayloadChan) - bucketName := getBucketNameFromARN(p.bucket) + bucketName := getBucketNameFromARN(p.config.getBucketARN()) errorBackoff := backoff.NewEqualJitterBackoff(ctx.Done(), 1, 120) circuitBreaker := 0 - paginator := p.s3.ListObjectsPaginator(bucketName, p.listPrefix) + paginator := p.s3.ListObjectsPaginator(bucketName, p.config.BucketListPrefix) for paginator.HasMorePages() { page, err := paginator.NextPage(ctx) @@ -349,7 +297,7 @@ func (p *s3Poller) Poll(ctx context.Context) { workChan := make(chan *s3ObjectPayload) // Start the worker goroutines to listen on the work channel - for i := 0; i < p.numberOfWorkers; i++ { + for i := 0; i < p.config.NumberOfWorkers; i++ { workerWg.Add(1) go func() { defer workerWg.Done() @@ -361,7 +309,7 @@ func (p *s3Poller) Poll(ctx context.Context) { p.readerLoop(ctx, workChan) workerWg.Wait() - _ = timed.Wait(ctx, p.bucketPollInterval) + _ = timed.Wait(ctx, p.config.BucketListInterval) } } diff --git a/x-pack/filebeat/input/awss3/sqs.go b/x-pack/filebeat/input/awss3/sqs.go index 5f7c73e1bd17..a673dcf5a7a6 100644 --- a/x-pack/filebeat/input/awss3/sqs.go +++ b/x-pack/filebeat/input/awss3/sqs.go @@ -20,7 +20,6 @@ import ( "github.com/aws/aws-sdk-go-v2/service/sqs/types" "github.com/aws/smithy-go" - "github.com/elastic/beats/v7/filebeat/beater" v2 "github.com/elastic/beats/v7/filebeat/input/v2" "github.com/elastic/beats/v7/libbeat/beat" "github.com/elastic/beats/v7/libbeat/common/atomic" @@ -28,6 +27,7 @@ import ( ) const ( + sqsAccessDeniedErrorCode = "AccessDeniedException" sqsRetryDelay = 10 * time.Second sqsApproximateNumberOfMessages = "ApproximateNumberOfMessages" ) @@ -53,10 +53,7 @@ type sqsReader struct { workerWg sync.WaitGroup } -func newSQSReaderInput(config config, - awsConfig awssdk.Config, - store beater.StateStore, -) (v2.Input, error) { +func newSQSReaderInput(config config, awsConfig awssdk.Config) (v2.Input, error) { return &sqsReaderInput{ config: 
config, awsConfig: awsConfig, diff --git a/x-pack/filebeat/input/awss3/states.go b/x-pack/filebeat/input/awss3/states.go index 3289f2c984bf..3218b04ebbf0 100644 --- a/x-pack/filebeat/input/awss3/states.go +++ b/x-pack/filebeat/input/awss3/states.go @@ -8,9 +8,8 @@ import ( "strings" "sync" - "github.com/elastic/elastic-agent-libs/logp" - "github.com/elastic/beats/v7/libbeat/statestore" + "github.com/elastic/elastic-agent-libs/logp" ) const awsS3ObjectStatePrefix = "filebeat::aws-s3::state::" @@ -18,8 +17,6 @@ const awsS3ObjectStatePrefix = "filebeat::aws-s3::state::" // states handles list of s3 object state. One must use newStates to instantiate a // file states registry. Using the zero-value is not safe. type states struct { - log *logp.Logger - // Completed S3 object states, indexed by state ID. // statesLock must be held to access states. states map[string]state @@ -33,44 +30,9 @@ type states struct { // newStates generates a new states registry. func newStates(log *logp.Logger, store *statestore.Store) (*states, error) { - states := &states{ - log: log.Named("states"), - states: map[string]state{}, - store: store, - } - return states, states.loadFromRegistry() -} - -func (s *states) IsProcessed(state state) bool { - s.statesLock.Lock() - defer s.statesLock.Unlock() - // Our in-memory table only stores completed objects - _, ok := s.states[state.ID()] - return ok -} - -func (s *states) AddState(state state) { - - id := state.ID() - // Update in-memory copy - s.statesLock.Lock() - s.states[id] = state - s.statesLock.Unlock() - - // Persist to the registry - s.storeLock.Lock() - key := awsS3ObjectStatePrefix + id - if err := s.store.Set(key, state); err != nil { - s.log.Errorw("Failed to write states to the registry", "error", err) - } - s.storeLock.Unlock() -} - -func (s *states) loadFromRegistry() error { - states := map[string]state{} + stateTable := map[string]state{} - s.storeLock.Lock() - err := s.store.Each(func(key string, dec statestore.ValueDecoder) (bool, error) { + err := store.Each(func(key string, dec statestore.ValueDecoder) (bool, error) { if !strings.HasPrefix(key, awsS3ObjectStatePrefix) { return true, nil } @@ -79,7 +41,7 @@ func (s *states) loadFromRegistry() error { var st state if err := dec.Decode(&st); err != nil { // Skip this key but continue iteration - s.log.Warnf("invalid S3 state loading object key %v", key) + log.Warnf("invalid S3 state loading object key %v", key) //nolint:nilerr // One bad object shouldn't stop iteration return true, nil } @@ -91,17 +53,40 @@ func (s *states) loadFromRegistry() error { return true, nil } - states[st.ID()] = st + stateTable[st.ID()] = st return true, nil }) - s.storeLock.Unlock() if err != nil { - return err + return nil, err } + return &states{ + store: store, + states: stateTable, + }, nil +} + +func (s *states) IsProcessed(state state) bool { + s.statesLock.Lock() + defer s.statesLock.Unlock() + // Our in-memory table only stores completed objects + _, ok := s.states[state.ID()] + return ok +} + +func (s *states) AddState(state state) error { + id := state.ID() + // Update in-memory copy s.statesLock.Lock() - s.states = states + s.states[id] = state s.statesLock.Unlock() + // Persist to the registry + s.storeLock.Lock() + defer s.storeLock.Unlock() + key := awsS3ObjectStatePrefix + id + if err := s.store.Set(key, state); err != nil { + return err + } return nil } From e05c45d2b6cfcebd1f7dcb8cc37144a05fe26eab Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Tue, 30 Apr 2024 12:57:58 -0400 
Subject: [PATCH 43/99] reorganize {s3,sqs}.go by adding {s3,sqs}_input.go for the code specific to the input API --- x-pack/filebeat/input/awss3/config.go | 4 +- .../input/awss3/input_benchmark_test.go | 9 +- x-pack/filebeat/input/awss3/s3.go | 294 +----------------- x-pack/filebeat/input/awss3/s3_input.go | 247 +++++++++++++++ x-pack/filebeat/input/awss3/s3_test.go | 4 +- x-pack/filebeat/input/awss3/sqs.go | 157 +--------- x-pack/filebeat/input/awss3/sqs_input.go | 136 ++++++++ x-pack/filebeat/input/awss3/states.go | 10 +- x-pack/filebeat/input/awss3/states_test.go | 4 +- 9 files changed, 422 insertions(+), 443 deletions(-) create mode 100644 x-pack/filebeat/input/awss3/s3_input.go create mode 100644 x-pack/filebeat/input/awss3/sqs_input.go diff --git a/x-pack/filebeat/input/awss3/config.go b/x-pack/filebeat/input/awss3/config.go index 771d09f76ec6..37bb75fbb984 100644 --- a/x-pack/filebeat/input/awss3/config.go +++ b/x-pack/filebeat/input/awss3/config.go @@ -246,7 +246,9 @@ func (c config) getBucketARN() string { return "" } -func (c config) s3OptionsFn(o *s3.Options) { +// A callback to apply the configuration's settings to an S3 options struct. +// Should be provided to s3.NewFromConfig. +func (c config) s3ConfigModifier(o *s3.Options) { if c.NonAWSBucketName != "" { o.EndpointResolver = nonAWSBucketResolver{endpoint: c.AWSConfig.Endpoint} } diff --git a/x-pack/filebeat/input/awss3/input_benchmark_test.go b/x-pack/filebeat/input/awss3/input_benchmark_test.go index de17583a4194..859d396b3b04 100644 --- a/x-pack/filebeat/input/awss3/input_benchmark_test.go +++ b/x-pack/filebeat/input/awss3/input_benchmark_test.go @@ -329,9 +329,10 @@ func benchmarkInputS3(t *testing.T, numberOfWorkers int) testing.BenchmarkResult wg.Add(1) go func(i int, wg *sync.WaitGroup) { defer wg.Done() - listPrefix := fmt.Sprintf("list_prefix_%d", i) + curConfig := config + curConfig.BucketListPrefix = fmt.Sprintf("list_prefix_%d", i) s3API := newConstantS3(t) - s3API.pagerConstant = newS3PagerConstant(listPrefix) + s3API.pagerConstant = newS3PagerConstant(curConfig.BucketListPrefix) storeReg := statestore.NewRegistry(storetest.NewMemoryStoreBackend()) store, err := storeReg.Get("test") if err != nil { @@ -339,11 +340,11 @@ func benchmarkInputS3(t *testing.T, numberOfWorkers int) testing.BenchmarkResult return } - states, err := newStates(inputCtx.Logger, store) + states, err := newStates(store) assert.NoError(t, err, "states creation should succeed") s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), metrics, s3API, config.FileSelectors, backupConfig{}) - s3Poller := newS3Poller(logp.NewLogger(inputName), config, nil, metrics, s3API, client, s3EventHandlerFactory, states, "bucket", listPrefix, "region", "provider", time.Second) + s3Poller := newS3Poller(logp.NewLogger(inputName), config, nil, metrics, s3API, client, s3EventHandlerFactory, states, "bucket", "region", "provider", time.Second) s3Poller.Poll(ctx) }(i, wg) diff --git a/x-pack/filebeat/input/awss3/s3.go b/x-pack/filebeat/input/awss3/s3.go index 9b32e9ba725d..b2c2d50fa2af 100644 --- a/x-pack/filebeat/input/awss3/s3.go +++ b/x-pack/filebeat/input/awss3/s3.go @@ -6,119 +6,24 @@ package awss3 import ( "context" - "errors" "fmt" "net/url" "strings" - "sync" awssdk "github.com/aws/aws-sdk-go-v2/aws" - "github.com/aws/aws-sdk-go-v2/aws/ratelimit" "github.com/aws/aws-sdk-go-v2/service/s3" - - v2 "github.com/elastic/beats/v7/filebeat/input/v2" - "github.com/elastic/beats/v7/libbeat/beat" - 
"github.com/elastic/beats/v7/libbeat/common/backoff" - "github.com/elastic/beats/v7/libbeat/statestore" - awscommon "github.com/elastic/beats/v7/x-pack/libbeat/common/aws" - "github.com/elastic/elastic-agent-libs/logp" - "github.com/elastic/go-concert/timed" ) -// var instead of const so it can be reduced during unit tests (instead of waiting -// through 10 minutes of retry backoff) -var readerLoopMaxCircuitBreaker = 10 - -type s3ObjectPayload struct { - s3ObjectHandler s3ObjectHandler - objectState state -} - -type s3PollerInput struct { - config config - awsConfig awssdk.Config - store *statestore.Store -} - -type s3Poller struct { - log *logp.Logger - config config - awsConfig awssdk.Config - provider string - s3 s3API - metrics *inputMetrics - client beat.Client - s3ObjectHandler s3ObjectHandlerFactory - states *states -} - -func (in *s3PollerInput) Name() string { return inputName } - -func (in *s3PollerInput) Test(ctx v2.TestContext) error { - return nil -} - -func newS3PollerInput( - config config, - awsConfig awssdk.Config, - store *statestore.Store, -) (v2.Input, error) { - - return &s3PollerInput{ - config: config, - awsConfig: awsConfig, - store: store, - }, nil -} - -func (in *s3PollerInput) Run( - inputContext v2.Context, - pipeline beat.Pipeline, -) error { - ctx := v2.GoContextFromCanceler(inputContext.Cancelation) - - defer in.store.Close() - - states, err := newStates(inputContext.Logger, in.store) - if err != nil { - return fmt.Errorf("can not start persistent store: %w", err) - } - - // Create client for publishing events and receive notification of their ACKs. - client, err := pipeline.ConnectWith(beat.ClientConfig{ - EventListener: awscommon.NewEventACKHandler(), - Processing: beat.ProcessingConfig{ - // This input only produces events with basic types so normalization - // is not required. - EventNormalization: boolPtr(false), - }, - }) - if err != nil { - return fmt.Errorf("failed to create pipeline client: %w", err) - } - defer client.Close() - - // Create S3 receiver and S3 notification processor. - poller, err := in.createS3Poller(inputContext.Logger, inputContext.ID, ctx, client, states) - if err != nil { - return fmt.Errorf("failed to initialize s3 poller: %w", err) - } - defer poller.metrics.Close() - - poller.Poll(ctx) - return nil -} - -func (in *s3PollerInput) createS3API(ctx context.Context) (*awsS3API, error) { - s3Client := s3.NewFromConfig(in.awsConfig, in.config.s3OptionsFn) - regionName, err := getRegionForBucket(ctx, s3Client, in.config.getBucketName()) +func createS3API(ctx context.Context, config config, awsConfig awssdk.Config) (*awsS3API, error) { + s3Client := s3.NewFromConfig(awsConfig, config.s3ConfigModifier) + regionName, err := getRegionForBucket(ctx, s3Client, config.getBucketName()) if err != nil { return nil, fmt.Errorf("failed to get AWS region for bucket: %w", err) } // Can this really happen? 
- if regionName != in.awsConfig.Region { - in.awsConfig.Region = regionName - s3Client = s3.NewFromConfig(in.awsConfig, in.config.s3OptionsFn) + if regionName != awsConfig.Region { + awsConfig.Region = regionName + s3Client = s3.NewFromConfig(awsConfig, config.s3ConfigModifier) } return &awsS3API{ @@ -126,193 +31,6 @@ func (in *s3PollerInput) createS3API(ctx context.Context) (*awsS3API, error) { }, nil } -func (in *s3PollerInput) createS3Poller(log *logp.Logger, inputID string, cancelCtx context.Context, client beat.Client, states *states) (*s3Poller, error) { - s3API, err := in.createS3API(cancelCtx) - if err != nil { - return nil, err - } - - log = log.With("bucket", in.config.getBucketARN()) - log.Infof("number_of_workers is set to %v.", in.config.NumberOfWorkers) - log.Infof("bucket_list_interval is set to %v.", in.config.BucketListInterval) - log.Infof("bucket_list_prefix is set to %v.", in.config.BucketListPrefix) - log.Infof("AWS region is set to %v.", in.awsConfig.Region) - - metrics := newInputMetrics(inputID, nil, in.config.MaxNumberOfMessages) - s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), metrics, s3API, in.config.getFileSelectors(), in.config.BackupConfig) - s3Poller := newS3Poller(log.Named("s3_poller"), - in.config, in.awsConfig, - metrics, - s3API, - client, - s3EventHandlerFactory, - states, - getProviderFromDomain(in.config.AWSConfig.Endpoint, in.config.ProviderOverride)) - - return s3Poller, nil -} - -func newS3Poller(log *logp.Logger, - config config, - awsConfig awssdk.Config, - metrics *inputMetrics, - s3 s3API, - client beat.Client, - s3ObjectHandler s3ObjectHandlerFactory, - states *states, - provider string, -) *s3Poller { - if metrics == nil { - // Metrics are optional. Initialize a stub. - metrics = newInputMetrics("", nil, 0) - } - return &s3Poller{ - config: config, - awsConfig: awsConfig, - provider: provider, - s3: s3, - log: log, - metrics: metrics, - client: client, - s3ObjectHandler: s3ObjectHandler, - states: states, - } -} - -func (p *s3Poller) createS3ObjectProcessor(ctx context.Context, state state) s3ObjectHandler { - event := s3EventV2{} - event.AWSRegion = p.awsConfig.Region - event.Provider = p.provider - event.S3.Bucket.Name = state.Bucket - event.S3.Bucket.ARN = p.config.getBucketARN() - event.S3.Object.Key = state.Key - - acker := awscommon.NewEventACKTracker(ctx) - - return p.s3ObjectHandler.Create(ctx, p.log, p.client, acker, event) -} - -func (p *s3Poller) workerLoop(ctx context.Context, s3ObjectPayloadChan <-chan *s3ObjectPayload) { - rateLimitWaiter := backoff.NewEqualJitterBackoff(ctx.Done(), 1, 120) - - for s3ObjectPayload := range s3ObjectPayloadChan { - objHandler := s3ObjectPayload.s3ObjectHandler - state := s3ObjectPayload.objectState - - // Process S3 object (download, parse, create events). - err := objHandler.ProcessS3Object() - if errors.Is(err, errS3DownloadFailed) { - // Download errors are ephemeral. Add a backoff delay, then skip to the - // next iteration so we don't mark the object as permanently failed. - rateLimitWaiter.Wait() - continue - } - // Reset the rate limit delay on results that aren't download errors. - rateLimitWaiter.Reset() - - // Wait for downloaded objects to be ACKed. - objHandler.Wait() - - if err != nil { - p.log.Errorf("failed processing S3 event for object key %q in bucket %q: %v", - state.Key, state.Bucket, err.Error()) - - // Non-retryable error. 
- state.Failed = true - } else { - state.Stored = true - } - - // Persist the result, report any errors - err = p.states.AddState(state) - if err != nil { - p.log.Errorf("saving completed object state: %v", err.Error()) - } - - // Metrics - p.metrics.s3ObjectsAckedTotal.Inc() - } -} - -func (p *s3Poller) readerLoop(ctx context.Context, s3ObjectPayloadChan chan<- *s3ObjectPayload) { - defer close(s3ObjectPayloadChan) - - bucketName := getBucketNameFromARN(p.config.getBucketARN()) - - errorBackoff := backoff.NewEqualJitterBackoff(ctx.Done(), 1, 120) - circuitBreaker := 0 - paginator := p.s3.ListObjectsPaginator(bucketName, p.config.BucketListPrefix) - for paginator.HasMorePages() { - page, err := paginator.NextPage(ctx) - - if err != nil { - p.log.Warnw("Error when paginating listing.", "error", err) - // QuotaExceededError is client-side rate limiting in the AWS sdk, - // don't include it in the circuit breaker count - if !errors.As(err, &ratelimit.QuotaExceededError{}) { - circuitBreaker++ - if circuitBreaker >= readerLoopMaxCircuitBreaker { - p.log.Warnw(fmt.Sprintf("%d consecutive error when paginating listing, breaking the circuit.", circuitBreaker), "error", err) - break - } - } - // add a backoff delay and try again - errorBackoff.Wait() - continue - } - // Reset the circuit breaker and the error backoff if a read is successful - circuitBreaker = 0 - errorBackoff.Reset() - - totListedObjects := len(page.Contents) - - // Metrics - p.metrics.s3ObjectsListedTotal.Add(uint64(totListedObjects)) - for _, object := range page.Contents { - state := newState(bucketName, *object.Key, *object.ETag, *object.LastModified) - if p.states.IsProcessed(state) { - p.log.Debugw("skipping state.", "state", state) - continue - } - - s3Processor := p.createS3ObjectProcessor(ctx, state) - if s3Processor == nil { - p.log.Debugw("empty s3 processor.", "state", state) - continue - } - - s3ObjectPayloadChan <- &s3ObjectPayload{ - s3ObjectHandler: s3Processor, - objectState: state, - } - - p.metrics.s3ObjectsProcessedTotal.Inc() - } - } -} - -func (p *s3Poller) Poll(ctx context.Context) { - for ctx.Err() == nil { - var workerWg sync.WaitGroup - workChan := make(chan *s3ObjectPayload) - - // Start the worker goroutines to listen on the work channel - for i := 0; i < p.config.NumberOfWorkers; i++ { - workerWg.Add(1) - go func() { - defer workerWg.Done() - p.workerLoop(ctx, workChan) - }() - } - - // Start reading data and wait for its processing to be done - p.readerLoop(ctx, workChan) - workerWg.Wait() - - _ = timed.Wait(ctx, p.config.BucketListInterval) - } -} - func getRegionForBucket(ctx context.Context, s3Client *s3.Client, bucketName string) (string, error) { getBucketLocationOutput, err := s3Client.GetBucketLocation(ctx, &s3.GetBucketLocationInput{ Bucket: awssdk.String(bucketName), diff --git a/x-pack/filebeat/input/awss3/s3_input.go b/x-pack/filebeat/input/awss3/s3_input.go new file mode 100644 index 000000000000..2321d36e87a7 --- /dev/null +++ b/x-pack/filebeat/input/awss3/s3_input.go @@ -0,0 +1,247 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. 
+ +package awss3 + +import ( + "context" + "errors" + "fmt" + "sync" + + awssdk "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/aws/ratelimit" + + v2 "github.com/elastic/beats/v7/filebeat/input/v2" + "github.com/elastic/beats/v7/libbeat/beat" + "github.com/elastic/beats/v7/libbeat/common/backoff" + "github.com/elastic/beats/v7/libbeat/statestore" + awscommon "github.com/elastic/beats/v7/x-pack/libbeat/common/aws" + "github.com/elastic/elastic-agent-libs/logp" + "github.com/elastic/go-concert/timed" +) + +// var instead of const so it can be reduced during unit tests (instead of waiting +// through 10 minutes of retry backoff) +var readerLoopMaxCircuitBreaker = 10 + +type s3PollerInput struct { + log *logp.Logger + config config + awsConfig awssdk.Config + provider string + s3 s3API + metrics *inputMetrics + client beat.Client + s3ObjectHandler s3ObjectHandlerFactory + states *states +} + +// s3FetchTask contains metadata for one S3 object that a worker should fetch. +type s3FetchTask struct { + s3ObjectHandler s3ObjectHandler + objectState state +} + +func (in *s3PollerInput) Name() string { return inputName } + +func (in *s3PollerInput) Test(ctx v2.TestContext) error { + return nil +} + +func newS3PollerInput( + config config, + awsConfig awssdk.Config, + store *statestore.Store, +) (v2.Input, error) { + + states, err := newStates(store) + if err != nil { + return nil, fmt.Errorf("can not start persistent store: %w", err) + } + + return &s3PollerInput{ + config: config, + awsConfig: awsConfig, + states: states, + }, nil +} + +func (in *s3PollerInput) Run( + inputContext v2.Context, + pipeline beat.Pipeline, +) error { + var err error + + defer in.states.Close() + + // Create client for publishing events and receive notification of their ACKs. + in.client, err = pipeline.ConnectWith(beat.ClientConfig{ + EventListener: awscommon.NewEventACKHandler(), + Processing: beat.ProcessingConfig{ + // This input only produces events with basic types so normalization + // is not required. + EventNormalization: boolPtr(false), + }, + }) + if err != nil { + return fmt.Errorf("failed to create pipeline client: %w", err) + } + defer in.client.Close() + + ctx := v2.GoContextFromCanceler(inputContext.Cancelation) + in.s3, err = createS3API(ctx, in.config, in.awsConfig) + if err != nil { + return fmt.Errorf("failed to create S3 API: %w", err) + } + + in.metrics = newInputMetrics(inputContext.ID, nil, in.config.MaxNumberOfMessages) + defer in.metrics.Close() + + in.s3ObjectHandler = newS3ObjectProcessorFactory( + inputContext.Logger.Named("s3"), + in.metrics, + in.s3, + in.config.getFileSelectors(), + in.config.BackupConfig) + + // Scan the bucket in a loop, delaying by the configured interval each + // iteration.
+ for ctx.Err() == nil { + in.runScan(ctx) + _ = timed.Wait(ctx, in.config.BucketListInterval) + } + + return nil +} + +func (in *s3PollerInput) runScan(ctx context.Context) { + var workerWg sync.WaitGroup + workChan := make(chan *s3FetchTask) + + // Start the worker goroutines to listen on the work channel + for i := 0; i < in.config.NumberOfWorkers; i++ { + workerWg.Add(1) + go func() { + defer workerWg.Done() + in.workerLoop(ctx, workChan) + }() + } + + // Start reading data and wait for its processing to be done + in.readerLoop(ctx, workChan) + workerWg.Wait() +} + +func (in *s3PollerInput) createS3ObjectProcessor(ctx context.Context, state state) s3ObjectHandler { + event := s3EventV2{} + event.AWSRegion = in.awsConfig.Region + event.Provider = in.provider + event.S3.Bucket.Name = state.Bucket + event.S3.Bucket.ARN = in.config.getBucketARN() + event.S3.Object.Key = state.Key + + acker := awscommon.NewEventACKTracker(ctx) + + return in.s3ObjectHandler.Create(ctx, in.log, in.client, acker, event) +} + +func (in *s3PollerInput) workerLoop(ctx context.Context, workChan <-chan *s3FetchTask) { + rateLimitWaiter := backoff.NewEqualJitterBackoff(ctx.Done(), 1, 120) + + for s3ObjectPayload := range workChan { + objHandler := s3ObjectPayload.s3ObjectHandler + state := s3ObjectPayload.objectState + + // Process S3 object (download, parse, create events). + err := objHandler.ProcessS3Object() + if errors.Is(err, errS3DownloadFailed) { + // Download errors are ephemeral. Add a backoff delay, then skip to the + // next iteration so we don't mark the object as permanently failed. + rateLimitWaiter.Wait() + continue + } + // Reset the rate limit delay on results that aren't download errors. + rateLimitWaiter.Reset() + + // Wait for downloaded objects to be ACKed. + objHandler.Wait() + + if err != nil { + in.log.Errorf("failed processing S3 event for object key %q in bucket %q: %v", + state.Key, state.Bucket, err.Error()) + + // Non-retryable error. 
+ state.Failed = true + } else { + state.Stored = true + } + + // Persist the result, report any errors + err = in.states.AddState(state) + if err != nil { + in.log.Errorf("saving completed object state: %v", err.Error()) + } + + // Metrics + in.metrics.s3ObjectsAckedTotal.Inc() + } +} + +func (in *s3PollerInput) readerLoop(ctx context.Context, workChan chan<- *s3FetchTask) { + defer close(workChan) + + bucketName := getBucketNameFromARN(in.config.getBucketARN()) + + errorBackoff := backoff.NewEqualJitterBackoff(ctx.Done(), 1, 120) + circuitBreaker := 0 + paginator := in.s3.ListObjectsPaginator(bucketName, in.config.BucketListPrefix) + for paginator.HasMorePages() { + page, err := paginator.NextPage(ctx) + + if err != nil { + in.log.Warnw("Error when paginating listing.", "error", err) + // QuotaExceededError is client-side rate limiting in the AWS sdk, + // don't include it in the circuit breaker count + if !errors.As(err, &ratelimit.QuotaExceededError{}) { + circuitBreaker++ + if circuitBreaker >= readerLoopMaxCircuitBreaker { + in.log.Warnw(fmt.Sprintf("%d consecutive error when paginating listing, breaking the circuit.", circuitBreaker), "error", err) + break + } + } + // add a backoff delay and try again + errorBackoff.Wait() + continue + } + // Reset the circuit breaker and the error backoff if a read is successful + circuitBreaker = 0 + errorBackoff.Reset() + + totListedObjects := len(page.Contents) + + // Metrics + in.metrics.s3ObjectsListedTotal.Add(uint64(totListedObjects)) + for _, object := range page.Contents { + state := newState(bucketName, *object.Key, *object.ETag, *object.LastModified) + if in.states.IsProcessed(state) { + in.log.Debugw("skipping state.", "state", state) + continue + } + + s3Processor := in.createS3ObjectProcessor(ctx, state) + if s3Processor == nil { + in.log.Debugw("empty s3 processor.", "state", state) + continue + } + + workChan <- &s3FetchTask{ + s3ObjectHandler: s3Processor, + objectState: state, + } + + in.metrics.s3ObjectsProcessedTotal.Inc() + } + } +} diff --git a/x-pack/filebeat/input/awss3/s3_test.go b/x-pack/filebeat/input/awss3/s3_test.go index 893eec5cc7de..03572bfa1c39 100644 --- a/x-pack/filebeat/input/awss3/s3_test.go +++ b/x-pack/filebeat/input/awss3/s3_test.go @@ -133,7 +133,7 @@ func TestS3Poller(t *testing.T) { Return(nil, errFakeConnectivityFailure) s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockAPI, nil, backupConfig{}) - states, err := newStates(inputCtx.Logger, store) + states, err := newStates(store) require.NoError(t, err, "states creation must succeed") receiver := newS3Poller(logp.NewLogger(inputName), nil, mockAPI, mockPublisher, s3ObjProc, states, bucket, "key", "region", "provider", numberOfWorkers, pollInterval) receiver.Poll(ctx) @@ -260,7 +260,7 @@ func TestS3Poller(t *testing.T) { Return(nil, errFakeConnectivityFailure) s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockAPI, nil, backupConfig{}) - states, err := newStates(inputCtx.Logger, store) + states, err := newStates(store) require.NoError(t, err, "states creation must succeed") receiver := newS3Poller(logp.NewLogger(inputName), nil, mockAPI, mockPublisher, s3ObjProc, states, bucket, "key", "region", "provider", numberOfWorkers, pollInterval) receiver.Poll(ctx) diff --git a/x-pack/filebeat/input/awss3/sqs.go b/x-pack/filebeat/input/awss3/sqs.go index a673dcf5a7a6..59f5a89d1afd 100644 --- a/x-pack/filebeat/input/awss3/sqs.go +++ b/x-pack/filebeat/input/awss3/sqs.go @@ -11,18 +11,11 @@ import ( "net/url" 
"strconv" "strings" - "sync" "time" - awssdk "github.com/aws/aws-sdk-go-v2/aws" - "github.com/aws/aws-sdk-go-v2/service/s3" - "github.com/aws/aws-sdk-go-v2/service/sqs" "github.com/aws/aws-sdk-go-v2/service/sqs/types" "github.com/aws/smithy-go" - v2 "github.com/elastic/beats/v7/filebeat/input/v2" - "github.com/elastic/beats/v7/libbeat/beat" - "github.com/elastic/beats/v7/libbeat/common/atomic" "github.com/elastic/elastic-agent-libs/logp" ) @@ -32,123 +25,6 @@ const ( sqsApproximateNumberOfMessages = "ApproximateNumberOfMessages" ) -type sqsReaderInput struct { - config config - awsConfig awssdk.Config -} - -type sqsReader struct { - maxMessagesInflight int - activeMessages atomic.Int - sqs sqsAPI - msgHandler sqsProcessor - log *logp.Logger - metrics *inputMetrics - - // The main loop sends incoming messages to workChan, and the worker - // goroutines read from it. - workChan chan types.Message - - // workerWg is used to wait on worker goroutines during shutdown - workerWg sync.WaitGroup -} - -func newSQSReaderInput(config config, awsConfig awssdk.Config) (v2.Input, error) { - return &sqsReaderInput{ - config: config, - awsConfig: awsConfig, - }, nil -} - -func (in *sqsReaderInput) Name() string { return inputName } - -func (in *sqsReaderInput) Test(ctx v2.TestContext) error { - return nil -} - -func (in *sqsReaderInput) Run( - inputContext v2.Context, - pipeline beat.Pipeline, -) error { - ctx := v2.GoContextFromCanceler(inputContext.Cancelation) - configRegion := in.config.RegionName - urlRegion, err := getRegionFromQueueURL(in.config.QueueURL, in.config.AWSConfig.Endpoint) - if err != nil && configRegion == "" { - // Only report an error if we don't have a configured region - // to fall back on. - return fmt.Errorf("failed to get AWS region from queue_url: %w", err) - } else if configRegion != "" && configRegion != urlRegion { - inputContext.Logger.Warnf("configured region disagrees with queue_url region (%q != %q): using %q", configRegion, urlRegion, urlRegion) - } - - in.awsConfig.Region = urlRegion - - // Create SQS receiver and S3 notification processor. 
- receiver, err := in.createSQSReceiver(inputContext, pipeline) - if err != nil { - return fmt.Errorf("failed to initialize sqs receiver: %w", err) - } - defer receiver.metrics.Close() - - // Poll metrics periodically in the background - go pollSqsWaitingMetric(ctx, receiver) - - receiver.Receive(ctx) - return nil -} - -func (in *sqsReaderInput) createSQSReceiver(ctx v2.Context, pipeline beat.Pipeline) (*sqsReader, error) { - sqsAPI := &awsSQSAPI{ - client: sqs.NewFromConfig(in.awsConfig, func(o *sqs.Options) { - if in.config.AWSConfig.FIPSEnabled { - o.EndpointOptions.UseFIPSEndpoint = awssdk.FIPSEndpointStateEnabled - } - }), - queueURL: in.config.QueueURL, - apiTimeout: in.config.APITimeout, - visibilityTimeout: in.config.VisibilityTimeout, - longPollWaitTime: in.config.SQSWaitTime, - } - - s3API := &awsS3API{ - client: s3.NewFromConfig(in.awsConfig, func(o *s3.Options) { - if in.config.AWSConfig.FIPSEnabled { - o.EndpointOptions.UseFIPSEndpoint = awssdk.FIPSEndpointStateEnabled - } - o.UsePathStyle = in.config.PathStyle - }), - } - - log := ctx.Logger.With("queue_url", in.config.QueueURL) - log.Infof("AWS api_timeout is set to %v.", in.config.APITimeout) - log.Infof("AWS region is set to %v.", in.awsConfig.Region) - log.Infof("AWS SQS visibility_timeout is set to %v.", in.config.VisibilityTimeout) - log.Infof("AWS SQS max_number_of_messages is set to %v.", in.config.MaxNumberOfMessages) - - if in.config.BackupConfig.GetBucketName() != "" { - log.Warnf("You have the backup_to_bucket functionality activated with SQS. Please make sure to set appropriate destination buckets" + - "or prefixes to avoid an infinite loop.") - } - - fileSelectors := in.config.FileSelectors - if len(in.config.FileSelectors) == 0 { - fileSelectors = []fileSelectorConfig{{ReaderConfig: in.config.ReaderConfig}} - } - script, err := newScriptFromConfig(log.Named("sqs_script"), in.config.SQSScript) - if err != nil { - return nil, err - } - metrics := newInputMetrics(ctx.ID, nil, in.config.MaxNumberOfMessages) - - s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), metrics, s3API, fileSelectors, in.config.BackupConfig) - - sqsMessageHandler := newSQSS3EventProcessor(log.Named("sqs_s3_event"), metrics, sqsAPI, script, in.config.VisibilityTimeout, in.config.SQSMaxReceiveCount, pipeline, s3EventHandlerFactory) - - sqsReader := newSQSReader(log.Named("sqs"), metrics, sqsAPI, in.config.MaxNumberOfMessages, sqsMessageHandler) - - return sqsReader, nil -} - func newSQSReader(log *logp.Logger, metrics *inputMetrics, sqs sqsAPI, maxMessagesInflight int, msgHandler sqsProcessor) *sqsReader { if metrics == nil { // Metrics are optional. Initialize a stub. 
@@ -285,42 +161,37 @@ func getRegionFromQueueURL(queueURL, endpoint string) (string, error) { } func pollSqsWaitingMetric(ctx context.Context, receiver *sqsReader) { - // Run GetApproximateMessageCount before start of timer to set initial count for sqs waiting metric - // This is to avoid misleading values in metric when sqs messages are processed before the ticker channel kicks in - if shouldReturn := updateMessageCount(receiver, ctx); shouldReturn { - return - } - t := time.NewTicker(time.Minute) defer t.Stop() for { + if err := updateMessageCount(receiver, ctx); isSQSAuthError(err) { + // stop polling if auth error is encountered + // Set it back to -1 because there is a permission error + receiver.metrics.sqsMessagesWaiting.Set(int64(-1)) + return + } select { case <-ctx.Done(): return case <-t.C: - if shouldReturn := updateMessageCount(receiver, ctx); shouldReturn { - return - } } } } // updateMessageCount runs GetApproximateMessageCount for the given context and updates the receiver metric with the count returning false on no error // If there is an error, the metric is reinitialized to -1 and true is returned -func updateMessageCount(receiver *sqsReader, ctx context.Context) bool { +func updateMessageCount(receiver *sqsReader, ctx context.Context) error { count, err := receiver.GetApproximateMessageCount(ctx) + if err == nil { + receiver.metrics.sqsMessagesWaiting.Set(int64(count)) + } + return err +} +func isSQSAuthError(err error) bool { var apiError smithy.APIError if errors.As(err, &apiError) { - switch apiError.ErrorCode() { - case sqsAccessDeniedErrorCode: - // stop polling if auth error is encountered - // Set it back to -1 because there is a permission error - receiver.metrics.sqsMessagesWaiting.Set(int64(-1)) - return true - } + return apiError.ErrorCode() == sqsAccessDeniedErrorCode } - - receiver.metrics.sqsMessagesWaiting.Set(int64(count)) return false } diff --git a/x-pack/filebeat/input/awss3/sqs_input.go b/x-pack/filebeat/input/awss3/sqs_input.go new file mode 100644 index 000000000000..3bb986d0553c --- /dev/null +++ b/x-pack/filebeat/input/awss3/sqs_input.go @@ -0,0 +1,136 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. + +package awss3 + +import ( + "fmt" + "sync" + + awssdk "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/service/s3" + "github.com/aws/aws-sdk-go-v2/service/sqs" + "github.com/aws/aws-sdk-go-v2/service/sqs/types" + + v2 "github.com/elastic/beats/v7/filebeat/input/v2" + "github.com/elastic/beats/v7/libbeat/beat" + "github.com/elastic/beats/v7/libbeat/common/atomic" + "github.com/elastic/elastic-agent-libs/logp" +) + +type sqsReaderInput struct { + config config + awsConfig awssdk.Config +} + +type sqsReader struct { + maxMessagesInflight int + activeMessages atomic.Int + sqs sqsAPI + msgHandler sqsProcessor + log *logp.Logger + metrics *inputMetrics + + // The main loop sends incoming messages to workChan, and the worker + // goroutines read from it. 
+ workChan chan types.Message + + // workerWg is used to wait on worker goroutines during shutdown + workerWg sync.WaitGroup +} + +func newSQSReaderInput(config config, awsConfig awssdk.Config) (v2.Input, error) { + return &sqsReaderInput{ + config: config, + awsConfig: awsConfig, + }, nil +} + +func (in *sqsReaderInput) Name() string { return inputName } + +func (in *sqsReaderInput) Test(ctx v2.TestContext) error { + return nil +} + +func (in *sqsReaderInput) Run( + inputContext v2.Context, + pipeline beat.Pipeline, +) error { + ctx := v2.GoContextFromCanceler(inputContext.Cancelation) + configRegion := in.config.RegionName + urlRegion, err := getRegionFromQueueURL(in.config.QueueURL, in.config.AWSConfig.Endpoint) + if err != nil && configRegion == "" { + // Only report an error if we don't have a configured region + // to fall back on. + return fmt.Errorf("failed to get AWS region from queue_url: %w", err) + } else if configRegion != "" && configRegion != urlRegion { + inputContext.Logger.Warnf("configured region disagrees with queue_url region (%q != %q): using %q", configRegion, urlRegion, urlRegion) + } + + in.awsConfig.Region = urlRegion + + // Create SQS receiver and S3 notification processor. + receiver, err := in.createSQSReceiver(inputContext, pipeline) + if err != nil { + return fmt.Errorf("failed to initialize sqs receiver: %w", err) + } + defer receiver.metrics.Close() + + // Poll metrics periodically in the background + go pollSqsWaitingMetric(ctx, receiver) + + receiver.Receive(ctx) + return nil +} +func (in *sqsReaderInput) createSQSReceiver(ctx v2.Context, pipeline beat.Pipeline) (*sqsReader, error) { + sqsAPI := &awsSQSAPI{ + client: sqs.NewFromConfig(in.awsConfig, func(o *sqs.Options) { + if in.config.AWSConfig.FIPSEnabled { + o.EndpointOptions.UseFIPSEndpoint = awssdk.FIPSEndpointStateEnabled + } + }), + queueURL: in.config.QueueURL, + apiTimeout: in.config.APITimeout, + visibilityTimeout: in.config.VisibilityTimeout, + longPollWaitTime: in.config.SQSWaitTime, + } + + s3API := &awsS3API{ + client: s3.NewFromConfig(in.awsConfig, func(o *s3.Options) { + if in.config.AWSConfig.FIPSEnabled { + o.EndpointOptions.UseFIPSEndpoint = awssdk.FIPSEndpointStateEnabled + } + o.UsePathStyle = in.config.PathStyle + }), + } + + log := ctx.Logger.With("queue_url", in.config.QueueURL) + log.Infof("AWS api_timeout is set to %v.", in.config.APITimeout) + log.Infof("AWS region is set to %v.", in.awsConfig.Region) + log.Infof("AWS SQS visibility_timeout is set to %v.", in.config.VisibilityTimeout) + log.Infof("AWS SQS max_number_of_messages is set to %v.", in.config.MaxNumberOfMessages) + + if in.config.BackupConfig.GetBucketName() != "" { + log.Warnf("You have the backup_to_bucket functionality activated with SQS. 
Please make sure to set appropriate destination buckets " + + "or prefixes to avoid an infinite loop.") + } + + fileSelectors := in.config.FileSelectors + if len(in.config.FileSelectors) == 0 { + fileSelectors = []fileSelectorConfig{{ReaderConfig: in.config.ReaderConfig}} + } + script, err := newScriptFromConfig(log.Named("sqs_script"), in.config.SQSScript) + if err != nil { + return nil, err + } + metrics := newInputMetrics(ctx.ID, nil, in.config.MaxNumberOfMessages) + + s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), metrics, s3API, fileSelectors, in.config.BackupConfig) + + sqsMessageHandler := newSQSS3EventProcessor(log.Named("sqs_s3_event"), metrics, sqsAPI, script, in.config.VisibilityTimeout, in.config.SQSMaxReceiveCount, pipeline, s3EventHandlerFactory) + + sqsReader := newSQSReader(log.Named("sqs"), metrics, sqsAPI, in.config.MaxNumberOfMessages, sqsMessageHandler) + + return sqsReader, nil +} diff --git a/x-pack/filebeat/input/awss3/states.go b/x-pack/filebeat/input/awss3/states.go index 3218b04ebbf0..fe7a78a28bdc 100644 --- a/x-pack/filebeat/input/awss3/states.go +++ b/x-pack/filebeat/input/awss3/states.go @@ -9,7 +9,6 @@ import ( "sync" "github.com/elastic/beats/v7/libbeat/statestore" - "github.com/elastic/elastic-agent-libs/logp" ) const awsS3ObjectStatePrefix = "filebeat::aws-s3::state::" @@ -29,7 +28,7 @@ type states struct { } // newStates generates a new states registry. -func newStates(log *logp.Logger, store *statestore.Store) (*states, error) { +func newStates(store *statestore.Store) (*states, error) { stateTable := map[string]state{} err := store.Each(func(key string, dec statestore.ValueDecoder) (bool, error) { @@ -41,7 +40,6 @@ func newStates(log *logp.Logger, store *statestore.Store) (*states, error) { var st state if err := dec.Decode(&st); err != nil { // Skip this key but continue iteration - log.Warnf("invalid S3 state loading object key %v", key) //nolint:nilerr // One bad object shouldn't stop iteration return true, nil } @@ -90,3 +88,9 @@ func (s *states) AddState(state state) error { } return nil } + +func (s *states) Close() { + s.storeLock.Lock() + s.store.Close() + s.storeLock.Unlock() +} diff --git a/x-pack/filebeat/input/awss3/states_test.go b/x-pack/filebeat/input/awss3/states_test.go index eea943248ffd..082e5819f5ce 100644 --- a/x-pack/filebeat/input/awss3/states_test.go +++ b/x-pack/filebeat/input/awss3/states_test.go @@ -121,13 +121,13 @@ func TestStatesAddStateAndIsProcessed(t *testing.T) { if err != nil { t.Fatalf("unexpected err: %v", err) } - states, err := newStates(inputCtx.Logger, persistentStore) + states, err := newStates(persistentStore) require.NoError(t, err, "states creation must succeed") if test.statesEdit != nil { test.statesEdit(states) } if test.shouldReload { - states, err = newStates(inputCtx.Logger, persistentStore) + states, err = newStates(persistentStore) require.NoError(t, err, "states creation must succeed") } From 54f0a87dcd1b599b15b96eab32939942f60bb733 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Tue, 30 Apr 2024 13:33:31 -0400 Subject: [PATCH 44/99] clean up sqs helpers --- .../input/awss3/input_benchmark_test.go | 13 ++++++-- x-pack/filebeat/input/awss3/s3_input.go | 31 ++++++++++++------- x-pack/filebeat/input/awss3/sqs.go | 16 +++++----- x-pack/filebeat/input/awss3/sqs_input.go | 2 +- x-pack/filebeat/input/awss3/sqs_test.go | 4 +-- 5 files changed, 42 insertions(+), 24 deletions(-) diff --git a/x-pack/filebeat/input/awss3/input_benchmark_test.go
b/x-pack/filebeat/input/awss3/input_benchmark_test.go index 859d396b3b04..c1a84802a3cc 100644 --- a/x-pack/filebeat/input/awss3/input_benchmark_test.go +++ b/x-pack/filebeat/input/awss3/input_benchmark_test.go @@ -344,9 +344,18 @@ func benchmarkInputS3(t *testing.T, numberOfWorkers int) testing.BenchmarkResult assert.NoError(t, err, "states creation should succeed") s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), metrics, s3API, config.FileSelectors, backupConfig{}) - s3Poller := newS3Poller(logp.NewLogger(inputName), config, nil, metrics, s3API, client, s3EventHandlerFactory, states, "bucket", "region", "provider", time.Second) + s3Poller := &s3PollerInput{ + log: logp.NewLogger(inputName), + config: config, + metrics: metrics, + s3: s3API, + client: client, + s3ObjectHandler: s3EventHandlerFactory, + states: states, + provider: "provider", + } - s3Poller.Poll(ctx) + s3Poller.scanLoop(ctx) }(i, wg) } diff --git a/x-pack/filebeat/input/awss3/s3_input.go b/x-pack/filebeat/input/awss3/s3_input.go index 2321d36e87a7..9c12e67b145a 100644 --- a/x-pack/filebeat/input/awss3/s3_input.go +++ b/x-pack/filebeat/input/awss3/s3_input.go @@ -68,23 +68,28 @@ func newS3PollerInput( }, nil } +func createClient(pipeline beat.Pipeline) (beat.Client, error) { + return pipeline.ConnectWith(beat.ClientConfig{ + EventListener: awscommon.NewEventACKHandler(), + Processing: beat.ProcessingConfig{ + // This input only produces events with basic types so normalization + // is not required. + EventNormalization: boolPtr(false), + }, + }) +} + func (in *s3PollerInput) Run( inputContext v2.Context, pipeline beat.Pipeline, ) error { + log := inputContext.Logger.Named("s3") var err error defer in.states.Close() // Create client for publishing events and receive notification of their ACKs. - in.client, err = pipeline.ConnectWith(beat.ClientConfig{ - EventListener: awscommon.NewEventACKHandler(), - Processing: beat.ProcessingConfig{ - // This input only produces events with basic types so normalization - // is not required. - EventNormalization: boolPtr(false), - }, - }) + in.client, err = createClient(pipeline) if err != nil { return fmt.Errorf("failed to create pipeline client: %w", err) } @@ -100,7 +105,7 @@ func (in *s3PollerInput) Run( defer in.metrics.Close() in.s3ObjectHandler = newS3ObjectProcessorFactory( - inputContext.Logger.Named("s3"), + log, in.metrics, in.s3, in.config.getFileSelectors(), @@ -108,12 +113,16 @@ func (in *s3PollerInput) Run( // Scan the bucket in a loop, delaying by the configured interval each // iteration. 
+ in.scanLoop(ctx) + + return nil +} + +func (in *s3PollerInput) scanLoop(ctx context.Context) { for ctx.Err() == nil { in.runScan(ctx) _ = timed.Wait(ctx, in.config.BucketListInterval) } - - return nil } func (in *s3PollerInput) runScan(ctx context.Context) { diff --git a/x-pack/filebeat/input/awss3/sqs.go b/x-pack/filebeat/input/awss3/sqs.go index 59f5a89d1afd..f27a3c6fa2c0 100644 --- a/x-pack/filebeat/input/awss3/sqs.go +++ b/x-pack/filebeat/input/awss3/sqs.go @@ -118,8 +118,8 @@ func (r *sqsReader) startWorkers(ctx context.Context) { } } -func (r *sqsReader) GetApproximateMessageCount(ctx context.Context) (int, error) { - attributes, err := r.sqs.GetQueueAttributes(ctx, []types.QueueAttributeName{sqsApproximateNumberOfMessages}) +func getApproximateMessageCount(ctx context.Context, sqs sqsAPI) (int, error) { + attributes, err := sqs.GetQueueAttributes(ctx, []types.QueueAttributeName{sqsApproximateNumberOfMessages}) if err == nil { if c, found := attributes[sqsApproximateNumberOfMessages]; found { if messagesCount, err := strconv.Atoi(c); err == nil { @@ -160,14 +160,14 @@ func getRegionFromQueueURL(queueURL, endpoint string) (string, error) { return "", errBadQueueURL } -func pollSqsWaitingMetric(ctx context.Context, receiver *sqsReader) { +func pollSqsWaitingMetric(ctx context.Context, sqs sqsAPI, metrics *inputMetrics) { t := time.NewTicker(time.Minute) defer t.Stop() for { - if err := updateMessageCount(receiver, ctx); isSQSAuthError(err) { + if err := updateMessageCount(ctx, sqs, metrics); isSQSAuthError(err) { // stop polling if auth error is encountered // Set it back to -1 because there is a permission error - receiver.metrics.sqsMessagesWaiting.Set(int64(-1)) + metrics.sqsMessagesWaiting.Set(int64(-1)) return } select { @@ -180,10 +180,10 @@ func pollSqsWaitingMetric(ctx context.Context, receiver *sqsReader) { // updateMessageCount runs GetApproximateMessageCount for the given context and updates the receiver metric with the count returning false on no error // If there is an error, the metric is reinitialized to -1 and true is returned -func updateMessageCount(receiver *sqsReader, ctx context.Context) error { - count, err := receiver.GetApproximateMessageCount(ctx) +func updateMessageCount(ctx context.Context, sqs sqsAPI, metrics *inputMetrics) error { + count, err := getApproximateMessageCount(ctx, sqs) if err == nil { - receiver.metrics.sqsMessagesWaiting.Set(int64(count)) + metrics.sqsMessagesWaiting.Set(int64(count)) } return err } diff --git a/x-pack/filebeat/input/awss3/sqs_input.go b/x-pack/filebeat/input/awss3/sqs_input.go index 3bb986d0553c..8a5a1b6fa3c6 100644 --- a/x-pack/filebeat/input/awss3/sqs_input.go +++ b/x-pack/filebeat/input/awss3/sqs_input.go @@ -78,7 +78,7 @@ func (in *sqsReaderInput) Run( defer receiver.metrics.Close() // Poll metrics periodically in the background - go pollSqsWaitingMetric(ctx, receiver) + go pollSqsWaitingMetric(ctx, receiver.sqs, receiver.metrics) receiver.Receive(ctx) return nil diff --git a/x-pack/filebeat/input/awss3/sqs_test.go b/x-pack/filebeat/input/awss3/sqs_test.go index 2ab261173d16..463e358d0f59 100644 --- a/x-pack/filebeat/input/awss3/sqs_test.go +++ b/x-pack/filebeat/input/awss3/sqs_test.go @@ -135,7 +135,7 @@ func TestGetApproximateMessageCount(t *testing.T) { ) receiver := newSQSReader(logp.NewLogger(inputName), nil, mockAPI, maxMessages, mockMsgHandler) - receivedCount, err := receiver.GetApproximateMessageCount(ctx) + receivedCount, err := getApproximateMessageCount(ctx, receiver.sqs) assert.Equal(t, count, 
receivedCount) assert.Nil(t, err) }) @@ -160,7 +160,7 @@ func TestGetApproximateMessageCount(t *testing.T) { ) receiver := newSQSReader(logp.NewLogger(inputName), nil, mockAPI, maxMessages, mockMsgHandler) - receivedCount, err := receiver.GetApproximateMessageCount(ctx) + receivedCount, err := getApproximateMessageCount(ctx, receiver.sqs) assert.Equal(t, -1, receivedCount) assert.NotNil(t, err) }) From d3964577d2eb92bec8c92fd6725bab63703cd3c6 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Tue, 30 Apr 2024 13:35:50 -0400 Subject: [PATCH 45/99] fix merge --- x-pack/filebeat/input/awss3/input.go | 380 --------------------------- 1 file changed, 380 deletions(-) diff --git a/x-pack/filebeat/input/awss3/input.go b/x-pack/filebeat/input/awss3/input.go index 9eca05b1f0a7..fb5f23881452 100644 --- a/x-pack/filebeat/input/awss3/input.go +++ b/x-pack/filebeat/input/awss3/input.go @@ -63,7 +63,6 @@ func (im *s3InputManager) Create(cfg *conf.C) (v2.Input, error) { return newSQSReaderInput(config, awsConfig) } -<<<<<<< HEAD if config.BucketARN != "" || config.NonAWSBucketName != "" { persistentStore, err := im.store.Access() if err != nil { @@ -73,385 +72,6 @@ func (im *s3InputManager) Create(cfg *conf.C) (v2.Input, error) { } return nil, fmt.Errorf("configuration has no SQS queue URL and no S3 bucket ARN") -======= - return &s3Input{ - config: config, - awsConfig: awsConfig, - store: store, - }, nil -} - -func (in *s3Input) Name() string { return inputName } - -func (in *s3Input) Test(ctx v2.TestContext) error { - return nil -} - -func (in *s3Input) Run(inputContext v2.Context, pipeline beat.Pipeline) error { - ctx := v2.GoContextFromCanceler(inputContext.Cancelation) - - if in.config.QueueURL != "" { - return in.runQueueReader(ctx, inputContext, pipeline) - } - - if in.config.BucketARN != "" || in.config.NonAWSBucketName != "" { - return in.runS3Poller(ctx, inputContext, pipeline) - } - - return nil -} - -func (in *s3Input) runQueueReader( - ctx context.Context, - inputContext v2.Context, - pipeline beat.Pipeline, -) error { - configRegion := in.config.RegionName - urlRegion, err := getRegionFromQueueURL(in.config.QueueURL, in.config.AWSConfig.Endpoint) - if err != nil && configRegion == "" { - // Only report an error if we don't have a configured region - // to fall back on. - return fmt.Errorf("failed to get AWS region from queue_url: %w", err) - } else if configRegion != "" && configRegion != urlRegion { - inputContext.Logger.Warnf("configured region disagrees with queue_url region (%q != %q): using %q", configRegion, urlRegion, urlRegion) - } - - in.awsConfig.Region = urlRegion - - // Create SQS receiver and S3 notification processor. - receiver, err := in.createSQSReceiver(inputContext, pipeline) - if err != nil { - return fmt.Errorf("failed to initialize sqs receiver: %w", err) - } - defer receiver.metrics.Close() - - // Poll metrics periodically in the background - go pollSqsWaitingMetric(ctx, receiver) - - return receiver.Receive(ctx) -} - -func (in *s3Input) runS3Poller( - ctx context.Context, - inputContext v2.Context, - pipeline beat.Pipeline, -) error { - // Create client for publishing events and receive notification of their ACKs. - client, err := pipeline.ConnectWith(beat.ClientConfig{ - EventListener: awscommon.NewEventACKHandler(), - Processing: beat.ProcessingConfig{ - // This input only produces events with basic types so normalization - // is not required. 
- EventNormalization: boolPtr(false), - }, - }) - if err != nil { - return fmt.Errorf("failed to create pipeline client: %w", err) - } - defer client.Close() - - // Connect to the registry and create our states lookup - persistentStore, err := in.store.Access() - if err != nil { - return fmt.Errorf("can not access persistent store: %w", err) - } - defer persistentStore.Close() - - states, err := newStates(inputContext, persistentStore) - if err != nil { - return fmt.Errorf("can not start persistent store: %w", err) - } - - // Create S3 receiver and S3 notification processor. - poller, err := in.createS3Poller(inputContext, ctx, client, states) - if err != nil { - return fmt.Errorf("failed to initialize s3 poller: %w", err) - } - defer poller.metrics.Close() - - return poller.Poll(ctx) -} - -func (in *s3Input) createSQSReceiver(ctx v2.Context, pipeline beat.Pipeline) (*sqsReader, error) { - sqsAPI := &awsSQSAPI{ - client: sqs.NewFromConfig(in.awsConfig, func(o *sqs.Options) { - if in.config.AWSConfig.FIPSEnabled { - o.EndpointOptions.UseFIPSEndpoint = awssdk.FIPSEndpointStateEnabled - } - }), - queueURL: in.config.QueueURL, - apiTimeout: in.config.APITimeout, - visibilityTimeout: in.config.VisibilityTimeout, - longPollWaitTime: in.config.SQSWaitTime, - } - - s3API := &awsS3API{ - client: s3.NewFromConfig(in.awsConfig, func(o *s3.Options) { - if in.config.AWSConfig.FIPSEnabled { - o.EndpointOptions.UseFIPSEndpoint = awssdk.FIPSEndpointStateEnabled - } - o.UsePathStyle = in.config.PathStyle - }), - } - - log := ctx.Logger.With("queue_url", in.config.QueueURL) - log.Infof("AWS api_timeout is set to %v.", in.config.APITimeout) - log.Infof("AWS region is set to %v.", in.awsConfig.Region) - log.Infof("AWS SQS visibility_timeout is set to %v.", in.config.VisibilityTimeout) - log.Infof("AWS SQS max_number_of_messages is set to %v.", in.config.MaxNumberOfMessages) - - if in.config.BackupConfig.GetBucketName() != "" { - log.Warnf("You have the backup_to_bucket functionality activated with SQS. 
Please make sure to set appropriate destination buckets" + - "or prefixes to avoid an infinite loop.") - } - - fileSelectors := in.config.FileSelectors - if len(in.config.FileSelectors) == 0 { - fileSelectors = []fileSelectorConfig{{ReaderConfig: in.config.ReaderConfig}} - } - script, err := newScriptFromConfig(log.Named("sqs_script"), in.config.SQSScript) - if err != nil { - return nil, err - } - in.metrics = newInputMetrics(ctx.ID, nil, in.config.MaxNumberOfMessages) - - s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), in.metrics, s3API, fileSelectors, in.config.BackupConfig, in.config.MaxNumberOfMessages) - - sqsMessageHandler := newSQSS3EventProcessor(log.Named("sqs_s3_event"), in.metrics, sqsAPI, script, in.config.VisibilityTimeout, in.config.SQSMaxReceiveCount, pipeline, s3EventHandlerFactory, in.config.MaxNumberOfMessages) - - sqsReader := newSQSReader(log.Named("sqs"), in.metrics, sqsAPI, in.config.MaxNumberOfMessages, sqsMessageHandler) - - return sqsReader, nil -} - -type nonAWSBucketResolver struct { - endpoint string -} - -func (n nonAWSBucketResolver) ResolveEndpoint(region string, options s3.EndpointResolverOptions) (awssdk.Endpoint, error) { - return awssdk.Endpoint{URL: n.endpoint, SigningRegion: region, HostnameImmutable: true, Source: awssdk.EndpointSourceCustom}, nil -} - -func (in *s3Input) createS3Poller(ctx v2.Context, cancelCtx context.Context, client beat.Client, states *states) (*s3Poller, error) { - var bucketName string - var bucketID string - if in.config.NonAWSBucketName != "" { - bucketName = in.config.NonAWSBucketName - bucketID = bucketName - } else if in.config.BucketARN != "" { - bucketName = getBucketNameFromARN(in.config.BucketARN) - bucketID = in.config.BucketARN - } - - s3Client := s3.NewFromConfig(in.awsConfig, func(o *s3.Options) { - if in.config.NonAWSBucketName != "" { - o.EndpointResolver = nonAWSBucketResolver{endpoint: in.config.AWSConfig.Endpoint} - } - - if in.config.AWSConfig.FIPSEnabled { - o.EndpointOptions.UseFIPSEndpoint = awssdk.FIPSEndpointStateEnabled - } - o.UsePathStyle = in.config.PathStyle - - o.Retryer = retry.NewStandard(func(so *retry.StandardOptions) { - so.MaxAttempts = 5 - // Recover quickly when requests start working again - so.NoRetryIncrement = 100 - }) - }) - regionName, err := getRegionForBucket(cancelCtx, s3Client, bucketName) - if err != nil { - return nil, fmt.Errorf("failed to get AWS region for bucket: %w", err) - } - - originalAwsConfigRegion := in.awsConfig.Region - - in.awsConfig.Region = regionName - - if regionName != originalAwsConfigRegion { - s3Client = s3.NewFromConfig(in.awsConfig, func(o *s3.Options) { - if in.config.NonAWSBucketName != "" { - o.EndpointResolver = nonAWSBucketResolver{endpoint: in.config.AWSConfig.Endpoint} - } - - if in.config.AWSConfig.FIPSEnabled { - o.EndpointOptions.UseFIPSEndpoint = awssdk.FIPSEndpointStateEnabled - } - o.UsePathStyle = in.config.PathStyle - }) - } - - s3API := &awsS3API{ - client: s3Client, - } - - log := ctx.Logger.With("bucket", bucketID) - log.Infof("number_of_workers is set to %v.", in.config.NumberOfWorkers) - log.Infof("bucket_list_interval is set to %v.", in.config.BucketListInterval) - log.Infof("bucket_list_prefix is set to %v.", in.config.BucketListPrefix) - log.Infof("AWS region is set to %v.", in.awsConfig.Region) - - fileSelectors := in.config.FileSelectors - if len(in.config.FileSelectors) == 0 { - fileSelectors = []fileSelectorConfig{{ReaderConfig: in.config.ReaderConfig}} - } - in.metrics = newInputMetrics(ctx.ID, nil, 
in.config.MaxNumberOfMessages) - s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), in.metrics, s3API, fileSelectors, in.config.BackupConfig, in.config.MaxNumberOfMessages) - s3Poller := newS3Poller(log.Named("s3_poller"), - in.metrics, - s3API, - client, - s3EventHandlerFactory, - states, - bucketID, - in.config.BucketListPrefix, - in.awsConfig.Region, - getProviderFromDomain(in.config.AWSConfig.Endpoint, in.config.ProviderOverride), - in.config.NumberOfWorkers, - in.config.BucketListInterval) - - return s3Poller, nil -} - -var errBadQueueURL = errors.New("QueueURL is not in format: https://sqs.{REGION_ENDPOINT}.{ENDPOINT}/{ACCOUNT_NUMBER}/{QUEUE_NAME} or https://{VPC_ENDPOINT}.sqs.{REGION_ENDPOINT}.vpce.{ENDPOINT}/{ACCOUNT_NUMBER}/{QUEUE_NAME}") - -func getRegionFromQueueURL(queueURL, endpoint string) (string, error) { - // get region from queueURL - // Example for sqs queue: https://sqs.us-east-1.amazonaws.com/12345678912/test-s3-logs - // Example for vpce: https://vpce-test.sqs.us-east-1.vpce.amazonaws.com/12345678912/sqs-queue - u, err := url.Parse(queueURL) - if err != nil { - return "", fmt.Errorf(queueURL + " is not a valid URL") - } - if (u.Scheme == "https" || u.Scheme == "http") && u.Host != "" { - queueHostSplit := strings.SplitN(u.Host, ".", 3) - // check for sqs queue url - if len(queueHostSplit) == 3 && queueHostSplit[0] == "sqs" { - if queueHostSplit[2] == endpoint || (endpoint == "" && strings.HasPrefix(queueHostSplit[2], "amazonaws.")) { - return queueHostSplit[1], nil - } - } - - // check for vpce url - queueHostSplitVPC := strings.SplitN(u.Host, ".", 5) - if len(queueHostSplitVPC) == 5 && queueHostSplitVPC[1] == "sqs" { - if queueHostSplitVPC[4] == endpoint || (endpoint == "" && strings.HasPrefix(queueHostSplitVPC[4], "amazonaws.")) { - return queueHostSplitVPC[2], nil - } - } - } - return "", errBadQueueURL -} - -func getRegionForBucket(ctx context.Context, s3Client *s3.Client, bucketName string) (string, error) { - getBucketLocationOutput, err := s3Client.GetBucketLocation(ctx, &s3.GetBucketLocationInput{ - Bucket: awssdk.String(bucketName), - }) - - if err != nil { - return "", err - } - - // Region us-east-1 have a LocationConstraint of null. 
- if len(getBucketLocationOutput.LocationConstraint) == 0 { - return "us-east-1", nil - } - - return string(getBucketLocationOutput.LocationConstraint), nil -} - -func getBucketNameFromARN(bucketARN string) string { - bucketMetadata := strings.Split(bucketARN, ":") - bucketName := bucketMetadata[len(bucketMetadata)-1] - return bucketName -} - -func getProviderFromDomain(endpoint string, ProviderOverride string) string { - if ProviderOverride != "" { - return ProviderOverride - } - if endpoint == "" { - return "aws" - } - // List of popular S3 SaaS providers - providers := map[string]string{ - "amazonaws.com": "aws", - "c2s.sgov.gov": "aws", - "c2s.ic.gov": "aws", - "amazonaws.com.cn": "aws", - "backblazeb2.com": "backblaze", - "cloudflarestorage.com": "cloudflare", - "wasabisys.com": "wasabi", - "digitaloceanspaces.com": "digitalocean", - "dream.io": "dreamhost", - "scw.cloud": "scaleway", - "googleapis.com": "gcp", - "cloud.it": "arubacloud", - "linodeobjects.com": "linode", - "vultrobjects.com": "vultr", - "appdomain.cloud": "ibm", - "aliyuncs.com": "alibaba", - "oraclecloud.com": "oracle", - "exo.io": "exoscale", - "upcloudobjects.com": "upcloud", - "ilandcloud.com": "iland", - "zadarazios.com": "zadara", - } - - parsedEndpoint, _ := url.Parse(endpoint) - for key, provider := range providers { - // support endpoint with and without scheme (http(s)://abc.xyz, abc.xyz) - constraint := parsedEndpoint.Hostname() - if len(parsedEndpoint.Scheme) == 0 { - constraint = parsedEndpoint.Path - } - if strings.HasSuffix(constraint, key) { - return provider - } - } - return "unknown" -} - -func pollSqsWaitingMetric(ctx context.Context, receiver *sqsReader) { - // Run GetApproximateMessageCount before start of timer to set initial count for sqs waiting metric - // This is to avoid misleading values in metric when sqs messages are processed before the ticker channel kicks in - if shouldReturn := updateMessageCount(receiver, ctx); shouldReturn { - return - } - - t := time.NewTicker(time.Minute) - defer t.Stop() - for { - select { - case <-ctx.Done(): - return - case <-t.C: - if shouldReturn := updateMessageCount(receiver, ctx); shouldReturn { - return - } - } - } -} - -// updateMessageCount runs GetApproximateMessageCount for the given context and updates the receiver metric with the count returning false on no error -// If there is an error, the metric is reinitialized to -1 and true is returned -func updateMessageCount(receiver *sqsReader, ctx context.Context) bool { - count, err := receiver.GetApproximateMessageCount(ctx) - - var apiError smithy.APIError - if errors.As(err, &apiError) { - switch apiError.ErrorCode() { - case sqsAccessDeniedErrorCode: - // stop polling if auth error is encountered - // Set it back to -1 because there is a permission error - receiver.metrics.sqsMessagesWaiting.Set(int64(-1)) - return true - } - } - - receiver.metrics.sqsMessagesWaiting.Set(int64(count)) - return false ->>>>>>> main } // boolPtr returns a pointer to b. 
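For reference, the queue_url region detection that appears in the block removed above (and that later patches rework in sqs.go) amounts to picking one label out of the URL host. Below is a minimal standalone sketch, assuming only the Go standard library; the helper name, the example URLs, and the empty-string fallback are illustrative rather than taken from the patches.

// region_sketch.go: extract the AWS region from an SQS queue URL.
package main

import (
	"fmt"
	"net/url"
	"strings"
)

func regionFromQueueURL(queueURL string) string {
	u, err := url.Parse(queueURL)
	if err != nil {
		return ""
	}
	parts := strings.Split(u.Host, ".")
	// Plain SQS endpoint: sqs.<region>.<domain>...
	if len(parts) >= 3 && parts[0] == "sqs" {
		return parts[1]
	}
	// VPC endpoint: <vpce-id>.sqs.<region>.vpce.<domain>...
	if len(parts) >= 5 && parts[1] == "sqs" && parts[3] == "vpce" {
		return parts[2]
	}
	return ""
}

func main() {
	fmt.Println(regionFromQueueURL("https://sqs.us-east-1.amazonaws.com/123456789012/my-queue"))
	fmt.Println(regionFromQueueURL("https://vpce-abc123.sqs.us-east-2.vpce.amazonaws.com/123456789012/my-queue"))
}

The real input additionally checks that the host's domain suffix matches the configured endpoint (or an amazonaws.* domain) before trusting the parsed region, and falls back to the configured region_name when parsing fails; the sketch leaves both checks out.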
From be54ac708a50c6c82aebb564c967d3cac9aab827 Mon Sep 17 00:00:00 2001
From: Fae Charlton
Date: Tue, 30 Apr 2024 13:41:14 -0400
Subject: [PATCH 46/99] update tests

---
 x-pack/filebeat/input/awss3/s3_input.go |  4 +-
 x-pack/filebeat/input/awss3/s3_test.go  | 50 +++++++++++++++++++++++--
 2 files changed, 48 insertions(+), 6 deletions(-)

diff --git a/x-pack/filebeat/input/awss3/s3_input.go b/x-pack/filebeat/input/awss3/s3_input.go
index 9c12e67b145a..af0e45ebe921 100644
--- a/x-pack/filebeat/input/awss3/s3_input.go
+++ b/x-pack/filebeat/input/awss3/s3_input.go
@@ -120,12 +120,12 @@ func (in *s3PollerInput) Run(
 func (in *s3PollerInput) scanLoop(ctx context.Context) {
 	for ctx.Err() == nil {
-		in.runScan(ctx)
+		in.runPoll(ctx)
 		_ = timed.Wait(ctx, in.config.BucketListInterval)
 	}
 }

-func (in *s3PollerInput) runScan(ctx context.Context) {
+func (in *s3PollerInput) runPoll(ctx context.Context) {
 	var workerWg sync.WaitGroup
 	workChan := make(chan *s3FetchTask)

diff --git a/x-pack/filebeat/input/awss3/s3_test.go b/x-pack/filebeat/input/awss3/s3_test.go
index 03572bfa1c39..91f3bdddf03b 100644
--- a/x-pack/filebeat/input/awss3/s3_test.go
+++ b/x-pack/filebeat/input/awss3/s3_test.go
@@ -135,10 +135,38 @@ func TestS3Poller(t *testing.T) {
 		s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockAPI, nil, backupConfig{})
 		states, err := newStates(store)
 		require.NoError(t, err, "states creation must succeed")
-		receiver := newS3Poller(logp.NewLogger(inputName), nil, mockAPI, mockPublisher, s3ObjProc, states, bucket, "key", "region", "provider", numberOfWorkers, pollInterval)
-		receiver.Poll(ctx)
+		poller := &s3PollerInput{
+			log: logp.NewLogger(inputName),
+			config: config{
+				NumberOfWorkers:    numberOfWorkers,
+				BucketListInterval: pollInterval,
+				BucketARN:          bucket,
+				BucketListPrefix:   "key",
+				RegionName:         "region",
+			},
+			s3:              mockAPI,
+			client:          mockPublisher,
+			s3ObjectHandler: s3ObjProc,
+			states:          states,
+			provider:        "provider",
+		}
+		poller.runPoll(ctx)
 	})
+	/*
+		func newS3Poller(log *logp.Logger,
+		-	metrics *inputMetrics,
+		-	s3 s3API,
+		-	client beat.Client,
+		-	s3ObjectHandler s3ObjectHandlerFactory,
+		-	states *states,
+		-	bucket string,
+		-	listPrefix string,
+		-	awsRegion string,
+		-	provider string,
+		-	numberOfWorkers int,
+		-	bucketPollInterval time.Duration,*/
+
 	t.Run("restart bucket scan after paging errors", func(t *testing.T) {
 		// Change the restart limit to 2 consecutive errors, so the test doesn't
 		// take too long to run
@@ -262,7 +290,21 @@ func TestS3Poller(t *testing.T) {
 		s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockAPI, nil, backupConfig{})
 		states, err := newStates(store)
 		require.NoError(t, err, "states creation must succeed")
-		receiver := newS3Poller(logp.NewLogger(inputName), nil, mockAPI, mockPublisher, s3ObjProc, states, bucket, "key", "region", "provider", numberOfWorkers, pollInterval)
-		receiver.Poll(ctx)
+		poller := &s3PollerInput{
+			log: logp.NewLogger(inputName),
+			config: config{
+				NumberOfWorkers:    numberOfWorkers,
+				BucketListInterval: pollInterval,
+				BucketARN:          bucket,
+				BucketListPrefix:   "key",
+				RegionName:         "region",
+			},
+			s3:              mockAPI,
+			client:          mockPublisher,
+			s3ObjectHandler: s3ObjProc,
+			states:          states,
+			provider:        "provider",
+		}
+		poller.runPoll(ctx)
 	})
 }

From 568d2b052ec1225b007f595048a8cf1e0c117e71 Mon Sep 17 00:00:00 2001
From: Fae Charlton
Date: Tue, 30 Apr 2024 15:40:49 -0400
Subject: [PATCH 47/99] merge sqsReaderInput and sqsReader

---
 x-pack/filebeat/input/awss3/sqs.go       | 131 +++------------
x-pack/filebeat/input/awss3/sqs_input.go | 201 +++++++++++++++++------ 2 files changed, 167 insertions(+), 165 deletions(-) diff --git a/x-pack/filebeat/input/awss3/sqs.go b/x-pack/filebeat/input/awss3/sqs.go index f27a3c6fa2c0..2a4255335723 100644 --- a/x-pack/filebeat/input/awss3/sqs.go +++ b/x-pack/filebeat/input/awss3/sqs.go @@ -5,7 +5,6 @@ package awss3 import ( - "context" "errors" "fmt" "net/url" @@ -16,7 +15,7 @@ import ( "github.com/aws/aws-sdk-go-v2/service/sqs/types" "github.com/aws/smithy-go" - "github.com/elastic/elastic-agent-libs/logp" + v2 "github.com/elastic/beats/v7/filebeat/input/v2" ) const ( @@ -25,111 +24,6 @@ const ( sqsApproximateNumberOfMessages = "ApproximateNumberOfMessages" ) -func newSQSReader(log *logp.Logger, metrics *inputMetrics, sqs sqsAPI, maxMessagesInflight int, msgHandler sqsProcessor) *sqsReader { - if metrics == nil { - // Metrics are optional. Initialize a stub. - metrics = newInputMetrics("", nil, 0) - } - return &sqsReader{ - maxMessagesInflight: maxMessagesInflight, - sqs: sqs, - msgHandler: msgHandler, - log: log, - metrics: metrics, - workChan: make(chan types.Message), - } -} - -// The main loop of the reader, that fetches messages from SQS -// and forwards them to workers via workChan. -func (r *sqsReader) Receive(ctx context.Context) { - r.startWorkers(ctx) - r.readerLoop(ctx) - - // Close the work channel to signal to the workers that we're done, - // then wait for them to finish. - close(r.workChan) - r.workerWg.Wait() -} - -func (r *sqsReader) readerLoop(ctx context.Context) { - for ctx.Err() == nil { - msgs := r.readMessages(ctx) - - for _, msg := range msgs { - select { - case <-ctx.Done(): - case r.workChan <- msg: - } - } - } -} - -func (r *sqsReader) workerLoop(ctx context.Context) { - for msg := range r.workChan { - start := time.Now() - - id := r.metrics.beginSQSWorker() - if err := r.msgHandler.ProcessSQS(ctx, &msg); err != nil { - r.log.Warnw("Failed processing SQS message.", - "error", err, - "message_id", *msg.MessageId, - "elapsed_time_ns", time.Since(start)) - } - r.metrics.endSQSWorker(id) - r.activeMessages.Dec() - } -} - -func (r *sqsReader) readMessages(ctx context.Context) []types.Message { - // We try to read enough messages to bring activeMessages up to the - // total worker count (plus one, to unblock us when workers are ready - // for more messages) - readCount := r.maxMessagesInflight + 1 - r.activeMessages.Load() - if readCount <= 0 { - return nil - } - msgs, err := r.sqs.ReceiveMessage(ctx, readCount) - for err != nil && ctx.Err() == nil { - r.log.Warnw("SQS ReceiveMessage returned an error. Will retry after a short delay.", "error", err) - // Wait for the retry delay, but stop early if the context is cancelled. - select { - case <-ctx.Done(): - return nil - case <-time.After(sqsRetryDelay): - } - msgs, err = r.sqs.ReceiveMessage(ctx, readCount) - } - r.activeMessages.Add(len(msgs)) - r.log.Debugf("Received %v SQS messages.", len(msgs)) - r.metrics.sqsMessagesReceivedTotal.Add(uint64(len(msgs))) - return msgs -} - -func (r *sqsReader) startWorkers(ctx context.Context) { - // Start the worker goroutines that will process messages from workChan - // until the input shuts down. 
- for i := 0; i < r.maxMessagesInflight; i++ { - r.workerWg.Add(1) - go func() { - defer r.workerWg.Done() - r.workerLoop(ctx) - }() - } -} - -func getApproximateMessageCount(ctx context.Context, sqs sqsAPI) (int, error) { - attributes, err := sqs.GetQueueAttributes(ctx, []types.QueueAttributeName{sqsApproximateNumberOfMessages}) - if err == nil { - if c, found := attributes[sqsApproximateNumberOfMessages]; found { - if messagesCount, err := strconv.Atoi(c); err == nil { - return messagesCount, nil - } - } - } - return -1, err -} - var errBadQueueURL = errors.New("QueueURL is not in format: https://sqs.{REGION_ENDPOINT}.{ENDPOINT}/{ACCOUNT_NUMBER}/{QUEUE_NAME} or https://{VPC_ENDPOINT}.sqs.{REGION_ENDPOINT}.vpce.{ENDPOINT}/{ACCOUNT_NUMBER}/{QUEUE_NAME}") func getRegionFromQueueURL(queueURL, endpoint string) (string, error) { @@ -160,18 +54,18 @@ func getRegionFromQueueURL(queueURL, endpoint string) (string, error) { return "", errBadQueueURL } -func pollSqsWaitingMetric(ctx context.Context, sqs sqsAPI, metrics *inputMetrics) { +func pollSqsWaitingMetric(canceler v2.Canceler, sqs sqsAPI, metrics *inputMetrics) { t := time.NewTicker(time.Minute) defer t.Stop() for { - if err := updateMessageCount(ctx, sqs, metrics); isSQSAuthError(err) { + if err := updateMessageCount(canceler, sqs, metrics); isSQSAuthError(err) { // stop polling if auth error is encountered // Set it back to -1 because there is a permission error metrics.sqsMessagesWaiting.Set(int64(-1)) return } select { - case <-ctx.Done(): + case <-canceler.Done(): return case <-t.C: } @@ -180,14 +74,27 @@ func pollSqsWaitingMetric(ctx context.Context, sqs sqsAPI, metrics *inputMetrics // updateMessageCount runs GetApproximateMessageCount for the given context and updates the receiver metric with the count returning false on no error // If there is an error, the metric is reinitialized to -1 and true is returned -func updateMessageCount(ctx context.Context, sqs sqsAPI, metrics *inputMetrics) error { - count, err := getApproximateMessageCount(ctx, sqs) +func updateMessageCount(canceler v2.Canceler, sqs sqsAPI, metrics *inputMetrics) error { + count, err := getApproximateMessageCount(canceler, sqs) if err == nil { metrics.sqsMessagesWaiting.Set(int64(count)) } return err } +func getApproximateMessageCount(canceler v2.Canceler, sqs sqsAPI) (int, error) { + ctx := v2.GoContextFromCanceler(canceler) + attributes, err := sqs.GetQueueAttributes(ctx, []types.QueueAttributeName{sqsApproximateNumberOfMessages}) + if err == nil { + if c, found := attributes[sqsApproximateNumberOfMessages]; found { + if messagesCount, err := strconv.Atoi(c); err == nil { + return messagesCount, nil + } + } + } + return -1, err +} + func isSQSAuthError(err error) bool { var apiError smithy.APIError if errors.As(err, &apiError) { diff --git a/x-pack/filebeat/input/awss3/sqs_input.go b/x-pack/filebeat/input/awss3/sqs_input.go index 8a5a1b6fa3c6..f30f63e9f20f 100644 --- a/x-pack/filebeat/input/awss3/sqs_input.go +++ b/x-pack/filebeat/input/awss3/sqs_input.go @@ -5,8 +5,10 @@ package awss3 import ( + "context" "fmt" "sync" + "time" awssdk "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/service/s3" @@ -20,18 +22,19 @@ import ( ) type sqsReaderInput struct { - config config - awsConfig awssdk.Config -} - -type sqsReader struct { - maxMessagesInflight int + config config + awsConfig awssdk.Config + maxMessagesInFlight int activeMessages atomic.Int sqs sqsAPI + s3 s3API msgHandler sqsProcessor log *logp.Logger metrics *inputMetrics + // The 
expected region based on the queue URL + detectedRegion string + // The main loop sends incoming messages to workChan, and the worker // goroutines read from it. workChan chan types.Message @@ -41,9 +44,43 @@ type sqsReader struct { } func newSQSReaderInput(config config, awsConfig awssdk.Config) (v2.Input, error) { + detectedRegion, err := getRegionFromQueueURL(config.QueueURL, config.AWSConfig.Endpoint) + if config.RegionName != "" { + awsConfig.Region = config.RegionName + } else if err != nil { + // Only report an error if we don't have a configured region + // to fall back on. + return nil, fmt.Errorf("failed to get AWS region from queue_url: %w", err) + } + + sqsAPI := &awsSQSAPI{ + client: sqs.NewFromConfig(awsConfig, func(o *sqs.Options) { + if config.AWSConfig.FIPSEnabled { + o.EndpointOptions.UseFIPSEndpoint = awssdk.FIPSEndpointStateEnabled + } + }), + queueURL: config.QueueURL, + apiTimeout: config.APITimeout, + visibilityTimeout: config.VisibilityTimeout, + longPollWaitTime: config.SQSWaitTime, + } + + s3API := &awsS3API{ + client: s3.NewFromConfig(awsConfig, func(o *s3.Options) { + if config.AWSConfig.FIPSEnabled { + o.EndpointOptions.UseFIPSEndpoint = awssdk.FIPSEndpointStateEnabled + } + o.UsePathStyle = config.PathStyle + }), + } + return &sqsReaderInput{ - config: config, - awsConfig: awsConfig, + config: config, + awsConfig: awsConfig, + sqs: sqsAPI, + s3: s3API, + detectedRegion: detectedRegion, + workChan: make(chan types.Message), }, nil } @@ -57,57 +94,29 @@ func (in *sqsReaderInput) Run( inputContext v2.Context, pipeline beat.Pipeline, ) error { - ctx := v2.GoContextFromCanceler(inputContext.Cancelation) - configRegion := in.config.RegionName - urlRegion, err := getRegionFromQueueURL(in.config.QueueURL, in.config.AWSConfig.Endpoint) - if err != nil && configRegion == "" { - // Only report an error if we don't have a configured region - // to fall back on. - return fmt.Errorf("failed to get AWS region from queue_url: %w", err) - } else if configRegion != "" && configRegion != urlRegion { - inputContext.Logger.Warnf("configured region disagrees with queue_url region (%q != %q): using %q", configRegion, urlRegion, urlRegion) - } - - in.awsConfig.Region = urlRegion - - // Create SQS receiver and S3 notification processor. - receiver, err := in.createSQSReceiver(inputContext, pipeline) + // Create SQS reader and S3 notification processor. 
+ err := in.initialize(inputContext, pipeline) if err != nil { return fmt.Errorf("failed to initialize sqs receiver: %w", err) } - defer receiver.metrics.Close() + defer in.metrics.Close() // Poll metrics periodically in the background - go pollSqsWaitingMetric(ctx, receiver.sqs, receiver.metrics) + go pollSqsWaitingMetric(inputContext.Cancelation, in.sqs, in.metrics) - receiver.Receive(ctx) + in.Receive(inputContext.Cancelation) return nil } -func (in *sqsReaderInput) createSQSReceiver(ctx v2.Context, pipeline beat.Pipeline) (*sqsReader, error) { - sqsAPI := &awsSQSAPI{ - client: sqs.NewFromConfig(in.awsConfig, func(o *sqs.Options) { - if in.config.AWSConfig.FIPSEnabled { - o.EndpointOptions.UseFIPSEndpoint = awssdk.FIPSEndpointStateEnabled - } - }), - queueURL: in.config.QueueURL, - apiTimeout: in.config.APITimeout, - visibilityTimeout: in.config.VisibilityTimeout, - longPollWaitTime: in.config.SQSWaitTime, - } - - s3API := &awsS3API{ - client: s3.NewFromConfig(in.awsConfig, func(o *s3.Options) { - if in.config.AWSConfig.FIPSEnabled { - o.EndpointOptions.UseFIPSEndpoint = awssdk.FIPSEndpointStateEnabled - } - o.UsePathStyle = in.config.PathStyle - }), - } +func (in *sqsReaderInput) initialize(ctx v2.Context, pipeline beat.Pipeline) error { log := ctx.Logger.With("queue_url", in.config.QueueURL) + in.log = log log.Infof("AWS api_timeout is set to %v.", in.config.APITimeout) log.Infof("AWS region is set to %v.", in.awsConfig.Region) + if in.awsConfig.Region != in.detectedRegion { + log.Warnf("configured region disagrees with queue_url region (%q != %q): using %q", in.awsConfig.Region, in.detectedRegion, in.awsConfig.Region) + + } log.Infof("AWS SQS visibility_timeout is set to %v.", in.config.VisibilityTimeout) log.Infof("AWS SQS max_number_of_messages is set to %v.", in.config.MaxNumberOfMessages) @@ -116,21 +125,107 @@ func (in *sqsReaderInput) createSQSReceiver(ctx v2.Context, pipeline beat.Pipeli "or prefixes to avoid an infinite loop.") } + in.metrics = newInputMetrics(ctx.ID, nil, in.config.MaxNumberOfMessages) + + var err error + in.msgHandler, err = in.createEventProcessor(pipeline) + if err != nil { + return err + } + + in.maxMessagesInFlight = in.config.MaxNumberOfMessages + return nil +} + +func (in *sqsReaderInput) createEventProcessor(pipeline beat.Pipeline) (sqsProcessor, error) { fileSelectors := in.config.FileSelectors if len(in.config.FileSelectors) == 0 { fileSelectors = []fileSelectorConfig{{ReaderConfig: in.config.ReaderConfig}} } - script, err := newScriptFromConfig(log.Named("sqs_script"), in.config.SQSScript) + s3EventHandlerFactory := newS3ObjectProcessorFactory(in.log.Named("s3"), in.metrics, in.s3, fileSelectors, in.config.BackupConfig) + + script, err := newScriptFromConfig(in.log.Named("sqs_script"), in.config.SQSScript) if err != nil { return nil, err } - metrics := newInputMetrics(ctx.ID, nil, in.config.MaxNumberOfMessages) + return newSQSS3EventProcessor(in.log.Named("sqs_s3_event"), in.metrics, in.sqs, script, in.config.VisibilityTimeout, in.config.SQSMaxReceiveCount, pipeline, s3EventHandlerFactory), nil +} + +// The main loop of the reader, that fetches messages from SQS +// and forwards them to workers via workChan. +func (r *sqsReaderInput) Receive(canceler v2.Canceler) { + ctx := v2.GoContextFromCanceler(canceler) + r.startWorkers(ctx) + r.readerLoop(ctx) + + // Close the work channel to signal to the workers that we're done, + // then wait for them to finish. 
+ close(r.workChan) + r.workerWg.Wait() +} - s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), metrics, s3API, fileSelectors, in.config.BackupConfig) +func (r *sqsReaderInput) readerLoop(ctx context.Context) { + for ctx.Err() == nil { + msgs := r.readMessages(ctx) + + for _, msg := range msgs { + select { + case <-ctx.Done(): + case r.workChan <- msg: + } + } + } +} - sqsMessageHandler := newSQSS3EventProcessor(log.Named("sqs_s3_event"), metrics, sqsAPI, script, in.config.VisibilityTimeout, in.config.SQSMaxReceiveCount, pipeline, s3EventHandlerFactory) +func (r *sqsReaderInput) workerLoop(ctx context.Context) { + for msg := range r.workChan { + start := time.Now() + + id := r.metrics.beginSQSWorker() + if err := r.msgHandler.ProcessSQS(ctx, &msg); err != nil { + r.log.Warnw("Failed processing SQS message.", + "error", err, + "message_id", *msg.MessageId, + "elapsed_time_ns", time.Since(start)) + } + r.metrics.endSQSWorker(id) + r.activeMessages.Dec() + } +} - sqsReader := newSQSReader(log.Named("sqs"), metrics, sqsAPI, in.config.MaxNumberOfMessages, sqsMessageHandler) +func (r *sqsReaderInput) readMessages(ctx context.Context) []types.Message { + // We try to read enough messages to bring activeMessages up to the + // total worker count (plus one, to unblock us when workers are ready + // for more messages) + readCount := r.config.MaxNumberOfMessages + 1 - r.activeMessages.Load() + if readCount <= 0 { + return nil + } + msgs, err := r.sqs.ReceiveMessage(ctx, readCount) + for err != nil && ctx.Err() == nil { + r.log.Warnw("SQS ReceiveMessage returned an error. Will retry after a short delay.", "error", err) + // Wait for the retry delay, but stop early if the context is cancelled. + select { + case <-ctx.Done(): + return nil + case <-time.After(sqsRetryDelay): + } + msgs, err = r.sqs.ReceiveMessage(ctx, readCount) + } + r.activeMessages.Add(len(msgs)) + r.log.Debugf("Received %v SQS messages.", len(msgs)) + r.metrics.sqsMessagesReceivedTotal.Add(uint64(len(msgs))) + return msgs +} - return sqsReader, nil +func (r *sqsReaderInput) startWorkers(ctx context.Context) { + // Start the worker goroutines that will process messages from workChan + // until the input shuts down. 
+ for i := 0; i < r.config.MaxNumberOfMessages; i++ { + r.workerWg.Add(1) + go func() { + defer r.workerWg.Done() + r.workerLoop(ctx) + }() + } } From 383c11158a83910bc71e4ebe7c54b17e9eead40b Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Wed, 1 May 2024 09:23:38 -0400 Subject: [PATCH 48/99] get tests building again --- .../input/awss3/input_benchmark_test.go | 8 +++++- x-pack/filebeat/input/awss3/sqs_test.go | 28 ++++++++++++++++--- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/x-pack/filebeat/input/awss3/input_benchmark_test.go b/x-pack/filebeat/input/awss3/input_benchmark_test.go index c1a84802a3cc..f11ec5fab337 100644 --- a/x-pack/filebeat/input/awss3/input_benchmark_test.go +++ b/x-pack/filebeat/input/awss3/input_benchmark_test.go @@ -220,7 +220,13 @@ func benchmarkInputSQS(t *testing.T, maxMessagesInflight int) testing.BenchmarkR s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), metrics, s3API, conf.FileSelectors, backupConfig{}) sqsMessageHandler := newSQSS3EventProcessor(log.Named("sqs_s3_event"), metrics, sqsAPI, nil, time.Minute, 5, pipeline, s3EventHandlerFactory) - sqsReader := newSQSReader(log.Named("sqs"), metrics, sqsAPI, maxMessagesInflight, sqsMessageHandler) + sqsReader := &sqsReaderInput{ + log: log.Named("sqs"), + metrics: metrics, + sqs: sqsAPI, + maxMessagesInFlight: maxMessagesInflight, + msgHandler: sqsMessageHandler, + } ctx, cancel := context.WithCancel(context.Background()) b.Cleanup(cancel) diff --git a/x-pack/filebeat/input/awss3/sqs_test.go b/x-pack/filebeat/input/awss3/sqs_test.go index 463e358d0f59..e65fe75e0c27 100644 --- a/x-pack/filebeat/input/awss3/sqs_test.go +++ b/x-pack/filebeat/input/awss3/sqs_test.go @@ -71,7 +71,12 @@ func TestSQSReceiver(t *testing.T) { Return(nil) // Execute sqsReader and verify calls/state. - receiver := newSQSReader(logp.NewLogger(inputName), nil, mockAPI, maxMessages, mockMsgHandler) + receiver := &sqsReaderInput{ + log: logp.NewLogger(inputName), + sqs: mockAPI, + maxMessagesInFlight: maxMessages, + msgHandler: mockMsgHandler, + } receiver.Receive(ctx) }) @@ -103,7 +108,12 @@ func TestSQSReceiver(t *testing.T) { ) // Execute SQSReceiver and verify calls/state. 
- receiver := newSQSReader(logp.NewLogger(inputName), nil, mockAPI, maxMessages, mockMsgHandler) + receiver := &sqsReaderInput{ + log: logp.NewLogger(inputName), + sqs: mockAPI, + maxMessagesInFlight: maxMessages, + msgHandler: mockMsgHandler, + } receiver.Receive(ctx) }) } @@ -134,7 +144,12 @@ func TestGetApproximateMessageCount(t *testing.T) { }), ) - receiver := newSQSReader(logp.NewLogger(inputName), nil, mockAPI, maxMessages, mockMsgHandler) + receiver := &sqsReaderInput{ + log: logp.NewLogger(inputName), + sqs: mockAPI, + maxMessagesInFlight: maxMessages, + msgHandler: mockMsgHandler, + } receivedCount, err := getApproximateMessageCount(ctx, receiver.sqs) assert.Equal(t, count, receivedCount) assert.Nil(t, err) @@ -159,7 +174,12 @@ func TestGetApproximateMessageCount(t *testing.T) { }), ) - receiver := newSQSReader(logp.NewLogger(inputName), nil, mockAPI, maxMessages, mockMsgHandler) + receiver := &sqsReaderInput{ + log: logp.NewLogger(inputName), + sqs: mockAPI, + maxMessagesInFlight: maxMessages, + msgHandler: mockMsgHandler, + } receivedCount, err := getApproximateMessageCount(ctx, receiver.sqs) assert.Equal(t, -1, receivedCount) assert.NotNil(t, err) From 4b2ea11287da7f55db882d4b930ca26980592670 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Wed, 1 May 2024 13:40:27 -0400 Subject: [PATCH 49/99] remove redundant fields --- .../input/awss3/input_benchmark_test.go | 10 +- x-pack/filebeat/input/awss3/sqs_input.go | 178 +++++++++--------- x-pack/filebeat/input/awss3/sqs_test.go | 32 ++-- 3 files changed, 114 insertions(+), 106 deletions(-) diff --git a/x-pack/filebeat/input/awss3/input_benchmark_test.go b/x-pack/filebeat/input/awss3/input_benchmark_test.go index f11ec5fab337..ed9123de0eaf 100644 --- a/x-pack/filebeat/input/awss3/input_benchmark_test.go +++ b/x-pack/filebeat/input/awss3/input_benchmark_test.go @@ -221,11 +221,11 @@ func benchmarkInputSQS(t *testing.T, maxMessagesInflight int) testing.BenchmarkR s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), metrics, s3API, conf.FileSelectors, backupConfig{}) sqsMessageHandler := newSQSS3EventProcessor(log.Named("sqs_s3_event"), metrics, sqsAPI, nil, time.Minute, 5, pipeline, s3EventHandlerFactory) sqsReader := &sqsReaderInput{ - log: log.Named("sqs"), - metrics: metrics, - sqs: sqsAPI, - maxMessagesInFlight: maxMessagesInflight, - msgHandler: sqsMessageHandler, + log: log.Named("sqs"), + config: config{MaxNumberOfMessages: maxMessagesInflight}, + metrics: metrics, + sqs: sqsAPI, + msgHandler: sqsMessageHandler, } ctx, cancel := context.WithCancel(context.Background()) diff --git a/x-pack/filebeat/input/awss3/sqs_input.go b/x-pack/filebeat/input/awss3/sqs_input.go index f30f63e9f20f..733ede10fa9c 100644 --- a/x-pack/filebeat/input/awss3/sqs_input.go +++ b/x-pack/filebeat/input/awss3/sqs_input.go @@ -22,22 +22,22 @@ import ( ) type sqsReaderInput struct { - config config - awsConfig awssdk.Config - maxMessagesInFlight int - activeMessages atomic.Int - sqs sqsAPI - s3 s3API - msgHandler sqsProcessor - log *logp.Logger - metrics *inputMetrics + config config + awsConfig awssdk.Config + activeMessages atomic.Int + sqs sqsAPI + s3 s3API + msgHandler sqsProcessor + log *logp.Logger + metrics *inputMetrics // The expected region based on the queue URL detectedRegion string - // The main loop sends incoming messages to workChan, and the worker - // goroutines read from it. 
- workChan chan types.Message + // Workers send on workRequestChan to indicate they're ready for the next + // message, and the reader loop replies on workResponseChan. + workRequestChan chan struct{} + workResponseChan chan types.Message // workerWg is used to wait on worker goroutines during shutdown workerWg sync.WaitGroup @@ -75,12 +75,13 @@ func newSQSReaderInput(config config, awsConfig awssdk.Config) (v2.Input, error) } return &sqsReaderInput{ - config: config, - awsConfig: awsConfig, - sqs: sqsAPI, - s3: s3API, - detectedRegion: detectedRegion, - workChan: make(chan types.Message), + config: config, + awsConfig: awsConfig, + sqs: sqsAPI, + s3: s3API, + detectedRegion: detectedRegion, + workRequestChan: make(chan struct{}, config.MaxNumberOfMessages), + workResponseChan: make(chan types.Message), }, nil } @@ -94,47 +95,31 @@ func (in *sqsReaderInput) Run( inputContext v2.Context, pipeline beat.Pipeline, ) error { - // Create SQS reader and S3 notification processor. - err := in.initialize(inputContext, pipeline) + in.log = inputContext.Logger.With("queue_url", in.config.QueueURL) + in.logConfigSummary() + + in.metrics = newInputMetrics(inputContext.ID, nil, in.config.MaxNumberOfMessages) + defer in.metrics.Close() + + var err error + in.msgHandler, err = in.createEventProcessor(pipeline) if err != nil { - return fmt.Errorf("failed to initialize sqs receiver: %w", err) + return fmt.Errorf("failed to initialize sqs reader: %w", err) } - defer in.metrics.Close() // Poll metrics periodically in the background go pollSqsWaitingMetric(inputContext.Cancelation, in.sqs, in.metrics) - in.Receive(inputContext.Cancelation) + ctx := v2.GoContextFromCanceler(inputContext.Cancelation) + in.run(ctx) return nil } -func (in *sqsReaderInput) initialize(ctx v2.Context, pipeline beat.Pipeline) error { - log := ctx.Logger.With("queue_url", in.config.QueueURL) - in.log = log - log.Infof("AWS api_timeout is set to %v.", in.config.APITimeout) - log.Infof("AWS region is set to %v.", in.awsConfig.Region) - if in.awsConfig.Region != in.detectedRegion { - log.Warnf("configured region disagrees with queue_url region (%q != %q): using %q", in.awsConfig.Region, in.detectedRegion, in.awsConfig.Region) - - } - log.Infof("AWS SQS visibility_timeout is set to %v.", in.config.VisibilityTimeout) - log.Infof("AWS SQS max_number_of_messages is set to %v.", in.config.MaxNumberOfMessages) - - if in.config.BackupConfig.GetBucketName() != "" { - log.Warnf("You have the backup_to_bucket functionality activated with SQS. Please make sure to set appropriate destination buckets" + - "or prefixes to avoid an infinite loop.") - } - - in.metrics = newInputMetrics(ctx.ID, nil, in.config.MaxNumberOfMessages) +func (in *sqsReaderInput) run(ctx context.Context) { + in.startWorkers(ctx) + in.readerLoop(ctx) - var err error - in.msgHandler, err = in.createEventProcessor(pipeline) - if err != nil { - return err - } - - in.maxMessagesInFlight = in.config.MaxNumberOfMessages - return nil + in.workerWg.Wait() } func (in *sqsReaderInput) createEventProcessor(pipeline beat.Pipeline) (sqsProcessor, error) { @@ -151,81 +136,104 @@ func (in *sqsReaderInput) createEventProcessor(pipeline beat.Pipeline) (sqsProce return newSQSS3EventProcessor(in.log.Named("sqs_s3_event"), in.metrics, in.sqs, script, in.config.VisibilityTimeout, in.config.SQSMaxReceiveCount, pipeline, s3EventHandlerFactory), nil } -// The main loop of the reader, that fetches messages from SQS -// and forwards them to workers via workChan. 
-func (r *sqsReaderInput) Receive(canceler v2.Canceler) { - ctx := v2.GoContextFromCanceler(canceler) - r.startWorkers(ctx) - r.readerLoop(ctx) - - // Close the work channel to signal to the workers that we're done, - // then wait for them to finish. - close(r.workChan) - r.workerWg.Wait() -} - -func (r *sqsReaderInput) readerLoop(ctx context.Context) { +func (in *sqsReaderInput) readerLoop(ctx context.Context) { + requestCount := 0 for ctx.Err() == nil { - msgs := r.readMessages(ctx) + // Check for any new pending work requests + for { + select { + case <-in. + } + } + + msgs := in.readMessages(ctx) for _, msg := range msgs { select { case <-ctx.Done(): - case r.workChan <- msg: + case in.workChan <- msg: } } } } -func (r *sqsReaderInput) workerLoop(ctx context.Context) { - for msg := range r.workChan { +func (in *sqsReaderInput) workerLoop(ctx context.Context) { + for ctx.Err() == nil { + select { + case <-ctx.Done(): + // Shutting down + return + case in.workRequestChan <- struct{}{}: + } + // We successfully sent a work request, now we must wait for the + // response (even if ctx expires). + msg, ok := <-in.workResponseChan + if !ok { + // No task available, reader is shutting down + return + } start := time.Now() - id := r.metrics.beginSQSWorker() - if err := r.msgHandler.ProcessSQS(ctx, &msg); err != nil { - r.log.Warnw("Failed processing SQS message.", + id := in.metrics.beginSQSWorker() + if err := in.msgHandler.ProcessSQS(ctx, &msg); err != nil { + in.log.Warnw("Failed processing SQS message.", "error", err, "message_id", *msg.MessageId, "elapsed_time_ns", time.Since(start)) } - r.metrics.endSQSWorker(id) - r.activeMessages.Dec() + in.metrics.endSQSWorker(id) } } -func (r *sqsReaderInput) readMessages(ctx context.Context) []types.Message { +func (in *sqsReaderInput) readMessages(ctx context.Context) []types.Message { // We try to read enough messages to bring activeMessages up to the // total worker count (plus one, to unblock us when workers are ready // for more messages) - readCount := r.config.MaxNumberOfMessages + 1 - r.activeMessages.Load() + readCount := in.config.MaxNumberOfMessages + 1 - in.activeMessages.Load() if readCount <= 0 { return nil } - msgs, err := r.sqs.ReceiveMessage(ctx, readCount) + msgs, err := in.sqs.ReceiveMessage(ctx, readCount) for err != nil && ctx.Err() == nil { - r.log.Warnw("SQS ReceiveMessage returned an error. Will retry after a short delay.", "error", err) + in.log.Warnw("SQS ReceiveMessage returned an error. Will retry after a short delay.", "error", err) // Wait for the retry delay, but stop early if the context is cancelled. select { case <-ctx.Done(): return nil case <-time.After(sqsRetryDelay): } - msgs, err = r.sqs.ReceiveMessage(ctx, readCount) + msgs, err = in.sqs.ReceiveMessage(ctx, readCount) } - r.activeMessages.Add(len(msgs)) - r.log.Debugf("Received %v SQS messages.", len(msgs)) - r.metrics.sqsMessagesReceivedTotal.Add(uint64(len(msgs))) + in.activeMessages.Add(len(msgs)) + in.log.Debugf("Received %v SQS messages.", len(msgs)) + in.metrics.sqsMessagesReceivedTotal.Add(uint64(len(msgs))) return msgs } -func (r *sqsReaderInput) startWorkers(ctx context.Context) { +func (in *sqsReaderInput) startWorkers(ctx context.Context) { // Start the worker goroutines that will process messages from workChan // until the input shuts down. 
- for i := 0; i < r.config.MaxNumberOfMessages; i++ { - r.workerWg.Add(1) + for i := 0; i < in.config.MaxNumberOfMessages; i++ { + in.workerWg.Add(1) go func() { - defer r.workerWg.Done() - r.workerLoop(ctx) + defer in.workerWg.Done() + in.workerLoop(ctx) }() } } + +func (in *sqsReaderInput) logConfigSummary() { + log := in.log + log.Infof("AWS api_timeout is set to %v.", in.config.APITimeout) + log.Infof("AWS region is set to %v.", in.awsConfig.Region) + if in.awsConfig.Region != in.detectedRegion { + log.Warnf("configured region disagrees with queue_url region (%q != %q): using %q", in.awsConfig.Region, in.detectedRegion, in.awsConfig.Region) + } + log.Infof("AWS SQS visibility_timeout is set to %v.", in.config.VisibilityTimeout) + log.Infof("AWS SQS max_number_of_messages is set to %v.", in.config.MaxNumberOfMessages) + + if in.config.BackupConfig.GetBucketName() != "" { + log.Warnf("You have the backup_to_bucket functionality activated with SQS. Please make sure to set appropriate destination buckets" + + "or prefixes to avoid an infinite loop.") + } +} diff --git a/x-pack/filebeat/input/awss3/sqs_test.go b/x-pack/filebeat/input/awss3/sqs_test.go index e65fe75e0c27..570b5fe9023b 100644 --- a/x-pack/filebeat/input/awss3/sqs_test.go +++ b/x-pack/filebeat/input/awss3/sqs_test.go @@ -72,10 +72,10 @@ func TestSQSReceiver(t *testing.T) { // Execute sqsReader and verify calls/state. receiver := &sqsReaderInput{ - log: logp.NewLogger(inputName), - sqs: mockAPI, - maxMessagesInFlight: maxMessages, - msgHandler: mockMsgHandler, + log: logp.NewLogger(inputName), + config: config{MaxNumberOfMessages: maxMessages}, + sqs: mockAPI, + msgHandler: mockMsgHandler, } receiver.Receive(ctx) }) @@ -109,10 +109,10 @@ func TestSQSReceiver(t *testing.T) { // Execute SQSReceiver and verify calls/state. 
receiver := &sqsReaderInput{ - log: logp.NewLogger(inputName), - sqs: mockAPI, - maxMessagesInFlight: maxMessages, - msgHandler: mockMsgHandler, + log: logp.NewLogger(inputName), + config: config{MaxNumberOfMessages: maxMessages}, + sqs: mockAPI, + msgHandler: mockMsgHandler, } receiver.Receive(ctx) }) @@ -145,10 +145,10 @@ func TestGetApproximateMessageCount(t *testing.T) { ) receiver := &sqsReaderInput{ - log: logp.NewLogger(inputName), - sqs: mockAPI, - maxMessagesInFlight: maxMessages, - msgHandler: mockMsgHandler, + log: logp.NewLogger(inputName), + config: config{MaxNumberOfMessages: maxMessages}, + sqs: mockAPI, + msgHandler: mockMsgHandler, } receivedCount, err := getApproximateMessageCount(ctx, receiver.sqs) assert.Equal(t, count, receivedCount) @@ -175,10 +175,10 @@ func TestGetApproximateMessageCount(t *testing.T) { ) receiver := &sqsReaderInput{ - log: logp.NewLogger(inputName), - sqs: mockAPI, - maxMessagesInFlight: maxMessages, - msgHandler: mockMsgHandler, + log: logp.NewLogger(inputName), + config: config{MaxNumberOfMessages: maxMessages}, + sqs: mockAPI, + msgHandler: mockMsgHandler, } receivedCount, err := getApproximateMessageCount(ctx, receiver.sqs) assert.Equal(t, -1, receivedCount) From de3681673791be1f9c94e6565cc0f7412264693d Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Wed, 1 May 2024 15:30:22 -0400 Subject: [PATCH 50/99] more reorganization --- .../input/awss3/input_benchmark_test.go | 2 +- x-pack/filebeat/input/awss3/input_test.go | 5 +- x-pack/filebeat/input/awss3/sqs.go | 91 ++++++++---- x-pack/filebeat/input/awss3/sqs_input.go | 131 +++++++++--------- x-pack/filebeat/input/awss3/sqs_test.go | 27 +--- 5 files changed, 141 insertions(+), 115 deletions(-) diff --git a/x-pack/filebeat/input/awss3/input_benchmark_test.go b/x-pack/filebeat/input/awss3/input_benchmark_test.go index ed9123de0eaf..06facd292ac6 100644 --- a/x-pack/filebeat/input/awss3/input_benchmark_test.go +++ b/x-pack/filebeat/input/awss3/input_benchmark_test.go @@ -240,7 +240,7 @@ func benchmarkInputSQS(t *testing.T, maxMessagesInflight int) testing.BenchmarkR b.ResetTimer() start := time.Now() - sqsReader.Receive(ctx) + sqsReader.run(ctx) b.StopTimer() elapsed := time.Since(start) diff --git a/x-pack/filebeat/input/awss3/input_test.go b/x-pack/filebeat/input/awss3/input_test.go index 0a3053f7f1b9..a8015435dd40 100644 --- a/x-pack/filebeat/input/awss3/input_test.go +++ b/x-pack/filebeat/input/awss3/input_test.go @@ -88,10 +88,7 @@ func TestGetRegionFromQueueURL(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - got, err := getRegionFromQueueURL(test.queueURL, test.endpoint) - if !sameError(err, test.wantErr) { - t.Errorf("unexpected error: got:%v want:%v", err, test.wantErr) - } + got := getRegionFromQueueURL(test.queueURL, test.endpoint) if got != test.want { t.Errorf("unexpected result: got:%q want:%q", got, test.want) } diff --git a/x-pack/filebeat/input/awss3/sqs.go b/x-pack/filebeat/input/awss3/sqs.go index 2a4255335723..297d613728d0 100644 --- a/x-pack/filebeat/input/awss3/sqs.go +++ b/x-pack/filebeat/input/awss3/sqs.go @@ -5,8 +5,8 @@ package awss3 import ( + "context" "errors" - "fmt" "net/url" "strconv" "strings" @@ -15,9 +15,14 @@ import ( "github.com/aws/aws-sdk-go-v2/service/sqs/types" "github.com/aws/smithy-go" - v2 "github.com/elastic/beats/v7/filebeat/input/v2" + "github.com/elastic/elastic-agent-libs/logp" ) +type messageCountMonitor struct { + sqs sqsAPI + metrics *inputMetrics +} + const ( 
sqsAccessDeniedErrorCode = "AccessDeniedException" sqsRetryDelay = 10 * time.Second @@ -26,46 +31,77 @@ const ( var errBadQueueURL = errors.New("QueueURL is not in format: https://sqs.{REGION_ENDPOINT}.{ENDPOINT}/{ACCOUNT_NUMBER}/{QUEUE_NAME} or https://{VPC_ENDPOINT}.sqs.{REGION_ENDPOINT}.vpce.{ENDPOINT}/{ACCOUNT_NUMBER}/{QUEUE_NAME}") -func getRegionFromQueueURL(queueURL, endpoint string) (string, error) { +func getRegionFromQueueURL(queueURL, endpoint string) string { // get region from queueURL // Example for sqs queue: https://sqs.us-east-1.amazonaws.com/12345678912/test-s3-logs // Example for vpce: https://vpce-test.sqs.us-east-1.vpce.amazonaws.com/12345678912/sqs-queue u, err := url.Parse(queueURL) if err != nil { - return "", fmt.Errorf(queueURL + " is not a valid URL") + return "" } - if (u.Scheme == "https" || u.Scheme == "http") && u.Host != "" { - queueHostSplit := strings.SplitN(u.Host, ".", 3) - // check for sqs queue url - if len(queueHostSplit) == 3 && queueHostSplit[0] == "sqs" { - if queueHostSplit[2] == endpoint || (endpoint == "" && strings.HasPrefix(queueHostSplit[2], "amazonaws.")) { - return queueHostSplit[1], nil - } + + // check for sqs queue url + host := strings.SplitN(u.Host, ".", 3) + if len(host) == 3 && host[0] == "sqs" { + if host[2] == endpoint || (endpoint == "" && strings.HasPrefix(host[2], "amazonaws.")) { + return host[1] } + } - // check for vpce url - queueHostSplitVPC := strings.SplitN(u.Host, ".", 5) - if len(queueHostSplitVPC) == 5 && queueHostSplitVPC[1] == "sqs" { - if queueHostSplitVPC[4] == endpoint || (endpoint == "" && strings.HasPrefix(queueHostSplitVPC[4], "amazonaws.")) { - return queueHostSplitVPC[2], nil - } + // check for vpce url + host = strings.SplitN(u.Host, ".", 5) + if len(host) == 5 && host[1] == "sqs" { + if host[4] == endpoint || (endpoint == "" && strings.HasPrefix(host[4], "amazonaws.")) { + return host[2] + } + } + + return "" +} + +// readSQSMessages reads up to the requested number of SQS messages via +// ReceiveMessage. It always returns at least one result unless the +// context expires +func readSQSMessages( + ctx context.Context, + log *logp.Logger, + sqs sqsAPI, + metrics *inputMetrics, + count int, +) []types.Message { + if count <= 0 { + return nil + } + msgs, err := sqs.ReceiveMessage(ctx, count) + for (err != nil || len(msgs) == 0) && ctx.Err() == nil { + if err != nil { + log.Warnw("SQS ReceiveMessage returned an error. Will retry after a short delay.", "error", err) + } + // Wait for the retry delay, but stop early if the context is cancelled. 
+ select { + case <-ctx.Done(): + return nil + case <-time.After(sqsRetryDelay): } + msgs, err = sqs.ReceiveMessage(ctx, count) } - return "", errBadQueueURL + log.Debugf("Received %v SQS messages.", len(msgs)) + metrics.sqsMessagesReceivedTotal.Add(uint64(len(msgs))) + return msgs } -func pollSqsWaitingMetric(canceler v2.Canceler, sqs sqsAPI, metrics *inputMetrics) { +func (mcm messageCountMonitor) run(ctx context.Context) { t := time.NewTicker(time.Minute) defer t.Stop() for { - if err := updateMessageCount(canceler, sqs, metrics); isSQSAuthError(err) { + if err := mcm.updateMessageCount(ctx); isSQSAuthError(err) { // stop polling if auth error is encountered // Set it back to -1 because there is a permission error - metrics.sqsMessagesWaiting.Set(int64(-1)) + mcm.metrics.sqsMessagesWaiting.Set(int64(-1)) return } select { - case <-canceler.Done(): + case <-ctx.Done(): return case <-t.C: } @@ -74,17 +110,16 @@ func pollSqsWaitingMetric(canceler v2.Canceler, sqs sqsAPI, metrics *inputMetric // updateMessageCount runs GetApproximateMessageCount for the given context and updates the receiver metric with the count returning false on no error // If there is an error, the metric is reinitialized to -1 and true is returned -func updateMessageCount(canceler v2.Canceler, sqs sqsAPI, metrics *inputMetrics) error { - count, err := getApproximateMessageCount(canceler, sqs) +func (mcm messageCountMonitor) updateMessageCount(ctx context.Context) error { + count, err := mcm.getApproximateMessageCount(ctx) if err == nil { - metrics.sqsMessagesWaiting.Set(int64(count)) + mcm.metrics.sqsMessagesWaiting.Set(int64(count)) } return err } -func getApproximateMessageCount(canceler v2.Canceler, sqs sqsAPI) (int, error) { - ctx := v2.GoContextFromCanceler(canceler) - attributes, err := sqs.GetQueueAttributes(ctx, []types.QueueAttributeName{sqsApproximateNumberOfMessages}) +func (mcm messageCountMonitor) getApproximateMessageCount(ctx context.Context) (int, error) { + attributes, err := mcm.sqs.GetQueueAttributes(ctx, []types.QueueAttributeName{sqsApproximateNumberOfMessages}) if err == nil { if c, found := attributes[sqsApproximateNumberOfMessages]; found { if messagesCount, err := strconv.Atoi(c); err == nil { diff --git a/x-pack/filebeat/input/awss3/sqs_input.go b/x-pack/filebeat/input/awss3/sqs_input.go index 733ede10fa9c..de852fb8bf36 100644 --- a/x-pack/filebeat/input/awss3/sqs_input.go +++ b/x-pack/filebeat/input/awss3/sqs_input.go @@ -17,19 +17,17 @@ import ( v2 "github.com/elastic/beats/v7/filebeat/input/v2" "github.com/elastic/beats/v7/libbeat/beat" - "github.com/elastic/beats/v7/libbeat/common/atomic" "github.com/elastic/elastic-agent-libs/logp" ) type sqsReaderInput struct { - config config - awsConfig awssdk.Config - activeMessages atomic.Int - sqs sqsAPI - s3 s3API - msgHandler sqsProcessor - log *logp.Logger - metrics *inputMetrics + config config + awsConfig awssdk.Config + sqs sqsAPI + s3 s3API + msgHandler sqsProcessor + log *logp.Logger + metrics *inputMetrics // The expected region based on the queue URL detectedRegion string @@ -43,14 +41,14 @@ type sqsReaderInput struct { workerWg sync.WaitGroup } -func newSQSReaderInput(config config, awsConfig awssdk.Config) (v2.Input, error) { - detectedRegion, err := getRegionFromQueueURL(config.QueueURL, config.AWSConfig.Endpoint) +func newSQSReaderInput(config config, awsConfig awssdk.Config) (*sqsReaderInput, error) { + detectedRegion := getRegionFromQueueURL(config.QueueURL, config.AWSConfig.Endpoint) if 
config.RegionName != "" { awsConfig.Region = config.RegionName - } else if err != nil { + } else if detectedRegion == "" { // Only report an error if we don't have a configured region // to fall back on. - return nil, fmt.Errorf("failed to get AWS region from queue_url: %w", err) + return nil, fmt.Errorf("failed to get AWS region from queue_url: %w", errBadQueueURL) } sqsAPI := &awsSQSAPI{ @@ -107,11 +105,17 @@ func (in *sqsReaderInput) Run( return fmt.Errorf("failed to initialize sqs reader: %w", err) } + ctx := v2.GoContextFromCanceler(inputContext.Cancelation) + // Poll metrics periodically in the background - go pollSqsWaitingMetric(inputContext.Cancelation, in.sqs, in.metrics) + go messageCountMonitor{ + sqs: in.sqs, + metrics: in.metrics, + }.run(ctx) - ctx := v2.GoContextFromCanceler(inputContext.Cancelation) + // Start the main run loop in.run(ctx) + return nil } @@ -137,21 +141,21 @@ func (in *sqsReaderInput) createEventProcessor(pipeline beat.Pipeline) (sqsProce } func (in *sqsReaderInput) readerLoop(ctx context.Context) { + // requestCount is the number of outstanding work requests that the + // reader will try to fulfill requestCount := 0 for ctx.Err() == nil { - // Check for any new pending work requests - for { - select { - case <-in. - } - } + // Block to wait for more requests if requestCount is zero + requestCount += channelRequestCount(ctx, in.workRequestChan, requestCount == 0) - msgs := in.readMessages(ctx) + msgs := readSQSMessages(ctx, in.log, in.sqs, in.metrics, requestCount) for _, msg := range msgs { select { case <-ctx.Done(): - case in.workChan <- msg: + return + case in.workResponseChan <- msg: + requestCount-- } } } @@ -159,55 +163,30 @@ func (in *sqsReaderInput) readerLoop(ctx context.Context) { func (in *sqsReaderInput) workerLoop(ctx context.Context) { for ctx.Err() == nil { + // Send a work request select { case <-ctx.Done(): // Shutting down return case in.workRequestChan <- struct{}{}: } - // We successfully sent a work request, now we must wait for the - // response (even if ctx expires). - msg, ok := <-in.workResponseChan - if !ok { - // No task available, reader is shutting down - return - } - start := time.Now() - - id := in.metrics.beginSQSWorker() - if err := in.msgHandler.ProcessSQS(ctx, &msg); err != nil { - in.log.Warnw("Failed processing SQS message.", - "error", err, - "message_id", *msg.MessageId, - "elapsed_time_ns", time.Since(start)) - } - in.metrics.endSQSWorker(id) - } -} - -func (in *sqsReaderInput) readMessages(ctx context.Context) []types.Message { - // We try to read enough messages to bring activeMessages up to the - // total worker count (plus one, to unblock us when workers are ready - // for more messages) - readCount := in.config.MaxNumberOfMessages + 1 - in.activeMessages.Load() - if readCount <= 0 { - return nil - } - msgs, err := in.sqs.ReceiveMessage(ctx, readCount) - for err != nil && ctx.Err() == nil { - in.log.Warnw("SQS ReceiveMessage returned an error. Will retry after a short delay.", "error", err) - // Wait for the retry delay, but stop early if the context is cancelled. 
+ // The request is sent, wait for a response select { case <-ctx.Done(): - return nil - case <-time.After(sqsRetryDelay): + return + case msg := <-in.workResponseChan: + start := time.Now() + + id := in.metrics.beginSQSWorker() + if err := in.msgHandler.ProcessSQS(ctx, &msg); err != nil { + in.log.Warnw("Failed processing SQS message.", + "error", err, + "message_id", *msg.MessageId, + "elapsed_time_ns", time.Since(start)) + } + in.metrics.endSQSWorker(id) } - msgs, err = in.sqs.ReceiveMessage(ctx, readCount) } - in.activeMessages.Add(len(msgs)) - in.log.Debugf("Received %v SQS messages.", len(msgs)) - in.metrics.sqsMessagesReceivedTotal.Add(uint64(len(msgs))) - return msgs } func (in *sqsReaderInput) startWorkers(ctx context.Context) { @@ -237,3 +216,31 @@ func (in *sqsReaderInput) logConfigSummary() { "or prefixes to avoid an infinite loop.") } } + +// Read all pending requests and return their count. If block is true, +// waits until the result is at least 1, unless the context expires. +func channelRequestCount( + ctx context.Context, + requestChan chan struct{}, + block bool, +) int { + requestCount := 0 + if block { + // Wait until at least one request comes in. + select { + case <-ctx.Done(): + return 0 + case <-requestChan: + requestCount++ + } + } + // Read as many requests as we can without blocking. + for { + select { + case <-requestChan: + requestCount++ + default: + return requestCount + } + } +} diff --git a/x-pack/filebeat/input/awss3/sqs_test.go b/x-pack/filebeat/input/awss3/sqs_test.go index 570b5fe9023b..d93985f8d50a 100644 --- a/x-pack/filebeat/input/awss3/sqs_test.go +++ b/x-pack/filebeat/input/awss3/sqs_test.go @@ -77,7 +77,7 @@ func TestSQSReceiver(t *testing.T) { sqs: mockAPI, msgHandler: mockMsgHandler, } - receiver.Receive(ctx) + receiver.run(ctx) }) t.Run("retry after ReceiveMessage error", func(t *testing.T) { @@ -114,7 +114,7 @@ func TestSQSReceiver(t *testing.T) { sqs: mockAPI, msgHandler: mockMsgHandler, } - receiver.Receive(ctx) + receiver.run(ctx) }) } @@ -126,14 +126,13 @@ func TestGetApproximateMessageCount(t *testing.T) { attrName := []types.QueueAttributeName{sqsApproximateNumberOfMessages} attr := map[string]string{"ApproximateNumberOfMessages": "500"} - t.Run("GetApproximateMessageCount success", func(t *testing.T) { + t.Run("getApproximateMessageCount success", func(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), testTimeout) defer cancel() ctrl, ctx := gomock.WithContext(ctx, t) defer ctrl.Finish() mockAPI := NewMockSQSAPI(ctrl) - mockMsgHandler := NewMockSQSProcessor(ctrl) gomock.InOrder( mockAPI.EXPECT(). @@ -144,15 +143,10 @@ func TestGetApproximateMessageCount(t *testing.T) { }), ) - receiver := &sqsReaderInput{ - log: logp.NewLogger(inputName), - config: config{MaxNumberOfMessages: maxMessages}, - sqs: mockAPI, - msgHandler: mockMsgHandler, - } - receivedCount, err := getApproximateMessageCount(ctx, receiver.sqs) + receivedCount, err := + messageCountMonitor{sqs: mockAPI}.getApproximateMessageCount(ctx) assert.Equal(t, count, receivedCount) - assert.Nil(t, err) + assert.NoError(t, err) }) t.Run("GetApproximateMessageCount error", func(t *testing.T) { @@ -163,7 +157,6 @@ func TestGetApproximateMessageCount(t *testing.T) { defer ctrl.Finish() mockAPI := NewMockSQSAPI(ctrl) - mockMsgHandler := NewMockSQSProcessor(ctrl) gomock.InOrder( mockAPI.EXPECT(). 
@@ -174,13 +167,7 @@ func TestGetApproximateMessageCount(t *testing.T) { }), ) - receiver := &sqsReaderInput{ - log: logp.NewLogger(inputName), - config: config{MaxNumberOfMessages: maxMessages}, - sqs: mockAPI, - msgHandler: mockMsgHandler, - } - receivedCount, err := getApproximateMessageCount(ctx, receiver.sqs) + receivedCount, err := messageCountMonitor{sqs: mockAPI}.getApproximateMessageCount(ctx) assert.Equal(t, -1, receivedCount) assert.NotNil(t, err) }) From 31173342735b8f8613859777f69e48e1cec6b296 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Wed, 1 May 2024 15:41:27 -0400 Subject: [PATCH 51/99] organizing --- x-pack/filebeat/input/awss3/sqs_input.go | 30 ++++++++++++------------ 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/x-pack/filebeat/input/awss3/sqs_input.go b/x-pack/filebeat/input/awss3/sqs_input.go index de852fb8bf36..a32cd68cf407 100644 --- a/x-pack/filebeat/input/awss3/sqs_input.go +++ b/x-pack/filebeat/input/awss3/sqs_input.go @@ -126,20 +126,6 @@ func (in *sqsReaderInput) run(ctx context.Context) { in.workerWg.Wait() } -func (in *sqsReaderInput) createEventProcessor(pipeline beat.Pipeline) (sqsProcessor, error) { - fileSelectors := in.config.FileSelectors - if len(in.config.FileSelectors) == 0 { - fileSelectors = []fileSelectorConfig{{ReaderConfig: in.config.ReaderConfig}} - } - s3EventHandlerFactory := newS3ObjectProcessorFactory(in.log.Named("s3"), in.metrics, in.s3, fileSelectors, in.config.BackupConfig) - - script, err := newScriptFromConfig(in.log.Named("sqs_script"), in.config.SQSScript) - if err != nil { - return nil, err - } - return newSQSS3EventProcessor(in.log.Named("sqs_s3_event"), in.metrics, in.sqs, script, in.config.VisibilityTimeout, in.config.SQSMaxReceiveCount, pipeline, s3EventHandlerFactory), nil -} - func (in *sqsReaderInput) readerLoop(ctx context.Context) { // requestCount is the number of outstanding work requests that the // reader will try to fulfill @@ -212,11 +198,25 @@ func (in *sqsReaderInput) logConfigSummary() { log.Infof("AWS SQS max_number_of_messages is set to %v.", in.config.MaxNumberOfMessages) if in.config.BackupConfig.GetBucketName() != "" { - log.Warnf("You have the backup_to_bucket functionality activated with SQS. Please make sure to set appropriate destination buckets" + + log.Warnf("You have the backup_to_bucket functionality activated with SQS. Please make sure to set appropriate destination buckets " + "or prefixes to avoid an infinite loop.") } } +func (in *sqsReaderInput) createEventProcessor(pipeline beat.Pipeline) (sqsProcessor, error) { + fileSelectors := in.config.FileSelectors + if len(in.config.FileSelectors) == 0 { + fileSelectors = []fileSelectorConfig{{ReaderConfig: in.config.ReaderConfig}} + } + s3EventHandlerFactory := newS3ObjectProcessorFactory(in.log.Named("s3"), in.metrics, in.s3, fileSelectors, in.config.BackupConfig) + + script, err := newScriptFromConfig(in.log.Named("sqs_script"), in.config.SQSScript) + if err != nil { + return nil, err + } + return newSQSS3EventProcessor(in.log.Named("sqs_s3_event"), in.metrics, in.sqs, script, in.config.VisibilityTimeout, in.config.SQSMaxReceiveCount, pipeline, s3EventHandlerFactory), nil +} + // Read all pending requests and return their count. If block is true, // waits until the result is at least 1, unless the context expires. 
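// For example, with block=true and three requests already queued, the
// blocking receive consumes one request and the non-blocking loop drains
// the other two, so the call returns 3.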
func channelRequestCount( From 6b43ac5d7f1e2f721fd3f25af2d3ab5afdf28414 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Wed, 1 May 2024 15:44:24 -0400 Subject: [PATCH 52/99] reordering code --- x-pack/filebeat/input/awss3/config.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/x-pack/filebeat/input/awss3/config.go b/x-pack/filebeat/input/awss3/config.go index 37bb75fbb984..87b6efa13b40 100644 --- a/x-pack/filebeat/input/awss3/config.go +++ b/x-pack/filebeat/input/awss3/config.go @@ -266,8 +266,8 @@ func (c config) s3ConfigModifier(o *s3.Options) { } func (c config) getFileSelectors() []fileSelectorConfig { - if len(c.FileSelectors) == 0 { - return []fileSelectorConfig{{ReaderConfig: c.ReaderConfig}} + if len(c.FileSelectors) > 0 { + return c.FileSelectors } - return c.FileSelectors + return []fileSelectorConfig{{ReaderConfig: c.ReaderConfig}} } From 3712af4b8edd122824ac7ac981740346df3e270c Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Wed, 1 May 2024 16:42:12 -0400 Subject: [PATCH 53/99] clean up states initialization --- x-pack/filebeat/input/awss3/input.go | 6 +- .../input/awss3/input_benchmark_test.go | 14 +--- x-pack/filebeat/input/awss3/s3_input.go | 25 +++---- x-pack/filebeat/input/awss3/s3_test.go | 32 ++------- x-pack/filebeat/input/awss3/sqs_test.go | 1 - x-pack/filebeat/input/awss3/states.go | 69 ++++++++++++------- x-pack/filebeat/input/awss3/states_test.go | 17 +---- 7 files changed, 66 insertions(+), 98 deletions(-) diff --git a/x-pack/filebeat/input/awss3/input.go b/x-pack/filebeat/input/awss3/input.go index fb5f23881452..f2bc399c30e5 100644 --- a/x-pack/filebeat/input/awss3/input.go +++ b/x-pack/filebeat/input/awss3/input.go @@ -64,11 +64,7 @@ func (im *s3InputManager) Create(cfg *conf.C) (v2.Input, error) { } if config.BucketARN != "" || config.NonAWSBucketName != "" { - persistentStore, err := im.store.Access() - if err != nil { - return nil, fmt.Errorf("can not access persistent store: %w", err) - } - return newS3PollerInput(config, awsConfig, persistentStore) + return newS3PollerInput(config, awsConfig, im.store) } return nil, fmt.Errorf("configuration has no SQS queue URL and no S3 bucket ARN") diff --git a/x-pack/filebeat/input/awss3/input_benchmark_test.go b/x-pack/filebeat/input/awss3/input_benchmark_test.go index 06facd292ac6..2f300d42501e 100644 --- a/x-pack/filebeat/input/awss3/input_benchmark_test.go +++ b/x-pack/filebeat/input/awss3/input_benchmark_test.go @@ -17,9 +17,6 @@ import ( "github.com/stretchr/testify/assert" - "github.com/elastic/beats/v7/libbeat/statestore" - "github.com/elastic/beats/v7/libbeat/statestore/storetest" - "github.com/elastic/beats/v7/libbeat/beat" "github.com/aws/aws-sdk-go-v2/aws" @@ -339,14 +336,9 @@ func benchmarkInputS3(t *testing.T, numberOfWorkers int) testing.BenchmarkResult curConfig.BucketListPrefix = fmt.Sprintf("list_prefix_%d", i) s3API := newConstantS3(t) s3API.pagerConstant = newS3PagerConstant(curConfig.BucketListPrefix) - storeReg := statestore.NewRegistry(storetest.NewMemoryStoreBackend()) - store, err := storeReg.Get("test") - if err != nil { - errChan <- fmt.Errorf("failed to access store: %w", err) - return - } + store := openTestStatestore() - states, err := newStates(store) + states, err := newStates(nil, store) assert.NoError(t, err, "states creation should succeed") s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), metrics, s3API, config.FileSelectors, backupConfig{}) @@ -361,7 +353,7 @@ func benchmarkInputS3(t *testing.T, 
numberOfWorkers int) testing.BenchmarkResult provider: "provider", } - s3Poller.scanLoop(ctx) + s3Poller.run(ctx) }(i, wg) } diff --git a/x-pack/filebeat/input/awss3/s3_input.go b/x-pack/filebeat/input/awss3/s3_input.go index af0e45ebe921..ad27a40f561b 100644 --- a/x-pack/filebeat/input/awss3/s3_input.go +++ b/x-pack/filebeat/input/awss3/s3_input.go @@ -13,10 +13,10 @@ import ( awssdk "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/aws/ratelimit" + "github.com/elastic/beats/v7/filebeat/beater" v2 "github.com/elastic/beats/v7/filebeat/input/v2" "github.com/elastic/beats/v7/libbeat/beat" "github.com/elastic/beats/v7/libbeat/common/backoff" - "github.com/elastic/beats/v7/libbeat/statestore" awscommon "github.com/elastic/beats/v7/x-pack/libbeat/common/aws" "github.com/elastic/elastic-agent-libs/logp" "github.com/elastic/go-concert/timed" @@ -30,6 +30,7 @@ type s3PollerInput struct { log *logp.Logger config config awsConfig awssdk.Config + store beater.StateStore provider string s3 s3API metrics *inputMetrics @@ -53,18 +54,13 @@ func (in *s3PollerInput) Test(ctx v2.TestContext) error { func newS3PollerInput( config config, awsConfig awssdk.Config, - store *statestore.Store, + store beater.StateStore, ) (v2.Input, error) { - states, err := newStates(store) - if err != nil { - return nil, fmt.Errorf("can not start persistent store: %w", err) - } - return &s3PollerInput{ config: config, awsConfig: awsConfig, - states: states, + store: store, }, nil } @@ -86,6 +82,11 @@ func (in *s3PollerInput) Run( log := inputContext.Logger.Named("s3") var err error + // Load the persistent S3 polling state. + in.states, err = newStates(log, in.store) + if err != nil { + return fmt.Errorf("can not start persistent store: %w", err) + } defer in.states.Close() // Create client for publishing events and receive notification of their ACKs. @@ -111,14 +112,14 @@ func (in *s3PollerInput) Run( in.config.getFileSelectors(), in.config.BackupConfig) - // Scan the bucket in a loop, delaying by the configured interval each - // iteration. - in.scanLoop(ctx) + in.run(ctx) return nil } -func (in *s3PollerInput) scanLoop(ctx context.Context) { +func (in *s3PollerInput) run(ctx context.Context) { + // Scan the bucket in a loop, delaying by the configured interval each + // iteration. 
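	// (timed.Wait should return early once ctx is cancelled, so a long
	// bucket_list_interval is not expected to delay shutdown.)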
for ctx.Err() == nil { in.runPoll(ctx) _ = timed.Wait(ctx, in.config.BucketListInterval) diff --git a/x-pack/filebeat/input/awss3/s3_test.go b/x-pack/filebeat/input/awss3/s3_test.go index 91f3bdddf03b..ba5fa4b32dcd 100644 --- a/x-pack/filebeat/input/awss3/s3_test.go +++ b/x-pack/filebeat/input/awss3/s3_test.go @@ -15,8 +15,6 @@ import ( "github.com/golang/mock/gomock" "github.com/stretchr/testify/require" - "github.com/elastic/beats/v7/libbeat/statestore" - "github.com/elastic/beats/v7/libbeat/statestore/storetest" "github.com/elastic/elastic-agent-libs/logp" ) @@ -29,11 +27,7 @@ func TestS3Poller(t *testing.T) { const testTimeout = 1 * time.Second t.Run("Poll success", func(t *testing.T) { - storeReg := statestore.NewRegistry(storetest.NewMemoryStoreBackend()) - store, err := storeReg.Get("test") - if err != nil { - t.Fatalf("Failed to access store: %v", err) - } + store := openTestStatestore() ctx, cancel := context.WithTimeout(context.Background(), testTimeout) defer cancel() @@ -133,7 +127,7 @@ func TestS3Poller(t *testing.T) { Return(nil, errFakeConnectivityFailure) s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockAPI, nil, backupConfig{}) - states, err := newStates(store) + states, err := newStates(nil, store) require.NoError(t, err, "states creation must succeed") poller := &s3PollerInput{ log: logp.NewLogger(inputName), @@ -153,29 +147,11 @@ func TestS3Poller(t *testing.T) { poller.runPoll(ctx) }) - /* - func newS3Poller(log *logp.Logger, - - metrics *inputMetrics, - - s3 s3API, - - client beat.Client, - - s3ObjectHandler s3ObjectHandlerFactory, - - states *states, - - bucket string, - - listPrefix string, - - awsRegion string, - - provider string, - - numberOfWorkers int, - - bucketPollInterval time.Duration,*/ - t.Run("restart bucket scan after paging errors", func(t *testing.T) { // Change the restart limit to 2 consecutive errors, so the test doesn't // take too long to run readerLoopMaxCircuitBreaker = 2 - storeReg := statestore.NewRegistry(storetest.NewMemoryStoreBackend()) - store, err := storeReg.Get("test") - if err != nil { - t.Fatalf("Failed to access store: %v", err) - } + store := openTestStatestore() ctx, cancel := context.WithTimeout(context.Background(), testTimeout+pollInterval) defer cancel() @@ -288,7 +264,7 @@ func TestS3Poller(t *testing.T) { Return(nil, errFakeConnectivityFailure) s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockAPI, nil, backupConfig{}) - states, err := newStates(store) + states, err := newStates(nil, store) require.NoError(t, err, "states creation must succeed") poller := &s3PollerInput{ log: logp.NewLogger(inputName), diff --git a/x-pack/filebeat/input/awss3/sqs_test.go b/x-pack/filebeat/input/awss3/sqs_test.go index d93985f8d50a..c6c7be7a72af 100644 --- a/x-pack/filebeat/input/awss3/sqs_test.go +++ b/x-pack/filebeat/input/awss3/sqs_test.go @@ -121,7 +121,6 @@ func TestSQSReceiver(t *testing.T) { func TestGetApproximateMessageCount(t *testing.T) { logp.TestingSetup() - const maxMessages = 5 const count = 500 attrName := []types.QueueAttributeName{sqsApproximateNumberOfMessages} attr := map[string]string{"ApproximateNumberOfMessages": "500"} diff --git a/x-pack/filebeat/input/awss3/states.go b/x-pack/filebeat/input/awss3/states.go index fe7a78a28bdc..cb40abbd41f0 100644 --- a/x-pack/filebeat/input/awss3/states.go +++ b/x-pack/filebeat/input/awss3/states.go @@ -5,10 +5,13 @@ package awss3 import ( + "fmt" "strings" "sync" + 
"github.com/elastic/beats/v7/filebeat/beater" "github.com/elastic/beats/v7/libbeat/statestore" + "github.com/elastic/elastic-agent-libs/logp" ) const awsS3ObjectStatePrefix = "filebeat::aws-s3::state::" @@ -28,34 +31,15 @@ type states struct { } // newStates generates a new states registry. -func newStates(store *statestore.Store) (*states, error) { - stateTable := map[string]state{} - - err := store.Each(func(key string, dec statestore.ValueDecoder) (bool, error) { - if !strings.HasPrefix(key, awsS3ObjectStatePrefix) { - return true, nil - } - - // try to decode. Ignore faulty/incompatible values. - var st state - if err := dec.Decode(&st); err != nil { - // Skip this key but continue iteration - //nolint:nilerr // One bad object shouldn't stop iteration - return true, nil - } - if !st.Stored && !st.Failed { - // This is from an older version where state could be stored in the - // registry even if the object wasn't processed, or if it encountered - // ephemeral download errors. We don't add these to the in-memory cache, - // so if we see them during a bucket scan we will still retry them. - return true, nil - } +func newStates(log *logp.Logger, stateStore beater.StateStore) (*states, error) { + store, err := stateStore.Access() + if err != nil { + return nil, fmt.Errorf("can't access persistent store: %w", err) + } - stateTable[st.ID()] = st - return true, nil - }) + stateTable, err := loadS3StatesFromRegistry(log, store) if err != nil { - return nil, err + return nil, fmt.Errorf("loading S3 input state: %w", err) } return &states{ @@ -94,3 +78,36 @@ func (s *states) Close() { s.store.Close() s.storeLock.Unlock() } + +func loadS3StatesFromRegistry(log *logp.Logger, store *statestore.Store) (map[string]state, error) { + stateTable := map[string]state{} + err := store.Each(func(key string, dec statestore.ValueDecoder) (bool, error) { + if !strings.HasPrefix(key, awsS3ObjectStatePrefix) { + return true, nil + } + + // try to decode. Ignore faulty/incompatible values. + var st state + if err := dec.Decode(&st); err != nil { + // Skip this key but continue iteration + if log != nil { + log.Warnf("invalid S3 state loading object key %v", key) + } + return true, nil + } + if !st.Stored && !st.Failed { + // This is from an older version where state could be stored in the + // registry even if the object wasn't processed, or if it encountered + // ephemeral download errors. We don't add these to the in-memory cache, + // so if we see them during a bucket scan we will still retry them. 
+ return true, nil + } + + stateTable[st.ID()] = st + return true, nil + }) + if err != nil { + return nil, err + } + return stateTable, nil +} diff --git a/x-pack/filebeat/input/awss3/states_test.go b/x-pack/filebeat/input/awss3/states_test.go index 082e5819f5ce..dc345d5f88e8 100644 --- a/x-pack/filebeat/input/awss3/states_test.go +++ b/x-pack/filebeat/input/awss3/states_test.go @@ -5,7 +5,6 @@ package awss3 import ( - "context" "testing" "time" @@ -15,9 +14,6 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - - v2 "github.com/elastic/beats/v7/filebeat/input/v2" - "github.com/elastic/elastic-agent-libs/logp" ) type testInputStore struct { @@ -42,11 +38,6 @@ func (s *testInputStore) CleanupInterval() time.Duration { return 24 * time.Hour } -var inputCtx = v2.Context{ - Logger: logp.NewLogger("test"), - Cancelation: context.Background(), -} - func TestStatesAddStateAndIsProcessed(t *testing.T) { type stateTestCase struct { // An initialization callback to invoke on the (initially empty) states. @@ -117,17 +108,13 @@ func TestStatesAddStateAndIsProcessed(t *testing.T) { test := test t.Run(name, func(t *testing.T) { store := openTestStatestore() - persistentStore, err := store.Access() - if err != nil { - t.Fatalf("unexpected err: %v", err) - } - states, err := newStates(persistentStore) + states, err := newStates(nil, store) require.NoError(t, err, "states creation must succeed") if test.statesEdit != nil { test.statesEdit(states) } if test.shouldReload { - states, err = newStates(persistentStore) + states, err = newStates(nil, store) require.NoError(t, err, "states creation must succeed") } From 894ba4c8e4890e213fc2d78f75d65bfd389a1d74 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Wed, 1 May 2024 17:20:09 -0400 Subject: [PATCH 54/99] remove unused helper --- x-pack/filebeat/input/awss3/input_test.go | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/x-pack/filebeat/input/awss3/input_test.go b/x-pack/filebeat/input/awss3/input_test.go index a8015435dd40..526ccd29f477 100644 --- a/x-pack/filebeat/input/awss3/input_test.go +++ b/x-pack/filebeat/input/awss3/input_test.go @@ -95,14 +95,3 @@ func TestGetRegionFromQueueURL(t *testing.T) { }) } } - -func sameError(a, b error) bool { - switch { - case a == nil && b == nil: - return true - case a == nil, b == nil: - return false - default: - return a.Error() == b.Error() - } -} From 31f3b95f1a96975c045ea93899f2a4e16323fa11 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Thu, 2 May 2024 10:53:41 -0400 Subject: [PATCH 55/99] working on test updates --- .../input/awss3/input_benchmark_test.go | 22 +++++++-- x-pack/filebeat/input/awss3/sqs_input.go | 48 +++++++++++++++---- 2 files changed, 55 insertions(+), 15 deletions(-) diff --git a/x-pack/filebeat/input/awss3/input_benchmark_test.go b/x-pack/filebeat/input/awss3/input_benchmark_test.go index 2f300d42501e..4e0ec9765dae 100644 --- a/x-pack/filebeat/input/awss3/input_benchmark_test.go +++ b/x-pack/filebeat/input/awss3/input_benchmark_test.go @@ -16,7 +16,9 @@ import ( "time" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + v2 "github.com/elastic/beats/v7/filebeat/input/v2" "github.com/elastic/beats/v7/libbeat/beat" "github.com/aws/aws-sdk-go-v2/aws" @@ -214,16 +216,20 @@ func benchmarkInputSQS(t *testing.T, maxMessagesInflight int) testing.BenchmarkR s3API := newConstantS3(t) pipeline := &fakePipeline{} conf := makeBenchmarkConfig(t) - - 
s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), metrics, s3API, conf.FileSelectors, backupConfig{}) - sqsMessageHandler := newSQSS3EventProcessor(log.Named("sqs_s3_event"), metrics, sqsAPI, nil, time.Minute, 5, pipeline, s3EventHandlerFactory) - sqsReader := &sqsReaderInput{ + conf.MaxNumberOfMessages = maxMessagesInflight + + //s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), metrics, s3API, conf.FileSelectors, backupConfig{}) + //sqsMessageHandler := newSQSS3EventProcessor(log.Named("sqs_s3_event"), metrics, sqsAPI, nil, time.Minute, 5, pipeline, s3EventHandlerFactory) + sqsReader, err := newSQSReaderInput( + config{MaxNumberOfMessages: maxMessagesInflight}, aws.Config{}) + require.NoError(t, err, "newSQSReaderInput must succeed") + /*sqsReader := &sqsReaderInput{ log: log.Named("sqs"), config: config{MaxNumberOfMessages: maxMessagesInflight}, metrics: metrics, sqs: sqsAPI, msgHandler: sqsMessageHandler, - } + }*/ ctx, cancel := context.WithCancel(context.Background()) b.Cleanup(cancel) @@ -235,6 +241,12 @@ func benchmarkInputSQS(t *testing.T, maxMessagesInflight int) testing.BenchmarkR cancel() }() + sqsReader.setup(v2.Context{}, &fakePipeline{}) + // Override the internal helper APIs with mocked versions + sqsReader.s3 = s3API + sqsReader.sqs = sqsAPI + sqsReader.metrics = metrics + b.ResetTimer() start := time.Now() sqsReader.run(ctx) diff --git a/x-pack/filebeat/input/awss3/sqs_input.go b/x-pack/filebeat/input/awss3/sqs_input.go index a32cd68cf407..a33eac13eab7 100644 --- a/x-pack/filebeat/input/awss3/sqs_input.go +++ b/x-pack/filebeat/input/awss3/sqs_input.go @@ -93,19 +93,46 @@ func (in *sqsReaderInput) Run( inputContext v2.Context, pipeline beat.Pipeline, ) error { + // Initialize everything for this run + err := in.setup(inputContext, pipeline) + if err != nil { + return err + } + + // Start the main run loop + ctx := v2.GoContextFromCanceler(inputContext.Cancelation) + in.run(ctx) + in.cleanup() + + return nil +} + +// Apply internal initialization based on the parameters of Run, in +// preparation for calling run. setup and run are separate functions so +// tests can apply mocks and overrides before the run loop starts. +func (in *sqsReaderInput) setup(inputContext v2.Context, pipeline beat.Pipeline) error { in.log = inputContext.Logger.With("queue_url", in.config.QueueURL) - in.logConfigSummary() in.metrics = newInputMetrics(inputContext.ID, nil, in.config.MaxNumberOfMessages) - defer in.metrics.Close() var err error in.msgHandler, err = in.createEventProcessor(pipeline) if err != nil { return fmt.Errorf("failed to initialize sqs reader: %w", err) } + return nil +} - ctx := v2.GoContextFromCanceler(inputContext.Cancelation) +// Release internal resources created during setup (currently just metrics). +// This is its own function so tests can handle the run loop in isolation. +func (in *sqsReaderInput) cleanup() { + in.metrics.Close() +} + +// Create the main goroutines for the input (workers, message count monitor) +// and begin the run loop. 
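// run is also the entry point tests use directly: they construct the
// input with newSQSReaderInput, swap the sqs/s3/msgHandler/metrics
// fields for mocks, and then call run with a cancellable context.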
+func (in *sqsReaderInput) run(ctx context.Context) { + in.logConfigSummary() // Poll metrics periodically in the background go messageCountMonitor{ @@ -113,13 +140,7 @@ func (in *sqsReaderInput) Run( metrics: in.metrics, }.run(ctx) - // Start the main run loop - in.run(ctx) - - return nil -} - -func (in *sqsReaderInput) run(ctx context.Context) { + fmt.Printf("hi fae: sqsReaderInput.run\n") in.startWorkers(ctx) in.readerLoop(ctx) @@ -131,16 +152,21 @@ func (in *sqsReaderInput) readerLoop(ctx context.Context) { // reader will try to fulfill requestCount := 0 for ctx.Err() == nil { + fmt.Printf("hi fae: readerLoop begin\n") // Block to wait for more requests if requestCount is zero requestCount += channelRequestCount(ctx, in.workRequestChan, requestCount == 0) + fmt.Printf("hi fae: requestCount %v\n", requestCount) msgs := readSQSMessages(ctx, in.log, in.sqs, in.metrics, requestCount) + fmt.Printf("hi fae: got %v messages\n", len(msgs)) for _, msg := range msgs { + fmt.Printf("hi fae, sending one message\n") select { case <-ctx.Done(): return case in.workResponseChan <- msg: + fmt.Printf("hi fae, message sent\n") requestCount-- } } @@ -155,12 +181,14 @@ func (in *sqsReaderInput) workerLoop(ctx context.Context) { // Shutting down return case in.workRequestChan <- struct{}{}: + fmt.Printf("hi fae: sent work request\n") } // The request is sent, wait for a response select { case <-ctx.Done(): return case msg := <-in.workResponseChan: + fmt.Printf("hi fae: received work response\n") start := time.Now() id := in.metrics.beginSQSWorker() From 7d12f0ad840b129585cd5703e513ea77086dfc14 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Thu, 2 May 2024 14:57:04 -0400 Subject: [PATCH 56/99] fix benchmark tests --- x-pack/filebeat/input/awss3/input.go | 2 +- .../input/awss3/input_benchmark_test.go | 50 ++++----- x-pack/filebeat/input/awss3/sqs_input.go | 105 +++++++++--------- 3 files changed, 74 insertions(+), 83 deletions(-) diff --git a/x-pack/filebeat/input/awss3/input.go b/x-pack/filebeat/input/awss3/input.go index f2bc399c30e5..2e88b797f318 100644 --- a/x-pack/filebeat/input/awss3/input.go +++ b/x-pack/filebeat/input/awss3/input.go @@ -60,7 +60,7 @@ func (im *s3InputManager) Create(cfg *conf.C) (v2.Input, error) { } if config.QueueURL != "" { - return newSQSReaderInput(config, awsConfig) + return newSQSReaderInput(config, awsConfig), nil } if config.BucketARN != "" || config.NonAWSBucketName != "" { diff --git a/x-pack/filebeat/input/awss3/input_benchmark_test.go b/x-pack/filebeat/input/awss3/input_benchmark_test.go index 4e0ec9765dae..879e98597ee1 100644 --- a/x-pack/filebeat/input/awss3/input_benchmark_test.go +++ b/x-pack/filebeat/input/awss3/input_benchmark_test.go @@ -18,7 +18,6 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - v2 "github.com/elastic/beats/v7/filebeat/input/v2" "github.com/elastic/beats/v7/libbeat/beat" "github.com/aws/aws-sdk-go-v2/aws" @@ -209,44 +208,33 @@ file_selectors: func benchmarkInputSQS(t *testing.T, maxMessagesInflight int) testing.BenchmarkResult { return testing.Benchmark(func(b *testing.B) { - log := logp.NewLogger(inputName) - metricRegistry := monitoring.NewRegistry() - metrics := newInputMetrics("test_id", metricRegistry, maxMessagesInflight) - sqsAPI := newConstantSQS() - s3API := newConstantS3(t) pipeline := &fakePipeline{} - conf := makeBenchmarkConfig(t) - conf.MaxNumberOfMessages = maxMessagesInflight //s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), 
metrics, s3API, conf.FileSelectors, backupConfig{}) //sqsMessageHandler := newSQSS3EventProcessor(log.Named("sqs_s3_event"), metrics, sqsAPI, nil, time.Minute, 5, pipeline, s3EventHandlerFactory) - sqsReader, err := newSQSReaderInput( - config{MaxNumberOfMessages: maxMessagesInflight}, aws.Config{}) - require.NoError(t, err, "newSQSReaderInput must succeed") - /*sqsReader := &sqsReaderInput{ - log: log.Named("sqs"), - config: config{MaxNumberOfMessages: maxMessagesInflight}, - metrics: metrics, - sqs: sqsAPI, - msgHandler: sqsMessageHandler, - }*/ + conf := makeBenchmarkConfig(t) + conf.MaxNumberOfMessages = maxMessagesInflight + sqsReader := newSQSReaderInput(conf, aws.Config{}) + sqsReader.log = log.Named("sqs") + sqsReader.metrics = newInputMetrics("test_id", monitoring.NewRegistry(), maxMessagesInflight) + sqsReader.sqs = newConstantSQS() + sqsReader.s3 = newConstantS3(t) + //sqsReader.msgHandler = sqsMessageHandler + + var err error + sqsReader.msgHandler, err = sqsReader.createEventProcessor(pipeline) + require.NoError(t, err, "createEventProcessor must succeed") ctx, cancel := context.WithCancel(context.Background()) b.Cleanup(cancel) go func() { - for metrics.sqsMessagesReceivedTotal.Get() < uint64(b.N) { + for sqsReader.metrics.sqsMessagesReceivedTotal.Get() < uint64(b.N) { time.Sleep(5 * time.Millisecond) } cancel() }() - sqsReader.setup(v2.Context{}, &fakePipeline{}) - // Override the internal helper APIs with mocked versions - sqsReader.s3 = s3API - sqsReader.sqs = sqsAPI - sqsReader.metrics = metrics - b.ResetTimer() start := time.Now() sqsReader.run(ctx) @@ -256,14 +244,14 @@ func benchmarkInputSQS(t *testing.T, maxMessagesInflight int) testing.BenchmarkR b.ReportMetric(float64(maxMessagesInflight), "max_messages_inflight") b.ReportMetric(elapsed.Seconds(), "sec") - b.ReportMetric(float64(metrics.s3EventsCreatedTotal.Get()), "events") - b.ReportMetric(float64(metrics.s3EventsCreatedTotal.Get())/elapsed.Seconds(), "events_per_sec") + b.ReportMetric(float64(sqsReader.metrics.s3EventsCreatedTotal.Get()), "events") + b.ReportMetric(float64(sqsReader.metrics.s3EventsCreatedTotal.Get())/elapsed.Seconds(), "events_per_sec") - b.ReportMetric(float64(metrics.s3BytesProcessedTotal.Get()), "s3_bytes") - b.ReportMetric(float64(metrics.s3BytesProcessedTotal.Get())/elapsed.Seconds(), "s3_bytes_per_sec") + b.ReportMetric(float64(sqsReader.metrics.s3BytesProcessedTotal.Get()), "s3_bytes") + b.ReportMetric(float64(sqsReader.metrics.s3BytesProcessedTotal.Get())/elapsed.Seconds(), "s3_bytes_per_sec") - b.ReportMetric(float64(metrics.sqsMessagesDeletedTotal.Get()), "sqs_messages") - b.ReportMetric(float64(metrics.sqsMessagesDeletedTotal.Get())/elapsed.Seconds(), "sqs_messages_per_sec") + b.ReportMetric(float64(sqsReader.metrics.sqsMessagesDeletedTotal.Get()), "sqs_messages") + b.ReportMetric(float64(sqsReader.metrics.sqsMessagesDeletedTotal.Get())/elapsed.Seconds(), "sqs_messages_per_sec") }) } diff --git a/x-pack/filebeat/input/awss3/sqs_input.go b/x-pack/filebeat/input/awss3/sqs_input.go index a33eac13eab7..7ca5a4240d7f 100644 --- a/x-pack/filebeat/input/awss3/sqs_input.go +++ b/x-pack/filebeat/input/awss3/sqs_input.go @@ -18,6 +18,7 @@ import ( v2 "github.com/elastic/beats/v7/filebeat/input/v2" "github.com/elastic/beats/v7/libbeat/beat" "github.com/elastic/elastic-agent-libs/logp" + "github.com/elastic/elastic-agent-libs/monitoring" ) type sqsReaderInput struct { @@ -41,46 +42,14 @@ type sqsReaderInput struct { workerWg sync.WaitGroup } -func 
newSQSReaderInput(config config, awsConfig awssdk.Config) (*sqsReaderInput, error) { - detectedRegion := getRegionFromQueueURL(config.QueueURL, config.AWSConfig.Endpoint) - if config.RegionName != "" { - awsConfig.Region = config.RegionName - } else if detectedRegion == "" { - // Only report an error if we don't have a configured region - // to fall back on. - return nil, fmt.Errorf("failed to get AWS region from queue_url: %w", errBadQueueURL) - } - - sqsAPI := &awsSQSAPI{ - client: sqs.NewFromConfig(awsConfig, func(o *sqs.Options) { - if config.AWSConfig.FIPSEnabled { - o.EndpointOptions.UseFIPSEndpoint = awssdk.FIPSEndpointStateEnabled - } - }), - queueURL: config.QueueURL, - apiTimeout: config.APITimeout, - visibilityTimeout: config.VisibilityTimeout, - longPollWaitTime: config.SQSWaitTime, - } - - s3API := &awsS3API{ - client: s3.NewFromConfig(awsConfig, func(o *s3.Options) { - if config.AWSConfig.FIPSEnabled { - o.EndpointOptions.UseFIPSEndpoint = awssdk.FIPSEndpointStateEnabled - } - o.UsePathStyle = config.PathStyle - }), - } - +// Simple wrapper to handle creation of internal channels +func newSQSReaderInput(config config, awsConfig awssdk.Config) *sqsReaderInput { return &sqsReaderInput{ config: config, awsConfig: awsConfig, - sqs: sqsAPI, - s3: s3API, - detectedRegion: detectedRegion, workRequestChan: make(chan struct{}, config.MaxNumberOfMessages), workResponseChan: make(chan types.Message), - }, nil + } } func (in *sqsReaderInput) Name() string { return inputName } @@ -94,7 +63,7 @@ func (in *sqsReaderInput) Run( pipeline beat.Pipeline, ) error { // Initialize everything for this run - err := in.setup(inputContext, pipeline) + err := in.setup(inputContext, pipeline, nil) if err != nil { return err } @@ -109,11 +78,48 @@ func (in *sqsReaderInput) Run( // Apply internal initialization based on the parameters of Run, in // preparation for calling run. setup and run are separate functions so -// tests can apply mocks and overrides before the run loop starts. -func (in *sqsReaderInput) setup(inputContext v2.Context, pipeline beat.Pipeline) error { +// tests can apply mocks and overrides. Tests should apply overrides +// after calling setup (or skip setup entirely if everything is already +// initialized). If a test does call setup, it can use optRegistry to +// report metrics under a subregistry instead of using the global one. +func (in *sqsReaderInput) setup( + inputContext v2.Context, + pipeline beat.Pipeline, + optRegistry *monitoring.Registry, +) error { + detectedRegion := getRegionFromQueueURL(in.config.QueueURL, in.config.AWSConfig.Endpoint) + if in.config.RegionName != "" { + in.awsConfig.Region = in.config.RegionName + } else if detectedRegion == "" { + // Only report an error if we don't have a configured region + // to fall back on. 
+ return fmt.Errorf("failed to get AWS region from queue_url: %w", errBadQueueURL) + } + + in.sqs = &awsSQSAPI{ + client: sqs.NewFromConfig(in.awsConfig, func(o *sqs.Options) { + if in.config.AWSConfig.FIPSEnabled { + o.EndpointOptions.UseFIPSEndpoint = awssdk.FIPSEndpointStateEnabled + } + }), + queueURL: in.config.QueueURL, + apiTimeout: in.config.APITimeout, + visibilityTimeout: in.config.VisibilityTimeout, + longPollWaitTime: in.config.SQSWaitTime, + } + + in.s3 = &awsS3API{ + client: s3.NewFromConfig(in.awsConfig, func(o *s3.Options) { + if in.config.AWSConfig.FIPSEnabled { + o.EndpointOptions.UseFIPSEndpoint = awssdk.FIPSEndpointStateEnabled + } + o.UsePathStyle = in.config.PathStyle + }), + } + in.log = inputContext.Logger.With("queue_url", in.config.QueueURL) - in.metrics = newInputMetrics(inputContext.ID, nil, in.config.MaxNumberOfMessages) + in.metrics = newInputMetrics(inputContext.ID, optRegistry, in.config.MaxNumberOfMessages) var err error in.msgHandler, err = in.createEventProcessor(pipeline) @@ -140,7 +146,7 @@ func (in *sqsReaderInput) run(ctx context.Context) { metrics: in.metrics, }.run(ctx) - fmt.Printf("hi fae: sqsReaderInput.run\n") + //fmt.Printf("hi fae: sqsReaderInput.run\n") in.startWorkers(ctx) in.readerLoop(ctx) @@ -152,21 +158,21 @@ func (in *sqsReaderInput) readerLoop(ctx context.Context) { // reader will try to fulfill requestCount := 0 for ctx.Err() == nil { - fmt.Printf("hi fae: readerLoop begin\n") + //fmt.Printf("hi fae: readerLoop begin\n") // Block to wait for more requests if requestCount is zero requestCount += channelRequestCount(ctx, in.workRequestChan, requestCount == 0) - fmt.Printf("hi fae: requestCount %v\n", requestCount) + //fmt.Printf("hi fae: requestCount %v\n", requestCount) msgs := readSQSMessages(ctx, in.log, in.sqs, in.metrics, requestCount) - fmt.Printf("hi fae: got %v messages\n", len(msgs)) + //fmt.Printf("hi fae: got %v messages\n", len(msgs)) for _, msg := range msgs { - fmt.Printf("hi fae, sending one message\n") + //fmt.Printf("hi fae, sending one message\n") select { case <-ctx.Done(): return case in.workResponseChan <- msg: - fmt.Printf("hi fae, message sent\n") + //fmt.Printf("hi fae, message sent\n") requestCount-- } } @@ -181,14 +187,14 @@ func (in *sqsReaderInput) workerLoop(ctx context.Context) { // Shutting down return case in.workRequestChan <- struct{}{}: - fmt.Printf("hi fae: sent work request\n") + //fmt.Printf("hi fae: sent work request\n") } // The request is sent, wait for a response select { case <-ctx.Done(): return case msg := <-in.workResponseChan: - fmt.Printf("hi fae: received work response\n") + //fmt.Printf("hi fae: received work response\n") start := time.Now() id := in.metrics.beginSQSWorker() @@ -232,10 +238,7 @@ func (in *sqsReaderInput) logConfigSummary() { } func (in *sqsReaderInput) createEventProcessor(pipeline beat.Pipeline) (sqsProcessor, error) { - fileSelectors := in.config.FileSelectors - if len(in.config.FileSelectors) == 0 { - fileSelectors = []fileSelectorConfig{{ReaderConfig: in.config.ReaderConfig}} - } + fileSelectors := in.config.getFileSelectors() s3EventHandlerFactory := newS3ObjectProcessorFactory(in.log.Named("s3"), in.metrics, in.s3, fileSelectors, in.config.BackupConfig) script, err := newScriptFromConfig(in.log.Named("sqs_script"), in.config.SQSScript) From f1b7761aeb6b77becc3fe2acfa3a3c9b6ad47f55 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Thu, 2 May 2024 21:16:29 -0400 Subject: [PATCH 57/99] updating unit tests --- .../input/awss3/input_benchmark_test.go | 1 + 
x-pack/filebeat/input/awss3/s3_test.go | 42 ++++++------ x-pack/filebeat/input/awss3/sqs_input.go | 8 --- x-pack/filebeat/input/awss3/sqs_test.go | 64 ++++++++++--------- 4 files changed, 58 insertions(+), 57 deletions(-) diff --git a/x-pack/filebeat/input/awss3/input_benchmark_test.go b/x-pack/filebeat/input/awss3/input_benchmark_test.go index 879e98597ee1..d9abd7983ccb 100644 --- a/x-pack/filebeat/input/awss3/input_benchmark_test.go +++ b/x-pack/filebeat/input/awss3/input_benchmark_test.go @@ -313,6 +313,7 @@ func benchmarkInputS3(t *testing.T, numberOfWorkers int) testing.BenchmarkResult }() config := makeBenchmarkConfig(t) + config.NumberOfWorkers = numberOfWorkers b.ResetTimer() start := time.Now() diff --git a/x-pack/filebeat/input/awss3/s3_test.go b/x-pack/filebeat/input/awss3/s3_test.go index ba5fa4b32dcd..ac4285c19912 100644 --- a/x-pack/filebeat/input/awss3/s3_test.go +++ b/x-pack/filebeat/input/awss3/s3_test.go @@ -143,6 +143,7 @@ func TestS3Poller(t *testing.T) { s3ObjectHandler: s3ObjProc, states: states, provider: "provider", + metrics: newInputMetrics("", nil, 0), } poller.runPoll(ctx) }) @@ -158,36 +159,36 @@ func TestS3Poller(t *testing.T) { ctrl, ctx := gomock.WithContext(ctx, t) defer ctrl.Finish() - mockAPI := NewMockS3API(ctrl) - mockPagerFirst := NewMockS3Pager(ctrl) - mockPagerSecond := NewMockS3Pager(ctrl) + mockS3 := NewMockS3API(ctrl) + mockErrorPager := NewMockS3Pager(ctrl) + mockSuccessPager := NewMockS3Pager(ctrl) mockPublisher := NewMockBeatClient(ctrl) gomock.InOrder( // Initial ListObjectPaginator gets an error. - mockAPI.EXPECT(). + mockS3.EXPECT(). ListObjectsPaginator(gomock.Eq(bucket), gomock.Eq("key")). Times(1). DoAndReturn(func(_, _ string) s3Pager { - return mockPagerFirst + return mockErrorPager }), // After waiting for pollInterval, it retries. - mockAPI.EXPECT(). + mockS3.EXPECT(). ListObjectsPaginator(gomock.Eq(bucket), gomock.Eq("key")). Times(1). DoAndReturn(func(_, _ string) s3Pager { - return mockPagerSecond + return mockSuccessPager }), ) // Initial Next gets an error. - mockPagerFirst.EXPECT(). + mockErrorPager.EXPECT(). HasMorePages(). Times(2). DoAndReturn(func() bool { return true }) - mockPagerFirst.EXPECT(). + mockErrorPager.EXPECT(). NextPage(gomock.Any()). Times(2). DoAndReturn(func(_ context.Context, optFns ...func(*s3.Options)) (*s3.ListObjectsV2Output, error) { @@ -195,13 +196,13 @@ func TestS3Poller(t *testing.T) { }) // After waiting for pollInterval, it retries. - mockPagerSecond.EXPECT(). + mockSuccessPager.EXPECT(). HasMorePages(). Times(1). DoAndReturn(func() bool { return true }) - mockPagerSecond.EXPECT(). + mockSuccessPager.EXPECT(). NextPage(gomock.Any()). Times(1). DoAndReturn(func(_ context.Context, optFns ...func(*s3.Options)) (*s3.ListObjectsV2Output, error) { @@ -236,34 +237,34 @@ func TestS3Poller(t *testing.T) { }, nil }) - mockPagerSecond.EXPECT(). + mockSuccessPager.EXPECT(). HasMorePages(). Times(1). DoAndReturn(func() bool { return false }) - mockAPI.EXPECT(). + mockS3.EXPECT(). GetObject(gomock.Any(), gomock.Eq(bucket), gomock.Eq("key1")). Return(nil, errFakeConnectivityFailure) - mockAPI.EXPECT(). + mockS3.EXPECT(). GetObject(gomock.Any(), gomock.Eq(bucket), gomock.Eq("key2")). Return(nil, errFakeConnectivityFailure) - mockAPI.EXPECT(). + mockS3.EXPECT(). GetObject(gomock.Any(), gomock.Eq(bucket), gomock.Eq("key3")). Return(nil, errFakeConnectivityFailure) - mockAPI.EXPECT(). + mockS3.EXPECT(). GetObject(gomock.Any(), gomock.Eq(bucket), gomock.Eq("key4")). 
Return(nil, errFakeConnectivityFailure) - mockAPI.EXPECT(). + mockS3.EXPECT(). GetObject(gomock.Any(), gomock.Eq(bucket), gomock.Eq("key5")). Return(nil, errFakeConnectivityFailure) - s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockAPI, nil, backupConfig{}) + s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockS3, nil, backupConfig{}) states, err := newStates(nil, store) require.NoError(t, err, "states creation must succeed") poller := &s3PollerInput{ @@ -275,12 +276,13 @@ func TestS3Poller(t *testing.T) { BucketListPrefix: "key", RegionName: "region", }, - s3: mockAPI, + s3: mockS3, client: mockPublisher, s3ObjectHandler: s3ObjProc, states: states, provider: "provider", + metrics: newInputMetrics("", nil, 0), } - poller.runPoll(ctx) + poller.run(ctx) }) } diff --git a/x-pack/filebeat/input/awss3/sqs_input.go b/x-pack/filebeat/input/awss3/sqs_input.go index 7ca5a4240d7f..7f91e3b1259f 100644 --- a/x-pack/filebeat/input/awss3/sqs_input.go +++ b/x-pack/filebeat/input/awss3/sqs_input.go @@ -146,7 +146,6 @@ func (in *sqsReaderInput) run(ctx context.Context) { metrics: in.metrics, }.run(ctx) - //fmt.Printf("hi fae: sqsReaderInput.run\n") in.startWorkers(ctx) in.readerLoop(ctx) @@ -158,21 +157,16 @@ func (in *sqsReaderInput) readerLoop(ctx context.Context) { // reader will try to fulfill requestCount := 0 for ctx.Err() == nil { - //fmt.Printf("hi fae: readerLoop begin\n") // Block to wait for more requests if requestCount is zero requestCount += channelRequestCount(ctx, in.workRequestChan, requestCount == 0) - //fmt.Printf("hi fae: requestCount %v\n", requestCount) msgs := readSQSMessages(ctx, in.log, in.sqs, in.metrics, requestCount) - //fmt.Printf("hi fae: got %v messages\n", len(msgs)) for _, msg := range msgs { - //fmt.Printf("hi fae, sending one message\n") select { case <-ctx.Done(): return case in.workResponseChan <- msg: - //fmt.Printf("hi fae, message sent\n") requestCount-- } } @@ -187,14 +181,12 @@ func (in *sqsReaderInput) workerLoop(ctx context.Context) { // Shutting down return case in.workRequestChan <- struct{}{}: - //fmt.Printf("hi fae: sent work request\n") } // The request is sent, wait for a response select { case <-ctx.Done(): return case msg := <-in.workResponseChan: - //fmt.Printf("hi fae: received work response\n") start := time.Now() id := in.metrics.beginSQSWorker() diff --git a/x-pack/filebeat/input/awss3/sqs_test.go b/x-pack/filebeat/input/awss3/sqs_test.go index c6c7be7a72af..b1bfe20d906e 100644 --- a/x-pack/filebeat/input/awss3/sqs_test.go +++ b/x-pack/filebeat/input/awss3/sqs_test.go @@ -12,6 +12,7 @@ import ( "testing" "time" + "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/service/sqs/types" "github.com/gofrs/uuid" "github.com/golang/mock/gomock" @@ -38,31 +39,43 @@ func TestSQSReceiver(t *testing.T) { ctrl, ctx := gomock.WithContext(ctx, t) defer ctrl.Finish() - mockAPI := NewMockSQSAPI(ctrl) + mockSQS := NewMockSQSAPI(ctrl) mockMsgHandler := NewMockSQSProcessor(ctrl) msg := newSQSMessage(newS3Event("log.json")) - gomock.InOrder( - // Initial ReceiveMessage for maxMessages. - mockAPI.EXPECT(). - ReceiveMessage(gomock.Any(), gomock.Eq(maxMessages)). - Times(1). - DoAndReturn(func(_ context.Context, _ int) ([]types.Message, error) { - // Return single message. - return []types.Message{msg}, nil - }), + // Execute sqsReader and verify calls/state. 
+ receiver := newSQSReaderInput(config{MaxNumberOfMessages: maxMessages}, aws.Config{}) + receiver.log = logp.NewLogger(inputName) + receiver.sqs = mockSQS + receiver.msgHandler = mockMsgHandler + receiver.metrics = newInputMetrics("", nil, 0) + receiver.run(ctx) - // Follow up ReceiveMessages for either maxMessages-1 or maxMessages - // depending on how long processing of previous message takes. - mockAPI.EXPECT(). - ReceiveMessage(gomock.Any(), gomock.Any()). - Times(1). - DoAndReturn(func(_ context.Context, _ int) ([]types.Message, error) { - // Stop the test. - cancel() - return nil, nil - }), - ) + // Initial ReceiveMessage for maxMessages. + mockSQS.EXPECT(). + ReceiveMessage(gomock.Any(), gomock.Eq(maxMessages)). + Times(1). + DoAndReturn(func(_ context.Context, _ int) ([]types.Message, error) { + // Return single message. + return []types.Message{msg}, nil + }) + + // Follow up ReceiveMessages for either maxMessages-1 or maxMessages + // depending on how long processing of previous message takes. + mockSQS.EXPECT(). + ReceiveMessage(gomock.Any(), gomock.Any()). + Times(1). + DoAndReturn(func(_ context.Context, _ int) ([]types.Message, error) { + // Stop the test. + cancel() + return nil, nil + }) + + mockSQS.EXPECT(). + GetQueueAttributes(gomock.Any(), gomock.Eq([]types.QueueAttributeName{sqsApproximateNumberOfMessages})). + DoAndReturn(func(_ context.Context, _ []types.QueueAttributeName) (map[string]string, error) { + return map[string]string{sqsApproximateNumberOfMessages: "10000"}, nil + }).AnyTimes() // Expect the one message returned to have been processed. mockMsgHandler.EXPECT(). @@ -70,14 +83,6 @@ func TestSQSReceiver(t *testing.T) { Times(1). Return(nil) - // Execute sqsReader and verify calls/state. - receiver := &sqsReaderInput{ - log: logp.NewLogger(inputName), - config: config{MaxNumberOfMessages: maxMessages}, - sqs: mockAPI, - msgHandler: mockMsgHandler, - } - receiver.run(ctx) }) t.Run("retry after ReceiveMessage error", func(t *testing.T) { @@ -113,6 +118,7 @@ func TestSQSReceiver(t *testing.T) { config: config{MaxNumberOfMessages: maxMessages}, sqs: mockAPI, msgHandler: mockMsgHandler, + metrics: newInputMetrics("", nil, 0), } receiver.run(ctx) }) From fa222398621679ab065cfc50862d2c6cf5897be5 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Thu, 2 May 2024 22:52:55 -0400 Subject: [PATCH 58/99] fix remaining tests --- x-pack/filebeat/input/awss3/sqs_test.go | 48 +++++++++++++------------ 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/x-pack/filebeat/input/awss3/sqs_test.go b/x-pack/filebeat/input/awss3/sqs_test.go index b1bfe20d906e..368845f9ac6f 100644 --- a/x-pack/filebeat/input/awss3/sqs_test.go +++ b/x-pack/filebeat/input/awss3/sqs_test.go @@ -43,17 +43,9 @@ func TestSQSReceiver(t *testing.T) { mockMsgHandler := NewMockSQSProcessor(ctrl) msg := newSQSMessage(newS3Event("log.json")) - // Execute sqsReader and verify calls/state. - receiver := newSQSReaderInput(config{MaxNumberOfMessages: maxMessages}, aws.Config{}) - receiver.log = logp.NewLogger(inputName) - receiver.sqs = mockSQS - receiver.msgHandler = mockMsgHandler - receiver.metrics = newInputMetrics("", nil, 0) - receiver.run(ctx) - // Initial ReceiveMessage for maxMessages. mockSQS.EXPECT(). - ReceiveMessage(gomock.Any(), gomock.Eq(maxMessages)). + ReceiveMessage(gomock.Any(), gomock.Any()). Times(1). DoAndReturn(func(_ context.Context, _ int) ([]types.Message, error) { // Return single message. @@ -83,6 +75,13 @@ func TestSQSReceiver(t *testing.T) { Times(1). 
Return(nil) + // Execute sqsReader and verify calls/state. + sqsReader := newSQSReaderInput(config{MaxNumberOfMessages: maxMessages}, aws.Config{}) + sqsReader.log = logp.NewLogger(inputName) + sqsReader.sqs = mockSQS + sqsReader.msgHandler = mockMsgHandler + sqsReader.metrics = newInputMetrics("", nil, 0) + sqsReader.run(ctx) }) t.Run("retry after ReceiveMessage error", func(t *testing.T) { @@ -91,36 +90,39 @@ func TestSQSReceiver(t *testing.T) { ctrl, ctx := gomock.WithContext(ctx, t) defer ctrl.Finish() - mockAPI := NewMockSQSAPI(ctrl) + mockSQS := NewMockSQSAPI(ctrl) mockMsgHandler := NewMockSQSProcessor(ctrl) gomock.InOrder( // Initial ReceiveMessage gets an error. - mockAPI.EXPECT(). - ReceiveMessage(gomock.Any(), gomock.Eq(maxMessages)). + mockSQS.EXPECT(). + ReceiveMessage(gomock.Any(), gomock.Any()). Times(1). DoAndReturn(func(_ context.Context, _ int) ([]types.Message, error) { return nil, errFakeConnectivityFailure }), // After waiting for sqsRetryDelay, it retries. - mockAPI.EXPECT(). - ReceiveMessage(gomock.Any(), gomock.Eq(maxMessages)). + mockSQS.EXPECT(). + ReceiveMessage(gomock.Any(), gomock.Any()). Times(1). DoAndReturn(func(_ context.Context, _ int) ([]types.Message, error) { cancel() return nil, nil }), ) + mockSQS.EXPECT(). + GetQueueAttributes(gomock.Any(), gomock.Eq([]types.QueueAttributeName{sqsApproximateNumberOfMessages})). + DoAndReturn(func(_ context.Context, _ []types.QueueAttributeName) (map[string]string, error) { + return map[string]string{sqsApproximateNumberOfMessages: "10000"}, nil + }).AnyTimes() - // Execute SQSReceiver and verify calls/state. - receiver := &sqsReaderInput{ - log: logp.NewLogger(inputName), - config: config{MaxNumberOfMessages: maxMessages}, - sqs: mockAPI, - msgHandler: mockMsgHandler, - metrics: newInputMetrics("", nil, 0), - } - receiver.run(ctx) + // Execute SQSReader and verify calls/state. + sqsReader := newSQSReaderInput(config{MaxNumberOfMessages: maxMessages}, aws.Config{}) + sqsReader.log = logp.NewLogger(inputName) + sqsReader.sqs = mockSQS + sqsReader.msgHandler = mockMsgHandler + sqsReader.metrics = newInputMetrics("", nil, 0) + sqsReader.run(ctx) }) } From e2536810872718723d07650a7acfdeff34b6984a Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Thu, 2 May 2024 22:55:31 -0400 Subject: [PATCH 59/99] remove unused debug parameter --- x-pack/filebeat/input/awss3/sqs_input.go | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/x-pack/filebeat/input/awss3/sqs_input.go b/x-pack/filebeat/input/awss3/sqs_input.go index 7f91e3b1259f..7794c26a57b0 100644 --- a/x-pack/filebeat/input/awss3/sqs_input.go +++ b/x-pack/filebeat/input/awss3/sqs_input.go @@ -18,7 +18,6 @@ import ( v2 "github.com/elastic/beats/v7/filebeat/input/v2" "github.com/elastic/beats/v7/libbeat/beat" "github.com/elastic/elastic-agent-libs/logp" - "github.com/elastic/elastic-agent-libs/monitoring" ) type sqsReaderInput struct { @@ -63,7 +62,7 @@ func (in *sqsReaderInput) Run( pipeline beat.Pipeline, ) error { // Initialize everything for this run - err := in.setup(inputContext, pipeline, nil) + err := in.setup(inputContext, pipeline) if err != nil { return err } @@ -78,14 +77,10 @@ func (in *sqsReaderInput) Run( // Apply internal initialization based on the parameters of Run, in // preparation for calling run. setup and run are separate functions so -// tests can apply mocks and overrides. Tests should apply overrides -// after calling setup (or skip setup entirely if everything is already -// initialized). 
If a test does call setup, it can use optRegistry to -// report metrics under a subregistry instead of using the global one. +// tests can apply mocks and overrides before the run loop. func (in *sqsReaderInput) setup( inputContext v2.Context, pipeline beat.Pipeline, - optRegistry *monitoring.Registry, ) error { detectedRegion := getRegionFromQueueURL(in.config.QueueURL, in.config.AWSConfig.Endpoint) if in.config.RegionName != "" { @@ -119,7 +114,7 @@ func (in *sqsReaderInput) setup( in.log = inputContext.Logger.With("queue_url", in.config.QueueURL) - in.metrics = newInputMetrics(inputContext.ID, optRegistry, in.config.MaxNumberOfMessages) + in.metrics = newInputMetrics(inputContext.ID, nil, in.config.MaxNumberOfMessages) var err error in.msgHandler, err = in.createEventProcessor(pipeline) From 1fef19905e7a7836f6dcb86d85f65171c97794be Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Fri, 3 May 2024 09:27:15 -0400 Subject: [PATCH 60/99] remove commented code --- x-pack/filebeat/input/awss3/input_benchmark_test.go | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/x-pack/filebeat/input/awss3/input_benchmark_test.go b/x-pack/filebeat/input/awss3/input_benchmark_test.go index d9abd7983ccb..09b7c8bd9d26 100644 --- a/x-pack/filebeat/input/awss3/input_benchmark_test.go +++ b/x-pack/filebeat/input/awss3/input_benchmark_test.go @@ -208,10 +208,9 @@ file_selectors: func benchmarkInputSQS(t *testing.T, maxMessagesInflight int) testing.BenchmarkResult { return testing.Benchmark(func(b *testing.B) { + var err error pipeline := &fakePipeline{} - //s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), metrics, s3API, conf.FileSelectors, backupConfig{}) - //sqsMessageHandler := newSQSS3EventProcessor(log.Named("sqs_s3_event"), metrics, sqsAPI, nil, time.Minute, 5, pipeline, s3EventHandlerFactory) conf := makeBenchmarkConfig(t) conf.MaxNumberOfMessages = maxMessagesInflight sqsReader := newSQSReaderInput(conf, aws.Config{}) @@ -219,9 +218,6 @@ func benchmarkInputSQS(t *testing.T, maxMessagesInflight int) testing.BenchmarkR sqsReader.metrics = newInputMetrics("test_id", monitoring.NewRegistry(), maxMessagesInflight) sqsReader.sqs = newConstantSQS() sqsReader.s3 = newConstantS3(t) - //sqsReader.msgHandler = sqsMessageHandler - - var err error sqsReader.msgHandler, err = sqsReader.createEventProcessor(pipeline) require.NoError(t, err, "createEventProcessor must succeed") From 800fc7338e896f49a1b073af27c0d8f4bf2902de Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Fri, 3 May 2024 09:42:22 -0400 Subject: [PATCH 61/99] move helper function --- x-pack/filebeat/input/awss3/s3.go | 13 +++++++++++++ x-pack/filebeat/input/awss3/s3_input.go | 13 +------------ 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/x-pack/filebeat/input/awss3/s3.go b/x-pack/filebeat/input/awss3/s3.go index b2c2d50fa2af..6f967e9c2656 100644 --- a/x-pack/filebeat/input/awss3/s3.go +++ b/x-pack/filebeat/input/awss3/s3.go @@ -12,6 +12,8 @@ import ( awssdk "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/service/s3" + "github.com/elastic/beats/v7/libbeat/beat" + awscommon "github.com/elastic/beats/v7/x-pack/libbeat/common/aws" ) func createS3API(ctx context.Context, config config, awsConfig awssdk.Config) (*awsS3API, error) { @@ -31,6 +33,17 @@ func createS3API(ctx context.Context, config config, awsConfig awssdk.Config) (* }, nil } +func createPipelineClient(pipeline beat.Pipeline) (beat.Client, error) { + return 
pipeline.ConnectWith(beat.ClientConfig{ + EventListener: awscommon.NewEventACKHandler(), + Processing: beat.ProcessingConfig{ + // This input only produces events with basic types so normalization + // is not required. + EventNormalization: boolPtr(false), + }, + }) +} + func getRegionForBucket(ctx context.Context, s3Client *s3.Client, bucketName string) (string, error) { getBucketLocationOutput, err := s3Client.GetBucketLocation(ctx, &s3.GetBucketLocationInput{ Bucket: awssdk.String(bucketName), diff --git a/x-pack/filebeat/input/awss3/s3_input.go b/x-pack/filebeat/input/awss3/s3_input.go index ad27a40f561b..9360ef04d019 100644 --- a/x-pack/filebeat/input/awss3/s3_input.go +++ b/x-pack/filebeat/input/awss3/s3_input.go @@ -64,17 +64,6 @@ func newS3PollerInput( }, nil } -func createClient(pipeline beat.Pipeline) (beat.Client, error) { - return pipeline.ConnectWith(beat.ClientConfig{ - EventListener: awscommon.NewEventACKHandler(), - Processing: beat.ProcessingConfig{ - // This input only produces events with basic types so normalization - // is not required. - EventNormalization: boolPtr(false), - }, - }) -} - func (in *s3PollerInput) Run( inputContext v2.Context, pipeline beat.Pipeline, @@ -90,7 +79,7 @@ func (in *s3PollerInput) Run( defer in.states.Close() // Create client for publishing events and receive notification of their ACKs. - in.client, err = createClient(pipeline) + in.client, err = createPipelineClient(pipeline) if err != nil { return fmt.Errorf("failed to create pipeline client: %w", err) } From 1694f0d68756cc266dab7b72412b0bb00a99377b Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Fri, 3 May 2024 09:49:04 -0400 Subject: [PATCH 62/99] clean up aws client config modifiers --- x-pack/filebeat/input/awss3/config.go | 7 +++++++ x-pack/filebeat/input/awss3/sqs_input.go | 14 +++----------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/x-pack/filebeat/input/awss3/config.go b/x-pack/filebeat/input/awss3/config.go index 87b6efa13b40..a5e8b094ae5a 100644 --- a/x-pack/filebeat/input/awss3/config.go +++ b/x-pack/filebeat/input/awss3/config.go @@ -12,6 +12,7 @@ import ( awssdk "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/aws/retry" "github.com/aws/aws-sdk-go-v2/service/s3" + "github.com/aws/aws-sdk-go-v2/service/sqs" "github.com/dustin/go-humanize" "github.com/elastic/beats/v7/libbeat/common/cfgtype" @@ -265,6 +266,12 @@ func (c config) s3ConfigModifier(o *s3.Options) { }) } +func (c config) sqsConfigModifier(o *sqs.Options) { + if c.AWSConfig.FIPSEnabled { + o.EndpointOptions.UseFIPSEndpoint = awssdk.FIPSEndpointStateEnabled + } +} + func (c config) getFileSelectors() []fileSelectorConfig { if len(c.FileSelectors) > 0 { return c.FileSelectors diff --git a/x-pack/filebeat/input/awss3/sqs_input.go b/x-pack/filebeat/input/awss3/sqs_input.go index b433fc912051..ab04b02158a1 100644 --- a/x-pack/filebeat/input/awss3/sqs_input.go +++ b/x-pack/filebeat/input/awss3/sqs_input.go @@ -93,11 +93,8 @@ func (in *sqsReaderInput) setup( } in.sqs = &awsSQSAPI{ - client: sqs.NewFromConfig(in.awsConfig, func(o *sqs.Options) { - if in.config.AWSConfig.FIPSEnabled { - o.EndpointOptions.UseFIPSEndpoint = awssdk.FIPSEndpointStateEnabled - } - }), + client: sqs.NewFromConfig(in.awsConfig, in.config.sqsConfigModifier), + queueURL: in.config.QueueURL, apiTimeout: in.config.APITimeout, visibilityTimeout: in.config.VisibilityTimeout, @@ -105,12 +102,7 @@ func (in *sqsReaderInput) setup( } in.s3 = &awsS3API{ - client: 
s3.NewFromConfig(in.awsConfig, func(o *s3.Options) { - if in.config.AWSConfig.FIPSEnabled { - o.EndpointOptions.UseFIPSEndpoint = awssdk.FIPSEndpointStateEnabled - } - o.UsePathStyle = in.config.PathStyle - }), + client: s3.NewFromConfig(in.awsConfig, in.config.s3ConfigModifier), } in.log = inputContext.Logger.With("queue_url", in.config.QueueURL) From 63be5239903f1f914426dbde4738c9c50a466b09 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Fri, 3 May 2024 09:51:52 -0400 Subject: [PATCH 63/99] reorder helper functions --- x-pack/filebeat/input/awss3/s3_input.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/x-pack/filebeat/input/awss3/s3_input.go b/x-pack/filebeat/input/awss3/s3_input.go index 9360ef04d019..8ae1c0ac8c86 100644 --- a/x-pack/filebeat/input/awss3/s3_input.go +++ b/x-pack/filebeat/input/awss3/s3_input.go @@ -45,12 +45,6 @@ type s3FetchTask struct { objectState state } -func (in *s3PollerInput) Name() string { return inputName } - -func (in *s3PollerInput) Test(ctx v2.TestContext) error { - return nil -} - func newS3PollerInput( config config, awsConfig awssdk.Config, @@ -64,6 +58,12 @@ func newS3PollerInput( }, nil } +func (in *s3PollerInput) Name() string { return inputName } + +func (in *s3PollerInput) Test(ctx v2.TestContext) error { + return nil +} + func (in *s3PollerInput) Run( inputContext v2.Context, pipeline beat.Pipeline, From 1bae7579d142c0858870ffc8871726c91c3d3e8c Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Fri, 3 May 2024 09:57:30 -0400 Subject: [PATCH 64/99] reorder helper functions --- x-pack/filebeat/input/awss3/s3_input.go | 26 ++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/x-pack/filebeat/input/awss3/s3_input.go b/x-pack/filebeat/input/awss3/s3_input.go index 8ae1c0ac8c86..f8b67ab66dae 100644 --- a/x-pack/filebeat/input/awss3/s3_input.go +++ b/x-pack/filebeat/input/awss3/s3_input.go @@ -133,19 +133,6 @@ func (in *s3PollerInput) runPoll(ctx context.Context) { workerWg.Wait() } -func (in *s3PollerInput) createS3ObjectProcessor(ctx context.Context, state state) s3ObjectHandler { - event := s3EventV2{} - event.AWSRegion = in.awsConfig.Region - event.Provider = in.provider - event.S3.Bucket.Name = state.Bucket - event.S3.Bucket.ARN = in.config.getBucketARN() - event.S3.Object.Key = state.Key - - acker := awscommon.NewEventACKTracker(ctx) - - return in.s3ObjectHandler.Create(ctx, in.log, in.client, acker, event) -} - func (in *s3PollerInput) workerLoop(ctx context.Context, workChan <-chan *s3FetchTask) { rateLimitWaiter := backoff.NewEqualJitterBackoff(ctx.Done(), 1, 120) @@ -244,3 +231,16 @@ func (in *s3PollerInput) readerLoop(ctx context.Context, workChan chan<- *s3Fetc } } } + +func (in *s3PollerInput) createS3ObjectProcessor(ctx context.Context, state state) s3ObjectHandler { + event := s3EventV2{} + event.AWSRegion = in.awsConfig.Region + event.Provider = in.provider + event.S3.Bucket.Name = state.Bucket + event.S3.Bucket.ARN = in.config.getBucketARN() + event.S3.Object.Key = state.Key + + acker := awscommon.NewEventACKTracker(ctx) + + return in.s3ObjectHandler.Create(ctx, in.log, in.client, acker, event) +} From 939c38fc3e67a6a3dfb9b45066d8850b3c0f9c73 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Fri, 3 May 2024 10:05:56 -0400 Subject: [PATCH 65/99] update comments --- x-pack/filebeat/input/awss3/sqs.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/x-pack/filebeat/input/awss3/sqs.go b/x-pack/filebeat/input/awss3/sqs.go index 297d613728d0..36985f73720d 
100644 --- a/x-pack/filebeat/input/awss3/sqs.go +++ b/x-pack/filebeat/input/awss3/sqs.go @@ -108,7 +108,8 @@ func (mcm messageCountMonitor) run(ctx context.Context) { } } -// updateMessageCount runs GetApproximateMessageCount for the given context and updates the receiver metric with the count returning false on no error +// updateMessageCount runs GetApproximateMessageCount and updates the +// sqsMessagesWaiting metric with the result. // If there is an error, the metric is reinitialized to -1 and true is returned func (mcm messageCountMonitor) updateMessageCount(ctx context.Context) error { count, err := mcm.getApproximateMessageCount(ctx) @@ -118,6 +119,7 @@ func (mcm messageCountMonitor) updateMessageCount(ctx context.Context) error { return err } +// Query the approximate message count for the queue via the SQS API. func (mcm messageCountMonitor) getApproximateMessageCount(ctx context.Context) (int, error) { attributes, err := mcm.sqs.GetQueueAttributes(ctx, []types.QueueAttributeName{sqsApproximateNumberOfMessages}) if err == nil { From b032106129e2c40e74df4d882b3703db6e4504c6 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Fri, 3 May 2024 10:10:16 -0400 Subject: [PATCH 66/99] move log creation earlier --- x-pack/filebeat/input/awss3/sqs_input.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/x-pack/filebeat/input/awss3/sqs_input.go b/x-pack/filebeat/input/awss3/sqs_input.go index ab04b02158a1..588d5400cc91 100644 --- a/x-pack/filebeat/input/awss3/sqs_input.go +++ b/x-pack/filebeat/input/awss3/sqs_input.go @@ -82,6 +82,8 @@ func (in *sqsReaderInput) setup( inputContext v2.Context, pipeline beat.Pipeline, ) error { + in.log = inputContext.Logger.With("queue_url", in.config.QueueURL) + in.detectedRegion = getRegionFromQueueURL(in.config.QueueURL, in.config.AWSConfig.Endpoint) if in.config.RegionName != "" { in.awsConfig.Region = in.config.RegionName @@ -105,8 +107,6 @@ func (in *sqsReaderInput) setup( client: s3.NewFromConfig(in.awsConfig, in.config.s3ConfigModifier), } - in.log = inputContext.Logger.With("queue_url", in.config.QueueURL) - in.metrics = newInputMetrics(inputContext.ID, nil, in.config.MaxNumberOfMessages) var err error From 09959216b52c37b7b567b8ef12269aac799a58fc Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Fri, 3 May 2024 10:15:46 -0400 Subject: [PATCH 67/99] update comments --- x-pack/filebeat/input/awss3/s3_input.go | 2 +- x-pack/filebeat/input/awss3/sqs_input.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/x-pack/filebeat/input/awss3/s3_input.go b/x-pack/filebeat/input/awss3/s3_input.go index f8b67ab66dae..50786626d279 100644 --- a/x-pack/filebeat/input/awss3/s3_input.go +++ b/x-pack/filebeat/input/awss3/s3_input.go @@ -39,7 +39,7 @@ type s3PollerInput struct { states *states } -// s3FetchTask contains metadata for one S3 object that a worker should fech. +// s3FetchTask contains metadata for one S3 object that a worker should fetch. type s3FetchTask struct { s3ObjectHandler s3ObjectHandler objectState state diff --git a/x-pack/filebeat/input/awss3/sqs_input.go b/x-pack/filebeat/input/awss3/sqs_input.go index 588d5400cc91..e524cf9fd1c7 100644 --- a/x-pack/filebeat/input/awss3/sqs_input.go +++ b/x-pack/filebeat/input/awss3/sqs_input.go @@ -192,8 +192,8 @@ func (in *sqsReaderInput) workerLoop(ctx context.Context) { } func (in *sqsReaderInput) startWorkers(ctx context.Context) { - // Start the worker goroutines that will process messages from workChan - // until the input shuts down. 
+ // Start the worker goroutines that will fetch messages via workRequestChan + // and workResponseChan until the input shuts down. for i := 0; i < in.config.MaxNumberOfMessages; i++ { in.workerWg.Add(1) go func() { From 5d9f7313d1eb28bfeb90645dd6e99c2a87dbf512 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Fri, 3 May 2024 10:45:35 -0400 Subject: [PATCH 68/99] make check --- x-pack/filebeat/input/awss3/s3.go | 1 + 1 file changed, 1 insertion(+) diff --git a/x-pack/filebeat/input/awss3/s3.go b/x-pack/filebeat/input/awss3/s3.go index 6f967e9c2656..eb8e19c2cf92 100644 --- a/x-pack/filebeat/input/awss3/s3.go +++ b/x-pack/filebeat/input/awss3/s3.go @@ -12,6 +12,7 @@ import ( awssdk "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/service/s3" + "github.com/elastic/beats/v7/libbeat/beat" awscommon "github.com/elastic/beats/v7/x-pack/libbeat/common/aws" ) From a8323a41d52fe69b43d5df2085152ed87537f7bd Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Fri, 3 May 2024 16:41:04 -0400 Subject: [PATCH 69/99] Working on queue byte limits --- libbeat/common/fifo/fifo.go | 64 +++++++++++++++++++ libbeat/publisher/pipeline/client_test.go | 2 +- libbeat/publisher/pipeline/pipeline_test.go | 2 +- libbeat/publisher/pipeline/queue_reader.go | 2 +- .../queue/diskqueue/benchmark_test.go | 2 +- libbeat/publisher/queue/diskqueue/consumer.go | 2 +- libbeat/publisher/queue/memqueue/broker.go | 9 +-- libbeat/publisher/queue/memqueue/config.go | 19 +++--- .../publisher/queue/memqueue/internal_api.go | 15 ++++- .../publisher/queue/memqueue/queue_test.go | 12 ++-- libbeat/publisher/queue/memqueue/runloop.go | 36 ++++++++--- .../publisher/queue/memqueue/runloop_test.go | 4 +- libbeat/publisher/queue/queue.go | 7 +- .../queue/queuetest/producer_cancel.go | 2 +- .../publisher/queue/queuetest/queuetest.go | 2 +- 15 files changed, 139 insertions(+), 41 deletions(-) create mode 100644 libbeat/common/fifo/fifo.go diff --git a/libbeat/common/fifo/fifo.go b/libbeat/common/fifo/fifo.go new file mode 100644 index 000000000000..79251ee201d5 --- /dev/null +++ b/libbeat/common/fifo/fifo.go @@ -0,0 +1,64 @@ +// Licensed to Elasticsearch B.V. under one or more contributor +// license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Elasticsearch B.V. licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package fifo + +import "errors" + +var errFIFOEmpty = errors.New("tried to read from an empty FIFO queue") + +type FIFO[T any] struct { + first *node[T] + last *node[T] +} + +type node[T any] struct { + next *node[T] + value T +} + +func (f *FIFO[T]) Add(value T) { + newNode := &node[T]{value: value} + if f.first == nil { + f.first = newNode + } else { + f.last.next = newNode + } + f.last = newNode +} + +func (f *FIFO[T]) Empty() bool { + return f.first == nil +} + +// Return the first value (if present) without removing it from the queue +func (f *FIFO[T]) First() (T, error) { + if f.first == nil { + var none T + return none, errFIFOEmpty + } + return f.first.value, nil +} + +// Remove the first entry in the queue, returning its value +func (f *FIFO[T]) Get() (T, error) { + result, err := f.First() + if f.first != nil { + f.first = f.first.next + } + return result, err +} diff --git a/libbeat/publisher/pipeline/client_test.go b/libbeat/publisher/pipeline/client_test.go index 25080c90615e..b9a6f9c6f3b7 100644 --- a/libbeat/publisher/pipeline/client_test.go +++ b/libbeat/publisher/pipeline/client_test.go @@ -146,7 +146,7 @@ func TestClient(t *testing.T) { done := make(chan struct{}) go func() { for { - batch, err := q.Get(2) + batch, err := q.Get(2, 0) if errors.Is(err, io.EOF) { break } diff --git a/libbeat/publisher/pipeline/pipeline_test.go b/libbeat/publisher/pipeline/pipeline_test.go index feb01c4fa6e0..50ed27454b62 100644 --- a/libbeat/publisher/pipeline/pipeline_test.go +++ b/libbeat/publisher/pipeline/pipeline_test.go @@ -157,7 +157,7 @@ func (q *testQueue) Producer(cfg queue.ProducerConfig) queue.Producer { return nil } -func (q *testQueue) Get(sz int) (queue.Batch, error) { +func (q *testQueue) Get(sz int, _ int) (queue.Batch, error) { if q.get != nil { return q.get(sz) } diff --git a/libbeat/publisher/pipeline/queue_reader.go b/libbeat/publisher/pipeline/queue_reader.go index fa68b83739ce..bc2ce894b853 100644 --- a/libbeat/publisher/pipeline/queue_reader.go +++ b/libbeat/publisher/pipeline/queue_reader.go @@ -54,7 +54,7 @@ func (qr *queueReader) run(logger *logp.Logger) { logger.Debug("pipeline event consumer queue reader: stop") return } - queueBatch, _ := req.queue.Get(req.batchSize) + queueBatch, _ := req.queue.Get(req.batchSize, 0) var batch *ttlBatch if queueBatch != nil { batch = newBatch(req.retryer, queueBatch, req.timeToLive) diff --git a/libbeat/publisher/queue/diskqueue/benchmark_test.go b/libbeat/publisher/queue/diskqueue/benchmark_test.go index 1ac91e57ce1d..134041634fa1 100644 --- a/libbeat/publisher/queue/diskqueue/benchmark_test.go +++ b/libbeat/publisher/queue/diskqueue/benchmark_test.go @@ -134,7 +134,7 @@ func publishEvents(p queue.Producer, num int, protobuf bool) { func getAndAckEvents(q *diskQueue, num_events int, batch_size int) error { var received int for { - batch, err := q.Get(batch_size) + batch, err := q.Get(batch_size, 0) if err != nil { return err } diff --git a/libbeat/publisher/queue/diskqueue/consumer.go b/libbeat/publisher/queue/diskqueue/consumer.go index 0ebdcef5ad3a..9ab03eea2504 100644 --- a/libbeat/publisher/queue/diskqueue/consumer.go +++ b/libbeat/publisher/queue/diskqueue/consumer.go @@ -28,7 +28,7 @@ type diskQueueBatch struct { frames []*readFrame } -func (dq *diskQueue) Get(eventCount int) (queue.Batch, error) { +func (dq *diskQueue) Get(eventCount int, _ int) (queue.Batch, error) { // We can always eventually read at least one frame unless the queue or the // consumer is closed. 
frame, ok := <-dq.readerLoop.output diff --git a/libbeat/publisher/queue/memqueue/broker.go b/libbeat/publisher/queue/memqueue/broker.go index 23569f02150a..fabe26d8d213 100644 --- a/libbeat/publisher/queue/memqueue/broker.go +++ b/libbeat/publisher/queue/memqueue/broker.go @@ -104,8 +104,9 @@ type broker struct { } type Settings struct { - // The number of events the queue can hold. - Events int + // The number of events and bytes the queue can hold. <= zero means no limit. + // At least one must be greater than zero. + Events, Bytes int // The most events that will ever be returned from one Get request. MaxGetRequest int @@ -267,13 +268,13 @@ func (b *broker) Producer(cfg queue.ProducerConfig) queue.Producer { return newProducer(b, cfg.ACK, cfg.OnDrop, cfg.DropOnCancel, encoder) } -func (b *broker) Get(count int) (queue.Batch, error) { +func (b *broker) Get(count int, bytes int) (queue.Batch, error) { responseChan := make(chan *batch, 1) select { case <-b.ctx.Done(): return nil, io.EOF case b.getChan <- getRequest{ - entryCount: count, responseChan: responseChan}: + entryCount: count, byteCount: bytes, responseChan: responseChan}: } // if request has been sent, we have to wait for a response diff --git a/libbeat/publisher/queue/memqueue/config.go b/libbeat/publisher/queue/memqueue/config.go index 7d9593b30e31..975da5118671 100644 --- a/libbeat/publisher/queue/memqueue/config.go +++ b/libbeat/publisher/queue/memqueue/config.go @@ -27,22 +27,25 @@ import ( type config struct { Events int `config:"events" validate:"min=32"` - // This field is named MaxGetRequest because its logical effect is to give + Bytes int `config:"bytes" validate:"min=32768"` + + // This field is named MaxGetEvents because its logical effect is to give // a maximum on the number of events a Get request can return, but the // user-exposed name is "flush.min_events" for backwards compatibility, // since it used to control buffer size in the internal buffer chain. - MaxGetRequest int `config:"flush.min_events" validate:"min=0"` - FlushTimeout time.Duration `config:"flush.timeout"` + // Ignored if a byte limit is set in the queue or the get request. + MaxGetEvents int `config:"flush.min_events" validate:"min=0"` + FlushTimeout time.Duration `config:"flush.timeout"` } var defaultConfig = config{ - Events: 3200, - MaxGetRequest: 1600, - FlushTimeout: 10 * time.Second, + Events: 3200, + MaxGetEvents: 1600, + FlushTimeout: 10 * time.Second, } func (c *config) Validate() error { - if c.MaxGetRequest > c.Events { + if c.MaxGetEvents > c.Events { return errors.New("flush.min_events must be less events") } return nil @@ -60,7 +63,7 @@ func SettingsForUserConfig(cfg *c.C) (Settings, error) { //nolint:gosimple // Actually want this conversion to be explicit since the types aren't definitionally equal. return Settings{ Events: config.Events, - MaxGetRequest: config.MaxGetRequest, + MaxGetRequest: config.MaxGetEvents, FlushTimeout: config.FlushTimeout, }, nil } diff --git a/libbeat/publisher/queue/memqueue/internal_api.go b/libbeat/publisher/queue/memqueue/internal_api.go index 95b5e0eba90f..f3ce7765cc4b 100644 --- a/libbeat/publisher/queue/memqueue/internal_api.go +++ b/libbeat/publisher/queue/memqueue/internal_api.go @@ -28,6 +28,11 @@ type pushRequest struct { // early encoding, 0 otherwise. eventSize int + // If the queue doesn't have room for an incoming event and blockIfFull + // is true, the request will be held until there is space in the queue. + // Otherwise, the queue will return failure immediately. 
+ blockIfFull bool + // The producer that generated this event, or nil if this producer does // not require ack callbacks. producer *ackProducer @@ -50,8 +55,14 @@ type producerCancelResponse struct { // consumer -> broker API type getRequest struct { - entryCount int // request entryCount events from the broker - responseChan chan *batch // channel to send response to + // The number of entries to request, or <= 0 for no limit. + entryCount int + + // The number of (encoded) event bytes to request, or <= 0 for no limit. + byteCount int + + // The channel to send the new batch to. + responseChan chan *batch } type batchDoneMsg struct{} diff --git a/libbeat/publisher/queue/memqueue/queue_test.go b/libbeat/publisher/queue/memqueue/queue_test.go index df2d16d0dec7..f9249e9912bd 100644 --- a/libbeat/publisher/queue/memqueue/queue_test.go +++ b/libbeat/publisher/queue/memqueue/queue_test.go @@ -217,7 +217,7 @@ func TestProducerClosePreservesEventCount(t *testing.T) { // Get call will block until the queue itself is cancelled. go func() { for i := 0; i < 2; i++ { - batch, err := q.Get(2) + batch, err := q.Get(2, 0) // Only error to worry about is queue closing, which isn't // a test failure. if err == nil { @@ -274,7 +274,7 @@ func queueTestWithSettings(t *testing.T, settings Settings, eventsToTest int, te queueMetricsAreValid(t, testQueue, 5, settings.Events, 0, fmt.Sprintf("%s - First send of metrics to queue", testName)) // Read events, don't yet ack them - batch, err := testQueue.Get(eventsToTest) + batch, err := testQueue.Get(eventsToTest, 0) assert.NoError(t, err, "error in Get") t.Logf("Got batch of %d events", batch.Count()) @@ -350,7 +350,7 @@ func TestEntryIDs(t *testing.T) { } for i := 0; i < entryCount; i++ { - batch, err := q.Get(1) + batch, err := q.Get(1, 0) assert.NoError(t, err, "Queue read should succeed") assert.Equal(t, batch.Count(), 1, "Returned batch should have 1 entry") @@ -381,7 +381,7 @@ func TestEntryIDs(t *testing.T) { batches := []queue.Batch{} for i := 0; i < entryCount; i++ { - batch, err := q.Get(1) + batch, err := q.Get(1, 0) assert.NoError(t, err, "Queue read should succeed") assert.Equal(t, batch.Count(), 1, "Returned batch should have 1 entry") batches = append(batches, batch) @@ -453,10 +453,10 @@ func TestBatchFreeEntries(t *testing.T) { _, ok := producer.Publish(i) require.True(t, ok, "Queue publish must succeed") } - batch1, err := testQueue.Get(batchSize) + batch1, err := testQueue.Get(batchSize, 0) require.NoError(t, err, "Queue read must succeed") require.Equal(t, batchSize, batch1.Count(), "Returned batch size must match request") - batch2, err := testQueue.Get(batchSize) + batch2, err := testQueue.Get(batchSize, 0) require.NoError(t, err, "Queue read must succeed") require.Equal(t, batchSize, batch2.Count(), "Returned batch size must match request") // Slight concurrency subtlety: we check events are non-nil after the queue diff --git a/libbeat/publisher/queue/memqueue/runloop.go b/libbeat/publisher/queue/memqueue/runloop.go index 45ae3c0a1a2b..40ba2be358e4 100644 --- a/libbeat/publisher/queue/memqueue/runloop.go +++ b/libbeat/publisher/queue/memqueue/runloop.go @@ -20,6 +20,8 @@ package memqueue import ( "time" + "github.com/elastic/beats/v7/libbeat/common/fifo" + "github.com/elastic/beats/v7/libbeat/publisher/queue" ) @@ -37,6 +39,9 @@ type runLoop struct { // The total number of events in the queue. 
eventCount int + // The total number of bytes in the queue + byteCount int + // The number of consumed events waiting for acknowledgment. The next Get // request will return events starting at position // (bufPos + consumedCount) % len(buf). @@ -47,6 +52,11 @@ type runLoop struct { // outstanding batches, only the ones not yet forwarded to ackLoop.) consumedBatches batchList + // pendingPushRequests stores incoming events that can't yet fit in the + // queue. As space in the queue is freed, these requests will be handled + // in order. + pendingPushRequests fifo.FIFO[pushRequest] + // If there aren't enough events ready to fill an incoming get request, // the queue may block based on its flush settings. When this happens, // pendingGetRequest stores the request until we're ready to handle it. @@ -86,15 +96,19 @@ func (l *runLoop) run() { } } +func (l *runLoop) isSpaceAvailable() bool { + maxEvents := l.broker.settings.Events + maxBytes := l.broker.settings.Bytes + + eventsAvailable := maxEvents <= 0 || l.eventCount < maxEvents + bytesAvailable := maxBytes <= 0 || l.byteCount < maxBytes + + return eventsAvailable && bytesAvailable +} + // Perform one iteration of the queue's main run loop. Broken out into a // standalone helper function to allow testing of loop invariants. func (l *runLoop) runIteration() { - var pushChan chan pushRequest - // Push requests are enabled if the queue isn't yet full. - if l.eventCount < len(l.broker.buf) { - pushChan = l.broker.pushChan - } - var getChan chan getRequest // Get requests are enabled if the queue has events that weren't yet sent // to consumers, and no existing request is active. @@ -119,7 +133,7 @@ func (l *runLoop) runIteration() { case <-l.broker.ctx.Done(): return - case req := <-pushChan: // producer pushing new event + case req := <-l.broker.pushChan: // producer pushing new event l.handleInsert(&req) case req := <-l.broker.cancelChan: // producer cancelling active events @@ -148,8 +162,12 @@ func (l *runLoop) runIteration() { } func (l *runLoop) handleGetRequest(req *getRequest) { - if req.entryCount <= 0 || req.entryCount > l.broker.settings.MaxGetRequest { - req.entryCount = l.broker.settings.MaxGetRequest + // Backwards compatibility: if all byte parameters are <= 0, get requests + // are capped by settings.MaxGetRequest. + if req.byteCount <= 0 && l.broker.settings.Bytes <= 0 { + if req.entryCount <= 0 || req.entryCount > l.broker.settings.MaxGetRequest { + req.entryCount = l.broker.settings.MaxGetRequest + } } if l.getRequestShouldBlock(req) { l.pendingGetRequest = req diff --git a/libbeat/publisher/queue/memqueue/runloop_test.go b/libbeat/publisher/queue/memqueue/runloop_test.go index d25537265ea3..b74f5fca414e 100644 --- a/libbeat/publisher/queue/memqueue/runloop_test.go +++ b/libbeat/publisher/queue/memqueue/runloop_test.go @@ -61,7 +61,7 @@ func TestFlushSettingsDoNotBlockFullBatches(t *testing.T) { go func() { // Run the Get asynchronously so the test itself doesn't block if // there's a logical error. - _, _ = broker.Get(100) + _, _ = broker.Get(100, 0) }() rl.runIteration() assert.Nil(t, rl.pendingGetRequest, "Queue should have no pending get request since the request should succeed immediately") @@ -98,7 +98,7 @@ func TestFlushSettingsBlockPartialBatches(t *testing.T) { go func() { // Run the Get asynchronously so the test itself doesn't block if // there's a logical error. 
- _, _ = broker.Get(101) + _, _ = broker.Get(101, 0) }() rl.runIteration() assert.NotNil(t, rl.pendingGetRequest, "Queue should have a pending get request since the queue doesn't have the requested event count") diff --git a/libbeat/publisher/queue/queue.go b/libbeat/publisher/queue/queue.go index e691c2888f66..32f7102c05f1 100644 --- a/libbeat/publisher/queue/queue.go +++ b/libbeat/publisher/queue/queue.go @@ -73,9 +73,10 @@ type Queue interface { Producer(cfg ProducerConfig) Producer - // Get retrieves a batch of up to eventCount events. If eventCount <= 0, - // there is no bound on the number of returned events. - Get(eventCount int) (Batch, error) + // Get retrieves an event batch with up to eventCount events or up to + // byteCount bytes, whichever is smaller. If either parameter is <= 0, + // there is no limit on that value. + Get(eventCount int, byteCount int) (Batch, error) Metrics() (Metrics, error) } diff --git a/libbeat/publisher/queue/queuetest/producer_cancel.go b/libbeat/publisher/queue/queuetest/producer_cancel.go index 6bb8a9bdd083..43bb42cd60d7 100644 --- a/libbeat/publisher/queue/queuetest/producer_cancel.go +++ b/libbeat/publisher/queue/queuetest/producer_cancel.go @@ -76,7 +76,7 @@ func TestProducerCancelRemovesEvents(t *testing.T, factory QueueFactory) { total := N2 - N1 events := make([]interface{}, 0, total) for len(events) < total { - batch, err := b.Get(-1) // collect all events + batch, err := b.Get(-1, 0) // collect all events if err != nil { panic(err) } diff --git a/libbeat/publisher/queue/queuetest/queuetest.go b/libbeat/publisher/queue/queuetest/queuetest.go index 96b2310d2223..ee60cb4b10c4 100644 --- a/libbeat/publisher/queue/queuetest/queuetest.go +++ b/libbeat/publisher/queue/queuetest/queuetest.go @@ -297,7 +297,7 @@ func multiConsumer(numConsumers, maxEvents, batchSize int) workerFactory { go func() { for { - batch, err := b.Get(batchSize) + batch, err := b.Get(batchSize, 0) if err != nil { return } From 8ab68edf908d2bf72893d0318fb794077fa0fbcc Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Fri, 3 May 2024 17:17:57 -0400 Subject: [PATCH 70/99] Fill out more of the byte limits API, remove EntryID --- libbeat/publisher/pipeline/client.go | 4 +- libbeat/publisher/pipeline/controller.go | 8 +- libbeat/publisher/pipeline/pipeline_test.go | 20 ++-- .../queue/diskqueue/benchmark_test.go | 2 +- libbeat/publisher/queue/diskqueue/producer.go | 8 +- libbeat/publisher/queue/memqueue/broker.go | 2 - .../publisher/queue/memqueue/internal_api.go | 4 +- libbeat/publisher/queue/memqueue/produce.go | 81 +++++-------- .../publisher/queue/memqueue/queue_test.go | 111 +----------------- libbeat/publisher/queue/memqueue/runloop.go | 48 +++----- .../publisher/queue/memqueue/runloop_test.go | 6 +- libbeat/publisher/queue/queue.go | 23 ++-- 12 files changed, 83 insertions(+), 234 deletions(-) diff --git a/libbeat/publisher/pipeline/client.go b/libbeat/publisher/pipeline/client.go index a5c02faace6d..4a9cf39a5438 100644 --- a/libbeat/publisher/pipeline/client.go +++ b/libbeat/publisher/pipeline/client.go @@ -119,9 +119,9 @@ func (c *client) publish(e beat.Event) { var published bool if c.canDrop { - _, published = c.producer.TryPublish(pubEvent) + published = c.producer.TryPublish(pubEvent) } else { - _, published = c.producer.Publish(pubEvent) + published = c.producer.Publish(pubEvent) } if published { diff --git a/libbeat/publisher/pipeline/controller.go b/libbeat/publisher/pipeline/controller.go index bb75c9619c57..ea85d4891af9 100644 --- 
a/libbeat/publisher/pipeline/controller.go +++ b/libbeat/publisher/pipeline/controller.go @@ -295,12 +295,12 @@ func (c *outputController) createQueueIfNeeded(outGrp outputs.Group) { // a producer for a nonexistent queue. type emptyProducer struct{} -func (emptyProducer) Publish(_ queue.Entry) (queue.EntryID, bool) { - return 0, false +func (emptyProducer) Publish(_ queue.Entry) bool { + return false } -func (emptyProducer) TryPublish(_ queue.Entry) (queue.EntryID, bool) { - return 0, false +func (emptyProducer) TryPublish(_ queue.Entry) bool { + return false } func (emptyProducer) Cancel() int { diff --git a/libbeat/publisher/pipeline/pipeline_test.go b/libbeat/publisher/pipeline/pipeline_test.go index 50ed27454b62..f292810ded37 100644 --- a/libbeat/publisher/pipeline/pipeline_test.go +++ b/libbeat/publisher/pipeline/pipeline_test.go @@ -99,9 +99,9 @@ func makeDiscardQueue() queue.Queue { // it's also the returned Event ID count := uint64(0) producer := &testProducer{ - publish: func(try bool, event queue.Entry) (queue.EntryID, bool) { + publish: func(try bool, event queue.Entry) bool { count++ - return queue.EntryID(count), true + return true }, cancel: func() int { @@ -124,7 +124,7 @@ type testQueue struct { } type testProducer struct { - publish func(try bool, event queue.Entry) (queue.EntryID, bool) + publish func(try bool, event queue.Entry) bool cancel func() int } @@ -164,18 +164,18 @@ func (q *testQueue) Get(sz int, _ int) (queue.Batch, error) { return nil, nil } -func (p *testProducer) Publish(event queue.Entry) (queue.EntryID, bool) { +func (p *testProducer) Publish(event queue.Entry) bool { if p.publish != nil { return p.publish(false, event) } - return 0, false + return false } -func (p *testProducer) TryPublish(event queue.Entry) (queue.EntryID, bool) { +func (p *testProducer) TryPublish(event queue.Entry) bool { if p.publish != nil { return p.publish(true, event) } - return 0, false + return false } func (p *testProducer) Cancel() int { @@ -210,7 +210,7 @@ func makeTestQueue() queue.Queue { var producer *testProducer p := blockingProducer(cfg) producer = &testProducer{ - publish: func(try bool, event queue.Entry) (queue.EntryID, bool) { + publish: func(try bool, event queue.Entry) bool { if try { return p.TryPublish(event) } @@ -242,10 +242,10 @@ func blockingProducer(_ queue.ProducerConfig) queue.Producer { waiting := atomic.MakeInt(0) return &testProducer{ - publish: func(_ bool, _ queue.Entry) (queue.EntryID, bool) { + publish: func(_ bool, _ queue.Entry) bool { waiting.Inc() <-sig - return 0, false + return false }, cancel: func() int { diff --git a/libbeat/publisher/queue/diskqueue/benchmark_test.go b/libbeat/publisher/queue/diskqueue/benchmark_test.go index 134041634fa1..0d9d1b373b4c 100644 --- a/libbeat/publisher/queue/diskqueue/benchmark_test.go +++ b/libbeat/publisher/queue/diskqueue/benchmark_test.go @@ -124,7 +124,7 @@ func publishEvents(p queue.Producer, num int, protobuf bool) { } else { e = makePublisherEvent() } - _, ok := p.Publish(e) + ok := p.Publish(e) if !ok { panic("didn't publish") } diff --git a/libbeat/publisher/queue/diskqueue/producer.go b/libbeat/publisher/queue/diskqueue/producer.go index 69725c62ccc1..480afdca6ed3 100644 --- a/libbeat/publisher/queue/diskqueue/producer.go +++ b/libbeat/publisher/queue/diskqueue/producer.go @@ -49,12 +49,12 @@ type producerWriteRequest struct { // diskQueueProducer implementation of the queue.Producer interface // -func (producer *diskQueueProducer) Publish(event queue.Entry) (queue.EntryID, bool) { - return 0, 
producer.publish(event, true) +func (producer *diskQueueProducer) Publish(event queue.Entry) bool { + return producer.publish(event, true) } -func (producer *diskQueueProducer) TryPublish(event queue.Entry) (queue.EntryID, bool) { - return 0, producer.publish(event, false) +func (producer *diskQueueProducer) TryPublish(event queue.Entry) bool { + return producer.publish(event, false) } func (producer *diskQueueProducer) publish( diff --git a/libbeat/publisher/queue/memqueue/broker.go b/libbeat/publisher/queue/memqueue/broker.go index fabe26d8d213..1c9f7a291a1a 100644 --- a/libbeat/publisher/queue/memqueue/broker.go +++ b/libbeat/publisher/queue/memqueue/broker.go @@ -118,7 +118,6 @@ type Settings struct { type queueEntry struct { event queue.Entry - id queue.EntryID producer *ackProducer producerID producerID // The order of this entry within its producer @@ -297,7 +296,6 @@ func (b *broker) Metrics() (queue.Metrics, error) { EventCount: opt.UintWith(uint64(resp.currentQueueSize)), EventLimit: opt.UintWith(uint64(len(b.buf))), UnackedConsumedEvents: opt.UintWith(uint64(resp.occupiedRead)), - OldestEntryID: resp.oldestEntryID, }, nil } diff --git a/libbeat/publisher/queue/memqueue/internal_api.go b/libbeat/publisher/queue/memqueue/internal_api.go index f3ce7765cc4b..e7bce4eba578 100644 --- a/libbeat/publisher/queue/memqueue/internal_api.go +++ b/libbeat/publisher/queue/memqueue/internal_api.go @@ -40,7 +40,7 @@ type pushRequest struct { // The index of the event in this producer only. Used to condense // multiple acknowledgments for a producer to a single callback call. producerID producerID - resp chan queue.EntryID + resp chan bool } type producerCancelRequest struct { @@ -79,6 +79,4 @@ type memQueueMetrics struct { currentQueueSize int // the number of items that have been read by a consumer but not yet ack'ed occupiedRead int - - oldestEntryID queue.EntryID } diff --git a/libbeat/publisher/queue/memqueue/produce.go b/libbeat/publisher/queue/memqueue/produce.go index 55f15a8cc869..3932cb95fa7e 100644 --- a/libbeat/publisher/queue/memqueue/produce.go +++ b/libbeat/publisher/queue/memqueue/produce.go @@ -76,19 +76,23 @@ func newProducer(b *broker, cb ackHandler, dropCB func(queue.Entry), dropOnCance return &forgetfulProducer{broker: b, openState: openState} } -func (p *forgetfulProducer) makePushRequest(event queue.Entry) pushRequest { - resp := make(chan queue.EntryID, 1) +func (p *forgetfulProducer) makePushRequest( + event queue.Entry, + blockIfFull bool, +) pushRequest { + resp := make(chan bool, 1) return pushRequest{ - event: event, - resp: resp} + event: event, + blockIfFull: blockIfFull, + resp: resp} } -func (p *forgetfulProducer) Publish(event queue.Entry) (queue.EntryID, bool) { - return p.openState.publish(p.makePushRequest(event)) +func (p *forgetfulProducer) Publish(event queue.Entry) bool { + return p.openState.publish(p.makePushRequest(event, true)) } -func (p *forgetfulProducer) TryPublish(event queue.Entry) (queue.EntryID, bool) { - return p.openState.tryPublish(p.makePushRequest(event)) +func (p *forgetfulProducer) TryPublish(event queue.Entry) bool { + return p.openState.publish(p.makePushRequest(event, false)) } func (p *forgetfulProducer) Cancel() int { @@ -96,31 +100,32 @@ func (p *forgetfulProducer) Cancel() int { return 0 } -func (p *ackProducer) makePushRequest(event queue.Entry) pushRequest { - resp := make(chan queue.EntryID, 1) +func (p *ackProducer) makePushRequest(event queue.Entry, blockIfFull bool) pushRequest { + resp := make(chan bool, 1) return 
pushRequest{ - event: event, - producer: p, + event: event, + blockIfFull: blockIfFull, + producer: p, // We add 1 to the id so the default lastACK of 0 is a // valid initial state and 1 is the first real id. producerID: producerID(p.producedCount + 1), resp: resp} } -func (p *ackProducer) Publish(event queue.Entry) (queue.EntryID, bool) { - id, published := p.openState.publish(p.makePushRequest(event)) +func (p *ackProducer) Publish(event queue.Entry) bool { + published := p.openState.publish(p.makePushRequest(event, true)) if published { p.producedCount++ } - return id, published + return published } -func (p *ackProducer) TryPublish(event queue.Entry) (queue.EntryID, bool) { - id, published := p.openState.tryPublish(p.makePushRequest(event)) +func (p *ackProducer) TryPublish(event queue.Entry) bool { + published := p.openState.publish(p.makePushRequest(event, false)) if published { p.producedCount++ } - return id, published + return published } func (p *ackProducer) Cancel() int { @@ -144,7 +149,7 @@ func (st *openState) Close() { close(st.done) } -func (st *openState) publish(req pushRequest) (queue.EntryID, bool) { +func (st *openState) publish(req pushRequest) bool { // If we were given an encoder callback for incoming events, apply it before // sending the entry to the queue. if st.encoder != nil { @@ -158,44 +163,12 @@ func (st *openState) publish(req pushRequest) (queue.EntryID, bool) { // shutdown channel. select { case resp := <-req.resp: - return resp, true + return resp case <-st.queueDone: - st.events = nil - return 0, false } case <-st.done: - st.events = nil - return 0, false case <-st.queueDone: - st.events = nil - return 0, false - } -} - -func (st *openState) tryPublish(req pushRequest) (queue.EntryID, bool) { - // If we were given an encoder callback for incoming events, apply it before - // sending the entry to the queue. - if st.encoder != nil { - req.event, req.eventSize = st.encoder.EncodeEntry(req.event) - } - select { - case st.events <- req: - // The events channel is buffered, which means we may successfully - // write to it even if the queue is shutting down. To avoid blocking - // forever during shutdown, we also have to wait on the queue's - // shutdown channel. - select { - case resp := <-req.resp: - return resp, true - case <-st.queueDone: - st.events = nil - return 0, false - } - case <-st.done: - st.events = nil - return 0, false - default: - st.log.Debugf("Dropping event, queue is blocked") - return 0, false } + st.events = nil + return false } diff --git a/libbeat/publisher/queue/memqueue/queue_test.go b/libbeat/publisher/queue/memqueue/queue_test.go index f9249e9912bd..25db410dc97c 100644 --- a/libbeat/publisher/queue/memqueue/queue_test.go +++ b/libbeat/publisher/queue/memqueue/queue_test.go @@ -107,14 +107,14 @@ func TestProducerDoesNotBlockWhenQueueClosed(t *testing.T) { // Publish 2 events, this will make the queue full, but // both will be accepted for i := 0; i < 2; i++ { - id, ok := p.Publish(fmt.Sprintf("Event %d", i)) + ok := p.Publish(fmt.Sprintf("Event %d", i)) if !ok { - t.Errorf("failed to publish to the queue, event ID: %v", id) + t.Errorf("failed to publish to the queue") return } publishCount.Add(1) } - _, ok := p.Publish("Event 3") + ok := p.Publish("Event 3") if ok { t.Errorf("publishing the 3rd event must fail") return @@ -191,7 +191,7 @@ func TestProducerClosePreservesEventCount(t *testing.T) { // decrement afterwards if it failed (otherwise the event count // could become negative even under correct queue operation). 
activeEvents.Add(1) - _, ok := p.Publish(event) + ok := p.Publish(event) if !ok { activeEvents.Add(-1) } @@ -337,107 +337,6 @@ func TestAdjustInputQueueSize(t *testing.T) { }) } -func TestEntryIDs(t *testing.T) { - entryCount := 100 - - testForward := func(q queue.Queue) { - waiter := &producerACKWaiter{} - producer := q.Producer(queue.ProducerConfig{ACK: waiter.ack}) - for i := 0; i < entryCount; i++ { - id, success := producer.Publish(nil) - assert.Equal(t, success, true, "Queue publish should succeed") - assert.Equal(t, id, queue.EntryID(i), "Entry ID should match publication order") - } - - for i := 0; i < entryCount; i++ { - batch, err := q.Get(1, 0) - assert.NoError(t, err, "Queue read should succeed") - assert.Equal(t, batch.Count(), 1, "Returned batch should have 1 entry") - - metrics, err := q.Metrics() - assert.NoError(t, err, "Queue metrics call should succeed") - assert.Equal(t, metrics.OldestEntryID, queue.EntryID(i), - fmt.Sprintf("Oldest entry ID before ACKing event %v should be %v", i, i)) - - batch.Done() - waiter.waitForEvents(1) - metrics, err = q.Metrics() - assert.NoError(t, err, "Queue metrics call should succeed") - assert.Equal(t, metrics.OldestEntryID, queue.EntryID(i+1), - fmt.Sprintf("Oldest entry ID after ACKing event %v should be %v", i, i+1)) - - } - } - - testBackward := func(q queue.Queue) { - waiter := &producerACKWaiter{} - producer := q.Producer(queue.ProducerConfig{ACK: waiter.ack}) - for i := 0; i < entryCount; i++ { - id, success := producer.Publish(nil) - assert.Equal(t, success, true, "Queue publish should succeed") - assert.Equal(t, id, queue.EntryID(i), "Entry ID should match publication order") - } - - batches := []queue.Batch{} - - for i := 0; i < entryCount; i++ { - batch, err := q.Get(1, 0) - assert.NoError(t, err, "Queue read should succeed") - assert.Equal(t, batch.Count(), 1, "Returned batch should have 1 entry") - batches = append(batches, batch) - } - - for i := entryCount - 1; i > 0; i-- { - batches[i].Done() - - // It's hard to remove this delay since the Done signal is propagated - // asynchronously to the queue, and since this test is ensuring that the - // queue _doesn't_ advance we can't use a callback to gate the comparison - // like we do in testForward. However: - // - While this race condition could sometimes let a buggy implementation - // pass, it will not produce a false failure (so it won't contribute - // to general test flakiness) - // - That notwithstanding, when the ACK _does_ cause an incorrect - // metrics update, this delay is enough to recognize it approximately - // 100% of the time, so this test is still a good signal despite - // the slight nondeterminism. 
- time.Sleep(1 * time.Millisecond) - metrics, err := q.Metrics() - assert.NoError(t, err, "Queue metrics call should succeed") - assert.Equal(t, metrics.OldestEntryID, queue.EntryID(0), - fmt.Sprintf("Oldest entry ID after ACKing event %v should be 0", i)) - } - // ACK the first batch, which should unblock all the later ones - batches[0].Done() - waiter.waitForEvents(100) - metrics, err := q.Metrics() - assert.NoError(t, err, "Queue metrics call should succeed") - assert.Equal(t, metrics.OldestEntryID, queue.EntryID(100), - fmt.Sprintf("Oldest entry ID after ACKing event 0 should be %v", queue.EntryID(entryCount))) - - } - - t.Run("acking in forward order with directEventLoop reports the right event IDs", func(t *testing.T) { - testQueue := NewQueue(nil, nil, Settings{Events: 1000}, 0, nil) - testForward(testQueue) - }) - - t.Run("acking in reverse order with directEventLoop reports the right event IDs", func(t *testing.T) { - testQueue := NewQueue(nil, nil, Settings{Events: 1000}, 0, nil) - testBackward(testQueue) - }) - - t.Run("acking in forward order with bufferedEventLoop reports the right event IDs", func(t *testing.T) { - testQueue := NewQueue(nil, nil, Settings{Events: 1000, MaxGetRequest: 2, FlushTimeout: time.Microsecond}, 0, nil) - testForward(testQueue) - }) - - t.Run("acking in reverse order with bufferedEventLoop reports the right event IDs", func(t *testing.T) { - testQueue := NewQueue(nil, nil, Settings{Events: 1000, MaxGetRequest: 2, FlushTimeout: time.Microsecond}, 0, nil) - testBackward(testQueue) - }) -} - func TestBatchFreeEntries(t *testing.T) { const queueSize = 10 const batchSize = 5 @@ -450,7 +349,7 @@ func TestBatchFreeEntries(t *testing.T) { testQueue := NewQueue(nil, nil, Settings{Events: queueSize, MaxGetRequest: batchSize, FlushTimeout: time.Second}, 0, nil) producer := testQueue.Producer(queue.ProducerConfig{}) for i := 0; i < queueSize; i++ { - _, ok := producer.Publish(i) + ok := producer.Publish(i) require.True(t, ok, "Queue publish must succeed") } batch1, err := testQueue.Get(batchSize, 0) diff --git a/libbeat/publisher/queue/memqueue/runloop.go b/libbeat/publisher/queue/memqueue/runloop.go index 40ba2be358e4..d18079fa1148 100644 --- a/libbeat/publisher/queue/memqueue/runloop.go +++ b/libbeat/publisher/queue/memqueue/runloop.go @@ -21,8 +21,6 @@ import ( "time" "github.com/elastic/beats/v7/libbeat/common/fifo" - - "github.com/elastic/beats/v7/libbeat/publisher/queue" ) // runLoop internal state. These fields could mostly be local variables @@ -66,11 +64,6 @@ type runLoop struct { // to a pending getRequest even if we can't fill the requested event count. // It is active if and only if pendingGetRequest is non-nil. getTimer *time.Timer - - // TODO (https://github.com/elastic/beats/issues/37893): entry IDs were a - // workaround for an external project that no longer exists. At this point - // they just complicate the API and should be removed. - nextEntryID queue.EntryID } func newRunLoop(broker *broker) *runLoop { @@ -106,6 +99,10 @@ func (l *runLoop) isSpaceAvailable() bool { return eventsAvailable && bytesAvailable } +func (l *runLoop) canHandlePushRequest(req pushRequest) bool { + return false +} + // Perform one iteration of the queue's main run loop. Broken out into a // standalone helper function to allow testing of loop invariants. 
func (l *runLoop) runIteration() { @@ -134,7 +131,7 @@ func (l *runLoop) runIteration() { return case req := <-l.broker.pushChan: // producer pushing new event - l.handleInsert(&req) + l.handleInsert(req) case req := <-l.broker.cancelChan: // producer cancelling active events l.handleCancel(&req) @@ -212,12 +209,21 @@ func (l *runLoop) handleDelete(count int) { l.consumedCount -= count } -func (l *runLoop) handleInsert(req *pushRequest) { - if l.insert(req, l.nextEntryID) { +func (l *runLoop) handleInsert(req pushRequest) { + if !l.canHandlePushRequest(req) { + if req.blockIfFull { + // Add this request to the pending list to be handled when there's space. + l.pendingPushRequests.Add(req) + } else { + l.broker.logger.Debugf("Dropping event, queue is blocked") + req.resp <- false + } + return + } + if l.insert(req) { // Send back the new event id. - req.resp <- l.nextEntryID + req.resp <- true - l.nextEntryID++ l.eventCount++ // See if this gave us enough for a new batch @@ -242,16 +248,14 @@ func (l *runLoop) maybeUnblockGetRequest() { } // Returns true if the event was inserted, false if insertion was cancelled. -func (l *runLoop) insert(req *pushRequest, id queue.EntryID) bool { +func (l *runLoop) insert(req pushRequest) bool { if req.producer != nil && req.producer.state.cancelled { - reportCancelledState(req) return false } index := (l.bufPos + l.eventCount) % len(l.broker.buf) l.broker.buf[index] = queueEntry{ event: req.event, - id: id, producer: req.producer, producerID: req.producerID, } @@ -259,16 +263,9 @@ func (l *runLoop) insert(req *pushRequest, id queue.EntryID) bool { } func (l *runLoop) handleMetricsRequest(req *metricsRequest) { - oldestEntryID := l.nextEntryID - if l.eventCount > 0 { - index := l.bufPos % len(l.broker.buf) - oldestEntryID = l.broker.buf[index].id - } - req.responseChan <- memQueueMetrics{ currentQueueSize: l.eventCount, occupiedRead: l.consumedCount, - oldestEntryID: oldestEntryID, } } @@ -311,10 +308,3 @@ func (l *runLoop) handleCancel(req *producerCancelRequest) { req.resp <- producerCancelResponse{removed: removedCount} } } - -func reportCancelledState(req *pushRequest) { - // do not add waiting events if producer did send cancel signal - if cb := req.producer.state.dropCB; cb != nil { - cb(req.event) - } -} diff --git a/libbeat/publisher/queue/memqueue/runloop_test.go b/libbeat/publisher/queue/memqueue/runloop_test.go index b74f5fca414e..51e6c4587140 100644 --- a/libbeat/publisher/queue/memqueue/runloop_test.go +++ b/libbeat/publisher/queue/memqueue/runloop_test.go @@ -50,7 +50,7 @@ func TestFlushSettingsDoNotBlockFullBatches(t *testing.T) { // Pair each publish call with an iteration of the run loop so we // get a response. go rl.runIteration() - _, ok := producer.Publish(i) + ok := producer.Publish(i) require.True(t, ok, "Queue publish call must succeed") } @@ -89,7 +89,7 @@ func TestFlushSettingsBlockPartialBatches(t *testing.T) { // Pair each publish call with an iteration of the run loop so we // get a response. 
go rl.runIteration() - _, ok := producer.Publish("some event") + ok := producer.Publish("some event") require.True(t, ok, "Queue publish call must succeed") } @@ -106,7 +106,7 @@ func TestFlushSettingsBlockPartialBatches(t *testing.T) { // Now confirm that adding one more event unblocks the request go func() { - _, _ = producer.Publish("some event") + _ = producer.Publish("some event") }() rl.runIteration() assert.Nil(t, rl.pendingGetRequest, "Queue should have no pending get request since adding an event should unblock the previous one") diff --git a/libbeat/publisher/queue/queue.go b/libbeat/publisher/queue/queue.go index 32f7102c05f1..402f3106b437 100644 --- a/libbeat/publisher/queue/queue.go +++ b/libbeat/publisher/queue/queue.go @@ -47,10 +47,6 @@ type Metrics struct { //OldestActiveTimestamp is the timestamp of the oldest item in the queue. OldestActiveTimestamp common.Time - - // OldestActiveID is ID of the oldest unacknowledged event in the queue, or - // the next ID that will be assigned if the queue is empty. - OldestEntryID EntryID } // ErrMetricsNotImplemented is a hopefully temporary type to mark queue metrics as not yet implemented @@ -119,21 +115,16 @@ type ProducerConfig struct { DropOnCancel bool } -type EntryID uint64 - // Producer is an interface to be used by the pipelines client to forward // events to a queue. type Producer interface { - // Publish adds an entry to the queue, blocking if necessary, and returns - // the new entry's id and true on success. - Publish(entry Entry) (EntryID, bool) - - // TryPublish adds an entry to the queue if doing so will not block the - // caller, otherwise it immediately returns. The reasons a publish attempt - // might block are defined by the specific queue implementation and its - // configuration. If the event was successfully added, returns true with - // the event's assigned ID, and false otherwise. - TryPublish(entry Entry) (EntryID, bool) + // Publish adds an entry to the queue, blocking until there is space + // if necessary, and returns true on success. + Publish(entry Entry) bool + + // TryPublish adds an entry to the queue if the queue has space for it, + // otherwise it returns false immediately. + TryPublish(entry Entry) bool // Cancel closes this Producer endpoint. If the producer is configured to // drop its entries on Cancel, the number of dropped entries is returned. From 60e1c2cfca6c3bfb18ffb2112b409ba900281e11 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Fri, 3 May 2024 20:02:14 -0400 Subject: [PATCH 71/99] convert circular buffer indices to a helper type storing the absolute event order --- libbeat/publisher/queue/memqueue/ackloop.go | 2 +- libbeat/publisher/queue/memqueue/broker.go | 36 ++++-- libbeat/publisher/queue/memqueue/runloop.go | 106 ++++++++++++------ .../publisher/queue/memqueue/runloop_test.go | 6 +- 4 files changed, 104 insertions(+), 46 deletions(-) diff --git a/libbeat/publisher/queue/memqueue/ackloop.go b/libbeat/publisher/queue/memqueue/ackloop.go index 1a964d8bb45f..c7d125579ce1 100644 --- a/libbeat/publisher/queue/memqueue/ackloop.go +++ b/libbeat/publisher/queue/memqueue/ackloop.go @@ -122,7 +122,7 @@ func (l *ackLoop) processACK(lst batchList, N int) { // Traverse entries from last to first, so we can acknowledge the most recent // ones first and skip subsequent producer callbacks. 
for i := batch.count - 1; i >= 0; i-- { - entry := batch.rawEntry(i) + entry := batch.mutableEntry(i) if entry.producer == nil { continue } diff --git a/libbeat/publisher/queue/memqueue/broker.go b/libbeat/publisher/queue/memqueue/broker.go index 1c9f7a291a1a..3299f0aea90f 100644 --- a/libbeat/publisher/queue/memqueue/broker.go +++ b/libbeat/publisher/queue/memqueue/broker.go @@ -117,20 +117,32 @@ type Settings struct { } type queueEntry struct { - event queue.Entry + event queue.Entry + eventSize int producer *ackProducer producerID producerID // The order of this entry within its producer } +type entryIndex int + +func (ei entryIndex) plus(offset int) entryIndex { + return entryIndex(int(ei) + offset) +} + type batch struct { queue *broker // Next batch in the containing batchList next *batch - // Position and length of the events within the queue buffer - start, count int + // Position of the batch's events within the queue. This is an absolute + // index over the lifetime of the queue, to get the position within the + // queue's current circular buffer, use (start % len(queue.buf)). + start entryIndex + + // Number of sequential events in this batch. + count int // batch.Done() sends to doneChan, where ackLoop reads it and handles // acknowledgment / cleanup. @@ -307,7 +319,7 @@ var batchPool = sync.Pool{ }, } -func newBatch(queue *broker, start, count int) *batch { +func newBatch(queue *broker, start entryIndex, count int) *batch { batch := batchPool.Get().(*batch) batch.next = nil batch.queue = queue @@ -405,23 +417,29 @@ func (b *batch) Count() int { return b.count } +func (ei entryIndex) inBuffer(buf []queueEntry) *queueEntry { + return &buf[int(ei)%len(buf)] +} + // Return a pointer to the queueEntry for the i-th element of this batch -func (b *batch) rawEntry(i int) *queueEntry { +func (b *batch) mutableEntry(i int) *queueEntry { // Indexes wrap around the end of the queue buffer - return &b.queue.buf[(b.start+i)%len(b.queue.buf)] + entryIndex := b.start.plus(i) + return entryIndex.inBuffer(b.queue.buf) } // Return the event referenced by the i-th element of this batch func (b *batch) Entry(i int) queue.Entry { - return b.rawEntry(i).event + return b.mutableEntry(i).event } func (b *batch) FreeEntries() { // This signals that the event data has been copied out of the batch, and is // safe to free from the queue buffer, so set all the event pointers to nil. for i := 0; i < b.count; i++ { - index := (b.start + i) % len(b.queue.buf) - b.queue.buf[index].event = nil + index := b.start.plus(i) + entry := index.inBuffer(b.queue.buf) + entry.event = nil } } diff --git a/libbeat/publisher/queue/memqueue/runloop.go b/libbeat/publisher/queue/memqueue/runloop.go index d18079fa1148..bf0f8dcaa6f6 100644 --- a/libbeat/publisher/queue/memqueue/runloop.go +++ b/libbeat/publisher/queue/memqueue/runloop.go @@ -32,7 +32,7 @@ type runLoop struct { // The index of the beginning of the current ring buffer within its backing // array. If the queue isn't empty, bufPos points to the oldest remaining // event. - bufPos int + bufPos entryIndex // The total number of events in the queue. eventCount int @@ -42,8 +42,8 @@ type runLoop struct { // The number of consumed events waiting for acknowledgment. The next Get // request will return events starting at position - // (bufPos + consumedCount) % len(buf). - consumedCount int + // (bufPos + consumedEventCount) % len(buf). 
+ consumedEventCount int // The list of batches that have been consumed and are waiting to be sent // to ackLoop for acknowledgment handling. (This list doesn't contain all @@ -89,18 +89,19 @@ func (l *runLoop) run() { } } -func (l *runLoop) isSpaceAvailable() bool { +// Returns true if the given push request can be added to the queue +// without exceeding entry count or byte limits +func (l *runLoop) canFitPushRequest(req pushRequest) bool { maxEvents := l.broker.settings.Events maxBytes := l.broker.settings.Bytes - eventsAvailable := maxEvents <= 0 || l.eventCount < maxEvents - bytesAvailable := maxBytes <= 0 || l.byteCount < maxBytes + newEventCount := l.eventCount + 1 + newByteCount := l.byteCount + req.eventSize - return eventsAvailable && bytesAvailable -} + eventCountFits := maxEvents <= 0 || newEventCount <= maxEvents + byteCountFits := maxBytes <= 0 || newByteCount <= maxBytes -func (l *runLoop) canHandlePushRequest(req pushRequest) bool { - return false + return eventCountFits && byteCountFits } // Perform one iteration of the queue's main run loop. Broken out into a @@ -109,7 +110,7 @@ func (l *runLoop) runIteration() { var getChan chan getRequest // Get requests are enabled if the queue has events that weren't yet sent // to consumers, and no existing request is active. - if l.pendingGetRequest == nil && l.eventCount > l.consumedCount { + if l.pendingGetRequest == nil && l.eventCount > l.consumedEventCount { getChan = l.broker.getChan } @@ -166,6 +167,7 @@ func (l *runLoop) handleGetRequest(req *getRequest) { req.entryCount = l.broker.settings.MaxGetRequest } } + if l.getRequestShouldBlock(req) { l.pendingGetRequest = req l.getTimer.Reset(l.broker.settings.FlushTimeout) @@ -179,52 +181,63 @@ func (l *runLoop) getRequestShouldBlock(req *getRequest) bool { // Never block if the flush timeout isn't positive return false } - eventsAvailable := l.eventCount - l.consumedCount + eventsAvailable := l.eventCount - l.consumedEventCount // Block if the available events aren't enough to fill the request return eventsAvailable < req.entryCount } // Respond to the given get request without blocking or waiting for more events func (l *runLoop) handleGetReply(req *getRequest) { - eventsAvailable := l.eventCount - l.consumedCount + eventsAvailable := l.eventCount - l.consumedEventCount batchSize := req.entryCount if eventsAvailable < batchSize { batchSize = eventsAvailable } - startIndex := l.bufPos + l.consumedCount + startIndex := l.bufPos.plus(l.consumedEventCount) batch := newBatch(l.broker, startIndex, batchSize) // Send the batch to the caller and update internal state req.responseChan <- batch l.consumedBatches.append(batch) - l.consumedCount += batchSize + l.consumedEventCount += batchSize } func (l *runLoop) handleDelete(count int) { // Advance position and counters. Event data was already cleared in - // batch.FreeEntries when the events were vended. - l.bufPos = (l.bufPos + count) % len(l.broker.buf) + // batch.FreeEntries when the events were vended, so we just need to + // check the byte total being removed. 
+ deletedByteCount := 0 + for i := 0; i < count; i++ { + entryIndex := l.bufPos.plus(i) + entry := entryIndex.inBuffer(l.broker.buf) + deletedByteCount += entry.eventSize + } + l.bufPos = l.bufPos.plus(count) l.eventCount -= count - l.consumedCount -= count + l.consumedEventCount -= count + l.byteCount -= deletedByteCount } func (l *runLoop) handleInsert(req pushRequest) { - if !l.canHandlePushRequest(req) { + if !l.canFitPushRequest(req) { if req.blockIfFull { // Add this request to the pending list to be handled when there's space. l.pendingPushRequests.Add(req) } else { - l.broker.logger.Debugf("Dropping event, queue is blocked") + // Request doesn't want to block, return failure immediately. + l.broker.logger.Debugf("queue is full, dropping event") req.resp <- false } return } + // There is space, insert the new event and report the result. if l.insert(req) { // Send back the new event id. req.resp <- true l.eventCount++ + l.byteCount += req.eventSize // See if this gave us enough for a new batch l.maybeUnblockGetRequest() @@ -236,7 +249,7 @@ func (l *runLoop) maybeUnblockGetRequest() { // If a get request is blocked waiting for more events, check if // we should unblock it. if getRequest := l.pendingGetRequest; getRequest != nil { - available := l.eventCount - l.consumedCount + available := l.eventCount - l.consumedEventCount if available >= getRequest.entryCount { l.pendingGetRequest = nil if !l.getTimer.Stop() { @@ -247,15 +260,39 @@ func (l *runLoop) maybeUnblockGetRequest() { } } +// growEventBuffer is called when there is no limit on the queue event +// count (i.e. the queue size is byte-based) but the queue's event buffer +// (a []queueEntry) is full. +// For this to be possible, queue indices must be stable when the buffer +// size changes. Therefore, entry positions are based on a strictly +// increasing id, so that different events have different positions, +// even when they occupy the same location in the underlying buffer. +// The buffer position is the entry's index modulo the buffer size: for +// a queue with buffer size N, the entries stored in buf[0] will have +// entry indices 0, N, 2*N, 3*N, ... +func (l *runLoop) growEventBuffer() { + +} + // Returns true if the event was inserted, false if insertion was cancelled. func (l *runLoop) insert(req pushRequest) bool { + // We reject events if their producer was cancelled before they reach + // the queue. if req.producer != nil && req.producer.state.cancelled { return false } - index := (l.bufPos + l.eventCount) % len(l.broker.buf) - l.broker.buf[index] = queueEntry{ + maxEvents := l.broker.settings.Events + if maxEvents <= 0 && l.eventCount >= len(l.broker.buf) { + // We are allowed to add this event, but we need to grow the queue buffer + // in order to do it. + l.growEventBuffer() + } + + entryIndex := l.bufPos.plus(l.eventCount) + *entryIndex.inBuffer(l.broker.buf) = queueEntry{ event: req.event, + eventSize: req.eventSize, producer: req.producer, producerID: req.producerID, } @@ -265,7 +302,7 @@ func (l *runLoop) insert(req pushRequest) bool { func (l *runLoop) handleMetricsRequest(req *metricsRequest) { req.responseChan <- memQueueMetrics{ currentQueueSize: l.eventCount, - occupiedRead: l.consumedCount, + occupiedRead: l.consumedEventCount, } } @@ -276,11 +313,12 @@ func (l *runLoop) handleCancel(req *producerCancelRequest) { // the specified producer. As we go we condense all the remaining // events to be sequential. 
buf := l.broker.buf - startIndex := l.bufPos + l.consumedCount - unconsumedEventCount := l.eventCount - l.consumedCount + startIndex := l.bufPos.plus(l.consumedEventCount) + unconsumedEventCount := l.eventCount - l.consumedEventCount for i := 0; i < unconsumedEventCount; i++ { - readIndex := (startIndex + i) % len(buf) - if buf[readIndex].producer == req.producer { + readIndex := startIndex.plus(i) + entry := readIndex.inBuffer(buf) + if entry.producer == req.producer { // The producer matches, skip this event removedCount++ } else { @@ -288,16 +326,18 @@ func (l *runLoop) handleCancel(req *producerCancelRequest) { // earlier indices that were removed. // (Count backwards from (startIndex + i), not from readIndex, to avoid // sign issues when the buffer wraps.) - writeIndex := (startIndex + i - removedCount) % len(buf) - buf[writeIndex] = buf[readIndex] + writeIndex := startIndex.plus(i - removedCount) + if readIndex != writeIndex { + *writeIndex.inBuffer(buf) = *readIndex.inBuffer(buf) + } } } // Clear the event pointers at the end of the buffer so we don't keep // old events in memory by accident. - for i := 0; i < removedCount; i++ { - index := (l.bufPos + l.eventCount - removedCount + i) % len(buf) - buf[index].event = nil + for i := l.eventCount - removedCount; i < l.eventCount; i++ { + entryIndex := l.bufPos.plus(i) + entryIndex.inBuffer(buf).event = nil } // Subtract removed events from the internal event count diff --git a/libbeat/publisher/queue/memqueue/runloop_test.go b/libbeat/publisher/queue/memqueue/runloop_test.go index 51e6c4587140..16caa24bfbf4 100644 --- a/libbeat/publisher/queue/memqueue/runloop_test.go +++ b/libbeat/publisher/queue/memqueue/runloop_test.go @@ -65,7 +65,7 @@ func TestFlushSettingsDoNotBlockFullBatches(t *testing.T) { }() rl.runIteration() assert.Nil(t, rl.pendingGetRequest, "Queue should have no pending get request since the request should succeed immediately") - assert.Equal(t, 100, rl.consumedCount, "Queue should have a consumedCount of 100 after a consumer requested all its events") + assert.Equal(t, 100, rl.consumedEventCount, "Queue should have a consumedCount of 100 after a consumer requested all its events") } func TestFlushSettingsBlockPartialBatches(t *testing.T) { @@ -102,7 +102,7 @@ func TestFlushSettingsBlockPartialBatches(t *testing.T) { }() rl.runIteration() assert.NotNil(t, rl.pendingGetRequest, "Queue should have a pending get request since the queue doesn't have the requested event count") - assert.Equal(t, 0, rl.consumedCount, "Queue should have a consumedCount of 0 since the Get request couldn't be completely filled") + assert.Equal(t, 0, rl.consumedEventCount, "Queue should have a consumedCount of 0 since the Get request couldn't be completely filled") // Now confirm that adding one more event unblocks the request go func() { @@ -110,5 +110,5 @@ func TestFlushSettingsBlockPartialBatches(t *testing.T) { }() rl.runIteration() assert.Nil(t, rl.pendingGetRequest, "Queue should have no pending get request since adding an event should unblock the previous one") - assert.Equal(t, 101, rl.consumedCount, "Queue should have a consumedCount of 101 after adding an event unblocked the pending get request") + assert.Equal(t, 101, rl.consumedEventCount, "Queue should have a consumedCount of 101 after adding an event unblocked the pending get request") } From 812f87d2ca7daacb54786b98d234a366843057c8 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Sat, 4 May 2024 00:16:57 -0400 Subject: [PATCH 72/99] Finish most byte bounds logic, add bulk_max_bytes 
to ES output --- libbeat/common/fifo/fifo.go | 9 +- .../report/elasticsearch/elasticsearch.go | 6 +- libbeat/outputs/console/console.go | 2 +- libbeat/outputs/discard/discard.go | 2 +- libbeat/outputs/elasticsearch/config.go | 2 + .../outputs/elasticsearch/elasticsearch.go | 9 +- libbeat/outputs/fileout/file.go | 2 +- libbeat/outputs/kafka/kafka.go | 2 +- libbeat/outputs/logstash/logstash.go | 2 +- libbeat/outputs/output_reg.go | 3 +- libbeat/outputs/redis/redis.go | 2 +- libbeat/outputs/util.go | 11 +- libbeat/publisher/pipeline/client_test.go | 4 +- libbeat/publisher/pipeline/consumer.go | 12 +- libbeat/publisher/pipeline/controller.go | 9 +- libbeat/publisher/pipeline/module.go | 2 +- libbeat/publisher/pipeline/queue_reader.go | 5 +- libbeat/publisher/pipeline/stress/out.go | 2 +- libbeat/publisher/queue/memqueue/ackloop.go | 2 +- libbeat/publisher/queue/memqueue/broker.go | 45 ++-- .../queue/memqueue/circular_buffer.go | 68 +++++ libbeat/publisher/queue/memqueue/config.go | 49 +++- .../publisher/queue/memqueue/queue_test.go | 10 +- libbeat/publisher/queue/memqueue/runloop.go | 245 ++++++++++++------ 24 files changed, 342 insertions(+), 163 deletions(-) create mode 100644 libbeat/publisher/queue/memqueue/circular_buffer.go diff --git a/libbeat/common/fifo/fifo.go b/libbeat/common/fifo/fifo.go index 79251ee201d5..c6cde4d491fd 100644 --- a/libbeat/common/fifo/fifo.go +++ b/libbeat/common/fifo/fifo.go @@ -54,11 +54,12 @@ func (f *FIFO[T]) First() (T, error) { return f.first.value, nil } -// Remove the first entry in the queue, returning its value -func (f *FIFO[T]) Get() (T, error) { - result, err := f.First() +// Remove the first entry in the queue. Does nothing if the FIFO is empty. +func (f *FIFO[T]) Remove() { if f.first != nil { f.first = f.first.next + if f.first == nil { + f.last = nil + } } - return result, err } diff --git a/libbeat/monitoring/report/elasticsearch/elasticsearch.go b/libbeat/monitoring/report/elasticsearch/elasticsearch.go index e2f58fb4b169..ac0a08932d81 100644 --- a/libbeat/monitoring/report/elasticsearch/elasticsearch.go +++ b/libbeat/monitoring/report/elasticsearch/elasticsearch.go @@ -165,9 +165,9 @@ func makeReporter(beat beat.Info, settings report.Settings, cfg *conf.C) (report }, queueConfig, outputs.Group{ - Clients: []outputs.Client{outClient}, - BatchSize: windowSize, - Retry: 0, // no retry. Drop event on error. + Clients: []outputs.Client{outClient}, + BatchEvents: windowSize, + Retry: 0, // no retry. Drop event on error. 
}, pipeline.Settings{ WaitClose: 0, diff --git a/libbeat/outputs/console/console.go b/libbeat/outputs/console/console.go index f723bf818c91..4f666e6eeeed 100644 --- a/libbeat/outputs/console/console.go +++ b/libbeat/outputs/console/console.go @@ -85,7 +85,7 @@ func makeConsole( } } - return outputs.Success(config.Queue, config.BatchSize, 0, nil, c) + return outputs.Success(config.Queue, config.BatchSize, 0, 0, nil, c) } func newConsole(index string, observer outputs.Observer, codec codec.Codec) (*console, error) { diff --git a/libbeat/outputs/discard/discard.go b/libbeat/outputs/discard/discard.go index c9a51b0f33df..5f0d2af91c91 100644 --- a/libbeat/outputs/discard/discard.go +++ b/libbeat/outputs/discard/discard.go @@ -56,7 +56,7 @@ func makeDiscard( // disable bulk support in publisher pipeline _ = cfg.SetInt("bulk_max_size", -1, -1) out.log.Infof("Initialized discard output") - return outputs.Success(doConfig.Queue, -1, 0, nil, out) + return outputs.Success(doConfig.Queue, -1, 0, 0, nil, out) } // Implement Outputer diff --git a/libbeat/outputs/elasticsearch/config.go b/libbeat/outputs/elasticsearch/config.go index 2f3a325c178a..cdd11b842454 100644 --- a/libbeat/outputs/elasticsearch/config.go +++ b/libbeat/outputs/elasticsearch/config.go @@ -21,6 +21,7 @@ import ( "fmt" "time" + "github.com/elastic/beats/v7/libbeat/common/cfgtype" "github.com/elastic/beats/v7/libbeat/common/transport/kerberos" "github.com/elastic/elastic-agent-libs/config" "github.com/elastic/elastic-agent-libs/transport/httpcommon" @@ -39,6 +40,7 @@ type elasticsearchConfig struct { EscapeHTML bool `config:"escape_html"` Kerberos *kerberos.Config `config:"kerberos"` BulkMaxSize int `config:"bulk_max_size"` + BulkMaxBytes cfgtype.ByteSize `config:"bulk_max_bytes"` MaxRetries int `config:"max_retries"` Backoff Backoff `config:"backoff"` NonIndexablePolicy *config.Namespace `config:"non_indexable_policy"` diff --git a/libbeat/outputs/elasticsearch/elasticsearch.go b/libbeat/outputs/elasticsearch/elasticsearch.go index 9bc8498afe45..a4d599ec7189 100644 --- a/libbeat/outputs/elasticsearch/elasticsearch.go +++ b/libbeat/outputs/elasticsearch/elasticsearch.go @@ -135,7 +135,14 @@ func makeES( clients[i] = client } - return outputs.SuccessNet(esConfig.Queue, esConfig.LoadBalance, esConfig.BulkMaxSize, esConfig.MaxRetries, encoderFactory, clients) + return outputs.SuccessNet( + esConfig.Queue, + esConfig.LoadBalance, + esConfig.BulkMaxSize, + int(esConfig.BulkMaxBytes), + esConfig.MaxRetries, + encoderFactory, + clients) } func buildSelectors( diff --git a/libbeat/outputs/fileout/file.go b/libbeat/outputs/fileout/file.go index d14bd99d69ad..ca405cea8e20 100644 --- a/libbeat/outputs/fileout/file.go +++ b/libbeat/outputs/fileout/file.go @@ -66,7 +66,7 @@ func makeFileout( return outputs.Fail(err) } - return outputs.Success(foConfig.Queue, -1, 0, nil, fo) + return outputs.Success(foConfig.Queue, -1, 0, 0, nil, fo) } func (out *fileOutput) init(beat beat.Info, c fileOutConfig) error { diff --git a/libbeat/outputs/kafka/kafka.go b/libbeat/outputs/kafka/kafka.go index cb23823a95a3..785a8ab136da 100644 --- a/libbeat/outputs/kafka/kafka.go +++ b/libbeat/outputs/kafka/kafka.go @@ -84,7 +84,7 @@ func makeKafka( if kConfig.MaxRetries < 0 { retry = -1 } - return outputs.Success(kConfig.Queue, kConfig.BulkMaxSize, retry, nil, client) + return outputs.Success(kConfig.Queue, kConfig.BulkMaxSize, 0, retry, nil, client) } // buildTopicSelector builds the topic selector for standalone Beat and when diff --git 
a/libbeat/outputs/logstash/logstash.go b/libbeat/outputs/logstash/logstash.go index c4c51ae54373..6a3711e4f4d5 100644 --- a/libbeat/outputs/logstash/logstash.go +++ b/libbeat/outputs/logstash/logstash.go @@ -85,5 +85,5 @@ func makeLogstash( clients[i] = client } - return outputs.SuccessNet(lsConfig.Queue, lsConfig.LoadBalance, lsConfig.BulkMaxSize, lsConfig.MaxRetries, nil, clients) + return outputs.SuccessNet(lsConfig.Queue, lsConfig.LoadBalance, lsConfig.BulkMaxSize, 0, lsConfig.MaxRetries, nil, clients) } diff --git a/libbeat/outputs/output_reg.go b/libbeat/outputs/output_reg.go index fdd8e22a6634..83b3feb73fa2 100644 --- a/libbeat/outputs/output_reg.go +++ b/libbeat/outputs/output_reg.go @@ -56,7 +56,8 @@ type IndexSelector interface { // configuration into the outputs. type Group struct { Clients []Client - BatchSize int + BatchEvents int + BatchBytes int Retry int QueueFactory queue.QueueFactory diff --git a/libbeat/outputs/redis/redis.go b/libbeat/outputs/redis/redis.go index d0cba1e70618..0bff9c9ee1a9 100644 --- a/libbeat/outputs/redis/redis.go +++ b/libbeat/outputs/redis/redis.go @@ -165,7 +165,7 @@ func makeRedis( clients[i] = newBackoffClient(client, rConfig.Backoff.Init, rConfig.Backoff.Max) } - return outputs.SuccessNet(rConfig.Queue, rConfig.LoadBalance, rConfig.BulkMaxSize, rConfig.MaxRetries, nil, clients) + return outputs.SuccessNet(rConfig.Queue, rConfig.LoadBalance, rConfig.BulkMaxSize, 0, rConfig.MaxRetries, nil, clients) } func buildKeySelector(cfg *config.C) (outil.Selector, error) { diff --git a/libbeat/outputs/util.go b/libbeat/outputs/util.go index 8b3d96fcaa5f..6157c5b1fe65 100644 --- a/libbeat/outputs/util.go +++ b/libbeat/outputs/util.go @@ -35,7 +35,7 @@ func Fail(err error) (Group, error) { return Group{}, err } // instances. The first argument is expected to contain a queue // config.Namespace. The queue config is passed to assign the queue // factory when elastic-agent reloads the output. -func Success(cfg config.Namespace, batchSize, retry int, encoderFactory queue.EncoderFactory, clients ...Client) (Group, error) { +func Success(cfg config.Namespace, batchEvents, batchBytes, retry int, encoderFactory queue.EncoderFactory, clients ...Client) (Group, error) { var q queue.QueueFactory if cfg.IsSet() && cfg.Config().Enabled() { switch cfg.Name() { @@ -60,7 +60,8 @@ func Success(cfg config.Namespace, batchSize, retry int, encoderFactory queue.En } return Group{ Clients: clients, - BatchSize: batchSize, + BatchEvents: batchEvents, + BatchBytes: batchBytes, Retry: retry, QueueFactory: q, EncoderFactory: encoderFactory, @@ -80,12 +81,12 @@ func NetworkClients(netclients []NetworkClient) []Client { // The first argument is expected to contain a queue config.Namespace. // The queue config is passed to assign the queue factory when // elastic-agent reloads the output. -func SuccessNet(cfg config.Namespace, loadbalance bool, batchSize, retry int, encoderFactory queue.EncoderFactory, netclients []NetworkClient) (Group, error) { +func SuccessNet(cfg config.Namespace, loadbalance bool, batchEvents, batchBytes, retry int, encoderFactory queue.EncoderFactory, netclients []NetworkClient) (Group, error) { if !loadbalance { - return Success(cfg, batchSize, retry, encoderFactory, NewFailoverClient(netclients)) + return Success(cfg, batchEvents, batchBytes, retry, encoderFactory, NewFailoverClient(netclients)) } clients := NetworkClients(netclients) - return Success(cfg, batchSize, retry, encoderFactory, clients...) 
+ return Success(cfg, batchEvents, batchBytes, retry, encoderFactory, clients...) } diff --git a/libbeat/publisher/pipeline/client_test.go b/libbeat/publisher/pipeline/client_test.go index b9a6f9c6f3b7..8b421883e335 100644 --- a/libbeat/publisher/pipeline/client_test.go +++ b/libbeat/publisher/pipeline/client_test.go @@ -336,8 +336,8 @@ func TestMonitoring(t *testing.T) { }) } return "output_name", outputs.Group{ - BatchSize: batchSize, - Clients: clients, + BatchEvents: batchSize, + Clients: clients, }, nil }, ) diff --git a/libbeat/publisher/pipeline/consumer.go b/libbeat/publisher/pipeline/consumer.go index 1ff8c1bc95d7..3d3c4668d5bb 100644 --- a/libbeat/publisher/pipeline/consumer.go +++ b/libbeat/publisher/pipeline/consumer.go @@ -58,10 +58,11 @@ type eventConsumer struct { // consumerTarget specifies the queue to read from, the parameters needed // to generate a batch, and the output channel to send batches to. type consumerTarget struct { - queue queue.Queue - ch chan publisher.Batch - timeToLive int - batchSize int + queue queue.Queue + ch chan publisher.Batch + timeToLive int + batchEvents int + batchBytes int } // retryRequest is used by ttlBatch to add itself back to the eventConsumer @@ -134,7 +135,8 @@ outerLoop: c.queueReader.req <- queueReaderRequest{ queue: target.queue, retryer: c, - batchSize: target.batchSize, + eventCount: target.batchEvents, + byteCount: target.batchBytes, timeToLive: target.timeToLive, } } diff --git a/libbeat/publisher/pipeline/controller.go b/libbeat/publisher/pipeline/controller.go index ea85d4891af9..2645569f6a54 100644 --- a/libbeat/publisher/pipeline/controller.go +++ b/libbeat/publisher/pipeline/controller.go @@ -170,10 +170,11 @@ func (c *outputController) Set(outGrp outputs.Group) { // Resume consumer targeting the new work queue c.consumer.setTarget( consumerTarget{ - queue: c.queue, - ch: targetChan, - batchSize: outGrp.BatchSize, - timeToLive: outGrp.Retry + 1, + queue: c.queue, + ch: targetChan, + batchEvents: outGrp.BatchEvents, + batchBytes: outGrp.BatchBytes, + timeToLive: outGrp.Retry + 1, }) } diff --git a/libbeat/publisher/pipeline/module.go b/libbeat/publisher/pipeline/module.go index 934d3c0db3da..71eef324d416 100644 --- a/libbeat/publisher/pipeline/module.go +++ b/libbeat/publisher/pipeline/module.go @@ -155,7 +155,7 @@ func loadOutput( telemetry = monitors.Telemetry.NewRegistry("output") } monitoring.NewString(telemetry, "name").Set(outName) - monitoring.NewInt(telemetry, "batch_size").Set(int64(out.BatchSize)) + monitoring.NewInt(telemetry, "batch_size").Set(int64(out.BatchEvents)) monitoring.NewInt(telemetry, "clients").Set(int64(len(out.Clients))) } diff --git a/libbeat/publisher/pipeline/queue_reader.go b/libbeat/publisher/pipeline/queue_reader.go index bc2ce894b853..b240fb771a92 100644 --- a/libbeat/publisher/pipeline/queue_reader.go +++ b/libbeat/publisher/pipeline/queue_reader.go @@ -33,7 +33,8 @@ type queueReader struct { type queueReaderRequest struct { queue queue.Queue retryer retryer - batchSize int + eventCount int + byteCount int timeToLive int } @@ -54,7 +55,7 @@ func (qr *queueReader) run(logger *logp.Logger) { logger.Debug("pipeline event consumer queue reader: stop") return } - queueBatch, _ := req.queue.Get(req.batchSize, 0) + queueBatch, _ := req.queue.Get(req.eventCount, req.byteCount) var batch *ttlBatch if queueBatch != nil { batch = newBatch(req.retryer, queueBatch, req.timeToLive) diff --git a/libbeat/publisher/pipeline/stress/out.go b/libbeat/publisher/pipeline/stress/out.go index 
03ea06d3be86..e755e3a812aa 100644 --- a/libbeat/publisher/pipeline/stress/out.go +++ b/libbeat/publisher/pipeline/stress/out.go @@ -67,7 +67,7 @@ func makeTestOutput(_ outputs.IndexManager, beat beat.Info, observer outputs.Obs clients[i] = client } - return outputs.Success(config.Queue, config.BulkMaxSize, config.Retry, nil, clients...) + return outputs.Success(config.Queue, config.BulkMaxSize, 0, config.Retry, nil, clients...) } func (*testOutput) Close() error { return nil } diff --git a/libbeat/publisher/queue/memqueue/ackloop.go b/libbeat/publisher/queue/memqueue/ackloop.go index c7d125579ce1..7069f170d4d7 100644 --- a/libbeat/publisher/queue/memqueue/ackloop.go +++ b/libbeat/publisher/queue/memqueue/ackloop.go @@ -122,7 +122,7 @@ func (l *ackLoop) processACK(lst batchList, N int) { // Traverse entries from last to first, so we can acknowledge the most recent // ones first and skip subsequent producer callbacks. for i := batch.count - 1; i >= 0; i-- { - entry := batch.mutableEntry(i) + entry := batch.entry(i) if entry.producer == nil { continue } diff --git a/libbeat/publisher/queue/memqueue/broker.go b/libbeat/publisher/queue/memqueue/broker.go index 3299f0aea90f..bd48e1a69f7a 100644 --- a/libbeat/publisher/queue/memqueue/broker.go +++ b/libbeat/publisher/queue/memqueue/broker.go @@ -47,10 +47,6 @@ type broker struct { ctx context.Context ctxCancel context.CancelFunc - // The ring buffer backing the queue. All buffer positions should be taken - // modulo the size of this array. - buf []queueEntry - // wait group for queue workers (runLoop and ackLoop) wg sync.WaitGroup @@ -96,7 +92,8 @@ type broker struct { /////////////////////////////// // internal goroutine state - // The goroutine that manages the queue's core run state + // The goroutine that manages the queue's core run state and owns its + // backing buffer. runLoop *runLoop // The goroutine that manages ack notifications and callbacks @@ -106,7 +103,8 @@ type broker struct { type Settings struct { // The number of events and bytes the queue can hold. <= zero means no limit. // At least one must be greater than zero. - Events, Bytes int + Events int + Bytes int // The most events that will ever be returned from one Get request. MaxGetRequest int @@ -124,14 +122,11 @@ type queueEntry struct { producerID producerID // The order of this entry within its producer } -type entryIndex int - -func (ei entryIndex) plus(offset int) entryIndex { - return entryIndex(int(ei) + offset) -} - type batch struct { - queue *broker + // The queue buffer (at the time that this batch was generated -- + // only the indices corresponding to this batch's events are guaranteed + // to be valid). 
+ queueBuf circularBuffer // Next batch in the containing batchList next *batch @@ -217,7 +212,7 @@ func newQueue( } // Can't request more than the full queue - if settings.MaxGetRequest > settings.Events { + if settings.Events > 0 && settings.MaxGetRequest > settings.Events { settings.MaxGetRequest = settings.Events } @@ -229,8 +224,6 @@ func newQueue( settings: settings, logger: logger, - buf: make([]queueEntry, settings.Events), - encoderFactory: encoderFactory, // broker API channels @@ -264,7 +257,7 @@ func (b *broker) QueueType() string { func (b *broker) BufferConfig() queue.BufferConfig { return queue.BufferConfig{ - MaxEvents: len(b.buf), + MaxEvents: b.settings.Events, } } @@ -305,8 +298,9 @@ func (b *broker) Metrics() (queue.Metrics, error) { resp := <-responseChan return queue.Metrics{ - EventCount: opt.UintWith(uint64(resp.currentQueueSize)), - EventLimit: opt.UintWith(uint64(len(b.buf))), + EventCount: opt.UintWith(uint64(resp.currentQueueSize)), + // hi fae, this metric is sometimes inapplicable now: + EventLimit: opt.UintWith(uint64(b.settings.Events)), UnackedConsumedEvents: opt.UintWith(uint64(resp.occupiedRead)), }, nil } @@ -319,10 +313,10 @@ var batchPool = sync.Pool{ }, } -func newBatch(queue *broker, start entryIndex, count int) *batch { +func newBatch(queueBuf circularBuffer, start entryIndex, count int) *batch { batch := batchPool.Get().(*batch) batch.next = nil - batch.queue = queue + batch.queueBuf = queueBuf batch.start = start batch.count = count return batch @@ -422,15 +416,14 @@ func (ei entryIndex) inBuffer(buf []queueEntry) *queueEntry { } // Return a pointer to the queueEntry for the i-th element of this batch -func (b *batch) mutableEntry(i int) *queueEntry { - // Indexes wrap around the end of the queue buffer +func (b *batch) entry(i int) *queueEntry { entryIndex := b.start.plus(i) - return entryIndex.inBuffer(b.queue.buf) + return b.queueBuf.entry(entryIndex) } // Return the event referenced by the i-th element of this batch func (b *batch) Entry(i int) queue.Entry { - return b.mutableEntry(i).event + return b.entry(i).event } func (b *batch) FreeEntries() { @@ -438,7 +431,7 @@ func (b *batch) FreeEntries() { // safe to free from the queue buffer, so set all the event pointers to nil. for i := 0; i < b.count; i++ { index := b.start.plus(i) - entry := index.inBuffer(b.queue.buf) + entry := b.queueBuf.entry(index) entry.event = nil } } diff --git a/libbeat/publisher/queue/memqueue/circular_buffer.go b/libbeat/publisher/queue/memqueue/circular_buffer.go new file mode 100644 index 000000000000..0b3e3ec8df56 --- /dev/null +++ b/libbeat/publisher/queue/memqueue/circular_buffer.go @@ -0,0 +1,68 @@ +// Licensed to Elasticsearch B.V. under one or more contributor +// license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Elasticsearch B.V. licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package memqueue + +// the queue's underlying array buffer needs to coordinate concurrent +// access by: +// +// runLoop +// - when a pushRequest is accepted, writes to the newly created entry index. +// - when a producer is cancelled, reads and writes to entry indices that +// have been created but not yet consumed, to discard events from that +// producer. +// - when entries are deleted (after consumed events have been +// acknowledged), reads from the deleted entry indices. +// - when a pushRequest requires resizing of the array, expands and/or +// replaces the buffer. +// +// the queue's consumer (in a live Beat this means queueReader in +// libbeat/publisher/pipeline/queue_reader.go) which reads from entry +// indices that have been consumed but not deleted via (*batch).Entry(). +// +// ackLoop, which reads producer metadata from acknowledged entry +// indices before they are deleted so acknowledgment callbacks can be +// invoked. +// +// Most of these are not in conflict since they access disjoint array indices. +// The exception is growing the circular buffer, which conflicts with read +// access from batches of consumed events. +type circularBuffer struct { + // Do not access this array directly! use (circularBuffer).entry(). + _entries []queueEntry +} + +type entryIndex int + +func newCircularBuffer(size int) circularBuffer { + return circularBuffer{ + _entries: make([]queueEntry, size), + } +} + +func (cb circularBuffer) size() int { + return len(cb._entries) +} + +func (cb circularBuffer) entry(i entryIndex) *queueEntry { + rawIndex := int(i) % len(cb._entries) + return &cb._entries[rawIndex] +} + +func (ei entryIndex) plus(offset int) entryIndex { + return entryIndex(int(ei) + offset) +} diff --git a/libbeat/publisher/queue/memqueue/config.go b/libbeat/publisher/queue/memqueue/config.go index 975da5118671..52483f7689f1 100644 --- a/libbeat/publisher/queue/memqueue/config.go +++ b/libbeat/publisher/queue/memqueue/config.go @@ -22,12 +22,13 @@ import ( "fmt" "time" + "github.com/elastic/beats/v7/libbeat/common/cfgtype" c "github.com/elastic/elastic-agent-libs/config" ) type config struct { - Events int `config:"events" validate:"min=32"` - Bytes int `config:"bytes" validate:"min=32768"` + Events *int `config:"events" validate:"min=32"` + Bytes *cfgtype.ByteSize `config:"bytes"` // This field is named MaxGetEvents because its logical effect is to give // a maximum on the number of events a Get request can return, but the @@ -38,32 +39,52 @@ type config struct { FlushTimeout time.Duration `config:"flush.timeout"` } -var defaultConfig = config{ - Events: 3200, - MaxGetEvents: 1600, - FlushTimeout: 10 * time.Second, -} +const minQueueBytes = 32768 +const minQueueEvents = 32 func (c *config) Validate() error { - if c.MaxGetEvents > c.Events { - return errors.New("flush.min_events must be less events") + if c.Bytes != nil && *c.Bytes < minQueueBytes { + return errors.New(fmt.Sprintf("queue byte size must be at least %v", minQueueBytes)) + } + if c.Events != nil && *c.Events < minQueueEvents { + return errors.New(fmt.Sprintf("queue event size must be at least %v", minQueueEvents)) + } + if c.Events == nil && c.Bytes == nil { + return errors.New("queue must have an event limit or a byte limit") + } + if c.Events != nil && c.MaxGetEvents > *c.Events { + return errors.New("flush.min_events must be less than events") } return nil } +var defaultConfig = config{ + MaxGetEvents: 1600, + FlushTimeout: 10 * time.Second, +} + // SettingsForUserConfig unpacks a ucfg config from 
a Beats queue // configuration and returns the equivalent memqueue.Settings object. func SettingsForUserConfig(cfg *c.C) (Settings, error) { - config := defaultConfig + var config config if cfg != nil { if err := cfg.Unpack(&config); err != nil { return Settings{}, fmt.Errorf("couldn't unpack memory queue config: %w", err) } } - //nolint:gosimple // Actually want this conversion to be explicit since the types aren't definitionally equal. - return Settings{ - Events: config.Events, + result := Settings{ MaxGetRequest: config.MaxGetEvents, FlushTimeout: config.FlushTimeout, - }, nil + } + if config.Events != nil { + result.Events = *config.Events + } + if config.Bytes != nil { + result.Bytes = int(*config.Bytes) + } + // If no size constraint was given, fall back on the default event cap + if config.Events == nil && config.Bytes == nil { + result.Events = 3200 + } + return result, nil } diff --git a/libbeat/publisher/queue/memqueue/queue_test.go b/libbeat/publisher/queue/memqueue/queue_test.go index 25db410dc97c..f45a6fe20fa1 100644 --- a/libbeat/publisher/queue/memqueue/queue_test.go +++ b/libbeat/publisher/queue/memqueue/queue_test.go @@ -361,17 +361,19 @@ func TestBatchFreeEntries(t *testing.T) { // Slight concurrency subtlety: we check events are non-nil after the queue // reads, since if we do it before we have no way to be sure the insert // has been completed. + queueBuf := testQueue.runLoop.buf for i := 0; i < queueSize; i++ { - require.NotNil(t, testQueue.buf[i].event, "All queue events must be non-nil") + require.NotNil(t, queueBuf.entry(entryIndex(i)).event, "All queue events must be non-nil") } batch2.FreeEntries() for i := 0; i < batchSize; i++ { - require.NotNilf(t, testQueue.buf[i].event, "Queue index %v: batch 1's events should be unaffected by calling FreeEntries on Batch 2", i) - require.Nilf(t, testQueue.buf[batchSize+i].event, "Queue index %v: batch 2's events should be nil after FreeEntries", batchSize+i) + entryIndex := entryIndex(i) + require.NotNilf(t, queueBuf.entry(entryIndex).event, "Queue index %v: batch 1's events should be unaffected by calling FreeEntries on Batch 2", i) + require.Nilf(t, queueBuf.entry(entryIndex.plus(batchSize)).event, "Queue index %v: batch 2's events should be nil after FreeEntries", batchSize+i) } batch1.FreeEntries() for i := 0; i < queueSize; i++ { - require.Nilf(t, testQueue.buf[i].event, "Queue index %v: all events should be nil after calling FreeEntries on both batches") + require.Nilf(t, queueBuf.entry(entryIndex(i)).event, "Queue index %v: all events should be nil after calling FreeEntries on both batches") } } diff --git a/libbeat/publisher/queue/memqueue/runloop.go b/libbeat/publisher/queue/memqueue/runloop.go index bf0f8dcaa6f6..829fde9556b6 100644 --- a/libbeat/publisher/queue/memqueue/runloop.go +++ b/libbeat/publisher/queue/memqueue/runloop.go @@ -29,22 +29,32 @@ import ( type runLoop struct { broker *broker - // The index of the beginning of the current ring buffer within its backing - // array. If the queue isn't empty, bufPos points to the oldest remaining - // event. - bufPos entryIndex - - // The total number of events in the queue. + // The buffer backing the queue. Don't access its internal array directly, + // use an entryIndex: buf.entry(entryIndex) returns a pointer to the target + // entry within the buffer. 
+ // Accessing this way handles the modular arithmetic to convert entry index + // to buffer index, in a way that's compatible with dynamically growing the + // underlying array (which is important when the queue has no maximum event + // count). + buf circularBuffer + + // The index of the oldest entry in the underlying circular buffer. + bufferStart entryIndex + + // The current number of events in the queue. eventCount int - // The total number of bytes in the queue + // The current number of bytes in the queue. byteCount int // The number of consumed events waiting for acknowledgment. The next Get - // request will return events starting at position - // (bufPos + consumedEventCount) % len(buf). + // request will return events starting at index + // bufferStart.plus(consumedEventCount). consumedEventCount int + // The number of event bytes in the queue corresponding to consumed events. + consumedByteCount int + // The list of batches that have been consumed and are waiting to be sent // to ackLoop for acknowledgment handling. (This list doesn't contain all // outstanding batches, only the ones not yet forwarded to ackLoop.) @@ -56,13 +66,15 @@ type runLoop struct { pendingPushRequests fifo.FIFO[pushRequest] // If there aren't enough events ready to fill an incoming get request, - // the queue may block based on its flush settings. When this happens, - // pendingGetRequest stores the request until we're ready to handle it. + // the request may block based on the queue flush settings. When this + // happens, pendingGetRequest stores the request until we can handle it. pendingGetRequest *getRequest - // This timer tracks the configured flush timeout when we will respond - // to a pending getRequest even if we can't fill the requested event count. - // It is active if and only if pendingGetRequest is non-nil. + // When a get request is blocked because the queue doesn't have enough + // events, getTimer stores the flush timer. When it expires, the queue + // will respond to the request even if the requested number of events + // and/or bytes is not available. + // getTimer is active if and only if pendingGetRequest is non-nil. getTimer *time.Timer } @@ -77,9 +89,18 @@ func newRunLoop(broker *broker) *runLoop { <-timer.C } } + + eventBufSize := broker.settings.Events + if eventBufSize <= 0 { + // The queue is using byte limits, start with a buffer of 2^10 and + // we will expand it as needed. + eventBufSize = 1 << 10 + } + return &runLoop{ broker: broker, getTimer: timer, + buf: newCircularBuffer(eventBufSize), } } @@ -89,21 +110,6 @@ func (l *runLoop) run() { } } -// Returns true if the given push request can be added to the queue -// without exceeding entry count or byte limits -func (l *runLoop) canFitPushRequest(req pushRequest) bool { - maxEvents := l.broker.settings.Events - maxBytes := l.broker.settings.Bytes - - newEventCount := l.eventCount + 1 - newByteCount := l.byteCount + req.eventSize - - eventCountFits := maxEvents <= 0 || newEventCount <= maxEvents - byteCountFits := maxBytes <= 0 || newByteCount <= maxBytes - - return eventCountFits && byteCountFits -} - // Perform one iteration of the queue's main run loop. Broken out into a // standalone helper function to allow testing of loop invariants. 
func (l *runLoop) runIteration() { @@ -132,7 +138,7 @@ func (l *runLoop) runIteration() { return case req := <-l.broker.pushChan: // producer pushing new event - l.handleInsert(req) + l.handlePushRequest(req) case req := <-l.broker.cancelChan: // producer cancelling active events l.handleCancel(&req) @@ -163,7 +169,7 @@ func (l *runLoop) handleGetRequest(req *getRequest) { // Backwards compatibility: if all byte parameters are <= 0, get requests // are capped by settings.MaxGetRequest. if req.byteCount <= 0 && l.broker.settings.Bytes <= 0 { - if req.entryCount <= 0 || req.entryCount > l.broker.settings.MaxGetRequest { + if req.entryCount > l.broker.settings.MaxGetRequest { req.entryCount = l.broker.settings.MaxGetRequest } } @@ -181,46 +187,84 @@ func (l *runLoop) getRequestShouldBlock(req *getRequest) bool { // Never block if the flush timeout isn't positive return false } - eventsAvailable := l.eventCount - l.consumedEventCount - // Block if the available events aren't enough to fill the request - return eventsAvailable < req.entryCount + availableEntries := l.eventCount - l.consumedEventCount + availableBytes := l.byteCount - l.consumedByteCount + + // The entry/byte limits are satisfied if they are <= 0 (indicating no + // limit) or if we have at least the requested number available. + entriesSatisfied := req.entryCount <= 0 || availableEntries >= req.entryCount + bytesSatisfied := req.byteCount <= 0 || availableBytes >= req.byteCount + + // Block if there are neither enough entries nor enough bytes to fill + // the request. + return !entriesSatisfied && !bytesSatisfied } // Respond to the given get request without blocking or waiting for more events func (l *runLoop) handleGetReply(req *getRequest) { - eventsAvailable := l.eventCount - l.consumedEventCount - batchSize := req.entryCount - if eventsAvailable < batchSize { - batchSize = eventsAvailable + entriesAvailable := l.eventCount - l.consumedEventCount + // backwards compatibility: if all byte bounds are <= 0 then batch size + // can't be more than settings.MaxGetRequest. + if req.byteCount <= 0 && l.broker.settings.Bytes <= 0 { + if entriesAvailable > l.broker.settings.MaxGetRequest { + entriesAvailable = l.broker.settings.MaxGetRequest + } + } + startIndex := l.bufferStart.plus(l.consumedEventCount) + batchEntryCount := 0 + batchByteCount := 0 + + for i := 0; i < entriesAvailable; i++ { + if req.entryCount > 0 && batchEntryCount+1 > req.entryCount { + // This would push us over the requested event limit, stop here. + break + } + eventSize := l.buf.entry(startIndex.plus(batchEntryCount)).eventSize + // Don't apply size checks on the first event: if a single event is + // larger than the configured batch maximum, we'll still try to send it, + // we'll just do it in a "batch" of one event. + if i > 0 && req.byteCount > 0 && batchByteCount+eventSize > req.byteCount { + // This would push us over the requested byte limit, stop here. + break + } + batchEntryCount++ + batchByteCount += eventSize } - startIndex := l.bufPos.plus(l.consumedEventCount) - batch := newBatch(l.broker, startIndex, batchSize) + batch := newBatch(l.buf, startIndex, batchEntryCount) // Send the batch to the caller and update internal state req.responseChan <- batch l.consumedBatches.append(batch) - l.consumedEventCount += batchSize + l.consumedEventCount += batchEntryCount + l.consumedByteCount += batchByteCount } -func (l *runLoop) handleDelete(count int) { +func (l *runLoop) handleDelete(deletedEntryCount int) { // Advance position and counters. 
Event data was already cleared in // batch.FreeEntries when the events were vended, so we just need to // check the byte total being removed. deletedByteCount := 0 - for i := 0; i < count; i++ { - entryIndex := l.bufPos.plus(i) - entry := entryIndex.inBuffer(l.broker.buf) - deletedByteCount += entry.eventSize + for i := 0; i < deletedEntryCount; i++ { + entryIndex := l.bufferStart.plus(i) + deletedByteCount += l.buf.entry(entryIndex).eventSize } - l.bufPos = l.bufPos.plus(count) - l.eventCount -= count - l.consumedEventCount -= count + l.bufferStart = l.bufferStart.plus(deletedEntryCount) + l.eventCount -= deletedEntryCount l.byteCount -= deletedByteCount + l.consumedEventCount -= deletedEntryCount + l.consumedByteCount -= deletedByteCount + + // We just freed up space in the queue, see if this unblocked any + // pending inserts. + l.maybeUnblockPushRequests() } -func (l *runLoop) handleInsert(req pushRequest) { - if !l.canFitPushRequest(req) { +func (l *runLoop) handlePushRequest(req pushRequest) { + // If other inserts are already pending, or we don't have enough room + // for the new entry, we need to either reject the request or block + // until we can handle it. + if !l.pendingPushRequests.Empty() || !l.canFitPushRequest(req) { if req.blockIfFull { // Add this request to the pending list to be handled when there's space. l.pendingPushRequests.Add(req) @@ -232,34 +276,51 @@ func (l *runLoop) handleInsert(req pushRequest) { return } // There is space, insert the new event and report the result. - if l.insert(req) { - // Send back the new event id. - req.resp <- true + l.doInsert(req) +} - l.eventCount++ - l.byteCount += req.eventSize +// Returns true if the given push request can be added to the queue +// without exceeding entry count or byte limits +func (l *runLoop) canFitPushRequest(req pushRequest) bool { + maxEvents := l.broker.settings.Events + maxBytes := l.broker.settings.Bytes - // See if this gave us enough for a new batch - l.maybeUnblockGetRequest() - } + newEventCount := l.eventCount + 1 + newByteCount := l.byteCount + req.eventSize + + eventCountFits := maxEvents <= 0 || newEventCount <= maxEvents + byteCountFits := maxBytes <= 0 || newByteCount <= maxBytes + + return eventCountFits && byteCountFits } // Checks if we can handle pendingGetRequest yet, and handles it if so func (l *runLoop) maybeUnblockGetRequest() { - // If a get request is blocked waiting for more events, check if - // we should unblock it. - if getRequest := l.pendingGetRequest; getRequest != nil { - available := l.eventCount - l.consumedEventCount - if available >= getRequest.entryCount { + if l.pendingGetRequest != nil { + if !l.getRequestShouldBlock(l.pendingGetRequest) { + l.handleGetReply(l.pendingGetRequest) l.pendingGetRequest = nil if !l.getTimer.Stop() { <-l.getTimer.C } - l.handleGetReply(getRequest) } } } +func (l *runLoop) maybeUnblockPushRequests() { + req, err := l.pendingPushRequests.First() + for err == nil { + if !l.canFitPushRequest(req) { + break + } + l.doInsert(req) + l.pendingPushRequests.Remove() + + // Fetch the next request + req, err = l.pendingPushRequests.First() + } +} + // growEventBuffer is called when there is no limit on the queue event // count (i.e. the queue size is byte-based) but the queue's event buffer // (a []queueEntry) is full. @@ -271,32 +332,51 @@ func (l *runLoop) maybeUnblockGetRequest() { // a queue with buffer size N, the entries stored in buf[0] will have // entry indices 0, N, 2*N, 3*N, ... 
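A worked, self-contained illustration of the index-stability property described in the comment above (simplified: strings stand in for queueEntry, and buffer sizes 4 and 8 are assumed for the example).

```go
package main

import "fmt"

func main() {
	// Four live entries with absolute indices 6..9 in a buffer of size 4:
	// they occupy slots 2, 3, 0, 1.
	old := make([]string, 4)
	live := []int{6, 7, 8, 9}
	for _, idx := range live {
		old[idx%len(old)] = fmt.Sprintf("event-%d", idx)
	}

	// Grow to size 8 by re-slotting each entry at the same absolute index.
	grown := make([]string, len(old)*2)
	for _, idx := range live {
		grown[idx%len(grown)] = old[idx%len(old)]
	}

	// The same absolute indices still resolve to the same entries.
	fmt.Println(grown[6%len(grown)], grown[9%len(grown)]) // event-6 event-9
}
```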
func (l *runLoop) growEventBuffer() { - + bufSize := l.buf.size() + newBuffer := newCircularBuffer(bufSize * 2) + // Copy the elements to the new buffer + for i := 0; i < bufSize; i++ { + index := l.bufferStart.plus(i) + *newBuffer.entry(index) = *l.buf.entry(index) + } + l.buf = newBuffer } -// Returns true if the event was inserted, false if insertion was cancelled. -func (l *runLoop) insert(req pushRequest) bool { +// Insert the given new event without bounds checks, and report the result +// to the caller via the push request's response channel. +func (l *runLoop) doInsert(req pushRequest) { // We reject events if their producer was cancelled before they reach // the queue. if req.producer != nil && req.producer.state.cancelled { - return false + // Report failure to the caller (this only happens if the producer is + // closed before we handle the insert request). + req.resp <- false + return } maxEvents := l.broker.settings.Events - if maxEvents <= 0 && l.eventCount >= len(l.broker.buf) { - // We are allowed to add this event, but we need to grow the queue buffer - // in order to do it. + // If there is no event limit, check if we need to grow the current queue + // buffer to fit the new event. + if maxEvents <= 0 && l.eventCount >= l.buf.size() { l.growEventBuffer() } - entryIndex := l.bufPos.plus(l.eventCount) - *entryIndex.inBuffer(l.broker.buf) = queueEntry{ + entryIndex := l.bufferStart.plus(l.eventCount) + *l.buf.entry(entryIndex) = queueEntry{ event: req.event, eventSize: req.eventSize, producer: req.producer, producerID: req.producerID, } - return true + + // Report success to the caller + req.resp <- true + + l.eventCount++ + l.byteCount += req.eventSize + + // See if this gave us enough for a new batch + l.maybeUnblockGetRequest() } func (l *runLoop) handleMetricsRequest(req *metricsRequest) { @@ -312,12 +392,11 @@ func (l *runLoop) handleCancel(req *producerCancelRequest) { // Traverse all unconsumed events in the buffer, removing any with // the specified producer. As we go we condense all the remaining // events to be sequential. - buf := l.broker.buf - startIndex := l.bufPos.plus(l.consumedEventCount) + startIndex := l.bufferStart.plus(l.consumedEventCount) unconsumedEventCount := l.eventCount - l.consumedEventCount for i := 0; i < unconsumedEventCount; i++ { readIndex := startIndex.plus(i) - entry := readIndex.inBuffer(buf) + entry := *l.buf.entry(readIndex) if entry.producer == req.producer { // The producer matches, skip this event removedCount++ @@ -326,9 +405,9 @@ func (l *runLoop) handleCancel(req *producerCancelRequest) { // earlier indices that were removed. // (Count backwards from (startIndex + i), not from readIndex, to avoid // sign issues when the buffer wraps.) - writeIndex := startIndex.plus(i - removedCount) - if readIndex != writeIndex { - *writeIndex.inBuffer(buf) = *readIndex.inBuffer(buf) + if removedCount > 0 { + writeIndex := readIndex.plus(-removedCount) + *l.buf.entry(writeIndex) = entry } } } @@ -336,8 +415,8 @@ func (l *runLoop) handleCancel(req *producerCancelRequest) { // Clear the event pointers at the end of the buffer so we don't keep // old events in memory by accident. 
for i := l.eventCount - removedCount; i < l.eventCount; i++ { - entryIndex := l.bufPos.plus(i) - entryIndex.inBuffer(buf).event = nil + entryIndex := l.bufferStart.plus(i) + l.buf.entry(entryIndex).event = nil } // Subtract removed events from the internal event count From 3fcb87582607b969ecbaa6ef49d58bf2fbf9b518 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Wed, 29 May 2024 11:11:17 -0400 Subject: [PATCH 73/99] remove drop on cancel option --- libbeat/publisher/pipeline/client.go | 15 +++--------- libbeat/publisher/pipeline/controller.go | 3 +-- libbeat/publisher/pipeline/pipeline_test.go | 21 +++++------------ libbeat/publisher/queue/diskqueue/producer.go | 9 ++------ libbeat/publisher/queue/memqueue/broker.go | 2 +- libbeat/publisher/queue/memqueue/produce.go | 23 ++++--------------- .../publisher/queue/memqueue/queue_test.go | 6 ++--- .../publisher/queue/memqueue/runloop_test.go | 4 ++-- libbeat/publisher/queue/queue.go | 12 +++------- .../queue/queuetest/producer_cancel.go | 3 +-- 10 files changed, 25 insertions(+), 73 deletions(-) diff --git a/libbeat/publisher/pipeline/client.go b/libbeat/publisher/pipeline/client.go index a5c02faace6d..aec498441dd8 100644 --- a/libbeat/publisher/pipeline/client.go +++ b/libbeat/publisher/pipeline/client.go @@ -146,8 +146,8 @@ func (c *client) Close() error { c.logger.Debug("client: done closing acker") c.logger.Debug("client: close queue producer") - cancelledEventCount := c.producer.Cancel() - c.onClosed(cancelledEventCount) + c.producer.Cancel() + c.onClosed() c.logger.Debug("client: done producer close") if c.processors != nil { @@ -168,16 +168,7 @@ func (c *client) onClosing() { } } -func (c *client) onClosed(cancelledEventCount int) { - c.logger.Debugf("client: cancelled %v events", cancelledEventCount) - - if c.eventWaitGroup != nil { - c.logger.Debugf("client: remove client events") - if cancelledEventCount > 0 { - c.eventWaitGroup.Add(-cancelledEventCount) - } - } - +func (c *client) onClosed() { c.observer.clientClosed() if c.clientListener != nil { c.clientListener.Closed() diff --git a/libbeat/publisher/pipeline/controller.go b/libbeat/publisher/pipeline/controller.go index bb75c9619c57..1cf6a2a4573d 100644 --- a/libbeat/publisher/pipeline/controller.go +++ b/libbeat/publisher/pipeline/controller.go @@ -303,6 +303,5 @@ func (emptyProducer) TryPublish(_ queue.Entry) (queue.EntryID, bool) { return 0, false } -func (emptyProducer) Cancel() int { - return 0 +func (emptyProducer) Cancel() { } diff --git a/libbeat/publisher/pipeline/pipeline_test.go b/libbeat/publisher/pipeline/pipeline_test.go index feb01c4fa6e0..746118972f9a 100644 --- a/libbeat/publisher/pipeline/pipeline_test.go +++ b/libbeat/publisher/pipeline/pipeline_test.go @@ -93,7 +93,6 @@ func makeDiscardQueue() queue.Queue { producer: func(cfg queue.ProducerConfig) queue.Producer { producerID.Inc() - id := producerID.Load() // count is a counter that increments on every published event // it's also the returned Event ID @@ -103,10 +102,8 @@ func makeDiscardQueue() queue.Queue { count++ return queue.EntryID(count), true }, - cancel: func() int { - + cancel: func() { wg.Done() - return id }, } @@ -125,7 +122,7 @@ type testQueue struct { type testProducer struct { publish func(try bool, event queue.Entry) (queue.EntryID, bool) - cancel func() int + cancel func() } func (q *testQueue) Metrics() (queue.Metrics, error) { @@ -178,11 +175,10 @@ func (p *testProducer) TryPublish(event queue.Entry) (queue.EntryID, bool) { return 0, false } -func (p *testProducer) Cancel() int { +func (p 
*testProducer) Cancel() { if p.cancel != nil { - return p.cancel() + p.cancel() } - return 0 } func makeTestQueue() queue.Queue { @@ -216,15 +212,11 @@ func makeTestQueue() queue.Queue { } return p.Publish(event) }, - cancel: func() int { - i := p.Cancel() - + cancel: func() { mux.Lock() defer mux.Unlock() delete(producers, producer) wg.Done() - - return i }, } @@ -248,9 +240,8 @@ func blockingProducer(_ queue.ProducerConfig) queue.Producer { return 0, false }, - cancel: func() int { + cancel: func() { close(sig) - return waiting.Load() }, } } diff --git a/libbeat/publisher/queue/diskqueue/producer.go b/libbeat/publisher/queue/diskqueue/producer.go index 69725c62ccc1..800bb3de3c7f 100644 --- a/libbeat/publisher/queue/diskqueue/producer.go +++ b/libbeat/publisher/queue/diskqueue/producer.go @@ -94,15 +94,10 @@ func (producer *diskQueueProducer) publish( } } -func (producer *diskQueueProducer) Cancel() int { +func (producer *diskQueueProducer) Cancel() { if producer.cancelled { - return 0 + return } producer.cancelled = true close(producer.done) - - // TODO (possibly?): message the core loop to remove any pending events that - // were sent through this producer. If we do, return the number of cancelled - // events here instead of zero. - return 0 } diff --git a/libbeat/publisher/queue/memqueue/broker.go b/libbeat/publisher/queue/memqueue/broker.go index a42215f48a67..d301495f2e49 100644 --- a/libbeat/publisher/queue/memqueue/broker.go +++ b/libbeat/publisher/queue/memqueue/broker.go @@ -264,7 +264,7 @@ func (b *broker) Producer(cfg queue.ProducerConfig) queue.Producer { if b.encoderFactory != nil { encoder = b.encoderFactory() } - return newProducer(b, cfg.ACK, cfg.OnDrop, cfg.DropOnCancel, encoder) + return newProducer(b, cfg.ACK, cfg.OnDrop, encoder) } func (b *broker) Get(count int) (queue.Batch, error) { diff --git a/libbeat/publisher/queue/memqueue/produce.go b/libbeat/publisher/queue/memqueue/produce.go index 55f15a8cc869..c1481321c35e 100644 --- a/libbeat/publisher/queue/memqueue/produce.go +++ b/libbeat/publisher/queue/memqueue/produce.go @@ -29,7 +29,6 @@ type forgetfulProducer struct { type ackProducer struct { broker *broker - dropOnCancel bool producedCount uint64 state produceState openState openState @@ -58,7 +57,7 @@ type produceState struct { type ackHandler func(count int) -func newProducer(b *broker, cb ackHandler, dropCB func(queue.Entry), dropOnCancel bool, encoder queue.Encoder) queue.Producer { +func newProducer(b *broker, cb ackHandler, dropCB func(queue.Entry), encoder queue.Encoder) queue.Producer { openState := openState{ log: b.logger, done: make(chan struct{}), @@ -68,7 +67,7 @@ func newProducer(b *broker, cb ackHandler, dropCB func(queue.Entry), dropOnCance } if cb != nil { - p := &ackProducer{broker: b, dropOnCancel: dropOnCancel, openState: openState} + p := &ackProducer{broker: b, openState: openState} p.state.cb = cb p.state.dropCB = dropCB return p @@ -91,9 +90,8 @@ func (p *forgetfulProducer) TryPublish(event queue.Entry) (queue.EntryID, bool) return p.openState.tryPublish(p.makePushRequest(event)) } -func (p *forgetfulProducer) Cancel() int { +func (p *forgetfulProducer) Cancel() { p.openState.Close() - return 0 } func (p *ackProducer) makePushRequest(event queue.Entry) pushRequest { @@ -123,21 +121,8 @@ func (p *ackProducer) TryPublish(event queue.Entry) (queue.EntryID, bool) { return id, published } -func (p *ackProducer) Cancel() int { +func (p *ackProducer) Cancel() { p.openState.Close() - - if p.dropOnCancel { - ch := make(chan producerCancelResponse) 
- p.broker.cancelChan <- producerCancelRequest{ - producer: p, - resp: ch, - } - - // wait for cancel to being processed - resp := <-ch - return resp.removed - } - return 0 } func (st *openState) Close() { diff --git a/libbeat/publisher/queue/memqueue/queue_test.go b/libbeat/publisher/queue/memqueue/queue_test.go index 41228046c532..3bd5a88896c5 100644 --- a/libbeat/publisher/queue/memqueue/queue_test.go +++ b/libbeat/publisher/queue/memqueue/queue_test.go @@ -96,9 +96,8 @@ func TestProducerDoesNotBlockWhenQueueClosed(t *testing.T) { p := q.Producer(queue.ProducerConfig{ // We do not read from the queue, so the callbacks are never called - ACK: func(count int) {}, - OnDrop: func(e queue.Entry) {}, - DropOnCancel: false, + ACK: func(count int) {}, + OnDrop: func(e queue.Entry) {}, }) success := atomic.Bool{} @@ -173,7 +172,6 @@ func TestProducerClosePreservesEventCount(t *testing.T) { OnDrop: func(e queue.Entry) { //activeEvents.Add(-1) }, - DropOnCancel: false, }) // Asynchronously, send 4 events to the queue. diff --git a/libbeat/publisher/queue/memqueue/runloop_test.go b/libbeat/publisher/queue/memqueue/runloop_test.go index d25537265ea3..acfd4ea56817 100644 --- a/libbeat/publisher/queue/memqueue/runloop_test.go +++ b/libbeat/publisher/queue/memqueue/runloop_test.go @@ -44,7 +44,7 @@ func TestFlushSettingsDoNotBlockFullBatches(t *testing.T) { }, 10, nil) - producer := newProducer(broker, nil, nil, false, nil) + producer := newProducer(broker, nil, nil, nil) rl := broker.runLoop for i := 0; i < 100; i++ { // Pair each publish call with an iteration of the run loop so we @@ -83,7 +83,7 @@ func TestFlushSettingsBlockPartialBatches(t *testing.T) { }, 10, nil) - producer := newProducer(broker, nil, nil, false, nil) + producer := newProducer(broker, nil, nil, nil) rl := broker.runLoop for i := 0; i < 100; i++ { // Pair each publish call with an iteration of the run loop so we diff --git a/libbeat/publisher/queue/queue.go b/libbeat/publisher/queue/queue.go index 8758c055945f..ec081cf3562e 100644 --- a/libbeat/publisher/queue/queue.go +++ b/libbeat/publisher/queue/queue.go @@ -112,10 +112,6 @@ type ProducerConfig struct { // to the memory queue's request channel but the producer is cancelled // before it reaches the queue buffer. OnDrop func(Entry) - - // DropOnCancel is a hint to the queue to drop events if the producer disconnects - // via Cancel. - DropOnCancel bool } type EntryID uint64 @@ -134,12 +130,10 @@ type Producer interface { // the event's assigned ID, and false otherwise. TryPublish(entry Entry) (EntryID, bool) - // Cancel closes this Producer endpoint. If the producer is configured to - // drop its entries on Cancel, the number of dropped entries is returned. + // Cancel closes this Producer endpoint. // Note: A queue may still send ACK signals even after Cancel is called on - // the originating Producer. The pipeline client must accept and - // discard these ACKs. - Cancel() int + // the originating Producer. The pipeline client must accept these ACKs. + Cancel() } // Batch of entries (usually publisher.Event) to be returned to Consumers. 
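For context on the ProducerConfig change above, a hypothetical wiring sketch (not part of the patch) showing producer creation after DropOnCancel was removed; the queue value is assumed to be created elsewhere.

```go
package example

import "github.com/elastic/beats/v7/libbeat/publisher/queue"

// openProducer is a hypothetical helper: ProducerConfig no longer carries
// DropOnCancel, and Cancel just closes the endpoint without reporting a count.
func openProducer(q queue.Queue) queue.Producer {
	return q.Producer(queue.ProducerConfig{
		ACK: func(acked int) {
			// runs as outputs acknowledge this producer's events; ACKs may
			// still arrive after Cancel/Close and must be tolerated
		},
		// OnDrop is still available; DropOnCancel no longer exists.
	})
}
```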
diff --git a/libbeat/publisher/queue/queuetest/producer_cancel.go b/libbeat/publisher/queue/queuetest/producer_cancel.go index 6bb8a9bdd083..1b79649ddc63 100644 --- a/libbeat/publisher/queue/queuetest/producer_cancel.go +++ b/libbeat/publisher/queue/queuetest/producer_cancel.go @@ -47,8 +47,7 @@ func TestProducerCancelRemovesEvents(t *testing.T, factory QueueFactory) { log.Debug("create first producer") producer := b.Producer(queue.ProducerConfig{ - ACK: func(int) {}, // install function pointer, so 'cancel' will remove events - DropOnCancel: true, + ACK: func(int) {}, // install function pointer, so 'cancel' will remove events }) for ; i < N1; i++ { From 47c07a649018599fc79622831e8fd84d216bf3c9 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Wed, 29 May 2024 11:12:11 -0400 Subject: [PATCH 74/99] producer.Cancel -> producer.Close --- libbeat/publisher/pipeline/client.go | 2 +- libbeat/publisher/pipeline/controller.go | 2 +- libbeat/publisher/pipeline/pipeline_test.go | 4 ++-- libbeat/publisher/queue/diskqueue/producer.go | 2 +- libbeat/publisher/queue/memqueue/produce.go | 4 ++-- libbeat/publisher/queue/memqueue/queue_test.go | 2 +- libbeat/publisher/queue/queue.go | 6 +++--- libbeat/publisher/queue/queuetest/producer_cancel.go | 2 +- 8 files changed, 12 insertions(+), 12 deletions(-) diff --git a/libbeat/publisher/pipeline/client.go b/libbeat/publisher/pipeline/client.go index aec498441dd8..7ecce6fd8c70 100644 --- a/libbeat/publisher/pipeline/client.go +++ b/libbeat/publisher/pipeline/client.go @@ -146,7 +146,7 @@ func (c *client) Close() error { c.logger.Debug("client: done closing acker") c.logger.Debug("client: close queue producer") - c.producer.Cancel() + c.producer.Close() c.onClosed() c.logger.Debug("client: done producer close") diff --git a/libbeat/publisher/pipeline/controller.go b/libbeat/publisher/pipeline/controller.go index 1cf6a2a4573d..b34d6a64d2c0 100644 --- a/libbeat/publisher/pipeline/controller.go +++ b/libbeat/publisher/pipeline/controller.go @@ -303,5 +303,5 @@ func (emptyProducer) TryPublish(_ queue.Entry) (queue.EntryID, bool) { return 0, false } -func (emptyProducer) Cancel() { +func (emptyProducer) Close() { } diff --git a/libbeat/publisher/pipeline/pipeline_test.go b/libbeat/publisher/pipeline/pipeline_test.go index 746118972f9a..78725b043f1a 100644 --- a/libbeat/publisher/pipeline/pipeline_test.go +++ b/libbeat/publisher/pipeline/pipeline_test.go @@ -175,7 +175,7 @@ func (p *testProducer) TryPublish(event queue.Entry) (queue.EntryID, bool) { return 0, false } -func (p *testProducer) Cancel() { +func (p *testProducer) Close() { if p.cancel != nil { p.cancel() } @@ -190,7 +190,7 @@ func makeTestQueue() queue.Queue { close: func() error { mux.Lock() for producer := range producers { - producer.Cancel() + producer.Close() } mux.Unlock() diff --git a/libbeat/publisher/queue/diskqueue/producer.go b/libbeat/publisher/queue/diskqueue/producer.go index 800bb3de3c7f..7d084adf5ea4 100644 --- a/libbeat/publisher/queue/diskqueue/producer.go +++ b/libbeat/publisher/queue/diskqueue/producer.go @@ -94,7 +94,7 @@ func (producer *diskQueueProducer) publish( } } -func (producer *diskQueueProducer) Cancel() { +func (producer *diskQueueProducer) Close() { if producer.cancelled { return } diff --git a/libbeat/publisher/queue/memqueue/produce.go b/libbeat/publisher/queue/memqueue/produce.go index c1481321c35e..5a1a8ebca6b2 100644 --- a/libbeat/publisher/queue/memqueue/produce.go +++ b/libbeat/publisher/queue/memqueue/produce.go @@ -90,7 +90,7 @@ func (p *forgetfulProducer) 
TryPublish(event queue.Entry) (queue.EntryID, bool) return p.openState.tryPublish(p.makePushRequest(event)) } -func (p *forgetfulProducer) Cancel() { +func (p *forgetfulProducer) Close() { p.openState.Close() } @@ -121,7 +121,7 @@ func (p *ackProducer) TryPublish(event queue.Entry) (queue.EntryID, bool) { return id, published } -func (p *ackProducer) Cancel() { +func (p *ackProducer) Close() { p.openState.Close() } diff --git a/libbeat/publisher/queue/memqueue/queue_test.go b/libbeat/publisher/queue/memqueue/queue_test.go index 3bd5a88896c5..f4aa30172f1c 100644 --- a/libbeat/publisher/queue/memqueue/queue_test.go +++ b/libbeat/publisher/queue/memqueue/queue_test.go @@ -207,7 +207,7 @@ func TestProducerClosePreservesEventCount(t *testing.T) { // Cancel the producer, then read and acknowledge two batches. If the // Publish calls and the queue code are working, activeEvents should // _usually_ end up as 0, but _always_ end up non-negative. - p.Cancel() + p.Close() // The queue reads also need to be done in a goroutine, in case the // producer cancellation signal went through before the Publish diff --git a/libbeat/publisher/queue/queue.go b/libbeat/publisher/queue/queue.go index ec081cf3562e..98216c36e435 100644 --- a/libbeat/publisher/queue/queue.go +++ b/libbeat/publisher/queue/queue.go @@ -130,10 +130,10 @@ type Producer interface { // the event's assigned ID, and false otherwise. TryPublish(entry Entry) (EntryID, bool) - // Cancel closes this Producer endpoint. - // Note: A queue may still send ACK signals even after Cancel is called on + // Close closes this Producer endpoint. + // Note: A queue may still send ACK signals even after Close is called on // the originating Producer. The pipeline client must accept these ACKs. - Cancel() + Close() } // Batch of entries (usually publisher.Event) to be returned to Consumers. 
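For context, the producer contract that the queue.go hunks above converge on is a close-only lifecycle: Close releases the endpoint without reporting a count, and ACK callbacks registered at Producer() time may still fire after Close, so callers simply keep accepting them. The following caller-side sketch is illustrative only and is not part of any patch in this series; the example package, publishAndClose, and its imports are invented for illustration and only use the interface shown in the diffs above.

package example

import (
	"sync/atomic"

	"github.com/elastic/beats/v7/libbeat/publisher/queue"
)

// publishAndClose publishes a single event, closes its producer, and
// tolerates ACK callbacks that arrive after Close.
func publishAndClose(q queue.Queue, event queue.Entry) {
	var acked atomic.Int64

	producer := q.Producer(queue.ProducerConfig{
		// ACK may still fire after Close below; the caller just keeps
		// counting these late acknowledgments instead of rejecting them.
		ACK: func(count int) { acked.Add(int64(count)) },
	})

	if _, ok := producer.Publish(event); !ok {
		// The queue is shutting down; the event was not enqueued.
		return
	}

	// Close is a plain void call here: producers no longer drop or count
	// pending events on shutdown, so there is nothing to return.
	producer.Close()
}
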
diff --git a/libbeat/publisher/queue/queuetest/producer_cancel.go b/libbeat/publisher/queue/queuetest/producer_cancel.go index 1b79649ddc63..491d062fd147 100644 --- a/libbeat/publisher/queue/queuetest/producer_cancel.go +++ b/libbeat/publisher/queue/queuetest/producer_cancel.go @@ -59,7 +59,7 @@ func TestProducerCancelRemovesEvents(t *testing.T, factory QueueFactory) { // cancel producer log.Debugf("cancel producer") - producer.Cancel() + producer.Close() // reconnect and send some more events log.Debug("connect new producer") From f9d4c39a778f67fa4457f235306bd6ba72271b43 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Wed, 29 May 2024 11:33:02 -0400 Subject: [PATCH 75/99] remove OnDrop callbacks --- libbeat/publisher/pipeline/pipeline.go | 12 ------- libbeat/publisher/queue/memqueue/broker.go | 2 +- libbeat/publisher/queue/memqueue/produce.go | 9 ++---- .../publisher/queue/memqueue/queue_test.go | 6 +--- libbeat/publisher/queue/memqueue/runloop.go | 31 +++++-------------- .../publisher/queue/memqueue/runloop_test.go | 4 +-- libbeat/publisher/queue/queue.go | 6 ---- 7 files changed, 15 insertions(+), 55 deletions(-) diff --git a/libbeat/publisher/pipeline/pipeline.go b/libbeat/publisher/pipeline/pipeline.go index 85eeb0e64977..3f928709a708 100644 --- a/libbeat/publisher/pipeline/pipeline.go +++ b/libbeat/publisher/pipeline/pipeline.go @@ -250,18 +250,6 @@ func (p *Pipeline) ConnectWith(cfg beat.ClientConfig) (beat.Client, error) { producerCfg := queue.ProducerConfig{} - if client.eventWaitGroup != nil || cfg.ClientListener != nil { - producerCfg.OnDrop = func(event queue.Entry) { - publisherEvent, _ := event.(publisher.Event) - if cfg.ClientListener != nil { - cfg.ClientListener.DroppedOnPublish(publisherEvent.Content) - } - if client.eventWaitGroup != nil { - client.eventWaitGroup.Add(-1) - } - } - } - var waiter *clientCloseWaiter if waitClose > 0 { waiter = newClientCloseWaiter(waitClose) diff --git a/libbeat/publisher/queue/memqueue/broker.go b/libbeat/publisher/queue/memqueue/broker.go index d301495f2e49..66ee6fd3b236 100644 --- a/libbeat/publisher/queue/memqueue/broker.go +++ b/libbeat/publisher/queue/memqueue/broker.go @@ -264,7 +264,7 @@ func (b *broker) Producer(cfg queue.ProducerConfig) queue.Producer { if b.encoderFactory != nil { encoder = b.encoderFactory() } - return newProducer(b, cfg.ACK, cfg.OnDrop, encoder) + return newProducer(b, cfg.ACK, encoder) } func (b *broker) Get(count int) (queue.Batch, error) { diff --git a/libbeat/publisher/queue/memqueue/produce.go b/libbeat/publisher/queue/memqueue/produce.go index 5a1a8ebca6b2..a206e357aacb 100644 --- a/libbeat/publisher/queue/memqueue/produce.go +++ b/libbeat/publisher/queue/memqueue/produce.go @@ -49,15 +49,13 @@ type openState struct { type producerID uint64 type produceState struct { - cb ackHandler - dropCB func(queue.Entry) - cancelled bool - lastACK producerID + cb ackHandler + lastACK producerID } type ackHandler func(count int) -func newProducer(b *broker, cb ackHandler, dropCB func(queue.Entry), encoder queue.Encoder) queue.Producer { +func newProducer(b *broker, cb ackHandler, encoder queue.Encoder) queue.Producer { openState := openState{ log: b.logger, done: make(chan struct{}), @@ -69,7 +67,6 @@ func newProducer(b *broker, cb ackHandler, dropCB func(queue.Entry), encoder que if cb != nil { p := &ackProducer{broker: b, openState: openState} p.state.cb = cb - p.state.dropCB = dropCB return p } return &forgetfulProducer{broker: b, openState: openState} diff --git a/libbeat/publisher/queue/memqueue/queue_test.go 
b/libbeat/publisher/queue/memqueue/queue_test.go index f4aa30172f1c..af36736cce47 100644 --- a/libbeat/publisher/queue/memqueue/queue_test.go +++ b/libbeat/publisher/queue/memqueue/queue_test.go @@ -96,8 +96,7 @@ func TestProducerDoesNotBlockWhenQueueClosed(t *testing.T) { p := q.Producer(queue.ProducerConfig{ // We do not read from the queue, so the callbacks are never called - ACK: func(count int) {}, - OnDrop: func(e queue.Entry) {}, + ACK: func(count int) {}, }) success := atomic.Bool{} @@ -169,9 +168,6 @@ func TestProducerClosePreservesEventCount(t *testing.T) { ACK: func(count int) { activeEvents.Add(-int64(count)) }, - OnDrop: func(e queue.Entry) { - //activeEvents.Add(-1) - }, }) // Asynchronously, send 4 events to the queue. diff --git a/libbeat/publisher/queue/memqueue/runloop.go b/libbeat/publisher/queue/memqueue/runloop.go index 45ae3c0a1a2b..b15bdba9cc19 100644 --- a/libbeat/publisher/queue/memqueue/runloop.go +++ b/libbeat/publisher/queue/memqueue/runloop.go @@ -195,16 +195,15 @@ func (l *runLoop) handleDelete(count int) { } func (l *runLoop) handleInsert(req *pushRequest) { - if l.insert(req, l.nextEntryID) { - // Send back the new event id. - req.resp <- l.nextEntryID + l.insert(req, l.nextEntryID) + // Send back the new event id. + req.resp <- l.nextEntryID - l.nextEntryID++ - l.eventCount++ + l.nextEntryID++ + l.eventCount++ - // See if this gave us enough for a new batch - l.maybeUnblockGetRequest() - } + // See if this gave us enough for a new batch + l.maybeUnblockGetRequest() } // Checks if we can handle pendingGetRequest yet, and handles it if so @@ -223,13 +222,7 @@ func (l *runLoop) maybeUnblockGetRequest() { } } -// Returns true if the event was inserted, false if insertion was cancelled. -func (l *runLoop) insert(req *pushRequest, id queue.EntryID) bool { - if req.producer != nil && req.producer.state.cancelled { - reportCancelledState(req) - return false - } - +func (l *runLoop) insert(req *pushRequest, id queue.EntryID) { index := (l.bufPos + l.eventCount) % len(l.broker.buf) l.broker.buf[index] = queueEntry{ event: req.event, @@ -237,7 +230,6 @@ func (l *runLoop) insert(req *pushRequest, id queue.EntryID) bool { producer: req.producer, producerID: req.producerID, } - return true } func (l *runLoop) handleMetricsRequest(req *metricsRequest) { @@ -293,10 +285,3 @@ func (l *runLoop) handleCancel(req *producerCancelRequest) { req.resp <- producerCancelResponse{removed: removedCount} } } - -func reportCancelledState(req *pushRequest) { - // do not add waiting events if producer did send cancel signal - if cb := req.producer.state.dropCB; cb != nil { - cb(req.event) - } -} diff --git a/libbeat/publisher/queue/memqueue/runloop_test.go b/libbeat/publisher/queue/memqueue/runloop_test.go index acfd4ea56817..266704fc1fde 100644 --- a/libbeat/publisher/queue/memqueue/runloop_test.go +++ b/libbeat/publisher/queue/memqueue/runloop_test.go @@ -44,7 +44,7 @@ func TestFlushSettingsDoNotBlockFullBatches(t *testing.T) { }, 10, nil) - producer := newProducer(broker, nil, nil, nil) + producer := newProducer(broker, nil, nil) rl := broker.runLoop for i := 0; i < 100; i++ { // Pair each publish call with an iteration of the run loop so we @@ -83,7 +83,7 @@ func TestFlushSettingsBlockPartialBatches(t *testing.T) { }, 10, nil) - producer := newProducer(broker, nil, nil, nil) + producer := newProducer(broker, nil, nil) rl := broker.runLoop for i := 0; i < 100; i++ { // Pair each publish call with an iteration of the run loop so we diff --git a/libbeat/publisher/queue/queue.go 
b/libbeat/publisher/queue/queue.go index 98216c36e435..9c186ad30d0d 100644 --- a/libbeat/publisher/queue/queue.go +++ b/libbeat/publisher/queue/queue.go @@ -106,12 +106,6 @@ type ProducerConfig struct { // if ACK is set, the callback will be called with number of events produced // by the producer instance and being ACKed by the queue. ACK func(count int) - - // OnDrop is called to report events being silently dropped by - // the queue. Currently this can only happen when a Publish call is sent - // to the memory queue's request channel but the producer is cancelled - // before it reaches the queue buffer. - OnDrop func(Entry) } type EntryID uint64 From d398b46746a37c463bd49ea266cad857647c526c Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Wed, 29 May 2024 11:40:21 -0400 Subject: [PATCH 76/99] remove internal cancellation helpers --- libbeat/publisher/queue/memqueue/broker.go | 5 --- .../publisher/queue/memqueue/internal_api.go | 9 ---- libbeat/publisher/queue/memqueue/runloop.go | 43 ------------------- 3 files changed, 57 deletions(-) diff --git a/libbeat/publisher/queue/memqueue/broker.go b/libbeat/publisher/queue/memqueue/broker.go index 66ee6fd3b236..d9aff10bd3ac 100644 --- a/libbeat/publisher/queue/memqueue/broker.go +++ b/libbeat/publisher/queue/memqueue/broker.go @@ -66,10 +66,6 @@ type broker struct { // Consumers send requests to getChan to read events from the queue. getChan chan getRequest - // Producers send requests to cancelChan to cancel events they've - // sent so far that have not yet reached a consumer. - cancelChan chan producerCancelRequest - // Metrics() sends requests to metricChan to expose internal queue // metrics to external callers. metricChan chan metricsRequest @@ -224,7 +220,6 @@ func newQueue( // broker API channels pushChan: make(chan pushRequest, chanSize), getChan: make(chan getRequest), - cancelChan: make(chan producerCancelRequest, 5), metricChan: make(chan metricsRequest), // internal runLoop and ackLoop channels diff --git a/libbeat/publisher/queue/memqueue/internal_api.go b/libbeat/publisher/queue/memqueue/internal_api.go index 95b5e0eba90f..6575472edbd0 100644 --- a/libbeat/publisher/queue/memqueue/internal_api.go +++ b/libbeat/publisher/queue/memqueue/internal_api.go @@ -38,15 +38,6 @@ type pushRequest struct { resp chan queue.EntryID } -type producerCancelRequest struct { - producer *ackProducer - resp chan producerCancelResponse -} - -type producerCancelResponse struct { - removed int -} - // consumer -> broker API type getRequest struct { diff --git a/libbeat/publisher/queue/memqueue/runloop.go b/libbeat/publisher/queue/memqueue/runloop.go index b15bdba9cc19..ed14106f20c9 100644 --- a/libbeat/publisher/queue/memqueue/runloop.go +++ b/libbeat/publisher/queue/memqueue/runloop.go @@ -122,9 +122,6 @@ func (l *runLoop) runIteration() { case req := <-pushChan: // producer pushing new event l.handleInsert(&req) - case req := <-l.broker.cancelChan: // producer cancelling active events - l.handleCancel(&req) - case req := <-getChan: // consumer asking for next batch l.handleGetRequest(&req) @@ -245,43 +242,3 @@ func (l *runLoop) handleMetricsRequest(req *metricsRequest) { oldestEntryID: oldestEntryID, } } - -func (l *runLoop) handleCancel(req *producerCancelRequest) { - var removedCount int - - // Traverse all unconsumed events in the buffer, removing any with - // the specified producer. As we go we condense all the remaining - // events to be sequential. 
- buf := l.broker.buf - startIndex := l.bufPos + l.consumedCount - unconsumedEventCount := l.eventCount - l.consumedCount - for i := 0; i < unconsumedEventCount; i++ { - readIndex := (startIndex + i) % len(buf) - if buf[readIndex].producer == req.producer { - // The producer matches, skip this event - removedCount++ - } else { - // Move the event to its final position after accounting for any - // earlier indices that were removed. - // (Count backwards from (startIndex + i), not from readIndex, to avoid - // sign issues when the buffer wraps.) - writeIndex := (startIndex + i - removedCount) % len(buf) - buf[writeIndex] = buf[readIndex] - } - } - - // Clear the event pointers at the end of the buffer so we don't keep - // old events in memory by accident. - for i := 0; i < removedCount; i++ { - index := (l.bufPos + l.eventCount - removedCount + i) % len(buf) - buf[index].event = nil - } - - // Subtract removed events from the internal event count - l.eventCount -= removedCount - - // signal cancel request being finished - if req.resp != nil { - req.resp <- producerCancelResponse{removed: removedCount} - } -} From bad2498b0d7a91b9b09ce40cb215c4b3200c03f4 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Wed, 29 May 2024 12:20:22 -0400 Subject: [PATCH 77/99] remove the queue's shipper metrics hook --- libbeat/publisher/pipeline/pipeline_test.go | 4 - .../publisher/queue/diskqueue/core_loop.go | 11 - libbeat/publisher/queue/diskqueue/queue.go | 42 ---- .../publisher/queue/diskqueue/queue_test.go | 41 ---- libbeat/publisher/queue/memqueue/broker.go | 20 -- .../publisher/queue/memqueue/queue_test.go | 228 ------------------ libbeat/publisher/queue/queue.go | 31 --- 7 files changed, 377 deletions(-) diff --git a/libbeat/publisher/pipeline/pipeline_test.go b/libbeat/publisher/pipeline/pipeline_test.go index 78725b043f1a..015b24af5b46 100644 --- a/libbeat/publisher/pipeline/pipeline_test.go +++ b/libbeat/publisher/pipeline/pipeline_test.go @@ -125,10 +125,6 @@ type testProducer struct { cancel func() } -func (q *testQueue) Metrics() (queue.Metrics, error) { - return queue.Metrics{}, nil -} - func (q *testQueue) Close() error { if q.close != nil { return q.close() diff --git a/libbeat/publisher/queue/diskqueue/core_loop.go b/libbeat/publisher/queue/diskqueue/core_loop.go index 93051dd4581e..c08c204d51ed 100644 --- a/libbeat/publisher/queue/diskqueue/core_loop.go +++ b/libbeat/publisher/queue/diskqueue/core_loop.go @@ -84,21 +84,10 @@ func (dq *diskQueue) run() { // If there were blocked producers waiting for more queue space, // we might be able to unblock them now. 
dq.maybeUnblockProducers() - - case metricsReq := <-dq.metricsRequestChan: - dq.handleMetricsRequest(metricsReq) } } } -// handleMetricsRequest responds to an event on the metricsRequestChan chan -func (dq *diskQueue) handleMetricsRequest(request metricsRequest) { - resp := metricsRequestResponse{ - sizeOnDisk: dq.segments.sizeOnDisk(), - } - request.response <- resp -} - func (dq *diskQueue) handleProducerWriteRequest(request producerWriteRequest) { // Pathological case checking: make sure the incoming frame isn't bigger // than an entire segment all by itself (as long as it isn't, it is diff --git a/libbeat/publisher/queue/diskqueue/queue.go b/libbeat/publisher/queue/diskqueue/queue.go index 4fedcfa6a6e6..32a4bd220a36 100644 --- a/libbeat/publisher/queue/diskqueue/queue.go +++ b/libbeat/publisher/queue/diskqueue/queue.go @@ -20,13 +20,11 @@ package diskqueue import ( "errors" "fmt" - "io" "os" "sync" "github.com/elastic/beats/v7/libbeat/publisher/queue" "github.com/elastic/elastic-agent-libs/logp" - "github.com/elastic/elastic-agent-libs/opt" ) // The string used to specify this queue in beats configurations. @@ -74,9 +72,6 @@ type diskQueue struct { // The API channel used by diskQueueProducer to write events. producerWriteRequestChan chan producerWriteRequest - // API channel used by the public Metrics() API to request queue metrics - metricsRequestChan chan metricsRequest - // pendingFrames is a list of all incoming data frames that have been // accepted by the queue and are waiting to be sent to the writer loop. // Segment ids in this list always appear in sorted order, even between @@ -92,16 +87,6 @@ type diskQueue struct { done chan struct{} } -// channel request for metrics from an external client -type metricsRequest struct { - response chan metricsRequestResponse -} - -// metrics response from the disk queue -type metricsRequestResponse struct { - sizeOnDisk uint64 -} - // FactoryForSettings is a simple wrapper around NewQueue so a concrete // Settings object can be wrapped in a queue-agnostic interface for // later use by the pipeline. 
@@ -237,7 +222,6 @@ func NewQueue( deleterLoop: newDeleterLoop(settings), producerWriteRequestChan: make(chan producerWriteRequest), - metricsRequestChan: make(chan metricsRequest), done: make(chan struct{}), } @@ -296,29 +280,3 @@ func (dq *diskQueue) Producer(cfg queue.ProducerConfig) queue.Producer { done: make(chan struct{}), } } - -// Metrics returns current disk metrics -func (dq *diskQueue) Metrics() (queue.Metrics, error) { - respChan := make(chan metricsRequestResponse, 1) - req := metricsRequest{response: respChan} - - select { - case <-dq.done: - return queue.Metrics{}, io.EOF - case dq.metricsRequestChan <- req: - - } - - resp := metricsRequestResponse{} - select { - case <-dq.done: - return queue.Metrics{}, io.EOF - case resp = <-respChan: - } - - maxSize := dq.settings.MaxBufferSize - return queue.Metrics{ - ByteLimit: opt.UintWith(maxSize), - ByteCount: opt.UintWith(resp.sizeOnDisk), - }, nil -} diff --git a/libbeat/publisher/queue/diskqueue/queue_test.go b/libbeat/publisher/queue/diskqueue/queue_test.go index f6a4c406ed32..30c770e45a48 100644 --- a/libbeat/publisher/queue/diskqueue/queue_test.go +++ b/libbeat/publisher/queue/diskqueue/queue_test.go @@ -28,9 +28,6 @@ import ( "github.com/elastic/beats/v7/libbeat/publisher/queue" "github.com/elastic/beats/v7/libbeat/publisher/queue/queuetest" "github.com/elastic/elastic-agent-libs/logp" - "github.com/elastic/elastic-agent-libs/mapstr" - - "github.com/stretchr/testify/require" ) var seed int64 @@ -78,44 +75,6 @@ func TestProduceConsumer(t *testing.T) { t.Run("direct", testWith(makeTestQueue())) } -func TestMetrics(t *testing.T) { - dir, err := ioutil.TempDir("", "diskqueue_metrics") - defer func() { - _ = os.RemoveAll(dir) - }() - require.NoError(t, err) - settings := DefaultSettings() - settings.Path = dir - // lower max segment size so we can get multiple segments - settings.MaxSegmentSize = 100 - - testQueue, err := NewQueue(logp.L(), nil, settings, nil) - require.NoError(t, err) - defer testQueue.Close() - - eventsToTest := 100 - - // Send events to queue - producer := testQueue.Producer(queue.ProducerConfig{}) - sendEventsToQueue(eventsToTest, producer) - - // fetch metrics before we read any events - time.Sleep(time.Millisecond * 500) - testMetrics, err := testQueue.Metrics() - require.NoError(t, err) - - require.Equal(t, testMetrics.ByteLimit.ValueOr(0), uint64((1 << 30))) - require.NotZero(t, testMetrics.ByteCount.ValueOr(0)) - t.Logf("got %d bytes written", testMetrics.ByteCount.ValueOr(0)) - -} - -func sendEventsToQueue(count int, prod queue.Producer) { - for i := 0; i < count; i++ { - prod.Publish(queuetest.MakeEvent(mapstr.M{"count": i})) - } -} - func makeTestQueue() queuetest.QueueFactory { return func(t *testing.T) queue.Queue { dir, err := ioutil.TempDir("", "diskqueue_test") diff --git a/libbeat/publisher/queue/memqueue/broker.go b/libbeat/publisher/queue/memqueue/broker.go index d9aff10bd3ac..7a2114869f38 100644 --- a/libbeat/publisher/queue/memqueue/broker.go +++ b/libbeat/publisher/queue/memqueue/broker.go @@ -25,7 +25,6 @@ import ( "github.com/elastic/beats/v7/libbeat/publisher/queue" "github.com/elastic/elastic-agent-libs/logp" - "github.com/elastic/elastic-agent-libs/opt" ) // The string used to specify this queue in beats configurations. 
@@ -276,25 +275,6 @@ func (b *broker) Get(count int) (queue.Batch, error) { return resp, nil } -func (b *broker) Metrics() (queue.Metrics, error) { - - responseChan := make(chan memQueueMetrics, 1) - select { - case <-b.ctx.Done(): - return queue.Metrics{}, io.EOF - case b.metricChan <- metricsRequest{ - responseChan: responseChan}: - } - resp := <-responseChan - - return queue.Metrics{ - EventCount: opt.UintWith(uint64(resp.currentQueueSize)), - EventLimit: opt.UintWith(uint64(len(b.buf))), - UnackedConsumedEvents: opt.UintWith(uint64(resp.occupiedRead)), - OldestEntryID: resp.oldestEntryID, - }, nil -} - var batchPool = sync.Pool{ New: func() interface{} { return &batch{ diff --git a/libbeat/publisher/queue/memqueue/queue_test.go b/libbeat/publisher/queue/memqueue/queue_test.go index af36736cce47..9cd209bbd51e 100644 --- a/libbeat/publisher/queue/memqueue/queue_test.go +++ b/libbeat/publisher/queue/memqueue/queue_test.go @@ -32,7 +32,6 @@ import ( "github.com/elastic/beats/v7/libbeat/publisher/queue" "github.com/elastic/beats/v7/libbeat/publisher/queue/queuetest" - "github.com/elastic/elastic-agent-libs/mapstr" ) var seed int64 @@ -228,73 +227,6 @@ func TestProducerClosePreservesEventCount(t *testing.T) { assert.False(t, activeEvents.Load() < 0, "active event count should never be negative") } -func TestQueueMetricsDirect(t *testing.T) { - eventsToTest := 5 - maxEvents := 10 - - // Test the directEventLoop - directSettings := Settings{ - Events: maxEvents, - MaxGetRequest: 1, - FlushTimeout: 0, - } - t.Logf("Testing directEventLoop") - queueTestWithSettings(t, directSettings, eventsToTest, "directEventLoop") - -} - -func TestQueueMetricsBuffer(t *testing.T) { - eventsToTest := 5 - maxEvents := 10 - // Test Buffered Event Loop - bufferedSettings := Settings{ - Events: maxEvents, - MaxGetRequest: eventsToTest, // The buffered event loop can only return FlushMinEvents per Get() - FlushTimeout: time.Millisecond, - } - t.Logf("Testing bufferedEventLoop") - queueTestWithSettings(t, bufferedSettings, eventsToTest, "bufferedEventLoop") -} - -func queueTestWithSettings(t *testing.T, settings Settings, eventsToTest int, testName string) { - testQueue := NewQueue(nil, nil, settings, 0, nil) - defer testQueue.Close() - - // Send events to queue - producer := testQueue.Producer(queue.ProducerConfig{}) - for i := 0; i < eventsToTest; i++ { - producer.Publish(queuetest.MakeEvent(mapstr.M{"count": i})) - } - queueMetricsAreValid(t, testQueue, 5, settings.Events, 0, fmt.Sprintf("%s - First send of metrics to queue", testName)) - - // Read events, don't yet ack them - batch, err := testQueue.Get(eventsToTest) - assert.NoError(t, err, "error in Get") - t.Logf("Got batch of %d events", batch.Count()) - - queueMetricsAreValid(t, testQueue, 5, settings.Events, 5, fmt.Sprintf("%s - Producer Getting events, no ACK", testName)) - - // Test metrics after ack - batch.Done() - - queueMetricsAreValid(t, testQueue, 0, settings.Events, 0, fmt.Sprintf("%s - Producer Getting events, no ACK", testName)) - -} - -func queueMetricsAreValid(t *testing.T, q queue.Queue, evtCount, evtLimit, occupied int, test string) { - // wait briefly to avoid races across all the queue channels - time.Sleep(time.Millisecond * 100) - testMetrics, err := q.Metrics() - assert.NoError(t, err, "error calling metrics for test %s", test) - assert.Equal(t, testMetrics.EventCount.ValueOr(0), uint64(evtCount), "incorrect EventCount for %s", test) - assert.Equal(t, testMetrics.EventLimit.ValueOr(0), uint64(evtLimit), "incorrect 
EventLimit for %s", test) - assert.Equal(t, testMetrics.UnackedConsumedEvents.ValueOr(0), uint64(occupied), "incorrect OccupiedRead for %s", test) -} - -func TestProducerCancelRemovesEvents(t *testing.T) { - queuetest.TestProducerCancelRemovesEvents(t, makeTestQueue(1024, 0, 0)) -} - func makeTestQueue(sz, minEvents int, flushTimeout time.Duration) queuetest.QueueFactory { return func(_ *testing.T) queue.Queue { return NewQueue(nil, nil, Settings{ @@ -330,163 +262,3 @@ func TestAdjustInputQueueSize(t *testing.T) { assert.Equal(t, int(float64(mainQueue)*maxInputQueueSizeRatio), AdjustInputQueueSize(mainQueue, mainQueue)) }) } - -func TestEntryIDs(t *testing.T) { - entryCount := 100 - - testForward := func(q queue.Queue) { - waiter := &producerACKWaiter{} - producer := q.Producer(queue.ProducerConfig{ACK: waiter.ack}) - for i := 0; i < entryCount; i++ { - id, success := producer.Publish(nil) - assert.Equal(t, success, true, "Queue publish should succeed") - assert.Equal(t, id, queue.EntryID(i), "Entry ID should match publication order") - } - - for i := 0; i < entryCount; i++ { - batch, err := q.Get(1) - assert.NoError(t, err, "Queue read should succeed") - assert.Equal(t, batch.Count(), 1, "Returned batch should have 1 entry") - - metrics, err := q.Metrics() - assert.NoError(t, err, "Queue metrics call should succeed") - assert.Equal(t, metrics.OldestEntryID, queue.EntryID(i), - fmt.Sprintf("Oldest entry ID before ACKing event %v should be %v", i, i)) - - batch.Done() - waiter.waitForEvents(1) - metrics, err = q.Metrics() - assert.NoError(t, err, "Queue metrics call should succeed") - assert.Equal(t, metrics.OldestEntryID, queue.EntryID(i+1), - fmt.Sprintf("Oldest entry ID after ACKing event %v should be %v", i, i+1)) - - } - } - - testBackward := func(q queue.Queue) { - waiter := &producerACKWaiter{} - producer := q.Producer(queue.ProducerConfig{ACK: waiter.ack}) - for i := 0; i < entryCount; i++ { - id, success := producer.Publish(nil) - assert.Equal(t, success, true, "Queue publish should succeed") - assert.Equal(t, id, queue.EntryID(i), "Entry ID should match publication order") - } - - batches := []queue.Batch{} - - for i := 0; i < entryCount; i++ { - batch, err := q.Get(1) - assert.NoError(t, err, "Queue read should succeed") - assert.Equal(t, batch.Count(), 1, "Returned batch should have 1 entry") - batches = append(batches, batch) - } - - for i := entryCount - 1; i > 0; i-- { - batches[i].Done() - - // It's hard to remove this delay since the Done signal is propagated - // asynchronously to the queue, and since this test is ensuring that the - // queue _doesn't_ advance we can't use a callback to gate the comparison - // like we do in testForward. However: - // - While this race condition could sometimes let a buggy implementation - // pass, it will not produce a false failure (so it won't contribute - // to general test flakiness) - // - That notwithstanding, when the ACK _does_ cause an incorrect - // metrics update, this delay is enough to recognize it approximately - // 100% of the time, so this test is still a good signal despite - // the slight nondeterminism. 
- time.Sleep(1 * time.Millisecond) - metrics, err := q.Metrics() - assert.NoError(t, err, "Queue metrics call should succeed") - assert.Equal(t, metrics.OldestEntryID, queue.EntryID(0), - fmt.Sprintf("Oldest entry ID after ACKing event %v should be 0", i)) - } - // ACK the first batch, which should unblock all the later ones - batches[0].Done() - waiter.waitForEvents(100) - metrics, err := q.Metrics() - assert.NoError(t, err, "Queue metrics call should succeed") - assert.Equal(t, metrics.OldestEntryID, queue.EntryID(100), - fmt.Sprintf("Oldest entry ID after ACKing event 0 should be %v", queue.EntryID(entryCount))) - - } - - t.Run("acking in forward order with directEventLoop reports the right event IDs", func(t *testing.T) { - testQueue := NewQueue(nil, nil, Settings{Events: 1000}, 0, nil) - testForward(testQueue) - }) - - t.Run("acking in reverse order with directEventLoop reports the right event IDs", func(t *testing.T) { - testQueue := NewQueue(nil, nil, Settings{Events: 1000}, 0, nil) - testBackward(testQueue) - }) - - t.Run("acking in forward order with bufferedEventLoop reports the right event IDs", func(t *testing.T) { - testQueue := NewQueue(nil, nil, Settings{Events: 1000, MaxGetRequest: 2, FlushTimeout: time.Microsecond}, 0, nil) - testForward(testQueue) - }) - - t.Run("acking in reverse order with bufferedEventLoop reports the right event IDs", func(t *testing.T) { - testQueue := NewQueue(nil, nil, Settings{Events: 1000, MaxGetRequest: 2, FlushTimeout: time.Microsecond}, 0, nil) - testBackward(testQueue) - }) -} - -// producerACKWaiter is a helper that can listen to queue producer callbacks -// and wait on them from the test thread, so we can test the queue's asynchronous -// behavior without relying on time.Sleep. -type producerACKWaiter struct { - sync.Mutex - - // The number of acks received from a producer callback. - acked int - - // The number of acks that callers have waited for in waitForEvents. - waited int - - // When non-nil, this channel is being listened to by a test thread - // blocking on ACKs, and incoming producer callbacks are forwarded - // to it. - ackChan chan int -} - -func (w *producerACKWaiter) ack(count int) { - w.Lock() - defer w.Unlock() - w.acked += count - if w.ackChan != nil { - w.ackChan <- count - } -} - -func (w *producerACKWaiter) waitForEvents(count int) { - w.Lock() - defer w.Unlock() - if w.ackChan != nil { - panic("don't call producerACKWaiter.waitForEvents from multiple goroutines") - } - - avail := w.acked - w.waited - if count <= avail { - w.waited += count - return - } - w.waited = w.acked - count -= avail - // We have advanced as far as we can, we have to wait for - // more incoming ACKs. - // Set a listener and unlock, so ACKs can come in on another - // goroutine. - w.ackChan = make(chan int) - w.Unlock() - - newAcked := 0 - for newAcked < count { - newAcked += <-w.ackChan - } - // When we're done, turn off the listener channel and update - // the number of events waited on. 
- w.Lock() - w.ackChan = nil - w.waited += count -} diff --git a/libbeat/publisher/queue/queue.go b/libbeat/publisher/queue/queue.go index 9c186ad30d0d..72349c5a5f4b 100644 --- a/libbeat/publisher/queue/queue.go +++ b/libbeat/publisher/queue/queue.go @@ -18,11 +18,7 @@ package queue import ( - "errors" - - "github.com/elastic/beats/v7/libbeat/common" "github.com/elastic/elastic-agent-libs/logp" - "github.com/elastic/elastic-agent-libs/opt" ) // Entry is a placeholder type for the objects contained by the queue, which @@ -31,31 +27,6 @@ import ( // and reduces accidental type mismatches. type Entry interface{} -// Metrics is a set of basic-user friendly metrics that report the current state of the queue. These metrics are meant to be relatively generic and high-level, and when reported directly, can be comprehensible to a user. -type Metrics struct { - //EventCount is the total events currently in the queue - EventCount opt.Uint - //ByteCount is the total byte size of the queue - ByteCount opt.Uint - //ByteLimit is the user-configured byte limit of the queue - ByteLimit opt.Uint - //EventLimit is the user-configured event limit of the queue - EventLimit opt.Uint - - //UnackedConsumedEvents is the count of events that an output consumer has read, but not yet ack'ed - UnackedConsumedEvents opt.Uint - - //OldestActiveTimestamp is the timestamp of the oldest item in the queue. - OldestActiveTimestamp common.Time - - // OldestActiveID is ID of the oldest unacknowledged event in the queue, or - // the next ID that will be assigned if the queue is empty. - OldestEntryID EntryID -} - -// ErrMetricsNotImplemented is a hopefully temporary type to mark queue metrics as not yet implemented -var ErrMetricsNotImplemented = errors.New("Queue metrics not implemented") - // Queue is responsible for accepting, forwarding and ACKing events. // A queue will receive and buffer single events from its producers. // Consumers will receive events in batches from the queues buffers. @@ -76,8 +47,6 @@ type Queue interface { // Get retrieves a batch of up to eventCount events. If eventCount <= 0, // there is no bound on the number of returned events. 
Get(eventCount int) (Batch, error) - - Metrics() (Metrics, error) } // If encoderFactory is provided, then the resulting queue must use it to From eca723b093236d1a41e521dcd0a0a0554dba18eb Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Wed, 29 May 2024 12:26:09 -0400 Subject: [PATCH 78/99] remove unused fields and producer cancel tests --- libbeat/publisher/pipeline/pipeline.go | 4 - .../publisher/queue/memqueue/queue_test.go | 4 - .../queue/queuetest/producer_cancel.go | 105 ------------------ 3 files changed, 113 deletions(-) delete mode 100644 libbeat/publisher/queue/queuetest/producer_cancel.go diff --git a/libbeat/publisher/pipeline/pipeline.go b/libbeat/publisher/pipeline/pipeline.go index 3f928709a708..dbe87681ea63 100644 --- a/libbeat/publisher/pipeline/pipeline.go +++ b/libbeat/publisher/pipeline/pipeline.go @@ -71,10 +71,6 @@ type Pipeline struct { waitCloseTimeout time.Duration eventWaitGroup *sync.WaitGroup - // closeRef signal propagation support - guardStartSigPropagation sync.Once - sigNewClient chan *client - processors processing.Supporter } diff --git a/libbeat/publisher/queue/memqueue/queue_test.go b/libbeat/publisher/queue/memqueue/queue_test.go index af36736cce47..5ebf6b6f6fb5 100644 --- a/libbeat/publisher/queue/memqueue/queue_test.go +++ b/libbeat/publisher/queue/memqueue/queue_test.go @@ -291,10 +291,6 @@ func queueMetricsAreValid(t *testing.T, q queue.Queue, evtCount, evtLimit, occup assert.Equal(t, testMetrics.UnackedConsumedEvents.ValueOr(0), uint64(occupied), "incorrect OccupiedRead for %s", test) } -func TestProducerCancelRemovesEvents(t *testing.T) { - queuetest.TestProducerCancelRemovesEvents(t, makeTestQueue(1024, 0, 0)) -} - func makeTestQueue(sz, minEvents int, flushTimeout time.Duration) queuetest.QueueFactory { return func(_ *testing.T) queue.Queue { return NewQueue(nil, nil, Settings{ diff --git a/libbeat/publisher/queue/queuetest/producer_cancel.go b/libbeat/publisher/queue/queuetest/producer_cancel.go deleted file mode 100644 index 491d062fd147..000000000000 --- a/libbeat/publisher/queue/queuetest/producer_cancel.go +++ /dev/null @@ -1,105 +0,0 @@ -// Licensed to Elasticsearch B.V. under one or more contributor -// license agreements. See the NOTICE file distributed with -// this work for additional information regarding copyright -// ownership. Elasticsearch B.V. licenses this file to you under -// the Apache License, Version 2.0 (the "License"); you may -// not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -package queuetest - -import ( - "testing" - - "github.com/stretchr/testify/assert" - - "github.com/elastic/beats/v7/libbeat/publisher" - "github.com/elastic/beats/v7/libbeat/publisher/queue" - "github.com/elastic/elastic-agent-libs/mapstr" -) - -// TestSingleProducerConsumer tests buffered events for a producer getting -// cancelled will not be consumed anymore. Concurrent producer/consumer pairs -// might still have active events not yet ACKed (not tested here). 
-// -// Note: queues not requiring consumers to ACK a events in order to -// return ACKs to the producer are not supported by this test. -func TestProducerCancelRemovesEvents(t *testing.T, factory QueueFactory) { - fn := withOptLogOutput(true, func(t *testing.T) { - var ( - i int - N1 = 3 - N2 = 10 - ) - - log := NewTestLogger(t) - b := factory(t) - defer b.Close() - - log.Debug("create first producer") - producer := b.Producer(queue.ProducerConfig{ - ACK: func(int) {}, // install function pointer, so 'cancel' will remove events - }) - - for ; i < N1; i++ { - log.Debugf("send event %v to first producer", i) - producer.Publish(MakeEvent(mapstr.M{ - "value": i, - })) - } - - // cancel producer - log.Debugf("cancel producer") - producer.Close() - - // reconnect and send some more events - log.Debug("connect new producer") - producer = b.Producer(queue.ProducerConfig{}) - for ; i < N2; i++ { - log.Debugf("send event %v to new producer", i) - producer.Publish(MakeEvent(mapstr.M{ - "value": i, - })) - } - - // consume all events - total := N2 - N1 - events := make([]interface{}, 0, total) - for len(events) < total { - batch, err := b.Get(-1) // collect all events - if err != nil { - panic(err) - } - - for i := 0; i < batch.Count(); i++ { - events = append(events, batch.Entry(i)) - } - batch.Done() - } - - // verify - if total != len(events) { - assert.Equal(t, total, len(events)) - return - } - - for i, event := range events { - pubEvent, ok := event.(publisher.Event) - assert.True(t, ok, "queue output should be the same type as its input") - value, ok := pubEvent.Content.Fields["value"].(int) - assert.True(t, ok, "event.value should be an int") - assert.Equal(t, i+N1, value) - } - }) - - fn(t) -} From bffd70d814d8f309161d91d7b5d5a4caccd48a88 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Wed, 29 May 2024 12:47:06 -0400 Subject: [PATCH 79/99] fix merge --- .../publisher/queue/memqueue/queue_test.go | 66 ------------------- 1 file changed, 66 deletions(-) diff --git a/libbeat/publisher/queue/memqueue/queue_test.go b/libbeat/publisher/queue/memqueue/queue_test.go index bd6f5e7b8b19..9cd209bbd51e 100644 --- a/libbeat/publisher/queue/memqueue/queue_test.go +++ b/libbeat/publisher/queue/memqueue/queue_test.go @@ -227,72 +227,6 @@ func TestProducerClosePreservesEventCount(t *testing.T) { assert.False(t, activeEvents.Load() < 0, "active event count should never be negative") } -<<<<<<< HEAD -======= -func TestQueueMetricsDirect(t *testing.T) { - eventsToTest := 5 - maxEvents := 10 - - // Test the directEventLoop - directSettings := Settings{ - Events: maxEvents, - MaxGetRequest: 1, - FlushTimeout: 0, - } - t.Logf("Testing directEventLoop") - queueTestWithSettings(t, directSettings, eventsToTest, "directEventLoop") - -} - -func TestQueueMetricsBuffer(t *testing.T) { - eventsToTest := 5 - maxEvents := 10 - // Test Buffered Event Loop - bufferedSettings := Settings{ - Events: maxEvents, - MaxGetRequest: eventsToTest, // The buffered event loop can only return FlushMinEvents per Get() - FlushTimeout: time.Millisecond, - } - t.Logf("Testing bufferedEventLoop") - queueTestWithSettings(t, bufferedSettings, eventsToTest, "bufferedEventLoop") -} - -func queueTestWithSettings(t *testing.T, settings Settings, eventsToTest int, testName string) { - testQueue := NewQueue(nil, nil, settings, 0, nil) - defer testQueue.Close() - - // Send events to queue - producer := testQueue.Producer(queue.ProducerConfig{}) - for i := 0; i < eventsToTest; i++ { - producer.Publish(queuetest.MakeEvent(mapstr.M{"count": i})) - } - 
queueMetricsAreValid(t, testQueue, 5, settings.Events, 0, fmt.Sprintf("%s - First send of metrics to queue", testName)) - - // Read events, don't yet ack them - batch, err := testQueue.Get(eventsToTest) - assert.NoError(t, err, "error in Get") - t.Logf("Got batch of %d events", batch.Count()) - - queueMetricsAreValid(t, testQueue, 5, settings.Events, 5, fmt.Sprintf("%s - Producer Getting events, no ACK", testName)) - - // Test metrics after ack - batch.Done() - - queueMetricsAreValid(t, testQueue, 0, settings.Events, 0, fmt.Sprintf("%s - Producer Getting events, no ACK", testName)) - -} - -func queueMetricsAreValid(t *testing.T, q queue.Queue, evtCount, evtLimit, occupied int, test string) { - // wait briefly to avoid races across all the queue channels - time.Sleep(time.Millisecond * 100) - testMetrics, err := q.Metrics() - assert.NoError(t, err, "error calling metrics for test %s", test) - assert.Equal(t, testMetrics.EventCount.ValueOr(0), uint64(evtCount), "incorrect EventCount for %s", test) - assert.Equal(t, testMetrics.EventLimit.ValueOr(0), uint64(evtLimit), "incorrect EventLimit for %s", test) - assert.Equal(t, testMetrics.UnackedConsumedEvents.ValueOr(0), uint64(occupied), "incorrect OccupiedRead for %s", test) -} - ->>>>>>> remove-producer-cancel func makeTestQueue(sz, minEvents int, flushTimeout time.Duration) queuetest.QueueFactory { return func(_ *testing.T) queue.Queue { return NewQueue(nil, nil, Settings{ From 7409be1ad63174ee66b1e27a55ee9993626ca136 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Wed, 29 May 2024 16:01:27 -0400 Subject: [PATCH 80/99] moving metric ownership around --- libbeat/publisher/pipeline/client.go | 8 +--- libbeat/publisher/pipeline/controller.go | 23 ++-------- libbeat/publisher/pipeline/controller_test.go | 2 +- libbeat/publisher/pipeline/monitoring.go | 32 ++++---------- libbeat/publisher/pipeline/pipeline.go | 42 +++++++------------ libbeat/publisher/queue/diskqueue/queue.go | 8 ++-- .../publisher/queue/diskqueue/writer_loop.go | 15 +------ libbeat/publisher/queue/memqueue/ackloop.go | 4 -- libbeat/publisher/queue/memqueue/broker.go | 20 ++++----- libbeat/publisher/queue/monitoring.go | 42 +++++++++++++++++++ libbeat/publisher/queue/queue.go | 2 +- 11 files changed, 87 insertions(+), 111 deletions(-) create mode 100644 libbeat/publisher/queue/monitoring.go diff --git a/libbeat/publisher/pipeline/client.go b/libbeat/publisher/pipeline/client.go index 7ecce6fd8c70..0ddba6f94be7 100644 --- a/libbeat/publisher/pipeline/client.go +++ b/libbeat/publisher/pipeline/client.go @@ -37,9 +37,8 @@ type client struct { mutex sync.Mutex waiter *clientCloseWaiter - eventFlags publisher.EventFlags - canDrop bool - eventWaitGroup *sync.WaitGroup + eventFlags publisher.EventFlags + canDrop bool // Open state, signaling, and sync primitives for coordinating client Close. isOpen atomic.Bool // set to false during shutdown, such that no new events will be accepted anymore. 
@@ -180,9 +179,6 @@ func (c *client) onNewEvent() { } func (c *client) onPublished() { - if c.eventWaitGroup != nil { - c.eventWaitGroup.Add(1) - } c.observer.publishedEvent() if c.clientListener != nil { c.clientListener.Published() diff --git a/libbeat/publisher/pipeline/controller.go b/libbeat/publisher/pipeline/controller.go index b34d6a64d2c0..36ffc5f2ea73 100644 --- a/libbeat/publisher/pipeline/controller.go +++ b/libbeat/publisher/pipeline/controller.go @@ -40,10 +40,6 @@ type outputController struct { monitors Monitors observer outputObserver - // If eventWaitGroup is non-nil, it will be decremented as the queue - // reports upstream acknowledgment of published events. - eventWaitGroup *sync.WaitGroup - // The queue is not created until the outputController is assigned a // nonempty outputs.Group, in case the output group requests a proxy // queue. At that time, any prior calls to outputController.queueProducer @@ -86,7 +82,6 @@ func newOutputController( beat beat.Info, monitors Monitors, observer outputObserver, - eventWaitGroup *sync.WaitGroup, queueFactory queue.QueueFactory, inputQueueSize int, ) (*outputController, error) { @@ -94,7 +89,6 @@ func newOutputController( beat: beat, monitors: monitors, observer: observer, - eventWaitGroup: eventWaitGroup, queueFactory: queueFactory, workerChan: make(chan publisher.Batch), consumer: newEventConsumer(monitors.Logger, observer), @@ -233,16 +227,6 @@ func (c *outputController) queueProducer(config queue.ProducerConfig) queue.Prod return <-request.responseChan } -// onACK receives event acknowledgment notifications from the queue and -// forwards them to the metrics observer and the pipeline's global event -// wait group if one is set. -func (c *outputController) onACK(eventCount int) { - c.observer.queueACKed(eventCount) - if c.eventWaitGroup != nil { - c.eventWaitGroup.Add(-eventCount) - } -} - func (c *outputController) createQueueIfNeeded(outGrp outputs.Group) { logger := c.monitors.Logger if len(outGrp.Clients) == 0 { @@ -266,12 +250,13 @@ func (c *outputController) createQueueIfNeeded(outGrp outputs.Group) { if factory == nil { factory = c.queueFactory } + queueObserver := queue.NewQueueObserver() - queue, err := factory(logger, c.onACK, c.inputQueueSize, outGrp.EncoderFactory) + queue, err := factory(logger, queueObserver, c.inputQueueSize, outGrp.EncoderFactory) if err != nil { logger.Errorf("queue creation failed, falling back to default memory queue, check your queue configuration") s, _ := memqueue.SettingsForUserConfig(nil) - queue = memqueue.NewQueue(logger, c.onACK, s, c.inputQueueSize, outGrp.EncoderFactory) + queue = memqueue.NewQueue(logger, queueObserver, s, c.inputQueueSize, outGrp.EncoderFactory) } c.queue = queue @@ -279,8 +264,6 @@ func (c *outputController) createQueueIfNeeded(outGrp outputs.Group) { queueReg := c.monitors.Telemetry.NewRegistry("queue") monitoring.NewString(queueReg, "name").Set(c.queue.QueueType()) } - maxEvents := c.queue.BufferConfig().MaxEvents - c.observer.queueMaxEvents(maxEvents) // Now that we've created a queue, go through and unblock any callers // that are waiting for a producer. 
diff --git a/libbeat/publisher/pipeline/controller_test.go b/libbeat/publisher/pipeline/controller_test.go index 6834af2c7f37..9fd0ab8903e9 100644 --- a/libbeat/publisher/pipeline/controller_test.go +++ b/libbeat/publisher/pipeline/controller_test.go @@ -189,7 +189,7 @@ func TestOutputQueueFactoryTakesPrecedence(t *testing.T) { func TestFailedQueueFactoryRevertsToDefault(t *testing.T) { defaultSettings, _ := memqueue.SettingsForUserConfig(nil) - failedFactory := func(_ *logp.Logger, _ func(int), _ int, _ queue.EncoderFactory) (queue.Queue, error) { + failedFactory := func(_ *logp.Logger, _ queue.Observer, _ int, _ queue.EncoderFactory) (queue.Queue, error) { return nil, fmt.Errorf("This queue creation intentionally failed") } controller := outputController{ diff --git a/libbeat/publisher/pipeline/monitoring.go b/libbeat/publisher/pipeline/monitoring.go index cda329e0963a..e058ef0647e6 100644 --- a/libbeat/publisher/pipeline/monitoring.go +++ b/libbeat/publisher/pipeline/monitoring.go @@ -41,13 +41,12 @@ type clientObserver interface { filteredEvent() publishedEvent() failedPublishEvent() + eventsACKed(count int) } type outputObserver interface { eventsDropped(int) eventsRetry(int) - queueACKed(n int) - queueMaxEvents(n int) } // metricsObserver is used by many component in the publisher pipeline, to report @@ -94,9 +93,6 @@ func newMetricsObserver(metrics *monitoring.Registry) *metricsObserver { dropped: monitoring.NewUint(reg, "events.dropped"), retry: monitoring.NewUint(reg, "events.retry"), - queueACKed: monitoring.NewUint(reg, "queue.acked"), - queueMaxEvents: monitoring.NewUint(reg, "queue.max_events"), - activeEvents: monitoring.NewUint(reg, "events.active"), // Gauge percentQueueFull: monitoring.NewFloat(reg, "queue.filled.pct.events"), }, @@ -152,27 +148,16 @@ func (o *metricsObserver) publishedEvent() { o.vars.published.Inc() } -// (client) client closing down or DropIfFull is set -func (o *metricsObserver) failedPublishEvent() { - o.vars.failed.Inc() - o.vars.activeEvents.Dec() - o.setPercentageFull() -} - -// -// queue events -// - -// (queue) number of events ACKed by the queue/broker in use -func (o *metricsObserver) queueACKed(n int) { - o.vars.queueACKed.Add(uint64(n)) +// (client) number of ACKed events from this client +func (o *metricsObserver) eventsACKed(n int) { o.vars.activeEvents.Sub(uint64(n)) o.setPercentageFull() } -// (queue) maximum queue event capacity -func (o *metricsObserver) queueMaxEvents(n int) { - o.vars.queueMaxEvents.Set(uint64(n)) +// (client) client closing down or DropIfFull is set +func (o *metricsObserver) failedPublishEvent() { + o.vars.failed.Inc() + o.vars.activeEvents.Dec() o.setPercentageFull() } @@ -201,7 +186,6 @@ func (*emptyObserver) newEvent() {} func (*emptyObserver) filteredEvent() {} func (*emptyObserver) publishedEvent() {} func (*emptyObserver) failedPublishEvent() {} -func (*emptyObserver) queueACKed(n int) {} -func (*emptyObserver) queueMaxEvents(int) {} +func (*emptyObserver) eventsACKed(n int) {} func (*emptyObserver) eventsDropped(int) {} func (*emptyObserver) eventsRetry(int) {} diff --git a/libbeat/publisher/pipeline/pipeline.go b/libbeat/publisher/pipeline/pipeline.go index dbe87681ea63..0ea278d2b2d3 100644 --- a/libbeat/publisher/pipeline/pipeline.go +++ b/libbeat/publisher/pipeline/pipeline.go @@ -22,7 +22,6 @@ package pipeline import ( "fmt" - "sync" "time" "github.com/elastic/beats/v7/libbeat/beat" @@ -64,12 +63,9 @@ type Pipeline struct { observer observer - // wait close support. 
If eventWaitGroup is non-nil, then publishing - // an event through this pipeline will increment it and acknowledging - // a published event will decrement it, so the pipeline can wait on - // the group on shutdown to allow pending events to be acknowledged. + // If waitCloseTimeout is positive, then the pipeline will wait up to the + // specified time when it is closed for pending events to be acknowledged. waitCloseTimeout time.Duration - eventWaitGroup *sync.WaitGroup processors processing.Supporter } @@ -132,9 +128,7 @@ func New( processors: settings.Processors, } if settings.WaitCloseMode == WaitOnPipelineClose && settings.WaitClose > 0 { - // If wait-on-close is enabled, give the pipeline a WaitGroup for - // events that have been Published but not yet ACKed. - p.eventWaitGroup = &sync.WaitGroup{} + p.waitCloseTimeout = settings.WaitClose } if monitors.Metrics != nil { @@ -153,7 +147,7 @@ func New( return nil, err } - output, err := newOutputController(beat, monitors, p.observer, p.eventWaitGroup, queueFactory, settings.InputQueueSize) + output, err := newOutputController(beat, monitors, p.observer, queueFactory, settings.InputQueueSize) if err != nil { return nil, err } @@ -172,20 +166,9 @@ func (p *Pipeline) Close() error { log.Debug("close pipeline") - if p.eventWaitGroup != nil { - ch := make(chan struct{}) - go func() { - p.eventWaitGroup.Wait() - ch <- struct{}{} - }() - - select { - case <-ch: - // all events have been ACKed - - case <-time.After(p.waitCloseTimeout): - // timeout -> close pipeline with pending events - } + if p.waitCloseTimeout > 0 { + // TODO (hi fae): delay up to the specified timeout waiting for the queue + // to empty. } // Note: active clients are not closed / disconnected. @@ -238,14 +221,11 @@ func (p *Pipeline) ConnectWith(cfg beat.ClientConfig) (beat.Client, error) { processors: processors, eventFlags: eventFlags, canDrop: canDrop, - eventWaitGroup: p.eventWaitGroup, observer: p.observer, } ackHandler := cfg.EventListener - producerCfg := queue.ProducerConfig{} - var waiter *clientCloseWaiter if waitClose > 0 { waiter = newClientCloseWaiter(waitClose) @@ -259,6 +239,14 @@ func (p *Pipeline) ConnectWith(cfg beat.ClientConfig) (beat.Client, error) { } } + producerCfg := queue.ProducerConfig{ + ACK: func(count int) { + client.observer.eventsACKed(count) + if ackHandler != nil { + ackHandler.ACKEvents(count) + } + }, + } if ackHandler != nil { producerCfg.ACK = ackHandler.ACKEvents } else { diff --git a/libbeat/publisher/queue/diskqueue/queue.go b/libbeat/publisher/queue/diskqueue/queue.go index 32a4bd220a36..d220d1c83f77 100644 --- a/libbeat/publisher/queue/diskqueue/queue.go +++ b/libbeat/publisher/queue/diskqueue/queue.go @@ -93,11 +93,11 @@ type diskQueue struct { func FactoryForSettings(settings Settings) queue.QueueFactory { return func( logger *logp.Logger, - ackCallback func(eventCount int), + observer queue.Observer, inputQueueSize int, encoderFactory queue.EncoderFactory, ) (queue.Queue, error) { - return NewQueue(logger, ackCallback, settings, encoderFactory) + return NewQueue(logger, observer, settings, encoderFactory) } } @@ -105,7 +105,7 @@ func FactoryForSettings(settings Settings) queue.QueueFactory { // and settings, creating it if it doesn't exist. 
func NewQueue( logger *logp.Logger, - writeToDiskCallback func(eventCount int), + observer queue.Observer, settings Settings, encoderFactory queue.EncoderFactory, ) (*diskQueue, error) { @@ -218,7 +218,7 @@ func NewQueue( acks: newDiskQueueACKs(logger, nextReadPosition, positionFile), readerLoop: newReaderLoop(settings, encoder), - writerLoop: newWriterLoop(logger, writeToDiskCallback, settings), + writerLoop: newWriterLoop(logger, settings), deleterLoop: newDeleterLoop(settings), producerWriteRequestChan: make(chan producerWriteRequest), diff --git a/libbeat/publisher/queue/diskqueue/writer_loop.go b/libbeat/publisher/queue/diskqueue/writer_loop.go index c0e7103c41b5..72cfb04642e3 100644 --- a/libbeat/publisher/queue/diskqueue/writer_loop.go +++ b/libbeat/publisher/queue/diskqueue/writer_loop.go @@ -71,10 +71,6 @@ type writerLoop struct { // The logger for the writer loop, assigned when the queue creates it. logger *logp.Logger - // A callback that, if set, should be invoked with an event count when - // events are successfully written to disk. - writeToDiskCallback func(eventCount int) - // The writer loop listens on requestChan for frames to write, and // writes them to disk immediately (all queue capacity checking etc. is // done by the core loop before sending it to the writer). @@ -102,14 +98,12 @@ type writerLoop struct { func newWriterLoop( logger *logp.Logger, - writeToDiskCallback func(eventCount int), settings Settings, ) *writerLoop { buffer := &bytes.Buffer{} return &writerLoop{ - logger: logger, - writeToDiskCallback: writeToDiskCallback, - settings: settings, + logger: logger, + settings: settings, requestChan: make(chan writerLoopRequest, 1), responseChan: make(chan writerLoopResponse), @@ -243,11 +237,6 @@ outerLoop: // Try to sync the written data to disk. _ = wl.outputFile.Sync() - // If the queue has an ACK listener, notify it the frames were written. - if wl.writeToDiskCallback != nil { - wl.writeToDiskCallback(totalACKCount) - } - // Notify any producers with ACK listeners that their frames were written. for producer, ackCount := range producerACKCounts { producer.config.ACK(ackCount) diff --git a/libbeat/publisher/queue/memqueue/ackloop.go b/libbeat/publisher/queue/memqueue/ackloop.go index 1a964d8bb45f..9432bd5af19c 100644 --- a/libbeat/publisher/queue/memqueue/ackloop.go +++ b/libbeat/publisher/queue/memqueue/ackloop.go @@ -67,10 +67,6 @@ func (l *ackLoop) handleBatchSig() int { } if count > 0 { - if callback := l.broker.ackCallback; callback != nil { - callback(count) - } - // report acks to waiting clients l.processACK(ackedBatches, count) } diff --git a/libbeat/publisher/queue/memqueue/broker.go b/libbeat/publisher/queue/memqueue/broker.go index 7a2114869f38..62c24dad951d 100644 --- a/libbeat/publisher/queue/memqueue/broker.go +++ b/libbeat/publisher/queue/memqueue/broker.go @@ -76,11 +76,9 @@ type broker struct { // through this channel so ackLoop can monitor them for acknowledgments. consumedChan chan batchList - // ackCallback is a configurable callback to invoke when ACKs are processed. - // ackLoop calls this function when it advances the consumer ACK position. - // Right now this forwards the notification to queueACKed() in - // the pipeline observer, which updates the beats registry if needed. - ackCallback func(eventCount int) + // observer is a metrics observer that the queue should use to report + // internal state. 
+ observer queue.Observer // When batches are acknowledged, ackLoop saves any metadata needed // for producer callbacks and such, then notifies runLoop that it's @@ -143,11 +141,11 @@ type batchList struct { func FactoryForSettings(settings Settings) queue.QueueFactory { return func( logger *logp.Logger, - ackCallback func(eventCount int), + observer queue.Observer, inputQueueSize int, encoderFactory queue.EncoderFactory, ) (queue.Queue, error) { - return NewQueue(logger, ackCallback, settings, inputQueueSize, encoderFactory), nil + return NewQueue(logger, observer, settings, inputQueueSize, encoderFactory), nil } } @@ -156,12 +154,12 @@ func FactoryForSettings(settings Settings) queue.QueueFactory { // workers handling incoming messages and ACKs have been shut down. func NewQueue( logger *logp.Logger, - ackCallback func(eventCount int), + observer queue.Observer, settings Settings, inputQueueSize int, encoderFactory queue.EncoderFactory, ) *broker { - b := newQueue(logger, ackCallback, settings, inputQueueSize, encoderFactory) + b := newQueue(logger, observer, settings, inputQueueSize, encoderFactory) // Start the queue workers b.wg.Add(2) @@ -183,7 +181,7 @@ func NewQueue( // when the workers are active. func newQueue( logger *logp.Logger, - ackCallback func(eventCount int), + observer queue.Observer, settings Settings, inputQueueSize int, encoderFactory queue.EncoderFactory, @@ -225,7 +223,7 @@ func newQueue( consumedChan: make(chan batchList), deleteChan: make(chan int), - ackCallback: ackCallback, + observer: observer, } b.ctx, b.ctxCancel = context.WithCancel(context.Background()) diff --git a/libbeat/publisher/queue/monitoring.go b/libbeat/publisher/queue/monitoring.go new file mode 100644 index 000000000000..44abe3d19828 --- /dev/null +++ b/libbeat/publisher/queue/monitoring.go @@ -0,0 +1,42 @@ +// Licensed to Elasticsearch B.V. under one or more contributor +// license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Elasticsearch B.V. licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package queue + +// Observer is an interface for queues to send state updates to a metrics +// or test listener. +type Observer interface { + MaxEvents(int) + MaxBytes(int) + + // Restore queue state on startup. Used by the disk queue to report events + // that are already in the queue from a previous run. + Restore(eventCount int, byteCount int) + + // All reported byte counts are zero if the output doesn't support + // early encoding. 
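	// AddEvent fires when an event is inserted into the queue,
	// ConsumeEvents when a batch is handed to a consumer, and
	// RemoveEvents when acknowledged events are deleted from the queue.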
+ AddEvent(byteCount int) + ConsumeEvents(eventCount int, byteCount int) + RemoveEvents(eventCount int, byteCount int) +} + +func NewQueueObserver() Observer { + //queueACKed: monitoring.NewUint(reg, "queue.acked"), + //queueMaxEvents: monitoring.NewUint(reg, "queue.max_events"), + + return nil +} diff --git a/libbeat/publisher/queue/queue.go b/libbeat/publisher/queue/queue.go index 72349c5a5f4b..953c1fee83ee 100644 --- a/libbeat/publisher/queue/queue.go +++ b/libbeat/publisher/queue/queue.go @@ -53,7 +53,7 @@ type Queue interface { // encode queued events before returning them. type QueueFactory func( logger *logp.Logger, - ack func(eventCount int), + observer Observer, inputQueueSize int, encoderFactory EncoderFactory, ) (Queue, error) From 75ac0f4fe6aa06d70158f71c7e19839c383d7df5 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Wed, 29 May 2024 16:59:18 -0400 Subject: [PATCH 81/99] plumbing for queue metrics --- libbeat/publisher/pipeline/consumer.go | 9 +++--- libbeat/publisher/pipeline/controller.go | 23 +++++++------- libbeat/publisher/pipeline/controller_test.go | 8 ++--- libbeat/publisher/queue/monitoring.go | 30 ++++++++++++++++++- 4 files changed, 50 insertions(+), 20 deletions(-) diff --git a/libbeat/publisher/pipeline/consumer.go b/libbeat/publisher/pipeline/consumer.go index 1ff8c1bc95d7..096935b46f43 100644 --- a/libbeat/publisher/pipeline/consumer.go +++ b/libbeat/publisher/pipeline/consumer.go @@ -58,10 +58,11 @@ type eventConsumer struct { // consumerTarget specifies the queue to read from, the parameters needed // to generate a batch, and the output channel to send batches to. type consumerTarget struct { - queue queue.Queue - ch chan publisher.Batch - timeToLive int - batchSize int + queue queue.Queue + ch chan publisher.Batch + timeToLive int + batchSize int + retryObserver outputObserver } // retryRequest is used by ttlBatch to add itself back to the eventConsumer diff --git a/libbeat/publisher/pipeline/controller.go b/libbeat/publisher/pipeline/controller.go index 36ffc5f2ea73..631a48308b44 100644 --- a/libbeat/publisher/pipeline/controller.go +++ b/libbeat/publisher/pipeline/controller.go @@ -36,9 +36,9 @@ import ( // - stop // - reload type outputController struct { - beat beat.Info - monitors Monitors - observer outputObserver + beat beat.Info + monitors Monitors + retryObserver outputObserver // The queue is not created until the outputController is assigned a // nonempty outputs.Group, in case the output group requests a proxy @@ -81,17 +81,17 @@ type outputWorker interface { func newOutputController( beat beat.Info, monitors Monitors, - observer outputObserver, + retryObserver outputObserver, queueFactory queue.QueueFactory, inputQueueSize int, ) (*outputController, error) { controller := &outputController{ beat: beat, monitors: monitors, - observer: observer, + retryObserver: retryObserver, queueFactory: queueFactory, workerChan: make(chan publisher.Batch), - consumer: newEventConsumer(monitors.Logger, observer), + consumer: newEventConsumer(monitors.Logger, retryObserver), inputQueueSize: inputQueueSize, } @@ -164,10 +164,11 @@ func (c *outputController) Set(outGrp outputs.Group) { // Resume consumer targeting the new work queue c.consumer.setTarget( consumerTarget{ - queue: c.queue, - ch: targetChan, - batchSize: outGrp.BatchSize, - timeToLive: outGrp.Retry + 1, + queue: c.queue, + ch: targetChan, + batchSize: outGrp.BatchSize, + timeToLive: outGrp.Retry + 1, + retryObserver: c.retryObserver, }) } @@ -250,7 +251,7 @@ func (c *outputController) 
createQueueIfNeeded(outGrp outputs.Group) { if factory == nil { factory = c.queueFactory } - queueObserver := queue.NewQueueObserver() + queueObserver := queue.NewQueueObserver(c.monitors.Metrics) queue, err := factory(logger, queueObserver, c.inputQueueSize, outGrp.EncoderFactory) if err != nil { diff --git a/libbeat/publisher/pipeline/controller_test.go b/libbeat/publisher/pipeline/controller_test.go index 9fd0ab8903e9..e6394787563c 100644 --- a/libbeat/publisher/pipeline/controller_test.go +++ b/libbeat/publisher/pipeline/controller_test.go @@ -152,7 +152,7 @@ func TestQueueCreatedOnlyAfterOutputExists(t *testing.T) { // send configuration updates without blocking. targetChan: make(chan consumerTarget, 4), }, - observer: nilObserver, + retryObserver: nilObserver, } // Set to an empty output group. This should not create a queue. controller.Set(outputs.Group{}) @@ -175,7 +175,7 @@ func TestOutputQueueFactoryTakesPrecedence(t *testing.T) { consumer: &eventConsumer{ targetChan: make(chan consumerTarget, 4), }, - observer: nilObserver, + retryObserver: nilObserver, } controller.Set(outputs.Group{ Clients: []outputs.Client{newMockClient(nil)}, @@ -197,7 +197,7 @@ func TestFailedQueueFactoryRevertsToDefault(t *testing.T) { consumer: &eventConsumer{ targetChan: make(chan consumerTarget, 4), }, - observer: nilObserver, + retryObserver: nilObserver, monitors: Monitors{ Logger: logp.NewLogger("tests"), }, @@ -215,7 +215,7 @@ func TestQueueProducerBlocksUntilOutputIsSet(t *testing.T) { consumer: &eventConsumer{ targetChan: make(chan consumerTarget, 4), }, - observer: nilObserver, + retryObserver: nilObserver, } // Send producer requests from different goroutines. They should all // block, because there is no queue, but they should become unblocked diff --git a/libbeat/publisher/queue/monitoring.go b/libbeat/publisher/queue/monitoring.go index 44abe3d19828..3aa6d6342329 100644 --- a/libbeat/publisher/queue/monitoring.go +++ b/libbeat/publisher/queue/monitoring.go @@ -17,6 +17,10 @@ package queue +import ( + "github.com/elastic/elastic-agent-libs/monitoring" +) + // Observer is an interface for queues to send state updates to a metrics // or test listener. 
type Observer interface { @@ -34,9 +38,33 @@ type Observer interface { RemoveEvents(eventCount int, byteCount int) } -func NewQueueObserver() Observer { +type nilObserver struct{} + +func (nilObserver) MaxEvents(_ int) {} +func (nilObserver) MaxBytes(_ int) {} +func (nilObserver) Restore(_ int, _ int) {} +func (nilObserver) AddEvent(_ int) {} +func (nilObserver) ConsumeEvents(_ int, _ int) {} +func (nilObserver) RemoveEvents(_ int, _ int) {} + +func NewQueueObserver(metrics *monitoring.Registry) Observer { + if metrics == nil { + return nilObserver{} + } + metrics = metrics.GetRegistry("queue") + if metrics != nil { + err := metrics.Clear() + if err != nil { + return nilObserver{} + } + } else { + metrics = metrics.NewRegistry("queue") + } + //outStats = outputs.NewStats(metrics) + //queueACKed: monitoring.NewUint(reg, "queue.acked"), //queueMaxEvents: monitoring.NewUint(reg, "queue.max_events"), return nil + } From ca949b8edc007fbd98cf834d4efef97ca4020cfd Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Wed, 29 May 2024 18:11:34 -0400 Subject: [PATCH 82/99] flesh out queue observer internals --- libbeat/publisher/queue/monitoring.go | 96 ++++++++++++++++++++++----- 1 file changed, 81 insertions(+), 15 deletions(-) diff --git a/libbeat/publisher/queue/monitoring.go b/libbeat/publisher/queue/monitoring.go index 3aa6d6342329..ae5399ac493f 100644 --- a/libbeat/publisher/queue/monitoring.go +++ b/libbeat/publisher/queue/monitoring.go @@ -38,33 +38,99 @@ type Observer interface { RemoveEvents(eventCount int, byteCount int) } -type nilObserver struct{} +type queueObserver struct { + maxEvents *monitoring.Uint // gauge + maxBytes *monitoring.Uint // gauge -func (nilObserver) MaxEvents(_ int) {} -func (nilObserver) MaxBytes(_ int) {} -func (nilObserver) Restore(_ int, _ int) {} -func (nilObserver) AddEvent(_ int) {} -func (nilObserver) ConsumeEvents(_ int, _ int) {} -func (nilObserver) RemoveEvents(_ int, _ int) {} + addedEvents *monitoring.Uint + addedBytes *monitoring.Uint + consumedEvents *monitoring.Uint + consumedBytes *monitoring.Uint + removedEvents *monitoring.Uint + removedBytes *monitoring.Uint + + events *monitoring.Uint // gauge + bytes *monitoring.Uint // gauge + filled *monitoring.Float // gauge +} + +type nilObserver struct{} func NewQueueObserver(metrics *monitoring.Registry) Observer { if metrics == nil { return nilObserver{} } - metrics = metrics.GetRegistry("queue") - if metrics != nil { - err := metrics.Clear() + queueMetrics := metrics.GetRegistry("queue") + if queueMetrics != nil { + err := queueMetrics.Clear() if err != nil { return nilObserver{} } } else { - metrics = metrics.NewRegistry("queue") + queueMetrics = metrics.NewRegistry("queue") + } + + ob := &queueObserver{ + maxEvents: monitoring.NewUint(queueMetrics, "max_events"), // gauge + maxBytes: monitoring.NewUint(queueMetrics, "max_bytes"), // gauge + + addedEvents: monitoring.NewUint(queueMetrics, "added.events"), + addedBytes: monitoring.NewUint(queueMetrics, "added.bytes"), + consumedEvents: monitoring.NewUint(queueMetrics, "consumed.events"), + consumedBytes: monitoring.NewUint(queueMetrics, "consumed.bytes"), + removedEvents: monitoring.NewUint(queueMetrics, "removed.events"), + removedBytes: monitoring.NewUint(queueMetrics, "removed.bytes"), + + events: monitoring.NewUint(queueMetrics, "events"), // gauge + bytes: monitoring.NewUint(queueMetrics, "bytes"), // gauge + filled: monitoring.NewFloat(queueMetrics, "filled.pct"), // gauge } - //outStats = outputs.NewStats(metrics) - //queueACKed: 
monitoring.NewUint(reg, "queue.acked"), - //queueMaxEvents: monitoring.NewUint(reg, "queue.max_events"), + // Backwards compatibility: "queue.acked" represents the same value as + // "queue.removed.events", when the queue had no other metrics variables + // and didn't support byte measurements. We keep a copy of it under the + // old name to avoid breaking dashboards that used it. + monitoring.AliasVar(queueMetrics, "removed.events", "acked") + return ob +} + +func (ob *queueObserver) MaxEvents(value int) { + ob.maxEvents.Set(uint64(value)) +} + +func (ob *queueObserver) MaxBytes(value int) { + ob.maxBytes.Set(uint64(value)) +} + +func (ob *queueObserver) Restore(eventCount int, byteCount int) { + ob.events.Set(uint64(eventCount)) + ob.bytes.Set(uint64(byteCount)) +} - return nil +func (ob *queueObserver) AddEvent(byteCount int) { + ob.addedEvents.Inc() + ob.addedBytes.Add(uint64(byteCount)) + ob.events.Inc() + ob.bytes.Add(uint64(byteCount)) } + +func (ob *queueObserver) ConsumeEvents(eventCount int, byteCount int) { + ob.consumedEvents.Add(uint64(eventCount)) + ob.consumedBytes.Add(uint64(byteCount)) +} + +func (ob *queueObserver) RemoveEvents(eventCount int, byteCount int) { + ob.removedEvents.Add(uint64(eventCount)) + ob.removedBytes.Add(uint64(byteCount)) + + ob.events.Sub(uint64(eventCount)) + ob.bytes.Sub(uint64(byteCount)) +} + +func (nilObserver) MaxEvents(_ int) {} +func (nilObserver) MaxBytes(_ int) {} +func (nilObserver) Restore(_ int, _ int) {} +func (nilObserver) AddEvent(_ int) {} +func (nilObserver) ConsumeEvents(_ int, _ int) {} +func (nilObserver) RemoveEvents(_ int, _ int) {} From a27986259477848c51994f77f9ab8bf0ab14b654 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Wed, 29 May 2024 18:25:35 -0400 Subject: [PATCH 83/99] update queue filled percent --- libbeat/publisher/queue/monitoring.go | 35 ++++++++++++++++++--------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/libbeat/publisher/queue/monitoring.go b/libbeat/publisher/queue/monitoring.go index ae5399ac493f..ea0e7f0e8133 100644 --- a/libbeat/publisher/queue/monitoring.go +++ b/libbeat/publisher/queue/monitoring.go @@ -49,9 +49,9 @@ type queueObserver struct { removedEvents *monitoring.Uint removedBytes *monitoring.Uint - events *monitoring.Uint // gauge - bytes *monitoring.Uint // gauge - filled *monitoring.Float // gauge + filledEvents *monitoring.Uint // gauge + filledBytes *monitoring.Uint // gauge + filledPct *monitoring.Float // gauge } type nilObserver struct{} @@ -81,9 +81,9 @@ func NewQueueObserver(metrics *monitoring.Registry) Observer { removedEvents: monitoring.NewUint(queueMetrics, "removed.events"), removedBytes: monitoring.NewUint(queueMetrics, "removed.bytes"), - events: monitoring.NewUint(queueMetrics, "events"), // gauge - bytes: monitoring.NewUint(queueMetrics, "bytes"), // gauge - filled: monitoring.NewFloat(queueMetrics, "filled.pct"), // gauge + filledEvents: monitoring.NewUint(queueMetrics, "filled.events"), // gauge + filledBytes: monitoring.NewUint(queueMetrics, "filled.bytes"), // gauge + filledPct: monitoring.NewFloat(queueMetrics, "filled.pct"), // gauge } // Backwards compatibility: "queue.acked" represents the same value as @@ -103,16 +103,18 @@ func (ob *queueObserver) MaxBytes(value int) { } func (ob *queueObserver) Restore(eventCount int, byteCount int) { - ob.events.Set(uint64(eventCount)) - ob.bytes.Set(uint64(byteCount)) + ob.filledEvents.Set(uint64(eventCount)) + ob.filledBytes.Set(uint64(byteCount)) + ob.updateFilledPct() } func (ob *queueObserver) 
AddEvent(byteCount int) { ob.addedEvents.Inc() ob.addedBytes.Add(uint64(byteCount)) - ob.events.Inc() - ob.bytes.Add(uint64(byteCount)) + ob.filledEvents.Inc() + ob.filledBytes.Add(uint64(byteCount)) + ob.updateFilledPct() } func (ob *queueObserver) ConsumeEvents(eventCount int, byteCount int) { @@ -124,8 +126,17 @@ func (ob *queueObserver) RemoveEvents(eventCount int, byteCount int) { ob.removedEvents.Add(uint64(eventCount)) ob.removedBytes.Add(uint64(byteCount)) - ob.events.Sub(uint64(eventCount)) - ob.bytes.Sub(uint64(byteCount)) + ob.filledEvents.Sub(uint64(eventCount)) + ob.filledBytes.Sub(uint64(byteCount)) + ob.updateFilledPct() +} + +func (ob *queueObserver) updateFilledPct() { + if maxBytes := ob.maxBytes.Get(); maxBytes > 0 { + ob.filledPct.Set(float64(ob.filledBytes.Get()) / float64(maxBytes)) + } else { + ob.filledPct.Set(float64(ob.filledEvents.Get()) / float64(ob.maxEvents.Get())) + } } func (nilObserver) MaxEvents(_ int) {} From 517ffe182c1881e3518408a938cdf811ba04e289 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Wed, 29 May 2024 22:36:10 -0400 Subject: [PATCH 84/99] clean up shipper metric hooks --- libbeat/publisher/queue/memqueue/broker.go | 14 +++++--------- .../publisher/queue/memqueue/internal_api.go | 16 ---------------- libbeat/publisher/queue/memqueue/runloop.go | 18 +----------------- 3 files changed, 6 insertions(+), 42 deletions(-) diff --git a/libbeat/publisher/queue/memqueue/broker.go b/libbeat/publisher/queue/memqueue/broker.go index 62c24dad951d..6500792ad6a0 100644 --- a/libbeat/publisher/queue/memqueue/broker.go +++ b/libbeat/publisher/queue/memqueue/broker.go @@ -65,10 +65,6 @@ type broker struct { // Consumers send requests to getChan to read events from the queue. getChan chan getRequest - // Metrics() sends requests to metricChan to expose internal queue - // metrics to external callers. 
- metricChan chan metricsRequest - /////////////////////////// // internal channels @@ -109,8 +105,9 @@ type Settings struct { } type queueEntry struct { - event queue.Entry - id queue.EntryID + event queue.Entry + eventSize int + id queue.EntryID producer *ackProducer producerID producerID // The order of this entry within its producer @@ -215,9 +212,8 @@ func newQueue( encoderFactory: encoderFactory, // broker API channels - pushChan: make(chan pushRequest, chanSize), - getChan: make(chan getRequest), - metricChan: make(chan metricsRequest), + pushChan: make(chan pushRequest, chanSize), + getChan: make(chan getRequest), // internal runLoop and ackLoop channels consumedChan: make(chan batchList), diff --git a/libbeat/publisher/queue/memqueue/internal_api.go b/libbeat/publisher/queue/memqueue/internal_api.go index 6575472edbd0..0d983de65200 100644 --- a/libbeat/publisher/queue/memqueue/internal_api.go +++ b/libbeat/publisher/queue/memqueue/internal_api.go @@ -46,19 +46,3 @@ type getRequest struct { } type batchDoneMsg struct{} - -// Metrics API - -type metricsRequest struct { - responseChan chan memQueueMetrics -} - -// memQueueMetrics tracks metrics that are returned by the individual memory queue implementations -type memQueueMetrics struct { - // the size of items in the queue - currentQueueSize int - // the number of items that have been read by a consumer but not yet ack'ed - occupiedRead int - - oldestEntryID queue.EntryID -} diff --git a/libbeat/publisher/queue/memqueue/runloop.go b/libbeat/publisher/queue/memqueue/runloop.go index ed14106f20c9..d4980d4bb11b 100644 --- a/libbeat/publisher/queue/memqueue/runloop.go +++ b/libbeat/publisher/queue/memqueue/runloop.go @@ -133,9 +133,6 @@ func (l *runLoop) runIteration() { case count := <-l.broker.deleteChan: l.handleDelete(count) - case req := <-l.broker.metricChan: // asking broker for queue metrics - l.handleMetricsRequest(&req) - case <-timeoutChan: // The get timer has expired, handle the blocked request l.getTimer.Stop() @@ -223,22 +220,9 @@ func (l *runLoop) insert(req *pushRequest, id queue.EntryID) { index := (l.bufPos + l.eventCount) % len(l.broker.buf) l.broker.buf[index] = queueEntry{ event: req.event, + eventSize: req.eventSize, id: id, producer: req.producer, producerID: req.producerID, } } - -func (l *runLoop) handleMetricsRequest(req *metricsRequest) { - oldestEntryID := l.nextEntryID - if l.eventCount > 0 { - index := l.bufPos % len(l.broker.buf) - oldestEntryID = l.broker.buf[index].id - } - - req.responseChan <- memQueueMetrics{ - currentQueueSize: l.eventCount, - occupiedRead: l.consumedCount, - oldestEntryID: oldestEntryID, - } -} From 4f6d02cd17918560d597bf3d4093ed02ff35c90e Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Wed, 29 May 2024 22:49:53 -0400 Subject: [PATCH 85/99] use the metrics observer from the memqueue --- libbeat/publisher/queue/memqueue/runloop.go | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/libbeat/publisher/queue/memqueue/runloop.go b/libbeat/publisher/queue/memqueue/runloop.go index d4980d4bb11b..39df817dfcc2 100644 --- a/libbeat/publisher/queue/memqueue/runloop.go +++ b/libbeat/publisher/queue/memqueue/runloop.go @@ -174,18 +174,30 @@ func (l *runLoop) handleGetReply(req *getRequest) { startIndex := l.bufPos + l.consumedCount batch := newBatch(l.broker, startIndex, batchSize) + batchBytes := 0 + for i := 0; i < batchSize; i++ { + batchBytes += batch.rawEntry(i).eventSize + } + // Send the batch to the caller and update internal state req.responseChan <- batch 
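	// Remember the batch as consumed-but-not-yet-acknowledged, and report
	// the consumed events and bytes to the metrics observer.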
l.consumedBatches.append(batch) l.consumedCount += batchSize + l.broker.observer.ConsumeEvents(batchSize, batchBytes) } func (l *runLoop) handleDelete(count int) { + byteCount := 0 + for i := 0; i < count; i++ { + entry := l.broker.buf[(l.bufPos+i)%len(l.broker.buf)] + byteCount += entry.eventSize + } // Advance position and counters. Event data was already cleared in // batch.FreeEntries when the events were vended. l.bufPos = (l.bufPos + count) % len(l.broker.buf) l.eventCount -= count l.consumedCount -= count + l.broker.observer.RemoveEvents(count, byteCount) } func (l *runLoop) handleInsert(req *pushRequest) { @@ -225,4 +237,5 @@ func (l *runLoop) insert(req *pushRequest, id queue.EntryID) { producer: req.producer, producerID: req.producerID, } + l.broker.observer.AddEvent(req.eventSize) } From fd18f4e8d319fdad17bc1294c97da823db9a0c6a Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Thu, 30 May 2024 09:59:25 -0400 Subject: [PATCH 86/99] configure gauges --- libbeat/monitoring/report/log/log.go | 63 +++++++++++----------- libbeat/publisher/pipeline/monitoring.go | 24 +-------- libbeat/publisher/queue/memqueue/broker.go | 2 + 3 files changed, 36 insertions(+), 53 deletions(-) diff --git a/libbeat/monitoring/report/log/log.go b/libbeat/monitoring/report/log/log.go index e11e8228cf70..b40c6d33e42c 100644 --- a/libbeat/monitoring/report/log/log.go +++ b/libbeat/monitoring/report/log/log.go @@ -37,36 +37,39 @@ import ( // TODO: Replace this with a proper solution that uses the metric type from // where it is defined. See: https://github.com/elastic/beats/issues/5433 var gauges = map[string]bool{ - "libbeat.output.events.active": true, - "libbeat.pipeline.events.active": true, - "libbeat.pipeline.clients": true, - "libbeat.pipeline.queue.max_events": true, - "libbeat.pipeline.queue.filled.pct.events": true, - "libbeat.config.module.running": true, - "registrar.states.current": true, - "filebeat.events.active": true, - "filebeat.harvester.running": true, - "filebeat.harvester.open_files": true, - "beat.memstats.memory_total": true, - "beat.memstats.memory_alloc": true, - "beat.memstats.rss": true, - "beat.memstats.gc_next": true, - "beat.info.uptime.ms": true, - "beat.cgroup.memory.mem.usage.bytes": true, - "beat.cpu.user.ticks": true, - "beat.cpu.system.ticks": true, - "beat.cpu.total.value": true, - "beat.cpu.total.ticks": true, - "beat.handles.open": true, - "beat.handles.limit.hard": true, - "beat.handles.limit.soft": true, - "beat.runtime.goroutines": true, - "system.load.1": true, - "system.load.5": true, - "system.load.15": true, - "system.load.norm.1": true, - "system.load.norm.5": true, - "system.load.norm.15": true, + "libbeat.output.events.active": true, + "libbeat.pipeline.events.active": true, + "libbeat.pipeline.clients": true, + "libbeat.pipeline.queue.max_events": true, + "libbeat.pipeline.queue.max_bytes": true, + "libbeat.pipeline.queue.filled.events": true, + "libbeat.pipeline.queue.filled.bytes": true, + "libbeat.pipeline.queue.filled.pct": true, + "libbeat.config.module.running": true, + "registrar.states.current": true, + "filebeat.events.active": true, + "filebeat.harvester.running": true, + "filebeat.harvester.open_files": true, + "beat.memstats.memory_total": true, + "beat.memstats.memory_alloc": true, + "beat.memstats.rss": true, + "beat.memstats.gc_next": true, + "beat.info.uptime.ms": true, + "beat.cgroup.memory.mem.usage.bytes": true, + "beat.cpu.user.ticks": true, + "beat.cpu.system.ticks": true, + "beat.cpu.total.value": true, + "beat.cpu.total.ticks": 
true, + "beat.handles.open": true, + "beat.handles.limit.hard": true, + "beat.handles.limit.soft": true, + "beat.runtime.goroutines": true, + "system.load.1": true, + "system.load.5": true, + "system.load.15": true, + "system.load.norm.1": true, + "system.load.norm.5": true, + "system.load.norm.15": true, } // isGauge returns true when the given metric key name represents a gauge value. diff --git a/libbeat/publisher/pipeline/monitoring.go b/libbeat/publisher/pipeline/monitoring.go index e058ef0647e6..942ecaf019e5 100644 --- a/libbeat/publisher/pipeline/monitoring.go +++ b/libbeat/publisher/pipeline/monitoring.go @@ -18,8 +18,6 @@ package pipeline import ( - "math" - "github.com/elastic/elastic-agent-libs/monitoring" ) @@ -68,11 +66,6 @@ type metricsObserverVars struct { events, filtered, published, failed *monitoring.Uint dropped, retry *monitoring.Uint // (retryer) drop/retry counters activeEvents *monitoring.Uint - - // queue metrics - queueACKed *monitoring.Uint - queueMaxEvents *monitoring.Uint - percentQueueFull *monitoring.Float } func newMetricsObserver(metrics *monitoring.Registry) *metricsObserver { @@ -93,8 +86,7 @@ func newMetricsObserver(metrics *monitoring.Registry) *metricsObserver { dropped: monitoring.NewUint(reg, "events.dropped"), retry: monitoring.NewUint(reg, "events.retry"), - activeEvents: monitoring.NewUint(reg, "events.active"), // Gauge - percentQueueFull: monitoring.NewFloat(reg, "queue.filled.pct.events"), + activeEvents: monitoring.NewUint(reg, "events.active"), // Gauge }, } } @@ -123,24 +115,12 @@ func (o *metricsObserver) clientClosed() { o.vars.clients.Dec() } func (o *metricsObserver) newEvent() { o.vars.events.Inc() o.vars.activeEvents.Inc() - o.setPercentageFull() -} - -// setPercentageFull is used interally to set the `queue.full` metric -func (o *metricsObserver) setPercentageFull() { - maxEvt := o.vars.queueMaxEvents.Get() - if maxEvt != 0 { - pct := float64(o.vars.activeEvents.Get()) / float64(maxEvt) - pctRound := math.Round(pct/0.0005) * 0.0005 - o.vars.percentQueueFull.Set(pctRound) - } } // (client) event is filtered out (on purpose or failed) func (o *metricsObserver) filteredEvent() { o.vars.filtered.Inc() o.vars.activeEvents.Dec() - o.setPercentageFull() } // (client) managed to push an event into the publisher pipeline @@ -151,14 +131,12 @@ func (o *metricsObserver) publishedEvent() { // (client) number of ACKed events from this client func (o *metricsObserver) eventsACKed(n int) { o.vars.activeEvents.Sub(uint64(n)) - o.setPercentageFull() } // (client) client closing down or DropIfFull is set func (o *metricsObserver) failedPublishEvent() { o.vars.failed.Inc() o.vars.activeEvents.Dec() - o.setPercentageFull() } // diff --git a/libbeat/publisher/queue/memqueue/broker.go b/libbeat/publisher/queue/memqueue/broker.go index 6500792ad6a0..286278804a54 100644 --- a/libbeat/publisher/queue/memqueue/broker.go +++ b/libbeat/publisher/queue/memqueue/broker.go @@ -226,6 +226,8 @@ func newQueue( b.runLoop = newRunLoop(b) b.ackLoop = newACKLoop(b) + observer.MaxEvents(settings.Events) + return b } From 2f6ba9b28f8affb3be59174275b1841791410585 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Thu, 30 May 2024 11:11:16 -0400 Subject: [PATCH 87/99] report queue metrics from the disk queue --- libbeat/publisher/queue/diskqueue/consumer.go | 7 ++++++ .../publisher/queue/diskqueue/core_loop.go | 24 ++++++++++++++----- libbeat/publisher/queue/diskqueue/queue.go | 7 ++++++ libbeat/publisher/queue/diskqueue/segments.go | 5 +--- 4 files changed, 33 
insertions(+), 10 deletions(-) diff --git a/libbeat/publisher/queue/diskqueue/consumer.go b/libbeat/publisher/queue/diskqueue/consumer.go index 55098b10fa82..20e6648d927e 100644 --- a/libbeat/publisher/queue/diskqueue/consumer.go +++ b/libbeat/publisher/queue/diskqueue/consumer.go @@ -54,6 +54,13 @@ eventLoop: } } + // Check the batch size so we can report to the metrics observer + batchByteCount := 0 + for _, frame := range frames { + batchByteCount += int(frame.bytesOnDisk) + } + dq.observer.ConsumeEvents(len(frames), batchByteCount) + // There is a mild race condition here based on queue closure: events // written to readerLoop.output may have been buffered before the // queue was closed, and we may be reading its leftovers afterwards. diff --git a/libbeat/publisher/queue/diskqueue/core_loop.go b/libbeat/publisher/queue/diskqueue/core_loop.go index c08c204d51ed..2acc9dcc2b72 100644 --- a/libbeat/publisher/queue/diskqueue/core_loop.go +++ b/libbeat/publisher/queue/diskqueue/core_loop.go @@ -111,6 +111,7 @@ func (dq *diskQueue) handleProducerWriteRequest(request producerWriteRequest) { // pending list and report success, then dispatch it to the // writer loop if no other requests are outstanding. dq.enqueueWriteFrame(request.frame) + dq.observer.AddEvent(int(request.frame.sizeOnDisk())) request.responseChan <- true } else { // The queue is too full. Either add the request to blockedProducers, @@ -175,6 +176,8 @@ func (dq *diskQueue) handleDeleterLoopResponse(response deleterLoopResponse) { dq.deleting = false newAckedSegments := []*queueSegment{} errors := []error{} + removedEventCount := 0 + removedByteCount := 0 for i, err := range response.results { if err != nil { // This segment had an error, so it stays in the acked list. @@ -182,8 +185,15 @@ func (dq *diskQueue) handleDeleterLoopResponse(response deleterLoopResponse) { errors = append(errors, fmt.Errorf("couldn't delete segment %d: %w", dq.segments.acked[i].id, err)) + } else { + removedEventCount += int(dq.segments.acked[i].frameCount) + // For the metrics observer, we (can) only report the size of the raw + // events, not the segment header, so subtract that here so it doesn't + // look like we're deleting more than was added in the first place. + removedByteCount += int(dq.segments.acked[i].byteCount - dq.segments.acked[i].headerSize()) } } + dq.observer.RemoveEvents(removedEventCount, removedByteCount) if len(dq.segments.acked) > len(response.results) { // Preserve any new acked segments that were added during the deletion // request. @@ -468,9 +478,13 @@ func (dq *diskQueue) canAcceptFrameOfSize(frameSize uint64) bool { return true } - // Compute the current queue size. We accept if there is enough capacity - // left in the queue after accounting for the existing segments and the - // pending writes that were already accepted. + // We accept if there is enough capacity left in the queue after accounting + // for the existing segments and the pending writes that were already + // accepted. 
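	// currentSize covers the segments already on disk plus any frames that
	// are pending or currently being written.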
+ return dq.currentSize()+frameSize <= dq.settings.MaxBufferSize +} + +func (dq *diskQueue) currentSize() uint64 { pendingBytes := uint64(0) for _, sf := range dq.pendingFrames { pendingBytes += sf.frame.sizeOnDisk() @@ -479,7 +493,5 @@ func (dq *diskQueue) canAcceptFrameOfSize(frameSize uint64) bool { if dq.writing { pendingBytes += dq.writeRequestSize } - currentSize := pendingBytes + dq.segments.sizeOnDisk() - - return currentSize+frameSize <= dq.settings.MaxBufferSize + return pendingBytes + dq.segments.sizeOnDisk() } diff --git a/libbeat/publisher/queue/diskqueue/queue.go b/libbeat/publisher/queue/diskqueue/queue.go index d220d1c83f77..f59e041035a8 100644 --- a/libbeat/publisher/queue/diskqueue/queue.go +++ b/libbeat/publisher/queue/diskqueue/queue.go @@ -34,6 +34,7 @@ const QueueType = "disk" // of queue.Queue. type diskQueue struct { logger *logp.Logger + observer queue.Observer settings Settings // Metadata related to the segment files. @@ -120,6 +121,7 @@ func NewQueue( "twice the segment size (%v)", settings.MaxBufferSize, settings.MaxSegmentSize) } + observer.MaxBytes(int(settings.MaxBufferSize)) // Create the given directory path if it doesn't exist. err := os.MkdirAll(settings.directoryPath(), os.ModePerm) @@ -193,10 +195,14 @@ func NewQueue( //nolint:godox // Ignore This // TODO: pass in a context that queues can use to report these events. activeFrameCount := 0 + activeByteCount := 0 for _, segment := range initialSegments { activeFrameCount += int(segment.frameCount) + activeByteCount += int(segment.byteCount) } activeFrameCount -= int(nextReadPosition.frameIndex) + activeByteCount -= int(nextReadPosition.byteIndex) + observer.Restore(activeFrameCount, activeByteCount) logger.Infof("Found %d existing events on queue start", activeFrameCount) var encoder queue.Encoder @@ -206,6 +212,7 @@ func NewQueue( queue := &diskQueue{ logger: logger, + observer: observer, settings: settings, segments: diskQueueSegments{ diff --git a/libbeat/publisher/queue/diskqueue/segments.go b/libbeat/publisher/queue/diskqueue/segments.go index 0460fc4431a7..7e3661f6e5b4 100644 --- a/libbeat/publisher/queue/diskqueue/segments.go +++ b/libbeat/publisher/queue/diskqueue/segments.go @@ -94,10 +94,7 @@ type queueSegment struct { // If this segment was loaded from a previous session, schemaVersion // points to the file schema version that was read from its header. // This is only used by queueSegment.headerSize(), which is used in - // maybeReadPending to calculate the position of the first data frame, - // and by queueSegment.shouldUseJSON(), which is used in the reader - // loop to detect old segments that used JSON encoding instead of - // the current CBOR. + // maybeReadPending to calculate the position of the first data frame. 
schemaVersion *uint32 // The number of bytes occupied by this segment on-disk, as of the most From ce2c2876b0e69cc68b951fdb909a3d8a8cb3d413 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Thu, 30 May 2024 11:20:52 -0400 Subject: [PATCH 88/99] fix disk queue initialization --- libbeat/publisher/queue/diskqueue/queue.go | 24 ++++++++++------------ 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/libbeat/publisher/queue/diskqueue/queue.go b/libbeat/publisher/queue/diskqueue/queue.go index f59e041035a8..b76dc266b162 100644 --- a/libbeat/publisher/queue/diskqueue/queue.go +++ b/libbeat/publisher/queue/diskqueue/queue.go @@ -169,6 +169,15 @@ func NewQueue( lastID := initialSegments[len(initialSegments)-1].id nextSegmentID = lastID + 1 } + // Check the initial contents to report to the metrics observer. + initialEventCount := 0 + initialByteCount := 0 + for _, segment := range initialSegments { + initialEventCount += int(segment.frameCount) + // Event metrics for the queue observer don't include segment headser size + initialByteCount += int(segment.byteCount - segment.headerSize()) + } + observer.Restore(initialEventCount, initialByteCount) // If any of the initial segments are older than the current queue // position, move them directly to the acked list where they can be @@ -186,24 +195,13 @@ func NewQueue( nextReadPosition = queuePosition{segmentID: initialSegments[0].id} } - // We can compute the active frames right now but still need a way to report - // them to the global beat metrics. For now, just log the total. - // Note that for consistency with existing queue behavior, this excludes - // events that are still present on disk but were already sent and - // acknowledged on a previous run (we probably want to track these as well - // in the future.) - //nolint:godox // Ignore This - // TODO: pass in a context that queues can use to report these events. + // Count just the active events to report in the log activeFrameCount := 0 - activeByteCount := 0 for _, segment := range initialSegments { activeFrameCount += int(segment.frameCount) - activeByteCount += int(segment.byteCount) } activeFrameCount -= int(nextReadPosition.frameIndex) - activeByteCount -= int(nextReadPosition.byteIndex) - observer.Restore(activeFrameCount, activeByteCount) - logger.Infof("Found %d existing events on queue start", activeFrameCount) + logger.Infof("Found %v queued events consuming %v bytes, %v events still pending", initialEventCount, initialByteCount, activeFrameCount) var encoder queue.Encoder if encoderFactory != nil { From e70c13b6387f4ca846ed11ac8abda3ce9fdc0081 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Thu, 30 May 2024 11:32:10 -0400 Subject: [PATCH 89/99] outputObserver -> retryObserver --- libbeat/publisher/pipeline/consumer.go | 25 +++++++++---------- libbeat/publisher/pipeline/controller.go | 17 ++++++------- libbeat/publisher/pipeline/controller_test.go | 16 ++++++------ libbeat/publisher/pipeline/monitoring.go | 4 +-- 4 files changed, 29 insertions(+), 33 deletions(-) diff --git a/libbeat/publisher/pipeline/consumer.go b/libbeat/publisher/pipeline/consumer.go index 096935b46f43..a7806a3ded27 100644 --- a/libbeat/publisher/pipeline/consumer.go +++ b/libbeat/publisher/pipeline/consumer.go @@ -31,8 +31,8 @@ import ( type eventConsumer struct { logger *logp.Logger - // eventConsumer calls the observer methods eventsRetry and eventsDropped. - observer outputObserver + // eventConsumer calls the retryObserver methods eventsRetry and eventsDropped. 
+ retryObserver retryObserver // When the output changes, the new target is sent to the worker routine // on this channel. Clients should call eventConsumer.setTarget(). @@ -58,11 +58,10 @@ type eventConsumer struct { // consumerTarget specifies the queue to read from, the parameters needed // to generate a batch, and the output channel to send batches to. type consumerTarget struct { - queue queue.Queue - ch chan publisher.Batch - timeToLive int - batchSize int - retryObserver outputObserver + queue queue.Queue + ch chan publisher.Batch + timeToLive int + batchSize int } // retryRequest is used by ttlBatch to add itself back to the eventConsumer @@ -74,12 +73,12 @@ type retryRequest struct { func newEventConsumer( log *logp.Logger, - observer outputObserver, + observer retryObserver, ) *eventConsumer { c := &eventConsumer{ - logger: log, - observer: observer, - queueReader: makeQueueReader(), + logger: log, + retryObserver: observer, + queueReader: makeQueueReader(), targetChan: make(chan consumerTarget), retryChan: make(chan retryRequest), @@ -164,7 +163,7 @@ outerLoop: // Successfully sent a batch to the output workers if len(retryBatches) > 0 { // This was a retry, report it to the observer - c.observer.eventsRetry(len(active.Events())) + c.retryObserver.eventsRetry(len(active.Events())) retryBatches = retryBatches[1:] } else { // This was directly from the queue, clear the value so we can @@ -184,7 +183,7 @@ outerLoop: alive := req.batch.reduceTTL() countDropped := countFailed - len(req.batch.Events()) - c.observer.eventsDropped(countDropped) + c.retryObserver.eventsDropped(countDropped) if !alive { log.Info("Drop batch") diff --git a/libbeat/publisher/pipeline/controller.go b/libbeat/publisher/pipeline/controller.go index 631a48308b44..e99c501b2943 100644 --- a/libbeat/publisher/pipeline/controller.go +++ b/libbeat/publisher/pipeline/controller.go @@ -36,9 +36,8 @@ import ( // - stop // - reload type outputController struct { - beat beat.Info - monitors Monitors - retryObserver outputObserver + beat beat.Info + monitors Monitors // The queue is not created until the outputController is assigned a // nonempty outputs.Group, in case the output group requests a proxy @@ -81,14 +80,13 @@ type outputWorker interface { func newOutputController( beat beat.Info, monitors Monitors, - retryObserver outputObserver, + retryObserver retryObserver, queueFactory queue.QueueFactory, inputQueueSize int, ) (*outputController, error) { controller := &outputController{ beat: beat, monitors: monitors, - retryObserver: retryObserver, queueFactory: queueFactory, workerChan: make(chan publisher.Batch), consumer: newEventConsumer(monitors.Logger, retryObserver), @@ -164,11 +162,10 @@ func (c *outputController) Set(outGrp outputs.Group) { // Resume consumer targeting the new work queue c.consumer.setTarget( consumerTarget{ - queue: c.queue, - ch: targetChan, - batchSize: outGrp.BatchSize, - timeToLive: outGrp.Retry + 1, - retryObserver: c.retryObserver, + queue: c.queue, + ch: targetChan, + batchSize: outGrp.BatchSize, + timeToLive: outGrp.Retry + 1, }) } diff --git a/libbeat/publisher/pipeline/controller_test.go b/libbeat/publisher/pipeline/controller_test.go index e6394787563c..2e4f0df990f6 100644 --- a/libbeat/publisher/pipeline/controller_test.go +++ b/libbeat/publisher/pipeline/controller_test.go @@ -150,9 +150,9 @@ func TestQueueCreatedOnlyAfterOutputExists(t *testing.T) { // We aren't testing the values sent to eventConsumer, we // just need a placeholder here so outputController can // send 
configuration updates without blocking. - targetChan: make(chan consumerTarget, 4), + targetChan: make(chan consumerTarget, 4), + retryObserver: nilObserver, }, - retryObserver: nilObserver, } // Set to an empty output group. This should not create a queue. controller.Set(outputs.Group{}) @@ -173,9 +173,9 @@ func TestOutputQueueFactoryTakesPrecedence(t *testing.T) { memqueue.Settings{Events: 1}, ), consumer: &eventConsumer{ - targetChan: make(chan consumerTarget, 4), + targetChan: make(chan consumerTarget, 4), + retryObserver: nilObserver, }, - retryObserver: nilObserver, } controller.Set(outputs.Group{ Clients: []outputs.Client{newMockClient(nil)}, @@ -195,9 +195,9 @@ func TestFailedQueueFactoryRevertsToDefault(t *testing.T) { controller := outputController{ queueFactory: failedFactory, consumer: &eventConsumer{ - targetChan: make(chan consumerTarget, 4), + targetChan: make(chan consumerTarget, 4), + retryObserver: nilObserver, }, - retryObserver: nilObserver, monitors: Monitors{ Logger: logp.NewLogger("tests"), }, @@ -213,9 +213,9 @@ func TestQueueProducerBlocksUntilOutputIsSet(t *testing.T) { controller := outputController{ queueFactory: memqueue.FactoryForSettings(memqueue.Settings{Events: 1}), consumer: &eventConsumer{ - targetChan: make(chan consumerTarget, 4), + targetChan: make(chan consumerTarget, 4), + retryObserver: nilObserver, }, - retryObserver: nilObserver, } // Send producer requests from different goroutines. They should all // block, because there is no queue, but they should become unblocked diff --git a/libbeat/publisher/pipeline/monitoring.go b/libbeat/publisher/pipeline/monitoring.go index 942ecaf019e5..6a6b6705925f 100644 --- a/libbeat/publisher/pipeline/monitoring.go +++ b/libbeat/publisher/pipeline/monitoring.go @@ -24,7 +24,7 @@ import ( type observer interface { pipelineObserver clientObserver - outputObserver + retryObserver cleanup() } @@ -42,7 +42,7 @@ type clientObserver interface { eventsACKed(count int) } -type outputObserver interface { +type retryObserver interface { eventsDropped(int) eventsRetry(int) } From e6dbb2dc7c29eba0bfebba6888c860b0c826ac82 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Thu, 30 May 2024 13:17:18 -0400 Subject: [PATCH 90/99] move queue draining logic into the queue --- libbeat/publisher/pipeline/controller.go | 71 ++++++++++++------- libbeat/publisher/pipeline/pipeline.go | 7 +- libbeat/publisher/pipeline/pipeline_test.go | 4 ++ .../publisher/queue/diskqueue/core_loop.go | 2 +- libbeat/publisher/queue/diskqueue/producer.go | 2 +- libbeat/publisher/queue/diskqueue/queue.go | 46 +++++------- libbeat/publisher/queue/memqueue/broker.go | 14 +++- libbeat/publisher/queue/memqueue/runloop.go | 26 +++++-- libbeat/publisher/queue/queue.go | 6 ++ 9 files changed, 108 insertions(+), 70 deletions(-) diff --git a/libbeat/publisher/pipeline/controller.go b/libbeat/publisher/pipeline/controller.go index e99c501b2943..c3015b8dddca 100644 --- a/libbeat/publisher/pipeline/controller.go +++ b/libbeat/publisher/pipeline/controller.go @@ -19,6 +19,7 @@ package pipeline import ( "sync" + "time" "github.com/elastic/beats/v7/libbeat/beat" "github.com/elastic/beats/v7/libbeat/common/reload" @@ -53,10 +54,15 @@ type outputController struct { // is called. queueFactory queue.QueueFactory + // consumer is a helper goroutine that reads event batches from the queue + // and sends them to workerChan for an output worker to process. 
+ consumer *eventConsumer + + // Each worker is a goroutine that will read batches from workerChan and + // send them to the output. + workers []outputWorker workerChan chan publisher.Batch - consumer *eventConsumer - workers []outputWorker // The InputQueueSize can be set when the Beat is started, in // libbeat/cmd/instance/Settings we need to preserve that // value and pass it into the queue factory. The queue @@ -96,35 +102,26 @@ func newOutputController( return controller, nil } -func (c *outputController) Close() error { +func (c *outputController) WaitClose(timeout time.Duration) error { + // First: signal the queue that we're shutting down, and wait up to the + // given duration for it to drain and process ACKs. + c.closeQueue(timeout) + + // We've drained the queue as much as we can, signal eventConsumer to + // close, and wait for it to finish. After consumer.close returns, + // there will be no more writes to c.workerChan, so it is safe to close. c.consumer.close() close(c.workerChan) + // Signal the output workers to close. This step is a hint, and carries + // no guarantees. For example, on close the Elasticsearch output workers + // will close idle connections, but will not change any behavior for + // active connections, giving any remaining events a chance to ingest + // before we terminate. for _, out := range c.workers { out.Close() } - // Closing the queue stops ACKs from propagating, so we close everything - // else first to give it a chance to wait for any outstanding events to be - // acknowledged. - c.queueLock.Lock() - if c.queue != nil { - c.queue.Close() - } - for _, req := range c.pendingRequests { - // We can only end up here if there was an attempt to connect to the - // pipeline but it was shut down before any output was set. - // In this case, return nil and Pipeline.ConnectWith will pass on a - // real error to the caller. - // NOTE: under the current shutdown process, Pipeline.Close (and hence - // outputController.Close) is ~never called. So even if we did have - // blocked callers here, in a real shutdown they will never be woken - // up. But in hopes of a day when the shutdown process is more robust, - // I've decided to do the right thing here anyway. - req.responseChan <- nil - } - c.queueLock.Unlock() - return nil } @@ -195,6 +192,32 @@ func (c *outputController) Reload( return nil } +// Close the queue, waiting up to the specified timeout for pending events +// to complete. +func (c *outputController) closeQueue(timeout time.Duration) { + c.queueLock.Lock() + defer c.queueLock.Unlock() + if c.queue != nil { + c.queue.Close() + select { + case <-c.queue.Done(): + case <-time.After(timeout): + } + } + for _, req := range c.pendingRequests { + // We can only end up here if there was an attempt to connect to the + // pipeline but it was shut down before any output was set. + // In this case, return nil and Pipeline.ConnectWith will pass on a + // real error to the caller. + // NOTE: under the current shutdown process, Pipeline.Close (and hence + // outputController.Close) is ~never called. So even if we did have + // blocked callers here, in a real shutdown they will never be woken + // up. But in hopes of a day when the shutdown process is more robust, + // I've decided to do the right thing here anyway. + req.responseChan <- nil + } +} + // queueProducer creates a queue producer with the given config, blocking // until the queue is created if it does not yet exist. 
func (c *outputController) queueProducer(config queue.ProducerConfig) queue.Producer { diff --git a/libbeat/publisher/pipeline/pipeline.go b/libbeat/publisher/pipeline/pipeline.go index 0ea278d2b2d3..46d71676b3f7 100644 --- a/libbeat/publisher/pipeline/pipeline.go +++ b/libbeat/publisher/pipeline/pipeline.go @@ -166,13 +166,8 @@ func (p *Pipeline) Close() error { log.Debug("close pipeline") - if p.waitCloseTimeout > 0 { - // TODO (hi fae): delay up to the specified timeout waiting for the queue - // to empty. - } - // Note: active clients are not closed / disconnected. - p.outputController.Close() + p.outputController.WaitClose(p.waitCloseTimeout) p.observer.cleanup() return nil diff --git a/libbeat/publisher/pipeline/pipeline_test.go b/libbeat/publisher/pipeline/pipeline_test.go index 015b24af5b46..a8cf34b895aa 100644 --- a/libbeat/publisher/pipeline/pipeline_test.go +++ b/libbeat/publisher/pipeline/pipeline_test.go @@ -132,6 +132,10 @@ func (q *testQueue) Close() error { return nil } +func (q *testQueue) Done() <-chan struct{} { + return nil +} + func (q *testQueue) QueueType() string { return "test" } diff --git a/libbeat/publisher/queue/diskqueue/core_loop.go b/libbeat/publisher/queue/diskqueue/core_loop.go index 2acc9dcc2b72..4f30a0e58bad 100644 --- a/libbeat/publisher/queue/diskqueue/core_loop.go +++ b/libbeat/publisher/queue/diskqueue/core_loop.go @@ -47,7 +47,7 @@ func (dq *diskQueue) run() { // After receiving new ACKs, a segment might be ready to delete. dq.maybeDeleteACKed() - case <-dq.done: + case <-dq.close: dq.handleShutdown() return diff --git a/libbeat/publisher/queue/diskqueue/producer.go b/libbeat/publisher/queue/diskqueue/producer.go index 7d084adf5ea4..c379ac40637d 100644 --- a/libbeat/publisher/queue/diskqueue/producer.go +++ b/libbeat/publisher/queue/diskqueue/producer.go @@ -87,7 +87,7 @@ func (producer *diskQueueProducer) publish( // blocking the core loop. response := <-request.responseChan return response - case <-producer.queue.done: + case <-producer.queue.close: return false case <-producer.done: return false diff --git a/libbeat/publisher/queue/diskqueue/queue.go b/libbeat/publisher/queue/diskqueue/queue.go index b76dc266b162..13d72a62a34f 100644 --- a/libbeat/publisher/queue/diskqueue/queue.go +++ b/libbeat/publisher/queue/diskqueue/queue.go @@ -21,7 +21,6 @@ import ( "errors" "fmt" "os" - "sync" "github.com/elastic/beats/v7/libbeat/publisher/queue" "github.com/elastic/elastic-agent-libs/logp" @@ -49,10 +48,6 @@ type diskQueue struct { writerLoop *writerLoop deleterLoop *deleterLoop - // Wait group for shutdown of the goroutines associated with this queue: - // reader loop, writer loop, deleter loop, and core loop (diskQueue.run()). - waitGroup sync.WaitGroup - // writing is true if the writer loop is processing a request, false // otherwise. writing bool @@ -84,7 +79,12 @@ type diskQueue struct { // waiting for free space in the queue. blockedProducers []producerWriteRequest - // The channel to signal our goroutines to shut down. + // The channel to signal our goroutines to shut down, used by + // (*diskQueue).Close. + close chan struct{} + + // The channel to report that shutdown is finished, used by + // (*diskQueue).Done. done chan struct{} } @@ -228,30 +228,15 @@ func NewQueue( producerWriteRequestChan: make(chan producerWriteRequest), - done: make(chan struct{}), + close: make(chan struct{}), + done: make(chan struct{}), } - // We wait for four goroutines on shutdown: core loop, reader loop, - // writer loop, deleter loop. 
- queue.waitGroup.Add(4) - // Start the goroutines and return the queue! - go func() { - queue.readerLoop.run() - queue.waitGroup.Done() - }() - go func() { - queue.writerLoop.run() - queue.waitGroup.Done() - }() - go func() { - queue.deleterLoop.run() - queue.waitGroup.Done() - }() - go func() { - queue.run() - queue.waitGroup.Done() - }() + go queue.readerLoop.run() + go queue.writerLoop.run() + go queue.deleterLoop.run() + go queue.run() return queue, nil } @@ -263,12 +248,15 @@ func NewQueue( func (dq *diskQueue) Close() error { // Closing the done channel signals to the core loop that it should // shut down the other helper goroutines and wrap everything up. - close(dq.done) - dq.waitGroup.Wait() + close(dq.close) return nil } +func (dq *diskQueue) Done() <-chan struct{} { + return dq.done +} + func (dq *diskQueue) QueueType() string { return QueueType } diff --git a/libbeat/publisher/queue/memqueue/broker.go b/libbeat/publisher/queue/memqueue/broker.go index 286278804a54..964b495fd887 100644 --- a/libbeat/publisher/queue/memqueue/broker.go +++ b/libbeat/publisher/queue/memqueue/broker.go @@ -65,6 +65,9 @@ type broker struct { // Consumers send requests to getChan to read events from the queue. getChan chan getRequest + // Close triggers a queue close by sending to closeChan. + closeChan chan struct{} + /////////////////////////// // internal channels @@ -212,8 +215,9 @@ func newQueue( encoderFactory: encoderFactory, // broker API channels - pushChan: make(chan pushRequest, chanSize), - getChan: make(chan getRequest), + pushChan: make(chan pushRequest, chanSize), + getChan: make(chan getRequest), + closeChan: make(chan struct{}), // internal runLoop and ackLoop channels consumedChan: make(chan batchList), @@ -232,10 +236,14 @@ func newQueue( } func (b *broker) Close() error { - b.ctxCancel() + b.closeChan <- struct{}{} return nil } +func (b *broker) Done() <-chan struct{} { + return b.ctx.Done() +} + func (b *broker) QueueType() string { return QueueType } diff --git a/libbeat/publisher/queue/memqueue/runloop.go b/libbeat/publisher/queue/memqueue/runloop.go index 39df817dfcc2..f029571e43f8 100644 --- a/libbeat/publisher/queue/memqueue/runloop.go +++ b/libbeat/publisher/queue/memqueue/runloop.go @@ -57,6 +57,11 @@ type runLoop struct { // It is active if and only if pendingGetRequest is non-nil. getTimer *time.Timer + // closing is set when a close request is received. Once closing is true, + // the queue will not accept any new events, but will continue responding + // to Gets and Acks to allow pending events to complete on shutdown. + closing bool + // TODO (https://github.com/elastic/beats/issues/37893): entry IDs were a // workaround for an external project that no longer exists. At this point // they just complicate the API and should be removed. @@ -90,8 +95,8 @@ func (l *runLoop) run() { // standalone helper function to allow testing of loop invariants. func (l *runLoop) runIteration() { var pushChan chan pushRequest - // Push requests are enabled if the queue isn't yet full. - if l.eventCount < len(l.broker.buf) { + // Push requests are enabled if the queue isn't full or closing. 
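	// Leaving pushChan nil when the queue is full or closing means the
	// select below never receives producer requests in that state, since
	// receiving from a nil channel blocks forever.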
+ if l.eventCount < len(l.broker.buf) && !l.closing { pushChan = l.broker.pushChan } @@ -116,7 +121,13 @@ func (l *runLoop) runIteration() { } select { + case <-l.broker.closeChan: + l.closing = true + // Get requests are handled immediately during shutdown + l.maybeUnblockGetRequest() + case <-l.broker.ctx.Done(): + // The queue is fully shut down, do nothing return case req := <-pushChan: // producer pushing new event @@ -154,8 +165,8 @@ func (l *runLoop) handleGetRequest(req *getRequest) { } func (l *runLoop) getRequestShouldBlock(req *getRequest) bool { - if l.broker.settings.FlushTimeout <= 0 { - // Never block if the flush timeout isn't positive + if l.broker.settings.FlushTimeout <= 0 || l.closing { + // Never block if the flush timeout isn't positive, or during shutdown return false } eventsAvailable := l.eventCount - l.consumedCount @@ -198,6 +209,10 @@ func (l *runLoop) handleDelete(count int) { l.eventCount -= count l.consumedCount -= count l.broker.observer.RemoveEvents(count, byteCount) + if l.closing && l.eventCount == 0 { + // Our last events were acknowledged during shutdown, signal final shutdown + l.broker.ctxCancel() + } } func (l *runLoop) handleInsert(req *pushRequest) { @@ -217,8 +232,7 @@ func (l *runLoop) maybeUnblockGetRequest() { // If a get request is blocked waiting for more events, check if // we should unblock it. if getRequest := l.pendingGetRequest; getRequest != nil { - available := l.eventCount - l.consumedCount - if available >= getRequest.entryCount { + if !l.getRequestShouldBlock(getRequest) { l.pendingGetRequest = nil if !l.getTimer.Stop() { <-l.getTimer.C diff --git a/libbeat/publisher/queue/queue.go b/libbeat/publisher/queue/queue.go index 953c1fee83ee..075d7ad66a46 100644 --- a/libbeat/publisher/queue/queue.go +++ b/libbeat/publisher/queue/queue.go @@ -37,8 +37,14 @@ type Entry interface{} // consumer or flush to some other intermediate storage), it will send an ACK signal // with the number of ACKed events to the Producer (ACK happens in batches). type Queue interface { + // Close signals the queue to shut down, but it may keep handling requests + // and acknowledgments for events that are already in progress. Close() error + // Done returns a channel that unblocks when the queue is closed and all + // its events are persisted or acknowledged. + Done() <-chan struct{} + QueueType() string BufferConfig() BufferConfig From afa3793c8cdfcdbe3225fc13bb55af859072a8a3 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Thu, 30 May 2024 13:44:37 -0400 Subject: [PATCH 91/99] shadow acked var the simple way --- libbeat/publisher/queue/monitoring.go | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/libbeat/publisher/queue/monitoring.go b/libbeat/publisher/queue/monitoring.go index ea0e7f0e8133..71d23ec03e39 100644 --- a/libbeat/publisher/queue/monitoring.go +++ b/libbeat/publisher/queue/monitoring.go @@ -52,6 +52,13 @@ type queueObserver struct { filledEvents *monitoring.Uint // gauge filledBytes *monitoring.Uint // gauge filledPct *monitoring.Float // gauge + + // backwards compatibility: the metric "acked" is the old name for + // "removed.events". Ideally we would like to define an alias in the + // monitoring API, but until that's possible we shadow it with this + // extra variable and make sure to always change removedEvents and + // acked at the same time. 
+ acked *monitoring.Uint } type nilObserver struct{} @@ -84,13 +91,10 @@ func NewQueueObserver(metrics *monitoring.Registry) Observer { filledEvents: monitoring.NewUint(queueMetrics, "filled.events"), // gauge filledBytes: monitoring.NewUint(queueMetrics, "filled.bytes"), // gauge filledPct: monitoring.NewFloat(queueMetrics, "filled.pct"), // gauge - } - // Backwards compatibility: "queue.acked" represents the same value as - // "queue.removed.events", when the queue had no other metrics variables - // and didn't support byte measurements. We keep a copy of it under the - // old name to avoid breaking dashboards that used it. - monitoring.AliasVar(queueMetrics, "removed.events", "acked") + // backwards compatibility: "acked" is an alias for "removed.events". + acked: monitoring.NewUint(queueMetrics, "acked"), + } return ob } @@ -124,6 +128,7 @@ func (ob *queueObserver) ConsumeEvents(eventCount int, byteCount int) { func (ob *queueObserver) RemoveEvents(eventCount int, byteCount int) { ob.removedEvents.Add(uint64(eventCount)) + ob.acked.Add(uint64(eventCount)) ob.removedBytes.Add(uint64(byteCount)) ob.filledEvents.Sub(uint64(eventCount)) From 5a158ec690ea4e2c09cfadb864e6ba9c0399fa49 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Thu, 30 May 2024 15:59:58 -0400 Subject: [PATCH 92/99] memqueue uses event or byte limits, not both --- libbeat/publisher/queue/memqueue/broker.go | 18 ++++------- libbeat/publisher/queue/memqueue/runloop.go | 34 +++++++++------------ 2 files changed, 20 insertions(+), 32 deletions(-) diff --git a/libbeat/publisher/queue/memqueue/broker.go b/libbeat/publisher/queue/memqueue/broker.go index a66a22ff97a5..d6fcdbf8d9ed 100644 --- a/libbeat/publisher/queue/memqueue/broker.go +++ b/libbeat/publisher/queue/memqueue/broker.go @@ -46,9 +46,6 @@ type broker struct { ctx context.Context ctxCancel context.CancelFunc - // wait group for queue workers (runLoop and ackLoop) - wg sync.WaitGroup - // The factory used to create an event encoder when creating a producer encoderFactory queue.EncoderFactory @@ -168,15 +165,8 @@ func NewQueue( b := newQueue(logger, observer, settings, inputQueueSize, encoderFactory) // Start the queue workers - b.wg.Add(2) - go func() { - defer b.wg.Done() - b.runLoop.run() - }() - go func() { - defer b.wg.Done() - b.ackLoop.run() - }() + go b.runLoop.run() + go b.ackLoop.run() return b } @@ -283,6 +273,10 @@ func (b *broker) Get(count int, bytes int) (queue.Batch, error) { return resp, nil } +func (b *broker) useByteLimits() bool { + return b.settings.Bytes > 0 +} + var batchPool = sync.Pool{ New: func() interface{} { return &batch{ diff --git a/libbeat/publisher/queue/memqueue/runloop.go b/libbeat/publisher/queue/memqueue/runloop.go index a62257316b96..247b2cedbc2e 100644 --- a/libbeat/publisher/queue/memqueue/runloop.go +++ b/libbeat/publisher/queue/memqueue/runloop.go @@ -177,9 +177,9 @@ func (l *runLoop) runIteration() { } func (l *runLoop) handleGetRequest(req *getRequest) { - // Backwards compatibility: if all byte parameters are <= 0, get requests + // Backwards compatibility: when using event-based limits, get requests // are capped by settings.MaxGetRequest. 
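From a consumer's point of view the either/or limits show up in how Get requests are sized. A hypothetical fragment against the queue interfaces (a sketch, not code from this patch), where only one of the two arguments is meaningful depending on the queue's mode:

[source,go]
------------------------------------------------------------------------------
package example

import (
	"github.com/elastic/beats/v7/libbeat/publisher/queue"
)

// drain reads batches until the queue shuts down (Get returns an error,
// io.EOF in the memory queue's case).
func drain(q queue.Queue, useByteLimits bool) error {
	for {
		var batch queue.Batch
		var err error
		if useByteLimits {
			batch, err = q.Get(0, 1<<20) // request up to ~1 MiB of events
		} else {
			batch, err = q.Get(512, 0) // request up to 512 events
		}
		if err != nil {
			return err
		}
		for i := 0; i < batch.Count(); i++ {
			_ = batch.Entry(i) // hand each event to the output
		}
		batch.Done() // acknowledge so the queue can delete the events
	}
}
------------------------------------------------------------------------------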
- if req.byteCount <= 0 && l.broker.settings.Bytes <= 0 { + if !l.broker.useByteLimits() { if req.entryCount > l.broker.settings.MaxGetRequest { req.entryCount = l.broker.settings.MaxGetRequest } @@ -198,17 +198,15 @@ func (l *runLoop) getRequestShouldBlock(req *getRequest) bool { // Never block if the flush timeout isn't positive, or during shutdown return false } - availableEntries := l.eventCount - l.consumedEventCount - availableBytes := l.byteCount - l.consumedByteCount // The entry/byte limits are satisfied if they are <= 0 (indicating no // limit) or if we have at least the requested number available. - entriesSatisfied := req.entryCount <= 0 || availableEntries >= req.entryCount - bytesSatisfied := req.byteCount <= 0 || availableBytes >= req.byteCount - - // Block if there are neither enough entries nor enough bytes to fill - // the request. - return !entriesSatisfied && !bytesSatisfied + if l.broker.useByteLimits() { + availableBytes := l.byteCount - l.consumedByteCount + return req.byteCount <= 0 || availableBytes >= req.byteCount + } + availableEntries := l.eventCount - l.consumedEventCount + return req.entryCount <= 0 || availableEntries >= req.entryCount } // Respond to the given get request without blocking or waiting for more events @@ -297,18 +295,14 @@ func (l *runLoop) handlePushRequest(req pushRequest) { } // Returns true if the given push request can be added to the queue -// without exceeding entry count or byte limits +// without exceeding the entry count or byte limit. func (l *runLoop) canFitPushRequest(req pushRequest) bool { - maxEvents := l.broker.settings.Events - maxBytes := l.broker.settings.Bytes - + if l.broker.useByteLimits() { + newByteCount := l.byteCount + req.eventSize + return newByteCount <= l.broker.settings.Bytes + } newEventCount := l.eventCount + 1 - newByteCount := l.byteCount + req.eventSize - - eventCountFits := maxEvents <= 0 || newEventCount <= maxEvents - byteCountFits := maxBytes <= 0 || newByteCount <= maxBytes - - return eventCountFits && byteCountFits + return newEventCount <= l.broker.settings.Events } func (l *runLoop) maybeUnblockPushRequests() { From 24c5564a64f23631f54795eef539ece6354a1d61 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Thu, 30 May 2024 16:18:48 -0400 Subject: [PATCH 93/99] fix byte vs event logic --- libbeat/publisher/queue/memqueue/config.go | 14 ++++++++------ libbeat/publisher/queue/memqueue/runloop.go | 13 ++++++------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/libbeat/publisher/queue/memqueue/config.go b/libbeat/publisher/queue/memqueue/config.go index 52483f7689f1..c282650298b7 100644 --- a/libbeat/publisher/queue/memqueue/config.go +++ b/libbeat/publisher/queue/memqueue/config.go @@ -41,16 +41,17 @@ type config struct { const minQueueBytes = 32768 const minQueueEvents = 32 +const defaultMaxQueueEvents = 3200 func (c *config) Validate() error { if c.Bytes != nil && *c.Bytes < minQueueBytes { - return errors.New(fmt.Sprintf("queue byte size must be at least %v", minQueueBytes)) + return fmt.Errorf("queue byte size must be at least %v", minQueueBytes) } if c.Events != nil && *c.Events < minQueueEvents { - return errors.New(fmt.Sprintf("queue event size must be at least %v", minQueueEvents)) + return fmt.Errorf("queue event size must be at least %v", minQueueEvents) } - if c.Events == nil && c.Bytes == nil { - return errors.New("queue must have an event limit or a byte limit") + if c.Events != nil && c.Bytes != nil { + return errors.New("memory queue can only have an event limit or a byte 
limit, not both") } if c.Events != nil && c.MaxGetEvents > *c.Events { return errors.New("flush.min_events must be less than events") @@ -66,7 +67,7 @@ var defaultConfig = config{ // SettingsForUserConfig unpacks a ucfg config from a Beats queue // configuration and returns the equivalent memqueue.Settings object. func SettingsForUserConfig(cfg *c.C) (Settings, error) { - var config config + config := defaultConfig if cfg != nil { if err := cfg.Unpack(&config); err != nil { return Settings{}, fmt.Errorf("couldn't unpack memory queue config: %w", err) @@ -76,6 +77,7 @@ func SettingsForUserConfig(cfg *c.C) (Settings, error) { MaxGetRequest: config.MaxGetEvents, FlushTimeout: config.FlushTimeout, } + if config.Events != nil { result.Events = *config.Events } @@ -84,7 +86,7 @@ func SettingsForUserConfig(cfg *c.C) (Settings, error) { } // If no size constraint was given, fall back on the default event cap if config.Events == nil && config.Bytes == nil { - result.Events = 3200 + result.Events = defaultMaxQueueEvents } return result, nil } diff --git a/libbeat/publisher/queue/memqueue/runloop.go b/libbeat/publisher/queue/memqueue/runloop.go index 247b2cedbc2e..a43094632955 100644 --- a/libbeat/publisher/queue/memqueue/runloop.go +++ b/libbeat/publisher/queue/memqueue/runloop.go @@ -96,7 +96,7 @@ func newRunLoop(broker *broker) *runLoop { } eventBufSize := broker.settings.Events - if eventBufSize <= 0 { + if broker.useByteLimits() { // The queue is using byte limits, start with a buffer of 2^10 and // we will expand it as needed. eventBufSize = 1 << 10 @@ -212,9 +212,9 @@ func (l *runLoop) getRequestShouldBlock(req *getRequest) bool { // Respond to the given get request without blocking or waiting for more events func (l *runLoop) handleGetReply(req *getRequest) { entriesAvailable := l.eventCount - l.consumedEventCount - // backwards compatibility: if all byte bounds are <= 0 then batch size + // backwards compatibility: when using event-based limits, batch size // can't be more than settings.MaxGetRequest. - if req.byteCount <= 0 && l.broker.settings.Bytes <= 0 { + if l.broker.useByteLimits() { if entriesAvailable > l.broker.settings.MaxGetRequest { entriesAvailable = l.broker.settings.MaxGetRequest } @@ -343,10 +343,9 @@ func (l *runLoop) growEventBuffer() { // Insert the given new event without bounds checks, and report the result // to the caller via the push request's response channel. func (l *runLoop) doInsert(req pushRequest) { - maxEvents := l.broker.settings.Events - // If there is no event limit, check if we need to grow the current queue - // buffer to fit the new event. - if maxEvents <= 0 && l.eventCount >= l.buf.size() { + // If using byte limits (no hard limit on event count), check if we need to + // grow the current queue buffer to fit the new event. 
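One way such growth can work (illustrative only; the real growEventBuffer is not shown in this hunk and may differ) is to double the backing array while keeping absolute entry indices valid, since a slot is always addressed as the index modulo the buffer length:

[source,go]
------------------------------------------------------------------------------
package main

import "fmt"

// queueEntry is a hypothetical stand-in for the queue's real entry type.
type queueEntry struct{ event any }

// grow doubles a ring buffer holding `count` live entries that start at
// absolute index `first`, re-homing each entry at its new modulo position.
func grow(buf []queueEntry, first, count int) []queueEntry {
	newBuf := make([]queueEntry, 2*len(buf))
	for i := 0; i < count; i++ {
		idx := first + i
		newBuf[idx%len(newBuf)] = buf[idx%len(buf)]
	}
	return newBuf
}

func main() {
	buf := make([]queueEntry, 4)
	for i := 5; i < 9; i++ { // absolute indices 5..8 wrap around the array
		buf[i%len(buf)] = queueEntry{event: i}
	}
	buf = grow(buf, 5, 4)
	fmt.Println(buf[5%len(buf)].event, buf[8%len(buf)].event) // 5 8
}
------------------------------------------------------------------------------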
+ if l.broker.useByteLimits() && l.eventCount >= l.buf.size() { l.growEventBuffer() } From 3da6d32f428524635563faca5debb39efe35893e Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Thu, 30 May 2024 16:31:15 -0400 Subject: [PATCH 94/99] clean up FIFO handling --- libbeat/common/fifo/fifo.go | 10 ++++---- .../queue/memqueue/circular_buffer.go | 24 ------------------- libbeat/publisher/queue/memqueue/runloop.go | 7 ++---- 3 files changed, 8 insertions(+), 33 deletions(-) diff --git a/libbeat/common/fifo/fifo.go b/libbeat/common/fifo/fifo.go index c6cde4d491fd..10879ee2bb28 100644 --- a/libbeat/common/fifo/fifo.go +++ b/libbeat/common/fifo/fifo.go @@ -45,13 +45,15 @@ func (f *FIFO[T]) Empty() bool { return f.first == nil } -// Return the first value (if present) without removing it from the queue -func (f *FIFO[T]) First() (T, error) { +// Return the first value (if present) without removing it from the queue. +// Returns a default value if the queue is empty. To recognize this case, +// check (*FIFO).Empty(). +func (f *FIFO[T]) First() T { if f.first == nil { var none T - return none, errFIFOEmpty + return none } - return f.first.value, nil + return f.first.value } // Remove the first entry in the queue. Does nothing if the FIFO is empty. diff --git a/libbeat/publisher/queue/memqueue/circular_buffer.go b/libbeat/publisher/queue/memqueue/circular_buffer.go index 0b3e3ec8df56..c66a76e2e696 100644 --- a/libbeat/publisher/queue/memqueue/circular_buffer.go +++ b/libbeat/publisher/queue/memqueue/circular_buffer.go @@ -17,30 +17,6 @@ package memqueue -// the queue's underlying array buffer needs to coordinate concurrent -// access by: -// -// runLoop -// - when a pushRequest is accepted, writes to the newly created entry index. -// - when a producer is cancelled, reads and writes to entry indices that -// have been created but not yet consumed, to discard events from that -// producer. -// - when entries are deleted (after consumed events have been -// acknowledged), reads from the deleted entry indices. -// - when a pushRequest requires resizing of the array, expands and/or -// replaces the buffer. -// -// the queue's consumer (in a live Beat this means queueReader in -// libbeat/publisher/pipeline/queue_reader.go) which reads from entry -// indices that have been consumed but not deleted via (*batch).Entry(). -// -// ackLoop, which reads producer metadata from acknowledged entry -// indices before they are deleted so acknowledgment callbacks can be -// invoked. -// -// Most of these are not in conflict since they access disjoint array indices. -// The exception is growing the circular buffer, which conflicts with read -// access from batches of consumed events. type circularBuffer struct { // Do not access this array directly! use (circularBuffer).entry(). 
_entries []queueEntry diff --git a/libbeat/publisher/queue/memqueue/runloop.go b/libbeat/publisher/queue/memqueue/runloop.go index a43094632955..27881351c444 100644 --- a/libbeat/publisher/queue/memqueue/runloop.go +++ b/libbeat/publisher/queue/memqueue/runloop.go @@ -306,16 +306,13 @@ func (l *runLoop) canFitPushRequest(req pushRequest) bool { } func (l *runLoop) maybeUnblockPushRequests() { - req, err := l.pendingPushRequests.First() - for err == nil { + for !l.pendingPushRequests.Empty() { + req := l.pendingPushRequests.First() if !l.canFitPushRequest(req) { break } l.doInsert(req) l.pendingPushRequests.Remove() - - // Fetch the next request - req, err = l.pendingPushRequests.First() } } From ac94a2b81119c91e0f2cbc57945c6358ee346021 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Thu, 30 May 2024 17:08:38 -0400 Subject: [PATCH 95/99] replace batchList implementation with FIFO helper --- libbeat/common/fifo/fifo.go | 21 ++++ libbeat/publisher/queue/memqueue/ackloop.go | 65 +++++----- libbeat/publisher/queue/memqueue/broker.go | 117 ++---------------- .../publisher/queue/memqueue/internal_api.go | 2 +- libbeat/publisher/queue/memqueue/runloop.go | 4 +- 5 files changed, 68 insertions(+), 141 deletions(-) diff --git a/libbeat/common/fifo/fifo.go b/libbeat/common/fifo/fifo.go index 10879ee2bb28..8b592e94c079 100644 --- a/libbeat/common/fifo/fifo.go +++ b/libbeat/common/fifo/fifo.go @@ -56,6 +56,27 @@ func (f *FIFO[T]) First() T { return f.first.value } +// Remove the first entry in this FIFO and return it. +func (f *FIFO[T]) ConsumeFirst() T { + result := f.First() + f.Remove() + return result +} + +// Append another FIFO queue to an existing one. Takes ownership of +// the given FIFO's contents. +func (f *FIFO[T]) Concat(list FIFO[T]) { + if list.Empty() { + return + } + if f.Empty() { + *f = list + return + } + f.last.next = list.first + f.last = list.last +} + // Remove the first entry in the queue. Does nothing if the FIFO is empty. func (f *FIFO[T]) Remove() { if f.first != nil { diff --git a/libbeat/publisher/queue/memqueue/ackloop.go b/libbeat/publisher/queue/memqueue/ackloop.go index b544940022f9..94a02ba2e4ab 100644 --- a/libbeat/publisher/queue/memqueue/ackloop.go +++ b/libbeat/publisher/queue/memqueue/ackloop.go @@ -37,7 +37,10 @@ func newACKLoop(broker *broker) *ackLoop { func (l *ackLoop) run() { b := l.broker for { - nextBatchChan := l.pendingBatches.nextBatchChannel() + var nextBatchChan chan batchDoneMsg + if !l.pendingBatches.Empty() { + nextBatchChan = l.pendingBatches.First().doneChan + } select { case <-b.ctx.Done(): @@ -46,7 +49,7 @@ func (l *ackLoop) run() { case chanList := <-b.consumedChan: // New batches have been generated, add them to the pending list - l.pendingBatches.concat(&chanList) + l.pendingBatches.Concat(chanList) case <-nextBatchChan: // The oldest outstanding batch has been acknowledged, advance our @@ -58,43 +61,30 @@ func (l *ackLoop) run() { // handleBatchSig collects and handles a batch ACK/Cancel signal. handleBatchSig // is run by the ackLoop. 
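The generic FIFO helper extended above is small enough to show in use. An illustrative sketch (outside the patch) exercising the operations the ack loop now relies on:

[source,go]
------------------------------------------------------------------------------
package main

import (
	"fmt"

	"github.com/elastic/beats/v7/libbeat/common/fifo"
)

func main() {
	var pending fifo.FIFO[int]
	pending.Add(1)
	pending.Add(2)

	var newlyConsumed fifo.FIFO[int]
	newlyConsumed.Add(3)

	// Concat appends and takes ownership of the other list's contents.
	pending.Concat(newlyConsumed)

	fmt.Println(pending.First()) // peek: 1 (still in the queue)
	for !pending.Empty() {
		fmt.Println(pending.ConsumeFirst()) // pops 1, 2, 3 in order
	}
}
------------------------------------------------------------------------------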
-func (l *ackLoop) handleBatchSig() int { +func (l *ackLoop) handleBatchSig() { ackedBatches := l.collectAcked() - count := 0 - for batch := ackedBatches.front(); batch != nil; batch = batch.next { - count += batch.count - } - - if count > 0 { + if !ackedBatches.Empty() { // report acks to waiting clients - l.processACK(ackedBatches, count) - } - - for !ackedBatches.empty() { - // Release finished batch structs into the shared memory pool - releaseBatch(ackedBatches.pop()) + l.processACK(ackedBatches) } - - // return final ACK to EventLoop, in order to clean up internal buffer - l.broker.logger.Debug("ackloop: return ack to broker loop:", count) - - l.broker.logger.Debug("ackloop: done send ack") - return count } func (l *ackLoop) collectAcked() batchList { ackedBatches := batchList{} - acks := l.pendingBatches.pop() - ackedBatches.append(acks) + // The first batch is always included, since that's what triggered the call + // to collectAcked. + nextBatch := l.pendingBatches.ConsumeFirst() + ackedBatches.Add(nextBatch) done := false - for !l.pendingBatches.empty() && !done { - acks := l.pendingBatches.front() + for !l.pendingBatches.Empty() && !done { + nextBatch = l.pendingBatches.First() select { - case <-acks.doneChan: - ackedBatches.append(l.pendingBatches.pop()) + case <-nextBatch.doneChan: + ackedBatches.Add(nextBatch) + l.pendingBatches.Remove() default: done = true @@ -107,16 +97,22 @@ func (l *ackLoop) collectAcked() batchList { // Called by ackLoop. This function exists to decouple the work of collecting // and running producer callbacks from logical deletion of the events, so // input callbacks can't block the queue by occupying the runLoop goroutine. -func (l *ackLoop) processACK(lst batchList, N int) { +func (l *ackLoop) processACK(lst batchList) { ackCallbacks := []func(){} + batches := []batch{} + for !lst.Empty() { + batches = append(batches, lst.First()) + lst.Remove() + } // First we traverse the entries we're about to remove, collecting any callbacks // we need to run. - lst.reverse() - for !lst.empty() { - batch := lst.pop() + // Traverse entries from last to first, so we can acknowledge the most recent + // ones first and skip repeated producer callbacks. + eventCount := 0 + for batchIndex := len(batches) - 1; batchIndex >= 0; batchIndex-- { + batch := batches[batchIndex] + eventCount += batch.count - // Traverse entries from last to first, so we can acknowledge the most recent - // ones first and skip subsequent producer callbacks. for i := batch.count - 1; i >= 0; i-- { entry := batch.entry(i) if entry.producer == nil { @@ -136,7 +132,8 @@ func (l *ackLoop) processACK(lst batchList, N int) { } } // Signal runLoop to delete the events - l.broker.deleteChan <- N + l.broker.deleteChan <- eventCount + l.broker.logger.Debug("ackloop: return ack to broker loop:", eventCount) // The events have been removed; notify their listeners. for _, f := range ackCallbacks { diff --git a/libbeat/publisher/queue/memqueue/broker.go b/libbeat/publisher/queue/memqueue/broker.go index d6fcdbf8d9ed..29b0c38f2acb 100644 --- a/libbeat/publisher/queue/memqueue/broker.go +++ b/libbeat/publisher/queue/memqueue/broker.go @@ -20,9 +20,9 @@ package memqueue import ( "context" "io" - "sync" "time" + "github.com/elastic/beats/v7/libbeat/common/fifo" "github.com/elastic/beats/v7/libbeat/publisher/queue" "github.com/elastic/elastic-agent-libs/logp" ) @@ -117,9 +117,6 @@ type batch struct { // to be valid). 
queueBuf circularBuffer - // Next batch in the containing batchList - next *batch - // Position of the batch's events within the queue. This is an absolute // index over the lifetime of the queue, to get the position within the // queue's current circular buffer, use (start % len(queue.buf)). @@ -133,10 +130,7 @@ type batch struct { doneChan chan batchDoneMsg } -type batchList struct { - head *batch - tail *batch -} +type batchList = fifo.FIFO[batch] // FactoryForSettings is a simple wrapper around NewQueue so a concrete // Settings object can be wrapped in a queue-agnostic interface for @@ -260,7 +254,7 @@ func (b *broker) Producer(cfg queue.ProducerConfig) queue.Producer { } func (b *broker) Get(count int, bytes int) (queue.Batch, error) { - responseChan := make(chan *batch, 1) + responseChan := make(chan batch, 1) select { case <-b.ctx.Done(): return nil, io.EOF @@ -277,93 +271,12 @@ func (b *broker) useByteLimits() bool { return b.settings.Bytes > 0 } -var batchPool = sync.Pool{ - New: func() interface{} { - return &batch{ - doneChan: make(chan batchDoneMsg, 1), - } - }, -} - -func newBatch(queueBuf circularBuffer, start entryIndex, count int) *batch { - batch := batchPool.Get().(*batch) - batch.next = nil - batch.queueBuf = queueBuf - batch.start = start - batch.count = count - return batch -} - -func releaseBatch(b *batch) { - b.next = nil - batchPool.Put(b) -} - -func (l *batchList) prepend(b *batch) { - b.next = l.head - l.head = b - if l.tail == nil { - l.tail = b - } -} - -func (l *batchList) concat(other *batchList) { - if other.head == nil { - return - } - - if l.head == nil { - *l = *other - return - } - - l.tail.next = other.head - l.tail = other.tail -} - -func (l *batchList) append(b *batch) { - if l.head == nil { - l.head = b - } else { - l.tail.next = b - } - l.tail = b -} - -func (l *batchList) empty() bool { - return l.head == nil -} - -func (l *batchList) front() *batch { - return l.head -} - -func (l *batchList) nextBatchChannel() chan batchDoneMsg { - if l.head == nil { - return nil - } - return l.head.doneChan -} - -func (l *batchList) pop() *batch { - ch := l.head - if ch != nil { - l.head = ch.next - if l.head == nil { - l.tail = nil - } - } - - ch.next = nil - return ch -} - -func (l *batchList) reverse() { - tmp := *l - *l = batchList{} - - for !tmp.empty() { - l.prepend(tmp.pop()) +func newBatch(queueBuf circularBuffer, start entryIndex, count int) batch { + return batch{ + doneChan: make(chan batchDoneMsg, 1), + queueBuf: queueBuf, + start: start, + count: count, } } @@ -379,25 +292,21 @@ func AdjustInputQueueSize(requested, mainQueueSize int) (actual int) { return actual } -func (b *batch) Count() int { +func (b batch) Count() int { return b.count } -func (ei entryIndex) inBuffer(buf []queueEntry) *queueEntry { - return &buf[int(ei)%len(buf)] -} - // Return a pointer to the queueEntry for the i-th element of this batch -func (b *batch) entry(i int) *queueEntry { +func (b batch) entry(i int) *queueEntry { entryIndex := b.start.plus(i) return b.queueBuf.entry(entryIndex) } // Return the event referenced by the i-th element of this batch -func (b *batch) Entry(i int) queue.Entry { +func (b batch) Entry(i int) queue.Entry { return b.entry(i).event } -func (b *batch) Done() { +func (b batch) Done() { b.doneChan <- batchDoneMsg{} } diff --git a/libbeat/publisher/queue/memqueue/internal_api.go b/libbeat/publisher/queue/memqueue/internal_api.go index 9ecd634dfc8b..881d2a826ee1 100644 --- a/libbeat/publisher/queue/memqueue/internal_api.go +++ 
b/libbeat/publisher/queue/memqueue/internal_api.go @@ -53,7 +53,7 @@ type getRequest struct { byteCount int // The channel to send the new batch to. - responseChan chan *batch + responseChan chan batch } type batchDoneMsg struct{} diff --git a/libbeat/publisher/queue/memqueue/runloop.go b/libbeat/publisher/queue/memqueue/runloop.go index 27881351c444..a01b5a09e0fd 100644 --- a/libbeat/publisher/queue/memqueue/runloop.go +++ b/libbeat/publisher/queue/memqueue/runloop.go @@ -134,7 +134,7 @@ func (l *runLoop) runIteration() { var consumedChan chan batchList // Enable sending to the scheduled ACKs channel if we have // something to send. - if !l.consumedBatches.empty() { + if !l.consumedBatches.Empty() { consumedChan = l.broker.consumedChan } @@ -244,7 +244,7 @@ func (l *runLoop) handleGetReply(req *getRequest) { // Send the batch to the caller and update internal state req.responseChan <- batch - l.consumedBatches.append(batch) + l.consumedBatches.Add(batch) l.consumedEventCount += batchEntryCount l.consumedByteCount += batchByteCount l.broker.observer.ConsumeEvents(batchEntryCount, batchByteCount) From d1ba6a1a334dbb70295107371859b7cec5279742 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Thu, 30 May 2024 17:47:43 -0400 Subject: [PATCH 96/99] remove unrelated test change --- x-pack/filebeat/input/awss3/input_test.go | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/x-pack/filebeat/input/awss3/input_test.go b/x-pack/filebeat/input/awss3/input_test.go index 83015c1661be..432bd360bfc6 100644 --- a/x-pack/filebeat/input/awss3/input_test.go +++ b/x-pack/filebeat/input/awss3/input_test.go @@ -5,7 +5,6 @@ package awss3 import ( - "context" "errors" "testing" @@ -155,12 +154,3 @@ func TestRegionSelection(t *testing.T) { }) } } - -func newV2Context() (v2.Context, func()) { - ctx, cancel := context.WithCancel(context.Background()) - return v2.Context{ - Logger: logp.NewLogger("awss3_test"), - ID: "test_id", - Cancelation: ctx, - }, cancel -} From 91e4534bf84b6ac7daf4c399e27d234b5aeae79d Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Thu, 30 May 2024 17:51:48 -0400 Subject: [PATCH 97/99] remove unused error --- libbeat/common/fifo/fifo.go | 4 ---- 1 file changed, 4 deletions(-) diff --git a/libbeat/common/fifo/fifo.go b/libbeat/common/fifo/fifo.go index 8b592e94c079..03d220186dbd 100644 --- a/libbeat/common/fifo/fifo.go +++ b/libbeat/common/fifo/fifo.go @@ -17,10 +17,6 @@ package fifo -import "errors" - -var errFIFOEmpty = errors.New("tried to read from an empty FIFO queue") - type FIFO[T any] struct { first *node[T] last *node[T] From b585727d396554204ea418e6490d7670641b6926 Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Tue, 18 Jun 2024 16:41:42 -0400 Subject: [PATCH 98/99] fix merge + tests --- libbeat/publisher/queue/memqueue/runloop.go | 16 +++++++++++----- libbeat/publisher/queue/memqueue/runloop_test.go | 3 +++ 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/libbeat/publisher/queue/memqueue/runloop.go b/libbeat/publisher/queue/memqueue/runloop.go index ae9d6fd67ec7..7c340064ba6c 100644 --- a/libbeat/publisher/queue/memqueue/runloop.go +++ b/libbeat/publisher/queue/memqueue/runloop.go @@ -18,7 +18,6 @@ package memqueue import ( - "fmt" "time" "github.com/elastic/beats/v7/libbeat/common/fifo" @@ -153,8 +152,9 @@ func (l *runLoop) runIteration() { select { case <-l.broker.closeChan: l.closing = true - // Get requests are handled immediately during shutdown + // Get and push requests are handled immediately during shutdown l.maybeUnblockGetRequest() + 
l.maybeUnblockPushRequests() case <-l.broker.ctx.Done(): // The queue is fully shut down, do nothing @@ -205,11 +205,10 @@ func (l *runLoop) getRequestShouldBlock(req *getRequest) bool { // limit) or if we have at least the requested number available. if l.broker.useByteLimits() { availableBytes := l.byteCount - l.consumedByteCount - return req.byteCount <= 0 || availableBytes >= req.byteCount + return req.byteCount > 0 && availableBytes < req.byteCount } availableEntries := l.eventCount - l.consumedEventCount - fmt.Printf("hi fae, getRequestShouldBlock for %v entries while there are %v available\n", req.entryCount, availableEntries) - return req.entryCount <= 0 || availableEntries >= req.entryCount + return req.entryCount > 0 && availableEntries < req.entryCount } // Respond to the given get request without blocking or waiting for more events @@ -316,7 +315,14 @@ func (l *runLoop) canFitPushRequest(req pushRequest) bool { func (l *runLoop) maybeUnblockPushRequests() { for !l.pendingPushRequests.Empty() { req := l.pendingPushRequests.First() + if l.closing { + // If the queue is closing, reject all pending requests + req.resp <- false + continue + } if !l.canFitPushRequest(req) { + // We're out of space, the rest of the blocked requests will have + // to wait. break } l.doInsert(req) diff --git a/libbeat/publisher/queue/memqueue/runloop_test.go b/libbeat/publisher/queue/memqueue/runloop_test.go index 8d606184c63d..0483105bdfdb 100644 --- a/libbeat/publisher/queue/memqueue/runloop_test.go +++ b/libbeat/publisher/queue/memqueue/runloop_test.go @@ -124,10 +124,12 @@ func TestObserverAddEvent(t *testing.T) { rl := &runLoop{ observer: queue.NewQueueObserver(reg), buf: newCircularBuffer(100), + broker: &broker{}, } request := pushRequest{ event: publisher.Event{}, eventSize: 123, + resp: make(chan bool, 1), } rl.doInsert(request) assertRegistryUint(t, reg, "queue.added.events", 1, "Queue insert should report added event") @@ -143,6 +145,7 @@ func TestObserverConsumeEvents(t *testing.T) { observer: queue.NewQueueObserver(reg), buf: newCircularBuffer(bufSize), eventCount: 50, + broker: &broker{}, } // Initialize the queue entries to a test byte size for i := 0; i < bufSize; i++ { From 3643a8f29aed1e2140ebc636661cc0f01cc64d2d Mon Sep 17 00:00:00 2001 From: Fae Charlton Date: Tue, 18 Jun 2024 18:02:55 -0400 Subject: [PATCH 99/99] Add docs / parameter checks --- libbeat/docs/queueconfig.asciidoc | 23 +++++++++++++++++ libbeat/publisher/pipeline/client_test.go | 6 +++-- libbeat/publisher/pipeline/controller.go | 4 ++- libbeat/publisher/queue/memqueue/broker.go | 25 ++++++++++++------- .../publisher/queue/memqueue/queue_test.go | 9 ++++--- .../publisher/queue/memqueue/runloop_test.go | 6 +++-- 6 files changed, 56 insertions(+), 17 deletions(-) diff --git a/libbeat/docs/queueconfig.asciidoc b/libbeat/docs/queueconfig.asciidoc index 499ab9d46672..3bd0d04456a2 100644 --- a/libbeat/docs/queueconfig.asciidoc +++ b/libbeat/docs/queueconfig.asciidoc @@ -67,6 +67,17 @@ queue.mem: flush.timeout: 5s ------------------------------------------------------------------------------ +Here is an alternate configuration that measures queue size in bytes rather +than event count. 
In this case, the output must set `bulk_max_bytes` +instead of `bulk_max_size` to control the batch size: + +[source,yaml] +------------------------------------------------------------------------------ +queue.mem: + bytes: 32MB + flush.timeout: 10s +------------------------------------------------------------------------------ + [float] === Configuration options @@ -80,6 +91,16 @@ Number of events the queue can store. The default value is 3200 events. +[float] +[[queue-mem-bytes-option]] +===== `bytes` + +Number of bytes the queue can store. This option is only available for outputs +that support byte-based event buffers (currently just the Elasticsearch output). +The queue should set either `events` or `bytes` but not both. + +The default is 0, indicating the queue should use the `events` limit instead. + [float] [[queue-mem-flush-min-events-option]] ===== `flush.min_events` @@ -92,6 +113,8 @@ publishing. If 0 or 1, sets the maximum number of events per batch to half the queue size, and sets the queue to synchronous mode (equivalent to `flush.timeout` of 0). +This value is ignored when `bytes` is set. + The default value is 1600. [float] diff --git a/libbeat/publisher/pipeline/client_test.go b/libbeat/publisher/pipeline/client_test.go index 21c09dad8901..7e1d1c4b3be4 100644 --- a/libbeat/publisher/pipeline/client_test.go +++ b/libbeat/publisher/pipeline/client_test.go @@ -88,11 +88,12 @@ func TestClient(t *testing.T) { l := logp.L() // a small in-memory queue with a very short flush interval - q := memqueue.NewQueue(l, nil, memqueue.Settings{ + q, err := memqueue.NewQueue(l, nil, memqueue.Settings{ Events: 5, MaxGetRequest: 1, FlushTimeout: time.Millisecond, }, 5, nil) + require.NoError(t, err, "Queue creation must succeed") // model a processor that we're going to make produce errors after p := &testProcessor{} @@ -201,7 +202,8 @@ func TestClientWaitClose(t *testing.T) { } logp.TestingSetup() - q := memqueue.NewQueue(logp.L(), nil, memqueue.Settings{Events: 1}, 0, nil) + q, err := memqueue.NewQueue(logp.L(), nil, memqueue.Settings{Events: 1}, 0, nil) + require.NoError(t, err, "Queue creation must succeed") pipeline := makePipeline(Settings{}, q) defer pipeline.Close() diff --git a/libbeat/publisher/pipeline/controller.go b/libbeat/publisher/pipeline/controller.go index ce276175c4ff..fc9285088060 100644 --- a/libbeat/publisher/pipeline/controller.go +++ b/libbeat/publisher/pipeline/controller.go @@ -286,7 +286,9 @@ func (c *outputController) createQueueIfNeeded(outGrp outputs.Group) { if err != nil { logger.Errorf("queue creation failed, falling back to default memory queue, check your queue configuration") s, _ := memqueue.SettingsForUserConfig(nil) - queue = memqueue.NewQueue(logger, queueObserver, s, c.inputQueueSize, outGrp.EncoderFactory) + // Memqueue creation can only fail when it's configured for byte-based limits, + // so we don't need to handle the fallback error. 
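For reference, the only way memqueue.NewQueue can now fail is the byte-limits-without-encoder case that the comment above relies on. A small hypothetical sketch of triggering it:

[source,go]
------------------------------------------------------------------------------
package main

import (
	"fmt"

	"github.com/elastic/beats/v7/libbeat/publisher/queue/memqueue"
)

func main() {
	// Byte-based limits with no encoder factory (the final nil argument)
	// is the one configuration NewQueue rejects.
	settings := memqueue.Settings{Bytes: 64 * 1024 * 1024}
	if _, err := memqueue.NewQueue(nil, nil, settings, 0, nil); err != nil {
		fmt.Println("queue creation failed:", err)
	}
}
------------------------------------------------------------------------------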
+ queue, _ = memqueue.NewQueue(logger, queueObserver, s, c.inputQueueSize, outGrp.EncoderFactory) } c.queue = queue diff --git a/libbeat/publisher/queue/memqueue/broker.go b/libbeat/publisher/queue/memqueue/broker.go index f0e9c3f1afb2..800b95fa252b 100644 --- a/libbeat/publisher/queue/memqueue/broker.go +++ b/libbeat/publisher/queue/memqueue/broker.go @@ -19,6 +19,7 @@ package memqueue import ( "context" + "errors" "io" "time" @@ -142,7 +143,7 @@ func FactoryForSettings(settings Settings) queue.QueueFactory { inputQueueSize int, encoderFactory queue.EncoderFactory, ) (queue.Queue, error) { - return NewQueue(logger, observer, settings, inputQueueSize, encoderFactory), nil + return NewQueue(logger, observer, settings, inputQueueSize, encoderFactory) } } @@ -155,14 +156,16 @@ func NewQueue( settings Settings, inputQueueSize int, encoderFactory queue.EncoderFactory, -) *broker { - b := newQueue(logger, observer, settings, inputQueueSize, encoderFactory) +) (*broker, error) { + b, err := newQueue(logger, observer, settings, inputQueueSize, encoderFactory) - // Start the queue workers - go b.runLoop.run() - go b.ackLoop.run() + if err == nil { + // Start the queue workers + go b.runLoop.run() + go b.ackLoop.run() + } - return b + return b, err } // newQueue does most of the work of creating a queue from the given @@ -175,7 +178,7 @@ func newQueue( settings Settings, inputQueueSize int, encoderFactory queue.EncoderFactory, -) *broker { +) (*broker, error) { if observer == nil { observer = queue.NewQueueObserver(nil) } @@ -190,6 +193,10 @@ func newQueue( settings.MaxGetRequest = (settings.Events + 1) / 2 } + if settings.Bytes > 0 && encoderFactory == nil { + return nil, errors.New("queue.mem.bytes is set but the output doesn't support byte-based event buffers") + } + // Can't request more than the full queue if settings.Events > 0 && settings.MaxGetRequest > settings.Events { settings.MaxGetRequest = settings.Events @@ -222,7 +229,7 @@ func newQueue( observer.MaxEvents(settings.Events) - return b + return b, nil } func (b *broker) Close() error { diff --git a/libbeat/publisher/queue/memqueue/queue_test.go b/libbeat/publisher/queue/memqueue/queue_test.go index 44c081e30a59..5f4716a6d42b 100644 --- a/libbeat/publisher/queue/memqueue/queue_test.go +++ b/libbeat/publisher/queue/memqueue/queue_test.go @@ -86,12 +86,13 @@ func TestProduceConsumer(t *testing.T) { // than 2 events to it, p.Publish will block, once we call q.Close, // we ensure the 3rd event was not successfully published. 
func TestProducerDoesNotBlockWhenQueueClosed(t *testing.T) { - q := NewQueue(nil, nil, + q, err := NewQueue(nil, nil, Settings{ Events: 2, // Queue size MaxGetRequest: 1, // make sure the queue won't buffer events FlushTimeout: time.Millisecond, }, 0, nil) + require.NoError(t, err, "Queue creation must succeed") p := q.Producer(queue.ProducerConfig{ // We do not read from the queue, so the callbacks are never called @@ -156,12 +157,13 @@ func TestProducerClosePreservesEventCount(t *testing.T) { var activeEvents atomic.Int64 - q := NewQueue(nil, nil, + q, err := NewQueue(nil, nil, Settings{ Events: 3, // Queue size MaxGetRequest: 2, FlushTimeout: 10 * time.Millisecond, }, 1, nil) + require.NoError(t, err, "Queue creation must succeed") p := q.Producer(queue.ProducerConfig{ ACK: func(count int) { @@ -229,11 +231,12 @@ func TestProducerClosePreservesEventCount(t *testing.T) { func makeTestQueue(sz, minEvents int, flushTimeout time.Duration) queuetest.QueueFactory { return func(_ *testing.T) queue.Queue { - return NewQueue(nil, nil, Settings{ + q, _ := NewQueue(nil, nil, Settings{ Events: sz, MaxGetRequest: minEvents, FlushTimeout: flushTimeout, }, 0, nil) + return q } } diff --git a/libbeat/publisher/queue/memqueue/runloop_test.go b/libbeat/publisher/queue/memqueue/runloop_test.go index 0483105bdfdb..969aca466656 100644 --- a/libbeat/publisher/queue/memqueue/runloop_test.go +++ b/libbeat/publisher/queue/memqueue/runloop_test.go @@ -38,7 +38,7 @@ func TestFlushSettingsDoNotBlockFullBatches(t *testing.T) { // available. This test verifies that Get requests that can be completely // filled do not wait for the flush timer. - broker := newQueue( + broker, err := newQueue( logp.NewLogger("testing"), nil, Settings{ @@ -47,6 +47,7 @@ func TestFlushSettingsDoNotBlockFullBatches(t *testing.T) { FlushTimeout: 10 * time.Second, }, 10, nil) + require.NoError(t, err, "Queue creation must succeed") producer := newProducer(broker, nil, nil) rl := broker.runLoop @@ -77,7 +78,7 @@ func TestFlushSettingsBlockPartialBatches(t *testing.T) { // there are enough events. This one uses the same setup to confirm that // Get requests are delayed if there aren't enough events. - broker := newQueue( + broker, err := newQueue( logp.NewLogger("testing"), nil, Settings{ @@ -86,6 +87,7 @@ func TestFlushSettingsBlockPartialBatches(t *testing.T) { FlushTimeout: 10 * time.Second, }, 10, nil) + require.NoError(t, err, "Queue creation must succeed") producer := newProducer(broker, nil, nil) rl := broker.runLoop