diff --git a/x-pack/elastic-agent/CHANGELOG.asciidoc b/x-pack/elastic-agent/CHANGELOG.asciidoc index f62ab1759e4d..822e7b9b56a9 100644 --- a/x-pack/elastic-agent/CHANGELOG.asciidoc +++ b/x-pack/elastic-agent/CHANGELOG.asciidoc @@ -52,6 +52,7 @@ - Remove support for logs type and use logfile {pull}19761[19761] - Avoid comparing uncomparable types on enroll {issue}19976[19976] - Fix issues with merging of elastic-agent.yml and fleet.yml {pull}20026[20026] +- Improve GRPC stop to be more relaxed {pull}20118[20118] ==== New features diff --git a/x-pack/elastic-agent/pkg/core/server/server.go b/x-pack/elastic-agent/pkg/core/server/server.go index 4cd5c8386ec1..12885e2f0123 100644 --- a/x-pack/elastic-agent/pkg/core/server/server.go +++ b/x-pack/elastic-agent/pkg/core/server/server.go @@ -526,6 +526,7 @@ func (as *ApplicationState) WriteConnInfo(w io.Writer) error { // the application times out during stop and ErrApplication func (as *ApplicationState) Stop(timeout time.Duration) error { as.checkinLock.Lock() + wasConn := as.checkinDone != nil cfgIdx := as.statusConfigIdx as.expected = proto.StateExpected_STOPPING as.checkinLock.Unlock() @@ -548,8 +549,10 @@ func (as *ApplicationState) Stop(timeout time.Duration) error { s := as.status doneChan := as.checkinDone as.checkinLock.RUnlock() - if s == proto.StateObserved_STOPPING && doneChan == nil { - // sent stopping and now is disconnected (so its stopped) + if (wasConn && doneChan == nil) || (!wasConn && s == proto.StateObserved_STOPPING && doneChan == nil) { + // either occurred + // * client was connected then disconnected on stop + // * client was not connected; connected; received stopping; then disconnected as.Destroy() return nil } diff --git a/x-pack/elastic-agent/pkg/core/server/server_test.go b/x-pack/elastic-agent/pkg/core/server/server_test.go index 608be5641f93..424efb143115 100644 --- a/x-pack/elastic-agent/pkg/core/server/server_test.go +++ b/x-pack/elastic-agent/pkg/core/server/server_test.go @@ -416,6 +416,57 @@ func TestServer_Stop(t *testing.T) { assert.NoError(t, stopErr) } +func TestServer_StopJustDisconnect(t *testing.T) { + initConfig := "initial_config" + app := &StubApp{} + srv := createAndStartServer(t, &StubHandler{}) + defer srv.Stop() + as, err := srv.Register(app, initConfig) + require.NoError(t, err) + cImpl := &StubClientImpl{} + c := newClientFromApplicationState(t, as, cImpl) + require.NoError(t, c.Start(context.Background())) + defer c.Stop() + + // clients should get initial check-ins then set as healthy + require.NoError(t, waitFor(func() error { + if cImpl.Config() != initConfig { + return fmt.Errorf("client never got intial config") + } + return nil + })) + c.Status(proto.StateObserved_HEALTHY, "Running", nil) + assert.NoError(t, waitFor(func() error { + if app.Status() != proto.StateObserved_HEALTHY { + return fmt.Errorf("server never updated currect application state") + } + return nil + })) + + // send stop to the client + done := make(chan bool) + var stopErr error + go func() { + stopErr = as.Stop(time.Second * 5) + close(done) + }() + + // process of testing the flow + // 1. server sends stop + // 2. client disconnects + require.NoError(t, waitFor(func() error { + if cImpl.Stop() == 0 { + return fmt.Errorf("client never got expected stop") + } + return nil + })) + c.Stop() + <-done + + // no error on stop + assert.NoError(t, stopErr) +} + func TestServer_StopTimeout(t *testing.T) { initConfig := "initial_config" app := &StubApp{}