# Rename to management.Manager, add UpdateStatus to Manager interface. #19114
The first file touched is the libbeat `management` package, which gains a `Status` type and pulls `sync` and `logp` into its imports:

```diff
@@ -18,11 +18,36 @@
 package management
 
 import (
+	"sync"
+
 	"github.com/gofrs/uuid"
 
 	"github.com/elastic/beats/v7/libbeat/common"
 	"github.com/elastic/beats/v7/libbeat/common/reload"
 	"github.com/elastic/beats/v7/libbeat/feature"
+	"github.com/elastic/beats/v7/libbeat/logp"
 )
 
+// Status describes the current status of the beat.
+type Status int
+
+const (
+	// Unknown is initial status when none has been reported.
+	Unknown Status = iota
+	// Starting is status describing application is starting.
+	Starting
+	// Configuring is status describing application is configuring.
+	Configuring
+	// Running is status describing application is running.
+	Running
+	// Degraded is status describing application is degraded.
+	Degraded
+	// Failed is status describing application is failed. This status should
+	// only be used in the case the beat should stop running as the failure
+	// cannot be recovered.
+	Failed
+	// Stopping is status describing application is stopping.
+	Stopping
+)
+
 // Namespace is the feature namespace for queue definition.
```
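Nothing in this hunk defines how a `Status` renders as text, yet both managers below log it with `%s`, which only reads well if `Status` implements `fmt.Stringer`. Presumably the full PR defines that elsewhere; a plausible sketch (an assumption, not shown in this diff):

```go
// String is an assumed fmt.Stringer for Status; the names mirror the
// constants declared above. Not part of the hunks shown in this PR page.
func (s Status) String() string {
	names := [...]string{
		"Unknown", "Starting", "Configuring", "Running",
		"Degraded", "Failed", "Stopping",
	}
	if int(s) < 0 || int(s) >= len(names) {
		return "Unknown"
	}
	return names[s]
}
```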
```diff
@@ -33,27 +58,31 @@ var DebugK = "centralmgmt"
 
 var centralMgmtKey = "x-pack-cm"
 
-// ConfigManager interacts with the beat to update configurations
-// from an external source
-type ConfigManager interface {
-	// Enabled returns true if config manager is enabled
+// Manager interacts with the beat to provide status updates and to receive
+// configurations.
+type Manager interface {
+	// Enabled returns true if manager is enabled.
 	Enabled() bool
 
-	// Start the config manager
-	Start(func())
+	// Start the config manager giving it a stopFunc callback
+	// so the beat can be told when to stop.
+	Start(stopFunc func())
 
-	// Stop the config manager
+	// Stop the config manager.
 	Stop()
 
-	// CheckRawConfig check settings are correct before launching the beat
+	// CheckRawConfig check settings are correct before launching the beat.
 	CheckRawConfig(cfg *common.Config) error
+
+	// UpdateStatus called when the status of the beat has changed.
+	UpdateStatus(status Status, msg string)
 }
 
 // PluginFunc for creating FactoryFunc if it matches a config
 type PluginFunc func(*common.Config) FactoryFunc
 
 // FactoryFunc for creating a config manager
-type FactoryFunc func(*common.Config, *reload.Registry, uuid.UUID) (ConfigManager, error)
+type FactoryFunc func(*common.Config, *reload.Registry, uuid.UUID) (Manager, error)
 
 // Register a config manager
 func Register(name string, fn PluginFunc, stability feature.Stability) {
```
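To make the intent of `UpdateStatus` concrete, here is a hypothetical beat-side call site. The `SomeBeat` type, its `run` method, and the `b.Manager` accessor are illustrative assumptions, not part of this diff; whichever `Manager` implementation is active decides what a report means in practice.

```go
// A sketch of how a beat could report lifecycle transitions through the new
// interface: nilManager would just log the change, while the Fleet manager
// also forwards it to the Elastic Agent.
func (bt *SomeBeat) Run(b *beat.Beat) error {
	b.Manager.UpdateStatus(management.Running, "")
	defer b.Manager.UpdateStatus(management.Stopping, "")

	if err := bt.run(); err != nil {
		// Failed tells the Agent the beat cannot recover and should be restarted.
		b.Manager.UpdateStatus(management.Failed, err.Error())
		return err
	}
	return nil
}
```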
```diff
@@ -91,13 +120,32 @@ func defaultModeConfig() *modeConfig {
 }
 
 // nilManager, fallback when no manager is present
-type nilManager struct{}
+type nilManager struct {
+	logger *logp.Logger
+	lock   sync.Mutex
+	status Status
+	msg    string
+}
 
-func nilFactory(*common.Config, *reload.Registry, uuid.UUID) (ConfigManager, error) {
-	return nilManager{}, nil
+func nilFactory(*common.Config, *reload.Registry, uuid.UUID) (Manager, error) {
+	log := logp.NewLogger("mgmt")
+	return &nilManager{
+		logger: log,
+		status: Unknown,
+		msg:    "",
+	}, nil
 }
 
-func (nilManager) Enabled() bool                           { return false }
-func (nilManager) Start(_ func())                          {}
-func (nilManager) Stop()                                   {}
-func (nilManager) CheckRawConfig(cfg *common.Config) error { return nil }
+func (*nilManager) Enabled() bool                           { return false }
+func (*nilManager) Start(_ func())                          {}
+func (*nilManager) Stop()                                   {}
+func (*nilManager) CheckRawConfig(cfg *common.Config) error { return nil }
+func (n *nilManager) UpdateStatus(status Status, msg string) {
+	n.lock.Lock()
+	defer n.lock.Unlock()
+	if n.status != status || n.msg != msg {
+		n.status = status
+		n.msg = msg
+		n.logger.Infof("Status change to %s: %s", status, msg)
+	}
+}
```

Review thread on `nilManager.UpdateStatus`:

**Reviewer:** do we need this here if this is the nil manager, i.e. a noop manager?

**Author:** I do it only for logging the status changes. I think it is good to show status updates in the logger as well, so it's clear what is happening in the beat, even if it's not controlled by Elastic Agent.
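Since the no-op manager now carries state for change-only logging, the reviewer's concurrency concern applies to it too. A sketch of a race check (it would have to live in the `management` package itself, since `nilFactory` is unexported; the test name is hypothetical; run with `go test -race`):

```go
package management

import (
	"sync"
	"testing"

	"github.com/gofrs/uuid"
)

// TestNilManagerUpdateStatusIsConcurrencySafe hammers UpdateStatus from
// several goroutines; the race detector flags any unsynchronized access to
// the manager's status and msg fields.
func TestNilManagerUpdateStatusIsConcurrencySafe(t *testing.T) {
	m, err := nilFactory(nil, nil, uuid.Nil)
	if err != nil {
		t.Fatal(err)
	}

	var wg sync.WaitGroup
	for i := 0; i < 8; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for _, s := range []Status{Starting, Running, Degraded, Running} {
				m.UpdateStatus(s, "concurrent update")
			}
		}()
	}
	wg.Wait()
}
```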
The second file is the x-pack Fleet Management manager, which now tracks the last status it reported:

```diff
@@ -9,6 +9,7 @@ import (
 	"fmt"
 	"os"
 	"sort"
+	"sync"
 
 	"github.com/gofrs/uuid"
 	"github.com/pkg/errors"
```
```diff
@@ -35,12 +36,15 @@ type Manager struct {
 	registry  *reload.Registry
 	blacklist *xmanagement.ConfigBlacklist
 	client    *client.Client
+	lock      sync.Mutex
+	status    management.Status
+	msg       string
 
 	stopFunc func()
 }
 
 // NewFleetManager returns a X-Pack Beats Fleet Management manager.
-func NewFleetManager(config *common.Config, registry *reload.Registry, beatUUID uuid.UUID) (management.ConfigManager, error) {
+func NewFleetManager(config *common.Config, registry *reload.Registry, beatUUID uuid.UUID) (management.Manager, error) {
 	c := defaultConfig()
 	if config.Enabled() {
 		if err := config.Unpack(&c); err != nil {
@@ -51,7 +55,7 @@ func NewFleetManager(config *common.Config, registry *reload.Registry, beatUUID
 }
 
 // NewFleetManagerWithConfig returns a X-Pack Beats Fleet Management manager.
-func NewFleetManagerWithConfig(c *Config, registry *reload.Registry, beatUUID uuid.UUID) (management.ConfigManager, error) {
+func NewFleetManagerWithConfig(c *Config, registry *reload.Registry, beatUUID uuid.UUID) (management.Manager, error) {
 	log := logp.NewLogger(management.DebugK)
 
 	m := &Manager{
```
The Fleet manager's `UpdateStatus`, as committed in this revision:

```diff
@@ -122,38 +126,52 @@ func (cm *Manager) CheckRawConfig(cfg *common.Config) error {
 	return nil
 }
 
+// UpdateStatus updates the manager with the current status for the beat.
+func (cm *Manager) UpdateStatus(status management.Status, msg string) {
+	cm.lock.Lock()
+	if cm.status != status || cm.msg != msg {
+		cm.status = status
+		cm.msg = msg
+		cm.lock.Unlock()
+		cm.client.Status(statusToProtoStatus(status), msg)
+		cm.logger.Infof("Status change to %s: %s", status, msg)
+		return
+	}
+	cm.lock.Unlock()
+}
+
```

Review thread on the locking:

**Reviewer:** we should test this with concurrent requests; what I fear may occur is that with many concurrent changes of status …

**Another reviewer:** We can also try to 'forget' outdated status updates if we get new ones while we are waiting. This could be implemented using …

**Author:** My only reason to track the state at this level is so it can be logged when it is changed. We could remove the internal tracking and locking in the Fleet Manager if we just log every time. Or we could place the call to …

**Reviewer:** I think the client option is also ok. I think logging inconsistent states will cause us more trouble than help us during SDH triaging.

**Author:** I went with moving the logging and client call into the lock (see the sketch below).

**Reviewer:** Why unlock before the call to …

**Author:** Yeah, I was commenting on that as well; I can move the …

Review thread on the client call:

**Reviewer:** here we update the status on the client, which will result in some action either during check-in or in the watcher after some timeout.

**Author:** This flow is valid from the side of the Elastic Agent, as reporting Failed will cause the application to be killed and restarted. This is fully tested and covered in the Elastic Agent unit tests, both in the case that the application returns Failed over the protocol and in the case that it just crashes.
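The revision the author describes adopting ("moving the logging and client into the lock") would look roughly like this; a sketch, not the committed code:

```go
// UpdateStatus with the mutex held across the client call and the log line:
// concurrent callers cannot interleave, so the cached status, the logged
// status, and the status reported to the Agent always agree.
func (cm *Manager) UpdateStatus(status management.Status, msg string) {
	cm.lock.Lock()
	defer cm.lock.Unlock()

	if cm.status == status && cm.msg == msg {
		return // no change, nothing to report
	}
	cm.status = status
	cm.msg = msg
	cm.client.Status(statusToProtoStatus(status), msg)
	cm.logger.Infof("Status change to %s: %s", status, msg)
}
```

The trade-off the reviewers weigh above is exactly this: a possibly blocking client call inside the critical section, versus the risk of logging or reporting interleaved, inconsistent states.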
|
||
func (cm *Manager) OnConfig(s string) { | ||
cm.client.Status(proto.StateObserved_CONFIGURING, "Updating configuration") | ||
cm.UpdateStatus(management.Configuring, "Updating configuration") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should we switch back after config is applied or do we let beat to do that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It does switch back to Running at the end of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you;re right it was hidden so i overlooked There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If I read this right OnConfig sets the status to 'Healthy' at the end. Are we sure this is the correct state to report? Do we 'forget' the internal state here? How about removing the 'Configuring' status? The less states (and protocol behavior) we have, the less we can get it wrong. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I personally do not like the idea of setting the status in the At the moment I keep it this way so it works, and once status is being reported correctly by the beat, this should be removed (probably at the same time). I do not think we should removing It is good to know that going There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
+1 |
||
|
||
var configMap common.MapStr | ||
uconfig, err := common.NewConfigFrom(s) | ||
if err != nil { | ||
err = errors.Wrap(err, "config blocks unsuccessfully generated") | ||
cm.logger.Error(err) | ||
cm.client.Status(proto.StateObserved_FAILED, err.Error()) | ||
cm.UpdateStatus(management.Failed, err.Error()) | ||
return | ||
} | ||
|
||
err = uconfig.Unpack(&configMap) | ||
if err != nil { | ||
err = errors.Wrap(err, "config blocks unsuccessfully generated") | ||
cm.logger.Error(err) | ||
cm.client.Status(proto.StateObserved_FAILED, err.Error()) | ||
cm.UpdateStatus(management.Failed, err.Error()) | ||
return | ||
} | ||
|
||
blocks, err := cm.toConfigBlocks(configMap) | ||
if err != nil { | ||
err = errors.Wrap(err, "could not apply the configuration") | ||
cm.logger.Error(err) | ||
cm.client.Status(proto.StateObserved_FAILED, err.Error()) | ||
cm.UpdateStatus(management.Failed, err.Error()) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Given all these errors I wonder what 'Failed' means. Is the beat really 'failed' or are we (the overall application with agent) running in a degraded mode because not all configurations could have been applied? Even if we fail here the Beat continues to publish events for existing inputs, right? Are we mixing the status of the 'operation' with the beat health here? If the operation fails, are we required to restart the Beat? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As stated in the list of statuses there is
In Degarded state beat can keep sending events and agent will not stop the process. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, I've seen these status types. Is this method run on startup only, or also if the beat is already active? In the later case the Beat will continue with the old state if there was an error decoding the configuration. Do we want to enforce a 'restart' if the config update operation failed? Will
I'm more or less wondering about the overal semantics of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It will run both on startup and while running. The beat doesn't need to worry about handling the restart on Failed status, the Agent will do that work for it, restart it and bring it back up. |
||
return | ||
} | ||
|
||
if errs := cm.apply(blocks); !errs.IsEmpty() { | ||
err = errors.Wrap(err, "could not apply the configuration") | ||
cm.logger.Error(err) | ||
cm.client.Status(proto.StateObserved_FAILED, err.Error()) | ||
cm.UpdateStatus(management.Failed, err.Error()) | ||
return | ||
} | ||
|
||
|
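A hypothetical tail for `OnConfig` that illustrates both points from the threads above: reporting Degraded instead of Failed when only part of the configuration could be applied (the beat keeps publishing and the Agent does not restart it), and switching back to Running once the new configuration is in place. A sketch only; the committed code reports Failed on apply errors, as shown in the diff:

```go
	// Degraded: the beat keeps running with the old/partial configuration,
	// and the Agent does not kill and restart the process.
	if errs := cm.apply(blocks); !errs.IsEmpty() {
		cm.logger.Error(errs)
		cm.UpdateStatus(management.Degraded, "some configuration blocks could not be applied")
		return
	}

	// Switch back once the configuration has been applied successfully.
	cm.UpdateStatus(management.Running, "Configuration applied")
```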
```diff
@@ -285,3 +303,25 @@ func (cm *Manager) toConfigBlocks(cfg common.MapStr) (api.ConfigBlocks, error) {
 
 	return res, nil
 }
+
+func statusToProtoStatus(status management.Status) proto.StateObserved_Status {
+	switch status {
+	case management.Unknown:
+		// unknown is reported as healthy, as the status is unknown
+		return proto.StateObserved_HEALTHY
+	case management.Starting:
+		return proto.StateObserved_STARTING
+	case management.Configuring:
+		return proto.StateObserved_CONFIGURING
+	case management.Running:
+		return proto.StateObserved_HEALTHY
+	case management.Degraded:
+		return proto.StateObserved_DEGRADED
+	case management.Failed:
+		return proto.StateObserved_FAILED
+	case management.Stopping:
+		return proto.StateObserved_STOPPING
+	}
+	// unknown status, still reported as healthy
+	return proto.StateObserved_HEALTHY
+}
```

Review thread on `statusToProtoStatus`:

**Reviewer:** can we add a unit test to test switching statuses back and forth with some noop client?

**Author:** At the moment I don't believe the client from …
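Whatever the answer on stubbing the client, the mapping itself is a pure function and can be table-tested directly. A sketch of that narrower test (it would sit in the same package, since `statusToProtoStatus` is unexported; the test name is hypothetical):

```go
// TestStatusToProtoStatus pins down the Status -> proto mapping, including
// the deliberate choice that Unknown is reported as healthy.
func TestStatusToProtoStatus(t *testing.T) {
	cases := map[management.Status]proto.StateObserved_Status{
		management.Unknown:     proto.StateObserved_HEALTHY,
		management.Starting:    proto.StateObserved_STARTING,
		management.Configuring: proto.StateObserved_CONFIGURING,
		management.Running:     proto.StateObserved_HEALTHY,
		management.Degraded:    proto.StateObserved_DEGRADED,
		management.Failed:      proto.StateObserved_FAILED,
		management.Stopping:    proto.StateObserved_STOPPING,
	}
	for status, want := range cases {
		if got := statusToProtoStatus(status); got != want {
			t.Errorf("statusToProtoStatus(%v) = %v, want %v", status, got, want)
		}
	}
}
```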
Review thread on the interface design:

**Reviewer:** How about moving the UpdateStatus method into its own interface? I'd like to push only 'status' reporting to the inputs/outputs and provide a set of helper functions that wrap a 'StatusReporter', merging status changes from multiple subsystems. But in the end we can also introduce the minimal interface in another package when we need it.

**Author:** I can move it to its own interface; would it also still be in the management.Manager interface?
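A sketch of the reviewer's suggestion (the `StatusReporter` name is theirs; the embedding shown here is one possible answer to the author's follow-up question, not something decided in the thread):

```go
// StatusReporter is the minimal interface inputs and outputs could depend on
// without seeing the rest of the manager.
type StatusReporter interface {
	// UpdateStatus called when the status of the beat has changed.
	UpdateStatus(status Status, msg string)
}

// Manager embeds StatusReporter, so UpdateStatus remains part of
// management.Manager and existing callers are unaffected.
type Manager interface {
	StatusReporter

	Enabled() bool
	Start(stopFunc func())
	Stop()
	CheckRawConfig(cfg *common.Config) error
}
```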