Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add data lag metrics from machine-readable status #1708

Merged
merged 2 commits into from
Jun 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions api/v1beta2/foundationdb_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,9 @@ type FoundationDBStatusClusterInfo struct {
// Logs provides information about log processes running in the cluster.
Logs []FoundationDBStatusLogInfo `json:"logs,omitempty"`

// Qos provides information about various qos metrics of the cluster.
Qos FoundationDBStatusQosInfo `json:"qos,omitempty"`

// FaultTolerance provides information about the fault tolerance status
// of the cluster.
FaultTolerance FaultTolerance `json:"fault_tolerance,omitempty"`
Expand Down Expand Up @@ -336,6 +339,20 @@ type FoundationDBStatusLogInfo struct {
SatelliteLogReplicationFactor int `json:"satellite_log_replication_factor,omitempty"`
}

// FoundationDBStatusLagInfo provides information about the lag being experienced by a storage
// server in the cluster.
// The same lag is carried in two fields, each mapped to its own JSON key.
type FoundationDBStatusLagInfo struct {
// Seconds is read from the "seconds" JSON key; presumably the lag expressed in seconds.
Seconds float64 `json:"seconds,omitempty"`
// Versions is read from the "versions" JSON key; presumably the lag expressed as a
// number of database versions (verify against the status schema).
Versions int64 `json:"versions,omitempty"`
}

// FoundationDBStatusQosInfo provides information about various qos metrics of the cluster.
// Every field is a FoundationDBStatusLagInfo value mapped to its own JSON key.
type FoundationDBStatusQosInfo struct {
// LimitingDurabilityLagStorageServer is read from the "limiting_durability_lag_storage_server" JSON key.
LimitingDurabilityLagStorageServer FoundationDBStatusLagInfo `json:"limiting_durability_lag_storage_server,omitempty"`
// WorstDataLagStorageServer is read from the "worst_data_lag_storage_server" JSON key.
WorstDataLagStorageServer FoundationDBStatusLagInfo `json:"worst_data_lag_storage_server,omitempty"`
// WorstDurabilityLagStorageServer is read from the "worst_durability_lag_storage_server" JSON key.
WorstDurabilityLagStorageServer FoundationDBStatusLagInfo `json:"worst_durability_lag_storage_server,omitempty"`
}

// ProcessRole models the role of a pod.
type ProcessRole string

Expand Down
28 changes: 28 additions & 0 deletions api/v1beta2/foundationdb_status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -482,6 +482,20 @@ var _ = Describe("FoundationDBStatus", func() {
},
},
},
Qos: FoundationDBStatusQosInfo{
LimitingDurabilityLagStorageServer: FoundationDBStatusLagInfo{
Seconds: 14.1153,
Versions: 14115335,
},
WorstDataLagStorageServer: FoundationDBStatusLagInfo{
Seconds: 0,
Versions: 0,
},
WorstDurabilityLagStorageServer: FoundationDBStatusLagInfo{
Seconds: 14.115600000000001,
Versions: 14115618,
},
},
RecoveryState: RecoveryState{
Name: "fully_recovered",
},
Expand Down Expand Up @@ -865,6 +879,20 @@ var _ = Describe("FoundationDBStatus", func() {
SatelliteLogReplicationFactor: 0,
},
},
Qos: FoundationDBStatusQosInfo{
LimitingDurabilityLagStorageServer: FoundationDBStatusLagInfo{
Seconds: 5.0145299999999997,
Versions: 5014530,
},
WorstDataLagStorageServer: FoundationDBStatusLagInfo{
Seconds: 0,
Versions: 0,
},
WorstDurabilityLagStorageServer: FoundationDBStatusLagInfo{
Seconds: 5.0150199999999998,
Versions: 5015017,
},
},
FaultTolerance: FaultTolerance{
MaxZoneFailuresWithoutLosingData: 1,
MaxZoneFailuresWithoutLosingAvailability: 1,
Expand Down
34 changes: 34 additions & 0 deletions api/v1beta2/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

70 changes: 70 additions & 0 deletions pkg/fdbadminclient/mock/admin_client_mock.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ type AdminClient struct {
TeamTracker []fdbv1beta2.FoundationDBStatusTeamTracker
Logs []fdbv1beta2.FoundationDBStatusLogInfo
mockError error
LagInfo map[string]fdbv1beta2.FoundationDBStatusLagInfo
}

// adminClientCache provides a cache of mock admin clients.
Expand Down Expand Up @@ -102,6 +103,7 @@ func NewMockAdminClientUncast(cluster *fdbv1beta2.FoundationDBCluster, kubeClien
currentCommandLines: make(map[string]string),
Knobs: make(map[string]fdbv1beta2.None),
VersionProcessGroups: make(map[fdbv1beta2.ProcessGroupID]string),
LagInfo: make(map[string]fdbv1beta2.FoundationDBStatusLagInfo),
}
adminClientCache[cluster.Name] = cachedClient
cachedClient.Backups = make(map[string]fdbv1beta2.FoundationDBBackupStatusBackupDetails)
Expand Down Expand Up @@ -443,6 +445,23 @@ func (client *AdminClient) GetStatus() (*fdbv1beta2.FoundationDBStatus, error) {
}
status.Cluster.MaintenanceZone = client.MaintenanceZone

if len(client.LagInfo) > 0 {
limitingDurabilityLag, ok := client.GetLimitingDurabilityLag()
if ok {
status.Cluster.Qos.LimitingDurabilityLagStorageServer = limitingDurabilityLag
}

worstDataLag, ok := client.GetWorstDataLag()
if ok {
status.Cluster.Qos.WorstDataLagStorageServer = worstDataLag
}

worstDurabilityLag, ok := client.GetWorstDurabilityLag()
if ok {
status.Cluster.Qos.WorstDurabilityLagStorageServer = worstDurabilityLag
}
}

return status, nil
}

Expand Down Expand Up @@ -948,3 +967,54 @@ func (client *AdminClient) MockUptimeSecondsForMaintenanceZone(seconds float64)
func (client *AdminClient) MockError(err error) {
// Stores err on the client's mockError field; passing nil resets the field to nil.
// NOTE(review): this write does not take adminClientMutex, unlike the Set*Lag
// helpers in this file — confirm whether callers need the lock here.
client.mockError = err
}

// SetLimitingDurabilityLag sets/mocks the limiting durability lag of any storage server in the cluster.
// The pointed-to value is copied into the client's LagInfo map under the
// "limitingDurabilityLag" key while adminClientMutex is held.
// Passing nil removes a previously mocked value instead of dereferencing the nil pointer.
func (client *AdminClient) SetLimitingDurabilityLag(lagInfo *fdbv1beta2.FoundationDBStatusLagInfo) {
	adminClientMutex.Lock()
	defer adminClientMutex.Unlock()

	if lagInfo == nil {
		// delete is a no-op on a nil map and on a missing key.
		delete(client.LagInfo, "limitingDurabilityLag")
		return
	}

	// LagInfo can be nil when the client was created without initializing it,
	// and assigning into a nil map panics.
	if client.LagInfo == nil {
		client.LagInfo = make(map[string]fdbv1beta2.FoundationDBStatusLagInfo)
	}

	client.LagInfo["limitingDurabilityLag"] = *lagInfo
}

// GetLimitingDurabilityLag returns the (mocked) limiting durability lag of any storage server in the cluster.
// The boolean result reports whether a value is present in the client's LagInfo map.
// adminClientMutex is held for the duration of the lookup.
func (client *AdminClient) GetLimitingDurabilityLag() (fdbv1beta2.FoundationDBStatusLagInfo, bool) {
	const key = "limitingDurabilityLag"

	adminClientMutex.Lock()
	defer adminClientMutex.Unlock()

	mocked, found := client.LagInfo[key]

	return mocked, found
}

// SetWorstDataLag sets/mocks the worst data lag of any storage server in the cluster.
// The pointed-to value is copied into the client's LagInfo map under the
// "worstDataLag" key while adminClientMutex is held.
// Passing nil removes a previously mocked value instead of dereferencing the nil pointer.
func (client *AdminClient) SetWorstDataLag(lagInfo *fdbv1beta2.FoundationDBStatusLagInfo) {
	adminClientMutex.Lock()
	defer adminClientMutex.Unlock()

	if lagInfo == nil {
		// delete is a no-op on a nil map and on a missing key.
		delete(client.LagInfo, "worstDataLag")
		return
	}

	// LagInfo can be nil when the client was created without initializing it,
	// and assigning into a nil map panics.
	if client.LagInfo == nil {
		client.LagInfo = make(map[string]fdbv1beta2.FoundationDBStatusLagInfo)
	}

	client.LagInfo["worstDataLag"] = *lagInfo
}

// GetWorstDataLag returns the (mocked) worst data lag of any storage server in the cluster.
// The boolean result reports whether a value is present in the client's LagInfo map.
// adminClientMutex is held for the duration of the lookup.
func (client *AdminClient) GetWorstDataLag() (fdbv1beta2.FoundationDBStatusLagInfo, bool) {
	const key = "worstDataLag"

	adminClientMutex.Lock()
	defer adminClientMutex.Unlock()

	mocked, found := client.LagInfo[key]

	return mocked, found
}

// SetWorstDurabilityLag sets/mocks the worst durability lag of any storage server in the cluster.
// The pointed-to value is copied into the client's LagInfo map under the
// "worstDurabilityLag" key while adminClientMutex is held.
// Passing nil removes a previously mocked value instead of dereferencing the nil pointer.
func (client *AdminClient) SetWorstDurabilityLag(lagInfo *fdbv1beta2.FoundationDBStatusLagInfo) {
	adminClientMutex.Lock()
	defer adminClientMutex.Unlock()

	if lagInfo == nil {
		// delete is a no-op on a nil map and on a missing key.
		delete(client.LagInfo, "worstDurabilityLag")
		return
	}

	// LagInfo can be nil when the client was created without initializing it,
	// and assigning into a nil map panics.
	if client.LagInfo == nil {
		client.LagInfo = make(map[string]fdbv1beta2.FoundationDBStatusLagInfo)
	}

	client.LagInfo["worstDurabilityLag"] = *lagInfo
}

// GetWorstDurabilityLag returns the (mocked) worst durability lag of any storage server in the cluster.
// The boolean result reports whether a value is present in the client's LagInfo map.
// adminClientMutex is held for the duration of the lookup.
func (client *AdminClient) GetWorstDurabilityLag() (fdbv1beta2.FoundationDBStatusLagInfo, bool) {
	const key = "worstDurabilityLag"

	adminClientMutex.Lock()
	defer adminClientMutex.Unlock()

	mocked, found := client.LagInfo[key]

	return mocked, found
}