Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

kvserver: fix and re-enable slow lease application logging #105519

Merged
merged 1 commit into from
Jun 26, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 34 additions & 35 deletions pkg/kv/kvserver/replica_proposal.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/storage"
"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
"github.com/cockroachdb/cockroach/pkg/util"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/quotapool"
Expand Down Expand Up @@ -427,29 +428,16 @@ func (r *Replica) leasePostApplyLocked(

now := r.store.Clock().NowAsClockTimestamp()

// NB: ProposedTS is non-nil in practice, but we never fully migrated it
// in so we need to assume that it can be nil.
const slowLeaseWarningEnabled = false // see https://github.com/cockroachdb/cockroach/issues/97209
if slowLeaseWarningEnabled && iAmTheLeaseHolder && leaseChangingHands && newLease.ProposedTS != nil {
maybeLogSlowLeaseApplyWarning(ctx, time.Duration(now.WallTime-newLease.ProposedTS.WallTime), prevLease, newLease)
}

// Gossip the first range whenever its lease is acquired. We check to make
// sure the lease is active so that a trailing replica won't process an old
// lease request and attempt to gossip the first range.
if leaseChangingHands && iAmTheLeaseHolder && r.IsFirstRange() && r.ownsValidLeaseRLocked(ctx, now) {
r.gossipFirstRangeLocked(ctx)
}

// Log acquisition of meta and liveness range leases. These are critical to
// cluster health, so it's useful to know their location over time.
if leaseChangingHands && iAmTheLeaseHolder &&
r.descRLocked().StartKey.Less(roachpb.RKey(keys.NodeLivenessKeyMax)) {
if r.ownsValidLeaseRLocked(ctx, now) {
log.Health.Infof(ctx, "acquired system range lease: %s", newLease)
} else {
log.Health.Warningf(ctx, "applied system range lease after it expired: %s", newLease)
}
// Log the lease acquisition, if appropriate.
if leaseChangingHands && iAmTheLeaseHolder {
r.maybeLogLeaseAcquisition(ctx, now, prevLease, newLease)
}

st := r.leaseStatusAtRLocked(ctx, now)
Expand Down Expand Up @@ -527,14 +515,26 @@ func (r *Replica) leasePostApplyLocked(
}
}

// maybeLogSlowLeaseApplyWarning is called when the lease changes hands on the
// new leaseholder. It logs if either the new lease was proposed well before it
// became visible on the leaseholder (indicating replication lag) or if the
// previous lease looks like we transferred a lease to a behind/offline replica.
func maybeLogSlowLeaseApplyWarning(
ctx context.Context, newLeaseAppDelay time.Duration, prevLease, newLease *roachpb.Lease,
// maybeLogLeaseAcquisition is called on the new leaseholder when the lease
// changes hands, to log the lease acquisition if appropriate.
func (r *Replica) maybeLogLeaseAcquisition(
ctx context.Context, now hlc.ClockTimestamp, prevLease, newLease *roachpb.Lease,
) {
const slowLeaseApplyWarnThreshold = 500 * time.Millisecond
// Log acquisition of meta and liveness range leases. These are critical to
// cluster health, so it's useful to know their location over time.
if r.descRLocked().StartKey.Less(roachpb.RKey(keys.NodeLivenessKeyMax)) {
if r.ownsValidLeaseRLocked(ctx, now) {
log.Health.Infof(ctx, "acquired system range lease: %s", newLease)
} else {
log.Health.Warningf(ctx, "applied system range lease after it expired: %s", newLease)
}
}

const slowLeaseApplyWarnThreshold = time.Second
var newLeaseAppDelay time.Duration
if newLease.ProposedTS != nil { // non-nil in practice, but never migrated
newLeaseAppDelay = time.Duration(now.WallTime - newLease.ProposedTS.WallTime)
}
if newLeaseAppDelay > slowLeaseApplyWarnThreshold {
// If we hold the lease now and the lease was proposed "earlier", there
// must have been replication lag, and possibly reads and/or writes were
Expand All @@ -550,23 +550,22 @@ func maybeLogSlowLeaseApplyWarning(
// case, which is good enough here.
//
// [^1]: https://github.com/cockroachdb/cockroach/pull/82758
log.Warningf(ctx,
"lease %v active after replication lag of ~%.2fs; foreground traffic may have been impacted [prev=%v]",
newLease, newLeaseAppDelay.Seconds(), prevLease,
)
log.Health.Warningf(ctx,
"applied lease after ~%.2fs replication lag, client traffic may have been delayed [lease=%v prev=%v]",
newLeaseAppDelay.Seconds(), newLease, prevLease)
} else if prevLease.Type() == roachpb.LeaseExpiration &&
newLease.Type() == roachpb.LeaseEpoch &&
newLease.AcquisitionType == roachpb.LeaseAcquisitionType_Request {
prevLease.Expiration != nil && // nil when there is no previous lease
prevLease.Expiration.LessEq(newLease.Start.ToTimestamp()) {
// If the previous lease is expiration-based, but the new lease is not and
// the acquisition was non-cooperative, it is likely that a lease transfer
// (which is expiration-based) went to a follower that then couldn't hold
// the lease alive (for example, didn't apply it in time for it to
// starts at or after its expiration, it is likely that a lease transfer
// (which is expiration-based) went to a follower that then couldn't upgrade
// it to an epoch lease (for example, didn't apply it in time for it to
// actually serve any traffic). The result was likely an outage which
// resolves right now, so log to point this out.
log.Warningf(ctx,
"lease %v expired before being followed by lease %s; foreground traffic may have been impacted",
prevLease, newLease,
)
log.Health.Warningf(ctx,
"lease expired before epoch lease upgrade, client traffic may have been delayed [lease=%v prev=%v]",
newLease, prevLease)
}
}

Expand Down