From 8a239fa0f1e2897f6edc2d15e3e39a7fdab8474d Mon Sep 17 00:00:00 2001
From: Ben Darnell <ben@bendarnell.com>
Date: Sun, 21 Apr 2019 12:55:05 -0400
Subject: [PATCH] storage: Add comments about leases and liveness

I had to study this code for #35986, so I took the opportunity to
update the documentation while it's fresh in my mind.

Release note: None
---
 pkg/storage/node_liveness.go             | 93 ++++++++++++++++++++----
 pkg/storage/replica_range_lease.go       | 52 +++++++++++++
 pkg/storage/storagepb/lease_status.pb.go | 31 ++++++--
 pkg/storage/storagepb/lease_status.proto | 23 +++++-
 4 files changed, 173 insertions(+), 26 deletions(-)

diff --git a/pkg/storage/node_liveness.go b/pkg/storage/node_liveness.go
index 51192c3aff5a..684928380601 100644
--- a/pkg/storage/node_liveness.go
+++ b/pkg/storage/node_liveness.go
@@ -124,13 +124,33 @@ type IsLiveCallback func(nodeID roachpb.NodeID)
 // indicating that it is alive.
 type HeartbeatCallback func(context.Context)
 
-// NodeLiveness encapsulates information on node liveness and provides
-// an API for querying, updating, and invalidating node
-// liveness. Nodes periodically "heartbeat" the range holding the node
-// liveness system table to indicate that they're available. The
-// resulting liveness information is used to ignore unresponsive nodes
-// while making range quiescence decisions, as well as for efficient,
-// node liveness epoch-based range leases.
+// NodeLiveness is a centralized failure detector that coordinates
+// with the epoch-based range system to provide for leases of
+// indefinite length (replacing frequent per-range lease renewals with
+// heartbeats to the liveness system).
+//
+// It is also used as a general-purpose failure detector, but it is
+// not ideal for this purpose. It is inefficient due to the use of
+// replicated durable writes, and is not very sensitive (it primarily
+// tests connectivity from the node to the liveness range; a node with
+// a failing disk could still be considered live by this system).
+//
+// The persistent state of node liveness is stored in the KV layer,
+// near the beginning of the keyspace. These are normal MVCC keys,
+// written by CPut operations in 1PC transactions (the use of
+// transactions and MVCC is regretted because it means that the
+// liveness span depends on MVCC GC and can get overwhelmed if GC is
+// not working. Transactions were used only to piggyback on the
+// transaction commit trigger). The leaseholder of the liveness range
+// gossips its contents whenever they change (only the changed
+// portion); other nodes rarely read from this range directly.
+//
+// The use of conditional puts is crucial to maintain the guarantees
+// needed by epoch-based leases. Both the Heartbeat and IncrementEpoch
+// on this type require an expected value to be passed in; see
+// comments on those methods for more.
+//
+// TODO(bdarnell): Also document interaction with draining and decommissioning.
 type NodeLiveness struct {
 	ambientCtx        log.AmbientContext
 	clock             *hlc.Clock
@@ -268,6 +288,11 @@ func (nl *NodeLiveness) SetDecommissioning(
 		// decommissioning commands in a tight loop on different nodes sometimes
 		// results in unintentional no-ops (due to the Gossip lag); this could be
 		// observed by users in principle, too.
+		//
+		// TODO(bdarnell): This is the one place where a range other than
+		// the leaseholder reads from this range. Should this read from
+		// gossip instead? (I have vague concerns about concurrent reads
+		// and timestamp cache pushes causing problems here)
 		var oldLiveness storagepb.Liveness
 		if err := nl.db.GetProto(ctx, keys.NodeLivenessKey(nodeID), &oldLiveness); err != nil {
 			return false, errors.Wrap(err, "unable to get liveness")
@@ -507,6 +532,21 @@ var errNodeAlreadyLive = errors.New("node already live")
 // Heartbeat is called to update a node's expiration timestamp. This
 // method does a conditional put on the node liveness record, and if
 // successful, stores the updated liveness record in the nodes map.
+//
+// The liveness argument is the expected previous value of this node's
+// liveness.
+//
+// If this method returns nil, the node's liveness has been extended,
+// relative to the previous value. It may or may not still be alive
+// when this method returns.
+//
+// On failure, this method returns ErrEpochIncremented, although this
+// may not necessarily mean that the epoch was actually incremented.
+// TODO(bdarnell): Fix error semantics here.
+//
+// This method is rarely called directly; heartbeats are normally sent
+// by the StartHeartbeat loop.
+// TODO(bdarnell): Should we just remove this synchronous heartbeat completely?
 func (nl *NodeLiveness) Heartbeat(ctx context.Context, liveness *storagepb.Liveness) error {
 	return nl.heartbeatInternal(ctx, liveness, false /* increment epoch */)
 }
@@ -575,6 +615,14 @@ func (nl *NodeLiveness) heartbeatInternal(
 		// considered live, treat the heartbeat as a success. This can
 		// happen when the periodic heartbeater races with a concurrent
 		// lease acquisition.
+		//
+		// TODO(bdarnell): If things are very slow, the new liveness may
+		// have already expired and we'd incorrectly return
+		// ErrEpochIncremented. Is this check even necessary? The common
+		// path through this method doesn't check whether the liveness
+		// expired while in flight, so maybe we don't have to care about
+		// that and only need to distinguish between same and different
+		// epochs in our return value.
 		if actual.IsLive(nl.clock.Now(), nl.clock.MaxOffset()) && !incrementEpoch {
 			return errNodeAlreadyLive
 		}
@@ -690,13 +738,30 @@ func (nl *NodeLiveness) getLivenessLocked(nodeID roachpb.NodeID) (*storagepb.Liv
 	return nil, ErrNoLivenessRecord
 }
 
-// IncrementEpoch is called to increment the current liveness epoch,
-// thereby invalidating anything relying on the liveness of the
-// previous epoch. This method does a conditional put on the node
-// liveness record, and if successful, stores the updated liveness
-// record in the nodes map. If this method is called on a node ID
-// which is considered live according to the most recent information
-// gathered through gossip, an error is returned.
+// IncrementEpoch is called to attempt to revoke another node's
+// current epoch, causing an expiration of all its leases. This method
+// does a conditional put on the node liveness record, and if
+// successful, stores the updated liveness record in the nodes map. If
+// this method is called on a node ID which is considered live
+// according to the most recent information gathered through gossip,
+// an error is returned.
+//
+// The liveness argument is used as the expected value on the
+// conditional put. If this method returns nil, there was a match and
+// the epoch has been incremented. This means that the expiration time
+// in the supplied liveness accurately reflects the time at which the
+// epoch ended.
+//
+// If this method returns ErrEpochAlreadyIncremented, the epoch has
+// already been incremented past the one in the liveness argument, but
+// the conditional put did not find a match. This means that another
+// node performed a successful IncrementEpoch, but we can't tell at
+// what time the epoch actually ended. (Usually when multiple
+// IncrementEpoch calls race, they're using the same expected value.
+// But when there is a severe backlog, it's possible for one increment
+// to get stuck in a queue long enough for the dead node to make
+// another successful heartbeat, and a second increment to come in
+// after that)
 func (nl *NodeLiveness) IncrementEpoch(ctx context.Context, liveness *storagepb.Liveness) error {
 	// Allow only one increment at a time.
 	sem := nl.sem(liveness.NodeID)
diff --git a/pkg/storage/replica_range_lease.go b/pkg/storage/replica_range_lease.go
index 35585fd4cc95..4e39f7a8bd5e 100644
--- a/pkg/storage/replica_range_lease.go
+++ b/pkg/storage/replica_range_lease.go
@@ -13,6 +13,36 @@
 // permissions and limitations under the License.
 
 // This file contains replica methods related to range leases.
+//
+// Here be dragons: The lease system (especially for epoch-based
+// leases) relies on multiple interlocking conditional puts (here and
+// in NodeLiveness). Reads (to get expected values) and conditional
+// puts have to happen in a certain order, leading to surprising
+// dependencies at a distance (for example, there's a LeaseStatus
+// object that gets plumbed most of the way through this file.
+// LeaseStatus bundles the results of multiple checks with the time at
+// which they were performed, so that timestamp must be used for later
+// operations). The current arrangement is not perfect, and some
+// opportunities for improvement appear, but any changes must be made
+// very carefully.
+//
+// NOTE(bdarnell): The biggest problem with the current code is that
+// with epoch-based leases, we may do two separate slow operations
+// (IncrementEpoch/Heartbeat and RequestLease/AdminTransferLease). In
+// the organization that was inherited from expiration-based leases,
+// we prepare the arguments we're going to use for the lease
+// operations before performing the liveness operations, and by the
+// time the liveness operations complete those may be stale.
+//
+// Therefore, my suggested refactoring would be to move the liveness
+// operations earlier in the process, soon after the initial
+// leaseStatus call. If a liveness operation is required, do it and
+// start over, with a fresh leaseStatus.
+//
+// This could also allow the liveness operations to be coalesced per
+// node instead of having each range separately queue up redundant
+// liveness operations. (The InitOrJoin model predates the
+// singleflight package; could we simplify things by using it?)
 
 package storage
 
@@ -139,6 +169,10 @@ func (p *pendingLeaseRequest) RequestPending() (roachpb.Lease, bool) {
 // replica happen either before or after a result for a pending request has
 // happened.
 //
+// The new lease will be a successor to the one in the status
+// argument, and its fields will be used to fill in the expected
+// values for liveness and lease operations.
+//
 // transfer needs to be set if the request represents a lease transfer (as
 // opposed to an extension, or acquiring the lease when none is held).
 //
@@ -173,6 +207,17 @@ func (p *pendingLeaseRequest) InitOrJoinRequest(
 	var leaseReq roachpb.Request
 	now := p.repl.store.Clock().Now()
 	reqLease := roachpb.Lease{
+		// It's up to us to ensure that Lease.Start is greater than the
+		// end time of the previous lease. This means that if status
+		// refers to an expired epoch lease, we must increment the epoch
+		// *at status.Timestamp* before we can propose this lease.
+		//
+		// Note that the server may decrease our proposed start time if it
+		// decides that it is safe to do so (for example, this happens
+		// when renewing an expiration-based lease), but it will never
+		// increase it (and a start timestamp that is too low is unsafe
+		// because it results in incorrect initialization of the timestamp
+		// cache on the new leaseholder).
 		Start:      status.Timestamp,
 		Replica:    nextLeaseHolder,
 		ProposedTS: &now,
@@ -206,6 +251,9 @@ func (p *pendingLeaseRequest) InitOrJoinRequest(
 		leaseReq = &roachpb.RequestLeaseRequest{
 			RequestHeader: reqHeader,
 			Lease:         reqLease,
+			// PrevLease must match for our lease to be accepted. If another
+			// lease is applied between our previous call to leaseStatus and
+			// our lease request applying, it will be rejected.
 			PrevLease:     status.Lease,
 			MinProposedTS: &minProposedTS,
 		}
@@ -231,6 +279,9 @@ func (p *pendingLeaseRequest) InitOrJoinRequest(
 
 // requestLeaseAsync sends a transfer lease or lease request to the
 // specified replica. The request is sent in an async task.
+//
+// The status argument is used as the expected value for liveness operations.
+// reqLease and leaseReq must be consistent with the LeaseStatus.
 func (p *pendingLeaseRequest) requestLeaseAsync(
 	parentCtx context.Context,
 	nextLeaseHolder roachpb.ReplicaDescriptor,
@@ -339,6 +390,7 @@ func (p *pendingLeaseRequest) requestLeaseAsync(
 				}
 				// Set error for propagation to all waiters below.
 				if err != nil {
+					// TODO(bdarnell): is status.Lease really what we want to put in the NotLeaseHolderError here?
 					pErr = roachpb.NewError(newNotLeaseHolderError(&status.Lease, p.repl.store.StoreID(), p.repl.Desc()))
 				}
 			}
diff --git a/pkg/storage/storagepb/lease_status.pb.go b/pkg/storage/storagepb/lease_status.pb.go
index b7bb29982d7d..660589b9a003 100644
--- a/pkg/storage/storagepb/lease_status.pb.go
+++ b/pkg/storage/storagepb/lease_status.pb.go
@@ -29,12 +29,27 @@ const (
 	LeaseState_ERROR LeaseState = 0
 	// VALID indicates that the lease can be used.
 	LeaseState_VALID LeaseState = 1
-	// STASIS indicates that the lease has not expired, but can't be used.
+	// STASIS indicates that the lease has not expired, but can't be
+	// used because it is close to expiration (a stasis period at the
+	// end of each lease is one of the ways we handle clock
+	// uncertainty). A lease in STASIS may become VALID for the same
+	// leaseholder after a successful RequestLease (for expiration-based
+	// leases) or Heartbeat (for epoch-based leases). A lease may not
+	// change hands while it is in stasis; would-be acquirers must wait
+	// for the stasis period to expire.
 	LeaseState_STASIS LeaseState = 2
-	// EXPIRED indicates that the lease can't be used.
+	// EXPIRED indicates that the lease can't be used. An expired lease
+	// may become VALID for the same leaseholder on RequestLease or
+	// Heartbeat, or it may be replaced by a new leaseholder with a
+	// RequestLease (for expiration-based leases) or
+	// IncrementEpoch+RequestLease (for epoch-based leases).
 	LeaseState_EXPIRED LeaseState = 3
-	// PROSCRIBED indicates that the lease's proposed timestamp is earlier than
-	// allowed.
+	// PROSCRIBED indicates that the lease's proposed timestamp is
+	// earlier than allowed. This is used to detect node restarts: a
+	// node that has restarted will see its former incarnation's leases
+	// as PROSCRIBED so it will renew them before using them. Note that
+	// the PROSCRIBED state is only visible to the leaseholder; other
+	// nodes will see this as a VALID lease.
 	LeaseState_PROSCRIBED LeaseState = 4
 )
 
@@ -57,7 +72,7 @@ func (x LeaseState) String() string {
 	return proto.EnumName(LeaseState_name, int32(x))
 }
 func (LeaseState) EnumDescriptor() ([]byte, []int) {
-	return fileDescriptor_lease_status_95911d6a50f866a6, []int{0}
+	return fileDescriptor_lease_status_0cae162a9ab6b5cd, []int{0}
 }
 
 // LeaseStatus holds the lease state, the timestamp at which the state
@@ -80,7 +95,7 @@ func (m *LeaseStatus) Reset()         { *m = LeaseStatus{} }
 func (m *LeaseStatus) String() string { return proto.CompactTextString(m) }
 func (*LeaseStatus) ProtoMessage()    {}
 func (*LeaseStatus) Descriptor() ([]byte, []int) {
-	return fileDescriptor_lease_status_95911d6a50f866a6, []int{0}
+	return fileDescriptor_lease_status_0cae162a9ab6b5cd, []int{0}
 }
 func (m *LeaseStatus) XXX_Unmarshal(b []byte) error {
 	return m.Unmarshal(b)
@@ -468,10 +483,10 @@ var (
 )
 
 func init() {
-	proto.RegisterFile("storage/storagepb/lease_status.proto", fileDescriptor_lease_status_95911d6a50f866a6)
+	proto.RegisterFile("storage/storagepb/lease_status.proto", fileDescriptor_lease_status_0cae162a9ab6b5cd)
 }
 
-var fileDescriptor_lease_status_95911d6a50f866a6 = []byte{
+var fileDescriptor_lease_status_0cae162a9ab6b5cd = []byte{
 	// 346 bytes of a gzipped FileDescriptorProto
 	0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x6c, 0x91, 0xcf, 0x4a, 0xeb, 0x40,
 	0x14, 0x87, 0x33, 0xfd, 0x77, 0x6f, 0x4f, 0xa1, 0xc4, 0xc1, 0x45, 0xa8, 0x18, 0x8b, 0xba, 0x28,
diff --git a/pkg/storage/storagepb/lease_status.proto b/pkg/storage/storagepb/lease_status.proto
index c798ccb438bd..98dee4768c88 100644
--- a/pkg/storage/storagepb/lease_status.proto
+++ b/pkg/storage/storagepb/lease_status.proto
@@ -27,12 +27,27 @@ enum LeaseState {
   ERROR = 0;
   // VALID indicates that the lease can be used.
   VALID = 1;
-  // STASIS indicates that the lease has not expired, but can't be used.
+  // STASIS indicates that the lease has not expired, but can't be
+  // used because it is close to expiration (a stasis period at the
+  // end of each lease is one of the ways we handle clock
+  // uncertainty). A lease in STASIS may become VALID for the same
+  // leaseholder after a successful RequestLease (for expiration-based
+  // leases) or Heartbeat (for epoch-based leases). A lease may not
+  // change hands while it is in stasis; would-be acquirers must wait
+  // for the stasis period to expire.
   STASIS = 2;
-  // EXPIRED indicates that the lease can't be used.
+  // EXPIRED indicates that the lease can't be used. An expired lease
+  // may become VALID for the same leaseholder on RequestLease or
+  // Heartbeat, or it may be replaced by a new leaseholder with a
+  // RequestLease (for expiration-based leases) or
+  // IncrementEpoch+RequestLease (for epoch-based leases).
   EXPIRED = 3;
-  // PROSCRIBED indicates that the lease's proposed timestamp is earlier than
-  // allowed.
+  // PROSCRIBED indicates that the lease's proposed timestamp is
+  // earlier than allowed. This is used to detect node restarts: a
+  // node that has restarted will see its former incarnation's leases
+  // as PROSCRIBED so it will renew them before using them. Note that
+  // the PROSCRIBED state is only visible to the leaseholder; other
+  // nodes will see this as a VALID lease.
   PROSCRIBED = 4;
 }