From 9d44ec2283dd0a19591ef49e358d4ec0da96bf7f Mon Sep 17 00:00:00 2001
From: Alberto Benegiamo <alberto.benegiamo@gmail.com>
Date: Mon, 16 Oct 2023 08:53:59 -0700
Subject: [PATCH] Validator Diffs: docs and UTs cleanup (#2037)

Co-authored-by: Stephen Buttolph <stephen@avalabs.org>
---
 vms/platformvm/docs/validators_versioning.md  | 113 ++++++++++++++++++
 vms/platformvm/validator_set_property_test.go |  74 +++++++-----
 2 files changed, 159 insertions(+), 28 deletions(-)
 create mode 100644 vms/platformvm/docs/validators_versioning.md

diff --git a/vms/platformvm/docs/validators_versioning.md b/vms/platformvm/docs/validators_versioning.md
new file mode 100644
index 000000000000..c4fce00399c5
--- /dev/null
+++ b/vms/platformvm/docs/validators_versioning.md
@@ -0,0 +1,113 @@
+# Validators versioning
+
+One of the main responsibilities of the P-chain is to register and expose the validator set of any Subnet at every height.
+
+This information helps Subnets to bootstrap securely, downloading information from active validators only; moreover it supports validated cross-chain communication via Warp.
+
+In this brief document we dive into the technicalities of how `platformVM` tracks and versions the validator set of any Subnet.
+
+## The tracked content
+
+The entry point to retrieve validator information at a given height is the `GetValidatorSet` method in the `validators` package. Here is its signature:
+
+```golang
+GetValidatorSet(ctx context.Context, height uint64, subnetID ids.ID) (map[ids.NodeID]*GetValidatorOutput, error)
+```
+
+`GetValidatorSet` lets any VM specify a Subnet and a height and returns the data of all Subnet validators active at the requested height, and only those.
+
+Validator data are collected in a struct named `validators.GetValidatorOutput`, which holds, for each active validator, its `NodeID`, its `Weight`, and its `BLS Public Key` if one was registered.
+
+Note that a validator `Weight` is not just its stake; it's the aggregate value of the validator's own stake and all of its delegators' stake. A validator's `Weight` gauges how relevant its preference should be in consensus or Warp operations.
+
+We will see in the next section how the P-chain keeps track of this information over time as the validator set changes.
+
+## Validator diffs content
+
+Every new block accepted by the P-chain can potentially alter the validator set of any Subnet, including the primary one. New validators may be added; some of them may have reached their end of life and are therefore removed. Moreover a validator can register itself again once its staking time is done, possibly with a `Weight` and a `BLS Public key` different from the previous staking period.
+
+Whenever the block at height `H` adds or removes a validator, the P-chain does, among others, the following operations:
+
+1. it updates the current validator set to add the new validator or remove it if expired;
+2. it explicitly records the validator set diffs with respect to the validator set at height `H-1`.
+
+These diffs are key to rebuilding the validator set at a given past height. In this section we illustrate their content. In the next ones, we'll see how the diffs are stored and used.
+
+The validators diffs track changes in a validator's `Weight` and `BLS Public key`. Along with the `NodeID` this is the data exposed by the `GetValidatorSet` method.
+
+Note that `Weight` and `BLS Public key` behave differently throughout the validator lifetime:
+
+1. `BLS Public key` cannot change through a validator's lifetime. It can only change when a validator is added/re-added and removed.
+2. `Weight` can change throughout a validator's lifetime by the creation and removal of its delegators as well as by validator's own creation and removal.
+
+Here is a scheme of what `Weight` and `BLS Public key` diff content we record upon relevant scenarios:
+
+|                    | Weight Diff (forward looking)                                                                           | BLS Key Diff (backward looking)                                                       |
+|--------------------|---------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------|
+| Validator creation | record ```golang state.ValidatorWeightDiff{       Decrease: false,     Weight: validator.Weight, }``` | record an empty byte slice if validator.BlsKey is specified; otherwise record nothing |
+| Delegator creation | record ```golang state.ValidatorWeightDiff{      Decrease: false,     Weight: delegator.Weight, }```  | No entry is recorded                                                                  |
+| Delegator removal  | record ```golang state.ValidatorWeightDiff{      Decrease: true,     Weight: delegator.Weight, }```  | No entry is recorded                                                                  |
+| Validator removal  | record ```golang state.ValidatorWeightDiff{      Decrease: true,     Weight: validator.Weight, }```  | record validator.BlsKey if it is specified; otherwise record nothing                  |
+
+Note that `Weight` diffs are encoded as `state.ValidatorWeightDiff` and are *forward-looking*: a diff recorded at height `H` stores the change that transforms validator weight at height `H-1` into validator weight at height `H`.
+
+In contrast, `BLS Public Key` diffs are *backward-looking*: a diff recorded at height `H` stores the change that transforms validator `BLS Public Key` at height `H` into validator `BLS Public key` at height `H-1`.
+
+Finally, if no changes are made to the validator set no diff entry is recorded. This implies that a validator `Weight` or `BLS Public Key` diff may not be stored for every height `H`.
+
+## Validator diffs layout
+
+Validator diffs layout is optimized to support iteration. Validator sets are rebuilt by accumulating `Weight` and `BLS Public Key` diffs from the top-most height down to the requested height. So validator diffs are stored so that it's fast to iterate them in this order.
+
+`Weight` diffs are stored as a contiguous block of key-value pairs as follows:
+
+| Key                                | Value                                |
+|------------------------------------|--------------------------------------|
+| SubnetID + Reverse_Height + NodeID | serialized state.ValidatorWeightDiff |
+
+Note that:
+
+1. `Weight` diffs related to a Subnet are stored contiguously.
+2. Diff height is serialized as `Reverse_Height`. It is stored in big-endian format and has its bits flipped too. Big endianness ensures that heights are stored in order, while bit flipping ensures that the top-most height is always the first.
+3. `NodeID` is part of the key and `state.ValidatorWeightDiff` is part of the value.
+
+`BLS Public Key` diffs are stored as follows:
+
+| Key                                | Value                         |
+|------------------------------------|-------------------------------|
+| SubnetID + Reverse_Height + NodeID | validator.BlsKey bytes or nil |
+
+Note that:
+
+1. `BLS Public Key` diffs have the same keys as `Weight` diffs. This implies that the same ordering is guaranteed.
+2. Value is either validator `BLS Public Key` bytes or an empty byte slice, as illustrated in the previous section.
+
+## Validators diff usage in rebuilding validators state
+
+Now let's see how diffs are used to rebuild the validator set at a given height. The procedure varies slightly between Primary Network and Subnet validators, so we'll describe them separately.
+We assume that the reader knows that, as of the Cortina fork, every Subnet validator must also be a Primary Network validator.
+
+### Primary network validator set rebuild
+
+Say the P-Chain's current height is `T` and we want to retrieve the Primary Network validators at height `H < T`. We proceed as follows:
+
+1. We retrieve the Primary Network validator set at current height `T`. This is the base state on top of which diffs will be applied.
+2. We apply weight diffs first. Specifically:
+   - `Weight` diff iteration starts from the top-most height smaller or equal to `T`. Remember that entry heights do not need to be contiguous, so the iteration starts from the highest height smaller or equal to `T`, in case `T` does not have a diff entry.
+   - Since `Weight` diffs are forward-looking, each diff is applied in reverse. A validator's weight is decreased if `state.ValidatorWeightDiff.Decrease` is `false` and it is increased if it is `true`.
+   - We take care of adding or removing a validator from the base set based on its weight. Whenever a validator weight, following diff application, becomes zero, we drop it; conversely whenever we encounter a diff increasing weight for a currently-non-existing validator, we add the validator to the base set.
+   - The iteration stops at the first height smaller or equal to `H+1`. Note that a `Weight` diff stored at height `K` holds the content to turn validator state at height `K-1` into validator state at height `K`. So to get validator state at height `K` we must apply diff content at height `K+1`.
+3. Once all `Weight` diffs have been applied, the resulting validator set will contain all Primary Network validators active at height `H` and only those. We still need to compute the correct `BLS Public Keys` registered at height `H` for these validators, as each validator may have restaked between height `H` and `T`. They may have a different (or no) `BLS Public Key` at either height. We solve this by applying `BLS Public Key` diffs to the validator set:
+   - Once again we iterate `BLS Public Key` diffs from the top-most height smaller or equal to `T` till the first height smaller or equal to `H+1`.
+   - Since `BLS Public Key` diffs are *backward-looking*, we simply nil the BLS key when diff is nil and we restore the BLS Key when it is specified in the diff.
+
+### Subnet validator set rebuild
+
+Let's first see why Subnet validators need to be handled differently. As of the `Cortina` fork, we allow `BLS Public Key` registration only for Primary Network validators. A given `NodeID` may be both a Primary Network validator and a Subnet validator, but it'll register its `BLS Public Key` only when it registers as a Primary Network validator. Despite this, we want to provide a validator's `BLS Public Key` in the `validators.GetValidatorOutput` returned by `GetValidatorSet`. So we need to fetch it from the Primary Network validator set.
+
+Say the P-chain's current height is `T` and we want to retrieve the Subnet validators at height `H < T`. We proceed as follows:
+
+1. We retrieve both Subnet and Primary Network validator sets at current height `T`,
+2. We apply `Weight` diff on top of the Subnet validator set, exactly as described in the previous section,
+3. Before applying `BLS Public Key` diffs, we retrieve `BLS Public Key` from the current Primary Network validator set for each of the current Subnet validators. This ensures the `BLS Public Key`s are duly initialized before applying the diffs,
+4. Finally we apply the `BLS Public Key` diffs exactly as described in the previous section.
diff --git a/vms/platformvm/validator_set_property_test.go b/vms/platformvm/validator_set_property_test.go
index bccbd0f77fd5..bfcdea3c0683 100644
--- a/vms/platformvm/validator_set_property_test.go
+++ b/vms/platformvm/validator_set_property_test.go
@@ -8,6 +8,7 @@ import (
 	"errors"
 	"fmt"
 	"reflect"
+	"sort"
 	"testing"
 	"time"
 
@@ -15,6 +16,8 @@ import (
 	"github.com/leanovate/gopter/gen"
 	"github.com/leanovate/gopter/prop"
 
+	"golang.org/x/exp/maps"
+
 	"github.com/ava-labs/avalanchego/chains"
 	"github.com/ava-labs/avalanchego/chains/atomic"
 	"github.com/ava-labs/avalanchego/database/manager"
@@ -93,8 +96,8 @@ func TestGetValidatorsSetProperty(t *testing.T) {
 				return fmt.Sprintf("failed building events sequence: %s", err.Error())
 			}
 
-			validatorsSetByHeightAndSubnet := make(map[uint64]map[ids.ID]map[ids.NodeID]*validators.GetValidatorOutput)
-			if err := takeValidatorsSnapshotAtCurrentHeightAndTest(vm, validatorsSetByHeightAndSubnet); err != nil {
+			validatorSetByHeightAndSubnet := make(map[uint64]map[ids.ID]map[ids.NodeID]*validators.GetValidatorOutput)
+			if err := takeValidatorsSnapshotAtCurrentHeight(vm, validatorSetByHeightAndSubnet); err != nil {
 				return fmt.Sprintf("could not take validators snapshot: %s", err.Error())
 			}
 
@@ -104,7 +107,7 @@ func TestGetValidatorsSetProperty(t *testing.T) {
 				currentSubnetValidator  = (*state.Staker)(nil)
 			)
 			for _, ev := range validatorsTimes {
-				// at each we remove at least a subnet validator
+				// at each step we remove at least a subnet validator
 				if currentSubnetValidator != nil {
 					err := terminateSubnetValidator(vm, currentSubnetValidator)
 					if err != nil {
@@ -112,7 +115,7 @@ func TestGetValidatorsSetProperty(t *testing.T) {
 					}
 					currentSubnetValidator = nil
 
-					if err := takeValidatorsSnapshotAtCurrentHeightAndTest(vm, validatorsSetByHeightAndSubnet); err != nil {
+					if err := takeValidatorsSnapshotAtCurrentHeight(vm, validatorSetByHeightAndSubnet); err != nil {
 						return fmt.Sprintf("could not take validators snapshot: %s", err.Error())
 					}
 				}
@@ -123,7 +126,7 @@ func TestGetValidatorsSetProperty(t *testing.T) {
 					if err != nil {
 						return fmt.Sprintf("could not add subnet validator: %s", err.Error())
 					}
-					if err := takeValidatorsSnapshotAtCurrentHeightAndTest(vm, validatorsSetByHeightAndSubnet); err != nil {
+					if err := takeValidatorsSnapshotAtCurrentHeight(vm, validatorSetByHeightAndSubnet); err != nil {
 						return fmt.Sprintf("could not take validators snapshot: %s", err.Error())
 					}
 
@@ -138,7 +141,7 @@ func TestGetValidatorsSetProperty(t *testing.T) {
 						// no need to nil current primary validator, we'll
 						// reassign immediately
 
-						if err := takeValidatorsSnapshotAtCurrentHeightAndTest(vm, validatorsSetByHeightAndSubnet); err != nil {
+						if err := takeValidatorsSnapshotAtCurrentHeight(vm, validatorSetByHeightAndSubnet); err != nil {
 							return fmt.Sprintf("could not take validators snapshot: %s", err.Error())
 						}
 					}
@@ -146,7 +149,7 @@ func TestGetValidatorsSetProperty(t *testing.T) {
 					if err != nil {
 						return fmt.Sprintf("could not add primary validator without BLS key: %s", err.Error())
 					}
-					if err := takeValidatorsSnapshotAtCurrentHeightAndTest(vm, validatorsSetByHeightAndSubnet); err != nil {
+					if err := takeValidatorsSnapshotAtCurrentHeight(vm, validatorSetByHeightAndSubnet); err != nil {
 						return fmt.Sprintf("could not take validators snapshot: %s", err.Error())
 					}
 
@@ -161,7 +164,7 @@ func TestGetValidatorsSetProperty(t *testing.T) {
 						// no need to nil current primary validator, we'll
 						// reassign immediately
 
-						if err := takeValidatorsSnapshotAtCurrentHeightAndTest(vm, validatorsSetByHeightAndSubnet); err != nil {
+						if err := takeValidatorsSnapshotAtCurrentHeight(vm, validatorSetByHeightAndSubnet); err != nil {
 							return fmt.Sprintf("could not take validators snapshot: %s", err.Error())
 						}
 					}
@@ -169,7 +172,7 @@ func TestGetValidatorsSetProperty(t *testing.T) {
 					if err != nil {
 						return fmt.Sprintf("could not add primary validator with BLS key: %s", err.Error())
 					}
-					if err := takeValidatorsSnapshotAtCurrentHeightAndTest(vm, validatorsSetByHeightAndSubnet); err != nil {
+					if err := takeValidatorsSnapshotAtCurrentHeight(vm, validatorSetByHeightAndSubnet); err != nil {
 						return fmt.Sprintf("could not take validators snapshot: %s", err.Error())
 					}
 
@@ -177,9 +180,37 @@ func TestGetValidatorsSetProperty(t *testing.T) {
 					return fmt.Sprintf("unexpected staker type: %v", ev.eventType)
 				}
 			}
-			if err := takeValidatorsSnapshotAtCurrentHeightAndTest(vm, validatorsSetByHeightAndSubnet); err != nil {
-				return fmt.Sprintf("could not take validators snapshot: %s", err.Error())
+
+			// Checks: let's look back at validator sets at previous heights and
+			// make sure they match the snapshots already taken
+			snapshotHeights := maps.Keys(validatorSetByHeightAndSubnet)
+			sort.Slice(snapshotHeights, func(i, j int) bool { return snapshotHeights[i] < snapshotHeights[j] })
+			for idx, snapShotHeight := range snapshotHeights {
+				lastAcceptedHeight, err := vm.GetCurrentHeight(context.Background())
+				if err != nil {
+					return err.Error()
+				}
+
+				nextSnapShotHeight := lastAcceptedHeight + 1
+				if idx != len(snapshotHeights)-1 {
+					nextSnapShotHeight = snapshotHeights[idx+1]
+				}
+
+				// within [snapShotHeight] and [nextSnapShotHeight], the validator set
+				// does not change and must be equal to snapshot at [snapShotHeight]
+				for height := snapShotHeight; height < nextSnapShotHeight; height++ {
+					for subnetID, validatorsSet := range validatorSetByHeightAndSubnet[snapShotHeight] {
+						res, err := vm.GetValidatorSet(context.Background(), height, subnetID)
+						if err != nil {
+							return fmt.Sprintf("failed GetValidatorSet at height %v: %v", height, err)
+						}
+						if !reflect.DeepEqual(validatorsSet, res) {
+							return "failed validators set comparison"
+						}
+					}
+				}
 			}
+
 			return ""
 		},
 		gen.SliceOfN(
@@ -198,7 +229,7 @@ func TestGetValidatorsSetProperty(t *testing.T) {
 	properties.TestingRun(t)
 }
 
-func takeValidatorsSnapshotAtCurrentHeightAndTest(vm *VM, validatorsSetByHeightAndSubnet map[uint64]map[ids.ID]map[ids.NodeID]*validators.GetValidatorOutput) error {
+func takeValidatorsSnapshotAtCurrentHeight(vm *VM, validatorsSetByHeightAndSubnet map[uint64]map[ids.ID]map[ids.NodeID]*validators.GetValidatorOutput) error {
 	if validatorsSetByHeightAndSubnet == nil {
 		validatorsSetByHeightAndSubnet = make(map[uint64]map[ids.ID]map[ids.NodeID]*validators.GetValidatorOutput)
 	}
@@ -219,8 +250,9 @@ func takeValidatorsSnapshotAtCurrentHeightAndTest(vm *VM, validatorsSetByHeightA
 	if err != nil {
 		return err
 	}
+	defer stakerIt.Release()
 	for stakerIt.Next() {
-		v := *stakerIt.Value()
+		v := stakerIt.Value()
 		validatorsSet, ok := validatorsSetBySubnet[v.SubnetID]
 		if !ok {
 			validatorsSetBySubnet[v.SubnetID] = make(map[ids.NodeID]*validators.GetValidatorOutput)
@@ -243,19 +275,6 @@ func takeValidatorsSnapshotAtCurrentHeightAndTest(vm *VM, validatorsSetByHeightA
 			Weight:    v.Weight,
 		}
 	}
-
-	// test the validator sets
-	for height, subnetSets := range validatorsSetByHeightAndSubnet {
-		for subnet, validatorsSet := range subnetSets {
-			res, err := vm.GetValidatorSet(context.Background(), height, subnet)
-			if err != nil {
-				return fmt.Errorf("failed GetValidatorSet: %w", err)
-			}
-			if !reflect.DeepEqual(validatorsSet, res) {
-				return errors.New("failed validators set comparison")
-			}
-		}
-	}
 	return nil
 }
 
@@ -469,8 +488,7 @@ type validatorInputData struct {
 }
 
 // buildTimestampsList creates validators start and end time, given the event list.
-// output is returned as a list of state.Stakers, just because it's a convenient object to
-// collect all relevant information.
+// output is returned as a list of validatorInputData
 func buildTimestampsList(events []uint8, currentTime time.Time, nodeID ids.NodeID) ([]*validatorInputData, error) {
 	res := make([]*validatorInputData, 0, len(events))