From b60a6ce831a7555b99c58e58d3b8f71b3159ce3b Mon Sep 17 00:00:00 2001
From: Nathan VanBenschoten <nvanbenschoten@gmail.com>
Date: Wed, 1 Aug 2018 00:51:37 -0400
Subject: [PATCH] storage: truncate log only between first index and truncate
 index

Raft log truncations currently perform two steps (there may be
others, but for the sake of this discussion, let's consider only
these two):
1. above raft, they compute the stats of all raft log entries up
   to the truncation entry.
2. beneath raft, they use ClearIterRange to clear all raft log
   entries up to the truncation entry.

In both steps, operations are performed on all entries up to the
truncation entry, and in both steps these operations start from
entry 0. A comment added in #16993 gives some idea as to why:

> // We start at index zero because it's always possible that a previous
> // truncation did not clean up entries made obsolete by the previous
> // truncation.

My current understanding is that this case where a Raft log has
been truncated but its entries not cleaned up is only possible if
a node crashes between `applyRaftCommand` and `handleEvalResultRaftMuLocked`.
This brings up the question: why don't we truncate raft entries
downstream of raft in `applyRaftCommand`? That way, the entries
could be deleted atomically with the update to the `RaftTruncatedStateKey`
and we wouldn't have to worry about them ever diverging or Raft entries
being leaked. That seems like a trivial change, and if that was the
case, would the approach here be safe? I don't see a reason why
not.

For motivation on why we should explore this, I've found that when
running `sysbench oltp_insert` on a fresh cluster without pre-splits to
measure single range write through, raft log truncation accounts for
about 20% of CPU utilization.

If we switch the ClearIterRange to a ClearRange downstream of raft,
we improve throughput by 13% and reduce the amount of CPU that raft
log truncation uses to about 5%. It's obvious why this speeds up the
actual truncation itself downstream of raft. The reason why it speeds
up the stats computation is less clear, but it may be allowing a RocksDB
iterator to more easily skip over the deleted entry keys.

If we make the change proposed here, we improve throughput by 28% and
reduce the amount of CPU that raft log truncation uses to a negligible
amount (< 1%, hard to tell exactly). The reason this speeds both the
truncation and the stats computation is because it avoids iterating
over RocksDB tombstones for all Raft entries that have ever existed
on the range.

The throughput improvements are of course exaggerated because we are
isolating the throughput of a single range, but they're significant
enough to warrant exploration about whether we can make this approach
work.

Finally, the outsized impact of this small change naturally justifies
further exploration. If we could make the change here safe (i.e. if we
could depend on replica.FirstIndex() to always be a lower bound on raft
log entry keys), could we make similar changes elsewhere? Are there
other places where we iterate over an entire raft log keyspace and
inadvertently run into all of the deletion tombstones when we could
simply skip to the `replica.FirstIndex()`? At a minimum, I believe
that `clearRangeData` fits this description, so there may be room to
speed up snapshots and replica GC.

Release note (performance improvement): Reduce the cost of Raft log
truncations and increase single-range throughput.
---
 pkg/storage/batcheval/cmd_truncate_log.go |  5 +--
 pkg/storage/replica.go                    | 30 ++++++++++++++++++
 pkg/storage/replica_proposal.go           | 38 -----------------------
 3 files changed, 31 insertions(+), 42 deletions(-)

diff --git a/pkg/storage/batcheval/cmd_truncate_log.go b/pkg/storage/batcheval/cmd_truncate_log.go
index bf9ad3861923..c45491e027ae 100644
--- a/pkg/storage/batcheval/cmd_truncate_log.go
+++ b/pkg/storage/batcheval/cmd_truncate_log.go
@@ -81,10 +81,7 @@ func TruncateLog(
 		return result.Result{}, err
 	}
 
-	// We start at index zero because it's always possible that a previous
-	// truncation did not clean up entries made obsolete by the previous
-	// truncation.
-	start := engine.MakeMVCCMetadataKey(keys.RaftLogKey(rangeID, 0))
+	start := engine.MakeMVCCMetadataKey(keys.RaftLogKey(rangeID, firstIndex))
 	end := engine.MakeMVCCMetadataKey(keys.RaftLogKey(rangeID, args.Index).PrefixEnd())
 
 	var ms enginepb.MVCCStats
diff --git a/pkg/storage/replica.go b/pkg/storage/replica.go
index 9ff4f69650dd..cb955890baeb 100644
--- a/pkg/storage/replica.go
+++ b/pkg/storage/replica.go
@@ -5590,6 +5590,7 @@ func (r *Replica) applyRaftCommand(
 	usingAppliedStateKey := r.mu.state.UsingAppliedStateKey
 	oldRaftAppliedIndex := r.mu.state.RaftAppliedIndex
 	oldLeaseAppliedIndex := r.mu.state.LeaseAppliedIndex
+	oldTruncatedState := r.mu.state.TruncatedState
 
 	// Exploit the fact that a split will result in a full stats
 	// recomputation to reset the ContainsEstimates flag.
@@ -5678,6 +5679,35 @@ func (r *Replica) applyRaftCommand(
 		}
 	}
 
+	if rResult.State != nil && rResult.State.TruncatedState != nil {
+		newTruncatedState := rResult.State.TruncatedState
+
+		if r.store.cfg.Settings.Version.IsActive(cluster.VersionRaftLogTruncationBelowRaft) {
+			// Truncate the Raft log from the previous truncation index to the
+			// new truncation index. This is performed atomically with the raft
+			// command application so that the TruncatedState index is always
+			// consistent with the state of the Raft log itself. We can use the
+			// distinct writer because we know all writes will be to distinct
+			// keys.
+			start := engine.MakeMVCCMetadataKey(
+				keys.RaftLogKey(r.RangeID, oldTruncatedState.Index),
+			)
+			end := engine.MakeMVCCMetadataKey(
+				keys.RaftLogKey(r.RangeID, newTruncatedState.Index).PrefixEnd(),
+			)
+			// Clear the log entries. Intentionally don't use range deletion
+			// tombstones (ClearRange()) due to performance concerns connected
+			// to having many range deletion tombstones. There is a chance that
+			// ClearRange will perform well here because the tombstones could be
+			// "collapsed", but it is hardly worth the risk at this point.
+			iter := r.store.Engine().NewIterator(engine.IterOptions{UpperBound: end.Key})
+			if err := writer.ClearIterRange(iter, start, end); err != nil {
+				log.Errorf(ctx, "unable to clear truncated Raft entries for %+v: %s", rResult.State.TruncatedState, err)
+			}
+			iter.Close()
+		}
+	}
+
 	// TODO(peter): We did not close the writer in an earlier version of
 	// the code, which went undetected even though we used the batch after
 	// (though only to commit it). We should add an assertion to prevent that in
diff --git a/pkg/storage/replica_proposal.go b/pkg/storage/replica_proposal.go
index 07991f0b8fe2..a84e160ea105 100644
--- a/pkg/storage/replica_proposal.go
+++ b/pkg/storage/replica_proposal.go
@@ -545,44 +545,6 @@ func (r *Replica) handleReplicatedEvalResult(
 			r.mu.state.TruncatedState = newTruncState
 			r.mu.Unlock()
 
-			// TODO(tschottdorf): everything below doesn't need to be on this
-			// goroutine. Worth moving out -- truncations are frequent and missing
-			// one of the side effects below doesn't matter. Need to be careful
-			// about the interaction with `evalTruncateLog` though, which computes
-			// some stats based on the log entries it sees. Also, sideloaded storage
-			// needs to hold the raft mu. Perhaps it should just get its own mutex
-			// (which is usually held together with raftMu, except when accessing
-			// the storage for a truncation). Or, even better, make use of the fact
-			// that all we need to synchronize is disk i/o, and there is no overlap
-			// between files *removed* during truncation and those active in Raft.
-
-			if r.store.cfg.Settings.Version.IsActive(cluster.VersionRaftLogTruncationBelowRaft) {
-				// Truncate the Raft log.
-				batch := r.store.Engine().NewWriteOnlyBatch()
-				// We know that all of the deletions from here forward will be to distinct keys.
-				writer := batch.Distinct()
-				start := engine.MakeMVCCMetadataKey(keys.RaftLogKey(r.RangeID, 0))
-				end := engine.MakeMVCCMetadataKey(
-					keys.RaftLogKey(r.RangeID, newTruncState.Index).PrefixEnd(),
-				)
-				iter := r.store.Engine().NewIterator(engine.IterOptions{UpperBound: end.Key})
-				// Clear the log entries. Intentionally don't use range deletion
-				// tombstones (ClearRange()) due to performance concerns connected
-				// to having many range deletion tombstones. There is a chance that
-				// ClearRange will perform well here because the tombstones could be
-				// "collapsed", but it is hardly worth the risk at this point.
-				if err := writer.ClearIterRange(iter, start, end); err != nil {
-					log.Errorf(ctx, "unable to clear truncated Raft entries for %+v: %s", newTruncState, err)
-				}
-				iter.Close()
-				writer.Close()
-
-				if err := batch.Commit(false); err != nil {
-					log.Errorf(ctx, "unable to clear truncated Raft entries for %+v: %s", newTruncState, err)
-				}
-				batch.Close()
-			}
-
 			// Clear any entries in the Raft log entry cache for this range up
 			// to and including the most recently truncated index.
 			r.store.raftEntryCache.clearTo(r.RangeID, newTruncState.Index+1)