From 060c0ef0b8c5892cfbda9715a968f9daf548c3e8 Mon Sep 17 00:00:00 2001
From: David Taylor <tinystatemachine@gmail.com>
Date: Wed, 30 Mar 2022 01:10:45 +0000
Subject: [PATCH] kv/bulk: chunk SSTs to row boundaries

Previously an sstable might end due to size at /table/i/rowX/col/Y, if
some, but not all, families for rowX fit in that file. This is OK as far
as KV and SQL are concerned, since after we add the next file which will
start with rowX/colZ, the row is complete from the point of view of any
scan. However it does mean that if, after adding this file we determine
that we need to split before adding the next file, that split, as it
must be at a row boundary, will be at rowX, not rowX/colZ. This too is
OK, but has the slight downside of meaning that when we scatter the new
RHS, starting at rowX, we have to move the colY family KV we just added
in the prior prior file. While it is typically a trivial amount of data,
it does make the RHS non-empty and thus require _some_ cost to move.

This changes the size-based limit that triggers a file flush to wait for
the next row boundary after the size is exceeded, so that SST bounds now
also fall on row, and thus any future range split, bounds.

This is particularly relevant in conjunction with #78218.

Release note: none.
---
 pkg/kv/bulk/sst_batcher.go | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/pkg/kv/bulk/sst_batcher.go b/pkg/kv/bulk/sst_batcher.go
index 6485b413638f..92d6115e80a2 100644
--- a/pkg/kv/bulk/sst_batcher.go
+++ b/pkg/kv/bulk/sst_batcher.go
@@ -316,6 +316,23 @@ func (b *SSTBatcher) flushIfNeeded(ctx context.Context, nextKey roachpb.Key) err
 	}
 
 	if b.sstWriter.DataSize >= ingestFileSize(b.settings) {
+		// We're at/over size target, so we want to flush, but first check if we are
+		// at a new row boundary. Having row-aligned boundaries is not actually
+		// required by anything, but has the nice property of meaning a split will
+		// fall between files. This is particularly useful mid-IMPORT when we split
+		// at the beginning of the next file, as that split is at the row boundary
+		// at *or before* that file's start. If the file boundary is not aligned,
+		// the prior file overhangs beginning of the row in which the next file
+		// starts, so when we split at that row, that overhang into the RHS that we
+		// just wrote will be rewritten by the subsequent scatter. By waiting for a
+		// row boundary, we ensure any split is actually between files.
+		prevRow, prevErr := keys.EnsureSafeSplitKey(b.batchEndKey)
+		nextRow, nextErr := keys.EnsureSafeSplitKey(nextKey)
+		if prevErr == nil && nextErr == nil && bytes.Equal(prevRow, nextRow) {
+			// An error decoding either key implies it is not a valid row key and thus
+			// not the same row for our purposes; we don't care what the error is.
+			return nil // keep going to row boundary.
+		}
 		if err := b.doFlush(ctx, sizeFlush); err != nil {
 			return err
 		}