storage: be tighter with allocations when TargetBytes is present

Previously, while `put`ting the key into the `repr`, we could make an allocation that was too large given the remaining `TargetBytes` budget. This is the case since we're exponentially increasing the capacities of the buffers until 128MiB and because we're only accounting for the length of the slice even though the whole capacity would have a memory footprint. For example, with 10MiB of `TargetBytes` (which is used by SQL in many cases) and a ScanResponse that exceeds that limit, we'd allocate capacities that are powers of two, starting at, say, 256B, and would go all the way up to 8MiB; however, given that 10MiB limit, we'd only use 2MiB of that last 8MiB `repr` when we encounter the target bytes limit and stop. Such behavior is kinda ok if the response is marshalled by the gRPC to be sent to the other node, but it is quite unfortunate in the local fast-path cases (meaning the SQL and the KV are part of the same physical machine). In the latter scenario SQL would only account for the lengths of slices while keeping the whole slices alive for a while, leading to significant unaccounted for memory. In the example above, on the order of 6MiB would be unaccounted for - multiply that by some concurrency, and we have unaccounted memory on the order of hundreds of MiBs. The current behavior seems especially bad for the streamer use case where we issue many requests with the `TargetBytes` set and use `ScanResponse.NumBytes` field (which tracks the lengths of the slices) for the memory accounting purposes. In order to improve here, this commit teaches `put` method about the maximum capacity it can use. In the example above, the last slice would be on the order of 2MiB making everything happy: we stay very close to TargetBytes limit and don't have any wasted space. Release note: None
cockroachdb · Jul 22, 2022 · fb00b25 · fb00b25
1 parent eb28a3e
commit fb00b25
Showing 1 changed file with 48 additions and 10 deletions.
diff --git a/pkg/storage/pebble_mvcc_scanner.go b/pkg/storage/pebble_mvcc_scanner.go
@@ -82,8 +82,11 @@ func (p *pebbleResults) clear() {
 // The repr that MVCCScan / MVCCGet expects to provide as output goes:
 // <valueLen:Uint32><keyLen:Uint32><Key><Value>
 // This function adds to repr in that format.
+// - maxNewSize, if positive, indicates the maximum capacity for a new repr that
+// can be allocated. It is assumed that maxNewSize (when positive) is sufficient
+// for the new key-value pair.
 func (p *pebbleResults) put(
-	ctx context.Context, key []byte, value []byte, memAccount *mon.BoundAccount,
+	ctx context.Context, key []byte, value []byte, memAccount *mon.BoundAccount, maxNewSize int,
 ) error {
 	const minSize = 16
 	const maxSize = 128 << 20 // 128 MB
@@ -97,19 +100,38 @@ func (p *pebbleResults) put(
 	lenValue := len(value)
 	lenToAdd := p.sizeOf(lenKey, lenValue)
 	if len(p.repr)+lenToAdd > cap(p.repr) {
+		// Exponential increase by default, while ensuring that we respect
+		// - a hard lower bound of lenToAdd
+		// - a soft upper bound of maxSize
+		// - a hard upper bound of maxNewSize (if set).
+		if maxNewSize > 0 && maxNewSize < lenToAdd {
+			// Hard upper bound is greater than hard lower bound - this is a
+			// violation of our assumptions.
+			return errors.AssertionFailedf("maxNewSize %dB is not sufficient, %dB required", maxNewSize, lenToAdd)
+		}
+		// Exponential growth to ensure newSize >= lenToAdd.
 		newSize := 2 * cap(p.repr)
 		if newSize == 0 || newSize > maxSize {
-			// If the previous buffer exceeded maxSize, we don't double its capacity
-			// for next allocation, and instead reset the exponential increase, in
-			// case we had a stray huge key-value.
+			// If the previous buffer exceeded maxSize, we don't double its
+			// capacity for next allocation, and instead reset the exponential
+			// increase, in case we had a stray huge key-value.
 			newSize = minSize
 		}
-		if lenToAdd >= maxSize {
+		for newSize < lenToAdd {
+			newSize *= 2
+		}
+		// Respect soft upper-bound before hard lower-bound, since it could be
+		// lower than hard lower-bound.
+		if newSize > maxSize {
+			newSize = maxSize
+		}
+		// Respect hard upper-bound.
+		if maxNewSize > 0 && newSize > maxNewSize {
+			newSize = maxNewSize
+		}
+		// Now respect hard lower-bound.
+		if newSize < lenToAdd {
 			newSize = lenToAdd
-		} else {
-			for newSize < lenToAdd && newSize < maxSize {
-				newSize *= 2
-			}
 		}
 		if len(p.repr) > 0 {
 			p.bufs = append(p.bufs, p.repr)
@@ -974,6 +996,7 @@ func (p *pebbleMVCCScanner) addAndAdvance(
 		p.resumeReason = roachpb.RESUME_KEY_LIMIT
 	}
 
+	var mustPutKey bool
 	if p.resumeReason != 0 {
 		// If we exceeded a limit, but we're not allowed to return an empty result,
 		// then make sure we include the first key in the result. If wholeRows is
@@ -982,6 +1005,7 @@ func (p *pebbleMVCCScanner) addAndAdvance(
 			(p.results.count == 0 || (p.wholeRows && p.results.continuesFirstRow(key))) {
 			p.resumeReason = 0
 			p.resumeNextBytes = 0
+			mustPutKey = true
 		} else {
 			p.resumeKey = key
 
@@ -1000,7 +1024,21 @@ func (p *pebbleMVCCScanner) addAndAdvance(
 		}
 	}
 
-	if err := p.results.put(ctx, rawKey, rawValue, p.memAccount); err != nil {
+	// We are here due to one of the following cases:
+	// A. No limits were exceeded
+	// B. Limits were exceeded, but we need to put a key, so mustPutKey = true.
+	//
+	// For B we will never set maxNewSize.
+	// For A, we may set maxNewSize, but we already know that
+	//   p.targetBytes >= p.results.bytes + lenToAdd
+	// so maxNewSize will be sufficient.
+	var maxNewSize int
+	if p.targetBytes > 0 && p.targetBytes > p.results.bytes && !mustPutKey {
+		// INVARIANT: !mustPutKey => maxNewSize is sufficient for key-value
+		// pair.
+		maxNewSize = int(p.targetBytes - p.results.bytes)
+	}
+	if err := p.results.put(ctx, rawKey, rawValue, p.memAccount, maxNewSize); err != nil {
 		p.err = errors.Wrapf(err, "scan with start key %s", p.start)
 		return false
 	}