-
Notifications
You must be signed in to change notification settings - Fork 3.8k
/
mvcc_range_tombstone_iterator.go
291 lines (261 loc) · 10.7 KB
/
mvcc_range_tombstone_iterator.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
// Copyright 2022 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
package storage
import (
"bytes"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/errors"
)
// MVCCRangeTombstoneIterOptions are options for an MVCCRangeTombstoneIterator.
type MVCCRangeTombstoneIterOptions struct {
// LowerBound sets the inclusive lower bound of the iterator. Tombstones that
// straddle the it will have their start key truncated to the lower bound.
//
// NB: It may be tempting to use an MVCCKey here and include a timestamp, but
// this would be useless: giving e.g. a@4 would skip a tombstone starting at
// a@5, but the tombstone would logically exist at the adjacent a.Next()@5 so
// it would be emitted almost immediately anyway.
LowerBound roachpb.Key
// UpperBound sets the exclusive upper bound of the iterator. Tombstones that
// straddle it will have their end key truncated to the upper bound.
UpperBound roachpb.Key
// MinTimestamp sets the inclusive lower timestamp bound for the iterator.
MinTimestamp hlc.Timestamp
// MaxTimestamp sets the inclusive upper timestamp bound for the iterator.
MaxTimestamp hlc.Timestamp
// Fragmented will emit tombstone fragments as they are stored in Pebble.
// Fragments typically begin and end where a tombstone bound overlaps with
// another tombstone, for all overlapping tombstones. However, fragmentation
// is non-deterministic as it also depends on Pebble's internal SST structure
// and mutation history.
//
// When enabled, this results in an iteration order of StartKey,Timestamp as
// opposed to the normal EndKey,Timestamp order for range tombstones. This may
// be useful for partial results and resumption, e.g. resume spans.
Fragmented bool
}
// MVCCRangeTombstoneIterator iterates over range tombstones in an engine and
// defragments them into contiguous range tombstones. It does not support
// seeking or backtracking, see RangeTombstoneIterOptions for lower/upper bounds
// and other options.
//
// Iteration uses EndKey,Timestamp order rather than StartKey,Timestamp. For
// example, [a-z)@3 will be emitted after [c-e)@2, but before [x-z)@1. This is a
// memory optimization when defragmenting Pebble range keys, to allow emitting
// tombstones as soon as possible. Otherwise, a single tombstone across the the
// entire key span would require all other tombstones at other timestamps to be
// buffered in memory before they could be emitted. However, see the Fragmented
// option which emits non-deterministic fragments in StartKey,Timestamp order.
type MVCCRangeTombstoneIterator struct {
iter MVCCIterator
opts MVCCRangeTombstoneIterOptions
incomplete []*MVCCRangeKey // defragmentation buffer
complete []MVCCRangeKey // queued for emission
completeIdx int // current Key()
iterDone bool // TODO(erikgrinaker): remove this
err error
}
// NewMVCCRangeTombstoneIterator sets up a new MVCRangeTombstoneIterator and
// seeks to the first range tombstone. The caller must call Close() when done.
func NewMVCCRangeTombstoneIterator(
r Reader, opts MVCCRangeTombstoneIterOptions,
) *MVCCRangeTombstoneIterator {
iter := &MVCCRangeTombstoneIterator{
iter: r.NewMVCCIterator(MVCCKeyIterKind, IterOptions{
KeyTypes: IterKeyTypeRangesOnly,
LowerBound: EncodeMVCCKey(MVCCKey{Key: opts.LowerBound}),
UpperBound: EncodeMVCCKey(MVCCKey{Key: opts.UpperBound}),
// TODO(erikgrinaker): We do not set Min/MaxTimestampHint here, because
// both are required and it's apparently not always safe to use.
}),
opts: opts,
incomplete: make([]*MVCCRangeKey, 0),
complete: make([]MVCCRangeKey, 0),
}
// Seek the iterator to the lower bound and iterate until we've collected
// the first complete range tombstone (if any).
iter.iter.SeekGE(MVCCKey{Key: opts.LowerBound})
iter.findCompleteTombstones()
return iter
}
// findCompleteTombstones processes range keys at the current iterator position
// and any subsequent iterator positions until it completes one or more
// tombstones, populating completeTombstones. Current completeTombstones are
// discarded.
func (p *MVCCRangeTombstoneIterator) findCompleteTombstones() {
p.complete = p.complete[:0]
p.completeIdx = 0
p.updateTombstones()
for len(p.complete) == 0 && !p.iterDone {
if ok, err := p.iter.Valid(); err != nil {
p.err = err
return
} else if !ok {
break
}
p.iter.Next()
// NB: We update tombstones even if Next() invalidates the iterator, because
// there may be incomplete tombstones that become complete when the iterator
// is exhausted.
p.updateTombstones()
}
}
// updateTombstones inspects the range keys at the current Pebble iterator
// position, tracks tombstones in incompleteTombstones, and moves any
// completed tombstones into completeTombstones.
func (p *MVCCRangeTombstoneIterator) updateTombstones() {
var startKey, endKey roachpb.Key
var rangeKeys []MVCCRangeKeyValue
// If the iterator is exhausted, we still want to complete any remaining
// incomplete tombstones.
if ok, err := p.iter.Valid(); err != nil {
p.err = err
return
} else if ok {
startKey, endKey = p.iter.RangeBounds()
rangeKeys = p.iter.RangeKeys()
// TODO(erikgrinaker): Pebble does not yet truncate range keys to the
// LowerBound or UpperBound of the range, so we truncate them here.
if p.opts.LowerBound != nil && bytes.Compare(startKey, p.opts.LowerBound) < 0 {
startKey = p.opts.LowerBound
}
if p.opts.UpperBound != nil && bytes.Compare(endKey, p.opts.UpperBound) > 0 {
endKey = p.opts.UpperBound
}
}
// TODO(erikgrinaker): Pebble does not yet respect UpperBound for range keys,
// and seems to go into an infinite loop if we try to exhaust the iterator
// here, so we use p.iterDone to mark it as done.
if len(p.opts.UpperBound) > 0 && bytes.Compare(startKey, p.opts.UpperBound) >= 0 {
p.iterDone = true
startKey, endKey, rangeKeys = nil, nil, nil
}
// Both RangeKeys and incompleteTombstones are sorted in descending suffix
// order, so we iterate over them in lockstep and insert/update/delete
// incompleteTombstones as appropriate.
var tsIdx, rkIdx int
for rkIdx < len(rangeKeys) {
rangeKey := rangeKeys[rkIdx]
// Error on non-tombstone range keys. We expect all range keys to be range
// tombstones currently.
//
// TODO(erikgrinaker): Pebble returns []byte{}, even though we wrote nil.
if len(rangeKey.Value) != 0 {
p.err = errors.Errorf("unexpected value for range key %s, expected nil: %x",
rangeKey.Key, rangeKey.Value)
return
}
// Filter rangekeys by suffix.
//
// TODO(erikgrinaker): This can be optimized by skipping unnecessary
// comparisons since rangeKeys is sorted by suffix. Maybe later.
if !p.opts.MinTimestamp.IsEmpty() && rangeKey.Key.Timestamp.Less(p.opts.MinTimestamp) {
rkIdx++
continue
}
if !p.opts.MaxTimestamp.IsEmpty() && p.opts.MaxTimestamp.Less(rangeKey.Key.Timestamp) {
rkIdx++
continue
}
// If we're at the end of incompleteTombstones, this range tombstone must be new.
if tsIdx >= len(p.incomplete) {
p.incomplete = append(p.incomplete, &MVCCRangeKey{
StartKey: append(make([]byte, 0, len(startKey)), startKey...),
EndKey: append(make([]byte, 0, len(endKey)), endKey...),
Timestamp: rangeKey.Key.Timestamp,
})
rkIdx++
tsIdx++
continue
}
incomplete := p.incomplete[tsIdx]
cmp := incomplete.Timestamp.Compare(rangeKey.Key.Timestamp)
switch {
// If the timestamps match and the key spans are adjacent or overlapping,
// this range key extends the incomplete tombstone.
case cmp == 0 && bytes.Compare(startKey, incomplete.EndKey) <= 0:
incomplete.EndKey = append(incomplete.EndKey[:0], endKey...)
tsIdx++
rkIdx++
// This is a different tombstone at the same suffix: complete the existing
// tombstone and start a new one.
case cmp == 0:
p.complete = append(p.complete, *incomplete)
// NB: can't reuse slices, as they were placed in the completed tombstone.
incomplete.StartKey = append(make([]byte, 0, len(startKey)), startKey...)
incomplete.EndKey = append(make([]byte, 0, len(endKey)), endKey...)
// This incomplete tombstone is not present at this range key: complete it
// and remove it from the list, then try again.
case cmp == 1:
p.complete = append(p.complete, *incomplete)
p.incomplete = append(p.incomplete[:tsIdx], p.incomplete[tsIdx+1:]...)
// This range key is a new incomplete tombstone: start tracking it.
case cmp == -1:
p.incomplete = append(p.incomplete[:tsIdx+1], p.incomplete[tsIdx:]...)
p.incomplete[tsIdx] = &MVCCRangeKey{
StartKey: append(make(roachpb.Key, 0, len(startKey)), startKey...),
EndKey: append(make(roachpb.Key, 0, len(endKey)), endKey...),
Timestamp: rangeKey.Key.Timestamp,
}
tsIdx++
rkIdx++
default:
p.err = errors.Errorf("unexpected comparison result %d", cmp)
return
}
}
// If the caller has requested tombstone fragments, we complete all tombstones
// we found during this iteration by resetting tsIdx to 0. The loop below will
// handle the rest.
if p.opts.Fragmented {
tsIdx = 0
}
// If there are any remaining incomplete tombstones, they must be complete:
// make them so.
for _, ts := range p.incomplete[tsIdx:] {
p.complete = append(p.complete, *ts)
}
p.incomplete = p.incomplete[:tsIdx]
}
// Close frees up resources held by the iterator.
func (p *MVCCRangeTombstoneIterator) Close() {
p.iter.Close()
p.complete = nil
p.completeIdx = 0
}
// Next iterates to the next range tombstone. Note the unusual iteration
// order, see struct comment for details.
func (p *MVCCRangeTombstoneIterator) Next() {
p.completeIdx++
if p.completeIdx >= len(p.complete) {
p.iter.Next()
// NB: Called even if Next() fails, because we may have incomplete
// tombstones that become complete when the iterator is exhausted.
p.findCompleteTombstones()
}
}
// Key returns the current range tombstone. It will not be invalidated by the
// iterator, but will be shared by all callers.
func (p *MVCCRangeTombstoneIterator) Key() MVCCRangeKey {
return p.complete[p.completeIdx]
}
// Valid returns (true, nil) if the iterator points to a valid key, (false, nil)
// if the iterator is exhausted, or (false, error) if an error occurred during
// iteration.
func (p *MVCCRangeTombstoneIterator) Valid() (bool, error) {
if p.err != nil {
return false, p.err
}
if _, err := p.iter.Valid(); err != nil {
return false, err
}
return p.completeIdx < len(p.complete), nil
}