-
Notifications
You must be signed in to change notification settings - Fork 3.8k
/
proposer_kv.proto
170 lines (153 loc) · 8.48 KB
/
proposer_kv.proto
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
// Copyright 2016 The Cockroach Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
syntax = "proto2";

// Protobuf package that scopes the messages declared in this file.
package cockroach.storage.storagebase;

// Go package name used for the code generated from this file.
option go_package = "storagebase";
import "cockroach/pkg/roachpb/api.proto";
import "cockroach/pkg/roachpb/data.proto";
import "cockroach/pkg/roachpb/metadata.proto";
import "cockroach/pkg/storage/engine/enginepb/mvcc.proto";
import "cockroach/pkg/storage/storagebase/state.proto";
import "cockroach/pkg/util/hlc/timestamp.proto";
import "gogoproto/gogo.proto";
// TODO(bdarnell): we could do away with Merge and ChangeReplicas and use their
// roachpb counterparts directly. Revisit this as the protos here stabilize:
// the original intention was to stay flexible about future additions to the
// local protos without having to touch the roachpb ones, but that flexibility
// may not be required.
// Split is the event produced once a Replica has committed a split trigger.
// By then the Replica has prepared the on-disk state for both the left-hand
// and the right-hand side of the split; what remains is to update the
// left-hand side Replica and to create the right-hand side one.
message Split {
  // The split trigger that was committed. The gogoproto options make it a
  // non-nullable, embedded member of the generated struct.
  optional roachpb.SplitTrigger trigger = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];

  // RHSDelta carries the statistics for everything that the batch which
  // executed the split wrote to what is now the right-hand side. The
  // right-hand side's on-disk state is already correct, but the Store has to
  // be told about this delta so that it can adjust its counters.
  optional storage.engine.enginepb.MVCCStats rhs_delta = 2 [
    (gogoproto.nullable) = false,
    (gogoproto.customname) = "RHSDelta"
  ];
}
// Merge is the event produced by a Replica that commits a transaction
// carrying a MergeTrigger, i.e. a Replica that absorbs its right neighbor.
message Merge {
  // The merge trigger that was committed. The gogoproto options make it a
  // non-nullable, embedded member of the generated struct.
  optional roachpb.MergeTrigger trigger = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}
// ChangeReplicas is the event produced by a Replica that commits a
// transaction carrying a ChangeReplicasTrigger.
message ChangeReplicas {
  // The replica-change trigger that was committed. The gogoproto options make
  // it a non-nullable, embedded member of the generated struct.
  optional roachpb.ChangeReplicasTrigger trigger = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}
// ReplicatedEvalResult is the structured half of a proposal's payload in
// proposer-evaluated KV; the other half is a RocksDB WriteBatch. For the
// majority of proposals it is expected to be trivial. Only changes to the
// Replica's metadata state (splits, merges, rebalances, leases, log
// truncation, ...) and certain special commands have to sideline information
// here, based on which all Replicas must take action.
//
// TODO(tschottdorf): We may need to add a lease identifier to allow the
// followers to reliably produce errors for proposals which apply after a
// lease change.
message ReplicatedEvalResult {
  // Whether concurrent readers must be blocked while the proposal data is
  // being processed.
  optional bool block_reads = 10001 [(gogoproto.nullable) = false];

  // Updates to the Replica's ReplicaState. By convention, and as outlined in
  // the comment on the ReplicaState message, this field is sparsely
  // populated: any field that is set overwrites the corresponding field in
  // the state, perhaps with additional side effects (for instance on a
  // descriptor update).
  optional storage.storagebase.ReplicaState state = 10002 [
    (gogoproto.nullable) = false
  ];

  // Split event attached to this result, if any (see message Split).
  optional Split split = 10003;

  // Merge event attached to this result, if any (see message Merge).
  optional Merge merge = 10004;

  // TODO(tschottdorf): trim this down; we shouldn't need the whole request.
  optional roachpb.ComputeChecksumRequest compute_checksum = 10005;

  optional bool is_lease_request = 10006 [(gogoproto.nullable) = false];

  optional bool is_freeze = 10007 [(gogoproto.nullable) = false];

  // Denormalized copy of BatchRequest.Timestamp, kept during the transition
  // period for proposer-evaluated KV. Its only use is verifying lease
  // coverage.
  optional util.hlc.Timestamp timestamp = 10008 [
    (gogoproto.nullable) = false
  ];

  optional bool is_consistency_related = 10009 [(gogoproto.nullable) = false];

  // The stats delta that corresponds to the data in this WriteBatch. For a
  // split, it contains only the contributions to the left-hand side.
  optional storage.engine.enginepb.MVCCStats delta = 10010 [
    (gogoproto.nullable) = false
  ];

  // NOTE(review): tag 10011 is not assigned in this message; confirm whether
  // it belonged to a removed field and should be marked `reserved`.

  // ChangeReplicas event attached to this result, if any (see message
  // ChangeReplicas).
  optional ChangeReplicas change_replicas = 10012;

  // NOTE(review): unlike the other scalar fields of this message, this one
  // does not set (gogoproto.nullable) = false. Presumably it is the change in
  // Raft log size caused by the proposal; confirm the meaning and the unit.
  optional int64 raft_log_delta = 10013;
}
// WriteBatch wraps the serialized representation of a RocksDB write batch.
// The wrapper message exists for two reasons: an absent field can be told
// apart from a zero-length batch, and structs that hold a pointer to it can
// be compared with the == operator (storage.EvalResult relies on this).
message WriteBatch {
  // The serialized write batch.
  optional bytes data = 1;
}
// RaftCommand is the message that gets written to the raft log. It starts
// with some metadata about the proposal itself, followed by one of:
//   - a BatchRequest (legacy mode), or
//   - a ReplicatedEvalResult plus a WriteBatch (proposer-evaluated KV mode).
message RaftCommand {
  // Tag numbers that must not be assigned to any field of this message.
  reserved 1, 2, 10001 to 10012;

  // Metadata about the proposal itself. For backwards-compatibility these
  // fields live at the top level rather than being grouped in a sub-message.

  // origin_lease lets the apply-time code verify that the lease under which
  // the command was proposed is still in effect. For a command that does not
  // require a lease, origin_lease is empty except for its Replica field.
  optional roachpb.Lease origin_lease = 5 [(gogoproto.nullable) = false];

  // Applying the command results in an error if the lease log counter has
  // already reached (or exceeded) max_lease_index.
  //
  // The lease index is a replay protection mechanism. Like the Raft applied
  // index it is strictly increasing, but it may have gaps. A command only
  // applies successfully if the Range's applied lease index has not surpassed
  // the command's max_lease_index; if it has, the command may need to be
  // retried, that is, regenerated with a higher max_lease_index. Once the
  // command applies, the lease index advances to max_lease_index, so a
  // potential later replay will fail.
  //
  // The mechanism was introduced as a simpler alternative to using the Raft
  // applied index. That approach is fraught with complexity because it
  // requires predicting exactly the log position at which a command will
  // apply, even when the Raft leader is not colocated with the lease holder
  // (which usually proposes all commands).
  //
  // Pinning the lease index to the assigned slot (as opposed to allowing gaps
  // as we do now) is an interesting avenue to explore from the standpoint of
  // parallelization: one could hope to enforce command ordering that way,
  // without recourse to a higher-level locking primitive such as the command
  // queue. This is a hard problem. First of all, managing the pending
  // commands gets more involved: a command must not be removed if others
  // have been added after it, and on removal the assignment counters must be
  // updated accordingly. Managing retries of proposals becomes trickier as
  // well, since a retry uproots whatever ordering was originally envisioned.
  optional uint64 max_lease_index = 4 [(gogoproto.nullable) = false];

  // Legacy mode (post-raft evaluation):

  // batch_request is the KV command to apply.
  // TODO(bdarnell): Should not be set when propEvalKV is used, but is
  // currently required to support test filters.
  optional roachpb.BatchRequest batch_request = 3;

  // Proposer-evaluated KV mode:
  //
  // The fields below are not stable. General proto compatibility rules still
  // apply, but the fields are intentionally kept at high tag numbers for now
  // so that a stabilized version can later be inserted at low tag numbers.
  // They are only populated if proposer-evaluated KV was in effect when the
  // command was proposed.

  optional ReplicatedEvalResult replicated_eval_result = 10013;

  // TODO(tschottdorf): using an extra message here (and not just `bytes`) to
  // allow the generated RaftCommand to be compared directly. If this costs an
  // extra large allocation, we need to do something different.
  optional WriteBatch write_batch = 10014;
}