-
Notifications
You must be signed in to change notification settings - Fork 3.8k
/
proposer_kv.proto
139 lines (129 loc) · 7.23 KB
/
proposer_kv.proto
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
// Copyright 2016 The Cockroach Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
// This file is written against proto2 syntax; the fields declared in it use
// explicit `optional` labels.
syntax = "proto2";
// Proto package under which the messages in this file are declared.
package cockroach.storage.storagebase;
// Go package name used for the code generated from this file.
option go_package = "storagebase";
import "cockroach/pkg/roachpb/api.proto";
import "cockroach/pkg/roachpb/data.proto";
import "cockroach/pkg/roachpb/metadata.proto";
import "cockroach/pkg/storage/engine/enginepb/mvcc.proto";
import "cockroach/pkg/storage/storagebase/state.proto";
import "cockroach/pkg/util/hlc/timestamp.proto";
import "gogoproto/gogo.proto";
// Split is emitted when a Replica commits a split trigger. It signals that the
// Replica has prepared the on-disk state for both the left and right hand
// sides of the split, and that the left hand side Replica should be updated as
// well as the right hand side created.
message Split {
  // The split trigger that was committed. The gogoproto options make the
  // generated Go field a non-pointer value that is embedded in the struct.
  optional roachpb.SplitTrigger trigger = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];

  // RHSDelta carries the statistics for everything written to what has become
  // the right-hand side of the split by the batch that executed the split.
  // The right-hand side's on-disk state is already correct; the Store still
  // has to learn about this delta so it can adjust its counters accordingly.
  optional storage.engine.enginepb.MVCCStats rhs_delta = 2 [
    (gogoproto.nullable) = false,
    (gogoproto.customname) = "RHSDelta"
  ];
}
// Merge is emitted by a Replica which commits a transaction with
// a MergeTrigger (i.e. absorbs its right neighbor).
message Merge {
  // The merge trigger of the committed transaction. The gogoproto options
  // make the generated Go field a non-pointer value that is embedded in the
  // struct.
  optional roachpb.MergeTrigger trigger = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}
// ChangeReplicas is emitted by a Replica which commits a transaction with
// a ChangeReplicasTrigger.
message ChangeReplicas {
  // The ChangeReplicasTrigger of the committed transaction. The gogoproto
  // options make the generated Go field a non-pointer value that is embedded
  // in the struct.
  optional roachpb.ChangeReplicasTrigger trigger = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}
// ReplicatedProposalData is the structured information which together with
// a RocksDB WriteBatch constitutes the proposal payload in proposer-evaluated
// KV. For the majority of proposals, we expect ReplicatedProposalData to be
// trivial; only changes to the metadata state (splits, merges, rebalances,
// leases, log truncation, ...) of the Replica or certain special commands must
// sideline information here based on which all Replicas must take action.
//
// TODO(tschottdorf): We may need to add a lease identifier to allow the
// followers to reliably produce errors for proposals which apply after a
// lease change.
message ReplicatedProposalData {
  // ----------------------------------------------------------------------
  // Fields below (tags 1-4) are what used to be the RaftCommand message.
  // ----------------------------------------------------------------------

  // The generated Go field is named RangeID and is cast to roachpb.RangeID.
  optional int64 range_id = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.customname) = "RangeID",
    (gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.RangeID"
  ];
  optional roachpb.ReplicaDescriptor origin_replica = 2 [
    (gogoproto.nullable) = false
  ];
  optional roachpb.BatchRequest cmd = 3;

  // Applying the command results in an error if the lease log counter has
  // already reached (or exceeded) max_lease_index.
  //
  // The lease index protects against replays. Like the Raft applied index it
  // is strictly increasing, though gaps are allowed. A command applies
  // successfully only if the Range's applied lease index has not surpassed
  // the command's max_lease_index; otherwise the command may have to be
  // retried, i.e. regenerated with a higher max_lease_index. On successful
  // application the lease index advances to max_lease_index, so that a
  // potential later replay fails.
  //
  // The mechanism exists as a simpler alternative to relying on the Raft
  // applied index. That approach is fraught with complexity because it
  // requires predicting the exact log position at which a command will
  // apply, even when the Raft leader is not colocated with the lease holder
  // (which usually proposes all commands).
  //
  // Pinning the lease index to the assigned slot, rather than permitting
  // gaps as is done now, is an interesting avenue to explore with regard to
  // parallelization: one could hope to enforce command ordering that way,
  // without recourse to a higher-level locking primitive such as the command
  // queue. It is a hard problem, though. Managing the pending commands gets
  // more involved: a command must not be removed if others were added after
  // it, and on removal the assignment counters must be updated accordingly.
  // Retrying proposals also becomes trickier, since a retry uproots whatever
  // ordering was originally envisioned.
  optional uint64 max_lease_index = 4 [(gogoproto.nullable) = false];

  // ----------------------------------------------------------------------
  // The former RaftCommand ends here; the proposer-evaluated KV protos
  // begin. They are not stable. General proto compatibility rules still
  // apply, but these fields deliberately use high tag numbers for now so
  // that a stabilized version can later be inserted at low tag numbers.
  // ----------------------------------------------------------------------

  // Whether concurrent readers are to be blocked while the proposal data is
  // being processed.
  optional bool block_reads = 10001 [(gogoproto.nullable) = false];

  // Updates to the Replica's ReplicaState. By convention, and as outlined in
  // the comment on the ReplicaState message, this field is populated
  // sparsely: every field that is set overwrites the corresponding field in
  // the state, possibly with additional side effects (for instance on a
  // descriptor update).
  optional storage.storagebase.ReplicaState state = 10002 [
    (gogoproto.nullable) = false
  ];

  optional Split split = 10003;
  optional Merge merge = 10004;

  // TODO(tschottdorf): trim this down; the whole request should not be
  // needed.
  optional roachpb.ComputeChecksumRequest compute_checksum = 10005;

  optional bool is_lease_request = 10006 [(gogoproto.nullable) = false];
  optional bool is_freeze = 10007 [(gogoproto.nullable) = false];

  // A denormalized copy of BatchRequest.Timestamp, kept for the transition
  // period to proposer-evaluated KV. Used only to verify lease coverage.
  optional util.hlc.Timestamp timestamp = 10008 [
    (gogoproto.nullable) = false
  ];

  optional bool is_consistency_related = 10009 [(gogoproto.nullable) = false];

  // The stats delta that corresponds to the data in this WriteBatch. For a
  // split, it holds only the left-hand side's contributions.
  optional storage.engine.enginepb.MVCCStats delta = 10010 [
    (gogoproto.nullable) = false
  ];

  message WriteBatch {
    optional bytes data = 1;
  }

  // TODO(tschottdorf): a separate message is used here (instead of plain
  // `bytes`) so that the generated ReplicatedProposalData can be compared
  // directly. Should this cost an extra large allocation, something
  // different needs to be done.
  optional WriteBatch write_batch = 10011;

  optional ChangeReplicas change_replicas = 10012;
}