-
Notifications
You must be signed in to change notification settings - Fork 236
/
Copy pathriak_kv_vnode.erl
4598 lines (4276 loc) · 194 KB
/
riak_kv_vnode.erl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
%% -------------------------------------------------------------------
%%
%% riak_kv_vnode: VNode Implementation
%%
%% Copyright (c) 2007-2016 Basho Technologies, Inc. All Rights Reserved.
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------
-module(riak_kv_vnode).
-behaviour(riak_core_vnode).
%% API
-export([test_vnode/1, put/7]).
-export([start_vnode/1,
start_vnodes/1,
get/3,
get/4,
head/3,
head/4,
del/3,
reap/3,
put/6,
local_get/2,
local_put/2,
local_put/3,
coord_put/6,
readrepair/6,
list_keys/4,
fold/3,
fold/4,
get_vclocks/2,
vnode_status/1,
ack_keys/1,
repair/1,
repair_status/1,
repair_filter/1,
hashtree_pid/1,
rehash/3,
refresh_index_data/4,
request_hashtree_pid/1,
request_hashtree_pid/2,
upgrade_hashtree/1,
reformat_object/2,
stop_fold/1,
get_modstate/1,
aae_send/1]).
%% riak_core_vnode API
-export([init/1,
terminate/2,
handle_command/3,
handle_overload_command/3,
handle_coverage/4,
is_empty/1,
delete/1,
request_hash/1,
object_info/1,
nval_map/1,
handle_handoff_command/3,
handoff_starting/2,
handoff_started/2, %% Note: optional function of the behaviour
handoff_cancelled/1,
handoff_finished/2,
handle_handoff_data/2,
encode_handoff_item/2,
handle_exit/3,
handle_info/2,
handle_overload_info/2,
ready_to_exit/0,%% Note: optional function of the behaviour
add_vnode_pool/2]). %% Note: optional function of the behaviour
-export([handoff_data_encoding_method/0]).
-export([set_vnode_forwarding/2]).
-include_lib("riak_kv_vnode.hrl").
-include_lib("riak_kv_index.hrl").
-include_lib("riak_kv_map_phase.hrl").
-include_lib("riak_core_pb.hrl").
-include("riak_kv_types.hrl").
-ifdef(TEST).
-include_lib("eunit/include/eunit.hrl").
-include_lib("riak_core/include/riak_core_bg_manager.hrl").
-export([put_merge/6]). %% For fsm_eqc_vnode
-endif.
%% Per-vnode record for a map/reduce job. NOTE(review): field semantics are
%% not visible in this chunk of the file; names suggest a cache key, the
%% {Bucket, Key} pair, a request id and a target pid to reply to - confirm
%% against the mrjob handling code further down the module.
-record(mrjob, {cachekey :: term(),
                bkey :: term(),
                reqid :: term(),
                target :: pid()}).
%% Counter used to issue per-key-epoch actor epochs. Leases amortise the
%% cost of persisting the counter: the vnode persists a lease up-front and
%% only needs to persist again when the lease is close to exhausted.
-record(counter_state, {
          %% kill switch, if for any reason one wants disable per-key-epoch, then set
          %% [{riak_kv, [{per_key_epoch, false}]}].
          use = true :: boolean(),
          %% The number of new epoch writes co-ordinated by this vnode
          %% What even is a "key epoch?" It is any time a key is
          %% (re)created. A new write, a write not yet coordinated by
          %% this vnode, a write where local state is unreadable.
          cnt = 0 :: non_neg_integer(),
          %% Counter leased up-to. For totally new state/id
          %% this will be that flush threshold See config value
          %% `{riak_kv, counter_lease_size}'
          lease = 0 :: non_neg_integer(),
          lease_size = 0 :: non_neg_integer(),
          %% Has a lease been requested but not granted yet
          leasing = false :: boolean()
         }).
%% Optional module implementing update hooks (undefined = no hook).
-type update_hook() :: module() | undefined.
%% Main vnode state record.
-record(state, {idx :: partition(),
                %% Backend module and its opaque state term
                mod :: module(),
                async_put :: boolean(),
                modstate :: term(),
                mrjobs :: term(),
                %% Persistent vnode id, used as the actor for coordinated puts
                vnodeid :: undefined | binary(),
                delete_mode :: keep | immediate | pos_integer(),
                bucket_buf_size :: pos_integer(),
                index_buf_size :: pos_integer(),
                key_buf_size :: pos_integer(),
                async_folding :: boolean(),
                in_handoff = false :: boolean(),
                handoff_target :: node(),
                handoffs_rejected = 0 :: integer(),
                forward :: node() | [{integer(), node()}],
                %% Pid of the riak_kv_index_hashtree started by
                %% maybe_create_hashtrees/2 (undefined when start failed)
                hashtrees :: pid(),
                upgrade_hashtree = false :: boolean(),
                md_cache :: ets:tab(),
                md_cache_size :: pos_integer(),
                %% Per-key-epoch counter/lease state - see #counter_state{}
                counter :: #counter_state{},
                status_mgr_pid :: pid(), %% a process that manages vnode status persistence
                %% Fields below relate to cached tictac-tree AAE - see
                %% maybe_start_aaecontroller/2
                tictac_aae = false :: boolean(),
                aae_controller :: undefined|pid(),
                tictac_exchangequeue = []
                    :: list(riak_kv_entropy_manager:exchange()),
                tictac_exchangecount = 0 :: integer(),
                tictac_deltacount = 0 :: integer(),
                tictac_exchangetime = 0 :: integer(),
                tictac_startqueue = os:timestamp() :: erlang:timestamp(),
                tictac_rebuilding = false :: erlang:timestamp()|false,
                tictac_skiptick = 0 :: non_neg_integer(),
                tictac_startup = true :: boolean(),
                aae_tokenbucket = true :: boolean(),
                worker_pool_strategy = single :: none|single|dscp,
                vnode_pool_pid :: undefined|pid(),
                update_hook :: update_hook(),
                max_aae_queue_time :: non_neg_integer(),
                enable_nextgenreplsrc = false :: boolean(),
                sizelimit_nextgenreplsrc = 0 :: non_neg_integer()
               }).
-type index_op() :: add | remove.
-type index_value() :: integer() | binary().
-type index() :: non_neg_integer().
-type state() :: #state{}.
-type vnodeid() :: binary().
-type counter_lease_error() :: {error, counter_lease_max_errors | counter_lease_timeout}.
-type old_object() :: riak_object:riak_object()|
confirmed_no_old_object|
assumed_no_old_object|
unchanged_no_old_object|
unknown_no_old_object.
% Hooks use no_old_object, but no_old_object can mean four things.
% 1 - A GET was done before the PUT, and no old object was found
% 2 - The path used assumes there is no old object
% 3 - The old object hasn't changed - so the new object is the old object
% 4 - The path doesn't consider an old object to be relevant
% This creates a type to represent these four cases separately, as
% well as the scenario where there is an old object.
% The function maybe_old_object/1 can be called to normalise the four
% cases back to the single case of no_old_object for hooks.
-type hook_old_object() :: riak_object:riak_object()|no_old_object.
% Type for old objects to be passed into hooks
-define(MD_CACHE_BASE, "riak_kv_vnode_md_cache").
-define(DEFAULT_HASHTREE_TOKENS, 90).
%% default value for `counter_lease' in `#counter_state{}'
%% NOTE: these MUST be positive integers!
%% @see non_neg_env/3
-define(DEFAULT_CNTR_LEASE, 10000).
%% On advice/review from Scott decided to cap the size of leases. 50m
%% is a lot of new epochs for a single vnode, and it saves us from
%% burning through vnodeids in the worst case.
-define(MAX_CNTR_LEASE, 50000000).
%% Should these be cuttlefish-able? If it takes more than 20 attempts to
%% fsync the vnode counter to disk, die. (NOTE this is not ERRS*TO but
%% first to trip; see blocking_lease_counter/3)
-define(DEFAULT_CNTR_LEASE_ERRS, 20).
%% If it takes more than 20 seconds to fsync the vnode counter to disk,
%% die
-define(DEFAULT_CNTR_LEASE_TO, 20000). % 20 seconds!
-define(MAX_REBUILD_TIME, 86400).
-define(MAX_AAE_QUEUE_TIME, 1000).
%% Queue time in ms to prompt a sync ping.
-define(AAE_SKIP_COUNT, 10).
-define(AAE_LOADING_WAIT, 5000).
-define(AF1_QUEUE, riak_core_node_worker_pool:af1()).
%% Assured Forwarding - pool 1
%% Hot backups and aae tree cache rebuilds
-define(AF2_QUEUE, riak_core_node_worker_pool:af2()).
%% Assured Forwarding - pool 2
%% Any other handle_coverage that responds queue (e.g. leveled keylisting)
-define(AF3_QUEUE, riak_core_node_worker_pool:af3()).
%% Assured Forwarding - pool 3
%% AAE queries (per-bucket with/without key_range). AAE queries against
%% the cached tree do not use a pool (e.g. n_val queries)
-define(AF4_QUEUE, riak_core_node_worker_pool:af4()).
%% Assured Forwarding - pool 4
%% operational information queries (e.g. object_stats). Replication folds
%% for transition. Reaping operations
-define(BE_QUEUE, riak_core_node_worker_pool:be()).
%% Best efforts (aka scavenger) pool
%% Erlang's if Bool -> thing; true -> thang end. syntax hurts my
%% brain. It scans as if true -> thing; true -> thang end. So, here is
%% a macro, ?ELSE to use in if statements. You're welcome.
-define(ELSE, true).
%% Arguments and flags accumulated while processing a single put request.
-record(putargs, {returnbody :: boolean(),
                  coord:: boolean(),
                  lww :: boolean(),
                  bkey :: {binary(), binary()},
                  robj :: term(),
                  index_specs=[] :: [{index_op(), binary(), index_value()}],
                  reqid :: non_neg_integer(),
                  bprops :: riak_kv_bucket:props(),
                  starttime :: non_neg_integer(),
                  prunetime :: undefined| non_neg_integer(),
                  readrepair=false :: boolean(),
                  is_index=false :: boolean(), %% set if the b/end supports indexes
                  crdt_op = undefined :: undefined | term(), %% if set this is a crdt operation
                  hash_ops = no_hash_ops
                 }).
-type putargs() :: #putargs{}.
-spec maybe_create_hashtrees(state()) -> state().
%% @doc Start (or skip) the index hashtree for this vnode, depending on
%% whether AAE is enabled in the entropy manager.
maybe_create_hashtrees(State) ->
    maybe_create_hashtrees(riak_kv_entropy_manager:enabled(), State).
-spec maybe_create_hashtrees(boolean(), state()) -> state().
%% @doc
%% With entropy disabled (first argument false) just clear any pending
%% upgrade flag. With entropy enabled, start a riak_kv_index_hashtree -
%% but only for primary vnodes. On start failure the vnode runs without a
%% tree and retries in one second via a retry_create_hashtree message.
maybe_create_hashtrees(false, State) ->
    State#state{upgrade_hashtree=false};
maybe_create_hashtrees(true, State=#state{idx=Index, upgrade_hashtree=Upgrade,
                                          mod=Mod, modstate=ModState}) ->
    %% Only maintain a hashtree if a primary vnode
    {ok, Ring} = riak_core_ring_manager:get_my_ring(),
    case riak_core_ring:vnode_type(Ring, Index) of
        primary ->
            {ok, ModCaps} = Mod:capabilities(ModState),
            Empty = case is_empty(State) of
                        {true, _} -> true;
                        {false, _, _} -> false
                    end,
            %% Hint the hashtree about 2i support, an empty backend, and
            %% whether a tree format upgrade is required
            Opts = [use_2i || lists:member(indexes, ModCaps)]
                ++ [vnode_empty || Empty]
                ++ [upgrade || Upgrade],
            case riak_kv_index_hashtree:start(Index, self(), Opts) of
                {ok, Trees} ->
                    %% Monitor the tree so its exit can be observed by the
                    %% vnode (the 'DOWN' handler is elsewhere in this module)
                    monitor(process, Trees),
                    State#state{hashtrees=Trees, upgrade_hashtree=false};
                Error ->
                    lager:info("riak_kv/~p: unable to start index_hashtree: ~p",
                               [Index, Error]),
                    %% Retry shortly; run with hashtrees=undefined meanwhile
                    erlang:send_after(1000, self(), retry_create_hashtree),
                    State#state{hashtrees=undefined}
            end;
        _ ->
            State#state{upgrade_hashtree=false}
    end.
-spec maybe_start_aaecontroller(active|passive, state()) -> state().
%% @doc
%% Start an AAE controller if riak_kv has been configured to use cached
%% tictac tree based AAE. Note that a controller will always start, and
%% receive updates, even if the vnode is not a primary (and will not be
%% involved in exchanges).
maybe_start_aaecontroller(passive, State) ->
    State#state{tictac_aae=false, aae_controller=undefined};
maybe_start_aaecontroller(active, State=#state{mod=Mod,
                                               idx=Partition,
                                               modstate=ModState}) ->
    {ok, ModCaps} = Mod:capabilities(ModState),
    IsEmpty =
        case is_empty(State) of
            {true, _} -> true;
            {false, _, _} -> false
        end,
    %% A leveled backend can act as the AAE keystore natively; any other
    %% backend needs a parallel AAE keystore
    KeyStoreType =
        case lists:member(leveled, ModCaps) of
            true ->
                Bookie = Mod:return_self(ModState),
                {native, leveled_nko, Bookie};
            false ->
                ParallelStore =
                    app_helper:get_env(riak_kv, tictacaae_parallelstore),
                {parallel, ParallelStore}
        end,
    Preflists = riak_kv_util:responsible_preflists(Partition),
    RootPath = determine_aaedata_root(Partition),
    %% Rebuild delay/wait and the exchange/rebuild tick intervals all come
    %% from application environment configuration
    RD = app_helper:get_env(riak_kv, tictacaae_rebuilddelay),
    RW = app_helper:get_env(riak_kv, tictacaae_rebuildwait),
    XTick = app_helper:get_env(riak_kv, tictacaae_exchangetick),
    RTick = app_helper:get_env(riak_kv, tictacaae_rebuildtick),
    StepInitialTick =
        app_helper:get_env(riak_kv, tictacaae_stepinitialtick, true),
    StoreHead = app_helper:get_env(riak_kv, tictacaae_storeheads),
    ObjSplitFun = riak_object:aae_from_object_binary(StoreHead),
    {ok, AAECntrl} =
        aae_controller:aae_start(KeyStoreType,
                                 IsEmpty,
                                 {RW, RD},
                                 Preflists,
                                 RootPath,
                                 ObjSplitFun),
    lager:info("AAE Controller started with pid=~w", [AAECntrl]),
    InitD = erlang:phash2(Partition, 256),
    % Space out the initial poke to avoid over-coordination between vnodes,
    % each of up to 256 vnodes will end on a different point in the slot, with
    % the points wrapping every 256 vnodes (assuming coordinated restart)
    FirstRebuildDelay = RTick + ((RTick div 256) * InitD),
    FirstExchangeDelay = XTick + ((XTick div 256) * InitD),
    riak_core_vnode:send_command_after(FirstRebuildDelay,
                                       tictacaae_rebuildpoke),
    riak_core_vnode:send_command_after(FirstExchangeDelay,
                                       tictacaae_exchangepoke),
    InitalStep =
        case StepInitialTick of
            true ->
                % Stops each vnode from re-filling the AAE work queue at the
                % same time, creating a pause in AAE across the cluster if all
                % nodes in the cluster were started concurrently
                erlang:phash2(Partition, 8);
            false ->
                % During riak_test we set this to false
                0
        end,
    State#state{tictac_aae = true,
                aae_controller = AAECntrl,
                modstate = ModState,
                tictac_rebuilding = false,
                tictac_skiptick = InitalStep}.
-spec determine_aaedata_root(integer()) -> list().
%% @doc
%% Build the file path used by this partition's AAE store:
%% <tictacaae_dataroot>/<Partition>.
determine_aaedata_root(Partition) ->
    filename:join(
        app_helper:get_env(riak_kv, tictacaae_dataroot),
        integer_to_list(Partition)).
-spec preflistfun(binary(), binary()) -> riak_kv_util:index_n().
%% @doc
%% Function to calculate preflist ({Index, N}) from Bucket and Key
preflistfun(Bucket, Key) -> riak_kv_util:get_index_n({Bucket, Key}).
-spec tictac_returnfun(partition(), store|trees|exchange) -> fun().
%% @doc
%% Build a callback invoked when an AAE operation completes. The callback
%% reports the result back to this vnode, together with the time at which
%% the callback was created (i.e. when the operation was initiated).
tictac_returnfun(Partition, exchange) ->
    Me = {Partition, node()},
    T0 = os:timestamp(),
    fun(ExchangeResult) ->
            ok = tictacexchange_complete(Me, T0, ExchangeResult)
    end;
tictac_returnfun(Partition, RebuildType) ->
    Me = {Partition, node()},
    T0 = os:timestamp(),
    fun(ok) ->
            ok = tictacrebuild_complete(Me, T0, RebuildType)
    end.
-spec tictac_rebuild(binary(), binary(), binary()) ->
    {riak_kv_util:index_n(), vclock:vclock()}.
%% @doc
%% Convert a {Bucket, Key, ObjectBinary} triple into the {Preflist, Clock}
%% pair needed when rebuilding a tictac tree.
tictac_rebuild(Bucket, Key, ObjBin) ->
    Summary = riak_object:summary_from_binary(ObjBin),
    {preflistfun(Bucket, Key), element(1, Summary)}.
%% @doc
%% Queue a tictac tree rebuild. There are occasions when all vnodes queue this
%% at the same time, so important that the snapshot for the rebuild is taken
%% only when the fold is initiated. Otherwise the snapshot may expire whilst
%% sat on the queue
-spec queue_tictactreerebuild(pid(), partition(), boolean(), state()) -> ok.
queue_tictactreerebuild(AAECntrl, Partition, OnlyIfBroken, State) ->
    Preflists = riak_kv_util:responsible_preflists(Partition),
    Sender = self(),
    %% ReturnFun reports `trees' rebuild completion back to the vnode
    ReturnFun = tictac_returnfun(Partition, trees),
    %% The fold is deferred - the snapshot is only taken when the pool
    %% worker eventually calls FoldFun
    FoldFun =
        fun() ->
            lager:info("Starting tree rebuild for partition=~w", [Partition]),
            SW = os:timestamp(),
            case when_loading_complete(AAECntrl,
                                       Preflists,
                                       fun preflistfun/2,
                                       OnlyIfBroken) of
                {ok, StoreFold, FinishFun} ->
                    Output = StoreFold(),
                    FinishFun(Output),
                    Duration =
                        timer:now_diff(os:timestamp(), SW) div (1000 * 1000),
                    lager:info("Tree rebuild complete for partition=~w" ++
                                   " in duration=~w seconds",
                               [Partition, Duration]);
                skipped ->
                    lager:info("Tree rebuild skipped for partition=~w",
                               [Partition])
            end,
            ok
        end,
    JustReturnFun =
        fun(ok) ->
            ReturnFun(ok)
        end,
    %% Rebuilds run on the AF1 assured-forwarding pool (or the single
    %% vnode pool, depending on worker_pool_strategy - see select_queue)
    Pool = select_queue(?AF1_QUEUE, State),
    riak_core_vnode:queue_work(Pool,
                               {fold, FoldFun, JustReturnFun},
                               Sender,
                               State#state.vnode_pool_pid).
%% @doc Request a tree rebuild from the AAE controller, polling while the
%% controller is still loading. If the controller process is no longer
%% alive (e.g. its vnode exited after handoff), report `skipped'.
when_loading_complete(AAECntrl, Preflists, PreflistFun, OnlyIfBroken) ->
    case is_process_alive(AAECntrl) of
        false ->
            % May have queued a rebuild for a vnode aae controller for an
            % exited vnode (e.g. one which has completed handoff)
            skipped;
        true ->
            case aae_controller:aae_rebuildtrees(AAECntrl,
                                                Preflists,
                                                PreflistFun,
                                                OnlyIfBroken) of
                loading ->
                    % Controller still loading - wait and retry
                    timer:sleep(?AAE_LOADING_WAIT),
                    when_loading_complete(AAECntrl,
                                          Preflists,
                                          PreflistFun,
                                          OnlyIfBroken);
                Result ->
                    Result
            end
    end.
%% @doc Reveal the underlying module state for testing.
%% Returns the backend module together with its opaque state term
%% (#state.modstate is term(), not a #state{} - the previous spec of
%% {atom(), state()} was wrong about the second element).
-spec get_modstate(state()) -> {module(), term()}.
get_modstate(#state{mod=Mod, modstate=ModState}) ->
    {Mod, ModState}.
-spec get_asyncopts(state(), binary()|all) -> list(atom()|tuple()).
%% @doc
%% Seed an options list from the async capability of the vnode and the
%% backend: async_fold is included only when both the vnode is configured
%% for async folding and the backend reports the async_fold capability.
%% For a specific bucket a {bucket, Bucket} option is also included.
get_asyncopts(#state{async_folding=AsyncFolding} = State, Bucket) ->
    {Mod, ModState} = get_modstate(State),
    {{ok, Caps}, BaseOpts} =
        case Bucket of
            all ->
                {Mod:capabilities(ModState), []};
            _ ->
                {Mod:capabilities(Bucket, ModState), [{bucket, Bucket}]}
        end,
    case AsyncFolding andalso lists:member(async_fold, Caps) of
        true ->
            [async_fold | BaseOpts];
        false ->
            BaseOpts
    end.
%% API
%% @doc Look up (starting if necessary) the riak_kv vnode for index I.
start_vnode(I) ->
    riak_core_vnode_master:get_vnode_pid(I, riak_kv_vnode).
%% @doc As start_vnode/1 but for a list of indexes.
start_vnodes(IdxList) ->
    riak_core_vnode_master:get_vnode_pid(IdxList, riak_kv_vnode).
%% @doc Start a vnode directly (bypassing the vnode master) - test use only.
test_vnode(I) ->
    riak_core_vnode:start_link(riak_kv_vnode, I, infinity).
-spec aae_send(tuple()) -> fun().
%% @doc
%% Return a function which will send an aae request to a given vnode, and can
%% prompt the response to be received by sender (the caller's process at the
%% time the returned fun is invoked).
aae_send(Preflist) ->
    fun(AAERequest, IndexNs, Colour) ->
        Sender = {fsm, undefined, self()},
        riak_core_vnode_master:command(Preflist,
                                       {aae, AAERequest, IndexNs, Colour},
                                       Sender,
                                       riak_kv_vnode_master)
    end.
-spec tictacrebuild_complete({partition(), node()},
                             erlang:timestamp(),
                             store|trees) -> ok.
%% @doc
%% Inform the vnode that an aae rebuild (of the store or of the trees) is
%% complete, passing back the time at which the rebuild was started.
tictacrebuild_complete(Vnode, StartTime, ProcessType) ->
    riak_core_vnode_master:command(Vnode,
                                   {rebuild_complete,
                                    ProcessType,
                                    StartTime},
                                   riak_kv_vnode_master).
-spec tictacexchange_complete({partition(), node()},
                              erlang:timestamp(),
                              {atom(), non_neg_integer()}) -> ok.
%% @doc
%% Inform the vnode that an aae exchange is complete, passing back the
%% exchange result and the time at which the exchange was started.
tictacexchange_complete(Vnode, StartTime, ExchangeResult) ->
    riak_core_vnode_master:command(Vnode,
                                   {exchange_complete,
                                    ExchangeResult,
                                    StartTime},
                                   riak_kv_vnode_master).
%% @doc Issue a get request to the preflist, replying to the calling FSM.
get(Preflist, BKey, ReqId) ->
    %% Assuming this function is called from a FSM process
    %% so self() == FSM pid
    get(Preflist, BKey, ReqId, {fsm, undefined, self()}).
%% @doc As get/3 but with an explicit sender for the reply.
get(Preflist, BKey, ReqId, Sender) ->
    Req = riak_kv_requests:new_get_request(sanitize_bkey(BKey), ReqId),
    riak_core_vnode_master:command(Preflist,
                                   Req,
                                   Sender,
                                   riak_kv_vnode_master).
%% @doc Issue a head request (object metadata without the value body) to
%% the preflist, replying to the calling FSM.
head(Preflist, BKey, ReqId) ->
    %% Assuming this function is called from a FSM process
    %% so self() == FSM pid
    head(Preflist, BKey, ReqId, {fsm, undefined, self()}).
%% @doc As head/3 but with an explicit sender for the reply.
head(Preflist, BKey, ReqId, Sender) ->
    Req = riak_kv_requests:new_head_request(sanitize_bkey(BKey), ReqId),
    riak_core_vnode_master:command(Preflist,
                                   Req,
                                   Sender,
                                   riak_kv_vnode_master).
%% @doc Issue a delete request for BKey to the preflist. Fire-and-forget:
%% no sender is supplied, so no reply is routed back.
del(Preflist, BKey, ReqId) ->
    riak_core_vnode_master:command(
        Preflist,
        riak_kv_requests:new_delete_request(sanitize_bkey(BKey), ReqId),
        riak_kv_vnode_master).
%% @doc
%% Reap a tombstone, assuming a preflist of UP primaries. The head of the
%% preflist is sent the reap synchronously (back-pressure on the caller);
%% the remainder receive it asynchronously.
-spec reap(riak_core_apl:preflist(),
           {riak_object:bucket(), riak_object:key()},
           non_neg_integer()) -> ok.
reap(Preflist, {Bucket, Key}, DeleteHash) ->
    Req = riak_kv_requests:new_reap_request({Bucket, Key}, DeleteHash),
    [{Idx, Node}|Rest] = Preflist,
    %% For the head of the preflist we do this sync, to regulate the pace of
    %% reaps and help prevent overloading of vnodes.
    ok = riak_core_vnode_master:sync_command({Idx, Node},
                                             Req,
                                             riak_kv_vnode_master),
    riak_core_vnode_master:command(Rest, Req, riak_kv_vnode_master).
%% Issue a put for the object to the preflist, expecting a reply
%% to an FSM.
%% put/6 assumes the caller is the FSM (replies routed to self());
%% put/7 takes an explicit sender.
put(Preflist, BKey, Obj, ReqId, StartTime, Options) when is_integer(StartTime) ->
    put(Preflist, BKey, Obj, ReqId, StartTime, Options, {fsm, undefined, self()}).
put(Preflist, BKey, Obj, ReqId, StartTime, Options, Sender)
  when is_integer(StartTime) ->
    Req = riak_kv_requests:new_put_request(
            sanitize_bkey(BKey), Obj, ReqId, StartTime, Options),
    riak_core_vnode_master:command(Preflist,
                                   Req,
                                   Sender,
                                   riak_kv_vnode_master).
%% @doc Put Obj to the vnode for Index on the local node, blocking until
%% the vnode replies. NOTE: the receive has no `after' clause - this
%% preserves the original contract of waiting indefinitely; callers that
%% need a timeout must impose one themselves.
local_put(Index, Obj) ->
    local_put(Index, Obj, []).
local_put(Index, Obj, Options) ->
    BKey = {riak_object:bucket(Obj), riak_object:key(Obj)},
    Ref = make_ref(),
    %% erlang:now/0 is deprecated (OTP 18+); derive the request id from the
    %% caller pid and a timestamp, as vnode_status/1 does elsewhere in this
    %% module
    ReqId = erlang:phash2({self(), os:timestamp()}),
    StartTime = riak_core_util:moment(),
    Sender = {raw, Ref, self()},
    put({Index, node()}, BKey, Obj, ReqId, StartTime, Options, Sender),
    receive
        {Ref, Reply} ->
            Reply
    end.
%% @doc Get BKey from the vnode for Index on the local node, blocking until
%% the vnode replies. A well-formed {r, Result, Index, ReqId} reply yields
%% Result; any other reply on Ref is wrapped as {error, Reply}. As with
%% local_put, the receive deliberately has no timeout.
local_get(Index, BKey) ->
    Ref = make_ref(),
    %% erlang:now/0 is deprecated (OTP 18+); derive the request id from the
    %% caller pid and a timestamp, as vnode_status/1 does elsewhere in this
    %% module
    ReqId = erlang:phash2({self(), os:timestamp()}),
    Sender = {raw, Ref, self()},
    get({Index,node()}, BKey, ReqId, Sender),
    receive
        {Ref, {r, Result, Index, ReqId}} ->
            Result;
        {Ref, Reply} ->
            {error, Reply}
    end.
%% @doc Synchronously ask the local vnode for Partition to refresh the
%% index data for BKey to IdxData, waiting up to TimeOut for the reply.
refresh_index_data(Partition, BKey, IdxData, TimeOut) ->
    riak_core_vnode_master:sync_command({Partition, node()},
                                        {refresh_index_data, BKey, IdxData},
                                        riak_kv_vnode_master,
                                        TimeOut).
%% Issue a put for the object to the preflist, expecting a reply
%% to an FSM.
%% A coordinated put: delegates to put/7 with the `coord' option added,
%% targeting the single IndexNode that will coordinate the write.
coord_put(IndexNode, BKey, Obj, ReqId, StartTime, Options) when is_integer(StartTime) ->
    coord_put(IndexNode, BKey, Obj, ReqId, StartTime, Options, {fsm, undefined, self()}).
coord_put(IndexNode, BKey, Obj, ReqId, StartTime, Options, Sender)
  when is_integer(StartTime) ->
    put([IndexNode], BKey, Obj, ReqId, StartTime, [coord | Options], Sender).
%% Do a put without sending any replies (sender is `ignore'); the `rr'
%% option marks the put as a read repair.
readrepair(Preflist, BKey, Obj, ReqId, StartTime, Options) ->
    put(Preflist, BKey, Obj, ReqId, StartTime, [rr | Options], ignore).
%% @doc Ask the preflist vnodes to list the keys of Bucket; results are
%% delivered to Caller (no vnode reply channel - sender is `ignore').
list_keys(Preflist, ReqId, Caller, Bucket) ->
    Req = #riak_kv_listkeys_req_v2{bucket=Bucket,
                                   req_id=ReqId,
                                   caller=Caller},
    riak_core_vnode_master:command(Preflist,
                                   Req,
                                   ignore,
                                   riak_kv_vnode_master).
%% @doc Synchronously fold `Fun' over the data held by the vnodes in
%% `Preflist', starting from accumulator `Acc0'.
fold(Preflist, Fun, Acc0) ->
    FoldReq = riak_core_util:make_fold_req(Fun, Acc0),
    riak_core_vnode_master:sync_spawn_command(Preflist, FoldReq,
                                              riak_kv_vnode_master).
%% @doc As fold/3, with fold `Options' passed through in the request
%% (forwardable flag is fixed to false).
fold(Preflist, Fun, Acc0, Options) ->
    FoldReq = riak_core_util:make_fold_req(Fun, Acc0, false, Options),
    riak_core_vnode_master:sync_spawn_command(Preflist, FoldReq,
                                              riak_kv_vnode_master).
%% @doc Synchronously fetch the vector clocks for every bucket/key pair
%% in `BKeyList' from the vnodes in `Preflist'.
get_vclocks(Preflist, BKeyList) ->
    Req = riak_kv_requests:new_vclock_request(BKeyList),
    riak_core_vnode_master:sync_spawn_command(Preflist, Req,
                                              riak_kv_vnode_master).
%% @doc Get status information about the node local vnodes.
%% Broadcasts a status request to every preflist entry and then gathers
%% one reply per entry, correlated by `ReqId'.
vnode_status(PrefLists) ->
    ReqId = erlang:phash2({self(), os:timestamp()}),
    Req = riak_kv_requests:new_vnode_status_request(),
    riak_core_vnode_master:command(PrefLists, Req, {raw, ReqId, self()},
                                   riak_kv_vnode_master),
    wait_for_vnode_status_results(PrefLists, ReqId, []).
%% @doc Repair the given `Partition'.
-spec repair(partition()) ->
    {ok, Pairs::[{partition(), node()}]} |
    {down, Down::[{partition(), node()}]}.
repair(Partition) ->
    %% Ask the vnode manager to repair this partition's riak_kv vnode,
    %% filtering keys via repair_filter/1 in this module.
    riak_core_vnode_manager:repair(riak_kv,
                                   {riak_kv_vnode, Partition},
                                   {?MODULE, repair_filter}).
%% @doc Get the status of the repair process for the given `Partition'.
-spec repair_status(partition()) -> not_found | in_progress.
repair_status(Partition) ->
    ModPartition = {riak_kv_vnode, Partition},
    riak_core_vnode_manager:repair_status(ModPartition).
%% @doc Given a `Target' partition generate a `Filter' fun to use
%% during partition repair.
-spec repair_filter(partition()) -> Filter::function().
repair_filter(Target) ->
    {ok, Ring} = riak_core_ring_manager:get_my_ring(),
    NValMap = nval_map(Ring),
    DefaultNVal = riak_core_bucket:default_object_nval(),
    riak_core_repair:gen_filter(Target, Ring, NValMap, DefaultNVal,
                                fun object_info/1).
%% @doc Synchronously look up the AAE hashtree pid for `Partition' on
%% the local node; returns {error, wrong_node} if not owned here.
-spec hashtree_pid(index()) -> {ok, pid()} | {error, wrong_node}.
hashtree_pid(Partition) ->
    IndexNode = {Partition, node()},
    riak_core_vnode_master:sync_command(IndexNode,
                                        {hashtree_pid, node()},
                                        riak_kv_vnode_master,
                                        infinity).
%% Asynchronous version of {@link hashtree_pid/1} that sends a message back to
%% the calling process. Used by the {@link riak_kv_entropy_manager}.
-spec request_hashtree_pid(index()) -> ok.
request_hashtree_pid(Partition) ->
    %% Tag the raw reply with the partition so concurrent requests can
    %% be told apart by the caller.
    Sender = {raw, {hashtree_pid, Partition}, self()},
    request_hashtree_pid(Partition, Sender).
%% Version of {@link request_hashtree_pid/1} that takes a sender argument,
%% which could be a raw process, fsm, gen_server, etc.
request_hashtree_pid(Partition, Sender) ->
    Req = {hashtree_pid, node()},
    riak_core_vnode_master:command({Partition, node()}, Req, Sender,
                                   riak_kv_vnode_master).
%% @doc Destroy and restart the hashtrees associated with Partitions vnode.
-spec upgrade_hashtree(index()) -> ok | {error, wrong_node}.
upgrade_hashtree(Partition) ->
    %% Fire-and-forget: no reply is expected, so the sender is `ignore'.
    Req = {upgrade_hashtree, node()},
    riak_core_vnode_master:command({Partition, node()}, Req, ignore,
                                   riak_kv_vnode_master).
%% Used by {@link riak_kv_exchange_fsm} to force a vnode to update the hashtree
%% for repaired keys. Typically, repairing keys will trigger read repair that
%% will update the AAE hash in the write path. However, if the AAE tree is
%% divergent from the KV data, it is possible that AAE will try to repair keys
%% that do not have divergent KV replicas. In that case, read repair is never
%% triggered. Always rehashing keys after any attempt at repair ensures that
%% AAE does not try to repair the same non-divergent keys over and over.
rehash(Preflist, Bucket, Key) ->
    Req = {rehash, Bucket, Key},
    riak_core_vnode_master:command(Preflist, Req, ignore, riak_kv_vnode_master).
%% @doc Synchronously ask the local vnode for `Partition' to rewrite the
%% object stored under `BKey' in the current object binary format.
-spec reformat_object(index(), {riak_object:bucket(), riak_object:key()}) ->
    ok | {error, term()}.
reformat_object(Partition, BKey) ->
    Req = {reformat_object, sanitize_bkey(BKey)},
    riak_core_vnode_master:sync_spawn_command({Partition, node()}, Req,
                                              riak_kv_vnode_master).
%% VNode callbacks
%% @doc riak_core_vnode callback. Start the configured storage backend
%% for partition `Index' and build the initial #state{}. Returns
%% {ok, State} -- plus a fold worker pool spec when async folding is
%% enabled -- or {error, Reason} if the backend fails to start (in which
%% case the whole node is stopped via riak:stop/1).
init([Index]) ->
    Mod = app_helper:get_env(riak_kv, storage_backend),
    Configuration = app_helper:get_env(riak_kv),
    BucketBufSize = app_helper:get_env(riak_kv, bucket_buffer_size, 1000),
    IndexBufSize = app_helper:get_env(riak_kv, index_buffer_size, 100),
    KeyBufSize = app_helper:get_env(riak_kv, key_buffer_size, 100),
    WorkerPoolSize = app_helper:get_env(riak_kv, worker_pool_size, 10),
    UseEpochCounter = app_helper:get_env(riak_kv, use_epoch_counter, true),
    %% This _has_ to be a non_neg_integer(), and really, if it is
    %% zero, you are fsyncing every.single.key epoch.
    CounterLeaseSize = min(?MAX_CNTR_LEASE,
                           non_neg_env(riak_kv, counter_lease_size, ?DEFAULT_CNTR_LEASE)),
    %% The status manager process persists this vnode's id and epoch
    %% counter lease on its behalf.
    {ok, StatusMgr} = riak_kv_vnode_status_mgr:start_link(self(), Index, UseEpochCounter),
    {ok, {VId, CounterState}} = get_vnodeid_and_counter(StatusMgr, CounterLeaseSize, UseEpochCounter),
    DeleteMode = app_helper:get_env(riak_kv, delete_mode, 3000),
    AsyncFolding = app_helper:get_env(riak_kv, async_folds, true) == true,
    MDCacheSize = app_helper:get_env(riak_kv, vnode_md_cache_size),
    %% Only create a metadata cache when a positive integer size limit
    %% is configured; otherwise leave it disabled ('undefined').
    MDCache =
        case MDCacheSize of
            N when is_integer(N),
                   N > 0 ->
                lager:debug("Initializing metadata cache with size limit: ~p bytes",
                            [MDCacheSize]),
                new_md_cache(VId);
            _ ->
                lager:debug("No metadata cache size defined, not starting"),
                undefined
        end,
    EnableTictacAAE =
        app_helper:get_env(riak_kv, tictacaae_active, passive),
    WorkerPoolStrategy =
        app_helper:get_env(riak_kv, worker_pool_strategy),
    TokenBucket =
        app_helper:get_env(riak_kv, aae_tokenbucket, true),
    MaxAAEQueueTime =
        app_helper:get_env(riak_kv, max_aae_queue_time, ?MAX_AAE_QUEUE_TIME),
    EnableNextGenReplSrc =
        app_helper:get_env(riak_kv, replrtq_enablesrc, false),
    SizeLimitNextGenReplSrc =
        app_helper:get_env(riak_kv, replrtq_srcobjectsize, 0),
    %% Old-style catch: a crash inside Mod:start/2 surfaces here as
    %% {'EXIT', Reason1} and is handled in the last branch below.
    case catch Mod:start(Index, Configuration) of
        {ok, ModState} ->
            %% Get the backend capabilities
            %% Async puts are used only when allowed by config AND the
            %% backend actually exports async_put/5.
            DoAsyncPut = case app_helper:get_env(riak_kv, allow_async_put, true) of
                             true ->
                                 erlang:function_exported(Mod, async_put, 5);
                             _ ->
                                 false
                         end,
            State = #state{idx=Index,
                           async_folding=AsyncFolding,
                           mod=Mod,
                           async_put = DoAsyncPut,
                           modstate=ModState,
                           vnodeid=VId,
                           counter=CounterState,
                           status_mgr_pid=StatusMgr,
                           delete_mode=DeleteMode,
                           bucket_buf_size=BucketBufSize,
                           index_buf_size=IndexBufSize,
                           key_buf_size=KeyBufSize,
                           mrjobs=dict:new(),
                           md_cache=MDCache,
                           md_cache_size=MDCacheSize,
                           worker_pool_strategy=WorkerPoolStrategy,
                           update_hook=update_hook(),
                           max_aae_queue_time=MaxAAEQueueTime,
                           aae_tokenbucket=TokenBucket,
                           enable_nextgenreplsrc = EnableNextGenReplSrc,
                           sizelimit_nextgenreplsrc = SizeLimitNextGenReplSrc},
            try_set_vnode_lock_limit(Index),
            case AsyncFolding of
                true ->
                    %% Create worker pool initialization tuple
                    FoldWorkerPool = {pool, riak_kv_worker, WorkerPoolSize, []},
                    %% Hashtrees / tictac AAE controller are only started
                    %% when async folding is available.
                    State2 = maybe_create_hashtrees(State),
                    State3 =
                        maybe_start_aaecontroller(EnableTictacAAE, State2),
                    {ok, State3, [FoldWorkerPool]};
                false ->
                    {ok, State}
            end;
        {error, Reason} ->
            lager:error("Failed to start ~p backend for index ~p error: ~p",
                        [Mod, Index, Reason]),
            riak:stop("backend module failed to start."),
            {error, Reason};
        {'EXIT', Reason1} ->
            lager:error("Failed to start ~p backend for index ~p crash: ~p",
                        [Mod, Index, Reason1]),
            riak:stop("backend module failed to start."),
            {error, Reason1}
    end.
%% Invoked by riak_core when the vnode mailbox is overloaded: classify
%% the request and answer on the vnode's behalf.
handle_overload_command(Req, Sender, Idx) ->
    ReqType = riak_kv_requests:request_type(Req),
    handle_overload_request(ReqType, Req, Sender, Idx).
%% Send the overload error reply appropriate to each request type; the
%% reply shape must match what the corresponding FSM/caller expects.
handle_overload_request(ReqType, Req, Sender, Idx) ->
    case ReqType of
        kv_put_request ->
            riak_core_vnode:reply(Sender, {fail, Idx, overload});
        kv_get_request ->
            ReqId = riak_kv_requests:get_request_id(Req),
            riak_core_vnode:reply(Sender, {r, {error, overload}, Idx, ReqId});
        kv_head_request ->
            ReqId = riak_kv_requests:get_request_id(Req),
            riak_core_vnode:reply(Sender, {r, {error, overload}, Idx, ReqId});
        kv_w1c_put_request ->
            Type = riak_kv_requests:get_replica_type(Req),
            riak_core_vnode:reply(Sender, ?KV_W1C_PUT_REPLY{reply={error, overload}, type=Type});
        kv_vnode_status_request ->
            riak_core_vnode:reply(Sender, {vnode_status, Idx, [{error, overload}]});
        _ ->
            riak_core_vnode:reply(Sender, {error, mailbox_overload})
    end.
%% Handle all SC (strong consistency / ensemble) overload messages here.
handle_overload_info(Msg, _Idx) ->
    case Msg of
        {ensemble_ping, _From} ->
            %% Don't respond to pings in overload.
            ok;
        {ensemble_get, _, From} ->
            riak_kv_ensemble_backend:reply(From, {error, vnode_overload});
        {ensemble_put, _, _, From} ->
            riak_kv_ensemble_backend:reply(From, {error, vnode_overload});
        {raw_forward_put, _, _, From} ->
            riak_kv_ensemble_backend:reply(From, {error, vnode_overload});
        {raw_forward_get, _, From} ->
            riak_kv_ensemble_backend:reply(From, {error, vnode_overload});
        _ ->
            ok
    end.
%% Tictac AAE exchange command: merge tree roots/branches or fetch key
%% clocks via the aae_controller, replying asynchronously through a fun.
handle_command({aae, AAERequest, IndexNs, Colour}, Sender, State) ->
    ReplyFun =
        fun(Result) ->
            riak_core_vnode:reply(Sender, {reply, Result, Colour})
        end,
    case State#state.tictac_aae of
        false ->
            %% Tictac AAE is not enabled on this vnode.
            ReplyFun(not_supported);
        true ->
            Cntrl = State#state.aae_controller,
            case AAERequest of
                fetch_root ->
                    aae_controller:aae_mergeroot(Cntrl, IndexNs, ReplyFun);
                {fetch_branches, BranchIDs} ->
                    aae_controller:aae_mergebranches(Cntrl, IndexNs,
                                                     BranchIDs, ReplyFun);
                {fetch_clocks, SegmentIDs} ->
                    %% The controller needs to map each bucket/key back
                    %% to its preflist index/n pair.
                    IndexNFun =
                        fun(B, K) -> riak_kv_util:get_index_n({B, K}) end,
                    aae_controller:aae_fetchclocks(Cntrl, IndexNs, SegmentIDs,
                                                   ReplyFun, IndexNFun)
            end
    end,
    {noreply, State};
%% Legacy (v2) list-keys request: stream buffered key lists back to
%% `Caller' tagged with `ReqId'. A bucket of '_' lists buckets instead
%% of keys, deduplicating results before sending.
handle_command(#riak_kv_listkeys_req_v2{bucket=Input, req_id=ReqId, caller=Caller}, _Sender,
               State=#state{key_buf_size=BufferSize,
                            mod=Mod,
                            modstate=ModState,
                            idx=Idx}) ->
    %% Input may carry an optional key filter.
    {Bucket, Filter} =
        case Input of
            {filter, B, F} -> {B, F};
            B -> {B, none}
        end,
    BufferMod = riak_kv_fold_buffer,
    {Opts, BufferFun, FoldFun, ModFun} =
        case Bucket of
            '_' ->
                %% Bucket listing: dedupe each buffered batch.
                BucketSend =
                    fun(Results) ->
                        Caller ! {ReqId, {kl, Idx, lists:usort(Results)}}
                    end,
                {get_asyncopts(State, all),
                 BucketSend,
                 fold_fun(buckets, BufferMod, Filter, undefined),
                 fold_buckets};
            _ ->
                KeySend =
                    fun(Results) ->
                        Caller ! {ReqId, {kl, Idx, Results}}
                    end,
                Extras = fold_extras_keys(Idx, Bucket),
                {get_asyncopts(State, Bucket),
                 KeySend,
                 fold_fun(keys, BufferMod, Filter, Extras),
                 fold_keys}
        end,
    Buffer = riak_kv_fold_buffer:new(BufferSize, BufferFun),
    %% Flush any remaining buffered results, then signal completion.
    FinishFun =
        fun(Buffer1) ->
            riak_kv_fold_buffer:flush(Buffer1),
            Caller ! {ReqId, Idx, done}
        end,
    case list(FoldFun, FinishFun, Mod, ModFun, ModState, Opts, Buffer) of
        {async, AsyncWork} ->
            {async, {fold, AsyncWork, FinishFun}, Caller, State};
        _ ->
            {noreply, State}
    end;
%% Legacy v1 fold request: upgrade it to the most recent ?FOLD_REQ
%% record and re-dispatch to the matching clause.
handle_command(#riak_core_fold_req_v1{} = ReqV1,
               Sender, State) ->
    Upgraded = riak_core_util:make_newest_fold_req(ReqV1),
    handle_command(Upgraded, Sender, State);
handle_command(?FOLD_REQ{foldfun=FoldFun, acc0=Acc0,
                         forwardable=_Forwardable, opts=Opts}, Sender, State) ->
    %% The riak_core layer decides forwarding, so `forwardable' is
    %% ignored here. riak_core's fold fun expects a {Bucket, Key} pair
    %% as its first argument, while riak_kv folds pass bucket and key
    %% separately; bridge the mismatch with a wrapper fun.
    BKeyFold =
        fun(Bucket, Key, Value, Acc) ->
            FoldFun({Bucket, Key}, Value, Acc)
        end,
    do_fold(BKeyFold, Acc0, Sender, Opts, State);
%% entropy exchange commands
handle_command({hashtree_pid, Node}, _, State=#state{hashtrees=HT}) ->
%% Handle riak_core request forwarding during ownership handoff.
case node() of