Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove randomized startup delays #3075

Merged
merged 1 commit into from
Jun 5, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 21 additions & 15 deletions deps/rabbit/priv/schema/rabbit.schema
Original file line number Diff line number Diff line change
Expand Up @@ -945,31 +945,37 @@ fun(Conf) ->
end}.

%% Cluster formation: Randomized startup delay
%%
%% DEPRECATED: This is a no-op. Old configs are still allowed, but a warning will be printed.

{mapping, "cluster_formation.randomized_startup_delay_range.min", "rabbit.cluster_formation.randomized_startup_delay_range",
[{datatype, integer}]}.
{mapping, "cluster_formation.randomized_startup_delay_range.max", "rabbit.cluster_formation.randomized_startup_delay_range",
[{datatype, integer}]}.
{mapping, "cluster_formation.randomized_startup_delay_range.min", "rabbit.cluster_formation.randomized_startup_delay_range", []}.
{mapping, "cluster_formation.randomized_startup_delay_range.max", "rabbit.cluster_formation.randomized_startup_delay_range", []}.

{translation, "rabbit.cluster_formation.randomized_startup_delay_range",
fun(Conf) ->
Min = cuttlefish:conf_get("cluster_formation.randomized_startup_delay_range.min", Conf, undefined),
Max = cuttlefish:conf_get("cluster_formation.randomized_startup_delay_range.max", Conf, undefined),

case {Min, Max} of
{undefined, undefined} ->
cuttlefish:unset();
{undefined, Max} ->
%% fallback default
{5, Max};
{Min, undefined} ->
%% fallback default
{Min, 60};
{Min, Max} ->
{Min, Max}
end
{undefined, undefined} ->
ok;
_ ->
cuttlefish:warn("cluster_formation.randomized_startup_delay_range.min and "
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FTR, Cuttlefish schema translation happens so early that this won't be visible in the regular or prelaunch log, at least that what I observed. This is a general problem with logging at such really early node boot stages, we shouldn't try to address it in this PR (or even for 3.9).

The point is that existing config files with these options will pass validation 👍

"cluster_formation.randomized_startup_delay_range.max are deprecated")
end,
cuttlefish:unset()
end}.

%% Cluster formation: lock acquisition retries as passed to https://erlang.org/doc/man/global.html#set_lock-3
%%
%% Currently used in classic, k8s, and aws peer discovery backends.

{mapping, "cluster_formation.internal_lock_retries", "rabbit.cluster_formation.internal_lock_retries",
[
{datatype, integer},
{validators, ["non_zero_positive_integer"]}
]}.

%% Cluster formation: discovery failure retries

{mapping, "cluster_formation.lock_retry_limit", "rabbit.cluster_formation.lock_retry_limit",
Expand Down
15 changes: 6 additions & 9 deletions deps/rabbit/src/rabbit_mnesia.erl
Original file line number Diff line number Diff line change
Expand Up @@ -105,21 +105,18 @@ init_with_lock(Retries, Timeout, RunPeerDiscovery) ->
rabbit_log:debug("rabbit_peer_discovery:lock returned ~p", [LockResult]),
case LockResult of
not_supported ->
rabbit_log:info("Peer discovery backend does not support locking, falling back to randomized delay"),
%% See rabbitmq/rabbitmq-server#1202 for details.
rabbit_peer_discovery:maybe_inject_randomized_delay(),
RunPeerDiscovery(),
rabbit_peer_discovery:maybe_register();
{error, _Reason} ->
timer:sleep(Timeout),
init_with_lock(Retries - 1, Timeout, RunPeerDiscovery);
{ok, Data} ->
try
RunPeerDiscovery(),
rabbit_peer_discovery:maybe_register()
after
rabbit_peer_discovery:unlock(Data)
end
end;
{error, _Reason} ->
timer:sleep(Timeout),
init_with_lock(Retries - 1, Timeout, RunPeerDiscovery)
end.

-spec run_peer_discovery() -> ok | {[node()], node_type()}.
Expand Down Expand Up @@ -178,7 +175,7 @@ join_discovered_peers(TryNodes, NodeType) ->
join_discovered_peers_with_retries(TryNodes, NodeType, RetriesLeft, DelayInterval).

join_discovered_peers_with_retries(TryNodes, _NodeType, 0, _DelayInterval) ->
rabbit_log:warning(
rabbit_log:info(
"Could not successfully contact any node of: ~s (as in Erlang distribution). "
"Starting as a blank standalone node...",
[string:join(lists:map(fun atom_to_list/1, TryNodes), ",")]),
Expand All @@ -193,7 +190,7 @@ join_discovered_peers_with_retries(TryNodes, NodeType, RetriesLeft, DelayInterva
rabbit_node_monitor:notify_joined_cluster();
none ->
RetriesLeft1 = RetriesLeft - 1,
rabbit_log:error("Trying to join discovered peers failed. Will retry after a delay of ~b ms, ~b retries left...",
rabbit_log:info("Trying to join discovered peers failed. Will retry after a delay of ~b ms, ~b retries left...",
[DelayInterval, RetriesLeft1]),
timer:sleep(DelayInterval),
join_discovered_peers_with_retries(TryNodes, NodeType, RetriesLeft1, DelayInterval)
Expand Down
20 changes: 20 additions & 0 deletions deps/rabbit/src/rabbit_nodes.erl
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
boot/0]).
-export([persistent_cluster_id/0, seed_internal_cluster_id/0, seed_user_provided_cluster_name/0]).
-export([all_running_with_hashes/0]).
-export([lock_id/1, lock_retries/0]).

-include_lib("kernel/include/inet.hrl").
-include_lib("rabbit_common/include/rabbit.hrl").
Expand All @@ -23,6 +24,12 @@

-define(INTERNAL_CLUSTER_ID_PARAM_NAME, internal_cluster_id).

% Retries as passed to https://erlang.org/doc/man/global.html#set_lock-3
% To understand how retries map to the timeout, read
% https://github.com/erlang/otp/blob/d256ae477014158a49bb860b283df9c040011197/lib/kernel/src/global.erl#L2062-L2075
% 80 corresponds to a timeout of ca 300 seconds.
-define(DEFAULT_LOCK_RETRIES, 80).

%%----------------------------------------------------------------------------
%% API
%%----------------------------------------------------------------------------
Expand Down Expand Up @@ -160,3 +167,16 @@ await_running_count_with_retries(TargetCount, Retries) ->
-spec all_running_with_hashes() -> #{non_neg_integer() => node()}.
all_running_with_hashes() ->
maps:from_list([{erlang:phash2(Node), Node} || Node <- all_running()]).

-spec lock_id(Node :: node()) -> {ResourceId :: string(), LockRequesterId :: node()}.
lock_id(Node) ->
{cookie_hash(), Node}.

-spec lock_retries() -> integer().
lock_retries() ->
case application:get_env(rabbit, cluster_formation) of
{ok, PropList} ->
proplists:get_value(internal_lock_retries, PropList, ?DEFAULT_LOCK_RETRIES);
undefined ->
?DEFAULT_LOCK_RETRIES
end.
61 changes: 1 addition & 60 deletions deps/rabbit/src/rabbit_peer_discovery.erl
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@
-export([maybe_init/0, discover_cluster_nodes/0, backend/0, node_type/0,
normalize/1, format_discovered_nodes/1, log_configured_backend/0,
register/0, unregister/0, maybe_register/0, maybe_unregister/0,
maybe_inject_randomized_delay/0, lock/0, unlock/1,
discovery_retries/0]).
lock/0, unlock/1, discovery_retries/0]).
-export([append_node_prefix/1, node_prefix/0, locking_retry_timeout/0,
lock_acquisition_failure_mode/0]).

Expand All @@ -28,9 +27,6 @@
%% default node prefix to attach to discovered hostnames
-define(DEFAULT_PREFIX, "rabbit").

%% default randomized delay range, in seconds
-define(DEFAULT_STARTUP_RANDOMIZED_DELAY, {5, 60}).

%% default discovery retries and interval.
-define(DEFAULT_DISCOVERY_RETRY_COUNT, 10).
-define(DEFAULT_DISCOVERY_RETRY_INTERVAL_MS, 500).
Expand Down Expand Up @@ -159,61 +155,6 @@ discovery_retries() ->
{?DEFAULT_DISCOVERY_RETRY_COUNT, ?DEFAULT_DISCOVERY_RETRY_INTERVAL_MS}
end.


-spec maybe_inject_randomized_delay() -> ok.
maybe_inject_randomized_delay() ->
Backend = backend(),
case Backend:supports_registration() of
true ->
rabbit_log:info("Peer discovery backend ~s supports registration.", [Backend]),
inject_randomized_delay();
false ->
rabbit_log:info("Peer discovery backend ~s does not support registration, skipping randomized startup delay.", [Backend]),
ok
end.

-spec inject_randomized_delay() -> ok.

inject_randomized_delay() ->
{Min, Max} = randomized_delay_range_in_ms(),
case {Min, Max} of
%% When the max value is set to 0, consider the delay to be disabled.
%% In addition, `rand:uniform/1` will fail with a "no function clause"
%% when the argument is 0.
{_, 0} ->
rabbit_log:info("Randomized delay range's upper bound is set to 0. Considering it disabled."),
ok;
{_, N} when is_number(N) ->
rand:seed(exsplus),
RandomVal = rand:uniform(round(N)),
rabbit_log:debug("Randomized startup delay: configured range is from ~p to ~p milliseconds, PRNG pick: ~p...",
[Min, Max, RandomVal]),
Effective = case RandomVal < Min of
true -> Min;
false -> RandomVal
end,
rabbit_log:info("Will wait for ~p milliseconds before proceeding with registration...", [Effective]),
timer:sleep(Effective),
ok
end.

-spec randomized_delay_range_in_ms() -> {integer(), integer()}.

randomized_delay_range_in_ms() ->
Backend = backend(),
Default = case erlang:function_exported(Backend, randomized_startup_delay_range, 0) of
true -> Backend:randomized_startup_delay_range();
false -> ?DEFAULT_STARTUP_RANDOMIZED_DELAY
end,
{Min, Max} = case application:get_env(rabbit, cluster_formation) of
{ok, Proplist} ->
proplists:get_value(randomized_startup_delay_range, Proplist, Default);
undefined ->
Default
end,
{Min * 1000, Max * 1000}.


-spec register() -> ok.

register() ->
Expand Down
56 changes: 27 additions & 29 deletions deps/rabbit/src/rabbit_peer_discovery_classic_config.erl
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,36 @@ list_nodes() ->
Nodes when is_list(Nodes) -> {ok, {Nodes, disc}}
end.

-spec lock(Node :: node()) -> {ok, {ResourceId :: string(), LockRequesterId :: node()}} | {error, Reason :: string()}.

lock(Node) ->
{ok, {Nodes, _NodeType}} = list_nodes(),
case lists:member(Node, Nodes) of
false when Nodes =/= [] ->
rabbit_log:warning("Local node ~s is not part of configured nodes ~p. "
"This might lead to incorrect cluster formation.", [Node, Nodes]);
_ -> ok
end,
LockId = rabbit_nodes:lock_id(Node),
Retries = rabbit_nodes:lock_retries(),
case global:set_lock(LockId, Nodes, Retries) of
true ->
{ok, LockId};
false ->
{error, io_lib:format("Acquiring lock taking too long, bailing out after ~b retries", [Retries])}
end.

-spec unlock({ResourceId :: string(), LockRequesterId :: node()}) -> ok.

unlock(LockId) ->
{ok, {Nodes, _NodeType}} = list_nodes(),
global:del_lock(LockId, Nodes),
ok.

-spec supports_registration() -> boolean().

supports_registration() ->
%% If we don't have any nodes configured, skip randomized delay and similar operations
%% as we don't want to delay startup for no reason. MK.
has_any_peer_nodes_configured().
false.

-spec register() -> ok.

Expand All @@ -47,29 +71,3 @@ unregister() ->

post_registration() ->
ok.

-spec lock(Node :: atom()) -> not_supported.

lock(_Node) ->
not_supported.

-spec unlock(Data :: term()) -> ok.

unlock(_Data) ->
ok.

%%
%% Helpers
%%

has_any_peer_nodes_configured() ->
case application:get_env(rabbit, cluster_nodes, []) of
{[], _NodeType} ->
false;
{Nodes, _NodeType} when is_list(Nodes) ->
true;
[] ->
false;
Nodes when is_list(Nodes) ->
true
end.
17 changes: 8 additions & 9 deletions deps/rabbit/test/config_schema_SUITE_data/rabbit.snippets
Original file line number Diff line number Diff line change
Expand Up @@ -540,23 +540,22 @@ tcp_listen_options.exit_on_close = false",
{cluster_formation_randomized_startup_delay_both_values,
"cluster_formation.randomized_startup_delay_range.min = 10
cluster_formation.randomized_startup_delay_range.max = 30",
[{rabbit, [{cluster_formation, [
{randomized_startup_delay_range, {10, 30}}
]}]}],
[],
[]},

{cluster_formation_randomized_startup_delay_min_only,
"cluster_formation.randomized_startup_delay_range.min = 10",
[{rabbit, [{cluster_formation, [
{randomized_startup_delay_range, {10, 60}}
]}]}],
[],
[]},

{cluster_formation_randomized_startup_delay_max_only,
"cluster_formation.randomized_startup_delay_range.max = 30",
[{rabbit, [{cluster_formation, [
{randomized_startup_delay_range, {5, 30}}
]}]}],
[],
[]},

{cluster_formation_internal_lock_retries,
"cluster_formation.internal_lock_retries = 10",
[{rabbit,[{cluster_formation,[{internal_lock_retries,10}]}]}],
[]},

{cluster_formation_dns,
Expand Down
Loading