diff --git a/.github/workflows/test-job.yaml b/.github/workflows/test-job.yaml index c8c97a7f..7c07194d 100644 --- a/.github/workflows/test-job.yaml +++ b/.github/workflows/test-job.yaml @@ -17,7 +17,7 @@ jobs: strategy: fail-fast: false matrix: - otp_version: ['25', '26', '27'] + otp_version: ['26', '27'] os: [ubuntu-latest, windows-latest] steps: diff --git a/rebar.config b/rebar.config index 6e8c9e03..a6d5a72f 100644 --- a/rebar.config +++ b/rebar.config @@ -1,7 +1,7 @@ %% vim:ft=erlang: -{minimum_otp_vsn, "24.0"}. +{minimum_otp_vsn, "26.0"}. -{deps, [{ra, "2.15.0"}, +{deps, [{ra, "2.16.0"}, {horus, "0.3.1"}]}. {project_plugins, [rebar3_proper, diff --git a/rebar.lock b/rebar.lock index c48aae2d..fa9d7756 100644 --- a/rebar.lock +++ b/rebar.lock @@ -1,20 +1,20 @@ {"1.2.0", [{<<"aten">>,{pkg,<<"aten">>,<<"0.6.0">>},1}, - {<<"gen_batch_server">>,{pkg,<<"gen_batch_server">>,<<"0.8.8">>},1}, + {<<"gen_batch_server">>,{pkg,<<"gen_batch_server">>,<<"0.8.9">>},1}, {<<"horus">>,{pkg,<<"horus">>,<<"0.3.1">>},0}, - {<<"ra">>,{pkg,<<"ra">>,<<"2.15.0">>},0}, + {<<"ra">>,{pkg,<<"ra">>,<<"2.16.0">>},0}, {<<"seshat">>,{pkg,<<"seshat">>,<<"0.6.0">>},1}]}. [ {pkg_hash,[ {<<"aten">>, <<"7A57B275A6DAF515AC3683FB9853E280B4D0DCDD74292FD66AC4A01C8694F8C7">>}, - {<<"gen_batch_server">>, <<"7840A1FA63EE1EFFC83E8A91D22664847A2BA1192D30EAFFFD914ACB51578068">>}, + {<<"gen_batch_server">>, <<"1C6BC0F530BF8C17E8B4ACC20C2CC369FFA5BEE2B46DE01E21410745F24B1BC9">>}, {<<"horus">>, <<"A5274C96E15924C28413752617B06050E4B08C04628B88209AFF9EA076F2BCB5">>}, - {<<"ra">>, <<"23D77CE1D4A8F69271467A847BA989815D1DAFC4F0762527F566E61D240E1EE2">>}, + {<<"ra">>, <<"E3E63AD1359B97E027DBD77ED2DAAFCCD881F19200416E6F0AFB2F725CCC84CF">>}, {<<"seshat">>, <<"3172EB1D7A2A4F66108CD6933A4E465AFF80F84AA90ED83F047B92F636123CCD">>}]}, {pkg_hash_ext,[ {<<"aten">>, <<"5F39A164206AE3F211EF5880B1F7819415686436E3229D30B6A058564FBAA168">>}, - {<<"gen_batch_server">>, <<"C3E6A1A2A0FB62AEE631A98CFA0FD8903E9562422CBF72043953E2FB1D203017">>}, + {<<"gen_batch_server">>, <<"C8581FE4A4B6BCCF91E53CE6A8C7E6C27C8C591BAB5408B160166463F5579C22">>}, {<<"horus">>, <<"D564D30EBC274F0D92C3D44A336D0B892F000BE159912AE4E6838701E85495EC">>}, - {<<"ra">>, <<"CFC0DBE5EBBD54F44081F95EA6A1DAEB28A89DF82AA9BAA234F68ABBB36BDC67">>}, + {<<"ra">>, <<"7CDF7894F1F542AEAA3D9E6F3209AAB6EFE9A1CDD1D81DE9587C3EA23629B0E3">>}, {<<"seshat">>, <<"7CEF700F92831DD7CAE6A6DD223CCC55AC88ECCE0631EE9AB0F2B5FB70E79B90">>}]} ]. diff --git a/src/khepri_cluster.erl b/src/khepri_cluster.erl index 67657528..19533993 100644 --- a/src/khepri_cluster.erl +++ b/src/khepri_cluster.erl @@ -866,13 +866,13 @@ do_join_locked(StoreId, ThisMember, RemoteNode, Timeout) -> end. wait_for_cluster_change_permitted(RaMemberOrStoreId, Timeout) -> - Ret = wait_for_leader(RaMemberOrStoreId, Timeout), + Ret = do_wait_for_leader(RaMemberOrStoreId, false, Timeout), %% We wait for an additional fixed amount of time because the %% cluster could have a leader and still not be ready to accept %% a cluster change. This avoids too many retries that will %% just eat resources. - timer:sleep(200), + timer:sleep(?TRANSIENT_ERROR_RETRY_INTERVAL), Ret. @@ -954,8 +954,18 @@ do_reset(RaSystem, StoreId, ThisMember, Timeout) -> Ret2 = wait_for_cluster_change_permitted(StoreId, Timeout1), Timeout2 = khepri_utils:end_timeout_window(Timeout1, T2), case Ret2 of - ok -> do_reset(RaSystem, StoreId, ThisMember, Timeout2); - Error -> Error + ok -> + do_reset(RaSystem, StoreId, ThisMember, Timeout2); + {error, noproc} -> + ?LOG_DEBUG( + "The local Ra server exited while we were waiting " + "for it to be ready for a membership change. It " + "means it was removed from the cluster by another " + "member; we can proceed with the reset."), + forget_store(StoreId), + ok; + Error -> + Error end catch exit:{normal, _} -> @@ -1183,7 +1193,8 @@ do_query_members(StoreId, RaServer, QueryType, Timeout) -> when ?HAS_TIME_LEFT(Timeout) andalso (Reason == noconnection orelse Reason == nodedown orelse - Reason == shutdown) -> + Reason == shutdown orelse + Reason == normal) -> NewTimeout0 = khepri_utils:end_timeout_window(Timeout, T0), NewTimeout = khepri_utils:sleep( ?TRANSIENT_ERROR_RETRY_INTERVAL, NewTimeout0), @@ -1317,24 +1328,29 @@ wait_for_leader(StoreIdOrRaServer) -> %% %% @returns `ok' if a leader was elected or an `{error, Reason}' tuple. -wait_for_leader(StoreId, Timeout) when is_atom(StoreId) -> +wait_for_leader(StoreIdOrRaServer, Timeout) -> + do_wait_for_leader(StoreIdOrRaServer, true, Timeout). + +do_wait_for_leader(StoreId, WaitForProcToStart, Timeout) + when is_atom(StoreId) -> ThisMember = this_member(StoreId), - wait_for_leader(ThisMember, Timeout); -wait_for_leader(RaServer, Timeout) -> + do_wait_for_leader(ThisMember, WaitForProcToStart, Timeout); +do_wait_for_leader(RaServer, WaitForProcToStart, Timeout) -> T0 = khepri_utils:start_timeout_window(Timeout), case ra:members(RaServer, Timeout) of {ok, _Members, _LeaderId} -> ok; {error, Reason} when ?HAS_TIME_LEFT(Timeout) andalso - (Reason == noproc orelse + ((Reason == noproc andalso WaitForProcToStart) orelse Reason == noconnection orelse Reason == nodedown orelse - Reason == shutdown) -> + Reason == shutdown orelse + Reason == normal) -> NewTimeout0 = khepri_utils:end_timeout_window(Timeout, T0), NewTimeout = khepri_utils:sleep( ?TRANSIENT_ERROR_RETRY_INTERVAL, NewTimeout0), - wait_for_leader(RaServer, NewTimeout); + do_wait_for_leader(RaServer, WaitForProcToStart, NewTimeout); {timeout, _} -> {error, timeout}; {error, _} = Error -> diff --git a/test/cluster_SUITE.erl b/test/cluster_SUITE.erl index fc33167a..8199b32f 100644 --- a/test/cluster_SUITE.erl +++ b/test/cluster_SUITE.erl @@ -423,9 +423,15 @@ fail_to_start_with_bad_ra_server_config(Config) -> _}, _}} -> true; + {error, + {{bad_action_from_state_function, + {{timeout, tick}, not_a_timeout, tick_timeout}}, + _}} -> + true; {error, noproc} -> true; _ -> + ct:pal("Unexpected return value:~n~p", [Ret]), false end),