-
Notifications
You must be signed in to change notification settings - Fork 20
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[DPE-3684] Reinitialise raft #611
base: main
Are you sure you want to change the base?
Changes from all commits
05833c7
5753d1f
e2b082e
df0c37e
c31c0d5
6ae0dcc
538f721
29aaf18
d050e6e
7626c99
c869331
774ea95
76bbe84
bdae203
fb922e5
e0bbb08
895a8c4
e739a56
6f71b36
ed56f3d
7c8c95c
c7b26d2
a4f0b39
02d08ff
b454d06
a281f7a
0f9ec81
c1f9596
47b2b6a
ad5b8e8
3951708
6afe354
243f1fb
73abcd9
15dfeb7
1717f27
d08eb1d
862660b
c3bcac1
90e0d83
507665d
0dfa199
1b8c2d8
ad2d7a9
a7bbe67
89c368f
9efc6ca
ec6f88c
de838b5
1c56adf
85d0ba3
c317104
887f008
24a3fe0
ac7e217
26a2bc6
63b5d2c
3a4eda7
bca3e3a
0e9686a
3974ea2
4048015
819748f
761d28d
0163017
db875ad
89a1fee
95c5884
09b3fea
5d253c7
92dd704
723306d
c86e7c7
42049d3
4fae252
c9b1211
cf77dfe
d59626e
400529a
45f98e6
c413f76
5a57952
2ee87ed
aa61f37
5396607
ca127cb
cf4e8ff
88edde1
d11b87e
546549b
5934150
2cb218e
439073b
1783170
a7912fc
bd67a39
ac6c13f
c769b8e
0896245
5535896
a307bd8
d155abf
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -421,6 +421,10 @@ def _on_peer_relation_departed(self, event: RelationDepartedEvent) -> None: | |
logger.debug("Early exit on_peer_relation_departed: Skipping departing unit") | ||
return | ||
|
||
if self.has_raft_keys(): | ||
logger.debug("Early exit on_peer_relation_departed: Raft recovery in progress") | ||
return | ||
|
||
# Remove the departing member from the raft cluster. | ||
try: | ||
departing_member = event.departing_unit.name.replace("/", "-") | ||
|
@@ -432,6 +436,11 @@ def _on_peer_relation_departed(self, event: RelationDepartedEvent) -> None: | |
) | ||
event.defer() | ||
return | ||
except RetryError: | ||
logger.warning( | ||
f"Early exit on_peer_relation_departed: Cannot get {event.departing_unit.name} member IP" | ||
) | ||
return | ||
|
||
# Allow leader to update the cluster members. | ||
if not self.unit.is_leader(): | ||
|
@@ -511,6 +520,137 @@ def _on_pgdata_storage_detaching(self, _) -> None: | |
if self.primary_endpoint: | ||
self._update_relation_endpoints() | ||
|
||
def _stuck_raft_cluster_check(self) -> None: | ||
"""Check for stuck raft cluster and reinitialise if safe.""" | ||
raft_stuck = False | ||
all_units_stuck = True | ||
candidate = self.app_peer_data.get("raft_selected_candidate") | ||
for key, data in self._peers.data.items(): | ||
if key == self.app: | ||
continue | ||
if "raft_stuck" in data: | ||
raft_stuck = True | ||
else: | ||
all_units_stuck = False | ||
if not candidate and "raft_candidate" in data: | ||
candidate = key | ||
|
||
if not raft_stuck: | ||
return | ||
|
||
if not all_units_stuck: | ||
logger.warning("Stuck raft not yet detected on all units") | ||
return | ||
|
||
if not candidate: | ||
logger.warning("Stuck raft has no candidate") | ||
return | ||
if "raft_selected_candidate" not in self.app_peer_data: | ||
logger.info(f"{candidate.name} selected for new raft leader") | ||
self.app_peer_data["raft_selected_candidate"] = candidate.name | ||
|
||
def _stuck_raft_cluster_rejoin(self) -> None: | ||
"""Reconnect cluster to new raft.""" | ||
primary = None | ||
for key, data in self._peers.data.items(): | ||
if key == self.app: | ||
continue | ||
if "raft_primary" in data: | ||
primary = key | ||
break | ||
if primary and "raft_reset_primary" not in self.app_peer_data: | ||
logger.info("Updating the primary endpoint") | ||
self.app_peer_data.pop("members_ips", None) | ||
for unit in self._peers.units: | ||
self._add_to_members_ips(self._get_unit_ip(unit)) | ||
self._add_to_members_ips(self._get_unit_ip(self.unit)) | ||
Comment on lines
+564
to
+566
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If the primary and leader are different, cluster will be unable to reconfigure, since the leader patroni is down and outside the cluster, so we have to keep the list here. |
||
self.app_peer_data["raft_reset_primary"] = "True" | ||
self._update_relation_endpoints() | ||
if ( | ||
"raft_rejoin" not in self.app_peer_data | ||
and "raft_followers_stopped" in self.app_peer_data | ||
and "raft_reset_primary" in self.app_peer_data | ||
): | ||
logger.info("Notify units they can rejoin") | ||
self.app_peer_data["raft_rejoin"] = "True" | ||
|
||
def _stuck_raft_cluster_stopped_check(self) -> None: | ||
"""Check that the cluster is stopped.""" | ||
if "raft_followers_stopped" in self.app_peer_data: | ||
return | ||
|
||
for key, data in self._peers.data.items(): | ||
if key == self.app: | ||
continue | ||
if "raft_stopped" not in data: | ||
return | ||
|
||
logger.info("Cluster is shut down") | ||
self.app_peer_data["raft_followers_stopped"] = "True" | ||
|
||
def _stuck_raft_cluster_cleanup(self) -> None: | ||
for key, data in self._peers.data.items(): | ||
if key == self.app: | ||
continue | ||
for flag in data: | ||
if flag.startswith("raft_"): | ||
return | ||
|
||
logger.info("Cleaning up raft app data") | ||
self.app_peer_data.pop("raft_rejoin", None) | ||
self.app_peer_data.pop("raft_reset_primary", None) | ||
self.app_peer_data.pop("raft_selected_candidate", None) | ||
self.app_peer_data.pop("raft_followers_stopped", None) | ||
|
||
def _raft_reinitialisation(self) -> None: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If there's only one unit (sync standby and leader), this should execute in one go. |
||
"""Handle raft cluster loss of quorum.""" | ||
# Skip to cleanup if rejoining | ||
if "raft_rejoin" not in self.app_peer_data: | ||
if self.unit.is_leader(): | ||
self._stuck_raft_cluster_check() | ||
|
||
if ( | ||
candidate := self.app_peer_data.get("raft_selected_candidate") | ||
) and "raft_stopped" not in self.unit_peer_data: | ||
self.unit_peer_data.pop("raft_stuck", None) | ||
self.unit_peer_data.pop("raft_candidate", None) | ||
self._patroni.remove_raft_data() | ||
logger.info(f"Stopping {self.unit.name}") | ||
self.unit_peer_data["raft_stopped"] = "True" | ||
|
||
if self.unit.is_leader(): | ||
self._stuck_raft_cluster_stopped_check() | ||
|
||
if ( | ||
candidate == self.unit.name | ||
and "raft_primary" not in self.unit_peer_data | ||
and "raft_followers_stopped" in self.app_peer_data | ||
): | ||
logger.info(f"Reinitialising {self.unit.name} as primary") | ||
self._patroni.reinitialise_raft_data() | ||
self.unit_peer_data["raft_primary"] = "True" | ||
|
||
if self.unit.is_leader(): | ||
self._stuck_raft_cluster_rejoin() | ||
|
||
if "raft_rejoin" in self.app_peer_data: | ||
logger.info("Cleaning up raft unit data") | ||
self.unit_peer_data.pop("raft_primary", None) | ||
self.unit_peer_data.pop("raft_stopped", None) | ||
self.update_config() | ||
self._patroni.start_patroni() | ||
Comment on lines
+640
to
+641
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Restarting the non-primary units. |
||
|
||
if self.unit.is_leader(): | ||
self._stuck_raft_cluster_cleanup() | ||
|
||
def has_raft_keys(self): | ||
"""Checks for the presence of raft recovery keys in peer data.""" | ||
for key in self.app_peer_data: | ||
if key.startswith("raft_"): | ||
return True | ||
|
||
return any(key.startswith("raft_") for key in self.unit_peer_data) | ||
|
||
def _peer_relation_changed_checks(self, event: HookEvent) -> bool: | ||
"""Split of to reduce complexity.""" | ||
# Prevents the cluster to be reconfigured before it's bootstrapped in the leader. | ||
|
@@ -519,6 +659,12 @@ def _peer_relation_changed_checks(self, event: HookEvent) -> bool: | |
event.defer() | ||
return False | ||
|
||
# Check whether raft is stuck. | ||
if self.has_raft_keys(): | ||
self._raft_reinitialisation() | ||
logger.debug("Early exit on_peer_relation_changed: stuck raft recovery") | ||
return False | ||
Comment on lines
+662
to
+666
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This will hijack execution until recovery completes. We should think of a ways to detect manual recovery. |
||
|
||
# If the unit is the leader, it can reconfigure the cluster. | ||
if self.unit.is_leader() and not self._reconfigure_cluster(event): | ||
event.defer() | ||
|
@@ -954,6 +1100,10 @@ def _on_leader_elected(self, event: LeaderElectedEvent) -> None: | |
if self.get_secret(APP_SCOPE, key) is None: | ||
self.set_secret(APP_SCOPE, key, new_password()) | ||
|
||
if self.has_raft_keys(): | ||
self._raft_reinitialisation() | ||
return | ||
|
||
# Update the list of the current PostgreSQL hosts when a new leader is elected. | ||
# Add this unit to the list of cluster members | ||
# (the cluster should start with only this member). | ||
|
@@ -1442,6 +1592,10 @@ def _can_run_on_update_status(self) -> bool: | |
if "cluster_initialised" not in self._peers.data[self.app]: | ||
return False | ||
|
||
if self.has_raft_keys(): | ||
logger.debug("Early exit on_update_status: Raft recovery in progress") | ||
return False | ||
|
||
if not self.upgrade.idle: | ||
logger.debug("Early exit on_update_status: upgrade in progress") | ||
return False | ||
|
@@ -1488,7 +1642,8 @@ def _handle_workload_failures(self) -> bool: | |
return False | ||
|
||
if ( | ||
not is_primary | ||
not self.has_raft_keys() | ||
and not is_primary | ||
and not is_standby_leader | ||
and not self._patroni.member_started | ||
and "postgresql_restarted" in self._peers.data[self.unit] | ||
|
@@ -1688,7 +1843,7 @@ def _can_connect_to_postgresql(self) -> bool: | |
return False | ||
return True | ||
|
||
def update_config(self, is_creating_backup: bool = False) -> bool: | ||
def update_config(self, is_creating_backup: bool = False, no_peers: bool = False) -> bool: | ||
"""Updates Patroni config file based on the existence of the TLS files.""" | ||
enable_tls = self.is_tls_enabled | ||
limit_memory = None | ||
|
@@ -1712,7 +1867,11 @@ def update_config(self, is_creating_backup: bool = False) -> bool: | |
stanza=self.app_peer_data.get("stanza", self.unit_peer_data.get("stanza")), | ||
restore_stanza=self.app_peer_data.get("restore-stanza"), | ||
parameters=pg_parameters, | ||
no_peers=no_peers, | ||
) | ||
if no_peers: | ||
return True | ||
|
||
if not self._is_workload_running: | ||
# If Patroni/PostgreSQL has not started yet and TLS relations was initialised, | ||
# then mark TLS as enabled. This commonly happens when the charm is deployed | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We can't proceed with automatic recovery if there's no sync standby, since the first unit on the new raft cluster will be leader and there may be data loss if promoting an async replica. We should consider setting a status and ways for manual recovery, if the user wants to promote a given replica anyway.