This repository has been archived by the owner on Feb 29, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 178
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge "Serialize shutdown of pacemaker nodes"
- Loading branch information
Showing
4 changed files
with
161 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
#!/bin/bash | ||
|
||
# pacemaker_mutex_shutdown.sh --acquire | ||
# pacemaker_mutex_shutdown.sh --release | ||
|
||
set -u | ||
|
||
usage() { | ||
echo "Shutdown a cluster node in a coordinated way across the cluster" | ||
echo "Usage:" | ||
echo " $0 --acquire # prevent other node from shutting down until we hold the lock" | ||
echo " $0 --release # release the lock, other node can compete for the shutdown lock" | ||
echo | ||
} | ||
|
||
log() { | ||
echo "$(date -u): $1" | ||
} | ||
|
||
error() { | ||
echo "$(date -u): $1" 1>&2 | ||
exit 1 | ||
} | ||
|
||
# Loop until we hold the lock. The lock has a TTL, so we're guaranteed to get it eventually | ||
shutdown_lock_acquire() { | ||
local lockname=$1 | ||
local requester=$2 | ||
local ttl=$3 | ||
local rc=1 | ||
local current_owner | ||
local owner_stopped | ||
local owner_rc | ||
|
||
log "Acquiring the shutdown lock" | ||
while [ $rc -ne 0 ]; do | ||
/var/lib/container-config-scripts/pacemaker_resource_lock.sh --acquire-once $lockname $requester $ttl | ||
rc=$? | ||
if [ $rc -ne 0 ]; then | ||
if [ $rc -eq 2 ]; then | ||
error "Could not acquire the shutdown lock due to unrecoverable error (rc: $rc), bailing out" | ||
else | ||
# The lock is held by another node. | ||
current_owner=$(/var/lib/container-config-scripts/pacemaker_resource_lock.sh --owner $lockname) | ||
owner_rc=$? | ||
if [ $owner_rc -eq 2 ]; then | ||
error "Could not get the shutdown lock owner due to unrecoverable error (rc: $owner_rc), bailing out" | ||
fi | ||
if [ $owner_rc -eq 0 ]; then | ||
# If the owner is marked as offline, that means it has shutdown and | ||
# we can clean the lock preemptively and try to acquire it. | ||
owner_stopped=$(crm_mon -1X | xmllint --xpath 'count(//nodes/node[@name="'${current_owner}'" and @online="false" and @unclean="false"])' -) | ||
if [ "${owner_stopped}" = "1" ]; then | ||
log "Shutdown lock held by stopped node '${current_owner}', lock can be released" | ||
/var/lib/container-config-scripts/pacemaker_resource_lock.sh --release $lockname $current_owner | ||
continue | ||
fi | ||
fi | ||
log "Shutdown lock held by another node (rc: $rc), retrying" | ||
sleep 10 | ||
fi | ||
fi | ||
done | ||
log "Shutdown lock acquired" | ||
return 0 | ||
} | ||
|
||
|
||
# Release the lock if we still own it. Not owning it anymore is not fatal | ||
shutdown_lock_release() { | ||
local lockname=$1 | ||
local requester=$2 | ||
local rc | ||
|
||
log "Releasing the shutdown lock" | ||
/var/lib/container-config-scripts/pacemaker_resource_lock.sh --release $lockname $requester | ||
rc=$? | ||
if [ $rc -ne 0 ]; then | ||
if [ $rc -gt 1 ]; then | ||
error "Could not release the shutdown lock due to unrecoverable error (rc: $rc), bailing out" | ||
else | ||
log "Shutdown lock no longer held, nothing to do" | ||
fi | ||
else | ||
log "Shutdown lock released" | ||
fi | ||
return 0 | ||
} | ||
|
||
|
||
ACTION=$1 | ||
if [ -z "$ACTION" ]; then | ||
error "Action must be specified" | ||
fi | ||
|
||
LOCK_NAME=tripleo-shutdown-lock | ||
LOCK_OWNER=$(crm_node -n 2>/dev/null) | ||
rc=$? | ||
if [ $rc -ne 0 ]; then | ||
if [ $rc -eq 102 ]; then | ||
log "Cluster is not running locally, no need to aquire the shutdown lock" | ||
exit 0 | ||
else | ||
error "Unexpected error while connecting to the cluster (rc: $rc), bailing out" | ||
fi | ||
fi | ||
|
||
# We start with a very high TTL, that long enough to accomodate a cluster stop. | ||
# As soon as the node will get offline, the other competing node will be entitled | ||
# to steal the lock, so they should never wait that long in practice. | ||
LOCK_TTL=600 | ||
|
||
|
||
case $ACTION in | ||
--help) usage; exit 0;; | ||
--acquire|-a) shutdown_lock_acquire ${LOCK_NAME} ${LOCK_OWNER} ${LOCK_TTL};; | ||
--release|-r) shutdown_lock_release ${LOCK_NAME} ${LOCK_OWNER};; | ||
*) error "Invalid action";; | ||
esac | ||
exit $? |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters