-
Notifications
You must be signed in to change notification settings - Fork 18
/
rolling-restart.sh
executable file
·223 lines (188 loc) · 7.12 KB
/
rolling-restart.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
#!/bin/bash
#
# Rolling restart helper for an Elasticsearch cluster.
#
# Parameters:
#   -m MASTER           The master node to use for coordinating the update.
#                       Exported so child scripts can read $MASTER.
#   -n NODE_FILE        A file containing the list of node hostnames, one per
#                       line.
#   -s SCRIPT           Script to run on each node to process the update. The
#                       script has access to $MASTER and $NODE.
#   -d SHUTDOWN_SCRIPT  Script to run to stop an elasticsearch node.
#   -h                  Print usage and exit with status 1.
#
# An unknown option or an option without its argument ends the script with
# status 255 (the status bash reports for `exit -1`).

# The leading ':' in the option string selects getopts' silent mode, so the
# '\?' and ':' arms below are responsible for reporting bad input.
while getopts ":d:m:n:s:h" opt; do
  case "$opt" in
    d)
      SHUTDOWN_SCRIPT=${OPTARG}
      echo "Shutdown script $SHUTDOWN_SCRIPT will be run for each node" >&2
      ;;
    m)
      export MASTER=${OPTARG}
      echo "Using master node: $MASTER" >&2
      ;;
    n)
      NODE_FILE=${OPTARG}
      echo "Reading list of node names from $NODE_FILE" >&2
      ;;
    s)
      SCRIPT=${OPTARG}
      echo "Script $SCRIPT will be run for each node" >&2
      ;;
    h)
      # List exactly the options accepted by the getopts string above.
      echo "Usage: $0 [-h] -m MASTER_HOST:PORT -n NODE_FILE -s UPDATE_SCRIPT -d SHUTDOWN_SCRIPT"
      exit 1
      ;;
    \?)
      echo "Invalid option: -$OPTARG" >&2
      exit 255
      ;;
    :)
      echo "Option -$OPTARG requires an argument." >&2
      exit 255
      ;;
  esac
done
# All four settings are mandatory. Report the first one that is missing on
# stderr and stop with status 255 (the status bash reports for `exit -1`).
# The expansions are quoted so a value containing spaces is tested as a single
# word instead of making `[` fail with a syntax error.
if [ -z "$MASTER" ]; then
  echo "Master node name including port must be provided. ex: -m localhost:9200" >&2
  exit 255
fi
if [ -z "$NODE_FILE" ]; then
  echo "Path to a file containing the list of nodes must be provided. ex: -n /path/to/nodes_names.txt" >&2
  exit 255
fi
if [ -z "$SCRIPT" ]; then
  echo "Path to the update script must be provided. ex: -s /path/to/update_script.sh" >&2
  exit 255
fi
if [ -z "$SHUTDOWN_SCRIPT" ]; then
  echo "Path to the shutdown script must be provided ex: -d /path/to/shutdown_script.sh" >&2
  exit 255
fi
# Test for existence of required files.
# Each value is quoted so a path containing spaces or glob characters is
# checked exactly as given. Empty values are skipped rather than reported as
# missing files.
for file in "$NODE_FILE" "$SCRIPT" "$SHUTDOWN_SCRIPT"; do
  [ -n "$file" ] || continue
  if [ ! -f "$file" ]; then
    echo "File not found: $file" >&2
    exit 255
  fi
done

# Test that the scripts which get executed later resolve to something
# runnable: a name found through $PATH, or an executable given with a full or
# relative path. `type -P` is the bash builtin for that lookup; unlike the
# external `which` it is always available and only ever reports disk files.
for file in "$SCRIPT" "$SHUTDOWN_SCRIPT"; do
  [ -n "$file" ] || continue
  path_to_file=$(type -P -- "$file")
  if [ -z "$path_to_file" ]; then
    echo "File not found: $file" >&2
    echo "Perhaps you need to prepend ./ to the filename." >&2
    exit 255
  fi
done
# Response fragments matched with [[ =~ ]]. Keeping them in variables means the
# embedded quotes and spaces need no escaping.
ack_re='"acknowledged":true'
node_up_re='"status" : 200'
tagline_re='"tagline" : "You Know, for Search"'

#######################################
# Poll a health endpoint until it reports the wanted cluster status.
# Polls once per second and never gives up on its own.
# Arguments: $1 - base URL to query (master or a single node)
#            $2 - status to wait for, e.g. green or yellow
#######################################
wait_for_cluster_status() {
  local base_url=$1 wanted=$2 seen=""
  while [ -z "$seen" ]; do
    seen=$(curl -sS -G "${base_url}/_cat/health" -d h=status | grep "$wanted")
    sleep 1
  done
}

#######################################
# Set the transient cluster.routing.allocation.enable value via $MASTER.
# Globals:   MASTER (read), ack_re (read)
# Arguments: $1 - value to set, e.g. new_primaries or all
# Returns:   0 when the response contains "acknowledged":true, else non-zero
#######################################
set_routing_allocation() {
  local mode=$1 reply
  # The explicit JSON content type is required by Elasticsearch 6.0 and later
  # and is accepted by older versions.
  reply=$(curl -sS -XPUT -H 'Content-Type: application/json' \
    "${MASTER}/_cluster/settings" \
    -d "{ \"transient\" : { \"cluster.routing.allocation.enable\" : \"${mode}\" } }")
  [[ "$reply" =~ $ack_re ]]
}

#######################################
# Print the root endpoint response of the node currently being processed.
# Globals: NODE (read)
#######################################
fetch_node_root() {
  curl -sS -XGET "http://${NODE}/"
}

# Read the list of nodes into an array, one entry per non-empty line.
# A read loop replaces the unquoted $(< file) expansion so that entries are
# never glob-expanded and a trailing carriage return (CRLF file) is dropped.
# The `|| [ -n ... ]` part keeps a last line that has no trailing newline.
NODES=()
while read -r node_line || [ -n "$node_line" ]; do
  node_line=${node_line%$'\r'}
  if [ -n "$node_line" ]; then
    NODES+=("$node_line")
  fi
done < "$NODE_FILE"

# Loop through the list, fully restarting one node before touching the next.
for NODE_CFG in "${NODES[@]}"; do
  # A node line is "host:port[,extra...]"; only the part before the first ','
  # is the address. NODE, HOST and PORT are exported for the child scripts.
  NODE=${NODE_CFG%%,*}
  export NODE
  export HOST=${NODE%%:*} # keep everything before the ':', hostname
  export PORT=${NODE##*:} # keep everything after the ':', port number

  echo ">>>>>> Restarting ${NODE} at $(date)"

  echo ">>>>>> Verifying green cluster status"
  wait_for_cluster_status "$MASTER" green

  # if green, disable routing allocation for everything except newly
  # created primary shards
  echo ">>>>>> Disabling routing allocation"
  if ! set_routing_allocation new_primaries; then
    echo "Failed acknowledge of allocation disable for ${NODE}"
    # Leave this node alone and move on to the next one.
    continue
  fi

  # Run the specified SHUTDOWN_SCRIPT to request shutdown of NODE. The script
  # is invoked directly with the whole node line as its single argument, so
  # the node file content is never re-parsed as shell code.
  echo ">>>>>> Running shutdown script for ${NODE}"
  "$SHUTDOWN_SCRIPT" "$NODE_CFG"
  result=$?
  if [ "$result" -ne 0 ]; then
    # Values are passed as printf arguments so a '%' in a path cannot corrupt
    # the message.
    printf ">>>>>> Error: [%d] when executing command: '%s' for node %s\n" \
      "$result" "$SHUTDOWN_SCRIPT" "$NODE"
    # NOTE(review): routing allocation is still "new_primaries" at this point.
    exit 255
  fi

  # wait for the node to stop
  # NOTE(review): this only waits while the root response contains
  # `"status" : 200`; confirm the targeted Elasticsearch version emits it.
  echo ">>>>>> Waiting for node to stop."
  STATUS=$(fetch_node_root)
  while [[ "$STATUS" =~ $node_up_re ]]; do
    STATUS=$(fetch_node_root)
    sleep 1
  done

  # wait for cluster status yellow
  # NOTE(review): this waits until "yellow" is reported, with no upper bound.
  echo ">>>>>> Waiting for cluster to reach yellow status"
  wait_for_cluster_status "$MASTER" yellow

  # Perform changes to the node. A failure is reported but does not stop the
  # restart sequence.
  echo ">>>>>> Running updates on ${NODE}"
  "$SCRIPT" "$NODE_CFG"
  result=$?
  if [ "$result" -ne 0 ]; then
    printf ">>>>>> Error: [%d] when executing command: '%s' on node %s\n" \
      "$result" "$SCRIPT" "$NODE"
  fi

  # verify node responds
  echo ">>>>>> Waiting for node ${NODE} to respond after restart. Connection refused messages expected."
  STATUS=""
  while ! [[ "$STATUS" =~ $tagline_re ]]; do
    echo "fetching http://${NODE}/"
    STATUS=$(fetch_node_root)
    sleep 1
  done

  # wait for cluster status yellow by talking directly to the restarted node.
  echo ">>>>>> Verify restarted node sees cluster as yellow"
  wait_for_cluster_status "http://${NODE}" yellow

  # We've had problems on larger clusters with nodes behaving as if routing
  # was re-enabled before the node was fully joined to the cluster (e.g.
  # the shards were vacated from the node when they should not have been).
  # This has led us to believe that the check for cluster yellow on the node
  # is not truly sufficient to ensure the shards won't be reallocated.
  # In ES 2.X and 5.X we have been unable to find an API endpoint that will
  # provide us with a better indication of node availability, so we sleep
  # longer.
  sleep 60

  # re-enable routing allocation
  echo ">>>>>> Re-enabling routing allocation"
  if ! set_routing_allocation all; then
    echo "Failed acknowledge of allocation enable for ${NODE}. Will try again."
  fi

  sleep 15

  # re-enable routing allocation a second time, whatever the first result was
  echo ">>>>>> Re-enabling routing allocation one more time"
  if ! set_routing_allocation all; then
    echo "Failed acknowledge of allocation enable for ${NODE}. Continuing but manual intervention may be required."
  fi

  # wait for cluster status green as reported by the master.
  echo ">>>>>> Waiting for green cluster status"
  STATUS=""
  COUNT=0
  ITERATIONS=0
  while [ -z "$STATUS" ]; do
    # verify cluster is green
    STATUS=$(curl -sS -G "${MASTER}/_cat/health" -d h=status | grep green)
    COUNT=$((COUNT + 1))
    # After more than 60 polls, send the "all" setting again; this is done at
    # most 5 times per node, after which the loop just keeps polling.
    if [ "$COUNT" -gt 60 ] && [ "$ITERATIONS" -lt 5 ]; then
      echo ">>>>>> Still waiting. verifying routing allocation enabled."
      set_routing_allocation all
      COUNT=0
      ITERATIONS=$((ITERATIONS + 1))
    fi
    sleep 1
  done

  echo ">>>>>> Node ${NODE} restart completed at $(date)"
done