#!/bin/bash
#How to run this script:
#wget -v -O search-errors.sh https://raw.githubusercontent.com/Cyreex/sh/master/search-errors.sh
#chmod +x search-errors.sh
#./search-errors.sh
#
# https://help.tpondemand.net/fix-heketi-lost-volumes/
#
#fixProblems YES - fix problems:
# - delete folders related to non-existent bricks on GlusterFS
# - delete lost LVs
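#
#Assumed prerequisites: kubectl configured with access to the "glusterfs" namespace,
#and the glusterfs client installed on this host (mount.glusterfs is used below).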
fixProblems="NO"
timestamp=$(date +%s)
logFileName="/tmp/SearchErrors_$timestamp.txt"
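#logs <message> [red|yellow] - print a colored message and append it to the log file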
logs() {
case $2 in
red)
echo -en '\033[31m'
;;
yellow)
echo -en '\033[33m'
;;
*)
echo -en '\033[0;32m'
;;
esac
echo "$1" | tee -a "$logFileName"
echo -en '\033[0m'
}
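#showCheckHeader <header> - print the header once, before the first finding of a check; uses the globals $count and $logMarker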
showCheckHeader() {
if [ $count -eq 0 ] && [ $logMarker -eq 0 ]; then
logs "$1"
logMarker=1
return $logMarker
fi
}
read -p "Can I Fix problems? yes/NO: " fixProblems
if [ "${fixProblems^^}" = "YES" ]; then
logs "We'll try to fix errors"
else
logs "Fix problems disabled"
fi
#get all gluster volumes
glusterPodName=$(kubectl get po -n glusterfs -l=name=glusterfs-gluster -o jsonpath='{.items[0].metadata.name}')
gv=$(kubectl exec -n glusterfs $glusterPodName -- gluster v list)
#Get list of heketi volumes
hv=$(kubectl exec -n glusterfs glusterfs-heketi-0 -c heketi -- heketi-cli volume list | awk -F":" '{print $4}')
#Get list of volumes created by PV
pvs=$(kubectl get pv -o jsonpath='{.items[*].spec.glusterfs.path}')
#You can delete the volumes found by the next check from Heketi manually:
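#(example, mirroring the commands this script runs below)
#  kubectl exec -n glusterfs glusterfs-heketi-0 -c heketi -- heketi-cli volume list
#  kubectl exec -n glusterfs glusterfs-heketi-0 -c heketi -- heketi-cli volume delete <volume-id>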
logMarker=0
for i in $hv; do
count=$(echo $gv | grep $i -c)
showCheckHeader "These volumes we have in the Heketi but don't have in the Gluster (lost data):"
[ $count -eq 0 ] && logs $i yellow
if [ $count -eq 0 ] && [ "${fixProblems^^}" = "YES" ]; then
read -p "Heketi volume $i will be deleted. To continue press 'Enter'. "
heketi_volume_id=$(kubectl exec -n glusterfs glusterfs-heketi-0 -c heketi -- heketi-cli volume list | grep $i | awk '{print $1}' | cut -d":" -f2)
kubectl exec -n glusterfs glusterfs-heketi-0 -c heketi -- heketi-cli volume delete $heketi_volume_id
fi
done
logMarker=0
for i in $gv; do
count=$(echo $hv | grep $i -c)
showCheckHeader "These volumes we have in the Gluster but don't have in the Heketi (lost control):"
[ $count -eq 0 ] && logs $i red
done
#Looks like these PVs were created with errors
logMarker=0
for i in $pvs; do
count=$(echo $gv | grep $i -c)
showCheckHeader "These PV doesn't work, because volumes are absent in the Gluster:"
[ $count -eq 0 ] && logs $i yellow
done
#################### Functions for investigating and fixing problems ######################
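#backup_brick - optionally copy the brick contents from the pod to $backup_dir on this host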
backup_brick() {
sure="n"; read -p "Do we need to create backup of this brick? y/N: " sure
if [ "${sure^^}" = "Y" ]; then
logs "Copy brick $brick to $backup_dir on host"
mkdir -p $backup_dir/$brick
kubectl cp -n glusterfs $i:var/lib/heketi/mounts/$vol_name/$brick $backup_dir/$brick
fi
}
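#remove_brick_mount_points - delete the brick mount directory in the pod and drop its entry from /var/lib/heketi/fstab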
remove_brick_mount_points() {
logs "Remove mount points for brick $brick"
kubectl exec -n glusterfs $i -- rm -rf /var/lib/heketi/mounts/$vol_name/$brick
kubectl exec -n glusterfs $i -- sed -i.save "/${brick}/d" /var/lib/heketi/fstab
}
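#search_and_delete_lost_lv - unmount the lost brick, remove its thin pool LV, then clean up its mount points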
search_and_delete_lost_lv() {
YELLOW='\033[0;33m'
NORMAL='\033[0m'
echo -e ${YELLOW}
sure="n"; read -p "LV $brick will be deleted. Are you sure? y/N: " sure
if [ "${sure^^}" = "Y" ]; then
tp_name=$(kubectl exec -n glusterfs $i -- lvs | grep $brick | awk '{print $5}')
logs "LV $tp_name will be deleted!" red
kubectl exec -n glusterfs $i -- umount -f /var/lib/heketi/mounts/$vol_name/$brick
kubectl exec -n glusterfs $i -- lvremove -f $vol_name/$tp_name
remove_brick_mount_points
else
logs "You skip deleting the brick"
fi
echo -e ${NORMAL}
}
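#inspect_brick - temporarily mount the brick LV inside the pod and show its contents and disk usage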
inspect_brick() {
logs "Try to inspect the brick:"
logs "kubectl exec -n glusterfs $i -- bash" yellow
kubectl exec -n glusterfs $i -- bash -c "mkdir -p /mnt/tmp && mount /dev/mapper/$vol_name-$brick \
/mnt/tmp && ls -la /mnt/tmp/brick && echo ... df ... && df -h /mnt/tmp && umount -f /mnt/tmp"
kubectl exec -n glusterfs $i -- bash -c "umount -f /mnt/tmp"
}
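#delete_gluster_volume - stop and delete the Gluster volume after an extra confirmation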
delete_gluster_volume() {
RED='\033[31m'
NORMAL='\033[0m'
echo -e ${RED}
sure="n"; read -p "Volume $volume will be deleted. Are you sure? y/N: " sure
echo -e ${NORMAL}
if [ "${sure^^}" = "Y" ]; then
kubectl exec -it -n glusterfs $glusterPodName -- gluster volume stop $volume force
kubectl exec -it -n glusterfs $glusterPodName -- gluster volume delete $volume
logs "After deleting the Gluster volume you must also remove its bricks in the following steps" red
else
logs "You skip deleting the volume"
fi
}
######################## PAIN 00 ####################################################
# Sometimes we have Gluster volumes that we don't use, but they still take up disk space
#####################################################################################
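#A suspect volume can also be checked by hand inside the Gluster pod, e.g. (same gluster CLI used below):
#  kubectl exec -it -n glusterfs $glusterPodName -- gluster volume info <volume>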
echo ..................................................
logs "............. Check that volume using by PV ......................"
logMarker=0
for volume in $gv; do
count=$(echo $pvs | grep $volume -c)
[[ $volume = vol_* ]] && showCheckHeader "These Gluster volumes are not related to any PV. Maybe we should delete them?"
if [ $count -eq 0 ] && [[ $volume = vol_* ]]; then
logs $volume yellow
logs "Try to mount this volume and inspect:"
logs "mount.glusterfs localhost:/$volume /mnt/$volume"
mkdir -p "/mnt/$volume"
mount.glusterfs localhost:/$volume /mnt/$volume
logs "Volume $volume is mounted to /mnt/$volume on localhost. You can check it out."
logs "$(ls -la /mnt/$volume)" yellow
echo "......... Volume disk usage ............."
logs "$(df -h /mnt/$volume)" yellow
read -p "Press 'Enter' to continue..."
umount /mnt/$volume && rm -r /mnt/$volume
[ "${fixProblems^^}" = "YES" ] && delete_gluster_volume
fi
done
######################## PAIN 01 #####################################
#Our primary problem: LVs that are no longer used but have not been deleted.
#As a result we may not have enough free space to create a new Gluster volume or expand an existing one.
#This is a real problem that we need to fix ASAP.
######################################################################
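#Lost LVs can also be listed by hand on each Gluster pod, e.g. (same LVM commands the checks below rely on):
#  kubectl exec -n glusterfs <gluster-pod> -- lvs
#  kubectl exec -n glusterfs <gluster-pod> -- lvdisplay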
echo ..................................................
gluster_pods=$(kubectl get po -n glusterfs -l=name=glusterfs-gluster -o jsonpath='{.items[*].metadata.name}')
for i in $gluster_pods; do
#Create the backup of fstab
if [ "${fixProblems^^}" = "YES" ]; then
pandora="NO"
read -p "Are you sure you want open the Pandora Box for the POD $i? This is the last chance to stop it! yes/NO: " pandora
if [ "${pandora^^}" = "YES" ]; then
logs "LET'S ROCK!"
else
logs "Good choise, man!"
exit 1
fi
backup_dir="/tmp/backup_${i}_${timestamp}"
mkdir $backup_dir
kubectl cp -n glusterfs $i:var/lib/heketi/fstab $backup_dir
fi
#Show the GlusterFS POD which we use in this step
kubectl get po -n glusterfs $i -o wide | tee -a "$logFileName"
#Get information about ALL gluster volumes
gi=$(kubectl exec -n glusterfs $i -- gluster v info all)
#Get the VG name that this GlusterFS container uses
vol_name=$(kubectl exec -n glusterfs $i -- ls /var/lib/heketi/mounts/ | grep vg_)
#Get bricks registered in gluster on this container
bricks=$(kubectl exec -n glusterfs $i -- ls /var/lib/heketi/mounts/$vol_name)
#Find lost bricks and fix them
logs "............ Check bricks mount paths ..................."
logMarker=0
for brick in $bricks; do
count=$(echo $gi | grep $brick -c)
showCheckHeader "These BRICKS don't related with any VOLUMES (volumes maybe deleted):"
if [ $count -eq 0 ]; then
logs $brick yellow
lvcount=$(kubectl exec -n glusterfs $i -- lvdisplay | grep "LV Name" | grep $brick -c)
#if no LV matches the brick name, we can just delete the folder and remove the mount point from /var/lib/heketi/fstab
case $lvcount in
0)
logs "Mount point $brick don't related with any LV. We can just delete it from Gluster container"
if [ "${fixProblems^^}" = "YES" ]; then
backup_brick
remove_brick_mount_points
fi
;;
#if we find exactly one LV, we need to back up and remove the brick, then remove the LV
1)
logs "Mount point $brick RELATED with ONE LV. We need to delete LV as well"
#Get files from the brick
inspect_brick
if [ "${fixProblems^^}" = "YES" ]; then
backup_brick
search_and_delete_lost_lv
fi
;;
#If we find more than one LV, this is strange... Better to check it manually
*)
logs "ERROR: Something went wrong with brick $brick - lvcount=$lvcount"
;;
esac
fi
done
#Search for brick LVs that are not related to any Gluster volume
logs "............ Check bricks lvs ..................."
logMarker=0
#Get the brick logical volumes
bricks_lv=$(kubectl exec -n glusterfs $i -- lvdisplay | grep "LV Name" | grep "brick_" | awk '{print $3}')
for brick in $bricks_lv; do
count=$(echo $gi | grep $brick -c)
showCheckHeader "These Logical Volumes (LV) don't related with any Gluster volumes (volumes maybe deleted):"
if [ $count -eq 0 ]; then
logs $brick yellow
inspect_brick
[ "${fixProblems^^}" = "YES" ] && search_and_delete_lost_lv
fi
done
done