-
Notifications
You must be signed in to change notification settings - Fork 29
/
gaspi_run
executable file
·377 lines (330 loc) · 8.74 KB
/
gaspi_run
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
#!/bin/sh
# Copyright (c) Fraunhofer ITWM - Carsten Lojewski <[email protected]>, 2013-2021
# This file is part of GPI-2.
# GPI-2 is free software; you can redistribute it
# and/or modify it under the terms of the GNU General Public License
# version 3 as published by the Free Software Foundation.
# GPI-2 is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with GPI-2. If not, see <http://www.gnu.org/licenses/>.
# ---------------------------------------------------------------------------
# Global state and defaults shared by the helper functions and the main flow.
# ---------------------------------------------------------------------------
SET_NUMA=0                 # 1 when -N/--NUMA is given
MFILE=""                   # machine file in use; switched to TMP_MFILE after validation
MASTER_PRG=""              # binary for the first node (-b); defaults to PRG later
PRG=""                     # resolved path of the GASPI program
PRG_ARGS=""                # arguments following the program, joined into one string
TYPE=GASPI_MASTER          # not referenced elsewhere in this script
MAX_IMMED_CONN=10          # not referenced elsewhere in this script
MAX_NUMA_NODES=1           # limit on repeated entries of one host in the machine file
HAS_NUMA=0                 # set by the numactl check; not read elsewhere in this script
HN=$(hostname)             # compared with the first machine-file entry to pick local/remote start
TMP_PATTERN="/tmp/.gpi2.XXXXXXXX"

# Scratch copies of the machine file (filtered list and de-duplicated list).
# Abort early if they cannot be created: every later step would otherwise run
# with an empty file name.
TMP_MFILE=$(mktemp "${TMP_PATTERN}") || {
  echo "Error: cannot create temporary file (${TMP_PATTERN})" >&2
  exit 1
}
UNIQ_MFILE=$(mktemp "${TMP_PATTERN}") || {
  rm -f -- "$TMP_MFILE"
  echo "Error: cannot create temporary file (${TMP_PATTERN})" >&2
  exit 1
}

ORIG_MFILE=""              # user-supplied machine file, recorded by validate_machinefile
NNODES=0                   # number of processes (-n); 0 means "use every host listed"
DEBUG=0                    # 1 when -d/--debug is given
GASPI_LAUNCHER="ssh"       # command used to reach the other nodes
VALIDATE_MACHINEFILE=0     # 1 when -p/--ping is given
# Delete the two scratch machine files (TMP_MFILE and UNIQ_MFILE).
# Missing files are ignored (-f); the names are quoted so that paths
# containing whitespace or glob characters are removed as a single operand.
remove_temp_file()
{
  rm -f -- "$TMP_MFILE" "$UNIQ_MFILE"
}
#helper functions
# Print the command-line synopsis and option list on stdout, then remove
# the temporary machine files. Does not exit; the caller decides that.
usage()
{
  printf '%s\n' \
    "" \
    "Usage: gaspi_run -m <machinefile> [OPTIONS] <path GASPI program>" \
    "" \
    "Available options:" \
    " -b <binary file> Use a different binary for first node (master)." \
    " -N Enable NUMA for processes on same node." \
    " -n <procs> Start as many <procs> from machine file." \
    " -d Run with GDB (debugger) on master node." \
    " -p Ping hosts before starting binary." \
    " -h This help." \
    ""
  remove_temp_file
}
# Tidy up before the script terminates. Despite the name this function does
# not exit; callers follow it with an explicit `exit`.
#
# Arguments:
#   $1 - 1 to additionally run the `gaspi_cleanup` tool that sits next to this
#        script (if it is executable), passing it the de-duplicated machine
#        file; any other value, or no value, skips that step.
# Globals: UNIQ_MFILE (read)
clean_exit()
{
  # Default to 0 so a call without arguments is not a `test` syntax error.
  if [ "${1:-0}" = 1 ]; then
    cleanup_tool="$(dirname "$0")/gaspi_cleanup"
    if [ -x "$cleanup_tool" ]; then
      "$cleanup_tool" -m "$UNIQ_MFILE"
    fi
  fi
  remove_temp_file
}
# Report a fatal error and terminate the script.
# Prints "Error: <message>" framed by blank lines on stdout, removes the
# temporary machine files and exits with status 1.
#
# Arguments:
#   $1 - message text
print_error_exit()
{
  printf '\nError: %s\n\n' "$1"
  remove_temp_file
  exit 1
}
# Check the machine file and build the working copies used by the launcher.
#
# Steps:
#   1. Reject an empty file or one whose last byte is not a newline.
#   2. Copy the first field of each non-blank line into TMP_MFILE, stopping
#      after NNODES entries. With VALIDATE_MACHINEFILE=1 each copied host is
#      pinged first and an unreachable host aborts the run.
#   3. Point MFILE at the copy (the original name is kept in ORIG_MFILE) and
#      write the list with adjacent duplicates collapsed to UNIQ_MFILE.
#   4. Fail if fewer than NNODES entries were found.
#
# Arguments:
#   $1 - machine file name, used only in error messages
# Globals:
#   MFILE (read, then reassigned), ORIG_MFILE (written), TMP_MFILE,
#   UNIQ_MFILE, NNODES, VALIDATE_MACHINEFILE (read)
# Exits (via print_error_exit) on any validation failure.
validate_machinefile()
{
  if [ ! -s "$MFILE" ]; then
    print_error_exit "Empty machine file ($1)"
  fi

  # newline at the end: command substitution strips a trailing newline, so
  # anything left over means the file ends with a different byte.
  last_byte=$(tail -c 1 "$MFILE")
  if [ -n "$last_byte" ]; then
    print_error_exit "No newline at end of machine file ($1)"
  fi

  # create tmp_file
  # NOTE(review): mode 777 makes the list world-writable in /tmp; confirm
  # whether read access alone would be enough.
  rm -f -- "$TMP_MFILE"
  if ! { touch "$TMP_MFILE" 2>/dev/null && chmod 777 "$TMP_MFILE"; }; then
    print_error_exit "User permissions failure: $PWD is not writable"
  fi

  ncount=0
  # `read` splits on blanks: the first field is the host, the rest is ignored.
  while read -r host _rest
  do
    [ -n "$host" ] || continue

    ncount=$((ncount+1))
    # Stop before touching hosts beyond the requested count, so an
    # unreachable host that will not be used cannot abort the run.
    if [ "$NNODES" -lt "$ncount" ]; then
      break
    fi

    if [ "$VALIDATE_MACHINEFILE" -eq 1 ]; then
      if ! ping -c 1 "$host" >/dev/null 2>&1; then
        print_error_exit "Host not reachable ($host)"
      fi
    fi

    printf '%s\n' "$host" >> "$TMP_MFILE"
  done < "$MFILE"

  ORIG_MFILE=$MFILE
  MFILE=$TMP_MFILE
  uniq "$MFILE" > "$UNIQ_MFILE"

  # number of nodes (NNODES) must fit number of hosts
  # $(( )) normalises the padded number some `wc` implementations print.
  n=$(($(wc -l < "$MFILE")))
  if [ "$n" -lt "$NNODES" ]; then
    print_error_exit "Not enough hosts ($n) for required number of nodes (-n $NNODES)"
  fi
}
# Signal handler (installed for TERM, INT and QUIT): forcibly kill the GASPI
# program on every distinct host of the machine file, then exit with status 1.
#
# For each host in UNIQ_MFILE a `killall -9` is launched in the background
# through GASPI_LAUNCHER, targeting the program's base name, its "lt-"
# prefixed name and its full path. The temporary files are removed and the
# function waits for the background launches before exiting.
kill_procs()
{
  echo "Killing GASPI procs..."
  # clean_exit 1

  # Same command for every host, so build it once. The trailing blank is
  # part of the command string as originally sent.
  prog_name=$(basename "${PRG}")
  kill_cmd="nohup killall -9 ${prog_name} lt-${prog_name} ${PRG} "

  # A for-loop (not `while read`) keeps the launcher from consuming the
  # host list through stdin.
  for host in $(cat "$UNIQ_MFILE")
  do
    $GASPI_LAUNCHER "$host" "$kill_cmd" > /dev/null 2>&1 &
  done

  remove_temp_file
  wait
  exit 1
}
#at least we need machinefile and binary
if [ $# -lt 3 ]; then
  usage
  exit 1
fi

#command line parsing
# Options are read left to right until the first argument that is not a
# known option; that argument must be the executable GASPI program and
# everything after it is passed to the program unchanged.
while [ -n "$1" ]; do
  case "$1" in
    -m | --machinefile)
      shift
      # Quoted so that a missing value is rejected instead of silently
      # passing a one-argument test.
      if [ -r "$1" ]; then
        MFILE=$1
      else
        print_error_exit "Cannot read $1 (-m option) (or file does not exist)"
      fi
      ;;
    -N | --NUMA)
      SET_NUMA=1
      ;;
    -d | --debug)
      DEBUG=1
      ;;
    -h | --help)
      # usage removes the temporary files; stop here instead of going on
      # to launch the program.
      usage
      exit 0
      ;;
    -b | --binary)
      shift
      if [ -f "$1" ] && [ -x "$1" ]; then
        MASTER_PRG=$1
      else
        print_error_exit "Cannot find $1 (-b option) (or file is not executable)"
      fi
      ;;
    -n | --nodes)
      shift
      # The value is used in numeric tests later; reject anything else now.
      case "$1" in
        '' | *[!0-9]*)
          print_error_exit "Invalid number of processes '$1' (-n option)"
          ;;
      esac
      NNODES=$1
      ;;
    -p | --ping)
      VALIDATE_MACHINEFILE=1
      ;;
    *) #Sucks! we're taking a small risk here
      # Require a regular file: directories also pass -x.
      if [ -f "$1" ] && [ -x "$1" ]; then
        PRG=$(readlink -f "$1")
        shift
        PRG_ARGS="$*"
        break
      else
        print_error_exit "Cannot execute $1 (or file does not exist)"
      fi
      ;;
  esac
  shift
done

if [ -z "$PRG" ]; then
  print_error_exit "No binary file provided. See help (-h option)"
fi

# Without a machine file the later steps would read from stdin and hang.
if [ -z "$MFILE" ]; then
  print_error_exit "No machine file provided. See help (-h option)"
fi
# Kill the remote processes and clean up when the launcher is interrupted.
trap kill_procs TERM INT QUIT

#sanity check and settings
# MAX_NUMA_NODES bounds how often one host may be repeated in the machine
# file: the node count reported by numactl when NUMA was requested (1 if
# numactl is missing or its output is unusable), otherwise 256.
if [ "${SET_NUMA:-0}" -eq 1 ]; then
  if command -v numactl > /dev/null 2>&1; then
    # First line containing "available"; its second field is the node count.
    MAX_NUMA_NODES=$(numactl --hardware | awk '/available/ { print $2; exit }')
    HAS_NUMA=1
  else
    MAX_NUMA_NODES=1
    HAS_NUMA=0
  fi
  # Anything that is not a plain number would break the numeric comparison
  # done while spawning.
  case "$MAX_NUMA_NODES" in
    '' | *[!0-9]*)
      MAX_NUMA_NODES=1
      HAS_NUMA=0
      ;;
  esac
else
  MAX_NUMA_NODES=256
fi
#let's rock
#use all host in machines file
if [ "$NNODES" -eq 0 ]; then
  # Count lines that contain a host, i.e. exactly the lines
  # validate_machinefile keeps. Whitespace-only lines are not hosts.
  NNODES=$(awk 'NF { count++ } END { print count + 0 }' "$MFILE")
  if [ "$NNODES" -eq 0 ]; then
    print_error_exit "Empty machine file ($MFILE)"
  fi
fi

validate_machinefile "$MFILE"

#master binary is the same
if [ -z "$MASTER_PRG" ]; then
  MASTER_PRG=$PRG
fi

# The spawner helper is expected next to this script.
location=$(readlink -f "$(dirname "$0")")
if [ ! -x "${location}/ssh.spawner" ]; then
  echo
  echo "The required spawner is missing (or not executable)"
  echo
  clean_exit 0
  exit 1
fi
#who's master
# Launch the spawner on every host of the validated machine file.
# The first line names the master node; the remaining lines are walked in
# order and consecutive identical lines are treated as one run of the same host.
master_node=`head -n 1 $MFILE`
previous="$master_node"
# j    - number of times $previous has been repeated so far (0 = seen once)
# node - incremented once each time a run of identical hosts is completed
# rank - value of j captured at the most recent in-loop launch
j=0
node=-1
rank=0
for LINE in $(tail -n +2 $MFILE)
do
if [ "$LINE" = "$previous" ]; then
# Same host again: count it and reject the file once the repeat count
# reaches MAX_NUMA_NODES.
j=$((j+1))
if [ $j -eq $MAX_NUMA_NODES ]; then
echo
echo "Error: incorrect machine file (-m $ORIG_MFILE) (max procs per node: $MAX_NUMA_NODES)"
echo
clean_exit 0
exit 1
fi
else
# Host changed: the run for $previous is complete, launch its spawner.
location=$(readlink -f `dirname $0`)
# The spawner receives: master node, a flag that is 1 only when $previous is
# the master node, the machine file path and j.
# NOTE(review): the meaning of these arguments is defined by ssh.spawner,
# which is not visible here - confirm against that script.
if [ "$previous" != "$master_node" ]; then
cmd="${location}/ssh.spawner ${master_node} 0 $(readlink -f $MFILE) $j "
else
if [ $j -eq 0 ]; then
# Master node listed exactly once: nothing is spawned for it here;
# only the node counter advances.
node=$((node+1))
previous="$LINE"
continue;
else
cmd="${location}/ssh.spawner ${master_node} 1 $(readlink -f $MFILE) $j "
fi
fi
rank=$j
node=$((node+1))
# Remaining spawner arguments: node counter, rank, total process count,
# NUMA flag (1/0), then the program and its arguments.
cmd="$cmd $node $rank $NNODES"
if [ $SET_NUMA -eq 1 ]; then
cmd="$cmd 1"
else
cmd="$cmd 0"
fi
cmd="$cmd ${PRG} ${PRG_ARGS}"
# Run in the background on $previous; $(cat /dev/null) expands to nothing.
$GASPI_LAUNCHER $previous "nohup $cmd $(cat /dev/null)" &
j=0
fi
previous="$LINE"
done
# Launch for the last run of hosts, which the loop never completes.
# This happens unconditionally, including when $previous is still the master
# node with j = 0 (a case the loop itself skips).
node=$((node+1))
location=$(readlink -f `dirname $0`)
if [ "$previous" != "$master_node" ]; then
cmd="${location}/ssh.spawner ${master_node} 0 $(readlink -f $MFILE) $j"
else
cmd="${location}/ssh.spawner ${master_node} 1 $(readlink -f $MFILE) $j"
fi
# NOTE(review): unlike inside the loop, rank is not refreshed from j before
# it is used here, so it carries the value of the previous launch - confirm
# this is intended.
cmd="$cmd $node $rank $NNODES"
if [ $SET_NUMA -eq 1 ]; then
cmd="$cmd 1"
else
cmd="$cmd 0"
fi
cmd="$cmd ${PRG} ${PRG_ARGS}"
# Only this final launch adds "2>&1" to the remote command.
$GASPI_LAUNCHER $previous "nohup $cmd $(cat /dev/null) 2>&1" &
# Record the program path in a fixed file and, if this user owns it, make it
# readable and writable by everyone.
# NOTE(review): presumably read by gaspi_cleanup - confirm. The fixed,
# predictable name in /tmp is shared between all users of the machine.
echo $PRG > /tmp/.last_gaspi_prg
if [ -O /tmp/.last_gaspi_prg ]; then
chmod a+rw /tmp/.last_gaspi_prg > /dev/null 2>&1
fi
#start master
# The master process (rank 0) runs directly on this host when the first
# machine-file entry equals the local host name; otherwise it is started
# through the launcher on that remote host.
if [ "$master_node" = "$HN" ]; then
  # Local master: hand the settings over through the environment.
  export GASPI_MASTER=$HN
  if [ $SET_NUMA -eq 1 ]; then
    export GASPI_SET_NUMA_SOCKET=1
  fi
  export GASPI_SOCKET=0
  export GASPI_MFILE=$MFILE
  export GASPI_RANK=0
  export GASPI_NRANKS=$NNODES

  # MASTER_PRG and PRG_ARGS stay unquoted on purpose: PRG_ARGS is a single
  # blank-joined string that has to be split back into words.
  if [ $DEBUG != 0 ]; then
    gdb --args $MASTER_PRG $PRG_ARGS
    master_status=$?
  else
    $MASTER_PRG $PRG_ARGS
    master_status=$?
  fi

  if [ $master_status != 0 ]; then
    clean_exit 1
    exit 1
  fi
else
  if [ $DEBUG != 0 ]; then
    echo
    echo "Running with debugger only allowed if current node ($HN) is the first node in machinefile"
    echo
    clean_exit 1
    exit 1
  fi

  #prepare differently for remote node
  # The remote master needs the machine file under the same absolute path.
  chmod a+r "$MFILE" > /dev/null 2>&1
  MFILE=$(readlink -f "$MFILE")
  if ! scp "$MFILE" "${master_node}:${MFILE}" > /dev/null 2>&1; then
    echo "Warning: Failed to copy machinefile to remote host. Program might not start correctly."
  fi

  # Same settings as the local case, passed as exports inside one
  # single-quoted `sh -c` script.
  remote_script="export GASPI_MASTER=$master_node"
  remote_script="$remote_script; export GASPI_SOCKET=0"
  remote_script="$remote_script; export GASPI_MFILE=$MFILE"
  remote_script="$remote_script; export GASPI_RANK=0"
  remote_script="$remote_script; export GASPI_NRANKS=$NNODES"
  if [ $SET_NUMA -eq 1 ]; then
    remote_script="$remote_script; export GASPI_SET_NUMA_SOCKET=1"
  fi
  remote_script="$remote_script; $MASTER_PRG $PRG_ARGS"
  cmd="/bin/sh -c '$remote_script'"

  # $cmd is deliberately unquoted so the launcher receives it as words.
  if ! $GASPI_LAUNCHER $master_node $cmd; then
    echo "Error: Failed to start $MASTER_PRG on $master_node"
    clean_exit 1
    exit 1
  fi
fi

# Wait for the spawners that were started in the background.
wait

#clean-up and exit
clean_exit 0
exit 0