#!/bin/bash
# Copyright 2021 NVIDIA Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
set -euo pipefail
export SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
source "$SCRIPT_DIR/common.sh"
# Print usage if requested
if [[ $# -lt 2 || ! "$1" =~ ^(1/)?[0-9]+(:[0-9]+)?$ ]]; then
    echo "Usage: $(basename "${BASH_SOURCE[0]}") <num-nodes>[:<ranks-per-node>] <prog.py> <arg1> <arg2> ..."
    echo "Positional arguments:"
    echo "  <num-nodes> : positive integer, or a unit fraction for partial-node runs (e.g. 1/4)"
    echo "  <ranks-per-node> : positive integer (default: 1)"
    echo "  <argI> : arguments for the program itself, Legate, or Legion"
    echo "           see legate -h for options accepted by Legate"
    echo "           see the Legion README for options accepted by Legion"
    echo "Arguments read from the environment:"
    echo "  ACCOUNT : account/group/project to submit the job under (if applicable)"
    echo "  DRY_RUN : don't submit the job, just print out the job scheduler command (default: 0)"
    echo "  EXTRA_ARGS : extra arguments to pass to legate on each invocation of the program;"
    echo "               the arguments for each run are separated by semicolons"
    echo "               (example: '--profile;--event --dataflow', default: '')"
    echo "  IMAGE : which image to use (for container-based clusters)"
    echo "  INTERACTIVE : submit an interactive rather than a batch job (default: 0)"
    echo "  ITERATIONS : how many times to run the program (default: one run per EXTRA_ARGS group)"
    echo "  MOUNTS : comma-separated list of volume mounts (for container-based clusters)"
    echo "           (syntax depends on cluster) (default: none)"
    echo "  NODRIVER : don't invoke the Legate driver script (default: 0)"
    echo "  NOWAIT : don't wait for batch jobs to start (default: 0)"
    echo "  PLATFORM : what machine we are executing on (default: auto-detected)"
    echo "  QUEUE : what queue/partition to submit the job to (default: depends on cluster)"
    echo "  RESERVED_CORES : physical cores to reserve for Legion & Realm meta-work (default: 4)"
    echo "  SCRATCH : where to create an output directory (default: .)"
    echo "  TIMELIMIT : how much time to request for the job, in minutes (default: 60)"
    echo "  USE_CUDA : run with CUDA enabled (default: enable if Legate was compiled with CUDA support)"
    echo "  USE_OPENMP : run with OpenMP enabled (default: enable if Legate was compiled with OpenMP"
    echo "               support, but skip if USE_CUDA==1)"
    exit
fi
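# Example invocations (flags illustrative; prog.py stands for any Legate program):
#   ./run.sh 2:4 prog.py --size 1000                    # 2 nodes, 4 ranks per node
#   DRY_RUN=1 ./run.sh 1/4 prog.py                      # quarter of one node; only print the scheduler command
#   EXTRA_ARGS='--profile;--event' ./run.sh 1 prog.py   # 2 runs, one per semicolon-separated group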
# Read arguments
NODE_STR="$1"
if [[ "$1" == *":"* ]]; then
RANKS_PER_NODE="${NODE_STR#*:}"
NODE_STR="${NODE_STR%%:*}"
else
RANKS_PER_NODE=1
fi
if [[ "$NODE_STR" =~ ^1/[0-9]+$ ]]; then
NUM_NODES=1
RATIO_OF_NODE_USED="$NODE_STR"
else
NUM_NODES="$NODE_STR"
RATIO_OF_NODE_USED=1
fi
NODE_RATIO="$RATIO_OF_NODE_USED / $RANKS_PER_NODE"
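# NODE_RATIO is a textual expression (e.g. "1/4 / 2") substituted verbatim into
# $(( )) below; * and / share precedence and evaluate left-to-right, so e.g.
# $(( 8 * 1/4 / 2 )) == ((8*1)/4)/2 == 1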
shift
detect_platform
if [[ "$PLATFORM" == summit ]]; then
CONTAINER_BASED=0
elif [[ "$PLATFORM" == perlmutter ]]; then
CONTAINER_BASED=0
elif [[ "$PLATFORM" == pizdaint ]]; then
CONTAINER_BASED=0
elif [[ "$PLATFORM" == sapling2 ]]; then
CONTAINER_BASED=0
elif [[ "$PLATFORM" == lassen ]]; then
CONTAINER_BASED=0
else
# Local run
CONTAINER_BASED=0
fi
export DRY_RUN="${DRY_RUN:-0}"
if [[ "$DRY_RUN" == 1 ]]; then
export NOWAIT=1
fi
# The default being " " rather than "" is by design: splitting a single space on
# ';' below still yields one (empty) group, so ITERATIONS defaults to 1
export EXTRA_ARGS="${EXTRA_ARGS:- }"
export INTERACTIVE="${INTERACTIVE:-0}"
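# If ITERATIONS is unset, default to one run per semicolon-separated group in EXTRA_ARGS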
if [[ -z "${ITERATIONS+x}" ]]; then
IFS=';' read -ra EXTRA_ARGS_ARR <<< "$EXTRA_ARGS"
export ITERATIONS="${#EXTRA_ARGS_ARR[@]}"
fi
export MOUNTS="${MOUNTS:-}"
export NODRIVER="${NODRIVER:-0}"
export NOWAIT="${NOWAIT:-0}"
export RESERVED_CORES="${RESERVED_CORES:-4}"
export SCRATCH="${SCRATCH:-.}"
export TIMELIMIT="${TIMELIMIT:-60}"
if [[ "$CONTAINER_BASED" == 1 ]]; then
# Can't auto-detect this outside the container, assume best-case scenario
export USE_CUDA="${USE_CUDA:-1}"
if [[ -z "${USE_OPENMP+x}" ]]; then
if [[ "$USE_CUDA" == 1 ]]; then
export USE_OPENMP=0
else
export USE_OPENMP=1
fi
fi
else
if [[ -z "${USE_CUDA+x}" ]]; then
if ! command -v legate &> /dev/null; then
# can't query; assume best-case scenario
export USE_CUDA=1
elif [[ "$(legate --info | grep use_cuda | awk '{ print $3 }')" == True ]]; then
export USE_CUDA=1
else
export USE_CUDA=0
fi
fi
if [[ -z "${USE_OPENMP+x}" ]]; then
if [[ "$USE_CUDA" == 1 ]]; then
export USE_OPENMP=0
elif ! command -v legate &> /dev/null; then
# can't query; assume best-case scenario
export USE_OPENMP=1
elif [[ "$(legate --info | grep use_openmp | awk '{ print $3 }')" == True ]]; then
export USE_OPENMP=1
else
export USE_OPENMP=0
fi
fi
fi
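# Note: the parsing above assumes "legate --info" prints lines like
# "use_cuda : True", so awk's third field is the configured value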
# Prepare output directory
DATE="$(date +%Y/%m/%d)"
TIME="$(date +%H%M%S)"
mkdir -p "$SCRATCH/$DATE"
export HOST_OUT_DIR="$SCRATCH/$DATE/$TIME"
mkdir "$HOST_OUT_DIR"
echo "Redirecting stdout, stderr and logs to $HOST_OUT_DIR"
if [[ "$CONTAINER_BASED" == 1 ]]; then
export CMD_OUT_DIR=/result
else
export CMD_OUT_DIR="$HOST_OUT_DIR"
fi
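# (for container runs, /result is assumed to be where HOST_OUT_DIR gets mounted
# inside the container)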
# Retrieve resources available per node
# The FB_PER_GPU setting should leave 1-2GB for other uses:
# - the needs of Realm itself
# - loading CUDA modules
# - memory allocations from libraries that operate outside of Realm, e.g. NCCL
# The RAM_PER_NUMA setting should leave:
# - ~10% of RAM for the OS, job manager etc.
# - ~4GB for other uses within a Legion application:
# - ~1GB of RAM for GASNet
# - ~1GB of RAM for the Legion runtime
# - 256MB of RAM for sysmem/csize (reserved for the app, non-NUMA-aware)
# - 256MB of RAM for ib_rsize (reserved for remote DMA transfers)
# - 256MB of RAM for ib_csize (reserved for DMA transfers to/from the GPU)
if [[ "$PLATFORM" == summit ]]; then
# 2 NUMA domains per node
# 2 NICs per NUMA domain (4 NICs per node)
# 21 cores per NUMA domain (1 more reserved for OS) (42 cores per node)
# 4-way SMT per core
# 256GB RAM per NUMA domain (512GB RAM per node)
# 3 Tesla V100 GPUs per NUMA domain (6 GPUs per node)
# 16GB FB per GPU
NUMAS_PER_NODE=2
RAM_PER_NUMA=200000
GPUS_PER_NODE=6
CORES_PER_NUMA=21
FB_PER_GPU=14500
elif [[ "$PLATFORM" == perlmutter ]]; then
# 4 NUMA domains per node
# 1 NIC per NUMA domain (4 NICs per node)
# 16 cores per NUMA domain (64 cores per node)
# 2-way SMT per core
# 64GM RAM per NUMA domain (256GB RAM per node)
# 1 Ampere A100 GPU per NUMA domain (4 GPUs per node)
# 80GB FB per GPU
NUMAS_PER_NODE=4
RAM_PER_NUMA=48000
GPUS_PER_NODE=4
CORES_PER_NUMA=16
FB_PER_GPU=76000
elif [[ "$PLATFORM" == pizdaint ]]; then
# 1 NUMA domain per node
# 1 NIC per node
# 12 cores per NUMA domain
# 2-way SMT per core
# 64GB RAM per NUMA domain
# 1 Tesla P100 GPU per node
# 16GB FB per GPU
NUMAS_PER_NODE=1
RAM_PER_NUMA=55000
GPUS_PER_NODE=1
CORES_PER_NUMA=12
FB_PER_GPU=14500
elif [[ "$PLATFORM" == sapling2 ]]; then
# 2 NUMA domains per node
# 1 NIC per node
# 10 cores per NUMA domain
# 2-way SMT per core
# 128GB RAM per NUMA domain
# 4 Tesla P100 GPUs per node (4/8 nodes)
# 16GB FB per GPU
NUMAS_PER_NODE=2
RAM_PER_NUMA=100000
GPUS_PER_NODE=4
CORES_PER_NUMA=10
FB_PER_GPU=14500
CPU_SLOTS="0,2,4,6,8,10,12,14,16,18 20,22,24,26,28,30,32,34,36,38 1,3,5,7,9,11,13,15,17,19 21,23,25,27,29,31,33,35,37,39"
GPU_SLOTS=" 0 1 2 3"
MEM_SLOTS=" 0 0 1 1"
NIC_SLOTS=" mlx5_0 mlx5_0 mlx5_0 mlx5_0"
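    # One whitespace-separated entry per GPU slot: entry i of CPU_SLOTS, MEM_SLOTS
    # and NIC_SLOTS names the CPUs, NUMA domain and NIC associated with GPU i,
    # consumed by compute_binding below when packing slots into per-rank bindings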
elif [[ "$PLATFORM" == lassen ]]; then
# 2 NUMA domains per node
# 2 NICs per NUMA domain (4 NICs per node)
# 20 cores per NUMA domain (2 more reserved for OS) (40 cores per node)
# 4-way SMT per core
# 128GB RAM per NUMA domain (256GB RAM per node)
# 2 Tesla V100 GPUs per NUMA domain (4 GPUs per node)
# 16GB FB per GPU
NUMAS_PER_NODE=2
RAM_PER_NUMA=100000
GPUS_PER_NODE=4
CORES_PER_NUMA=20
FB_PER_GPU=14500
else
# Local run
echo "Did not detect a supported cluster, assuming local-node run."
export NOWAIT=1
# Auto-detect available resources
if [[ "$(uname)" == "Darwin" ]]; then
NUMAS_PER_NODE=1
RAM_PER_NODE=$(( $(sysctl -n hw.memsize) / 1024 / 1024 ))
CORES_PER_NUMA="$(sysctl -n machdep.cpu.core_count)"
else
NUMAS_PER_NODE="$(lscpu | grep 'Socket(s)' | awk '{print $2}')"
RAM_PER_NODE="$(free -m | head -2 | tail -1 | awk '{print $2}')"
CORES_PER_NUMA="$(lscpu | grep 'Core(s) per socket' | awk '{print $4}')"
fi
RAM_PER_NUMA=$(( RAM_PER_NODE / 2 / NUMAS_PER_NODE )) # use half the available system memory
if [[ "$USE_CUDA" == 1 ]]; then
GPUS_PER_NODE="$(nvidia-smi -q | grep 'Attached GPUs' | awk '{print $4}')"
# Leave 5% of the available framebuffer for other uses, but not less than 2GB
FB_PER_GPU="$(nvidia-smi --format=csv,noheader,nounits --query-gpu=memory.total -i 0)"
RESERVED_FB=$(( FB_PER_GPU / 20 ))
RESERVED_FB=$(( RESERVED_FB > 2000 ? RESERVED_FB : 2000 ))
FB_PER_GPU=$(( FB_PER_GPU - RESERVED_FB ))
fi
fi
# Calculate available resources per OpenMP group
NUM_OMPS=$(( NUMAS_PER_NODE * $NODE_RATIO ))
if [[ $NUM_OMPS -lt 1 ]]; then
    NUM_OMPS=1
    CORES_PER_OMP=$(( CORES_PER_NUMA * NUMAS_PER_NODE * $NODE_RATIO ))
    RAM_PER_OMP=$(( RAM_PER_NUMA * NUMAS_PER_NODE * $NODE_RATIO ))
else
    CORES_PER_OMP="$CORES_PER_NUMA"
    RAM_PER_OMP="$RAM_PER_NUMA"
fi
NUM_CORES=$(( NUM_OMPS * CORES_PER_OMP ))
WORK_RAM=$(( NUM_OMPS * RAM_PER_OMP ))
if [[ "$USE_CUDA" == 1 ]]; then
NUM_GPUS=$(( GPUS_PER_NODE * $NODE_RATIO ))
fi
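# Worked example: on perlmutter (4 NUMA domains, 16 cores / 48000MB each) with a
# node spec of "1/2:1": NUM_OMPS = 4*1/2 = 2, NUM_CORES = 32, WORK_RAM = 96000,
# and NUM_GPUS = 4*1/2 = 2 when CUDA is enabled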
# Add legate driver to command
if [[ "$NODRIVER" != 1 ]]; then
set -- --nodes "$NUM_NODES" --ranks-per-node "$RANKS_PER_NODE" "$@"
set -- --verbose --log-to-file "$@"
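    # Note: each "set --" prepends to the positional parameters, so the final
    # command line is assembled back-to-front (options added later appear earlier)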
    # Split available resources between ranks
    if [[ "$USE_OPENMP" == 1 ]]; then
        # Need at least 2 more cores, for 1 CPU processor and 1 Python processor
        RESERVED_CORES=$(( RESERVED_CORES + 2 ))
        # Add 1 core per 2 GPUs (hyperthreads should be sufficient for GPU processors)
        if [[ "$USE_CUDA" == 1 ]]; then
            RESERVED_CORES=$(( RESERVED_CORES + NUM_GPUS / 2 ))
        fi
        # These reserved cores must be subtracted equally from each OpenMP group
        RESERVED_PER_OMP=$(( ( RESERVED_CORES + NUM_OMPS - 1 ) / NUM_OMPS ))
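        # (integer ceiling division: ceil(RESERVED_CORES / NUM_OMPS))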
        if (( RESERVED_PER_OMP >= CORES_PER_OMP )); then
            echo "Error: Not enough cores, try reducing RESERVED_CORES" 1>&2
            exit 1
        fi
        THREADS_PER_OMP=$(( CORES_PER_OMP - RESERVED_PER_OMP ))
        set -- --cpus 1 --sysmem 256 "$@"
        set -- --omps "$NUM_OMPS" --ompthreads "$THREADS_PER_OMP" "$@"
        set -- --numamem "$RAM_PER_OMP" "$@"
        if [[ "$USE_CUDA" == 1 ]]; then
            set -- --gpus "$NUM_GPUS" --fbmem "$FB_PER_GPU" "$@"
        fi
    elif [[ "$USE_CUDA" == 1 ]]; then
        # Need at least 2 more cores, for 1 CPU processor and 1 Python processor
        RESERVED_CORES=$(( RESERVED_CORES + 2 ))
        # Add 1 core per 2 GPUs (hyperthreads should be sufficient for GPU processors)
        RESERVED_CORES=$(( RESERVED_CORES + NUM_GPUS / 2 ))
        if (( RESERVED_CORES > NUM_CORES )); then
            echo "Error: Not enough cores, try reducing RESERVED_CORES" 1>&2
            exit 1
        fi
        set -- --gpus "$NUM_GPUS" --fbmem "$FB_PER_GPU" "$@"
        set -- --cpus 1 --sysmem $(( WORK_RAM > 4000 ? 4000 : WORK_RAM )) "$@"
    else
        # Need at least 1 more core, for the Python processor
        RESERVED_CORES=$(( RESERVED_CORES + 1 ))
        if (( RESERVED_CORES >= NUM_CORES )); then
            echo "Error: Not enough cores, try reducing RESERVED_CORES" 1>&2
            exit 1
        fi
        set -- --cpus $(( NUM_CORES - RESERVED_CORES )) --sysmem "$WORK_RAM" "$@"
    fi
    # Bind resources to ranks, on multi-rank-per-node runs
    function compute_binding() {
        RES_LIST=( "$@" )
        SLOTS_PER_RANK=$(( GPUS_PER_NODE / RANKS_PER_NODE ))
        START_SLOT=0
        BINDING=""
        while (( START_SLOT < GPUS_PER_NODE )); do
            RES_FOR_RANK="${RES_LIST[@]:$START_SLOT:$SLOTS_PER_RANK}"
            # Combine any duplicates
            RES_FOR_RANK="$( echo $RES_FOR_RANK | tr ' ' '\n' | sort -u | tr '\n' ',' )"
            RES_FOR_RANK="${RES_FOR_RANK%?}"
            if [[ "$BINDING" == "" ]]; then
                BINDING="$RES_FOR_RANK"
            else
                BINDING="$BINDING/$RES_FOR_RANK"
            fi
            START_SLOT=$(( START_SLOT + SLOTS_PER_RANK ))
        done
        echo "$BINDING"
    }
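    # Example: with RANKS_PER_NODE=2, `compute_binding 0 1 2 3` yields "0,1/2,3"
    # (slots 0-1 for rank 0, slots 2-3 for rank 1); `compute_binding 0 0 1 1`
    # deduplicates within each rank's group, yielding "0/1"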
if [[ "$RANKS_PER_NODE" == 1 ]]; then
true
elif [[ "$RATIO_OF_NODE_USED" != 1 ]]; then
echo "Warning: Automatic resource binding not supported for partial-node runs"
elif [[ -z "${CPU_SLOTS+x}" ]]; then
echo "Warning: Don't know how to bind resources on this platform"
elif (( GPUS_PER_NODE < RANKS_PER_NODE || GPUS_PER_NODE % RANKS_PER_NODE != 0 )); then
echo "Warning: Cannot split resources $RANKS_PER_NODE-ways; skipping resource binding"
else
set -- --nic-bind "$(compute_binding $NIC_SLOTS)" "$@"
set -- --mem-bind "$(compute_binding $MEM_SLOTS)" "$@"
if [[ "$USE_CUDA" == 1 ]]; then
set -- --gpu-bind "$(compute_binding $GPU_SLOTS)" "$@"
fi
set -- --cpu-bind "$(compute_binding $CPU_SLOTS)" "$@"
fi
    # Add launcher options
    if [[ "$PLATFORM" == summit ]]; then
        set -- --launcher jsrun "$@"
    elif [[ "$PLATFORM" == perlmutter ]]; then
        set -- --launcher srun "$@"
    elif [[ "$PLATFORM" == pizdaint ]]; then
        set -- --launcher srun "$@"
    elif [[ "$PLATFORM" == sapling2 ]]; then
        set -- --launcher mpirun "$@"
    elif [[ "$PLATFORM" == lassen ]]; then
        set -- --launcher jsrun "$@"
    else
        # Local run
        if (( NUM_NODES > 1 || RANKS_PER_NODE > 1 )); then
            set -- --launcher mpirun "$@"
        else
            set -- --launcher none "$@"
        fi
    fi
    set -- legate "$@"
fi
# Submit job to appropriate scheduler
if [[ "$PLATFORM" == summit ]]; then
set -- "$SCRIPT_DIR/legate.lsf" "$@"
QUEUE="${QUEUE:-batch}"
if [[ "$INTERACTIVE" == 1 ]]; then
set -- -Is "$@"
else
set -- -o "$HOST_OUT_DIR/out.txt" "$@"
fi
set -- bsub -J legate -P "$ACCOUNT" -q "$QUEUE" -W "$TIMELIMIT" -nnodes "$NUM_NODES" -alloc_flags smt1 "$@"
submit "$@"
elif [[ "$PLATFORM" == perlmutter ]]; then
# WAR for: https://gasnet-bugs.lbl.gov/bugzilla/show_bug.cgi?id=4638
verbose_export LEGATE_DISABLE_MPI=1
set -- "$SCRIPT_DIR/legate.slurm" "$@"
    # We double the -c value because SLURM counts hardware threads (2-way SMT), not physical cores
    set -- -J legate -A "$ACCOUNT" -t "$TIMELIMIT" -N "$NUM_NODES" "$@"
    set -- --ntasks-per-node "$RANKS_PER_NODE" -c $(( 2 * NUM_CORES )) "$@"
    if [[ "$USE_CUDA" == 1 ]]; then
        set -- -C "gpu&hbm80g" --gpus-per-task "$NUM_GPUS" "$@"
    fi
    if [[ "$INTERACTIVE" == 1 ]]; then
        QUEUE="${QUEUE:-interactive_ss11}"
        set -- salloc -q "$QUEUE" "$@"
    else
        QUEUE="${QUEUE:-regular_ss11}"
        set -- sbatch -q "$QUEUE" -o "$HOST_OUT_DIR/out.txt" "$@"
    fi
    submit "$@"
elif [[ "$PLATFORM" == pizdaint ]]; then
    set -- "$SCRIPT_DIR/legate.slurm" "$@"
    QUEUE="${QUEUE:-normal}"
    set -- -J legate -A "$ACCOUNT" -p "$QUEUE" -t "$TIMELIMIT" -N "$NUM_NODES" "$@"
    if [[ "$USE_CUDA" == 1 ]]; then
        set -- -C gpu "$@"
    fi
    if [[ "$INTERACTIVE" == 1 ]]; then
        set -- salloc "$@"
    else
        set -- sbatch -o "$HOST_OUT_DIR/out.txt" "$@"
    fi
    submit "$@"
elif [[ "$PLATFORM" == sapling2 ]]; then
    set -- "$SCRIPT_DIR/legate.slurm" "$@"
    QUEUE="${QUEUE:-gpu}"
    set -- -J legate -p "$QUEUE" -t "$TIMELIMIT" -N "$NUM_NODES" "$@"
    if [[ "$RATIO_OF_NODE_USED" == 1 ]]; then
        set -- --exclusive "$@"
    fi
    set -- --ntasks-per-node "$RANKS_PER_NODE" -c $(( 2 * NUM_CORES )) "$@"
    if [[ "$USE_CUDA" == 1 ]]; then
        set -- -G $(( GPUS_PER_NODE * $RATIO_OF_NODE_USED )) "$@"
    fi
    if [[ "$INTERACTIVE" == 1 ]]; then
        set -- salloc "$@"
    else
        set -- sbatch -o "$HOST_OUT_DIR/out.txt" "$@"
    fi
    submit "$@"
elif [[ "$PLATFORM" == lassen ]]; then
    set -- "$SCRIPT_DIR/legate.lsf" "$@"
    QUEUE="${QUEUE:-pbatch}"
    if [[ "$INTERACTIVE" == 1 ]]; then
        set -- -Is "$@"
    else
        set -- -o "$HOST_OUT_DIR/out.txt" "$@"
    fi
    set -- bsub -J legate -P "$ACCOUNT" -q "$QUEUE" -W "$TIMELIMIT" -nnodes "$NUM_NODES" -alloc_flags smt1 "$@"
    submit "$@"
else
    # Local run
    run_command "$@" 2>&1 | tee -a "$CMD_OUT_DIR/out.txt"
fi
# Wait for batch job to start
if [[ "$INTERACTIVE" != 1 && "$NOWAIT" != 1 ]]; then
echo "Waiting for job to start & piping stdout/stderr"
echo "Press Ctrl-C anytime to exit (job will still run; use scancel/bkill to kill it)"
while [[ ! -f "$HOST_OUT_DIR/out.txt" ]]; do sleep 1; done
echo "Job started"
    sed '/^Job finished/q' <( exec tail -n +0 -f "$HOST_OUT_DIR/out.txt" ) && kill $!
fi
fi