#!/bin/bash
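# Bootstrap an interactive JupyterLab session on a TACC compute node for the
# sites-and-stories-nlp cookbook: install Miniconda, clone the cookbook, build its
# conda environment, launch JupyterLab behind the TAP reverse tunnels, and start an
# ollama server.
#
# Example invocation (illustrative values only; the repository URL and branch are
# whatever the submitting job passes in):
#   ./run.sh false false https://github.com/<org>/sites-and-stories-nlp.git main
#
#   $1  re-download the latest cookbook version (true|false)
#   $2  recreate the conda environment (true|false)
#   $3  git repository URL
#   $4  git branch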
set -xe
start_time=$(date +%s)
# Validate parameters
if [ "$#" -ne 4 ]; then
    echo "Usage: $0 <download_latest_version true|false> <update_conda_env true|false> <git_repo_url> <git_branch>"
    exit 1
fi
if [ "$1" != "true" ] && [ "$1" != "false" ]; then
    echo "The first parameter must be 'true' or 'false' (re-download the latest cookbook version)"
    exit 1
fi
if [ "$2" != "true" ] && [ "$2" != "false" ]; then
    echo "The second parameter must be 'true' or 'false' (recreate the conda environment)"
    exit 1
fi
function install_conda() {
    echo "Checking if miniconda3 is installed..."
    if [ ! -d "$WORK/miniconda3" ]; then
        echo "Miniconda not found in $WORK..."
        echo "Installing..."
        mkdir -p "$WORK/miniconda3"
        curl https://repo.anaconda.com/miniconda/Miniconda3-py311_23.10.0-1-Linux-x86_64.sh -o "$WORK/miniconda3/miniconda.sh"
        bash "$WORK/miniconda3/miniconda.sh" -b -u -p "$WORK/miniconda3"
        rm -rf "$WORK/miniconda3/miniconda.sh"
        export PATH="$WORK/miniconda3/bin:$PATH"
        echo "Ensuring conda base environment is OFF..."
        conda config --set auto_activate_base false
    else
        export PATH="$WORK/miniconda3/bin:$PATH"
    fi
    conda init bash
    echo "Sourcing .bashrc..."
    source ~/.bashrc
    # Make 'conda activate' available in this non-interactive shell; sourcing
    # ~/.bashrc alone may return early for non-interactive sessions.
    source "$WORK/miniconda3/etc/profile.d/conda.sh"
    unset PYTHONPATH
}
function load_cuda() {
    echo "Loading CUDA..."
    module load cuda/12.0
}
function export_repo_variables() {
    COOKBOOK_NAME="sites-and-stories-nlp"
    COOKBOOK_CONDA_ENV="llm"
    COOKBOOK_DIR=${WORK}/cookbooks
    COOKBOOK_WORKSPACE_DIR=${COOKBOOK_DIR}/${COOKBOOK_NAME}
    COOKBOOK_REPOSITORY_PARENT_DIR=${COOKBOOK_DIR}/.repository
    COOKBOOK_REPOSITORY_DIR=${COOKBOOK_REPOSITORY_PARENT_DIR}/${COOKBOOK_NAME}
    UPDATE_AVAILABLE_FILE=${COOKBOOK_WORKSPACE_DIR}/UPDATE_AVAILABLE.txt
    NODE_HOSTNAME_PREFIX=$(hostname -s) # Short host name --> name of compute node: c###-###
    NODE_HOSTNAME_DOMAIN=$(hostname -d) # DNS name --> e.g. stampede2.tacc.utexas.edu
    NODE_HOSTNAME_LONG=$(hostname -f)   # Fully qualified domain name --> c###-###.stampede2.tacc.utexas.edu
    export COOKBOOK_NAME
    export COOKBOOK_DIR
    export COOKBOOK_WORKSPACE_DIR
    export COOKBOOK_REPOSITORY_DIR
    export COOKBOOK_REPOSITORY_PARENT_DIR
    export UPDATE_AVAILABLE_FILE
    export NODE_HOSTNAME_PREFIX
    export NODE_HOSTNAME_DOMAIN
    export NODE_HOSTNAME_LONG
    export COOKBOOK_CONDA_ENV
}
function clone_cookbook_on_workspace() {
    DATE_FILE_SUFFIX=$(date +%Y%m%d%H%M%S)
    if [ ! -d "$COOKBOOK_WORKSPACE_DIR" ]; then
        git clone "${GIT_REPO_URL}" --branch "${GIT_BRANCH}" "${COOKBOOK_WORKSPACE_DIR}"
    else
        if [ "${DOWNLOAD_LATEST_VERSION}" = "true" ]; then
            # Keep the previous copy under a timestamped name and clone a fresh one.
            mv "${COOKBOOK_WORKSPACE_DIR}" "${COOKBOOK_WORKSPACE_DIR}-${DATE_FILE_SUFFIX}"
            git clone "${GIT_REPO_URL}" --branch "${GIT_BRANCH}" "${COOKBOOK_WORKSPACE_DIR}"
        fi
    fi
}
function clone_cookbook_on_archive() {
    if [ ! -d "${COOKBOOK_REPOSITORY_DIR}" ]; then
        mkdir -p "${COOKBOOK_REPOSITORY_DIR}"
        git clone "${GIT_REPO_URL}" --branch "${GIT_BRANCH}" "${COOKBOOK_REPOSITORY_DIR}"
    else
        git -C "${COOKBOOK_REPOSITORY_DIR}" pull origin "${GIT_BRANCH}"
    fi
}
function init_directory() {
    mkdir -p "${COOKBOOK_REPOSITORY_PARENT_DIR}"
    clone_cookbook_on_workspace
}
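# The TAP helpers below assume a TACC batch environment: SLURM_JOB_ID is set by the
# scheduler, the TLS certificate is staged under ${HOME}/.tap by the TAP launcher, and
# tap_get_token / tap_get_port come from the tap_functions file sourced in
# load_tap_functions.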
function get_tap_certificate() {
    mkdir -p ${HOME}/.tap # this should exist at this point, but just in case...
    export TAP_CERTFILE=${HOME}/.tap/.${SLURM_JOB_ID}
    # bail if we cannot create a secure session
    if [ ! -f ${TAP_CERTFILE} ]; then
        echo "TACC: ERROR - could not find TLS cert for secure session"
        echo "TACC: job ${SLURM_JOB_ID} execution finished at: $(date)"
        exit 1
    fi
}
function get_tap_token() {
    # bail if we cannot create a token for the session
    TAP_TOKEN=$(tap_get_token)
    if [ -z "${TAP_TOKEN}" ]; then
        echo "TACC: ERROR - could not generate token for jupyter session"
        echo "TACC: job ${SLURM_JOB_ID} execution finished at: $(date)"
        exit 1
    fi
    echo "TACC: using token ${TAP_TOKEN}"
    LOGIN_PORT=$(tap_get_port)
    export TAP_TOKEN
    export LOGIN_PORT
}
function load_tap_functions() {
    TAP_FUNCTIONS="/share/doc/slurm/tap_functions"
    if [ -f ${TAP_FUNCTIONS} ]; then
        . ${TAP_FUNCTIONS}
    else
        echo "TACC:"
        echo "TACC: ERROR - could not find TAP functions file: ${TAP_FUNCTIONS}"
        echo "TACC: ERROR - Please submit a consulting ticket at the TACC user portal"
        echo "TACC: ERROR - https://portal.tacc.utexas.edu/tacc-consulting/-/consult/tickets/create"
        echo "TACC:"
        echo "TACC: job $SLURM_JOB_ID execution finished at: $(date)"
        exit 1
    fi
}
function create_jupyter_configuration() {
    mkdir -p "${HOME}/.tap"
    TAP_JUPYTER_CONFIG="${HOME}/.tap/jupyter_config.py"
    JUPYTER_SERVER_APP="ServerApp"
    JUPYTER_BIN="jupyter-lab"
    LOCAL_PORT=5902
    echo "${PWD}"
    # Heredoc content is kept unindented so the generated Python config has no leading whitespace.
    cat <<-EOF >"${TAP_JUPYTER_CONFIG}"
# Configuration file for TAP jupyter session
import ssl
c = get_config()
c.IPKernelApp.pylab = "inline"  # if you want plotting support always
c.${JUPYTER_SERVER_APP}.ip = "0.0.0.0"
c.${JUPYTER_SERVER_APP}.port = $LOCAL_PORT
c.${JUPYTER_SERVER_APP}.open_browser = False
c.${JUPYTER_SERVER_APP}.allow_origin = u"*"
c.${JUPYTER_SERVER_APP}.ssl_options = {"ssl_version": ssl.PROTOCOL_TLSv1_2}
c.${JUPYTER_SERVER_APP}.root_dir = "${_tapisJobWorkingDir}"
c.${JUPYTER_SERVER_APP}.preferred_dir = "${_tapisJobWorkingDir}"
c.${JUPYTER_SERVER_APP}.notebook_dir = "${_tapisJobWorkingDir}/work"
c.FileContentsManager.delete_to_trash = False
c.IdentityProvider.token = "${TAP_TOKEN}"
c.MultiKernelManager.default_kernel_name = "${COOKBOOK_CONDA_ENV}"
EOF
}
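# run_jupyter starts jupyter-lab with the TAP certificate and the configuration file
# generated above, logging to ${HOME}/.jupyter/<node>.log; if the process is not
# visible shortly after launch it is retried once before giving up.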
function run_jupyter() {
    conda activate "${COOKBOOK_CONDA_ENV}"
    NB_SERVERDIR=$HOME/.jupyter
    JUPYTER_SERVER_APP="ServerApp"
    JUPYTER_BIN="jupyter-lab"
    JUPYTER_ARGS="--certfile=$(cat ${TAP_CERTFILE}) --config=${TAP_JUPYTER_CONFIG} --notebook-dir=${COOKBOOK_DIR} --preferred-dir=${COOKBOOK_DIR}"
    JUPYTER_LOGFILE=${NB_SERVERDIR}/${NODE_HOSTNAME_PREFIX}.log
    mkdir -p "${NB_SERVERDIR}"
    touch "$JUPYTER_LOGFILE"
    nohup ${JUPYTER_BIN} ${JUPYTER_ARGS} &>"${JUPYTER_LOGFILE}" &
    JUPYTER_PID=$!
    # give jupyter a moment to start (or to fail) before checking on it
    sleep 5
    # verify jupyter is up. if not, give one more try, then bail
    if ! ps -fu "${USER}" | grep "${JUPYTER_BIN}" | grep -qv grep; then
        # sometimes jupyter has a bad day. give it another chance to be awesome.
        echo "TACC: first jupyter launch failed. Retrying..."
        nohup ${JUPYTER_BIN} ${JUPYTER_ARGS} &>"${JUPYTER_LOGFILE}" &
        JUPYTER_PID=$!
        sleep 5
    fi
    if ! ps -fu "${USER}" | grep "${JUPYTER_BIN}" | grep -qv grep; then
        # jupyter will not be working today. sadness.
        echo "TACC: ERROR - jupyter failed to launch"
        echo "TACC: ERROR - this is often due to an issue in your python or conda environment"
        echo "TACC: ERROR - jupyter logfile contents:"
        cat "${JUPYTER_LOGFILE}"
        echo "TACC: job ${SLURM_JOB_ID} execution finished at: $(date)"
        exit 1
    fi
}
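# port_forwarding opens reverse SSH tunnels from the login nodes so that the
# TAP-assigned LOGIN_PORT reaches the local jupyter port (5902) on this compute node;
# two tunnels are expected, one per login node.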
function port_forwarding() {
    LOCAL_PORT=5902
    # Disable exit on error so we can check the ssh tunnel status.
    set +e
    for i in $(seq 2); do
        ssh -o StrictHostKeyChecking=no -q -f -g -N -R "${LOGIN_PORT}:${NODE_HOSTNAME_PREFIX}:${LOCAL_PORT}" "login${i}"
    done
    if [ "$(ps -fu ${USER} | grep ssh | grep login | grep -vc grep)" -ne 2 ]; then
        # jupyter will not be working today. sadness.
        echo "TACC: ERROR - ssh tunnels failed to launch"
        echo "TACC: ERROR - this is often due to an issue with your ssh keys"
        echo "TACC: ERROR - undo any recent mods in ${HOME}/.ssh"
        echo "TACC: ERROR - or submit a TACC consulting ticket with this error"
        echo "TACC: job ${SLURM_JOB_ID} execution finished at: $(date)"
        exit 1
    fi
    # Re-enable exit on error.
    set -e
}
function send_url_to_webhook() {
    JUPYTER_URL="https://${NODE_HOSTNAME_DOMAIN}:${LOGIN_PORT}/?token=${TAP_TOKEN}"
    INTERACTIVE_WEBHOOK_URL="${_webhook_base_url}"
    # Wait a few seconds for jupyter to boot up, then send the callback URL for the job-ready notification.
    # The notification is sent to INTERACTIVE_WEBHOOK_URL, e.g. https://3dem.org/webhooks/interactive/
    (
        sleep 5 &&
            curl -k --data "event_type=interactive_session_ready&address=${JUPYTER_URL}&owner=${_tapisJobOwner}&job_uuid=${_tapisJobUUID}" "${INTERACTIVE_WEBHOOK_URL}" &
    ) &
}
function session_cleanup() {
    # This file will be located in the directory mounted by the job.
    SESSION_FILE=delete_me_to_end_session
    touch $SESSION_FILE
    echo "$NODE_HOSTNAME_LONG $JUPYTER_PID" >$SESSION_FILE
    # While the session file remains undeleted, keep the Jupyter session running.
    while [ -f $SESSION_FILE ]; do
        sleep 10
    done
}
function conda_environment_exists() {
    conda env list | grep "${COOKBOOK_CONDA_ENV}"
}
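# The cookbook environment is built from the repository's .binder/environment.yml and
# requirements.txt, then registered as a named Jupyter kernel so it matches the
# default_kernel_name written into the TAP jupyter configuration above.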
function create_conda_environment() {
    conda env create -n ${COOKBOOK_CONDA_ENV} -f $COOKBOOK_WORKSPACE_DIR/.binder/environment.yml --yes
    conda activate ${COOKBOOK_CONDA_ENV}
    conda install jupyterlab ipykernel --yes
    pip install --no-cache-dir -r $COOKBOOK_WORKSPACE_DIR/.binder/requirements.txt
    python -m ipykernel install --user --name "${COOKBOOK_CONDA_ENV}" --display-name "Python (${COOKBOOK_CONDA_ENV})"
}
function delete_conda_environment() {
    conda deactivate
    conda env remove -n ${COOKBOOK_CONDA_ENV}
}
function handle_installation() {
    if [ "${UPDATE_CONDA_ENV}" = "true" ]; then
        if { conda_environment_exists; } >/dev/null 2>&1; then
            delete_conda_environment
        fi
        create_conda_environment
    else
        if { conda_environment_exists; } >/dev/null 2>&1; then
            echo "Conda environment already exists"
        else
            create_conda_environment
        fi
    fi
}
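# Relocate ~/.cache to $SCRATCH and symlink it back, presumably to keep large package
# and model caches out of the (much smaller) $HOME quota.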
function set_up_cache_directories() {
    # if the user has a cache directory and it is not a symlink, move it to scratch and create a symlink
    if [ -d "${HOME}/.cache" ] && [ ! -L "${HOME}/.cache" ]; then
        mv "${HOME}/.cache" "${SCRATCH}/"
        ln -s "${SCRATCH}/.cache" "${HOME}/.cache"
    fi
    # if the cache directory does not exist, create it
    if [ ! -d "${HOME}/.cache" ]; then
        mkdir -p "${SCRATCH}/.cache"
        ln -s "${SCRATCH}/.cache" "${HOME}/.cache"
    fi
    # if [ ! -d "${COOKBOOK_WORKSPACE_DIR}/home" ] && [ ! -L "${COOKBOOK_WORKSPACE_DIR}/home" ]; then
    #     echo "HOME LN created"
    #     ln -s $HOME "${COOKBOOK_WORKSPACE_DIR}/home"
    # fi
    # if [ ! -d "${COOKBOOK_WORKSPACE_DIR}/work" ] && [ ! -L "${COOKBOOK_WORKSPACE_DIR}/work" ]; then
    #     ln -s $WORK "${COOKBOOK_WORKSPACE_DIR}/work"
    # fi
    # if [ ! -d "${COOKBOOK_WORKSPACE_DIR}/scratch" ] && [ ! -L "${COOKBOOK_WORKSPACE_DIR}/scratch" ]; then
    #     ln -s $SCRATCH "${COOKBOOK_WORKSPACE_DIR}/scratch"
    # fi
    # if [ ! -d "${COOKBOOK_WORKSPACE_DIR}/shared" ] && [ ! -L "${COOKBOOK_WORKSPACE_DIR}/shared" ]; then
    #     ln -s "/corral-repl/tacc/aci/PT2050/projects" "${COOKBOOK_WORKSPACE_DIR}/shared"
    # fi
}
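# start_ollama keeps a copy of the ollama binary in $SCRATCH, starts the server in the
# background, and kicks off a background pull of the mixtral model (a large download,
# so it is not waited on here).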
function start_ollama() {
    if [ ! -f $SCRATCH/ollama ]; then
        wget "https://github.com/ollama/ollama/releases/download/v0.4.1/ollama-linux-amd64.tgz"
        tar -xvzf ollama-linux-amd64.tgz
        chmod 755 ./bin/ollama
        mv ./bin/ollama $SCRATCH/ollama
    fi
    nohup $SCRATCH/ollama serve &
    # give the ollama server a moment to come up before requesting the model pull
    sleep 5
    nohup $SCRATCH/ollama pull mixtral &
}
function get_elapsed_time() {
    start_time=$1
    end_time=$(date +%s)
    elapsed_time=$((end_time - start_time))
    minutes=$((elapsed_time / 60))
    echo "Elapsed time: $minutes minutes"
}
function pre_start() {
    conda activate "${COOKBOOK_CONDA_ENV}"
    python -m spacy download en_core_web_sm
}
# Parameters
export DOWNLOAD_LATEST_VERSION=$1
export UPDATE_CONDA_ENV=$2
export GIT_REPO_URL=$3
export GIT_BRANCH=$4
# Execution
install_conda
load_cuda
export_repo_variables
set_up_cache_directories
init_directory
load_tap_functions
get_tap_certificate
get_tap_token
create_jupyter_configuration
handle_installation
pre_start
run_jupyter
port_forwarding
start_ollama
send_url_to_webhook
get_elapsed_time $start_time
session_cleanup