Skip to content

Commit

Permalink
ocl: fixed composing build flags and other improvements
Browse files Browse the repository at this point in the history
* Account for initial flags (c_dbcsr_acc_opencl_flags_atomics).
* Improved support for multiple processes (auto-tune).
  • Loading branch information
hfp committed Mar 18, 2024
1 parent d82fe1e commit 1b3d868
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 43 deletions.
2 changes: 1 addition & 1 deletion .ci/daint.cscs.ch/ocl.build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ if [ ! -d "${HOME}/libxsmm" ]; then
fi
cd "${HOME}/libxsmm"
git fetch
git checkout 05705477183444a82c8d9be8d7c2627efd6d67fa
git checkout 6c55e168d2053fa44f60f6985c370303bd84f9c1
make -j
cd ..

Expand Down
2 changes: 1 addition & 1 deletion src/acc/opencl/acc_opencl.c
Original file line number Diff line number Diff line change
Expand Up @@ -1290,7 +1290,7 @@ int c_dbcsr_acc_opencl_flags_atomics(const c_dbcsr_acc_opencl_device_t* devinfo,
}
assert(NULL != atomic_exp);
/* compose build parameters and flags */
result = LIBXSMM_SNPRINTF(flags, flags_maxlen, "-DTAN=%i %s %s -D\"ATOMIC_ADD_GLOBAL(A,B)=%s\" %s", kind, atomic_type,
result = LIBXSMM_SNPRINTF(flags, flags_maxlen, " -DTAN=%i %s %s -D\"ATOMIC_ADD_GLOBAL(A,B)=%s\" %s", kind, atomic_type,
atomic_ops, atomic_exp, barrier_expr);
}
}
Expand Down
12 changes: 6 additions & 6 deletions src/acc/opencl/smm/tune_multiply.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
from opentuner import Result
from signal import signal, SIGINT
import tempfile
import socket
import shutil
import copy
import json
Expand Down Expand Up @@ -176,16 +175,15 @@ def __init__(self, args):
): # setup database (DB)
if args.database is None: # adjust DB-location
envrank = os.getenv("PMI_RANK", os.getenv("OMPI_COMM_WORLD_LOCAL_RANK"))
directory = "{}-{}".format(dbdir, os.getenv("HOSTNAME"))
if envrank:
self.idevice = int(envrank) % self.ndevices
directory = "{}-{}.db".format(dbdir, self.idevice)
else:
directory = "{}.db".format(dbdir)
directory += ".{}".format(self.idevice)
if os.path.isdir(directory):
shutil.rmtree(directory)
os.mkdir(directory)
self.args.database = "sqlite:///" + os.path.join(
directory, "{}.db".format(socket.gethostname())
directory, "{}.db".format(os.getpid())
)
if not self.args.label: # label for DB-session
self.args.label = "{}-{}-{}-s{}".format(
Expand Down Expand Up @@ -436,7 +434,7 @@ def merge_jsons(self, filenames):
s = 0
if 0 < gflops:
g = int(filename.split("-")[-1].split("g")[0])
s = gflops / g # slowdown
s = gflops / g if 0 < g else 0 # slowdown
if mtime < os.path.getmtime(filename):
if 0 < s:
retsld[1] = retsld[1] + math.log(s)
Expand Down Expand Up @@ -842,6 +840,8 @@ def handle_sigint(self, signum, frame):
# OPENCL_LIBSMM_SMM_xx=tune|enabled|on must be given to permit tuning)
if os.getenv("OPENCL_LIBSMM_SMM_WS") not in default_enable_tune:
os.environ["OPENCL_LIBSMM_SMM_WS"] = "{}".format(args.ws)
if os.getenv("OPENCL_LIBSMM_SMM_AL") not in default_enable_tune:
os.environ["OPENCL_LIBSMM_SMM_AL"] = "{}".format(args.al)
# fix tunables according to level of tuning
if 1 <= args.tlevel or 0 > args.tlevel:
os.environ["OPENCL_LIBSMM_SMM_BM"] = "{}".format(args.bm)
Expand Down
78 changes: 43 additions & 35 deletions src/acc/opencl/smm/tune_multiply.sh
Original file line number Diff line number Diff line change
Expand Up @@ -83,39 +83,19 @@ then
break;;
esac
done
# how to print standard vs error messages
if [ ! "${HELP}" ] || [ "0" = "${HELP}" ]; then
ECHO=">&2 echo"
else
ECHO="echo"
fi
eval "${ECHO} \"Usage: $0 [options] [<triplet-spec>]\""
eval "${ECHO} \" Options must precede triplet specification\""
eval "${ECHO} \" -w|--wait N: initial delay before auto-tuning (default: ${WAIT_DEFAULT} s)\""
eval "${ECHO} \" -c|--continue: proceed with plan if tuning is interrupted\""
eval "${ECHO} \" -u|--update: retune all JSONs found in directory (see -p)\""
eval "${ECHO} \" -s|--batchsize N: Number of batched SMMs (a.k.a. stacksize)\""
eval "${ECHO} \" -a|--tuning-level N=0..3: all, most, some, least tunables\""
eval "${ECHO} \" -b|--backwards: tune in descending order of triplets\""
eval "${ECHO} \" -t|--maxtime N: number of seconds spent per kernel\""
eval "${ECHO} \" -p|--jsondir P: path to JSON-files (tuned params)\""
eval "${ECHO} \" -i|--part N (1-based): Nth session out of nparts\""
eval "${ECHO} \" -j|--nparts N: number of total sessions (see -i)\""
eval "${ECHO} \" -r|--bound L U: limit L**3 < MNK <= U**3\""
eval "${ECHO} \" -m|--limit N: limit any shape extent to N\""
eval "${ECHO} \"       -n|--triplets N: limit number of triplets\""
eval "${ECHO} \" -k|--specid N: predefined triplets\""
eval "${ECHO} \" 0-10: older to newer (larger), e.g.,\""
eval "${ECHO} \" 0: 201 kernels\""
eval "${ECHO} \" 10: 1266 kernels\""
eval "${ECHO} \" <triplet-spec>, e.g., 134 kernels\""
eval "${ECHO} \" 23, 5 32 13 24 26, 4 9\""
eval "${ECHO}"
# default settings
# default/basic settings
if [ ! "${BATCHSIZE}" ]; then BATCHSIZE=0; fi
if [ ! "${JSONDIR}" ]; then JSONDIR=.; fi
if [ ! "${TLEVEL}" ]; then TLEVEL=-1; fi
if [ ! "${NPARTS}" ]; then NPARTS=1; fi
if [ ! "${PART}" ]; then PART=1; fi
if [ ! "${NPARTS}" ]; then NPARTS=${PMI_SIZE:-1}; fi
if [ ! "${PART}" ]; then PART=${PMI_RANK:-0}; PART=$((PART+1)); fi
if [ ! "${WAIT}" ] && [ "1" = "${NPARTS}" ]; then WAIT=0; fi
# sanity checks
if [ "0" != "$((NPARTS<PART))" ]; then
>&2 echo "ERROR: part-number ${PART} is larger than the requested ${NPARTS} parts!"
Expand All @@ -131,7 +111,6 @@ then
exit 1
elif [ ! "${HELP}" ] || [ "0" = "${HELP}" ]; then
if [ "${UPDATE}" ] && [ "0" != "${UPDATE}" ]; then
if [ ! "${TLEVEL}" ] || [ "0" != "$((0>TLEVEL))" ]; then TLEVEL=1; fi
MNKS=$(${SED} -n "s/.*tune_multiply-..*-\(..*x..*x.[^-]*\)-..*gflops\.json/\1/p" <<<"${JSONS}" \
| ${SORT} -u -n -tx -k1,1 -k2,2 -k3,3)
elif [ "${SPECID}" ]; then
Expand All @@ -142,6 +121,30 @@ then
else
exit 0
fi
if [ ! "${WAIT}" ]; then
eval "${ECHO} \"Usage: $0 [options] [<triplet-spec>]\""
eval "${ECHO} \" Options must precede triplet specification\""
eval "${ECHO} \" -w|--wait N: initial delay before auto-tuning (default: ${WAIT_DEFAULT} s)\""
eval "${ECHO} \" -c|--continue: proceed with plan if tuning is interrupted\""
eval "${ECHO} \" -u|--update: retune all JSONs found in directory (see -p)\""
eval "${ECHO} \" -s|--batchsize N: Number of batched SMMs (a.k.a. stacksize)\""
eval "${ECHO} \" -a|--tuning-level N=0..3: all, most, some, least tunables\""
eval "${ECHO} \" -b|--backwards: tune in descending order of triplets\""
eval "${ECHO} \" -t|--maxtime N: number of seconds spent per kernel\""
eval "${ECHO} \" -p|--jsondir P: path to JSON-files (tuned params)\""
eval "${ECHO} \" -i|--part N (1-based): Nth session out of nparts\""
eval "${ECHO} \" -j|--nparts N: number of total sessions (see -i)\""
eval "${ECHO} \" -r|--bound L U: limit L**3 < MNK <= U**3\""
eval "${ECHO} \" -m|--limit N: limit any shape extent to N\""
eval "${ECHO} \"       -n|--triplets N: limit number of triplets\""
eval "${ECHO} \" -k|--specid N: predefined triplets\""
eval "${ECHO} \" 0-10: older to newer (larger), e.g.,\""
eval "${ECHO} \" 0: 201 kernels\""
eval "${ECHO} \" 10: 1266 kernels\""
eval "${ECHO} \" <triplet-spec>, e.g., 134 kernels\""
eval "${ECHO} \" 23, 5 32 13 24 26, 4 9\""
eval "${ECHO}"
fi
if [ "${MNKS}" ]; then
if [ "${BOUNDL}" ] || [ "${BOUNDU}" ]; then
if [ ! "${BOUNDL}" ]; then BOUNDL=0; elif [ ! "${BOUNDU}" ]; then BOUNDU=0; fi
Expand Down Expand Up @@ -187,10 +190,12 @@ then
PARTSIZE=$(((NTRIPLETS+NPARTS-1)/NPARTS))
PARTOFFS=$(((PART-1)*PARTSIZE))
PARTSIZE=$((PARTSIZE<=(NTRIPLETS-PARTOFFS)?PARTSIZE:(NTRIPLETS-PARTOFFS)))
if [ "0" != "$((NPARTS<=NTRIPLETS))" ]; then
echo "Session ${PART} of ${NPARTS} part(s)."
else
echo "Session ${PART} of ${NPARTS} part(s). The problem is over-decomposed!"
if [ ! "${WAIT}" ] || [ "0" != "${WAIT}" ]; then
if [ "0" != "$((NPARTS<=NTRIPLETS))" ]; then
echo "Session ${PART} of ${NPARTS} part(s)."
else
echo "Session ${PART} of ${NPARTS} part(s). The problem is over-decomposed!"
fi
fi
if [ ! "${MAXTIME}" ] && [[ (! "${CONTINUE}" || \
"${CONTINUE}" = "false" || \
Expand All @@ -200,10 +205,12 @@ then
MAXTIME=160
fi
if [ "${MAXTIME}" ] && [ "0" != "$((0<MAXTIME))" ]; then
HRS=$((MAXTIME*PARTSIZE/3600))
MNS=$(((MAXTIME*PARTSIZE-HRS*3600+59)/60))
echo "Tuning ${PARTSIZE} kernels in this session will take about" \
"${MAXTIME}s per kernel and ${HRS}h${MNS}m in total."
if [ ! "${WAIT}" ] || [ "0" != "${WAIT}" ]; then
HRS=$((MAXTIME*PARTSIZE/3600))
MNS=$(((MAXTIME*PARTSIZE-HRS*3600+59)/60))
echo "Tuning ${PARTSIZE} kernels in this session will take about" \
"${MAXTIME}s per kernel and ${HRS}h${MNS}m in total."
fi
MAXTIME="--stop-after=${MAXTIME}"
else
echo "Tuning ${PARTSIZE} kernels will take an unknown time (no limit given)."
Expand All @@ -227,8 +234,9 @@ then
MNKPART=$(${CUT} -d' ' -f $((PARTOFFS+1))-$((PARTOFFS+PARTSIZE)) <<<"${MNKS}")
for MNK in ${MNKPART}; do
if [ "0" != "$(((N)<PARTSIZE))" ]; then
if [ "1" != "${NPARTS}" ] && [ "${HOSTNAME}" ]; then STEP="@${HOSTNAME}"; fi
echo
echo "[$((N+1))/${PARTSIZE}]: auto-tuning ${MNK}-kernel..."
echo "[$((N+1))/${PARTSIZE}]${STEP}: auto-tuning ${MNK}-kernel..."
# avoid mixing database of previous results into new session
${RM} -rf ./opentuner.db
eval "${HERE}/tune_multiply.py ${MNK} ${DELETE} -p ${JSONDIR} -s ${BATCHSIZE} -a ${TLEVEL} ${MAXTIME}"
Expand Down

0 comments on commit 1b3d868

Please sign in to comment.