Skip to content

Commit

Permalink
add stuff for roofline analysis
Browse files Browse the repository at this point in the history
  • Loading branch information
Boro Sofranac committed Sep 23, 2021
1 parent 0f19beb commit 59505d0
Show file tree
Hide file tree
Showing 8 changed files with 245 additions and 23 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
fileReader/venv
.idea/
build/
cmake-build-debug/
*__pycache__*
193 changes: 193 additions & 0 deletions fileReader/analyze_roofline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
import os
from matplotlib import pyplot as plt
from regexes import *

# Peak hardware figures of the V100 GPU; they define the roofline ceilings
# used throughout this script.
# NOTE(review): 7800 GFLOPS is presumably the double-precision peak and the
# commented-out 15700 the single-precision one -- confirm against the data sheet.
V100_peak_flops = 7800.0 # GFLOPS
#V100_peak_flops = 15700.0 # GFLOPS
V100_peak_bandwidth = 900.0 # GB/s
# Machine balance = peak compute / peak bandwidth, i.e. GFLOPS / (GB/s) = FLOP/byte.
V100_machine_balance = V100_peak_flops/V100_peak_bandwidth
# This print sits at module level, so it also runs whenever the module is imported.
print("V100 machine balance: ", V100_machine_balance)

# unit conversion dict to GB/s or s
# Maps a unit string to a scale factor relative to the base unit (GB/s for
# throughputs, s for times).
# NOTE(review): given the stated target units the factors look like divisors
# (value / factor -> GB/s or s), not multipliers -- confirm before using.
# The dict is not referenced by the other code visible in this file.
units = {
'B/s': 1e9,
'KB/s': 1e6,
'MB/s': 1e3,
'GB/s': 1,
'us': 1e6,
'ms': 1e3,
's':1
}


def frange(start, stop, step=1.0):
    """Yield evenly spaced numbers over the half-open interval [start, stop).

    Floating-point counterpart of ``range``: the first value produced is
    ``start`` itself and ``stop`` is never produced.

    Args:
        start: first value of the sequence.
        stop: exclusive upper bound.
        step: positive distance between consecutive values.

    Yields:
        ``start + i * step`` for i = 0, 1, 2, ... while the value is below
        ``stop``.

    Raises:
        ValueError: if ``step`` is not positive (raised when iteration
            starts, because this is a generator).
    """
    if step <= 0:
        raise ValueError("step must be positive, got {!r}".format(step))
    index = 0
    value = start
    while value < stop:
        yield value
        index += 1
        # Recompute from the start instead of accumulating, so that rounding
        # errors do not build up over many steps.
        value = start + index * step


def to_byte_per_s(val, unit):
    """Convert a throughput value to bytes per second.

    Args:
        val: numeric throughput expressed in ``unit``.
        unit: one of 'B/s', 'KB/s', 'MB/s' or 'GB/s' (decimal prefixes).

    Returns:
        The throughput in B/s; a value already in 'B/s' is returned untouched.

    Raises:
        Exception: if ``unit`` is none of the supported strings.
    """
    if unit == 'B/s':
        return val

    # Decimal (power-of-ten) multipliers for the prefixed units.
    multipliers = {'GB/s': 1e9, 'MB/s': 1e6, 'KB/s': 1e3}
    if unit not in multipliers:
        raise Exception("Unknown unit ", unit)
    return val * multipliers[unit]


def to_seconds(val, unit):
    """Convert a duration to seconds.

    Args:
        val: numeric duration expressed in ``unit``.
        unit: one of 's', 'ms', 'us' or 'ns'.

    Returns:
        The duration in seconds; a value already in 's' is returned untouched.

    Raises:
        ValueError: if ``unit`` is not supported. Failing here gives a clear
            message instead of returning None and breaking later arithmetic.
    """
    if unit == 's':
        return val

    divisors = {'ms': 1e3, 'us': 1e6, 'ns': 1e9}
    if unit not in divisors:
        raise ValueError("Unknown time unit {!r}".format(unit))
    return val / divisors[unit]


def plot_roofline():
    """Plot the V100 roofline ceiling on log-log axes and display it.

    The ceiling is ``min(V100_peak_bandwidth * AI, V100_peak_flops)``, sampled
    for arithmetic intensities between 2**-6 and 2**5 FLOP/byte. The function
    takes no arguments, returns nothing and ends with ``plt.show()``.
    """
    fig = plt.figure(frameon=False)
    ax = fig.add_subplot(1, 1, 1)
    xticks = [2.**i for i in range(-6, 6)]
    ax.set_xlabel('arithmetic intensity [FLOP/byte]')
    # The peak constants are given in GFLOPS and GB/s, so the curve is in GFLOP/s.
    ax.set_ylabel('performance [GFLOP/s]')

    # Upper bound: bandwidth-limited slope capped by the compute peak.
    intensities = list(frange(min(xticks), max(xticks), 0.01))
    ceiling = [min(V100_peak_bandwidth * ai, float(V100_peak_flops)) for ai in intensities]
    ax.plot(intensities, ceiling)

    try:
        ax.set_xscale('log', base=2)
    except (TypeError, ValueError):
        # Older Matplotlib releases only accept the axis-specific keyword.
        ax.set_xscale('log', basex=2)
    ax.set_yscale('log')
    ax.set_xlim(min(xticks), max(xticks))
    ax.set_xticks(xticks)
    ax.grid(axis='x', alpha=0.7, linestyle='--')
    # To write the figure to disk instead, call fig.savefig('out.pdf') here.
    plt.show()


class RunData:
    """Profiler figures and derived roofline metrics for one problem instance.

    All attributes start as None and are filled in by the log parsers and by
    ``compute_metrics``.
    """

    def __init__(self):
        # Name of the problem file this run belongs to.
        self.prob_name = None
        # Number of profiled kernel invocations, as reported by the profiler.
        self.num_invocation = None
        # Average floating point operation count.
        self.flops = None
        # DR / DW: average device-memory read / write transaction counts.
        # TR / TW: average device-memory read / write throughput, expressed
        # in TR_unit / TW_unit (e.g. 'GB/s').
        self.DR = None
        self.TR = None
        self.TR_unit = None
        self.DW = None
        self.TW = None
        self.TW_unit = None
        # Average kernel runtime and its unit ('s', 'ms' or 'us').
        self.runtime = None
        self.runtime_unit = None
        # NOTE(review): not assigned by any code visible in this file.
        self.total_mem = None
        self.total_mem_unit = None

        # Derived by compute_metrics().
        self.AI = None                     # arithmetic intensity, FLOP/byte
        self.attainable_perf = None        # roofline ceiling at AI, GFLOP/s
        self.achieve_perf = None           # measured performance, GFLOP/s
        self.achieved_perf_percent = None  # achieve_perf as % of attainable_perf

    def print(self):
        """Print the raw profiler figures of this run on one line."""
        # Transaction counts (DR/DW) carry no unit; only the throughputs do.
        print(self.prob_name, ": invocations:", self.num_invocation, ", flops:", self.flops,
              ", DR:", self.DR, ", DW:", self.DW,
              ", TR:", self.TR, self.TR_unit, ", TW:", self.TW, self.TW_unit)

    def compute_metrics(self):
        """Derive arithmetic intensity and roofline performance figures.

        Requires ``flops``, ``TR``/``TW`` with their units and ``runtime``
        with its unit to be set. Fills ``AI``, ``attainable_perf``,
        ``achieve_perf`` and ``achieved_perf_percent``.
        """
        runtime_in_s = to_seconds(self.runtime, self.runtime_unit)

        # Bytes moved = (read + write throughput in B/s) * runtime in s.
        data_movement = (to_byte_per_s(self.TR, self.TR_unit) + to_byte_per_s(self.TW, self.TW_unit)) * runtime_in_s
        self.AI = self.flops / data_movement
        # Alternative based on transaction counts (kept for reference):
        #self.AI = self.flops / ((self.DR + self.DW)*32)

        # GB/s * FLOP/byte = GFLOP/s, capped by the compute peak.
        self.attainable_perf = min(V100_peak_bandwidth * self.AI, float(V100_peak_flops))
        self.achieve_perf = (self.flops/1e9) / runtime_in_s
        self.achieved_perf_percent = (self.achieve_perf / self.attainable_perf) * 100


def get_run_object(output: str) -> RunData:
    """Build a RunData from the profiler output of one problem instance.

    Args:
        output: log text belonging to a single problem instance.

    Returns:
        A RunData with the problem name, invocation count, FLOP count,
        read/write transaction counts and read/write throughputs (with
        units) filled in. Runtime fields are left as None.
    """
    def extract(pattern, group):
        # Every field is looked up in the same chunk of log text.
        return get_regex_result(pattern, output, group)

    parsed = RunData()
    parsed.prob_name = extract(prob_name_pattern, "prob_file")
    parsed.num_invocation = extract(roofline_flops_pattern, "invocations")

    # Go through float() first so a value in scientific notation is accepted.
    flop_text = extract(roofline_flops_pattern, "avg")
    parsed.flops = int(float(flop_text))

    # Read side: transaction count, throughput and throughput unit.
    parsed.DR = int(extract(roofline_DR_pattern, "avg"))
    parsed.TR = float(extract(roofline_TR_pattern, "avg"))
    read_unit = extract(roofline_TR_pattern, "avg_unit")
    parsed.TR_unit = read_unit.strip()

    # Write side: same three figures.
    parsed.DW = int(extract(roofline_DW_pattern, "avg"))
    parsed.TW = float(extract(roofline_TW_pattern, "avg"))
    write_unit = extract(roofline_TW_pattern, "avg_unit")
    parsed.TW_unit = write_unit.strip()

    return parsed


def add_runtime_data_to_run_objects(output: str, run_data_objects):
    """Attach the measured runtime of one instance to its RunData object.

    Args:
        output: runtime-log text belonging to a single problem instance.
        run_data_objects: RunData objects built from the throughput log;
            exactly one of them must have the same problem name.

    The matching object is updated in place (``runtime`` and
    ``runtime_unit``); nothing is returned.
    """
    wanted_name = get_regex_result(prob_name_pattern, output, "prob_file")

    matches = list(filter(lambda candidate: candidate.prob_name == wanted_name, run_data_objects))
    assert len(matches) == 1
    target = matches[0]

    runtime_text = get_regex_result(roofline_runtime_pattern, output, "avg")
    target.runtime = float(runtime_text)
    target.runtime_unit = get_regex_result(roofline_runtime_pattern, output, "avg_unit")


def _read_instance_outputs(log_file):
    """Return the per-instance output chunks found in ``plotter/<log_file>``."""
    with open(os.path.join("plotter", log_file), 'r') as f:
        log_text = f.read()
    return [match.group() for match in re.finditer(roofline_prob_out_pattern, log_text)]


def _has_results(output):
    """Tell whether the instance output reports a finished gpu_atomic run."""
    return get_regex_result(roofline_success_run, output) is not None


def _nnz(output):
    """Return the number of non-zeros reported in the instance output."""
    return int(get_regex_result(nnz_pattern, output, 'nnz'))


if __name__ == "__main__":

    throughput_log_file = "04_08_2021_gpu2_double_roofline.log"
    runtime_log_file = "05_08_2021_gpu2_roofline_runtimes_double.log"
    # Instances with at most this many non-zeros are left out of the analysis.
    min_nnz = 2.5*1e5

    throughput_instances_output = _read_instance_outputs(throughput_log_file)
    runtime_instances_output = _read_instance_outputs(runtime_log_file)

    throughput_instances_output = [out for out in throughput_instances_output if _has_results(out)]
    runtime_instances_output = [out for out in runtime_instances_output if _has_results(out)]
    print("num instances after removing those with no results available: ", len(throughput_instances_output))

    throughput_instances_output = [out for out in throughput_instances_output if _nnz(out) > min_nnz]
    runtime_instances_output = [out for out in runtime_instances_output if _nnz(out) > min_nnz]
    # The message is derived from the threshold so the two cannot drift apart.
    print("num instances after removing those with nnz <= {:g}: ".format(min_nnz), len(throughput_instances_output))

    run_data_objects = [get_run_object(out) for out in throughput_instances_output]

    # add runtime data to objects
    for output in runtime_instances_output:
        add_runtime_data_to_run_objects(output, run_data_objects)

    # Only instances present in both logs can be analysed.
    run_data_objects = [obj for obj in run_data_objects if obj.runtime is not None]

    print("num isntances with both runtime and throughput info: ", len(run_data_objects))
    if not run_data_objects:
        # min()/max() and the averages below are undefined for an empty set.
        raise SystemExit("no instances with both runtime and throughput info; nothing to analyze")

    for obj in run_data_objects:
        obj.compute_metrics()

    AIs = [obj.AI for obj in run_data_objects]
    print("arithemtic intensity: min: ", min(AIs), ", max: ", max(AIs), ", avr: ", sum(AIs)/len(AIs))

    perf_percentages = [obj.achieved_perf_percent for obj in run_data_objects]
    print("achieved performance percentages: min: ", min(perf_percentages), ", max: ", max(perf_percentages), ", avr: ", sum(perf_percentages)/len(perf_percentages))



# AI= calc_AI(flop,DR,DW)
# print("Arithmetic intensity is: ", AI, ". Machine balance is: ", V100_machine_balance)
#
# achieved_perf = get_achieved_perf(flop,runtime)
# attainable_perf = get_attainable_perf(AI)
# print("achieved perf in percentages: ", get_achieved_per_in_percent_to_attainable(achieved_perf,attainable_perf))

4 changes: 2 additions & 2 deletions fileReader/exec_file.sh
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,6 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

printf "\n\n === $FILE ===\n"


#nvprof --profile-child-processes --profile-from-start off --kernels "GPUAtomicDomainPropagation" --metrics flop_count_dp --metrics dram_read_throughput --metrics dram_write_throughput --metrics dram_read_transactions --metrics dram_write_transactions python3 -u "$DIR/run_propagation.py" -f "$FILE" -d "$DATATYPE" -t "$TESTTYPE" -s "$SEED" -c "$SYNCTYPE" 2>&1 | tee -a $LOGFILE
python3 -u "$DIR/run_propagation.py" -f "$FILE" -d "$DATATYPE" -t "$TESTTYPE" -s "$SEED" -c "$SYNCTYPE" 2>&1 | tee -a $LOGFILE

#nvprof --profile-child-processes --profile-from-start off --concurrent-kernels off --openacc-profiling off --print-gpu-summary python3 -u "$DIR/run_propagation.py" -f "$FILE" -d "$DATATYPE" -t "$TESTTYPE" -s "$SEED" -c "$SYNCTYPE" 2>&1 | tee -a $LOGFILE
3 changes: 3 additions & 0 deletions fileReader/exec_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,10 @@ while true ; do
esac
done

DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
for filename in $FILES*; do
printf "\n\n === $filename ===\n"
python3 -u run_propagation.py -f "$filename" -d "$DATATYPE" -t "$TESTTYPE" -s "$SEED" -c "$SYNCTYPE" 2>&1 | tee -a $LOGFILE
#nvprof --profile-child-processes --profile-from-start off --kernels "GPUAtomicDomainPropagation" --metrics flop_count_dp --metrics dram_read_throughput --metrics dram_write_throughput --metrics dram_read_transactions --metrics dram_write_transactions python3 -u "$DIR/run_propagation.py" -f "$filename" -d "$DATATYPE" -t "$TESTTYPE" -s "$SEED" -c "$SYNCTYPE" 2>&1 | tee -a $LOGFILE
# nvprof --profile-child-processes --profile-from-start off --concurrent-kernels off --openacc-profiling off --print-gpu-summary python3 -u "$DIR/run_propagation.py" -f "$filename" -d "$DATATYPE" -t "$TESTTYPE" -s "$SEED" -c "$SYNCTYPE" 2>&1 | tee -a $LOGFILE
done
20 changes: 16 additions & 4 deletions fileReader/plotter/plot_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@


def get_linestyle(algorithm, machine):
if 'fastmath' in machine:

if any(x in machine for x in ['fastmath', 'megakernel']):
return 'dotted'
elif algorithm == 'cpu_omp' or any(x in machine for x in ['single', 'gpu_loop', '8thrds']):
return 'dashed'
Expand All @@ -66,6 +67,13 @@ def get_line_color(machine, algorithm):
if algorithm == 'papilo':
return 'tab:red'

if 'cpu_loop' in machine:
return 'tab:green'
if 'gpu_loop' in machine:
return 'tab:red'
if 'megakernel' in machine:
return 'tab:blue'

if 'seed0' in machine:
return 'tab:blue'
if 'seed1' in machine:
Expand Down Expand Up @@ -213,7 +221,7 @@ def truncate(number, decimals=0):
factor = 10.0 ** decimals
return math.trunc(number * factor) / factor

fig = plt.figure()
fig = plt.figure(figsize=(10,8))
plt.style.use('bmh')
plot_a = True
### Subplot A ###
Expand All @@ -228,10 +236,10 @@ def truncate(number, decimals=0):
label=str(algorithm) + "-" + str(machine), linestyle=get_linestyle(algorithm, machine),
color=get_line_color(machine, algorithm))
# constant one - the baseline case
# plt.plot(np.arange(8), np.ones(8), label="cpu_seq-xeon", linestyle='dashdot', color='tab:gray')
plt.plot(np.arange(8), np.ones(8), label="cpu_seq-xeon", linestyle='dashdot', color='tab:gray')

plt.yscale('log')
yticks = get_y_ticks(ys, 10)
yticks = get_y_ticks(ys, 8)
plt.yticks(yticks, yticks)
plt.xticks(np.arange(len(speedups[algorithm][machine][0])),
map(lambda x: "Set-" + str(x + 1), np.arange(len(speedups[algorithm][machine][0]))))
Expand Down Expand Up @@ -268,6 +276,10 @@ def truncate(number, decimals=0):
rc('font', **{'family': 'serif', 'serif': ['Times']})
rc('text', usetex=True)
ax2.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
from matplotlib.pyplot import figure



plt.show()


Expand Down
13 changes: 13 additions & 0 deletions fileReader/regexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import time
import re

nnz_pattern = r"nnz : (?P<nnz>\d+)\n"

#result_pattern = r"Reding of (?P<prob_file>.*) model done!\nnum vars: (?P<n_vars>\d*)\nnum cons: (?P<n_cons>\d*)\nnnz : (?P<nnz>\d*)\n\n.*\n.*cpu_seq propagation done. Num rounds: (?P<cpu_seq_rounds>\d*)\ncpu_seq execution time : (?P<cpu_seq_time>\d*).*\n\n.*\n.*cpu_omp propagation done. Num rounds: (?P<cpu_omp_rounds>\d*)\ncpu_omp execution time : (?P<cpu_omp_time>\d*).*\n\n.*\n.*gpu_reduction propagation done. Num rounds: (?P<gpu_reduction_rounds>\d*)\ngpu_reduction execution time : (?P<gpu_reduction_time>\d*).*\n\n.*\n.*gpu_atomic propagation done. Num rounds: (?P<gpu_atomic_rounds>\d*)\ngpu_atomic execution time : (?P<gpu_atomic_time>\d*).*\n\n.*\n.*\n.*\nall results match: (?P<results_correct>.*)"
result_pattern = r"Reading of (?P<prob_file>.*) model done!\nnum vars: (?P<n_vars>\d*)\nnum cons: (?P<n_cons>\d*)\nnnz : (?P<nnz>\d*)\n\n.*\n.*cpu_seq propagation done. Num rounds: (?P<cpu_seq_rounds>\d*)\ncpu_seq execution time : (?P<cpu_seq_time>\d*).*\n\n.*\n.*cpu_omp propagation done. Num rounds: (?P<cpu_omp_rounds>\d*)\ncpu_omp execution time : (?P<cpu_omp_time>\d*).*\n\n.*\n.*gpu_atomic propagation done. Num rounds: (?P<gpu_atomic_rounds>\d*)\ngpu_atomic execution time : (?P<gpu_atomic_time>\d*).*\n\ncpu_seq to cpu_omp results match: (?P<dsadasdas>.*)\ncpu_seq to gpu_atomic results match: (?P<dadadas>.*)\nall results match: (?P<results_correct>.*)"
seq_to_omp_pattern = r"cpu_seq to cpu_omp results match: (?P<match>.*)"
Expand Down Expand Up @@ -33,6 +35,17 @@ def round_timestamp_pattern(prop_round, alg): return "Propagation round: {}, {}
no_bdchgs_after_papilo_pattern = r"papilo did not find any bound changes after cpu_seq!"


# Roofline analysis regexes
# These patterns pick values out of profiler log text. The metric patterns
# each match one row made of an invocation count, the metric name, its
# description and then the min, max and avg columns.

# Output chunk of one problem instance: everything between a
# "read with 0 errors" line and the next "Reding lp file" text ((?s) lets
# "." span newlines).
# NOTE(review): an instance that is not followed by another "Reding lp file"
# (e.g. the last one in a log) is not matched; also confirm the spelling
# against the log producer, since result_pattern uses "Reading".
roofline_prob_out_pattern = r"(?s)read with 0 errors\n(.*?)Reding lp file"
# Double- and single-precision FLOP counts; each value is either a plain
# integer or a number in scientific notation (e.g. 1.23e+08).
# NOTE(review): the "." between the digit groups is unescaped throughout this
# block, so it matches any character, not only a decimal point.
roofline_flops_pattern = r"[ ]+(?P<invocations>\d*)[ ]+flop_count_dp[ ]+Floating Point Operations\(Double Precision\)[ ]+(?P<min>\d+.\d+e\+\d+|\d+)[ ]+(?P<max>\d+.\d+e\+\d+|\d+)[ ]+(?P<avg>\d+.\d+e\+\d+|\d+)\n"
roofline_flops_float_pattern = r"[ ]+(?P<invocations>\d*)[ ]+flop_count_sp[ ]+Floating Point Operations\(Single Precision\)[ ]+(?P<min>\d+.\d+e\+\d+|\d+)[ ]+(?P<max>\d+.\d+e\+\d+|\d+)[ ]+(?P<avg>\d+.\d+e\+\d+|\d+)\n"
# Device-memory read / write throughput: a decimal value followed by its unit.
# The *_unit groups use ".*" and can therefore include surrounding spaces, so
# callers should strip them.
roofline_TR_pattern = r"[ ]+(?P<invocations>\d*)[ ]+dram_read_throughput[ ]+Device Memory Read Throughput[ ]+(?P<min>\d+.\d+)(?P<min_unit>.*)[ ]+(?P<max>\d+.\d+)(?P<max_unit>.*)[ ]+(?P<avg>\d+.\d+)(?P<avg_unit>.*)\n"
roofline_TW_pattern = r"[ ]+(?P<invocations>\d*)[ ]+dram_write_throughput[ ]+Device Memory Write Throughput[ ]+(?P<min>\d+.\d+)(?P<min_unit>.*)[ ]+(?P<max>\d+.\d+)(?P<max_unit>.*)[ ]+(?P<avg>\d+.\d+)(?P<avg_unit>.*)\n"
# Device-memory read / write transaction counts (plain integers, no unit).
roofline_DR_pattern = r"[ ]+(?P<invocations>\d*)[ ]+dram_read_transactions[ ]+Device Memory Read Transactions[ ]+(?P<min>\d+)[ ]+(?P<max>\d+)[ ]+(?P<avg>\d+)\n"
roofline_DW_pattern = r"[ ]+(?P<invocations>\d*)[ ]+dram_write_transactions[ ]+Device Memory Write Transactions[ ]+(?P<min>\d+)[ ]+(?P<max>\d+)[ ]+(?P<avg>\d+)\n"
# Marker text used to detect that the gpu_atomic propagation finished.
roofline_success_run = r"gpu_atomic propagation done."
# "GPU activities" summary row of the GPUAtomicDomainPropagation kernel; the
# percentage, total time and call count are skipped, then avg, min and max are
# captured, each with a lower-case unit suffix.
roofline_runtime_pattern = r" GPU activities:[ ]+\d+.\d+\%[ ]+\d+.\d+[a-z]+[ ]+\d+[ ]+(?P<avg>\d+.\d+)(?P<avg_unit>[a-z]+)[ ]+(?P<min>\d+.\d+)(?P<min_unit>[a-z]+)[ ]+(?P<max>\d+.\d+)(?P<max_unit>[a-z]+)[ ]+void GPUAtomicDomainPropagation"

def get_regex_result(regex_string: str, search_string: str, group_name: str = None):
m = re.compile(regex_string).search(search_string)

Expand Down
12 changes: 6 additions & 6 deletions fileReader/run_propagation.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,20 +88,20 @@ def exec_run(
ubs_dis = ubs_seq = ubs_gpuatomic = ubs_gpu = ubs_omp = ubs


(seq_new_lbs, seq_new_ubs) = propagateSequential(n_vars, n_cons, nnz, col_indices, row_ptrs, coeffs, lhss, rhss, lbs_seq, ubs_seq, vartypes, datatype=c_double)
# (seq_new_lbs, seq_new_ubs) = propagateSequential(n_vars, n_cons, nnz, col_indices, row_ptrs, coeffs, lhss, rhss, lbs_seq, ubs_seq, vartypes, datatype=c_double)

(omp_new_lbs, omp_new_ubs) = propagateFullOMP(n_vars, n_cons, nnz, col_indices, row_ptrs, coeffs, lhss, rhss, lbs_omp, ubs_omp, vartypes, datatype=c_double)
#(omp_new_lbs, omp_new_ubs) = propagateFullOMP(n_vars, n_cons, nnz, col_indices, row_ptrs, coeffs, lhss, rhss, lbs_omp, ubs_omp, vartypes, datatype=c_double)

# (gpu_new_lbs, gpu_new_ubs) = propagateGPUReduction(n_vars, n_cons, nnz, col_indices, row_ptrs, coeffs, lhss, rhss, lbs_gpu, ubs_gpu, vartypes, datatype=datatype)

# (dis_new_lbs, dis_new_ubs) = propagateSequentialDisjoint( n_vars, n_cons, nnz, col_indices, row_ptrs, coeffs, lhss, rhss, lbs_dis, ubs_dis, vartypes)
# idx = 1

(gpuatomic_new_lbs, gpuatomic_new_ubs) = propagateGPUAtomic(n_vars, n_cons, nnz, col_indices, row_ptrs, coeffs, lhss, rhss, lbs_gpuatomic, ubs_gpuatomic, vartypes, synctype=synctype, datatype=datatype)
print("")
eq1 = compare_results(seq_new_lbs, seq_new_ubs, omp_new_lbs, omp_new_ubs,"cpu_seq", "cpu_omp")
eq2 = compare_results(seq_new_lbs, seq_new_ubs, gpuatomic_new_lbs, gpuatomic_new_ubs, "cpu_seq", "gpu_atomic")
print("all results match: ", eq1 and eq2)
# print("")
# eq1 = compare_results(seq_new_lbs, seq_new_ubs, omp_new_lbs, omp_new_ubs,"cpu_seq", "cpu_omp")
# eq2 = compare_results(seq_new_lbs, seq_new_ubs, gpuatomic_new_lbs, gpuatomic_new_ubs, "cpu_seq", "gpu_atomic")
# print("all results match: ", eq1 and eq2)

# compare_arrays_diff_idx(seq_new_lbs, omp_new_lbs, "lbs")
# compare_arrays_diff_idx(seq_new_ubs, omp_new_ubs, "ubs")
Expand Down
22 changes: 11 additions & 11 deletions test/testCases/test_end_to_end.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,17 @@ void runAllAlgsAnalyticalSol() {
Problem<datatype> ts_gpu_r;
Problem<datatype> ts_gpu_a;

printf("running cpu_seq");
tester.executeSequentialPropagator(ts_seq);
tester.checkSolution(ts_seq);

printf("running cpu_omp");
tester.executeFullOMPPropagator(ts_omp);
tester.checkSolution(ts_omp);

printf("running gpu_reduction");
tester.executeGPUReduction(ts_gpu_r);
tester.checkSolution(ts_gpu_r);
// printf("running cpu_seq");
// tester.executeSequentialPropagator(ts_seq);
// tester.checkSolution(ts_seq);
//
// printf("running cpu_omp");
// tester.executeFullOMPPropagator(ts_omp);
// tester.checkSolution(ts_omp);
//
// printf("running gpu_reduction");
// tester.executeGPUReduction(ts_gpu_r);
// tester.checkSolution(ts_gpu_r);

printf("running gpu_atomic");
tester.executeAtomicGPUPropagator(ts_gpu_a);
Expand Down

0 comments on commit 59505d0

Please sign in to comment.