diff --git a/.gitignore b/.gitignore index 38af277..e104a66 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ fileReader/venv .idea/ +build/ cmake-build-debug/ *__pycache__* diff --git a/fileReader/analyze_roofline.py b/fileReader/analyze_roofline.py new file mode 100644 index 0000000..02a0cb1 --- /dev/null +++ b/fileReader/analyze_roofline.py @@ -0,0 +1,193 @@ +import os +from matplotlib import pyplot as plt +from regexes import * + +V100_peak_flops = 7800.0 # GFLOPS +#V100_peak_flops = 15700.0 # GFLOPS +V100_peak_bandwidth = 900.0 # GB/s +V100_machine_balance = V100_peak_flops/V100_peak_bandwidth +print("V100 machine balance: ", V100_machine_balance) + +# unit conversion dict to GB/s or s +units = { + 'B/s': 1e9, + 'KB/s': 1e6, + 'MB/s': 1e3, + 'GB/s': 1, + 'us': 1e6, + 'ms': 1e3, + 's':1 +} + + +def frange(start, stop, step=1.0): + f = start + while f < stop: + f += step + yield f + + +def to_byte_per_s(val, unit): + if unit == 'GB/s': + return val * 1e9 + elif unit == 'MB/s': + return val * 1e6 + elif unit == 'KB/s': + return val * 1e3 + elif unit == 'B/s': + return val + else: + raise Exception("Unknown unit ", unit) + + +def to_seconds(val, unit): + if unit == 's': + return val + elif unit == 'ms': + return val / 1e3 + elif unit == 'us': + return val / 1e6 + + +def plot_roofline(): + fig = plt.figure(frameon=False) + ax = fig.add_subplot(1, 1, 1) + yticks_labels = [] + yticks = [] + xticks_labels = [] + xticks = [2.**i for i in range(-6, 6)] + ax.set_xlabel('arithmetic intensity [FLOP/byte]') + ax.set_ylabel('performance [FLOP/s]') + + # Upper bound + x = list(frange(min(xticks), max(xticks), 0.01)) + ax.plot(x, [min(V100_peak_bandwidth*x, float(V100_peak_flops)) for x in x]) + + + ax.set_xscale('log', basex=2) + ax.set_yscale('log') + ax.set_xlim(min(xticks), max(xticks)) + # ax.set_yticks([perf, float(max_flops)]) + ax.set_xticks(xticks) + ax.grid(axis='x', alpha=0.7, linestyle='--') + # fig.savefig('out.pdf') + plt.show() + + +class RunData: + def __init__(self): + self.prob_name = None + self.num_invocation = None + self.flops = None + self.DR = None + self.TR = None + self.TR_unit = None + self.DW = None + self.TW = None + self.TW_unit = None + self.runtime = None + self.runtime_unit = None + self.total_mem = None + self.total_mem_unit = None + + self.AI = None + self.attainable_perf = None + self.achieve_perf = None + self.achieved_perf_percent = None + + def print(self): + print(self.prob_name, ": invocations:", self.num_invocation, ", flops:", self.flops, ", DR:", self.DR, ", DR_unit:", self.DR_unit, ", DW:", self.DW, ", DW_unit", self.DW_unit) + + def compute_metrics(self): + + runtime_in_s = to_seconds(self.runtime, self.runtime_unit) + + data_movement = (to_byte_per_s(self.TR, self.TR_unit) + to_byte_per_s(self.TW, self.TW_unit)) * runtime_in_s + self.AI = self.flops / data_movement + #self.AI = self.flops / ((self.DR + self.DW)*32) + + self.attainable_perf = min(V100_peak_bandwidth * self.AI, float(V100_peak_flops)) # in GB/s + self.achieve_perf = (self.flops/1e9) / runtime_in_s + self.achieved_perf_percent = (self.achieve_perf / self.attainable_perf) * 100 + + +def get_run_object(output: str) -> RunData: + runData = RunData() + runData.prob_name = get_regex_result(prob_name_pattern, output, "prob_file") + runData.num_invocation = get_regex_result(roofline_flops_pattern, output, "invocations") + runData.flops = int(float(get_regex_result(roofline_flops_pattern, output, "avg"))) + runData.DR = int(get_regex_result(roofline_DR_pattern, output, "avg")) + runData.TR = float(get_regex_result(roofline_TR_pattern, output, "avg")) + runData.TR_unit = get_regex_result(roofline_TR_pattern, output, "avg_unit").strip() + runData.DW = int(get_regex_result(roofline_DW_pattern, output, "avg")) + runData.TW = float(get_regex_result(roofline_TW_pattern, output, "avg")) + runData.TW_unit = get_regex_result(roofline_TW_pattern, output, "avg_unit").strip() + + return runData + + +def add_runtime_data_to_run_objects(output: str, run_data_objects): + prob_name = get_regex_result(prob_name_pattern, output, "prob_file") + + obj = [obj for obj in run_data_objects if obj.prob_name == prob_name] + assert len(obj) == 1 + obj = obj[0] + + obj.runtime = float(get_regex_result(roofline_runtime_pattern, output, "avg")) + obj.runtime_unit = get_regex_result(roofline_runtime_pattern, output, "avg_unit") + + +if __name__ == "__main__": + + throughput_log_file = "04_08_2021_gpu2_double_roofline.log" + runtime_log_file = "05_08_2021_gpu2_roofline_runtimes_double.log" + + with open(os.path.join("plotter", throughput_log_file), 'r') as f: + throughput_results_file = f.read() + with open(os.path.join("plotter", runtime_log_file), 'r') as f: + runtime_results_file = f.read() + + throughput_instances_output = list(map(lambda x: str(x.group()), re.finditer(re.compile(roofline_prob_out_pattern), throughput_results_file))) + runtime_instances_output = list(map(lambda x: str(x.group()), re.finditer(re.compile(roofline_prob_out_pattern), runtime_results_file))) + + throughput_instances_output = list(filter(lambda output: get_regex_result(roofline_success_run, output) is not None, throughput_instances_output)) + runtime_instances_output = list(filter(lambda output: get_regex_result(roofline_success_run, output) is not None, runtime_instances_output)) + print("num instances after removing those with no results available: ", len(throughput_instances_output)) + + throughput_instances_output = list(filter(lambda output: int(get_regex_result(nnz_pattern, output, 'nnz')) >2.5*1e5, throughput_instances_output)) + runtime_instances_output = list(filter(lambda output: int(get_regex_result(nnz_pattern, output, 'nnz')) >2.5*1e5, runtime_instances_output)) + print("num instances after removing those with nnz <= 1.5*1e5: ", len(throughput_instances_output)) + + run_data_objects = list(map(lambda x: get_run_object(x), throughput_instances_output)) + assert len(run_data_objects) == len(throughput_instances_output) + + #add runtime data to objects + + for output in runtime_instances_output: + add_runtime_data_to_run_objects(output, run_data_objects) + + run_data_objects = list(filter(lambda obj: obj.runtime is not None, run_data_objects)) + + print("num isntances with both runtime and throughput info: ", len(run_data_objects)) + + # run_data_objects = list(filter(lambda obj: obj.TR_unit == 'GB/s' and obj.TW_unit == 'GB/s', run_data_objects)) + # print("num isntances with GB/s throughput: ", len(run_data_objects)) + # highest mem throughput + for obj in run_data_objects: + obj.compute_metrics() + + AIs = list(map(lambda obj: obj.AI, run_data_objects)) + print("arithemtic intensity: min: ", min(AIs), ", max: ", max(AIs), ", avr: ", sum(AIs)/len(AIs)) + + perf_percentages = list(map(lambda obj: obj.achieved_perf_percent, run_data_objects)) + print("achieved performance percentages: min: ", min(perf_percentages), ", max: ", max(perf_percentages), ", avr: ", sum(perf_percentages)/len(perf_percentages)) + + + + # AI= calc_AI(flop,DR,DW) + # print("Arithmetic intensity is: ", AI, ". Machine balance is: ", V100_machine_balance) + # + # achieved_perf = get_achieved_perf(flop,runtime) + # attainable_perf = get_attainable_perf(AI) + # print("achieved perf in percentages: ", get_achieved_per_in_percent_to_attainable(achieved_perf,attainable_perf)) + diff --git a/fileReader/exec_file.sh b/fileReader/exec_file.sh index 875be1d..e2e9d8c 100755 --- a/fileReader/exec_file.sh +++ b/fileReader/exec_file.sh @@ -61,6 +61,6 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" printf "\n\n === $FILE ===\n" - +#nvprof --profile-child-processes --profile-from-start off --kernels "GPUAtomicDomainPropagation" --metrics flop_count_dp --metrics dram_read_throughput --metrics dram_write_throughput --metrics dram_read_transactions --metrics dram_write_transactions python3 -u "$DIR/run_propagation.py" -f "$FILE" -d "$DATATYPE" -t "$TESTTYPE" -s "$SEED" -c "$SYNCTYPE" 2>&1 | tee -a $LOGFILE python3 -u "$DIR/run_propagation.py" -f "$FILE" -d "$DATATYPE" -t "$TESTTYPE" -s "$SEED" -c "$SYNCTYPE" 2>&1 | tee -a $LOGFILE - +#nvprof --profile-child-processes --profile-from-start off --concurrent-kernels off --openacc-profiling off --print-gpu-summary python3 -u "$DIR/run_propagation.py" -f "$FILE" -d "$DATATYPE" -t "$TESTTYPE" -s "$SEED" -c "$SYNCTYPE" 2>&1 | tee -a $LOGFILE diff --git a/fileReader/exec_tests.sh b/fileReader/exec_tests.sh index dbc82b1..fa85420 100755 --- a/fileReader/exec_tests.sh +++ b/fileReader/exec_tests.sh @@ -57,7 +57,10 @@ while true ; do esac done +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" for filename in $FILES*; do printf "\n\n === $filename ===\n" python3 -u run_propagation.py -f "$filename" -d "$DATATYPE" -t "$TESTTYPE" -s "$SEED" -c "$SYNCTYPE" 2>&1 | tee -a $LOGFILE + #nvprof --profile-child-processes --profile-from-start off --kernels "GPUAtomicDomainPropagation" --metrics flop_count_dp --metrics dram_read_throughput --metrics dram_write_throughput --metrics dram_read_transactions --metrics dram_write_transactions python3 -u "$DIR/run_propagation.py" -f "$filename" -d "$DATATYPE" -t "$TESTTYPE" -s "$SEED" -c "$SYNCTYPE" 2>&1 | tee -a $LOGFILE + # nvprof --profile-child-processes --profile-from-start off --concurrent-kernels off --openacc-profiling off --print-gpu-summary python3 -u "$DIR/run_propagation.py" -f "$filename" -d "$DATATYPE" -t "$TESTTYPE" -s "$SEED" -c "$SYNCTYPE" 2>&1 | tee -a $LOGFILE done \ No newline at end of file diff --git a/fileReader/plotter/plot_results.py b/fileReader/plotter/plot_results.py index 279f504..11b96b6 100644 --- a/fileReader/plotter/plot_results.py +++ b/fileReader/plotter/plot_results.py @@ -54,7 +54,8 @@ def get_linestyle(algorithm, machine): - if 'fastmath' in machine: + + if any(x in machine for x in ['fastmath', 'megakernel']): return 'dotted' elif algorithm == 'cpu_omp' or any(x in machine for x in ['single', 'gpu_loop', '8thrds']): return 'dashed' @@ -66,6 +67,13 @@ def get_line_color(machine, algorithm): if algorithm == 'papilo': return 'tab:red' + if 'cpu_loop' in machine: + return 'tab:green' + if 'gpu_loop' in machine: + return 'tab:red' + if 'megakernel' in machine: + return 'tab:blue' + if 'seed0' in machine: return 'tab:blue' if 'seed1' in machine: @@ -213,7 +221,7 @@ def truncate(number, decimals=0): factor = 10.0 ** decimals return math.trunc(number * factor) / factor - fig = plt.figure() + fig = plt.figure(figsize=(10,8)) plt.style.use('bmh') plot_a = True ### Subplot A ### @@ -228,10 +236,10 @@ def truncate(number, decimals=0): label=str(algorithm) + "-" + str(machine), linestyle=get_linestyle(algorithm, machine), color=get_line_color(machine, algorithm)) # constant one - the baseline case - # plt.plot(np.arange(8), np.ones(8), label="cpu_seq-xeon", linestyle='dashdot', color='tab:gray') + plt.plot(np.arange(8), np.ones(8), label="cpu_seq-xeon", linestyle='dashdot', color='tab:gray') plt.yscale('log') - yticks = get_y_ticks(ys, 10) + yticks = get_y_ticks(ys, 8) plt.yticks(yticks, yticks) plt.xticks(np.arange(len(speedups[algorithm][machine][0])), map(lambda x: "Set-" + str(x + 1), np.arange(len(speedups[algorithm][machine][0])))) @@ -268,6 +276,10 @@ def truncate(number, decimals=0): rc('font', **{'family': 'serif', 'serif': ['Times']}) rc('text', usetex=True) ax2.yaxis.set_major_formatter(FormatStrFormatter('%.2f')) + from matplotlib.pyplot import figure + + + plt.show() diff --git a/fileReader/regexes.py b/fileReader/regexes.py index bdfd6f2..00411ef 100644 --- a/fileReader/regexes.py +++ b/fileReader/regexes.py @@ -4,6 +4,8 @@ import time import re +nnz_pattern = r"nnz : (?P\d+)\n" + #result_pattern = r"Reding of (?P.*) model done!\nnum vars: (?P\d*)\nnum cons: (?P\d*)\nnnz : (?P\d*)\n\n.*\n.*cpu_seq propagation done. Num rounds: (?P\d*)\ncpu_seq execution time : (?P\d*).*\n\n.*\n.*cpu_omp propagation done. Num rounds: (?P\d*)\ncpu_omp execution time : (?P\d*).*\n\n.*\n.*gpu_reduction propagation done. Num rounds: (?P\d*)\ngpu_reduction execution time : (?P\d*).*\n\n.*\n.*gpu_atomic propagation done. Num rounds: (?P\d*)\ngpu_atomic execution time : (?P\d*).*\n\n.*\n.*\n.*\nall results match: (?P.*)" result_pattern = r"Reading of (?P.*) model done!\nnum vars: (?P\d*)\nnum cons: (?P\d*)\nnnz : (?P\d*)\n\n.*\n.*cpu_seq propagation done. Num rounds: (?P\d*)\ncpu_seq execution time : (?P\d*).*\n\n.*\n.*cpu_omp propagation done. Num rounds: (?P\d*)\ncpu_omp execution time : (?P\d*).*\n\n.*\n.*gpu_atomic propagation done. Num rounds: (?P\d*)\ngpu_atomic execution time : (?P\d*).*\n\ncpu_seq to cpu_omp results match: (?P.*)\ncpu_seq to gpu_atomic results match: (?P.*)\nall results match: (?P.*)" seq_to_omp_pattern = r"cpu_seq to cpu_omp results match: (?P.*)" @@ -33,6 +35,17 @@ def round_timestamp_pattern(prop_round, alg): return "Propagation round: {}, {} no_bdchgs_after_papilo_pattern = r"papilo did not find any bound changes after cpu_seq!" +# Roofline analysis regexes +roofline_prob_out_pattern = r"(?s)read with 0 errors\n(.*?)Reding lp file" +roofline_flops_pattern = r"[ ]+(?P\d*)[ ]+flop_count_dp[ ]+Floating Point Operations\(Double Precision\)[ ]+(?P\d+.\d+e\+\d+|\d+)[ ]+(?P\d+.\d+e\+\d+|\d+)[ ]+(?P\d+.\d+e\+\d+|\d+)\n" +roofline_flops_float_pattern = r"[ ]+(?P\d*)[ ]+flop_count_sp[ ]+Floating Point Operations\(Single Precision\)[ ]+(?P\d+.\d+e\+\d+|\d+)[ ]+(?P\d+.\d+e\+\d+|\d+)[ ]+(?P\d+.\d+e\+\d+|\d+)\n" +roofline_TR_pattern = r"[ ]+(?P\d*)[ ]+dram_read_throughput[ ]+Device Memory Read Throughput[ ]+(?P\d+.\d+)(?P.*)[ ]+(?P\d+.\d+)(?P.*)[ ]+(?P\d+.\d+)(?P.*)\n" +roofline_TW_pattern = r"[ ]+(?P\d*)[ ]+dram_write_throughput[ ]+Device Memory Write Throughput[ ]+(?P\d+.\d+)(?P.*)[ ]+(?P\d+.\d+)(?P.*)[ ]+(?P\d+.\d+)(?P.*)\n" +roofline_DR_pattern = r"[ ]+(?P\d*)[ ]+dram_read_transactions[ ]+Device Memory Read Transactions[ ]+(?P\d+)[ ]+(?P\d+)[ ]+(?P\d+)\n" +roofline_DW_pattern = r"[ ]+(?P\d*)[ ]+dram_write_transactions[ ]+Device Memory Write Transactions[ ]+(?P\d+)[ ]+(?P\d+)[ ]+(?P\d+)\n" +roofline_success_run = r"gpu_atomic propagation done." +roofline_runtime_pattern = r" GPU activities:[ ]+\d+.\d+\%[ ]+\d+.\d+[a-z]+[ ]+\d+[ ]+(?P\d+.\d+)(?P[a-z]+)[ ]+(?P\d+.\d+)(?P[a-z]+)[ ]+(?P\d+.\d+)(?P[a-z]+)[ ]+void GPUAtomicDomainPropagation" + def get_regex_result(regex_string: str, search_string: str, group_name: str = None): m = re.compile(regex_string).search(search_string) diff --git a/fileReader/run_propagation.py b/fileReader/run_propagation.py index 3a9b366..7b29326 100644 --- a/fileReader/run_propagation.py +++ b/fileReader/run_propagation.py @@ -88,9 +88,9 @@ def exec_run( ubs_dis = ubs_seq = ubs_gpuatomic = ubs_gpu = ubs_omp = ubs - (seq_new_lbs, seq_new_ubs) = propagateSequential(n_vars, n_cons, nnz, col_indices, row_ptrs, coeffs, lhss, rhss, lbs_seq, ubs_seq, vartypes, datatype=c_double) + # (seq_new_lbs, seq_new_ubs) = propagateSequential(n_vars, n_cons, nnz, col_indices, row_ptrs, coeffs, lhss, rhss, lbs_seq, ubs_seq, vartypes, datatype=c_double) - (omp_new_lbs, omp_new_ubs) = propagateFullOMP(n_vars, n_cons, nnz, col_indices, row_ptrs, coeffs, lhss, rhss, lbs_omp, ubs_omp, vartypes, datatype=c_double) + #(omp_new_lbs, omp_new_ubs) = propagateFullOMP(n_vars, n_cons, nnz, col_indices, row_ptrs, coeffs, lhss, rhss, lbs_omp, ubs_omp, vartypes, datatype=c_double) # (gpu_new_lbs, gpu_new_ubs) = propagateGPUReduction(n_vars, n_cons, nnz, col_indices, row_ptrs, coeffs, lhss, rhss, lbs_gpu, ubs_gpu, vartypes, datatype=datatype) @@ -98,10 +98,10 @@ def exec_run( # idx = 1 (gpuatomic_new_lbs, gpuatomic_new_ubs) = propagateGPUAtomic(n_vars, n_cons, nnz, col_indices, row_ptrs, coeffs, lhss, rhss, lbs_gpuatomic, ubs_gpuatomic, vartypes, synctype=synctype, datatype=datatype) - print("") - eq1 = compare_results(seq_new_lbs, seq_new_ubs, omp_new_lbs, omp_new_ubs,"cpu_seq", "cpu_omp") - eq2 = compare_results(seq_new_lbs, seq_new_ubs, gpuatomic_new_lbs, gpuatomic_new_ubs, "cpu_seq", "gpu_atomic") - print("all results match: ", eq1 and eq2) + # print("") + # eq1 = compare_results(seq_new_lbs, seq_new_ubs, omp_new_lbs, omp_new_ubs,"cpu_seq", "cpu_omp") + # eq2 = compare_results(seq_new_lbs, seq_new_ubs, gpuatomic_new_lbs, gpuatomic_new_ubs, "cpu_seq", "gpu_atomic") + # print("all results match: ", eq1 and eq2) # compare_arrays_diff_idx(seq_new_lbs, omp_new_lbs, "lbs") # compare_arrays_diff_idx(seq_new_ubs, omp_new_ubs, "ubs") diff --git a/test/testCases/test_end_to_end.cuh b/test/testCases/test_end_to_end.cuh index 0c58430..fb9aa71 100644 --- a/test/testCases/test_end_to_end.cuh +++ b/test/testCases/test_end_to_end.cuh @@ -7,17 +7,17 @@ void runAllAlgsAnalyticalSol() { Problem ts_gpu_r; Problem ts_gpu_a; - printf("running cpu_seq"); - tester.executeSequentialPropagator(ts_seq); - tester.checkSolution(ts_seq); - - printf("running cpu_omp"); - tester.executeFullOMPPropagator(ts_omp); - tester.checkSolution(ts_omp); - - printf("running gpu_reduction"); - tester.executeGPUReduction(ts_gpu_r); - tester.checkSolution(ts_gpu_r); +// printf("running cpu_seq"); +// tester.executeSequentialPropagator(ts_seq); +// tester.checkSolution(ts_seq); +// +// printf("running cpu_omp"); +// tester.executeFullOMPPropagator(ts_omp); +// tester.checkSolution(ts_omp); +// +// printf("running gpu_reduction"); +// tester.executeGPUReduction(ts_gpu_r); +// tester.checkSolution(ts_gpu_r); printf("running gpu_atomic"); tester.executeAtomicGPUPropagator(ts_gpu_a);