diff --git a/.gitignore b/.gitignore
index 38af277..e104a66 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 fileReader/venv
 .idea/
+build/
 cmake-build-debug/
 *__pycache__*
diff --git a/fileReader/analyze_roofline.py b/fileReader/analyze_roofline.py
new file mode 100644
index 0000000..02a0cb1
--- /dev/null
+++ b/fileReader/analyze_roofline.py
@@ -0,0 +1,193 @@
+import os
+from matplotlib import pyplot as plt
+from regexes import *
+
+V100_peak_flops  = 7800.0 # compute ceiling of the roofline, in GFLOP/s
+#V100_peak_flops = 15700.0 # alternative ceiling in GFLOP/s (NOTE(review): presumably the single-precision peak - confirm)
+V100_peak_bandwidth = 900.0 # memory-bandwidth ceiling of the roofline, in GB/s
+V100_machine_balance = V100_peak_flops/V100_peak_bandwidth # arithmetic intensity (FLOP/byte) at which both ceilings meet
+print("V100 machine balance: ", V100_machine_balance)
+
+# Divisors converting a value in the keyed unit to GB/s (throughput) or s (time). NOTE(review): not read anywhere in this file - to_byte_per_s()/to_seconds() are used instead.
+units = {
+    'B/s': 1e9,
+    'KB/s': 1e6,
+    'MB/s': 1e3,
+    'GB/s': 1,
+    'us': 1e6,
+    'ms': 1e3,
+    's':1
+}
+
+
+def frange(start, stop, step=1.0):
+    """Yield start, start + step, ... for as long as the value stays below stop (half-open like range)."""
+    while start < stop:
+        yield start  # yield before stepping so `start` is included and `stop` is never reached
+        start += step
+
+
+def to_byte_per_s(val, unit):
+    """Convert a throughput given in `unit` to bytes per second.
+
+    Known units are 'B/s', 'KB/s', 'MB/s' and 'GB/s' with decimal prefixes;
+    any other unit raises Exception.
+    """
+    factors = {'GB/s': 1e9, 'MB/s': 1e6, 'KB/s': 1e3, 'B/s': 1}
+    factor = factors.get(unit)
+    if factor is None:
+        raise Exception("Unknown unit ", unit)
+    return val * factor
+
+
+def to_seconds(val, unit):
+    """Convert a duration given in 's', 'ms', 'us' or 'ns' to seconds; any other unit raises Exception."""
+    divisors = {'s': 1, 'ms': 1e3, 'us': 1e6, 'ns': 1e9}
+    if unit not in divisors:
+        # fail loudly, like to_byte_per_s, instead of silently returning None
+        raise Exception("Unknown unit ", unit)
+    return val / divisors[unit]
+
+
+def plot_roofline():
+    """Plot the V100 roofline ceiling over arithmetic intensities 2**-6 .. 2**5 and show the figure."""
+    fig = plt.figure(frameon=False)
+    ax = fig.add_subplot(1, 1, 1)
+    xticks = [2.**i for i in range(-6, 6)]
+    ax.set_xlabel('arithmetic intensity [FLOP/byte]')
+    # the ceiling is computed from constants in GFLOP/s and GB/s, so the y axis is in GFLOP/s
+    ax.set_ylabel('performance [GFLOP/s]')
+
+    # Upper bound: min(bandwidth * intensity, peak compute)
+    intensities = list(frange(min(xticks), max(xticks), 0.01))
+    ax.plot(intensities, [min(V100_peak_bandwidth*ai, float(V100_peak_flops)) for ai in intensities])
+
+    try:
+        ax.set_xscale('log', base=2)
+    except (TypeError, ValueError):
+        # Matplotlib releases before 3.3 only accept the keyword under its old name
+        ax.set_xscale('log', basex=2)
+    ax.set_yscale('log')
+    ax.set_xlim(min(xticks), max(xticks))
+    ax.set_xticks(xticks)
+    ax.grid(axis='x', alpha=0.7, linestyle='--')
+    plt.show()
+
+
+class RunData:
+    """Measurements parsed for one problem instance plus the roofline metrics derived from them."""
+    def __init__(self):
+        self.prob_name = None
+        self.num_invocation = None
+        self.flops = None
+        self.DR = None
+        self.TR = None
+        self.TR_unit = None
+        self.DW = None
+        self.TW = None
+        self.TW_unit = None
+        self.runtime = None
+        self.runtime_unit = None
+        self.total_mem = None
+        self.total_mem_unit = None
+        # roofline metrics, filled in by compute_metrics()
+        self.AI = None
+        self.attainable_perf = None
+        self.achieve_perf = None
+        self.achieved_perf_percent = None
+
+    def print(self):
+        """Print the parsed counters; only the throughputs (TR/TW) carry a unit, the transaction counts (DR/DW) do not."""
+        print(self.prob_name, ":   invocations:", self.num_invocation, ", flops:", self.flops, ", DR:", self.DR, ", TR:", self.TR, self.TR_unit, ", DW:", self.DW, ", TW:", self.TW, self.TW_unit)
+
+    def compute_metrics(self):
+        """Derive AI [FLOP/byte] and attainable/achieved performance [GFLOP/s] from the parsed values."""
+        runtime_in_s = to_seconds(self.runtime, self.runtime_unit)
+        # bytes moved = (read + write throughput) * runtime; an earlier, now unused estimate was (DR + DW) * 32
+        data_movement = (to_byte_per_s(self.TR, self.TR_unit) + to_byte_per_s(self.TW, self.TW_unit)) * runtime_in_s
+        self.AI = self.flops / data_movement
+        self.attainable_perf = min(V100_peak_bandwidth * self.AI, float(V100_peak_flops)) # in GFLOP/s
+        self.achieve_perf = (self.flops/1e9) / runtime_in_s
+        self.achieved_perf_percent = (self.achieve_perf / self.attainable_perf) * 100
+
+
+def get_run_object(output: str) -> RunData:
+    """Build a RunData from the metric rows found in one problem's log output."""
+    def field(pattern, group="avg"):
+        return get_regex_result(pattern, output, group)
+    run = RunData()
+    run.prob_name = field(prob_name_pattern, "prob_file")
+    run.num_invocation = field(roofline_flops_pattern, "invocations")
+    run.flops = int(float(field(roofline_flops_pattern)))
+    run.DR, run.TR = int(field(roofline_DR_pattern)), float(field(roofline_TR_pattern))
+    run.TR_unit = field(roofline_TR_pattern, "avg_unit").strip()
+    run.DW, run.TW = int(field(roofline_DW_pattern)), float(field(roofline_TW_pattern))
+    run.TW_unit = field(roofline_TW_pattern, "avg_unit").strip()
+    return run
+
+
+def add_runtime_data_to_run_objects(output: str, run_data_objects):
+    """Store the runtime parsed from `output` on the single RunData that has the same problem name."""
+    prob_name = get_regex_result(prob_name_pattern, output, "prob_file")
+
+    matches = [candidate for candidate in run_data_objects if candidate.prob_name == prob_name]
+    assert len(matches) == 1
+    target = matches[0]
+    target.runtime = float(get_regex_result(roofline_runtime_pattern, output, "avg"))
+    target.runtime_unit = get_regex_result(roofline_runtime_pattern, output, "avg_unit")
+
+
+if __name__ == "__main__":
+    # Combine a throughput log and a runtime log of the same problem set and print
+    # arithmetic-intensity and achieved-performance statistics for the instances in both.
+    throughput_log_file = "04_08_2021_gpu2_double_roofline.log"
+    runtime_log_file = "05_08_2021_gpu2_roofline_runtimes_double.log"
+    min_nnz = 2.5*1e5  # instances with nnz at or below this value are dropped
+
+    with open(os.path.join("plotter", throughput_log_file), 'r') as f:
+        throughput_results_file = f.read()
+    with open(os.path.join("plotter", runtime_log_file), 'r') as f:
+        runtime_results_file = f.read()
+
+    def finished_runs(log_text):
+        """Return the per-problem output sections of `log_text` that contain the success marker."""
+        sections = [m.group() for m in re.finditer(roofline_prob_out_pattern, log_text)]
+        return [s for s in sections if get_regex_result(roofline_success_run, s) is not None]
+
+    def large_enough(sections):
+        """Return the sections whose nnz value exceeds `min_nnz`."""
+        return [s for s in sections if int(get_regex_result(nnz_pattern, s, 'nnz')) > min_nnz]
+
+    throughput_instances_output = finished_runs(throughput_results_file)
+    runtime_instances_output = finished_runs(runtime_results_file)
+    print("num instances after removing those with no results available: ", len(throughput_instances_output))
+
+    throughput_instances_output = large_enough(throughput_instances_output)
+    runtime_instances_output = large_enough(runtime_instances_output)
+    # report the threshold the filter actually applies (min_nnz)
+    print("num instances after removing those with nnz <= 2.5*1e5: ", len(throughput_instances_output))
+
+    run_data_objects = [get_run_object(output) for output in throughput_instances_output]
+
+    # add runtime data to objects
+    for output in runtime_instances_output:
+        add_runtime_data_to_run_objects(output, run_data_objects)
+
+    # only instances that appear in both logs can be evaluated
+    run_data_objects = [obj for obj in run_data_objects if obj.runtime is not None]
+    print("num instances with both runtime and throughput info: ", len(run_data_objects))
+
+    if not run_data_objects:
+        # min()/max() and the averages below are undefined for an empty list
+        raise SystemExit("no instance has both runtime and throughput info")
+
+    # derive arithmetic intensity and performance figures for every remaining instance
+    for obj in run_data_objects:
+        obj.compute_metrics()
+
+    AIs = [obj.AI for obj in run_data_objects]
+    print("arithmetic intensity: min: ", min(AIs), ", max: ", max(AIs), ", avr: ", sum(AIs)/len(AIs))
+
+    perf_percentages = [obj.achieved_perf_percent for obj in run_data_objects]
+    print("achieved performance percentages: min: ", min(perf_percentages), ", max: ", max(perf_percentages), ", avr: ", sum(perf_percentages)/len(perf_percentages))
+
diff --git a/fileReader/exec_file.sh b/fileReader/exec_file.sh
index 875be1d..e2e9d8c 100755
--- a/fileReader/exec_file.sh
+++ b/fileReader/exec_file.sh
@@ -61,6 +61,6 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 
 printf "\n\n === $FILE ===\n"
 
-
+#nvprof --profile-child-processes --profile-from-start off  --kernels "GPUAtomicDomainPropagation" --metrics flop_count_dp  --metrics dram_read_throughput  --metrics dram_write_throughput --metrics dram_read_transactions --metrics dram_write_transactions python3 -u "$DIR/run_propagation.py" -f "$FILE" -d "$DATATYPE" -t "$TESTTYPE" -s "$SEED" -c "$SYNCTYPE" 2>&1 | tee -a $LOGFILE
 python3 -u "$DIR/run_propagation.py" -f "$FILE" -d "$DATATYPE" -t "$TESTTYPE" -s "$SEED" -c "$SYNCTYPE" 2>&1 | tee -a $LOGFILE
-
+#nvprof --profile-child-processes --profile-from-start off --concurrent-kernels off --openacc-profiling off --print-gpu-summary python3 -u "$DIR/run_propagation.py" -f "$FILE" -d "$DATATYPE" -t "$TESTTYPE" -s "$SEED" -c "$SYNCTYPE" 2>&1 | tee -a $LOGFILE
diff --git a/fileReader/exec_tests.sh b/fileReader/exec_tests.sh
index dbc82b1..fa85420 100755
--- a/fileReader/exec_tests.sh
+++ b/fileReader/exec_tests.sh
@@ -57,7 +57,10 @@ while true ; do
   esac
 done
 
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 for filename in $FILES*; do
     printf "\n\n === $filename ===\n"
     python3 -u run_propagation.py -f "$filename" -d "$DATATYPE" -t "$TESTTYPE" -s "$SEED" -c "$SYNCTYPE" 2>&1 | tee -a $LOGFILE
+    #nvprof --profile-child-processes --profile-from-start off  --kernels "GPUAtomicDomainPropagation" --metrics flop_count_dp  --metrics dram_read_throughput  --metrics dram_write_throughput --metrics dram_read_transactions --metrics dram_write_transactions python3 -u "$DIR/run_propagation.py" -f "$filename" -d "$DATATYPE" -t "$TESTTYPE" -s "$SEED" -c "$SYNCTYPE" 2>&1 | tee -a $LOGFILE
+  #  nvprof --profile-child-processes --profile-from-start off --concurrent-kernels off --openacc-profiling off --print-gpu-summary python3 -u "$DIR/run_propagation.py" -f "$filename" -d "$DATATYPE" -t "$TESTTYPE" -s "$SEED" -c "$SYNCTYPE" 2>&1 | tee -a $LOGFILE
 done
\ No newline at end of file
diff --git a/fileReader/plotter/plot_results.py b/fileReader/plotter/plot_results.py
index 279f504..11b96b6 100644
--- a/fileReader/plotter/plot_results.py
+++ b/fileReader/plotter/plot_results.py
@@ -54,7 +54,8 @@
 
 
 def get_linestyle(algorithm, machine):
-    if 'fastmath' in machine:
+
+    if any(x in machine for x in ['fastmath', 'megakernel']):
         return 'dotted'
     elif algorithm == 'cpu_omp' or any(x in machine for x in ['single', 'gpu_loop', '8thrds']):
         return 'dashed'
@@ -66,6 +67,13 @@ def get_line_color(machine, algorithm):
     if algorithm == 'papilo':
         return 'tab:red'
 
+    if 'cpu_loop' in machine:
+        return 'tab:green'
+    if 'gpu_loop' in machine:
+        return 'tab:red'
+    if 'megakernel' in machine:
+        return 'tab:blue'
+
     if 'seed0' in machine:
         return 'tab:blue'
     if 'seed1' in machine:
@@ -213,7 +221,7 @@ def truncate(number, decimals=0):
         factor = 10.0 ** decimals
         return math.trunc(number * factor) / factor
 
-    fig = plt.figure()
+    fig = plt.figure(figsize=(10,8))
     plt.style.use('bmh')
     plot_a = True
     ### Subplot A ###
@@ -228,10 +236,10 @@ def truncate(number, decimals=0):
                          label=str(algorithm) + "-" + str(machine), linestyle=get_linestyle(algorithm, machine),
                          color=get_line_color(machine, algorithm))
         # constant one - the baseline case
-    #    plt.plot(np.arange(8), np.ones(8), label="cpu_seq-xeon", linestyle='dashdot', color='tab:gray')
+        plt.plot(np.arange(8), np.ones(8), label="cpu_seq-xeon", linestyle='dashdot', color='tab:gray')
 
         plt.yscale('log')
-        yticks = get_y_ticks(ys, 10)
+        yticks = get_y_ticks(ys, 8)
         plt.yticks(yticks, yticks)
         plt.xticks(np.arange(len(speedups[algorithm][machine][0])),
                    map(lambda x: "Set-" + str(x + 1), np.arange(len(speedups[algorithm][machine][0]))))
@@ -268,6 +276,10 @@ def truncate(number, decimals=0):
         rc('font', **{'family': 'serif', 'serif': ['Times']})
         rc('text', usetex=True)
         ax2.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
+        from matplotlib.pyplot import figure
+
+
+
     plt.show()
 
 
diff --git a/fileReader/regexes.py b/fileReader/regexes.py
index bdfd6f2..00411ef 100644
--- a/fileReader/regexes.py
+++ b/fileReader/regexes.py
@@ -4,6 +4,8 @@
 import time
 import re
 
+nnz_pattern = r"nnz     :  (?P<nnz>\d+)\n"  # captures the integer of the "nnz     :  <n>" line as group 'nnz'
+
 #result_pattern = r"Reding of  (?P<prob_file>.*)  model done!\nnum vars:  (?P<n_vars>\d*)\nnum cons:  (?P<n_cons>\d*)\nnnz     :  (?P<nnz>\d*)\n\n.*\n.*cpu_seq propagation done. Num rounds: (?P<cpu_seq_rounds>\d*)\ncpu_seq execution time : (?P<cpu_seq_time>\d*).*\n\n.*\n.*cpu_omp propagation done. Num rounds: (?P<cpu_omp_rounds>\d*)\ncpu_omp execution time : (?P<cpu_omp_time>\d*).*\n\n.*\n.*gpu_reduction propagation done. Num rounds: (?P<gpu_reduction_rounds>\d*)\ngpu_reduction execution time : (?P<gpu_reduction_time>\d*).*\n\n.*\n.*gpu_atomic propagation done. Num rounds: (?P<gpu_atomic_rounds>\d*)\ngpu_atomic execution time : (?P<gpu_atomic_time>\d*).*\n\n.*\n.*\n.*\nall results match:  (?P<results_correct>.*)"
 result_pattern = r"Reading of  (?P<prob_file>.*)  model done!\nnum vars:  (?P<n_vars>\d*)\nnum cons:  (?P<n_cons>\d*)\nnnz     :  (?P<nnz>\d*)\n\n.*\n.*cpu_seq propagation done. Num rounds: (?P<cpu_seq_rounds>\d*)\ncpu_seq execution time : (?P<cpu_seq_time>\d*).*\n\n.*\n.*cpu_omp propagation done. Num rounds: (?P<cpu_omp_rounds>\d*)\ncpu_omp execution time : (?P<cpu_omp_time>\d*).*\n\n.*\n.*gpu_atomic propagation done. Num rounds: (?P<gpu_atomic_rounds>\d*)\ngpu_atomic execution time : (?P<gpu_atomic_time>\d*).*\n\ncpu_seq to cpu_omp results match:  (?P<dsadasdas>.*)\ncpu_seq to gpu_atomic results match:  (?P<dadadas>.*)\nall results match:  (?P<results_correct>.*)"
 seq_to_omp_pattern = r"cpu_seq to cpu_omp results match:  (?P<match>.*)"
@@ -33,6 +35,17 @@ def round_timestamp_pattern(prop_round, alg): return "Propagation round: {}, {}
 no_bdchgs_after_papilo_pattern = r"papilo did not find any bound changes after cpu_seq!"
 
 
+# Roofline analysis regexes: each metric pattern matches one summary row and names its invocations/min/max/avg columns
+roofline_prob_out_pattern = r"(?s)read with 0 errors\n(.*?)Reding lp file"  # one problem's output section; NOTE(review): "Reding" presumably mirrors a typo in the logged text - confirm before correcting
+roofline_flops_pattern = r"[ ]+(?P<invocations>\d*)[ ]+flop_count_dp[ ]+Floating Point Operations\(Double Precision\)[ ]+(?P<min>\d+.\d+e\+\d+|\d+)[ ]+(?P<max>\d+.\d+e\+\d+|\d+)[ ]+(?P<avg>\d+.\d+e\+\d+|\d+)\n"
+roofline_flops_float_pattern = r"[ ]+(?P<invocations>\d*)[ ]+flop_count_sp[ ]+Floating Point Operations\(Single Precision\)[ ]+(?P<min>\d+.\d+e\+\d+|\d+)[ ]+(?P<max>\d+.\d+e\+\d+|\d+)[ ]+(?P<avg>\d+.\d+e\+\d+|\d+)\n"
+roofline_TR_pattern = r"[ ]+(?P<invocations>\d*)[ ]+dram_read_throughput[ ]+Device Memory Read Throughput[ ]+(?P<min>\d+.\d+)(?P<min_unit>.*)[ ]+(?P<max>\d+.\d+)(?P<max_unit>.*)[ ]+(?P<avg>\d+.\d+)(?P<avg_unit>.*)\n"
+roofline_TW_pattern = r"[ ]+(?P<invocations>\d*)[ ]+dram_write_throughput[ ]+Device Memory Write Throughput[ ]+(?P<min>\d+.\d+)(?P<min_unit>.*)[ ]+(?P<max>\d+.\d+)(?P<max_unit>.*)[ ]+(?P<avg>\d+.\d+)(?P<avg_unit>.*)\n"
+roofline_DR_pattern = r"[ ]+(?P<invocations>\d*)[ ]+dram_read_transactions[ ]+Device Memory Read Transactions[ ]+(?P<min>\d+)[ ]+(?P<max>\d+)[ ]+(?P<avg>\d+)\n"
+roofline_DW_pattern = r"[ ]+(?P<invocations>\d*)[ ]+dram_write_transactions[ ]+Device Memory Write Transactions[ ]+(?P<min>\d+)[ ]+(?P<max>\d+)[ ]+(?P<avg>\d+)\n"
+roofline_success_run = r"gpu_atomic propagation done."  # marker of a completed run; the trailing '.' is unescaped and so matches any character
+roofline_runtime_pattern = r" GPU activities:[ ]+\d+.\d+\%[ ]+\d+.\d+[a-z]+[ ]+\d+[ ]+(?P<avg>\d+.\d+)(?P<avg_unit>[a-z]+)[ ]+(?P<min>\d+.\d+)(?P<min_unit>[a-z]+)[ ]+(?P<max>\d+.\d+)(?P<max_unit>[a-z]+)[ ]+void GPUAtomicDomainPropagation"
+
 def get_regex_result(regex_string: str, search_string: str, group_name: str = None):
     m = re.compile(regex_string).search(search_string)
 
diff --git a/fileReader/run_propagation.py b/fileReader/run_propagation.py
index 3a9b366..7b29326 100644
--- a/fileReader/run_propagation.py
+++ b/fileReader/run_propagation.py
@@ -88,9 +88,9 @@ def exec_run(
     ubs_dis = ubs_seq = ubs_gpuatomic = ubs_gpu = ubs_omp = ubs
 
 
-    (seq_new_lbs, seq_new_ubs) = propagateSequential(n_vars, n_cons, nnz, col_indices, row_ptrs, coeffs, lhss, rhss, lbs_seq, ubs_seq, vartypes, datatype=c_double)
+   # (seq_new_lbs, seq_new_ubs) = propagateSequential(n_vars, n_cons, nnz, col_indices, row_ptrs, coeffs, lhss, rhss, lbs_seq, ubs_seq, vartypes, datatype=c_double)
 
-    (omp_new_lbs, omp_new_ubs) = propagateFullOMP(n_vars, n_cons, nnz, col_indices, row_ptrs, coeffs, lhss, rhss, lbs_omp, ubs_omp, vartypes, datatype=c_double)
+    #(omp_new_lbs, omp_new_ubs) = propagateFullOMP(n_vars, n_cons, nnz, col_indices, row_ptrs, coeffs, lhss, rhss, lbs_omp, ubs_omp, vartypes, datatype=c_double)
 
     # (gpu_new_lbs, gpu_new_ubs) = propagateGPUReduction(n_vars, n_cons, nnz, col_indices, row_ptrs, coeffs, lhss, rhss, lbs_gpu, ubs_gpu, vartypes, datatype=datatype)
 
@@ -98,10 +98,10 @@ def exec_run(
     #  idx = 1
 
     (gpuatomic_new_lbs, gpuatomic_new_ubs) = propagateGPUAtomic(n_vars, n_cons, nnz, col_indices, row_ptrs, coeffs, lhss, rhss, lbs_gpuatomic, ubs_gpuatomic, vartypes, synctype=synctype, datatype=datatype)
-    print("")
-    eq1 = compare_results(seq_new_lbs, seq_new_ubs, omp_new_lbs, omp_new_ubs,"cpu_seq", "cpu_omp")
-    eq2 = compare_results(seq_new_lbs, seq_new_ubs, gpuatomic_new_lbs, gpuatomic_new_ubs, "cpu_seq", "gpu_atomic")
-    print("all results match: ", eq1 and eq2)
+   # print("")
+   # eq1 = compare_results(seq_new_lbs, seq_new_ubs, omp_new_lbs, omp_new_ubs,"cpu_seq", "cpu_omp")
+   # eq2 = compare_results(seq_new_lbs, seq_new_ubs, gpuatomic_new_lbs, gpuatomic_new_ubs, "cpu_seq", "gpu_atomic")
+   # print("all results match: ", eq1 and eq2)
 
 #  compare_arrays_diff_idx(seq_new_lbs, omp_new_lbs, "lbs")
 #  compare_arrays_diff_idx(seq_new_ubs, omp_new_ubs, "ubs")
diff --git a/test/testCases/test_end_to_end.cuh b/test/testCases/test_end_to_end.cuh
index 0c58430..fb9aa71 100644
--- a/test/testCases/test_end_to_end.cuh
+++ b/test/testCases/test_end_to_end.cuh
@@ -7,17 +7,17 @@ void runAllAlgsAnalyticalSol() {
    Problem<datatype> ts_gpu_r;
    Problem<datatype> ts_gpu_a;
 
-   printf("running cpu_seq");
-   tester.executeSequentialPropagator(ts_seq);
-   tester.checkSolution(ts_seq);
-
-   printf("running cpu_omp");
-   tester.executeFullOMPPropagator(ts_omp);
-   tester.checkSolution(ts_omp);
-
-   printf("running gpu_reduction");
-   tester.executeGPUReduction(ts_gpu_r);
-   tester.checkSolution(ts_gpu_r);
+//   printf("running cpu_seq");
+//   tester.executeSequentialPropagator(ts_seq);
+//   tester.checkSolution(ts_seq);
+//
+//   printf("running cpu_omp");
+//   tester.executeFullOMPPropagator(ts_omp);
+//   tester.checkSolution(ts_omp);
+//
+//   printf("running gpu_reduction");
+//   tester.executeGPUReduction(ts_gpu_r);
+//   tester.checkSolution(ts_gpu_r);
 
    printf("running gpu_atomic");
    tester.executeAtomicGPUPropagator(ts_gpu_a);