feat: improved proof replay evaluation scripts (#28)

* feat: script on selected 10 egg tests * feat: script to build egg binary with stop-on-success * fix: align `gp4` test * feat: constant expressions have term size 1 * fix: revert size calculation * style: remove random check * feat: another Python script for data extraction * fix: directory in Lean-pre-DCP evaluation script * feat: save results in CSV file
verified-optimization · Apr 8, 2024 · f301364 · f301364
1 parent c11be2c
commit f301364
Show file tree

Hide file tree

Showing 7 changed files with 149 additions and 11 deletions.
diff --git a/CvxLean/Test/PreDCP/MainExample.lean b/CvxLean/Test/PreDCP/MainExample.lean
@@ -23,8 +23,6 @@ def p :=
 time_cmd equivalence eq/q : p := by
   pre_dcp
 
-#check congrArg
-
 #print q
 -- optimization (x : ℝ)
 --   minimize x

diff --git a/egg-pre-dcp/tests/test_dgp.rs b/egg-pre-dcp/tests/test_dgp.rs
@@ -36,12 +36,12 @@ fn test_gp3() {
 #[test]
 fn test_gp4() {
     pre_dcp_check_and_print(
-        "(div 1 (div (exp (var u)) (exp (var v))))",
+        "(div (exp (var u)) (exp (var v)))",
         vec![
             "(le 2 (exp (var u)))",
             "(le (exp (var u)) 3)",
-            "(le (add (pow (exp (var u)) 2) (div (mul 3 (exp (var v))) (exp (var w)))) (sqrt (exp (var u))))",
-            "(eq (div (exp (var u)) (exp (var v))) (pow (exp (var w)) 2))"
+            "(le (add (pow (exp (var u)) 2) (div (mul 6 (exp (var v))) (exp (var w)))) (sqrt (exp (var u))))",
+            "(eq (mul (exp (var u)) (exp (var v))) (exp (var w)))"
         ]);
 }
 

diff --git a/scripts/evaluation/lean-pre-dcp/build_egg_pre_dcp_stop_on_success.sh b/scripts/evaluation/lean-pre-dcp/build_egg_pre_dcp_stop_on_success.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+RUSTFLAGS="--cfg stop_on_success"  cargo build --release --manifest-path egg-pre-dcp/Cargo.toml
+mkdir -p egg-pre-dcp/utils
+cp egg-pre-dcp/target/release/egg-pre-dcp egg-pre-dcp/utils/egg-pre-dcp
diff --git a/scripts/evaluation/lean-pre-dcp/extract_stats.py b/scripts/evaluation/lean-pre-dcp/extract_stats.py
@@ -3,6 +3,7 @@
 import matplotlib.pyplot as plt
 import numpy as np
 import heapq
+import csv
 
 def extract_stats_from_file(file_path):
     with open(file_path, 'r') as file:
@@ -167,6 +168,17 @@ def plot_stats1(fn, xs, xs_label, ys, ys_label):
 command_times_per_step = np.array([0 if numbers_of_steps[i] == 0 else command_times[i] / numbers_of_steps[i] for i in range(len(command_times))])
 print('Average command time per step: ', np.average(command_times_per_step))
 
+results = [(problem_names[i], egg_times[i], command_times[i]) for i in range(len(problem_names))]
+
+output_file = eval_dir + "/pre_dcp_eval_results.csv"
+
+with open(output_file, 'w', newline='') as file:
+    writer = csv.writer(file)
+    writer.writerow(["problem_name", "egg_time", "command_time"]) 
+    writer.writerows(results)
+
+print("Results saved to:", output_file)
+
 # Remove some outliers.
 to_remove = 8
 idxs_to_remove = heapq.nlargest(to_remove, range(len(command_times)), key=command_times.__getitem__)

diff --git a/scripts/evaluation/lean-pre-dcp/extract_stats_selected_10.py b/scripts/evaluation/lean-pre-dcp/extract_stats_selected_10.py
@@ -0,0 +1,109 @@
+import sys
+import re
+
+def extract_test_results(file_path):
+    test_results = {}
+
+    with open(file_path, 'r') as file:
+        data = file.read()
+
+        # Define regex patterns.
+        test_name_pattern = r'test_(\w+)::(\w+)'
+        total_time_pattern = r'Total time: (\d+) ms\.'
+        e_graph_time_pattern = r'E-graph building time: (\d+) ms\.'
+        step_extraction_time_pattern = r'Step extraction time: (\d+) ms\.'
+        num_of_nodes_pattern = r'Succeeded with node limit (\d+) \(using (\d+) nodes\).'
+        num_steps_pattern = r'Number of steps: (\d+)\.'
+        best_term_size_pattern = r'Best term size: (\d+)\.'
+        best_num_variables_pattern = r'Best number of variables: (\d+)\.'
+        num_of_iterations = r'Number of iterations: (\d+)\.'
+        num_of_rules_applied = r'Number of rules applied: (\d+)\.'
+
+        failure_pattern = r'failures:(.*?)test_(\w+)'
+
+        # Extract test names and corresponding total times or failure status.
+        test_names = re.findall(test_name_pattern, data)
+        test_names = [x[1] for x in test_names]
+        total_times = re.findall(total_time_pattern, data)
+        e_graph_times = re.findall(e_graph_time_pattern, data)
+        step_extraction_times = re.findall(step_extraction_time_pattern, data)
+        num_of_nodes = re.findall(num_of_nodes_pattern, data)
+        num_of_nodes = [int(x[1]) for x in num_of_nodes]
+        num_steps = re.findall(num_steps_pattern, data)
+        best_term_sizes = re.findall(best_term_size_pattern, data)
+        best_num_variables = re.findall(best_num_variables_pattern, data)
+        num_of_iterations = re.findall(num_of_iterations, data)
+        num_of_rules_applied = re.findall(num_of_rules_applied, data)
+
+        failures = re.findall(failure_pattern, data, re.DOTALL)
+        failed_tests = ["test_" + test[1] for test in failures]
+
+        print(num_of_nodes)
+        print(len(test_names), len(total_times))
+        assert len(test_names) == len(total_times) + len(failed_tests)
+        assert len(total_times) == len(e_graph_times)
+        assert len(total_times) == len(step_extraction_times)
+        assert len(total_times) == len(num_of_nodes)
+        assert len(total_times) == len(num_steps)
+        assert len(total_times) == len(best_term_sizes)
+        assert len(total_times) == len(best_num_variables)
+        assert len(total_times) == len(num_of_iterations)
+        assert len(total_times) == len(num_of_rules_applied)
+
+        # Match test names with their total times or failure status.
+        i = 0
+        for test_name in test_names:
+            if test_name in failed_tests:
+                test_results[test_name] = {
+                    'total_time': None, 
+                    'e_graph_time': None, 
+                    'step_extraction_time': None,
+                    'num_of_nodes': None,
+                    'steps': None,
+                    'term_size': None,
+                    'num_variables': None,
+                    'num_of_iterations': None,
+                    'num_of_rules_applied': None
+                }
+            else:
+                test_results[test_name] = {
+                    'total_time': int(total_times[i]),
+                    'e_graph_time': int(e_graph_times[i]),
+                    'step_extraction_time': int(step_extraction_times[i]),
+                    'num_of_nodes': num_of_nodes[i],
+                    'steps': int(num_steps[i]),
+                    'term_size': int(best_term_sizes[i]),
+                    'num_variables': int(best_num_variables[i]),
+                    'num_of_iterations': int(num_of_iterations[i]),
+                    'num_of_rules_applied': int(num_of_rules_applied[i])
+                }
+                i += 1
+
+    return test_results
+
+file_path = 'scripts/evaluation/data/' + sys.argv[1]
+results = extract_test_results(file_path)
+
+benchmark_path = 'scripts/evaluation/egg-pre-dcp-options/benchmark.txt'
+benchmark = []
+with open(benchmark_path, 'r') as file:
+    benchmark = file.readlines()
+
+with open('scripts/evaluation/data/summary.csv', 'w') as file:
+    file.write("name,term_size_after,num_nodes,num_steps,num_of_iterations,num_of_rules_applied\n")
+    for prob in benchmark:
+        prob = "test_" + prob.strip()
+        if prob not in results:
+            continue
+        prob_results = results[prob]
+        values = [prob[5:]]
+        values += ["te", "tl", "sb"]
+        values += ["-" if prob_results["term_size"] is None else str(prob_results["term_size"])]
+        values += ["-" if prob_results["num_of_nodes"] is None else str(prob_results["num_of_nodes"])]
+        values += ["-" if prob_results["steps"] is None else str(prob_results["steps"])]
+        values += ["-" if prob_results["num_of_iterations"] is None else str(prob_results["num_of_iterations"])]
+        values += ["-" if prob_results["num_of_rules_applied"] is None else str(prob_results["num_of_rules_applied"])]
+        file.write(','.join(values) + '\n')
+        # To copy-paste into LaTeX.
+        print(' & '.join(values) + ' \\\\')
+        print('\\hline')
diff --git a/scripts/evaluation/lean-pre-dcp/run_all.sh b/scripts/evaluation/lean-pre-dcp/run_all.sh
@@ -2,10 +2,10 @@
 
 OUT=$1
 
-mkdir -p ./evaluation/data/$OUT
+mkdir -p ./scripts/evaluation/data/$OUT
 
-./scripts/evaluation/lean-pre-dcp/run.sh ./evaluation/data/$OUT/lean_pre_dcp_test_out_1.txt;
-./scripts/evaluation/lean-pre-dcp/run.sh ./evaluation/data/$OUT/lean_pre_dcp_test_out_2.txt;
-./scripts/evaluation/lean-pre-dcp/run.sh ./evaluation/data/$OUT/lean_pre_dcp_test_out_3.txt;
-./scripts/evaluation/lean-pre-dcp/run.sh ./evaluation/data/$OUT/lean_pre_dcp_test_out_4.txt; 
-./scripts/evaluation/lean-pre-dcp/run.sh ./evaluation/data/$OUT/lean_pre_dcp_test_out_5.txt 
+./scripts/evaluation/lean-pre-dcp/run.sh ./scripts/evaluation/data/$OUT/lean_pre_dcp_test_out_1.txt;
+./scripts/evaluation/lean-pre-dcp/run.sh ./scripts/evaluation/data/$OUT/lean_pre_dcp_test_out_2.txt;
+./scripts/evaluation/lean-pre-dcp/run.sh ./scripts/evaluation/data/$OUT/lean_pre_dcp_test_out_3.txt;
+./scripts/evaluation/lean-pre-dcp/run.sh ./scripts/evaluation/data/$OUT/lean_pre_dcp_test_out_4.txt; 
+./scripts/evaluation/lean-pre-dcp/run.sh ./scripts/evaluation/data/$OUT/lean_pre_dcp_test_out_5.txt 
diff --git a/scripts/evaluation/lean-pre-dcp/run_selected_10_stop_on_success.sh b/scripts/evaluation/lean-pre-dcp/run_selected_10_stop_on_success.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+cd egg-pre-dcp
+
+RUSTFLAGS="--cfg stop_on_success" cargo test test_dgp::test_gp4 -- --test-threads=1 --nocapture
+RUSTFLAGS="--cfg stop_on_success" cargo test test_dgp::test_gp5 -- --test-threads=1 --nocapture
+RUSTFLAGS="--cfg stop_on_success" cargo test test_dgp::test_gp8 -- --test-threads=1 --nocapture
+RUSTFLAGS="--cfg stop_on_success" cargo test test_dgp::test_gp9 -- --test-threads=1 --nocapture
+RUSTFLAGS="--cfg stop_on_success" cargo test test_almost_dgp::test_agp3 -- --test-threads=1 --nocapture
+RUSTFLAGS="--cfg stop_on_success" cargo test test_dqcp::test_qcp4 -- --test-threads=1 --nocapture
+RUSTFLAGS="--cfg stop_on_success" cargo test test_stanford::test_stan3 -- --test-threads=1 --nocapture
+RUSTFLAGS="--cfg stop_on_success" cargo test test_stanford::test_stan4 -- --test-threads=1 --nocapture
+RUSTFLAGS="--cfg stop_on_success" cargo test test_quiz::test_quiz9 -- --test-threads=1 --nocapture
+RUSTFLAGS="--cfg stop_on_success" cargo test test_main_example::test_main_example -- --test-threads=1 --nocapture