diff --git a/benchmarks/async_advection.py b/benchmarks/async_advection.py
index 16ab176a1dcb2..ba9ad7bea10c5 100644
--- a/benchmarks/async_advection.py
+++ b/benchmarks/async_advection.py
@@ -8,7 +8,7 @@
 
 @benchmark_async
 def simple_advection(scale):
-    n = 128 * 2**int((math.log(scale, 2)) // 2)
+    n = 256 * 2**int((math.log(scale, 2)) // 2)
     x = ti.Vector.field(3, dtype=ti.f32, shape=(n, n))
     new_x = ti.Vector.field(3, dtype=ti.f32, shape=(n, n))
     v = ti.Vector.field(2, dtype=ti.f32, shape=(n, n))
diff --git a/benchmarks/async_cases.py b/benchmarks/async_cases.py
index 82e32d586a64e..d6f9191a9bea4 100644
--- a/benchmarks/async_cases.py
+++ b/benchmarks/async_cases.py
@@ -165,8 +165,9 @@ def task():
 
 @benchmark_async
 def mpm_splitted(scale):
-    quality = int(scale**(1 /
-                          3))  # Use a larger value for higher-res simulations
+    quality = int(3 * scale**(1 / 3))
+    # Use a larger value for higher-res simulations
+
     n_particles, n_grid = 9000 * quality**2, 128 * quality
     dx, inv_dx = 1 / n_grid, float(n_grid)
     dt = 1e-4 / quality
@@ -294,7 +295,7 @@ def task():
         for s in range(int(2e-3 // dt)):
             substep()
 
-    ti.benchmark(task, repeat=10)
+    ti.benchmark(task, repeat=5)
 
 
 @benchmark_async
diff --git a/benchmarks/benchmark_async.py b/benchmarks/benchmark_async.py
index d69da2713774e..76f81793bc7ed 100644
--- a/benchmarks/benchmark_async.py
+++ b/benchmarks/benchmark_async.py
@@ -21,6 +21,16 @@
 
 ti.benchmark_plot(fn='benchmark.yml',
                   cases=case_names,
-                  archs=['x64', 'cuda'],
+                  columns=[
+                      'wall_clk_t', 'exec_t', 'launched_tasks',
+                      'compiled_inst', 'compiled_tasks'
+                  ],
+                  column_titles=[
+                      'Wall-clock time', 'Backend time', 'Tasks launched',
+                      'Instructions emitted', 'Tasks compiled'
+                  ],
+                  archs=['cuda', 'x64'],
+                  title='Whole-Program Optimization Microbenchmarks',
                   bars='sync_vs_async',
-                  left_margin=0.2)
+                  left_margin=0.2,
+                  size=(11.5, 9))
diff --git a/benchmarks/utils.py b/benchmarks/utils.py
index 640277b46956b..4292db52dfdcb 100644
--- a/benchmarks/utils.py
+++ b/benchmarks/utils.py
@@ -9,7 +9,10 @@ def body():
         for arch in [ti.cpu, ti.cuda]:
             for async_mode in [True, False]:
                 os.environ['TI_CURRENT_BENCHMARK'] = func.__name__
-                ti.init(arch=arch, async_mode=async_mode, kernel_profiler=True)
+                ti.init(arch=arch,
+                        async_mode=async_mode,
+                        kernel_profiler=True,
+                        verbose=False)
                 if arch == ti.cpu:
                     scale = 2
                 else:
diff --git a/python/taichi/lang/__init__.py b/python/taichi/lang/__init__.py
index e80c545acb015..596d2dfaeabce 100644
--- a/python/taichi/lang/__init__.py
+++ b/python/taichi/lang/__init__.py
@@ -371,7 +371,8 @@ def run_benchmark():
         avg = elapsed / repeat
         ti.stat_write('wall_clk_t', avg)
         device_time = ti.kernel_profiler_total_time()
-        ti.stat_write('exec_t', device_time)
+        avg_device_time = device_time / repeat
+        ti.stat_write('exec_t', avg_device_time)
 
     run_benchmark()
 
@@ -379,12 +380,14 @@
 def benchmark_plot(fn=None,
                    cases=None,
                    columns=None,
+                   column_titles=None,
                    archs=None,
                    title=None,
                    bars='sync_vs_async',
                    bar_width=0.4,
                    bar_distance=0,
-                   left_margin=0):
+                   left_margin=0,
+                   size=(12, 8)):
     import taichi as ti
     import yaml
     import matplotlib.pyplot as plt
@@ -412,13 +415,15 @@ def benchmark_plot(fn=None,
 
     if columns is None:
         columns = list(data[cases[0]].keys())
+    if column_titles is None:
+        column_titles = columns
    normalize_to_lowest = lambda x: True
     figure, subfigures = plt.subplots(len(cases), len(columns))
     if title is None:
         title = 'Taichi Performance Benchmarks (Higher means more)'
     figure.suptitle(title, fontweight="bold")
     for col_id in range(len(columns)):
-        subfigures[0][col_id].set_title(columns[col_id])
+        subfigures[0][col_id].set_title(column_titles[col_id])
     for case_id in range(len(cases)):
         case = cases[case_id]
         subfigures[case_id][0].annotate(
@@ -435,7 +440,9 @@ def benchmark_plot(fn=None,
             if archs is None:
                 current_archs = data[case][col].keys()
             else:
-                current_archs = archs & data[case][col].keys()
+                current_archs = [
+                    x for x in archs if x in data[case][col].keys()
+                ]
             if bars == 'sync_vs_async':
                 y_left = [
                     data[case][col][arch]['sync'] for arch in current_archs
@@ -480,7 +487,7 @@ def benchmark_plot(fn=None,
                               height=y_left,
                               width=bar_width,
                               label=label_left,
-                              color=(0.3, 0.7, 0.9, 1.0))
+                              color=(0.47, 0.69, 0.89, 1.0))
             bar_right = ax.bar(x=[
                 i + bar_width / 2 + bar_distance / 2
                 for i in range(len(current_archs))
             ],
@@ -488,7 +495,7 @@ def benchmark_plot(fn=None,
                               height=y_right,
                               width=bar_width,
                               label=label_right,
-                              color=(0.8, 0.2, 0.3, 1.0))
+                              color=(0.68, 0.26, 0.31, 1.0))
             ax.set_xticks(range(len(current_archs)))
             ax.set_xticklabels(current_archs)
             figure.legend((bar_left, bar_right), (label_left, label_right),
@@ -496,7 +503,7 @@ def benchmark_plot(fn=None,
     figure.subplots_adjust(left=left_margin)
 
     fig = plt.gcf()
-    fig.set_size_inches(13, 8)
+    fig.set_size_inches(size)
 
     plt.show()
diff --git a/taichi/program/compile_config.h b/taichi/program/compile_config.h
index 59076abfc91c9..d978de3a9f677 100644
--- a/taichi/program/compile_config.h
+++ b/taichi/program/compile_config.h
@@ -70,7 +70,8 @@ struct CompileConfig {
   bool async_opt_dse{true};
   bool async_listgen_fast_filtering{true};
   std::string async_opt_intermediate_file;
-  int async_flush_every{0};
+  // Setting 0 effectively means do not automatically flush
+  int async_flush_every{50};
 
   // Setting 0 effectively means unlimited
   int async_max_fuse_per_task{1};
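For reviewers trying the patch locally, the pieces above compose as sketched below. This is a minimal, hypothetical usage sketch, not code from this PR: it assumes `benchmark_async` returns the no-argument `body` wrapper shown in benchmarks/utils.py, that `ti.stat_write` persists per-case stats to benchmark.yml as the `fn=` argument implies, and that `toy_case` is a placeholder standing in for a real case such as `simple_advection`.

```python
import taichi as ti

from utils import benchmark_async  # benchmarks/utils.py


@benchmark_async
def toy_case(scale):  # hypothetical case; `scale` is supplied by the wrapper
    n = 256 * scale
    x = ti.field(dtype=ti.f32, shape=n)

    @ti.kernel
    def double():
        for i in x:
            x[i] *= 2

    def task():
        double()

    # Records 'wall_clk_t' and the now per-repeat-averaged 'exec_t'.
    ti.benchmark(task, repeat=5)


toy_case()  # sweeps {x64, cuda} x {async, sync}, once per combination

ti.benchmark_plot(fn='benchmark.yml',
                  cases=['toy_case'],
                  columns=['wall_clk_t', 'exec_t'],
                  column_titles=['Wall-clock time', 'Backend time'],
                  archs=['cuda', 'x64'],
                  bars='sync_vs_async',
                  size=(11.5, 9))
```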