diff --git a/benchmarks/mpm2d.py b/benchmarks/mpm2d.py
index c88a747513c5a..8f52f59fb7d70 100644
--- a/benchmarks/mpm2d.py
+++ b/benchmarks/mpm2d.py
@@ -117,21 +117,7 @@ def substep():
         F[i] = [[1, 0], [0, 1]]
         Jp[i] = 1
 
-    compile_time = time.time()
-    substep()
-    compile_time = time.time() - compile_time
-    ti.stat_write_yaml('compilation_time(s)', compile_time)
-    ti.get_runtime().sync()
-    t = time.time()
-    for frame in range(200):
-        for s in range(20):
-            substep()
-        # colors = np.array([0x068587, 0xED553B, 0xEEEEF0], dtype=np.uint32)
-        # gui.circles(x.to_numpy(), radius=1.5, color=colors[material.to_numpy()])
-        # gui.show() # Change to gui.show(f'{frame:06d}.png') to write images to disk
-    ti.get_runtime().sync()
-    avg = (time.time() - t) / 4000 * 1000  # miliseconds
-    ti.stat_write_yaml('running_time(ms)', avg)
+    ti.benchmark(substep, repeat=4000)  # 200 frames x 20 substeps
 
 
 @ti.archs_excluding(ti.opengl)
@@ -250,18 +236,4 @@ def substep():
         F[i] = [[1, 0], [0, 1]]
         Jp[i] = 1
 
-    compile_time = time.time()
-    substep()
-    compile_time = time.time() - compile_time
-    ti.stat_write_yaml('compilation_time(s)', compile_time)
-    ti.get_runtime().sync()
-    t = time.time()
-    for frame in range(200):
-        for s in range(20):
-            substep()
-        # colors = np.array([0x068587, 0xED553B, 0xEEEEF0], dtype=np.uint32)
-        # gui.circles(x.to_numpy(), radius=1.5, color=colors[material.to_numpy()])
-        # gui.show() # Change to gui.show(f'{frame:06d}.png') to write images to disk
-    ti.get_runtime().sync()
-    avg = (time.time() - t) / 4000 * 1000  # miliseconds
-    ti.stat_write_yaml('running_time(ms)', avg)
+    ti.benchmark(substep, repeat=4000)  # 200 frames x 20 substeps
diff --git a/python/taichi/lang/__init__.py b/python/taichi/lang/__init__.py
index 5d9f0b4b4ca6f..71191f9f98002 100644
--- a/python/taichi/lang/__init__.py
+++ b/python/taichi/lang/__init__.py
@@ -321,25 +321,47 @@ def visit(node):
 def benchmark(func, repeat=300, args=()):
     import taichi as ti
     import time
-    compile_time = time.time()
-    func(*args)
-    compile_time = time.time() - compile_time
-    ti.stat_write_yaml('compilation_time(s)', compile_time)
-    # The reason why we run 4 times is to warm up instruction/data caches.
-    # Discussion: https://github.com/taichi-dev/taichi/pull/1002#discussion_r426312136
-    for i in range(4):
-        func(*args)  # compile the kernel first
-    ti.sync()
-    t = time.time()
-    for n in range(repeat):
-        func(*args)
-    ti.get_runtime().sync()
-    elapsed = time.time() - t
-    avg = elapsed / repeat * 1000  # miliseconds
-    ti.stat_write_yaml('running_time(ms)', avg)
-
 
-def stat_write_yaml(key, value):
+    def run_benchmark():
+        """Time func under the current ti.cfg.async_mode and record stats."""
+        compile_time = time.time()
+        func(*args)
+        compile_time = time.time() - compile_time
+        ti.stat_write('compilation_time', compile_time)
+        for line in ti.core.stat().split('\n'):
+            try:
+                a, b = line.strip().split(':')
+                b = int(float(b))
+            except (ValueError, OverflowError):
+                continue  # skip lines that are not one "name: number" pair
+            a = a.strip()
+            if a == 'codegen_kernel_statements':
+                ti.stat_write('instructions', b)
+            elif a == 'codegen_offloaded_tasks':
+                ti.stat_write('offloaded_tasks', b)
+            elif a == 'launched_kernels':
+                ti.stat_write('launched_kernels', b)
+        # The reason why we run 4 times is to warm up instruction/data caches.
+        # Discussion: https://github.com/taichi-dev/taichi/pull/1002#discussion_r426312136
+        for i in range(4):
+            func(*args)  # warm-up run; already compiled by the call above
+        ti.sync()
+        t = time.time()
+        for n in range(repeat):
+            func(*args)
+        ti.get_runtime().sync()
+        avg = (time.time() - t) / repeat  # seconds per call
+        ti.stat_write('running_time', avg)
+
+    # Sync run first, then async if supported; async_mode is not restored.
+    ti.cfg.async_mode = False
+    run_benchmark()
+    if ti.is_extension_supported(ti.cfg.arch, ti.extension.async_mode):
+        ti.cfg.async_mode = True
+        run_benchmark()
+
+
+def stat_write(key, value):
     import taichi as ti
     import yaml
     case_name = os.environ.get('TI_CURRENT_BENCHMARK')
@@ -348,6 +370,7 @@ def stat_write_yaml(key, value):
     if case_name.startswith('benchmark_'):
         case_name = case_name[10:]
     arch_name = core.arch_name(ti.cfg.arch)
+    async_mode = 'async' if ti.cfg.async_mode else 'sync'
     output_dir = os.environ.get('TI_BENCHMARK_OUTPUT_DIR', '.')
     filename = f'{output_dir}/benchmark.yml'
     try:
@@ -357,7 +380,8 @@ def stat_write_yaml(key, value):
         data = {}
     data.setdefault(key, {})
     data[key].setdefault(case_name, {})
-    data[key][case_name][arch_name] = value
+    data[key][case_name].setdefault(async_mode, {})
+    data[key][case_name][async_mode][arch_name] = value
     with open(filename, 'w') as f:
         yaml.dump(data, f, Dumper=yaml.SafeDumper)
 
diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp
index 90f44e487708e..8b44712a95de6 100644
--- a/taichi/python/export_lang.cpp
+++ b/taichi/python/export_lang.cpp
@@ -595,6 +595,11 @@ void export_lang(py::module &m) {
   m.def("is_extension_supported", is_extension_supported);
 
   m.def("print_stat", [] { stat.print(); });
+  m.def("stat", []() -> std::string {  // returns the text stat.print() fills
+    std::string report;
+    stat.print(&report);
+    return report;
+  });
 
   m.def("record_action_hint", [](std::string content) {
     ActionRecorder::get_instance().record("hint",