diff --git a/benchmarks/async_advection.py b/benchmarks/async_advection.py
index 16ab176a1dcb2..ba9ad7bea10c5 100644
--- a/benchmarks/async_advection.py
+++ b/benchmarks/async_advection.py
@@ -8,7 +8,7 @@
 
 @benchmark_async
 def simple_advection(scale):
-    n = 128 * 2**int((math.log(scale, 2)) // 2)
+    n = 256 * 2**int((math.log(scale, 2)) // 2)
     x = ti.Vector.field(3, dtype=ti.f32, shape=(n, n))
     new_x = ti.Vector.field(3, dtype=ti.f32, shape=(n, n))
     v = ti.Vector.field(2, dtype=ti.f32, shape=(n, n))
diff --git a/benchmarks/async_cases.py b/benchmarks/async_cases.py
index 82e32d586a64e..d6f9191a9bea4 100644
--- a/benchmarks/async_cases.py
+++ b/benchmarks/async_cases.py
@@ -165,8 +165,9 @@ def task():
 
 @benchmark_async
 def mpm_splitted(scale):
-    quality = int(scale**(1 /
-                          3))  # Use a larger value for higher-res simulations
+    quality = int(3 * scale**(1 / 3))
+    # Use a larger value for higher-res simulations
+
     n_particles, n_grid = 9000 * quality**2, 128 * quality
     dx, inv_dx = 1 / n_grid, float(n_grid)
     dt = 1e-4 / quality
@@ -294,7 +295,7 @@ def task():
         for s in range(int(2e-3 // dt)):
             substep()
 
-    ti.benchmark(task, repeat=10)
+    ti.benchmark(task, repeat=5)
 
 
 @benchmark_async
diff --git a/benchmarks/benchmark_async.py b/benchmarks/benchmark_async.py
index d69da2713774e..76f81793bc7ed 100644
--- a/benchmarks/benchmark_async.py
+++ b/benchmarks/benchmark_async.py
@@ -21,6 +21,16 @@
 
 ti.benchmark_plot(fn='benchmark.yml',
                   cases=case_names,
-                  archs=['x64', 'cuda'],
+                  columns=[
+                      'wall_clk_t', 'exec_t', 'launched_tasks',
+                      'compiled_inst', 'compiled_tasks'
+                  ],
+                  column_titles=[
+                      'Wall-clock time', 'Backend time', 'Tasks launched',
+                      'Instructions emitted', 'Tasks compiled'
+                  ],
+                  archs=['cuda', 'x64'],
+                  title='Whole-Program Optimization Microbenchmarks',
                   bars='sync_vs_async',
-                  left_margin=0.2)
+                  left_margin=0.2,
+                  size=(11.5, 9))
diff --git a/benchmarks/utils.py b/benchmarks/utils.py
index 640277b46956b..4292db52dfdcb 100644
--- a/benchmarks/utils.py
+++ b/benchmarks/utils.py
@@ -9,7 +9,10 @@ def body():
         for arch in [ti.cpu, ti.cuda]:
             for async_mode in [True, False]:
                 os.environ['TI_CURRENT_BENCHMARK'] = func.__name__
-                ti.init(arch=arch, async_mode=async_mode, kernel_profiler=True)
+                ti.init(arch=arch,
+                        async_mode=async_mode,
+                        kernel_profiler=True,
+                        verbose=False)
                 if arch == ti.cpu:
                     scale = 2
                 else:
diff --git a/python/taichi/lang/__init__.py b/python/taichi/lang/__init__.py
index e80c545acb015..596d2dfaeabce 100644
--- a/python/taichi/lang/__init__.py
+++ b/python/taichi/lang/__init__.py
@@ -371,7 +371,8 @@ def run_benchmark():
         avg = elapsed / repeat
         ti.stat_write('wall_clk_t', avg)
         device_time = ti.kernel_profiler_total_time()
-        ti.stat_write('exec_t', device_time)
+        avg_device_time = device_time / repeat
+        ti.stat_write('exec_t', avg_device_time)
 
     run_benchmark()
 
@@ -379,12 +380,14 @@
 def benchmark_plot(fn=None,
                    cases=None,
                    columns=None,
+                   column_titles=None,
                    archs=None,
                    title=None,
                    bars='sync_vs_async',
                    bar_width=0.4,
                    bar_distance=0,
-                   left_margin=0):
+                   left_margin=0,
+                   size=(12, 8)):
     import taichi as ti
     import yaml
     import matplotlib.pyplot as plt
@@ -412,13 +415,15 @@ def benchmark_plot(fn=None,
 
     if columns is None:
         columns = list(data[cases[0]].keys())
+    if column_titles is None:
+        column_titles = columns
    normalize_to_lowest = lambda x: True
     figure, subfigures = plt.subplots(len(cases), len(columns))
     if title is None:
         title = 'Taichi Performance Benchmarks (Higher means more)'
     figure.suptitle(title, fontweight="bold")
     for col_id in range(len(columns)):
-        subfigures[0][col_id].set_title(columns[col_id])
+        subfigures[0][col_id].set_title(column_titles[col_id])
     for case_id in range(len(cases)):
         case = cases[case_id]
         subfigures[case_id][0].annotate(
@@ -435,7 +440,9 @@ def benchmark_plot(fn=None,
             if archs is None:
                 current_archs = data[case][col].keys()
             else:
-                current_archs = archs & data[case][col].keys()
+                current_archs = [
+                    x for x in archs if x in data[case][col].keys()
+                ]
             if bars == 'sync_vs_async':
                 y_left = [
                     data[case][col][arch]['sync'] for arch in current_archs
@@ -480,7 +487,7 @@ def benchmark_plot(fn=None,
                               height=y_left,
                               width=bar_width,
                               label=label_left,
-                              color=(0.3, 0.7, 0.9, 1.0))
+                              color=(0.47, 0.69, 0.89, 1.0))
             bar_right = ax.bar(x=[
                 i + bar_width / 2 + bar_distance / 2
                 for i in range(len(current_archs))
             ],
@@ -488,7 +495,7 @@ def benchmark_plot(fn=None,
                               height=y_right,
                               width=bar_width,
                               label=label_right,
-                              color=(0.8, 0.2, 0.3, 1.0))
+                              color=(0.68, 0.26, 0.31, 1.0))
             ax.set_xticks(range(len(current_archs)))
             ax.set_xticklabels(current_archs)
             figure.legend((bar_left, bar_right), (label_left, label_right),
@@ -496,7 +503,7 @@ def benchmark_plot(fn=None,
     figure.subplots_adjust(left=left_margin)
 
     fig = plt.gcf()
-    fig.set_size_inches(13, 8)
+    fig.set_size_inches(size)
 
     plt.show()
diff --git a/taichi/program/compile_config.h b/taichi/program/compile_config.h
index 59076abfc91c9..d978de3a9f677 100644
--- a/taichi/program/compile_config.h
+++ b/taichi/program/compile_config.h
@@ -70,7 +70,8 @@ struct CompileConfig {
   bool async_opt_dse{true};
   bool async_listgen_fast_filtering{true};
   std::string async_opt_intermediate_file;
-  int async_flush_every{0};
+  // Setting 0 effectively means do not automatically flush
+  int async_flush_every{50};
 
   // Setting 0 effectively means unlimited
   int async_max_fuse_per_task{1};
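For reviewers trying the patch locally, the pieces above compose as sketched below. This is a minimal, hypothetical usage sketch, not code from this PR: it assumes `benchmark_async` returns the no-argument `body` wrapper shown in benchmarks/utils.py, that `ti.stat_write` persists per-case stats to benchmark.yml as the `fn=` argument implies, and that `toy_case` is a placeholder standing in for a real case such as `simple_advection`.

```python
import taichi as ti

from utils import benchmark_async  # benchmarks/utils.py


@benchmark_async
def toy_case(scale):  # hypothetical case; `scale` is supplied by the wrapper
    n = 256 * scale
    x = ti.field(dtype=ti.f32, shape=n)

    @ti.kernel
    def double():
        for i in x:
            x[i] *= 2

    def task():
        double()

    # Records 'wall_clk_t' and the now per-repeat-averaged 'exec_t'.
    ti.benchmark(task, repeat=5)


toy_case()  # sweeps {x64, cuda} x {async, sync}, once per combination

ti.benchmark_plot(fn='benchmark.yml',
                  cases=['toy_case'],
                  columns=['wall_clk_t', 'exec_t'],
                  column_titles=['Wall-clock time', 'Backend time'],
                  archs=['cuda', 'x64'],
                  bars='sync_vs_async',
                  size=(11.5, 9))
```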