From baf40455516a3a33a20bb7728d8acfed8a740e4d Mon Sep 17 00:00:00 2001
From: Xiaozhu Meng
Date: Tue, 16 Jul 2024 15:44:08 -0700
Subject: [PATCH] Add rotating buffer feature to quantize_bench

Summary: On AMD, a rotating buffer gives benchmarking results closer to E2E runs

Reviewed By: xw285cornell

Differential Revision: D59828276
---
 .../gen_ai/bench/quantize_bench.py            | 26 ++++++++--
 .../experimental/gen_ai/bench/quantize_ops.py | 51 +++++++++++++++++--
 2 files changed, 71 insertions(+), 6 deletions(-)

diff --git a/fbgemm_gpu/experimental/gen_ai/bench/quantize_bench.py b/fbgemm_gpu/experimental/gen_ai/bench/quantize_bench.py
index c3c802769d..780c01de4b 100644
--- a/fbgemm_gpu/experimental/gen_ai/bench/quantize_bench.py
+++ b/fbgemm_gpu/experimental/gen_ai/bench/quantize_bench.py
@@ -37,6 +37,7 @@ def benchmark(
     k: int,
     kernels: Optional[List[str]] = None,
     bench_quantize: bool = False,
+    use_rotating_buffer_bench: bool = False,
 ) -> Dict[str, Any]:
     # Create input tensors.
     A = torch.randn(m, k, device="cuda", dtype=torch.bfloat16)
@@ -63,10 +64,17 @@ def benchmark(
     # Now perform benchmark.
     if bench_quantize:
         # Benchmark both quantize and compute.
-        ms_runtime = quantize_op.benchmark(A, B, bench_quantize=True)
+        ms_runtime = quantize_op.benchmark(
+            A,
+            B,
+            bench_quantize=True,
+            use_rotating_buffer_bench=use_rotating_buffer_bench,
+        )
     else:
         ms_runtime = quantize_op.benchmark(
-            *quantized_vals, bench_quantize=False
+            *quantized_vals,
+            bench_quantize=False,
+            use_rotating_buffer_bench=use_rotating_buffer_bench,
         )
 
     # Print out results for this op.
@@ -137,7 +145,13 @@ def main(args: Any):
     for m, n, k in MNK:
         print(f"Benchmarking M={m}, N={n}, K={k}.")
         quantize_measurements = benchmark(
-            quantize_ops, m, n, k, kernels, args.bench_quantize
+            quantize_ops,
+            m,
+            n,
+            k,
+            kernels,
+            args.bench_quantize,
+            args.use_rotating_buffer_bench,
         )
         benchmark_results.append(quantize_measurements)
     if args.export_csv:
@@ -189,6 +203,12 @@ def invoke_main() -> None:
     parser.add_argument(
         "--K", default=None, help="Comma separated list of K values to benchmark."
     )
+    parser.add_argument(
+        "--use_rotating_buffer_bench",
+        default=False,
+        action="store_true",
+        help="If set, use rotating buffer to benchmark.",
+    )
 
     args = parser.parse_args()
     main(args)
diff --git a/fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py b/fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py
index f994224a01..28c3211f5e 100644
--- a/fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py
+++ b/fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py
@@ -41,12 +41,57 @@ def quantize_and_compute(self, *args):
         """Function which quantizes inputs and performs main compute operation."""
         pass
 
-    def benchmark(self, *args, bench_quantize: bool = False) -> float:
+    def bench_with_rotating_buffer(self, fn, args):
+        import copy
+        import pickle
+
+        # torch.cuda.get_device_properties does not expose the L2 cache size,
+        # so hard-code an overapproximation of it to ensure the L2 cache is flushed
+        total_buffer_size = 16 * 1024 * 1024
+
+        # Use pickle to serialize the inputs and estimate their total size
+        input_sizes = len(pickle.dumps(args))
+
+        # Make at least one copy of the inputs
+        copy_cnt = total_buffer_size // input_sizes
+        if copy_cnt == 0:
+            copy_cnt = 1
+
+        args_list = [args]
+        for _ in range(copy_cnt):
+            args_list.append(copy.deepcopy(args))
+
+        def rotating_buffer_fn(fn, args_list, copy_cnt):
+            for i in range(copy_cnt):
+                fn(*(args_list[i]))
+
+        with torch.cuda.stream(torch.cuda.Stream()):
+            # One rotating_buffer_fn call runs fn once per input copy,
+            # so divide the measured time accordingly
+            return triton.testing.do_bench_cudagraph(
+                lambda: rotating_buffer_fn(fn, args_list, copy_cnt + 1),
+                rep=500,
+            ) / (copy_cnt + 1)
+
+    def benchmark(
+        self,
+        *args,
+        bench_quantize: bool = False,
+        use_rotating_buffer_bench: bool = False,
+    ) -> float:
         """Benchmark runtime of this operator."""
         if bench_quantize:
-            return triton.testing.do_bench(lambda: self.quantize_and_compute(*args))
+            with torch.cuda.stream(torch.cuda.Stream()):
+                t = triton.testing.do_bench_cudagraph(
+                    lambda: self.quantize_and_compute(*args)
+                )
         else:
-            return triton.testing.do_bench(lambda: self.compute(*args))
+            if use_rotating_buffer_bench:
+                t = self.bench_with_rotating_buffer(self.compute, args)
+            else:
+                with torch.cuda.stream(torch.cuda.Stream()):
+                    t = triton.testing.do_bench_cudagraph(lambda: self.compute(*args))
+        return t
 
     @abc.abstractproperty
     def name(self) -> str:
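
Note on the technique: the rotating-buffer helper replicates the benchmark
inputs until they exceed a hard-coded 16 MiB overapproximation of the L2
cache, times one pass over all copies under a CUDA graph, and divides by the
copy count, so every timed call reads cold inputs rather than L2-resident
ones. Below is a minimal standalone sketch of the same idea for reference;
the `bench_rotating` name, the use of `torch.matmul`, and the tensor shapes
are illustrative assumptions, not part of this patch:

    import copy
    import pickle

    import torch
    import triton

    def bench_rotating(fn, *args, total_buffer_size: int = 16 * 1024 * 1024) -> float:
        # Replicate the inputs until they exceed an overapproximation of the
        # L2 cache so that every timed call reads from a cold buffer.
        copy_cnt = max(total_buffer_size // len(pickle.dumps(args)), 1)
        args_list = [args] + [copy.deepcopy(args) for _ in range(copy_cnt)]

        def rotate():
            # One rotation runs fn once per input copy.
            for a in args_list:
                fn(*a)

        # CUDA graph capture requires a non-default stream; do_bench_cudagraph
        # times a full rotation, so divide to recover per-call milliseconds.
        with torch.cuda.stream(torch.cuda.Stream()):
            return triton.testing.do_bench_cudagraph(rotate, rep=500) / len(args_list)

    # Example usage (hypothetical shapes): a bf16 GEMM with cold-cache inputs.
    A = torch.randn(4096, 4096, device="cuda", dtype=torch.bfloat16)
    B = torch.randn(4096, 4096, device="cuda", dtype=torch.bfloat16)
    print(f"{bench_rotating(torch.matmul, A, B):.3f} ms per call")

With the patch applied, the same behavior is enabled in quantize_bench by
passing the --use_rotating_buffer_bench flag.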