From 086e674ee96862e326603a8fe2e42f2aa461b958 Mon Sep 17 00:00:00 2001 From: Shreya Gaur <48754356+Shreya-gaur@users.noreply.github.com> Date: Mon, 24 Jun 2024 13:52:54 -0700 Subject: [PATCH] cutlass update (#242) * cutlass: Added cutlass 3.0 to gpu-app-collection This commit contains the changes in the Makefile and the define-all-apps file. * Commit changes to cutlass app * Changes for cutlass in define-all-apps * Changes to define-power.yml for cutlass 3 * Comment change of cutlass --------- Co-authored-by: JRPan <25518778+JRPan@users.noreply.github.com> Co-authored-by: Tim Rogers Co-authored-by: WilliamMTK --- util/job_launching/apps/define-all-apps.yml | 92 +++++++++++---------- util/job_launching/apps/define-power.yml | 8 +- 2 files changed, 54 insertions(+), 46 deletions(-) diff --git a/util/job_launching/apps/define-all-apps.yml b/util/job_launching/apps/define-all-apps.yml index b65b6a509..cc3e68d45 100644 --- a/util/job_launching/apps/define-all-apps.yml +++ b/util/job_launching/apps/define-all-apps.yml @@ -511,51 +511,59 @@ cutlass_5_trace: exec_dir: "$GPUAPPS_ROOT/bin/$CUDA_VERSION/release/" data_dirs: "$GPUAPPS_ROOT/data_dirs/" execs: - - cutlass_perf_test: - - args: --seed=2020 --dist=0 --m=2560 --n=16 --k=2560 --kernels=wmma_gemm_nn --iterations=5 --providers=cutlass - accel-sim-mem: 3G - - args: --seed=2020 --dist=0 --m=2560 --n=32 --k=2560 --kernels=wmma_gemm_nn --iterations=5 --providers=cutlass - accel-sim-mem: 3G - - args: --seed=2020 --dist=0 --m=2560 --n=64 --k=2560 --kernels=wmma_gemm_nn --iterations=5 --providers=cutlass - accel-sim-mem: 3G - - args: --seed=2020 --dist=0 --m=2560 --n=128 --k=2560 --kernels=wmma_gemm_nn --iterations=5 --providers=cutlass - accel-sim-mem: 3G - - args: --seed=2020 --dist=0 --m=2560 --n=7000 --k=2560 --kernels=wmma_gemm_nn --iterations=5 --providers=cutlass - accel-sim-mem: 3G - - args: --seed=2020 --dist=0 --m=4096 --n=16 --k=4096 --kernels=wmma_gemm_nn --iterations=5 --providers=cutlass - accel-sim-mem: 5G - -
args: --seed=2020 --dist=0 --m=4096 --n=32 --k=4096 --kernels=wmma_gemm_nn --iterations=5 --providers=cutlass - accel-sim-mem: 5G - - args: --seed=2020 --dist=0 --m=4096 --n=64 --k=4096 --kernels=wmma_gemm_nn --iterations=5 --providers=cutlass - accel-sim-mem: 5G - - args: --seed=2020 --dist=0 --m=4096 --n=128 --k=4096 --kernels=wmma_gemm_nn --iterations=5 --providers=cutlass - accel-sim-mem: 5G - - args: --seed=2020 --dist=0 --m=4096 --n=7000 --k=4096 --kernels=wmma_gemm_nn --iterations=5 --providers=cutlass - accel-sim-mem: 5G - - args: --seed=2020 --dist=0 --m=2560 --n=16 --k=2560 --kernels=sgemm_nn --iterations=5 --providers=cutlass + - cutlass_profiler: + #single precision gemm kernels + - args: --seed=2020 --dist=0 --m=2560 --n=16 --k=2560 --kernels=sgemm --iterations=5 --providers=cutlass accel-sim-mem: 13G - - args: --seed=2020 --dist=0 --m=2560 --n=32 --k=2560 --kernels=sgemm_nn --iterations=5 --providers=cutlass + - args: --seed=2020 --dist=0 --m=2560 --n=32 --k=2560 --kernels=sgemm --iterations=5 --providers=cutlass accel-sim-mem: 13G - - args: --seed=2020 --dist=0 --m=2560 --n=64 --k=2560 --kernels=sgemm_nn --iterations=5 --providers=cutlass + # - args: --seed=2020 --dist=0 --m=2560 --n=64 --k=2560 --kernels=sgemm --iterations=5 --providers=cutlass + # accel-sim-mem: 13G + # - args: --seed=2020 --dist=0 --m=2560 --n=128 --k=2560 --kernels=sgemm --iterations=5 --providers=cutlass + # accel-sim-mem: 13G + # - args: --seed=2020 --dist=0 --m=2560 --n=512 --k=2560 --kernels=sgemm --iterations=5 --providers=cutlass + # accel-sim-mem: 13G + # - args: --seed=2020 --dist=0 --m=2560 --n=1024 --k=2560 --kernels=sgemm --iterations=5 --providers=cutlass + # accel-sim-mem: 13G + # - args: --seed=2020 --dist=0 --m=2560 --n=2560 --k=2560 --kernels=sgemm --iterations=5 --providers=cutlass + # accel-sim-mem: 13G + # - args: --seed=2020 --dist=0 --m=4096 --n=16 --k=4096 --kernels=sgemm --iterations=5 --providers=cutlass + # accel-sim-mem: 16G + # - args: --seed=2020 
--dist=0 --m=4096 --n=32 --k=4096 --kernels=sgemm --iterations=5 --providers=cutlass + # accel-sim-mem: 16G + # - args: --seed=2020 --dist=0 --m=4096 --n=64 --k=4096 --kernels=sgemm --iterations=5 --providers=cutlass + # accel-sim-mem: 16G + # - args: --seed=2020 --dist=0 --m=4096 --n=128 --k=4096 --kernels=sgemm --iterations=5 --providers=cutlass + # accel-sim-mem: 16G + # - args: --seed=2020 --dist=0 --m=4096 --n=4096 --k=4096 --kernels=sgemm --iterations=5 --providers=cutlass + # accel-sim-mem: 20G + #gemm kernels on tensor cores + - args: --seed=2020 --dist=0 --operation=gemm --m=2560 --n=16 --k=2560 --op_class=tensorop --iterations=5 --provider=cutlass accel-sim-mem: 13G - - args: --seed=2020 --dist=0 --m=2560 --n=128 --k=2560 --kernels=sgemm_nn --iterations=5 --providers=cutlass - accel-sim-mem: 13G - - args: --seed=2020 --dist=0 --m=2560 --n=512 --k=2560 --kernels=sgemm_nn --iterations=5 --providers=cutlass - accel-sim-mem: 13G - - args: --seed=2020 --dist=0 --m=2560 --n=1024 --k=2560 --kernels=sgemm_nn --iterations=5 --providers=cutlass - accel-sim-mem: 13G - - args: --seed=2020 --dist=0 --m=2560 --n=2560 --k=2560 --kernels=sgemm_nn --iterations=5 --providers=cutlass - accel-sim-mem: 13G - - args: --seed=2020 --dist=0 --m=4096 --n=16 --k=4096 --kernels=sgemm_nn --iterations=5 --providers=cutlass - accel-sim-mem: 16G - - args: --seed=2020 --dist=0 --m=4096 --n=32 --k=4096 --kernels=sgemm_nn --iterations=5 --providers=cutlass - accel-sim-mem: 16G - - args: --seed=2020 --dist=0 --m=4096 --n=64 --k=4096 --kernels=sgemm_nn --iterations=5 --providers=cutlass - accel-sim-mem: 16G - - args: --seed=2020 --dist=0 --m=4096 --n=128 --k=4096 --kernels=sgemm_nn --iterations=5 --providers=cutlass - accel-sim-mem: 16G - - args: --seed=2020 --dist=0 --m=4096 --n=4096 --k=4096 --kernels=sgemm_nn --iterations=5 --providers=cutlass - accel-sim-mem: 20G + # - args: --seed=2020 --dist=0 --operation=gemm --m=2560 --n=32 --k=2560 --op_class=tensorop --iterations=5 
--provider=cutlass + # accel-sim-mem: 13G + # - args: --seed=2020 --dist=0 --operation=gemm --m=2560 --n=64 --k=2560 --op_class=tensorop --iterations=5 --provider=cutlass + # accel-sim-mem: 13G + # - args: --seed=2020 --dist=0 --operation=gemm --m=2560 --n=128 --k=2560 --op_class=tensorop --iterations=5 --provider=cutlass + # accel-sim-mem: 13G + # - args: --seed=2020 --dist=0 --operation=gemm --m=2560 --n=512 --k=2560 --op_class=tensorop --iterations=5 --provider=cutlass + # accel-sim-mem: 13G + # - args: --seed=2020 --dist=0 --operation=gemm --m=2560 --n=1024 --k=2560 --op_class=tensorop --iterations=5 --provider=cutlass + # accel-sim-mem: 13G + # - args: --seed=2020 --dist=0 --operation=gemm --m=2560 --n=2056 --k=2560 --op_class=tensorop --iterations=5 --provider=cutlass + # accel-sim-mem: 13G + # - args: --seed=2020 --dist=0 --operation=gemm --m=4096 --n=16 --k=4096 --op_class=tensorop --iterations=5 --provider=cutlass + # accel-sim-mem: 13G + # - args: --seed=2020 --dist=0 --operation=gemm --m=4096 --n=32 --k=4096 --op_class=tensorop --iterations=5 --provider=cutlass + # accel-sim-mem: 13G + # - args: --seed=2020 --dist=0 --operation=gemm --m=4096 --n=64 --k=4096 --op_class=tensorop --iterations=5 --provider=cutlass + # accel-sim-mem: 13G + # - args: --seed=2020 --dist=0 --operation=gemm --m=4096 --n=128 --k=4096 --op_class=tensorop --iterations=5 --provider=cutlass + # accel-sim-mem: 13G + # - args: --seed=2020 --dist=0 --operation=gemm --m=4096 --n=512 --k=4096 --op_class=tensorop --iterations=5 --provider=cutlass + # accel-sim-mem: 13G + # - args: --seed=2020 --dist=0 --operation=gemm --m=4096 --n=4096 --k=4096 --op_class=tensorop --iterations=5 --provider=cutlass + # accel-sim-mem: 13G ## Not sure how much memory the following apps take - just letting them go with the default diff --git a/util/job_launching/apps/define-power.yml b/util/job_launching/apps/define-power.yml index 4326e2be3..34c4add4b 100644 --- a/util/job_launching/apps/define-power.yml +++ 
b/util/job_launching/apps/define-power.yml @@ -206,13 +206,13 @@ cutlass_5_trace_validation: data_dirs: "$ACCELSIM_ROOT/../util/accelwattch/accelwattch_benchmarks/data_dirs/" execs: - cutlass_perf_test_k1: - - args: --seed=2020 --dist=0 --m=2560 --n=16 --k=2560 --kernels=wmma_gemm_nn --iterations=5 --providers=cutlass + - args: --seed=2020 --dist=0 --m=2560 --n=16 --k=2560 --operation=gemm --op_class=tensorop --iterations=5 --providers=cutlass accel-sim-mem: 5G - cutlass_perf_test_k2: - - args: --seed=2020 --dist=0 --m=4096 --n=128 --k=4096 --kernels=wmma_gemm_nn --iterations=5 --providers=cutlass - accel-sim-mem: 5G + - args: --seed=2020 --dist=0 --m=4096 --n=128 --k=4096 --operation=gemm --op_class=tensorop --iterations=5 --providers=cutlass + accel-sim-mem: 5G - cutlass_perf_test_k3: - - args: --seed=2020 --dist=0 --m=2560 --n=512 --k=2560 --kernels=wmma_gemm_nn --iterations=5 --providers=cutlass + - args: --seed=2020 --dist=0 --m=2560 --n=512 --k=2560 --operation=gemm --op_class=tensorop --iterations=5 --providers=cutlass accel-sim-mem: 5G Deepbench_validation: