[CUDA] Add SparseAttention kernel for sm=75 (microsoft#20531)
### Description

Follow-up to microsoft#20216, adding a kernel for sm=75 (GPUs such as the T4, GeForce RTX 2080, GeForce GTX 1650 Ti, NVIDIA TITAN RTX, RTX 4000, etc.):

- [x] Add kernel for sm=75.
- [x] Update the dispatch code to select a kernel based on the SM version (an illustrative sketch follows this description).
- [x] Update the compile script to use num_stages=2 instead of 3 for sm=75.
- [x] Refactor the test script and add tests for bfloat16.
- [x] Fix the performance test of token generation (previously we did not concatenate past_key; see the sketch after this description).
- [x] Fix the debug build.
- [x] Run the performance test and update the numbers below.

For sm=70, the v1 kernel compiles, but the v2 kernel fails to compile, so sm=70 is skipped in this pull request.

Performance test on a T4 GPU (Standard_NC4as_T4_v3 Azure VM) with `batch_size=4, num_heads=32, max_seq_len=8192, head_size=128, sparse_block_size=64, local_blocks=16, vert_stride=8, num_layout=8`.

We compare sparse attention against the corresponding GQA with a dense causal mask. Note that dense GQA needs more computation, since it exploits no sparsity. TORCH-GQA uses a naive implementation (using cuSPARSE Block-SpMM could be faster).

```
prompt-sm75-batch4-head32-d128-local16-vert8-torch.float16:
   sequence_length   TORCH-GQA  ORT-GQA-Dense  ORT-SparseAtt
1             32.0    0.184173       2.994347       0.089064
2             64.0    0.303300       3.023986       0.107418
3            128.0    0.887795       3.073728       0.174213
4            256.0    2.797654       3.246899       0.357869
5            512.0   10.055048       3.814039       0.893903
6           1024.0   37.849937       5.818439       2.658720
7           2048.0  148.641785      13.638480       7.202690
8           4096.0         OOM      43.556847      17.680954
9           8192.0         OOM     161.628540      44.336670

token-sm75-batch4-head32-d128-local16-vert8-torch.float16:
   past_sequence_length  TORCH-GQA  ORT-GQA-Dense  ORT-SparseAtt
1                  32.0   0.110353       2.996305       0.137509
2                  64.0   0.145088       3.006860       0.165424
3                 128.0   0.219500       3.036448       0.192001
4                 256.0   0.347496       3.071341       0.249125
5                 512.0   0.595842       3.135225       0.398726
6                1024.0   1.081216       3.261110       0.612744
7                2048.0   2.060307       3.515578       0.685670
8                4096.0        OOM       4.022986       0.819707
9                8191.0        OOM       5.024528       1.072912
```

### Motivation and Context

To run inference of Phi-3-small on a T4 GPU.
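As an illustration of the SM-dependent choices above, here is a minimal sketch of picking the pipeline depth by compute capability. The helper names (`compute_capability`, `get_num_stages`) are hypothetical, not the repository's actual compile script; only the 2-vs-3 stage choice for sm=75 comes from this commit.

```python
# Minimal sketch (hypothetical helper names): choose Triton num_stages by SM version.
import torch

def compute_capability() -> int:
    # e.g. (7, 5) on a T4 -> 75
    major, minor = torch.cuda.get_device_capability()
    return major * 10 + minor

def get_num_stages(sm: int) -> int:
    # Turing (sm=75) has less shared memory per SM than Ampere and newer,
    # so the kernels are compiled with num_stages=2 there instead of 3.
    return 2 if sm == 75 else 3

sm = compute_capability()
print(f"sm={sm}, num_stages={get_num_stages(sm)}")
```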
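The token-generation benchmark fix amounts to growing the KV cache between steps. A sketch of the corrected loop; the shapes are illustrative (batch 4, an assumed 8 KV heads for the 32 query heads, head size 128), and the tensor names are made up for this example:

```python
# Sketch of the fixed token-generation loop: the new key must be concatenated
# onto past_key so that step t attends over all t past tokens.
# Shapes are illustrative: (batch=4, kv_heads=8, seq, head_size=128).
import torch

device, dtype = "cuda", torch.float16
past_key = torch.zeros(4, 8, 0, 128, device=device, dtype=dtype)
for step in range(4):
    new_key = torch.rand(4, 8, 1, 128, device=device, dtype=dtype)
    # The bug: benchmarking each step against a cache that never grew.
    # The fix: concatenate the new key before running attention.
    past_key = torch.cat([past_key, new_key], dim=2)
    assert past_key.shape[2] == step + 1
    # ... run attention with past_key of length step + 1 ...
```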
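For context on the `local_blocks=16, vert_stride=8, num_layout=8` parameters: a common block-sparse attention layout keeps a causal local window of recent blocks plus strided vertical columns, with a per-layout offset so different head groups cover different columns. A sketch under those assumptions; the exact layout generation in the repository may differ:

```python
# Sketch of a local + vertical-stride block-sparse layout (assumed semantics;
# the repository's layout generation may differ, e.g. in how offsets are applied).
import torch

def block_mask(num_blocks: int, local_blocks: int, vert_stride: int, offset: int):
    q = torch.arange(num_blocks).view(-1, 1)  # query block index
    k = torch.arange(num_blocks).view(1, -1)  # key block index
    causal = k <= q
    local = (q - k) < local_blocks            # last `local_blocks` blocks
    vertical = (k % vert_stride) == offset    # strided "global" columns
    return causal & (local | vertical)

# max_seq_len=8192 with sparse_block_size=64 -> 128 blocks per side;
# num_layout=8 layouts shared by 32 heads (4 heads per layout).
layouts = torch.stack([block_mask(128, 16, 8, i) for i in range(8)])
print(layouts.shape, layouts[0].float().mean())  # density of layout 0
```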
24 changed files with 1,374 additions and 300 deletions.