diff --git a/hopper/benchmark_flash_attention_fp8.py b/hopper/benchmark_flash_attention_fp8.py index efd007dd2..8490d00b3 100644 --- a/hopper/benchmark_flash_attention_fp8.py +++ b/hopper/benchmark_flash_attention_fp8.py @@ -229,7 +229,8 @@ def time_fwd(func, *args, **kwargs): # dim = 256 dropout_p = 0.0 -methods = (["Pytorch", "Flash3", "cuDNN"] +methods = (["Pytorch", "Flash3"] + + (["cuDNN"] if cudnn is not None else []) # + (["Triton"] if attention_triton is not None else []) # + (["xformers.c"] if xops is not None else []) # + (["xformers.f"] if xops is not None else []) @@ -247,10 +248,10 @@ def time_fwd(func, *args, **kwargs): torch.cuda.empty_cache() config = (causal, headdim, batch_size, seqlen) nheads = dim // headdim - q, k, v = [torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=torch.float16, requires_grad=False) for _ in range(3)] + q, k, v = [torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=torch.bfloat16, requires_grad=False) for _ in range(3)] qkv = torch.stack([q, k, v], dim=2) - qkv = qkv.to(torch.float16) + qkv = qkv.to(torch.bfloat16) f = time_fwd(attention_pytorch, qkv, dropout_p, causal=causal, repeats=repeats, verbose=False) time_f[config, "Pytorch"] = f res_baseline = attention_pytorch(qkv, dropout_p, causal=causal) @@ -289,7 +290,8 @@ def time_fwd(func, *args, **kwargs): k, v, softmax_scale, - causal=causal, + causal=causal, + window_size=(-1,-1), descale_q=descale_q, descale_k=descale_k, descale_v=descale_v, diff --git a/hopper/benchmark_split_kv.py b/hopper/benchmark_split_kv.py new file mode 100644 index 000000000..d3d83590a --- /dev/null +++ b/hopper/benchmark_split_kv.py @@ -0,0 +1,325 @@ +import torch +import flash_attn +import flash_attn_interface +import itertools +import time +import math + +import torch.utils.benchmark as benchmark + +def round_up_to_power_of_2(x): + if x <= 1: + return 1 + return 1 << (x - 1).bit_length() + +def timeit(fn, *args, **kwargs): + torch.cuda.synchronize() + + # Warmup + for _ in range(5): + fn(*args, **kwargs) + + # Benchmark using PyTorch Timer + t = benchmark.Timer( + stmt='fn(*args, **kwargs)', + globals={'fn': fn, 'args': args, 'kwargs': kwargs} + ) + + # Measure execution time + measurement = t.timeit(20) # Runs the function 20 times + # measurement = t.blocked_autorange(min_run_time=1) + avg_time = measurement.mean # Average time in seconds + + return avg_time + +def main(): + num_sms = torch.cuda.get_device_properties( + torch.cuda.current_device() + ).multi_processor_count + + max_splits = 129 + check_all_splits = False + + causal = True + # causal = False + # dtype=torch.float16 + dtype=torch.bfloat16 + + torch.manual_seed(42) + + model_configs = [ + # ("Gemma-2-2B", 8, 4, 256), + # ("Gemma-2-9B", 16, 8, 256), + # ("Gemma-2-27B", 32, 16, 128), + # ("Qwen-2.5-0.5B", 14, 2, 64), + # ("Qwen-2.5-1.5B", 12, 2, 128), + # ("Qwen-2.5-7B", 28, 4, 128), + # ("Llama-3.1-8B", 32, 8, 128), + ("Llama-3.1-70B", 64, 8, 128), + # ("Llama-3.1-405B", 128, 8, 128), + # ("Llama-3.2-1B", 32, 8, 64), + # ("Llama-3.2-3B", 24, 8, 128), + # ("Nemotron-4-15B", 48, 8, 128), + ] + + all_batch_configs = [] + + all_batch_configs.extend(itertools.product( + # [1024, 2048, 4096, 8192, 16384, 32768, 131072], # context_seqlen + [4096, 16384, 65536], # context_seqlen + # [131072], # context_seqlen + # [i for i in range(1, (num_sms) + 1)], # num_requests + [1, 4, 8, 16], # num_requests + # [1], # num_requests + [1, 4, 8, 16], # query_seqlen + # [1], # query_seqlen + )) + + num_caches = max(reqs for _, reqs, _ in 
all_batch_configs) + cache_seqlen = max(seqlen for seqlen, _, _ in all_batch_configs) + + for model_name, nheads_q, nheads_kv, headdim in model_configs: + k_cache = torch.randn( + (num_caches, cache_seqlen, nheads_kv, headdim), device="cuda", dtype=dtype + ) + v_cache = torch.randn( + (num_caches, cache_seqlen, nheads_kv, headdim), device="cuda", dtype=dtype + ) + print(f"***{model_name}***") + print(f"QHEADS:{nheads_q}, KVHEADS:{nheads_kv}, HEADDIM:{headdim}") + + if check_all_splits is False: + print(f"{'CONTEXT':<9}{'BSZ':<5}{'QLEN':<6}{'FA2':<10}{'FA3':<9}{'RATIO':<7}{'GB/s':<10}") + + for context_seqlen, num_requests, query_seqlen in all_batch_configs: + bytes_kv = (context_seqlen * num_requests * nheads_kv * headdim * 4) + bytes_q = (query_seqlen * num_requests * nheads_q * headdim * 4) + blockH = round_up_to_power_of_2(nheads_q//nheads_kv) + blockM = 128 # true for hdim 128 causal and hdim 64 + blockM_div_H = blockM//blockH + num_work_tiles = nheads_kv * num_requests * math.ceil(query_seqlen/blockM_div_H) + + q = torch.randn((num_requests, query_seqlen, nheads_q, headdim), device="cuda", dtype=dtype) + cache_idxs = torch.randperm(num_caches, dtype=torch.int32, device="cuda")[:num_requests] + cache_seqlens = torch.tensor( + [context_seqlen] * num_requests, dtype=torch.int32, device="cuda" + ) + + fa2_time_heuristic = timeit( + flash_attn.flash_attn_with_kvcache, + q=q, + k_cache=k_cache, + v_cache=v_cache, + cache_seqlens=cache_seqlens, + cache_batch_idx=cache_idxs, + causal=causal, + ) * 1000. * 1000. + # fastest_splitk_time = float("inf") + # fastest_splitk = 0 + # for i in range(1, max_splits): + # t = timeit( + # flash_attn.flash_attn_with_kvcache, + # q=q, + # k_cache=k_cache, + # v_cache=v_cache, + # cache_seqlens=cache_seqlens, + # cache_batch_idx=cache_idxs, + # causal=causal, + # num_splits=i, + # ) * 1000. * 1000. + # if t < fastest_splitk_time: + # fastest_splitk_time = t + # fastest_splitk = i + + fa3_time_one_split = timeit( + flash_attn_interface.flash_attn_with_kvcache, + q=q, + k_cache=k_cache, + v_cache=v_cache, + cache_seqlens=cache_seqlens, + cache_batch_idx=cache_idxs, + causal=causal, + gqa_parallel=False, + num_splits=1, + ) * 1000. * 1000. + + fa3_time_gqa_heuristic = timeit( + flash_attn_interface.flash_attn_with_kvcache, + q=q, + k_cache=k_cache, + v_cache=v_cache, + cache_seqlens=cache_seqlens, + cache_batch_idx=cache_idxs, + causal=causal, + gqa_parallel=True, + num_splits=0, + max_seqlen_k_hint=context_seqlen + ) * 1000. * 1000. + + if check_all_splits: + + fa3_fastest_num_splits = 0 + fa3_fastest_splitk_time = float("inf") + + for num_splits in range(1, max_splits): + t = timeit( + flash_attn_interface.flash_attn_with_kvcache, + q=q, + k_cache=k_cache, + v_cache=v_cache, + cache_seqlens=cache_seqlens, + cache_batch_idx=cache_idxs, + causal=causal, + gqa_parallel=False, + num_splits=num_splits + ) * 1000. * 1000. 
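For reference, the work-tile accounting above can be checked by hand for the Llama-3.1-70B entry in `model_configs`. The sketch below mirrors the script's own names; the 132-SM count (H100 SXM) is only an illustrative assumption.

```python
import math

def round_up_to_power_of_2(x):
    return 1 if x <= 1 else 1 << (x - 1).bit_length()

# Llama-3.1-70B entry from model_configs: 64 query heads, 8 KV heads, headdim 128.
nheads_q, nheads_kv, headdim = 64, 8, 128
num_requests, query_seqlen = 4, 16
num_sms = 132  # assumed H100 SXM SM count, for illustration only

blockH = round_up_to_power_of_2(nheads_q // nheads_kv)  # 8 query heads packed per tile
blockM = 128                                            # as noted above for hdim 128 causal
blockM_div_H = blockM // blockH                         # 16 query rows per tile
num_work_tiles = nheads_kv * num_requests * math.ceil(query_seqlen / blockM_div_H)

print(num_work_tiles)            # 8 * 4 * 1 = 32 tiles
print(num_work_tiles / num_sms)  # ~0.24 of one wave -> plenty of idle SMs for split-KV
```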
+ + out0 = flash_attn_interface.flash_attn_with_kvcache( + q=q, + k_cache=k_cache, + v_cache=v_cache, + cache_seqlens=cache_seqlens, + cache_batch_idx=cache_idxs, + causal=causal, + gqa_parallel=False, + num_splits=num_splits + ) + + out1 = flash_attn_interface.flash_attn_with_kvcache( + q=q, + k_cache=k_cache, + v_cache=v_cache, + cache_seqlens=cache_seqlens, + cache_batch_idx=cache_idxs, + causal=causal, + gqa_parallel=False, + num_splits=1 + ) + + max_diff = (out0 - out1).abs().max().item() + mean_diff = (out0 - out1).abs().mean().item() + # print (f"splits {num_splits}, out diff-max, {max_diff}, out diff-mean, {mean_diff}, time {t:.2f}") + # print (f"splits {num_splits}, time {t:.2f}") + + if math.isnan(max_diff) or math.isnan(mean_diff) or max_diff > 2e-3 or mean_diff > 1e-4: + print(f"Numerical error too high: Splits: {num_splits}, Max: {max_diff}, Mean: {mean_diff}") + + if t < fa3_fastest_splitk_time: + fa3_fastest_splitk_time = t + fa3_fastest_num_splits = num_splits + + fa3_fastest_num_splits_gqa = 0 + fa3_fastest_splitk_time_gqa = float("inf") + for num_splits in range(1, max_splits): + + t = timeit( + flash_attn_interface.flash_attn_with_kvcache, + q=q, + k_cache=k_cache, + v_cache=v_cache, + cache_seqlens=cache_seqlens, + cache_batch_idx=cache_idxs, + causal=causal, + gqa_parallel=True, + num_splits=num_splits + ) * 1000. * 1000. + + out0 = flash_attn_interface.flash_attn_with_kvcache( + q=q, + k_cache=k_cache, + v_cache=v_cache, + cache_seqlens=cache_seqlens, + cache_batch_idx=cache_idxs, + causal=causal, + gqa_parallel=True, + num_splits=num_splits + ) + + out1 = flash_attn_interface.flash_attn_with_kvcache( + q=q, + k_cache=k_cache, + v_cache=v_cache, + cache_seqlens=cache_seqlens, + cache_batch_idx=cache_idxs, + causal=causal, + gqa_parallel=True, + num_splits=1 + ) + + max_diff = (out0 - out1).abs().max().item() + mean_diff = (out0 - out1).abs().mean().item() + # print (f"gqa splits {num_splits}, out gqa diff-max {max_diff}, out gqa diff-mean {mean_diff}, time {t:.2f}") + # print (f"gqa splits {num_splits}, time {t:.2f}") + + if math.isnan(max_diff) or math.isnan(mean_diff) or max_diff > 2e-3 or mean_diff > 1e-4: + print(f"Numerical error too high (gqa): Splits: {num_splits}, Max: {max_diff}, Mean: {mean_diff}") + + if t < fa3_fastest_splitk_time_gqa: + fa3_fastest_splitk_time_gqa = t + fa3_fastest_num_splits_gqa = num_splits + + efficiency = (num_work_tiles * fa3_fastest_num_splits_gqa)/num_sms + heuristic_ratio = fa3_time_gqa_heuristic/fa3_fastest_splitk_time_gqa + # remeasure to smooth anomalies + if heuristic_ratio > 1.1: + + fa3_time_gqa_heuristic = timeit( + flash_attn_interface.flash_attn_with_kvcache, + q=q, + k_cache=k_cache, + v_cache=v_cache, + cache_seqlens=cache_seqlens, + cache_batch_idx=cache_idxs, + causal=causal, + gqa_parallel=True, + # num_splits=num_splits_select, + # num_splits=1, + num_splits=0, + max_seqlen_k_hint=context_seqlen + ) * 1000. * 1000. + + fa3_fastest_splitk_time_gqa = timeit( + flash_attn_interface.flash_attn_with_kvcache, + q=q, + k_cache=k_cache, + v_cache=v_cache, + cache_seqlens=cache_seqlens, + cache_batch_idx=cache_idxs, + causal=causal, + gqa_parallel=True, + num_splits=fa3_fastest_num_splits_gqa + ) * 1000. * 1000. 
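One detail worth spelling out for the GB/s column printed below: timings are scaled from seconds to microseconds, and `bytes_kv` presumably counts K plus V at 2 bytes per bf16 element (the factor of 4 above). A quick unit check, with an assumed measurement time:

```python
context_seqlen, num_requests, nheads_kv, headdim = 65536, 16, 8, 128
bytes_kv = context_seqlen * num_requests * nheads_kv * headdim * 4  # K + V, 2 bytes each (bf16), assumed reading
time_us = 1500.0  # assumed measurement in microseconds, for illustration only

gb_per_s = bytes_kv / time_us * 1e-3                   # expression used in the prints below
gb_per_s_explicit = bytes_kv / (time_us * 1e-6) / 1e9  # bytes / seconds -> GB/s
assert abs(gb_per_s - gb_per_s_explicit) < 1e-6
print(f"{gb_per_s:.2f} GB/s")  # ~2863 GB/s for this assumed timing
```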
+ + if check_all_splits is True: + print( + f"CONTEXT:{context_seqlen}, BSZ:{num_requests}, QLEN:{query_seqlen}, " + f"FA2:{fa2_time_heuristic:.2f}, " + # f"FA2 MANUAL:{fastest_splitk_time:.2f}, " + # f"FA2 NUM SPLITS:{fastest_splitk}, " + # f"FA3 NOGQA NOSPLIT:{fa3_time_one_split:.2f}, " + # f"FA3 NOGQA SPLIT MANUAL:{fa3_fastest_splitk_time:.2f}, " + # f"FA3 NOSPLIT:{fa3_time_one_split_gqa:.2f}, " + f"FA3 SPLIT MANUAL:{fa3_fastest_splitk_time_gqa:.2f}, " + f"FA3:{fa3_time_gqa_heuristic:.2f}, " + # f"FA3 RATIO (NONSPLIT/SPLIT):{fa3_time_one_split_gqa/fa3_time_gqa_heuristic:.2f}, " + # f"FA2 NUM SPLITS:{fastest_splitk}, " + # f"FA3 NOGQA NUM SPLITS:{fa3_fastest_num_splits}, " + f"FA3 NUM SPLITS:{fa3_fastest_num_splits_gqa}, " + # f"RATIO (FA2/3):{fa2_time_heuristic/fa3_time_gqa_heuristic:.2f}, " + f"RATIO:{fa3_time_gqa_heuristic/fa3_fastest_splitk_time_gqa:.2f}, " + f"EFF:{efficiency:.2f}, " + f"GB/s:{bytes_kv/fa3_time_gqa_heuristic * 1e-3:.2f}" + ) + + if check_all_splits is False: + print( + f"{context_seqlen:<9}{num_requests:<5}{query_seqlen:<6}" + f"{fa2_time_heuristic:<10.2f}{fa3_time_gqa_heuristic:<9.2f}" + f"{fa2_time_heuristic/fa3_time_gqa_heuristic:<7.2f}" + f"{bytes_kv/fa3_time_gqa_heuristic * 1e-3:<10.2f}" + ) + + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/hopper/block_info.h b/hopper/block_info.h deleted file mode 100644 index 3a23a1e1f..000000000 --- a/hopper/block_info.h +++ /dev/null @@ -1,46 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2023, Tri Dao. - ******************************************************************************/ - -#pragma once - -namespace flash { - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template -struct BlockInfo { - - template - __device__ BlockInfo(const Params ¶ms, const int bidb) - : sum_s_q(!Varlen || params.cu_seqlens_q == nullptr ? -1 : params.cu_seqlens_q[bidb]) - , sum_s_k(!Varlen || params.cu_seqlens_k == nullptr || !params.is_seqlens_k_cumulative ? -1 : params.cu_seqlens_k[bidb]) - , actual_seqlen_q(!Varlen || params.cu_seqlens_q == nullptr ? params.seqlen_q : params.cu_seqlens_q[bidb + 1] - sum_s_q) - // If is_seqlens_k_cumulative, then seqlen_k is cu_seqlens_k[bidb + 1] - cu_seqlens_k[bidb]. - // Otherwise it's cu_seqlens_k[bidb], i.e., we use cu_seqlens_k to store the sequence lengths of K. - , seqlen_k_cache(!Varlen || params.cu_seqlens_k == nullptr ? params.seqlen_k : (params.is_seqlens_k_cumulative ? params.cu_seqlens_k[bidb + 1] - sum_s_k : params.cu_seqlens_k[bidb])) - , actual_seqlen_k(params.seqused_k ? params.seqused_k[bidb] : seqlen_k_cache + (params.knew_ptr == nullptr ? 0 : params.seqlen_knew)) - { - } - - template - __forceinline__ __device__ index_t q_offset(const index_t batch_stride, const index_t row_stride, const int bidb) const { - return sum_s_q == -1 ? bidb * batch_stride : uint32_t(sum_s_q) * row_stride; - } - - template - __forceinline__ __device__ index_t k_offset(const index_t batch_stride, const index_t row_stride, const int bidb) const { - return sum_s_k == -1 ? bidb * batch_stride : uint32_t(sum_s_k) * row_stride; - } - - const int sum_s_q; - const int sum_s_k; - const int actual_seqlen_q; - // We have to have seqlen_k_cache declared before actual_seqlen_k, otherwise actual_seqlen_k is set to 0. 
- const int seqlen_k_cache; - const int actual_seqlen_k; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace flash diff --git a/hopper/combine.h b/hopper/combine.h new file mode 100644 index 000000000..a3317631b --- /dev/null +++ b/hopper/combine.h @@ -0,0 +1,248 @@ + +#pragma once + +#include + +#include +#include "cutlass/layout/layout.h" +#include +#include + +#include "kernel_traits.h" +#include "utils.h" + +namespace flash { + +using namespace cute; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct SharedStorageLSE { + cute::array_aligned> smem_lse; + cute::array_aligned> smem_valid_splits; +}; + +// DONT use Kernel_traits here to avoid redundant compilation. +// template +template +__global__ void combine_attn_seqk_parallel(Params const params) { + // using Element = typename Kernel_traits::OutputType; + // using ElementAccum = typename Kernel_traits::ElementAccum; + using index_t = int64_t; // Kernel_traits::index_t + constexpr int kMaxSplits = 1 << Log_max_splits; + // constexpr int kHeadDim = Kernel_traits::kHeadDim; + constexpr int kNThreads = 128; //Kernel_traits::kNThreads; + + static_assert(kMaxSplits <= 128, "kMaxSplits must be <= 128"); + static_assert(kBlockM == 4 || kBlockM == 8 || kBlockM == 16 || kBlockM == 32, "kBlockM must be 4, 8, 16 or 32"); + static_assert(kNThreads == 128, "We assume that each block has 128 threads"); + + // Shared memory. + // kBlockM + 1 instead of kBlockM to reduce bank conflicts. + //__shared__ __align__(16) ElementAccum sLSE[kMaxSplits][kBlockM+1]; + extern __shared__ char smem_[]; + using SharedStorage = SharedStorageLSE, Int>, Shape>>; + SharedStorage &shared_storage = + *reinterpret_cast(smem_); + Tensor sLSE = make_tensor(make_smem_ptr(shared_storage.smem_lse.data()), Shape, Int>{}); + Tensor sValidSplits = make_tensor(make_smem_ptr(shared_storage.smem_valid_splits.data()), Shape>{}); + + // The thread and block index. + const int tidx = threadIdx.x; + const int bidx = blockIdx.x; + + const index_t lse_size = params.b * params.h * params.seqlen_q; + //if (cute::thread0()) print ("final %d %d %d %d\n", params.b, params.h, params.seqlen_q, params.b * params.h * params.seqlen_q); + + const index_t row_offset_lse = bidx * kBlockM; + Tensor gLSEaccum = make_tensor(make_gmem_ptr(reinterpret_cast(params.softmax_lseaccum_ptr) + row_offset_lse), + Shape, Int>{}, + make_stride(lse_size, _1{})); + + // LSE format is different depending on params.unpadded_lse and params.seqlenq_ngroups_swapped, see comment in get_lse_tile. + // This tensor's layout maps row_offset_lse to {bidb, bidh, q_offset}. + Tensor gLSE = make_tensor(make_gmem_ptr(reinterpret_cast(params.softmax_lse_ptr) + row_offset_lse), + Shape>{}, Stride<_1>{}); + + // This layout maps row_offset_lse to {bidh, q_offset, bidb} or {bidh, bidb, q_offset}. + Layout flat_layout = make_layout(lse_size); + Layout orig_layout = make_layout(make_shape(params.seqlen_q, params.h, params.b)); + auto transposed_stride = params.seqlenq_ngroups_swapped ? 
make_stride(params.b, params.seqlen_q * params.b, 1) : make_stride(1, params.seqlen_q * params.b, params.seqlen_q); + Layout remapped_layout = make_layout(make_shape(params.seqlen_q, params.h, params.b), transposed_stride); + Layout final_layout = cute::composition(remapped_layout, cute::composition(orig_layout, flat_layout)); + + Tensor gLSE_unpadded = make_tensor(make_gmem_ptr(reinterpret_cast(params.softmax_lse_ptr)), final_layout); + + constexpr int kNLsePerThread = (kMaxSplits * kBlockM + kNThreads - 1) / kNThreads; + + // Read the LSE values from gmem and store them in shared memory, then transpose them. + constexpr int kRowsPerLoadLSE = kNThreads / kBlockM; + #pragma unroll + for (int l = 0; l < kNLsePerThread; ++l) { + const int row = l * kRowsPerLoadLSE + tidx / kBlockM; + const int col = tidx % kBlockM; + ElementAccum lse = (row < params.num_splits && col < lse_size - bidx * kBlockM) ? gLSEaccum(row, col) : -INFINITY; + if (row < kMaxSplits) { sLSE(row,col) = lse; } + // if (bidx == 0 && tidx < 32) { printf("tidx = %d, row = %d, col = %d, lse = %f\n", tidx, row, col, lse); } + } + __syncthreads(); + + // Reduce along the kBlockM dimension to determine valid splits (store in SMEM) + // One thread per split. Know NumThreads = 128 >= NumMaxSplits + if (tidx < kMaxSplits) { + bool is_valid_split = false; + #pragma unroll + for (int col = 0; col < kBlockM; ++col) { + if(sLSE(tidx,col) != -INFINITY) { + is_valid_split = true; + } + } + sValidSplits(tidx) = is_valid_split; + } + __syncthreads(); + // if (bidx == 1 && tidx < 32) { printf("tidx = %d, row_offset_lse = %d, lse = %f\n", tidx, row_offset_lse, lse_accum(0)); } + + Tensor lse_accum = make_tensor(Shape>{}); + constexpr int kRowsPerLoadTranspose = std::min(kRowsPerLoadLSE, kMaxSplits); + // To make sure that kMaxSplits is within 1 warp: we decide how many elements within kMaxSplits + // each thread should hold. If kMaxSplits = 16, then each thread holds 2 elements (128 threads, + // kBlockM rows, so each time we load we can load 128 / kBlockM rows). + // constexpr int kThreadsPerSplit = kMaxSplits / kRowsPerLoadTranspose; + // static_assert(kThreadsPerSplit <= 32); + static_assert(kRowsPerLoadTranspose <= 32); + static_assert(kNLsePerThread * kRowsPerLoadTranspose <= kMaxSplits); + #pragma unroll + for (int l = 0; l < kNLsePerThread; ++l) { + const int row = l * kRowsPerLoadTranspose + tidx % kRowsPerLoadTranspose; + const int col = tidx / kRowsPerLoadTranspose; + //if (bidx == 0 && tidx < 128) { printf("tidx = %d, row = %d, col = %d, lse = %f\n", tidx, row, col, lse_accum(l)); } + lse_accum(l) = (row < kMaxSplits && col < kBlockM) ? sLSE(row,col) : -INFINITY; + + } + //return; + + // Compute the logsumexp of the LSE along the split dimension. + ElementAccum lse_max = lse_accum(0); + #pragma unroll + for (int l = 1; l < kNLsePerThread; ++l) { lse_max = max(lse_max, lse_accum(l)); } + MaxOp max_op; + lse_max = Allreduce::run(lse_max, max_op); + lse_max = lse_max == -INFINITY ? 0.0f : lse_max; // In case all local LSEs are -inf + float lse_sum = expf(lse_accum(0) - lse_max); + #pragma unroll + for (int l = 1; l < kNLsePerThread; ++l) { lse_sum += expf(lse_accum(l) - lse_max); } + SumOp sum_op; + lse_sum = Allreduce::run(lse_sum, sum_op); + // For the case where all local lse == -INFINITY, we want to set lse_logsum to INFINITY. Otherwise + // lse_logsum is log(0.0) = -INFINITY and we get NaN when we do lse_accum(l) - lse_logsum. + ElementAccum lse_logsum = (lse_sum == 0.f || lse_sum != lse_sum) ? 
INFINITY : logf(lse_sum) + lse_max; + // if (bidx == 0 && tidx < 32) { printf("tidx = %d, lse = %f, lse_max = %f, lse_logsum = %f\n", tidx, lse_accum(0), lse_max, lse_logsum); } + if (tidx % kRowsPerLoadTranspose == 0 && tidx / kRowsPerLoadTranspose < kBlockM) { + if (params.unpadded_lse) { + const index_t lse_offset = row_offset_lse + tidx / kRowsPerLoadTranspose; + if (lse_offset < lse_size) { + gLSE_unpadded(lse_offset) = lse_logsum; + } + } else { + gLSE(tidx / kRowsPerLoadTranspose) = lse_logsum; + } + } + //if (cute::thread0()) printf ("lse_logsum = %f\n", lse_logsum); + + // Store the scales exp(lse - lse_logsum) in shared memory. + #pragma unroll + for (int l = 0; l < kNLsePerThread; ++l) { + const int row = l * kRowsPerLoadTranspose + tidx % kRowsPerLoadTranspose; + const int col = tidx / kRowsPerLoadTranspose; + if (row < params.num_splits && col < kBlockM) { sLSE(row,col) = expf(lse_accum(l) - lse_logsum); } + } + __syncthreads(); + + const index_t row_offset_oaccum = bidx * kBlockM * params.d_rounded; + Tensor gOaccum = make_tensor(make_gmem_ptr(reinterpret_cast(params.oaccum_ptr) + row_offset_oaccum), + Shape, Int>{}, + Stride, _1>{}); + constexpr int kBlockN = kNThreads / kBlockM; + using GmemLayoutAtomOaccum = Layout, Int>, Stride, _1>>; + using GmemTiledCopyOaccum = decltype( + make_tiled_copy(Copy_Atom{}, + GmemLayoutAtomOaccum{}, + Layout>{})); // Val layout, 4 vals per store + GmemTiledCopyOaccum gmem_tiled_copy_Oaccum; + auto gmem_thr_copy_Oaccum = gmem_tiled_copy_Oaccum.get_thread_slice(tidx); + Tensor tOgOaccum = gmem_thr_copy_Oaccum.partition_S(gOaccum); + Tensor tOrO = make_tensor(shape(tOgOaccum)); + Tensor tOrOaccum = make_tensor(shape(tOgOaccum)); + clear(tOrO); + + // Predicates + Tensor cOaccum = make_identity_tensor(Shape, Int>{}); + //if (cute::thread0()) print_tensor (cOaccum); + // Repeat the partitioning with identity layouts + Tensor tOcOaccum = gmem_thr_copy_Oaccum.partition_S(cOaccum); + Tensor tOpOaccum = make_tensor(make_shape(size<2>(tOgOaccum))); + if (!Is_even_K) { + #pragma unroll + for (int k = 0; k < size(tOpOaccum); ++k) { tOpOaccum(k) = get<1>(tOcOaccum(0, 0, k)) < params.d; } + } + // Load Oaccum in then scale and accumulate to O + for (int split = 0; split < params.num_splits; ++split) { + // DONT copy in Oaccum if lse(split) = -inf for all kBlockM. 
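The LSE reduction above and the accumulation loop below implement a standard log-sum-exp merge of per-split partial results. A minimal PyTorch sketch of the same math (a reference for what the kernel computes, not the kernel itself):

```python
import torch

def combine_splits_reference(o_accum, lse_accum):
    """o_accum: (num_splits, rows, d) partial outputs; lse_accum: (num_splits, rows)
    partial log-sum-exp values. Splits that saw no keys carry lse = -inf."""
    lse_max = lse_accum.max(dim=0).values
    lse_max = torch.where(lse_max == float("-inf"), torch.zeros_like(lse_max), lse_max)
    sumexp = torch.exp(lse_accum - lse_max).sum(dim=0)
    lse = torch.where(sumexp == 0.0, torch.full_like(sumexp, float("inf")),
                      torch.log(sumexp) + lse_max)  # final LSE (+inf if every split was empty)
    scale = torch.exp(lse_accum - lse)              # per-split weights, 0 for empty splits
    out = (scale.unsqueeze(-1) * o_accum).sum(dim=0)
    return out, lse
```

With num_splits = 1 this collapses to `out = o_accum[0]` and `lse = lse_accum[0]`, which is why the benchmark above compares every split count against the num_splits=1 result.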
+ if(sValidSplits(split)) { + flash::copy( + gmem_tiled_copy_Oaccum, tOgOaccum, tOrOaccum, tOcOaccum, tOpOaccum, params.b * params.h * params.seqlen_q - bidx * kBlockM + ); + #pragma unroll + for (int m = 0; m < size<1>(tOrOaccum); ++m) { + int row = get<0>(tOcOaccum(0, m, 0)); + ElementAccum lse_scale = sLSE(split,row); + if (lse_scale != 0.f) { + #pragma unroll + for (int k = 0; k < size<2>(tOrOaccum); ++k) { + #pragma unroll + for (int i = 0; i < size<0>(tOrOaccum); ++i) { + tOrO(i, m, k) += lse_scale * tOrOaccum(i, m, k); + //tOrO(i, m, k) += tOrOaccum(i, m, k); + } + } + } + //if (cute::thread0()) { printf("lse_scale = %f, %f\n", sLSE(split, 0), sLSE(split, 1)); print_tensor(tOrOaccum); } + } + } + tOgOaccum.data() = tOgOaccum.data() + params.b * params.h * params.seqlen_q * params.d_rounded; + } + //if (cute::thread0()) { print_tensor(tOrO); } + + Tensor rO = flash::convert_type(tOrO); + // Write to gO + #pragma unroll + for (int m = 0; m < size<1>(rO); ++m) { + const int idx = bidx * kBlockM + get<0>(tOcOaccum(0, m, 0)); + //if (cute::thread0()) print ("final %d %d %d %d %d\n", idx, params.b, params.h, params.seqlen_q, params.b * params.h * params.seqlen_q); + if (idx < params.b * params.h * params.seqlen_q) { + //print ("final2\n"); + const int batch_idx = idx / (params.h * params.seqlen_q); + const int head_idx = (idx - batch_idx * (params.h * params.seqlen_q)) / params.seqlen_q; + // The index to the rows of Q + const int row = idx - batch_idx * (params.h * params.seqlen_q) - head_idx * params.seqlen_q; + auto o_ptr = reinterpret_cast(params.o_ptr) + batch_idx * params.o_batch_stride + + head_idx * params.o_head_stride + row * params.o_row_stride; + #pragma unroll + for (int k = 0; k < size<2>(rO); ++k) { + if (Is_even_K || tOpOaccum(k)) { + const int col = get<1>(tOcOaccum(0, m, k)); + Tensor gO = make_tensor(make_gmem_ptr(o_ptr + col), + Shape(rO))::value>>{}, Stride<_1>{}); + // TODO: Should check if this is using vectorized store, but it seems pretty fast + copy(rO(_, m, k), gO); + //if (cute::thread0()) { print ("final\n"); print_tensor(gO); } + // if (bidx == 0 && tidx == 0) { printf("tidx = %d, idx = %d, batch_idx = %d, head_idx = %d, row = %d, col = %d\n", tidx, idx, batch_idx, head_idx, row, col); print(rO(_, m, k)); print(gO); } + // reinterpret_cast(o_ptr)[col / 4] = recast(rO)(0, m, k); + } + } + } + } +} + +} diff --git a/hopper/epilogue_fwd_sm90_tma.hpp b/hopper/epilogue_fwd_sm90_tma.hpp index 993f2e239..26664c104 100644 --- a/hopper/epilogue_fwd_sm90_tma.hpp +++ b/hopper/epilogue_fwd_sm90_tma.hpp @@ -20,24 +20,60 @@ using namespace cute; template struct CollectiveEpilogueFwd { + using InputType = typename Ktraits::Element; using Element = typename Ktraits::OutputType; static constexpr int kBlockM = Ktraits::kBlockM; static constexpr int kBlockN = Ktraits::kBlockN; + static constexpr int kBlockH = Ktraits::kBlockH; static constexpr int kHeadDim = Ktraits::kHeadDim; - using TileShape_MNK = Shape, Int, Int>; + using TileShape_MNK = Shape, Int, Int>; static constexpr int kNWarps = Ktraits::kNWarps; static constexpr int kNThreads = kNWarps * cutlass::NumThreadsPerWarp; - static constexpr bool Is_WS = kNWarps >= 12; + static constexpr bool Is_WS = Ktraits::Is_WS; static constexpr int NumCopyThreads = !Is_WS ? 
0 : cutlass::NumThreadsPerWarpGroup; static constexpr int NumMmaThreads = kNThreads - NumCopyThreads; + static constexpr bool Is_split = Ktraits::Is_split; + static constexpr bool No_smem_O = Ktraits::No_smem_O; + +#ifndef NO_FP8_COLUMN_PERMUTE + static constexpr bool epi_column_permute = is_same_v; +#else + static constexpr bool epi_column_permute = false; +#endif + + using GmemShapeOT = std::conditional_t< + Is_split, + typename Seqlen_traits::ShapeOAccumT, + typename Seqlen_traits::ShapeT + >; + using GmemStrideOT = std::conditional_t< + Is_split, + typename Seqlen_traits::StrideOAccumT, + typename Seqlen_traits::StrideT + >; + using GmemLayoutOT = std::conditional_t< + Is_split, + typename Seqlen_traits::LayoutOAccumT, + typename Seqlen_traits::LayoutT + >; + + using GmemLayoutLseT = std::conditional_t< + Is_split, + typename Seqlen_traits::LayoutLseAccumT, + typename Seqlen_traits::LayoutLseT + >; + using SmemLayoutAtomO = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); using SmemLayoutO = decltype(tile_to_shape(SmemLayoutAtomO{}, select<0, 2>(TileShape_MNK{}))); + using SmemLayoutOCopy = typename Ktraits::SmemLayoutOCopy; + using TileShapeOCopy = typename Ktraits::TileShapeOCopy; - using SmemCopyAtomO = Copy_Atom; + using SmemCopyAtomO = std::conditional_t, Element>, Copy_Atom>; using SharedStorage = cute::array_aligned>; using GmemTiledCopyOTMA = cute::SM90_TMA_STORE; @@ -45,11 +81,11 @@ struct CollectiveEpilogueFwd { GmemTiledCopyOTMA{}, make_tensor( make_gmem_ptr(static_cast(nullptr)), - typename Seqlen_traits::ShapeT{}, - typename Seqlen_traits::StrideT{} + GmemShapeOT{}, + GmemStrideOT{} ), - SmemLayoutO{}, - select<0, 2>(TileShape_MNK{}), + SmemLayoutOCopy{}, + TileShapeOCopy{}, _1{})); // no mcast for O // These are for storing the output tensor without TMA (e.g., for setting output to zero and var-seq-len) @@ -76,7 +112,7 @@ struct CollectiveEpilogueFwd { Stride<_4, _32, _1, _0>>; using ValueLayoutrO = Layout, Int>, Stride<_0, _2, Stride<_4, _1>, _8>>; - using TiledCopyrO = decltype(make_tiled_copy(Copy_Atom, Element>{}, + using TiledCopyrO = decltype(make_tiled_copy(Copy_Atom, Element>{}, ThreadLayoutrO{}, ValueLayoutrO{})); using TiledCopyShaperO = Shape<_8, Int, _16, Int>; using SmemLayoutrO = decltype(composition(SmemLayoutO{}, Layout{})); @@ -84,17 +120,17 @@ struct CollectiveEpilogueFwd { // Host side kernel arguments struct Arguments { Element* ptr_O; - typename Seqlen_traits::LayoutT const layout_O; + GmemLayoutOT const layout_O; float* ptr_LSE; - typename Seqlen_traits::LayoutLseT const layout_LSE; + GmemLayoutLseT const layout_LSE; }; // Device side kernel params struct Params { Element* ptr_O; - typename Seqlen_traits::LayoutT const layout_O; + GmemLayoutOT const layout_O; float* ptr_LSE; - typename Seqlen_traits::LayoutLseT const layout_LSE; + GmemLayoutLseT const layout_LSE; TMA_O tma_store_O; }; @@ -104,8 +140,8 @@ struct CollectiveEpilogueFwd { TMA_O tma_store_O = make_tma_copy( GmemTiledCopyOTMA{}, mO, - SmemLayoutO{}, - select<0, 2>(TileShape_MNK{}), + SmemLayoutOCopy{}, + TileShapeOCopy{}, _1{}); // no mcast for O return {args.ptr_O, args.layout_O, args.ptr_LSE, args.layout_LSE, tma_store_O}; } @@ -113,7 +149,7 @@ struct CollectiveEpilogueFwd { /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance CUTLASS_DEVICE static void prefetch_tma_descriptors(Params const& epilogue_params) { - if constexpr (!Seqlen_traits::kUseVarSeqLen) { + if constexpr 
(!Seqlen_traits::UseVarSeqLen && !No_smem_O) { cute::prefetch_tma_descriptor(epilogue_params.tma_store_O.get_tma_descriptor()); } } @@ -126,169 +162,254 @@ struct CollectiveEpilogueFwd { SharedStorage& shared_storage, TiledMma tiled_mma, int thread_idx, - cute::tuple const& block_coord, - const Seqlen_traits& seqlen_traits_q + cute::tuple const& block_coord, + const Seqlen_traits& seqlen_traits_q, + const cutlass::FastDivmod& qhead_per_khead_divmod ) { - auto [m_block, bidh, bidb] = block_coord; - Tensor sO = make_tensor(make_smem_ptr(shared_storage.smem_o.data()), SmemLayoutO{}); - auto smem_tiled_copy_O = make_tiled_copy_C(SmemCopyAtomO{}, tiled_mma); - auto smem_thr_copy_O = smem_tiled_copy_O.get_thread_slice(thread_idx); + auto [m_block, n_split_idx, bidh, bidb] = block_coord; + const int bidh_kv = qhead_per_khead_divmod.divide(bidh); + const int h_block = bidh % int(qhead_per_khead_divmod); Tensor tOrO_out = flash::convert_type(tOrO); - Tensor taccOrO = smem_thr_copy_O.retile_S(tOrO_out); // ((Atom,AtomNum), MMA_M, MMA_N) - Tensor taccOsO = smem_thr_copy_O.partition_D(sO); // ((Atom,AtomNum),PIPE_M,PIPE_N) - - // Make sure all WGs have finished reading V - cutlass::arch::NamedBarrier::sync(NumMmaThreads, static_cast(FwdNamedBarriers::ValueEmpty) /*id*/); - cute::copy(smem_tiled_copy_O, taccOrO, taccOsO); - cutlass::arch::fence_view_async_shared(); // ensure smem writes are visible to TMA - cutlass::arch::NamedBarrier::arrive(NumMmaThreads + cutlass::NumThreadsPerWarp, - cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); + if constexpr(!No_smem_O) { + if constexpr (!epi_column_permute) { + Tensor sO = make_tensor(make_smem_ptr(shared_storage.smem_o.data()), SmemLayoutO{}); + auto smem_tiled_copy_O = make_tiled_copy_C(SmemCopyAtomO{}, tiled_mma); + auto smem_thr_copy_O = smem_tiled_copy_O.get_thread_slice(thread_idx); + + Tensor taccOrO = smem_thr_copy_O.retile_S(tOrO_out); // ((Atom,AtomNum), MMA_M, MMA_N) + Tensor taccOsO = smem_thr_copy_O.partition_D(sO); // ((Atom,AtomNum),PIPE_M,PIPE_N) + + // Make sure all WGs have finished reading V + cutlass::arch::NamedBarrier::sync(NumMmaThreads, static_cast(FwdNamedBarriers::ValueEmpty) /*id*/); + cute::copy(smem_tiled_copy_O, taccOrO, taccOsO); + cutlass::arch::fence_view_async_shared(); // ensure smem writes are visible to TMA + cutlass::arch::NamedBarrier::arrive(NumMmaThreads + cutlass::NumThreadsPerWarp, + cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); + } else { + TiledCopyrO rmem_tiled_copy_O; + Tensor sOacc = make_tensor(make_smem_ptr(shared_storage.smem_o.data()), SmemLayoutrO{}); + auto rmem_thr_copy_O = rmem_tiled_copy_O.get_thread_slice(thread_idx); + + Tensor taccOsO = rmem_thr_copy_O.partition_D(sOacc); + Tensor taccOrO = make_tensor(tOrO_out.data(), shape(taccOsO)); + + // Make sure all WGs have finished reading V + cutlass::arch::NamedBarrier::sync(NumMmaThreads, static_cast(FwdNamedBarriers::ValueEmpty) /*id*/); + cute::copy(rmem_tiled_copy_O, taccOrO, taccOsO); + cutlass::arch::fence_view_async_shared(); // ensure smem writes are visible to TMA + cutlass::arch::NamedBarrier::arrive(NumMmaThreads + cutlass::NumThreadsPerWarp, + cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); + } + } Tensor mLSE = make_tensor(make_gmem_ptr(epilogue_params.ptr_LSE), epilogue_params.layout_LSE); - Tensor gLSE = seqlen_traits_q.get_lse_local_tile_tensor( - mLSE, Shape>{}, bidh, bidb)(_, m_block); Tensor caccO = cute::make_identity_tensor(select<0, 2>(TileShape_MNK{})); auto thread_mma = 
tiled_mma.get_thread_slice(thread_idx); - Tensor taccOcO = thread_mma.partition_C(caccO); // (MMA,MMA_M,MMA_K) + Tensor taccOcO = thread_mma.partition_C(caccO); // (MMA,MMA_M,MMA_K) static_assert(decltype(size<0, 0>(taccOcO))::value == 2); static_assert(decltype(size<0, 1>(taccOcO))::value == 2); // taccOcO has shape ((2, 2, V), MMA_M, MMA_K), we only take only the row indices. Tensor taccOcO_row = taccOcO(make_coord(_0{}, _, _0{}), _, _0{}); - CUTE_STATIC_ASSERT_V(size(lse) == size(taccOcO_row)); // MMA_M - if (get<1>(taccOcO_row(_0{})) == 0) { + CUTE_STATIC_ASSERT_V(size(lse) == size(taccOcO_row)); // 2 * MMA_M + + if constexpr(!Seqlen_traits::UseGQAPacking) { + Tensor gLSE = seqlen_traits_q.get_lse_local_tile_tensor( + mLSE, Shape>{}, bidh, bidb, n_split_idx)(_, m_block); + if (get<1>(taccOcO_row(_0{})) == 0) { + #pragma unroll + for (int mi = 0; mi < size(lse); ++mi) { + const int row = get<0>(taccOcO_row(mi)); + if (row < seqlen_traits_q.actual_seq_len - m_block * kBlockM) { + gLSE(row) = lse(mi); + } + } + } + } else { + // shape<1>(epilogue_params.layout_O) == h/h_k + // In common case where ceil_div(h/h_k, kBlockH) == 1, + // int(qhead_per_khead_divmod) == 1, bidh_kv == bidh, h_block == 0 + const int h_offset = shape<1>(epilogue_params.layout_O) * bidh_kv + + h_block * kBlockH; + const int m_bound = seqlen_traits_q.actual_seq_len - m_block * (kBlockM/kBlockH); + const int h_bound = shape<1>(epilogue_params.layout_O) - h_block * kBlockH; #pragma unroll for (int mi = 0; mi < size(lse); ++mi) { const int row = get<0>(taccOcO_row(mi)); - if (row < seqlen_traits_q.actual_seq_len - m_block * kBlockM) { gLSE(row) = lse(mi); } + const int h_local = row % kBlockH; + const int m_local = row/kBlockH; + if(h_local < h_bound && m_local < m_bound) { + Tensor gLSE = seqlen_traits_q.get_lse_local_tile_tensor(mLSE, + Shape>{}, h_offset + h_local, bidb, n_split_idx) + (_, m_block); + gLSE(m_local) = lse(mi); + } } } - - int write_warp_idx = kNWarps - 1; - if (cutlass::canonical_warp_idx_sync() == write_warp_idx) { - cutlass::arch::NamedBarrier::sync( - NumMmaThreads + cutlass::NumThreadsPerWarp, - cutlass::arch::ReservedNamedBarriers::EpilogueBarrier - ); + + if constexpr (No_smem_O) { + flash::write_rmem_to_gmem( + tOrO_out, epilogue_params.ptr_O, epilogue_params.layout_O, TileShapeOCopy{}, + m_block, h_block, bidh, bidh_kv, bidb, n_split_idx, + tiled_mma, seqlen_traits_q, thread_idx); + } else { + int write_warp_idx = kNWarps - 1; + if (cutlass::canonical_warp_idx_sync() == write_warp_idx) { + cutlass::arch::NamedBarrier::sync( + NumMmaThreads + cutlass::NumThreadsPerWarp, + cutlass::arch::ReservedNamedBarriers::EpilogueBarrier + ); + } + TiledCopyO gmem_tiled_copy_O; + Tensor sO_out = make_tensor(make_smem_ptr(shared_storage.smem_o.data()), SmemLayoutOCopy{}); + if constexpr(!Seqlen_traits::UseGQAPacking) { + flash::write_O( + epilogue_params.ptr_O, epilogue_params.tma_store_O, gmem_tiled_copy_O, + epilogue_params.layout_O, TileShapeOCopy{}, sO_out, + m_block, bidh, bidb, n_split_idx, seqlen_traits_q, write_warp_idx, tiled_mma, tOrO_out + ); + } else { + Tensor mO = epilogue_params.tma_store_O.get_tma_tensor(epilogue_params.layout_O.shape()); + Tensor gO = seqlen_traits_q.get_o_local_tile_tensor( + mO, TileShapeOCopy{}, bidh_kv, bidb, n_split_idx) + (_, _, _, m_block, h_block); // (bM/bH, bH, K) + auto block_tma_O = epilogue_params.tma_store_O.get_slice(_0{}); + Tensor tOgO = block_tma_O.partition_D(gO); // (TMA, TMA_M, TMA_K) + Tensor tOsO = block_tma_O.partition_S(sO_out); // (TMA, TMA_M, 
TMA_K) + int const lane_predicate = cute::elect_one_sync(); + int const warp_idx = cutlass::canonical_warp_idx_sync(); + if (warp_idx == write_warp_idx && lane_predicate) { + cute::copy(epilogue_params.tma_store_O, tOsO, tOgO); + tma_store_arrive(); + } + } } - TiledCopyO gmem_tiled_copy_O; - flash::write_O( - epilogue_params.ptr_O, epilogue_params.tma_store_O, gmem_tiled_copy_O, - epilogue_params.layout_O, select<0, 2>(TileShape_MNK{}), sO, - m_block, bidh, bidb, seqlen_traits_q, write_warp_idx - ); } - template CUTLASS_DEVICE void - store_fp8(Params const& epilogue_params, - FrgTensorO const& tOrO, - FrgTensorLSE const& lse, + store_tail() { + if constexpr(!No_smem_O) { tma_store_wait<0>(); } + } + + // Write 0 to output and -inf to LSE + template + CUTLASS_DEVICE void + store_zero( + Params const& epilogue_params, SharedStorage& shared_storage, - TiledMma tiled_mma, int thread_idx, - cute::tuple const& block_coord, + cute::tuple const& block_coord, const Seqlen_traits& seqlen_traits_q ) { - // using SmemLayoutrO = typename Ktraits::SmemLayoutrO; - // using TiledCopyrO = typename Ktraits::TiledCopyrO; - auto [m_block, bidh, bidb] = block_coord; - - TiledCopyrO rmem_tiled_copy_O; - Tensor sOacc = make_tensor(make_smem_ptr(shared_storage.smem_o.data()), SmemLayoutrO{}); - auto rmem_thr_copy_O = rmem_tiled_copy_O.get_thread_slice(thread_idx); - - Tensor taccOsO = rmem_thr_copy_O.partition_D(sOacc); - Tensor tOrO_out = flash::convert_type(tOrO); // Element is Ktraits::OutputType - Tensor taccOrO = make_tensor(tOrO_out.data(), shape(taccOsO)); - - // Make sure all WGs have finished reading V - cutlass::arch::NamedBarrier::sync(NumMmaThreads, static_cast(FwdNamedBarriers::ValueEmpty) /*id*/); - cute::copy(rmem_tiled_copy_O, taccOrO, taccOsO); - cutlass::arch::fence_view_async_shared(); // ensure smem writes are visible to TMA - cutlass::arch::NamedBarrier::arrive(NumMmaThreads + cutlass::NumThreadsPerWarp, - cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); - - Tensor mLSE = make_tensor(make_gmem_ptr(epilogue_params.ptr_LSE), epilogue_params.layout_LSE); - Tensor gLSE = seqlen_traits_q.get_lse_local_tile_tensor( - mLSE, Shape>{}, bidh, bidb)(_, m_block); - Tensor caccO = cute::make_identity_tensor(select<0, 2>(TileShape_MNK{})); - auto thread_mma = tiled_mma.get_thread_slice(thread_idx); - Tensor taccOcO = thread_mma.partition_C(caccO); // (MMA,MMA_M,MMA_K) - static_assert(decltype(size<0, 0>(taccOcO))::value == 2); - static_assert(decltype(size<0, 1>(taccOcO))::value == 2); - // taccOcO has shape ((2, 2, V), MMA_M, MMA_K), we only take only the row indices. 
- Tensor taccOcO_row = taccOcO(make_coord(_0{}, _, _0{}), _, _0{}); - CUTE_STATIC_ASSERT_V(size(lse) == size(taccOcO_row)); // MMA_M - int const seqlen_q = [&] { - if constexpr(Seqlen_traits::kUseVarSeqLen) { return seqlen_traits_q.actual_seq_len; } - else { return shape<2>(epilogue_params.layout_LSE); } - }(); - if (get<1>(taccOcO_row(_0{})) == 0) { + static_assert(!Seqlen_traits::UseGQAPacking, "Don't call store_zero for gqa packed layouts."); + auto [m_block, n_split_idx, bidh, bidb] = block_coord; + + if constexpr(!Is_split) { + Tensor mO = make_tensor(make_gmem_ptr(epilogue_params.ptr_O), epilogue_params.layout_O); + Tensor gO = seqlen_traits_q.get_o_local_tile_tensor( + mO, select<0, 2>(TileShape_MNK{}), bidh, bidb, n_split_idx + )(_, _, m_block); // (M, K) + + TiledCopyO gmem_tiled_copy_O; + auto gmem_thr_copy_O = gmem_tiled_copy_O.get_thread_slice(thread_idx); + Tensor tOgO = gmem_thr_copy_O.partition_D(gO); + Tensor tOrO = make_fragment_like(tOgO); + clear(tOrO); + // Construct identity layout for sO + Tensor cO = cute::make_identity_tensor(select<0, 2>(TileShape_MNK{})); // (BLK_M,BLK_K) -> (blk_m,blk_k) + // Repeat the partitioning with identity layouts + Tensor tOcO = gmem_thr_copy_O.partition_D(cO); + Tensor tOpO = make_tensor(make_shape(size<2>(tOgO))); #pragma unroll - for (int mi = 0; mi < size(lse); ++mi) { - const int row = get<0>(taccOcO_row(mi)); - if (row < seqlen_q - m_block * kBlockM) { gLSE(row) = lse(mi); } - } + for (int k = 0; k < size(tOpO); ++k) { tOpO(k) = get<1>(tOcO(_0{}, _0{}, k)) < get<1>(epilogue_params.layout_O.shape()); } + // Clear_OOB_K must be false since we don't want to write zeros to gmem + flash::copy( + gmem_tiled_copy_O, tOrO, tOgO, tOcO, tOpO, seqlen_traits_q.actual_seq_len - m_block * kBlockM + ); } - int write_warp_idx = kNWarps - 1; - if (cutlass::canonical_warp_idx_sync() == write_warp_idx) { - cutlass::arch::NamedBarrier::sync(NumMmaThreads + cutlass::NumThreadsPerWarp, - cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); + Tensor mLSE = make_tensor(make_gmem_ptr(epilogue_params.ptr_LSE), epilogue_params.layout_LSE); + Tensor gLSE = seqlen_traits_q.get_lse_local_tile_tensor( + mLSE, Shape>{}, bidh, bidb, n_split_idx)(_, m_block); + static_assert(kBlockM <= NumMmaThreads); + if (thread_idx < min(kBlockM, seqlen_traits_q.actual_seq_len - m_block * kBlockM)) { + gLSE(thread_idx) = !Is_split ? 
INFINITY : -INFINITY; } - TiledCopyO gmem_tiled_copy_O; - Tensor sO = make_tensor(make_smem_ptr(shared_storage.smem_o.data()), SmemLayoutO{}); - flash::write_O( - epilogue_params.ptr_O, epilogue_params.tma_store_O, gmem_tiled_copy_O, - epilogue_params.layout_O, select<0, 2>(TileShape_MNK{}), sO, - m_block, bidh, bidb, seqlen_traits_q, write_warp_idx - ); - } - - CUTLASS_DEVICE void - store_tail() { - tma_store_wait<0>(); } // Write 0 to output and -inf to LSE template CUTLASS_DEVICE void - store_zero( + store_zero_gqa( Params const& epilogue_params, SharedStorage& shared_storage, int thread_idx, - cute::tuple const& block_coord, - const Seqlen_traits& seqlen_traits_q + cute::tuple const& block_coord, + const Seqlen_traits& seqlen_traits_q, + const cutlass::FastDivmod& qhead_per_khead_divmod ) { - auto [m_block, bidh, bidb] = block_coord; - Tensor mO = make_tensor(make_gmem_ptr(epilogue_params.ptr_O), epilogue_params.layout_O); - Tensor gO = seqlen_traits_q.get_local_tile_tensor( - mO, select<0, 2>(TileShape_MNK{}), bidh, bidb - )(_, _, m_block); // (M, K) + static_assert(Seqlen_traits::UseGQAPacking, "Special store_zero method for GQA packed layouts."); + auto [m_block, n_split_idx, bidh, bidb] = block_coord; + const int bidh_kv = qhead_per_khead_divmod.divide(bidh); + const int h_block = bidh % int(qhead_per_khead_divmod); + const int h_bound = min(shape<1>(epilogue_params.layout_O) - h_block * kBlockH, kBlockH); + const int m_bound = min(seqlen_traits_q.actual_seq_len - m_block * (kBlockM/kBlockH), kBlockM/kBlockH); + + if constexpr(!Is_split) { + Tensor mO = make_tensor(make_gmem_ptr(epilogue_params.ptr_O), epilogue_params.layout_O); + Tensor gO = seqlen_traits_q.get_o_local_tile_tensor( + mO, TileShapeOCopy{}, bidh_kv, bidb, n_split_idx) + (_, _, _, m_block, h_block); // (bM/bH, bH, K) + TiledCopyO gmem_tiled_copy_O; + auto gmem_thr_copy_O = gmem_tiled_copy_O.get_thread_slice(thread_idx); + if constexpr(kNumRows <= kBlockH) { + // slice into bM/bH and write out zero tiles (bH, K) + Tensor tOgO = gmem_thr_copy_O.partition_D(gO(0,_,_)); + Tensor tOrO = make_fragment_like(tOgO); + clear(tOrO); + Tensor cO = cute::make_identity_tensor(select<1, 2>(TileShapeOCopy{})); + Tensor tOcO = gmem_thr_copy_O.partition_D(cO); + // dummy predicate, unused since Is_even_K=true + Tensor tOpO = make_tensor(make_shape(size<2>(tOgO))); + #pragma unroll + for(int m = 0; m < m_bound; ++m) { + tOgO = gmem_thr_copy_O.partition_D(gO(m,_,_)); + flash::copy( + gmem_tiled_copy_O, tOrO, tOgO, tOcO, tOpO, h_bound + ); + } + } else { + // slice into bH and write out zero tiles (bM/bH, K) + Tensor tOgO = gmem_thr_copy_O.partition_D(gO(_,0,_)); + Tensor tOrO = make_fragment_like(tOgO); + clear(tOrO); + Tensor cO = cute::make_identity_tensor(select<0, 2>(TileShapeOCopy{})); + Tensor tOcO = gmem_thr_copy_O.partition_D(cO); + // dummy predicate, unused since Is_even_K=true + Tensor tOpO = make_tensor(make_shape(size<2>(tOgO))); + #pragma unroll + for(int h = 0; h < h_bound; ++h) { + tOgO = gmem_thr_copy_O.partition_D(gO(_,h,_)); + flash::copy( + gmem_tiled_copy_O, tOrO, tOgO, tOcO, tOpO, m_bound + ); + } + } + } + + const int h_offset = shape<1>(epilogue_params.layout_O) * bidh_kv + h_block * kBlockH; + const int thread_idx_h = thread_idx % kBlockH; + const int thread_idx_m = thread_idx / kBlockH; + Tensor mLSE = make_tensor(make_gmem_ptr(epilogue_params.ptr_LSE), epilogue_params.layout_LSE); - Tensor gLSE = seqlen_traits_q.get_lse_local_tile_tensor( - mLSE, Shape>{}, bidh, bidb)(_, m_block); - - TiledCopyO 
gmem_tiled_copy_O; - auto gmem_thr_copy_O = gmem_tiled_copy_O.get_thread_slice(thread_idx); - Tensor tOgO = gmem_thr_copy_O.partition_D(gO); - Tensor tOrO = make_fragment_like(tOgO); - clear(tOrO); - // Construct identity layout for sO - Tensor cO = cute::make_identity_tensor(select<0, 2>(TileShape_MNK{})); // (BLK_M,BLK_K) -> (blk_m,blk_k) - // Repeat the partitioning with identity layouts - Tensor tOcO = gmem_thr_copy_O.partition_D(cO); - Tensor tOpO = make_tensor(make_shape(size<2>(tOgO))); - #pragma unroll - for (int k = 0; k < size(tOpO); ++k) { tOpO(k) = get<1>(tOcO(_0{}, _0{}, k)) < get<1>(epilogue_params.layout_O.shape()); } - // Clear_OOB_K must be false since we don't want to write zeros to gmem - flash::copy( - gmem_tiled_copy_O, tOrO, tOgO, tOcO, tOpO, seqlen_traits_q.actual_seq_len - m_block * kBlockM - ); - static_assert(kBlockM <= NumMmaThreads); - if (thread_idx < seqlen_traits_q.actual_seq_len - m_block * kBlockM) { gLSE(thread_idx) = -INFINITY; } + Tensor gLSE = seqlen_traits_q.get_lse_local_tile_tensor( + mLSE, Shape>{}, h_offset + thread_idx_h, bidb, n_split_idx)(_, m_block); + if(thread_idx_h < h_bound && thread_idx_m < m_bound) { + gLSE(thread_idx_m) = !Is_split ? INFINITY : -INFINITY; + } } }; diff --git a/hopper/flash.h b/hopper/flash.h index 24ca27f69..bc1790dc2 100644 --- a/hopper/flash.h +++ b/hopper/flash.h @@ -49,6 +49,12 @@ struct Flash_fwd_params : public Qkv_params { index_t o_row_stride; index_t o_head_stride; + // The stride between rows of Oaccum. + index_t oaccum_batch_stride; + index_t oaccum_row_stride; + index_t oaccum_head_stride; + index_t oaccum_split_stride; + // The pointer to the P matrix. void * __restrict__ p_ptr; @@ -58,6 +64,7 @@ struct Flash_fwd_params : public Qkv_params { // The dimensions. int b, seqlen_q, seqlen_k, seqlen_knew, d, seqlen_q_rounded, seqlen_k_rounded, d_rounded, rotary_dim, total_q, total_k; + int b_k; // The scaling factors for the kernel. float scale_softmax; @@ -119,10 +126,8 @@ struct Flash_fwd_params : public Qkv_params { bool is_e4m3; bool is_causal; bool is_local; - - // If is_seqlens_k_cumulative, then seqlen_k is cu_seqlens_k[bidb + 1] - cu_seqlens_k[bidb]. - // Otherwise it's cu_seqlens_k[bidb], i.e., we use cu_seqlens_k to store the sequence lengths of K. - bool is_seqlens_k_cumulative; + bool is_kv_cache; + bool use_gqa_packing; bool is_rotary_interleaved; @@ -132,6 +137,7 @@ struct Flash_fwd_params : public Qkv_params { index_t alibi_slopes_batch_stride; bool unpadded_lse; // For varlen paths: LSE is in [nheads, total_seqlen_q] format instead of [b, nheads, seqlen_q]. + bool seqlenq_ngroups_swapped; // q has been transposed from (b, 1, (nheads_kv ngroups), d) to (b, ngroups, nheads_kv, d). 
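The `seqlenq_ngroups_swapped` comment above describes the decode-time trick of folding GQA query-head groups into the (otherwise size-1) query-length dimension. A hedged sketch of the reshape it refers to (shapes only, not the library code):

```python
import torch

b, nheads_kv, ngroups, d = 2, 8, 4, 128         # decode step: seqlen_q == 1
q = torch.randn(b, 1, nheads_kv * ngroups, d)   # (b, 1, (nheads_kv ngroups), d)

# Fold the per-KV-head query groups into the sequence dimension:
q_swapped = q.reshape(b, nheads_kv, ngroups, d).transpose(1, 2)
assert q_swapped.shape == (b, ngroups, nheads_kv, d)
```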
int * __restrict__ tile_count_semaphore; float * __restrict__ descale_q_ptr; @@ -187,4 +193,5 @@ struct Flash_bwd_params : public Flash_fwd_params { //////////////////////////////////////////////////////////////////////////////////////////////////// template void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream); +template void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream); template void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream); diff --git a/hopper/flash_api.cpp b/hopper/flash_api.cpp index 638752e4d..a4ffa254f 100644 --- a/hopper/flash_api.cpp +++ b/hopper/flash_api.cpp @@ -21,6 +21,7 @@ void set_params_fprop(Flash_fwd_params ¶ms, // sizes const size_t b, + const size_t b_k, const size_t seqlen_q, const size_t seqlen_k, const size_t seqlen_q_rounded, @@ -52,6 +53,7 @@ void set_params_fprop(Flash_fwd_params ¶ms, params.is_bf16 = q.dtype() == torch::kBFloat16; params.is_e4m3 = q.dtype() == torch::kFloat8_e4m3fn; + params.is_kv_cache = false; // Set the pointers and strides. params.q_ptr = q.data_ptr(); @@ -97,6 +99,7 @@ void set_params_fprop(Flash_fwd_params ¶ms, // Set the dimensions. params.b = b; + params.b_k = b_k; params.h = h; params.h_k = h_k; params.h_h_k_ratio = h / h_k; @@ -137,8 +140,8 @@ void set_params_fprop(Flash_fwd_params ¶ms, params.window_size_left = window_size_left; params.window_size_right = window_size_right; - params.is_causal = window_size_left == seqlen_k && window_size_right == 0; - if ((window_size_left < seqlen_k || window_size_right < seqlen_k) && !params.is_causal) { + params.is_causal = window_size_left == int(seqlen_k) && window_size_right == 0; + if ((window_size_left < int(seqlen_k) || window_size_right < int(seqlen_k)) && !params.is_causal) { params.is_local = true; } @@ -147,13 +150,12 @@ void set_params_fprop(Flash_fwd_params ¶ms, "This flash attention build does not support local attention."); #endif - params.is_seqlens_k_cumulative = true; - #ifdef FLASHATTENTION_DISABLE_UNEVEN_K TORCH_CHECK(d == d_rounded, "This flash attention build does not support headdim not being a multiple of 32."); #endif params.unpadded_lse = unpadded_lse; + params.seqlenq_ngroups_swapped = seqlenq_ngroups_swapped; } void set_params_dgrad(Flash_bwd_params ¶ms, @@ -192,7 +194,7 @@ void set_params_dgrad(Flash_bwd_params ¶ms, bool deterministic) { set_params_fprop(params, - b, seqlen_q, seqlen_k, seqlen_q_rounded, seqlen_k_rounded, h, h_k, d, d_rounded, + b, b, seqlen_q, seqlen_k, seqlen_q_rounded, seqlen_k_rounded, h, h_k, d, d_rounded, q, k, v, out, cu_seqlens_q_d, cu_seqlens_k_d, @@ -236,11 +238,198 @@ void set_params_dgrad(Flash_bwd_params ¶ms, params.deterministic = deterministic; } -void run_mha_fwd(Flash_fwd_params ¶ms, cudaStream_t stream, bool force_split_kernel=false) { - // HEADDIM_SWITCH(params.d, [&] { - // run_mha_fwd_(params, stream); - // }); - if (!params.is_e4m3) { + +// Find the number of splits that maximizes the occupancy. For example, if we have +// batch * n_heads = 48 and we have 108 SMs, having 2 splits (efficiency = 0.89) is +// better than having 3 splits (efficiency = 0.67). However, we also don't want too many +// splits as that would incur more HBM reads/writes. +// So we find the best efficiency, then find the smallest number of splits that gets 80% +// of the best efficiency. +inline int num_splits_heuristic(int batch_nheads_mblocks, int batch_nheads, int num_SMs, int num_n_blocks, + int max_splits, int head_size, bool use_one_mma_wg) { + // Goal of the starting threshold is to determine whether to split or not. 
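The occupancy figures quoted in the comment above (2 splits -> 0.89, 3 splits -> 0.67 for 48 tiles on 108 SMs) come from the same n_waves / ceil(n_waves) formula used in the efficiency loop further below; a quick check:

```python
import math

def wave_efficiency(batch_nheads_mblocks, num_splits, num_SMs):
    n_waves = batch_nheads_mblocks * num_splits / num_SMs
    return n_waves / math.ceil(n_waves)

print(round(wave_efficiency(48, 2, 108), 2))  # 96 tiles  / 1 wave of 108  -> 0.89
print(round(wave_efficiency(48, 3, 108), 2))  # 144 tiles / 2 waves of 108 -> 0.67
```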
+ // Empirically, the efficiency threshold can be much lower than 80% depending on num_n_blocks. + int num_m_blocks = batch_nheads_mblocks/batch_nheads; + float start_threshold; + float num_n_blocksf = float(num_n_blocks); + if (head_size == 128) { + if (std::log2f(num_n_blocksf) <= 4) { // 2048 -- .25 + start_threshold = .20f + (std::log2f(num_n_blocksf) - 3) * .05f; + } else if (std::log2f(num_n_blocksf) <= 5) { // 4096 -- .25 + start_threshold = .25f; + } else if (std::log2f(num_n_blocksf) <= 6) { // 8192 -- .36 + start_threshold = .28f + (std::log2f(num_n_blocksf) - 5) * .08f; + } else if (std::log2f(num_n_blocksf) <= 7) { // 16K -- .42 + start_threshold = .36f + (std::log2f(num_n_blocksf) - 6) * .06f; + } else { + // Just split freely + start_threshold = .8f; + } + if (num_m_blocks > 1 && start_threshold < .5f) + start_threshold += .05f * (std::log2f(num_n_blocksf) - 2); + } else if (head_size == 256) { + // TODO for hdim 256 + if (num_n_blocks <= 40) { + start_threshold = .24f; + } else if (std::log2f(num_n_blocksf) <= 8) { + start_threshold = .33f + std::max(0.f, (std::log2f(num_n_blocksf) - std::log2f(50)) * 0.02971f); + } else { + // Just split freely + start_threshold = .8f; + } + } else if (head_size == 64) { + if (use_one_mma_wg) { + if (std::log2f(num_n_blocksf) <= 4) { // 2K -- .33 + start_threshold = .33f; + } else if (std::log2f(num_n_blocksf) <= 5) { // 4K -- .37 + start_threshold = .33f + (std::log2f(num_n_blocksf) - 4) * .04f; + } else if (std::log2f(num_n_blocksf) <= 6) { // 8K -- .40 + start_threshold = .37f + (std::log2f(num_n_blocksf) - 5) * .03f; + } else if (std::log2f(num_n_blocksf) <= 7) { // 16K -- .43 + start_threshold = .4f + (std::log2f(num_n_blocksf) - 6) * .03f; + } else if (std::log2f(num_n_blocksf) <= 8) { // 32K -- .46 + start_threshold = .43f + (std::log2f(num_n_blocksf) - 7) * .03f; + } else { + start_threshold = .8f; + } + } else { + if (std::log2f(num_n_blocksf) <= 6) { // 8K -- .5 + start_threshold = .5f; + } else { + start_threshold = .8f; + } + } + } else { + // placeholder for other hdims + start_threshold = .8f; + } + + float first_wave = float(batch_nheads_mblocks) / num_SMs; + // printf("Start threshold and wave = %f, %f.\n", start_threshold, first_wave); + // Only use start_threshold if initial work doesn't exceed one wave + if ((first_wave/ceil(first_wave) > start_threshold && first_wave <= 1.f) || + (first_wave/ceil(first_wave) > .8f)) { + return 1; + } + // if (first_wave_batch_nheads > start_threshold) { return 1; } + // if (first_wave_batch_nheads > start_threshold || first_wave > .8f) { return 1; } + // if (float(batch_nheads)/num_SMs > start_threshold) { return 1; } + + // If num_n_blocks is too small, use 1 split + // For example, we never split for hdim = 128 and seqlen_k = 512, + // or for hdim = 128, seqlen_k = 1024, and one MMA warpgroup. + if (num_n_blocks < 8 || (use_one_mma_wg && num_n_blocks < 10)) { return 1; } + + max_splits = std::min({max_splits, num_SMs, num_n_blocks}); + float max_efficiency = 0.f; + std::vector efficiency; + efficiency.reserve(max_splits); + + // NOTE: disable split eligibility check for FA3 since we have dynamic tile scheduler + // for exiting splits with no work early, and check leads to efficiency quantization issues. + // Comment from FA2: + // Some splits are not eligible. For example, if we have 64 blocks and choose 11 splits, + // we'll have 6 * 10 + 4 blocks. If we choose 12 splits, we'll have 6 * 11 + (-2) blocks + // (i.e. it's 11 splits anyway). 
+ // So we check if the number of blocks per split is the same as the previous num_splits. + // auto ceildiv = [](int a, int b) { return (a + b - 1) / b; }; + // auto is_split_eligible = [&ceildiv, &num_n_blocks](int num_splits) { + // return num_splits == 1 || ceildiv(num_n_blocks, num_splits) != ceildiv(num_n_blocks, num_splits - 1); + // }; + for (int num_splits = 1; num_splits <= max_splits; num_splits++) { + // if (!is_split_eligible(num_splits)) { + // efficiency.push_back(0.f); + // } else { + float n_waves = float(batch_nheads_mblocks * num_splits) / num_SMs; + float eff = n_waves / ceil(n_waves); + // printf("num_splits = %d, n_waves = %f, ceil(n_waves) = %f, eff = %f\n", num_splits, n_waves, ceil(n_waves), eff); + if (eff > max_efficiency) { max_efficiency = eff; } + efficiency.push_back(eff); + // } + } + // Correct for excessive splitting with e.g. 1 bsz*nheads*mblocks + // Empirically, efficiency threshold in these cases is about 40% for 64K seqlen_k + float threshold = num_m_blocks == 1 ? std::min(0.3f + batch_nheads * 0.1f, 0.8f) : 0.8f; + threshold = threshold * max_efficiency; + // printf("Max efficiency = %f. Threshold = %f.\n", max_efficiency, threshold); + for (int num_splits = 1; num_splits <= max_splits; num_splits++) { + // if (!is_split_eligible(num_splits)) { continue; } + if (efficiency[num_splits - 1] > threshold) { + // printf("num_splits chosen = %d, threshold = %f, efficiency = %f.\n", num_splits, threshold, efficiency[num_splits - 1]); + return num_splits; + } + } + return 1; +} + +std::tuple set_params_splitkv(Flash_fwd_params ¶ms, const int batch_size, + const int num_heads, const int num_heads_k, const int head_size, const int max_seqlen_k, const int max_seqlen_q, + const int head_size_rounded, const float p_dropout, + const int num_splits, cudaDeviceProp *dprops, bool use_gqa_packing, bool is_causal, struct c10::TensorOptions opts) { + auto ceildiv = [](int a, int b) { return (a + b - 1) / b; }; + + params.num_splits = num_splits; + at::Tensor softmax_lse_accum; + at::Tensor out_accum; + + if (p_dropout == 0.0f) { // SplitKV is not implemented for dropout + if (num_splits < 1) { + const int gqa_ratio = num_heads / num_heads_k; + const int block_h = 1 << static_cast(std::ceil(std::log2(std::clamp(gqa_ratio, 1, 32)))); + const int block_m = head_size == 64 ? 192 : 128; + const bool use_one_mma_wg = max_seqlen_q <= 64/block_h; + + int block_n = 128; + if (head_size == 128 && !is_causal) { + block_n = 176; + } else if (head_size == 256) { + block_n = use_one_mma_wg ? 96 : 80; + } + const int num_n_blocks = (max_seqlen_k + block_n - 1) / block_n; + const int batch_nheads = use_gqa_packing ? batch_size * num_heads_k : batch_size * num_heads; + const int batch_nheads_mblocks = use_gqa_packing + ? 
ceildiv(max_seqlen_q, block_m / block_h) * batch_nheads + : ceildiv(max_seqlen_q, block_m) * batch_nheads; + params.num_splits = num_splits_heuristic(batch_nheads_mblocks, batch_nheads, + dprops->multiProcessorCount, num_n_blocks, 128, head_size, use_one_mma_wg); + // printf("Num splits heuristic = %d.\n", params.num_splits); + } + if (params.num_splits > 1) { + softmax_lse_accum = torch::empty({params.num_splits, batch_size, num_heads, max_seqlen_q}, opts.dtype(at::kFloat)); + out_accum = torch::empty({params.num_splits, batch_size, num_heads, max_seqlen_q, head_size_rounded}, opts.dtype(at::kFloat)); + params.softmax_lseaccum_ptr = softmax_lse_accum.data_ptr(); + params.oaccum_ptr = out_accum.data_ptr(); + params.oaccum_row_stride = out_accum.stride(-2); + params.oaccum_head_stride = out_accum.stride(-3); + params.oaccum_batch_stride = out_accum.stride(-4); + params.oaccum_split_stride = out_accum.stride(0); + } + TORCH_CHECK(params.num_splits <= 128, "num_splits > 128 not supported"); + } + + return std::make_tuple(softmax_lse_accum, out_accum); +} + + +void run_mha_fwd(Flash_fwd_params ¶ms, cudaStream_t stream, bool force_split_kernel=false) { + + int dtype = 1; + if (params.is_bf16) { dtype = 2; } + else if (params.is_e4m3) { dtype = 3; } + PREC_SWITCH(dtype, Element, [&] { + HEADDIM_SWITCH(params.d, kHeadSize, [&] { + if(!params.use_gqa_packing) { + run_mha_fwd_(params, stream); + } else { + QUERYHEAD_SWITCH(params.h_h_k_ratio, kBlockH, [&] { + run_mha_fwd_gqa_(params, stream); + }); + } + }); + }); + +#if 0 + if (!params.is_e4m3) { if (params.is_bf16) { if (params.d == 64) { run_mha_fwd_(params, stream); @@ -265,8 +454,9 @@ void run_mha_fwd(Flash_fwd_params ¶ms, cudaStream_t stream, bool force_split run_mha_fwd_(params, stream); } else if (params.d == 256) { run_mha_fwd_(params, stream); - } + } } +#endif } std::vector @@ -280,19 +470,17 @@ mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size c10::optional &descale_v_, // 1 bool is_causal, int window_size_left, - int window_size_right) { + int window_size_right, + bool use_gqa_packing = false + ) { auto dprops = at::cuda::getCurrentDeviceProperties(); bool is_sm90 = dprops->major == 9 && dprops->minor == 0; - TORCH_CHECK(is_sm90, "FlashAttention only supports Hopper GPUs or newer."); + TORCH_CHECK(is_sm90, "FlashAttention-3 only supports Hopper GPUs or newer."); auto q_dtype = q.dtype(); - // TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16, - // "FlashAttention only support fp16 and bf16 data type for now"); - // TODO: will add e4m3 later - // TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kFloat8_e4m3fn, - // "FlashAttention only support fp16 and bf16 data type"); - // "FlashAttention only support fp16 and fp8 (e4m3) data type for now"); + TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16 || q_dtype == at::ScalarType::Float8_e4m3fn, + "FlashAttention-3 only support fp16, bf16, or fp8 e4m3 data type"); TORCH_CHECK(k.dtype() == q_dtype, "query and key must have the same dtype"); TORCH_CHECK(v.dtype() == q_dtype, "query and value must have the same dtype"); @@ -313,6 +501,8 @@ mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size TORCH_CHECK(batch_size > 0, "batch size must be positive"); TORCH_CHECK(head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256"); TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query"); + // Guard against mistaken setting of gqa flag 
+ if (num_heads == num_heads_k) { use_gqa_packing = false; } TORCH_CHECK(head_size_og == 64 || head_size_og == 128 || head_size_og == 256, "Only support head size 64, 128, and 256 for now"); @@ -336,7 +526,7 @@ mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size out = out_.value(); // TORCH_CHECK(out.dtype() == q_dtype, "Output must have the same dtype as inputs"); TORCH_CHECK(q_dtype == at::ScalarType::Float8_e4m3fn - ? (out.dtype() == at::kHalf) + ? (out.dtype() == at::kBFloat16) : (out.dtype() == q_dtype), "Output must have the same dtype as input dtype if dtype is " "not fp8, or fp16 for fp8 input."); @@ -346,7 +536,7 @@ mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size if (head_size_og % 8 != 0) { out = torch::empty_like(q_padded); } } else { if (q_dtype == at::ScalarType::Float8_e4m3fn) - out = torch::empty_like(q_padded, at::kHalf); + out = torch::empty_like(q_padded, at::kBFloat16); else out = torch::empty_like(q_padded); } @@ -370,7 +560,7 @@ mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size Flash_fwd_params params; set_params_fprop(params, - batch_size, + batch_size, batch_size, seqlen_q, seqlen_k, seqlen_q_rounded, seqlen_k_rounded, num_heads, num_heads_k, @@ -387,26 +577,27 @@ mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size /*window_size_left=*/window_size_left, /*window_size_right=*/window_size_right); - auto tile_count_semaphore = is_causal ? torch::zeros({1}, opts.dtype(torch::kInt32)) : torch::empty({1}, opts.dtype(torch::kInt32)); + auto tile_count_semaphore = is_causal || params.is_local + ? torch::zeros({1}, opts.dtype(torch::kInt32)) : torch::empty({1}, opts.dtype(torch::kInt32)); params.tile_count_semaphore = tile_count_semaphore.data_ptr(); + at::Tensor descale_q, descale_k, descale_v; if(q_dtype == at::ScalarType::Float8_e4m3fn) { - at::Tensor descale_q, descale_k, descale_v; - if (descale_q_.has_value() && descale_k_.has_value() && descale_k_.has_value()) { + if (descale_q_.has_value()) { descale_q = descale_q_.value(); - descale_k = descale_k_.value(); - descale_v = descale_v_.value(); CHECK_DEVICE(descale_q); - CHECK_DEVICE(descale_k); - CHECK_DEVICE(descale_v); CHECK_SHAPE(descale_q, 1); + } else { descale_q = torch::ones({1}, opts.dtype(at::kFloat)); } + if (descale_k_.has_value()) { + descale_k = descale_k_.value(); + CHECK_DEVICE(descale_k); CHECK_SHAPE(descale_k, 1); + } else { descale_k = torch::ones({1}, opts.dtype(at::kFloat)); } + if (descale_v_.has_value()) { + descale_v = descale_v_.value(); + CHECK_DEVICE(descale_v); CHECK_SHAPE(descale_v, 1); - } else { - descale_q = torch::ones({1}, opts.dtype(at::kFloat)); - descale_k = torch::ones({1}, opts.dtype(at::kFloat)); - descale_v = torch::ones({1}, opts.dtype(at::kFloat)); - } + } else { descale_v = torch::ones({1}, opts.dtype(at::kFloat)); } params.descale_q_ptr = descale_q.data_ptr(); params.descale_k_ptr = descale_k.data_ptr(); params.descale_v_ptr = descale_v.data_ptr(); @@ -415,6 +606,8 @@ mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size params.descale_k_ptr = nullptr; params.descale_v_ptr = nullptr; } + + params.use_gqa_packing = use_gqa_packing; if (seqlen_k > 0) { auto stream = at::cuda::getCurrentCUDAStream().stream(); @@ -550,7 +743,7 @@ mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \s Flash_fwd_params params; set_params_fprop(params, - batch_size, + batch_size, batch_size, max_seqlen_q, max_seqlen_k, seqlen_q_rounded, seqlen_k_rounded, num_heads, 
num_heads_k, @@ -1022,6 +1215,340 @@ mha_varlen_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x return { dq, dk, dv, softmax_d, dq_accum, softmax_lse_log2 }; } +std::vector +mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size + const at::Tensor &kcache, // batch_size_c x seqlen_k x num_heads_k x head_size or num_blocks x page_block_size x num_heads_k x head_size if there's a block_table. + const at::Tensor &vcache, // batch_size_c x seqlen_k x num_heads_k x head_size or num_blocks x page_block_size x num_heads_k x head_size if there's a block_table. + c10::optional &k_, // batch_size x seqlen_knew x num_heads_k x head_size + c10::optional &v_, // batch_size x seqlen_knew x num_heads_k x head_size + c10::optional &seqlens_k_, // batch_size + c10::optional &rotary_cos_, // seqlen_ro x (rotary_dim / 2) + c10::optional &rotary_sin_, // seqlen_ro x (rotary_dim / 2) + c10::optional &cache_batch_idx_, // indices to index into the KV cache + c10::optional &leftpad_k_, // batch_size + c10::optional &block_table_, // batch_size x max_num_blocks_per_seq + c10::optional &alibi_slopes_, // num_heads or batch_size x num_heads + c10::optional &out_, // batch_size x seqlen_q x num_heads x head_size + const float softmax_scale, + c10::optional &descale_q_, // 1 + c10::optional &descale_k_, // 1 + c10::optional &descale_v_, // 1 + bool is_causal, + int window_size_left, + int window_size_right, + const float softcap, + bool is_rotary_interleaved, // if true, rotary combines indices 0 & 1, else indices 0 & rotary_dim / 2 + int num_splits, + int max_seqlen_k_hint, + bool use_gqa_packing + ) { + + auto dprops = at::cuda::getCurrentDeviceProperties(); + // bool is_sm75 = dprops->major == 7 && dprops->minor == 5; + // bool is_sm8x = dprops->major == 8 && dprops->minor >= 0; + bool is_sm90 = dprops->major == 9 && dprops->minor == 0; + TORCH_CHECK(is_sm90, "FlashAttention-3 only supports Hopper GPUs or newer."); + + auto q_dtype = q.dtype(); + TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16 || q_dtype == at::ScalarType::Float8_e4m3fn, + "FlashAttention-3 only support fp16, bf16, or fp8 e4m3 data type"); + TORCH_CHECK(kcache.dtype() == q_dtype, "query and key must have the same dtype"); + TORCH_CHECK(vcache.dtype() == q_dtype, "query and value must have the same dtype"); + + CHECK_DEVICE(q); CHECK_DEVICE(kcache); CHECK_DEVICE(vcache); + + TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(kcache.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(vcache.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + + at::Tensor block_table; + const bool paged_KV = block_table_.has_value(); + if (paged_KV) { + TORCH_CHECK(!cache_batch_idx_.has_value(), "Paged KVcache does not support cache_batch_idx"); + block_table = block_table_.value(); + CHECK_DEVICE(block_table); + TORCH_CHECK(block_table.dtype() == torch::kInt32, "block_table must have dtype torch.int32"); + TORCH_CHECK(block_table.stride(-1) == 1, "block_table must have contiguous last dimension"); + } + + const auto sizes = q.sizes(); + + const int batch_size = sizes[0]; + int seqlen_q = sizes[1]; + int num_heads = sizes[2]; + const int head_size_og = sizes[3]; + + const int max_num_blocks_per_seq = !paged_KV ? 0 : block_table.size(1); + const int num_blocks = !paged_KV ? 0 : kcache.size(0); + const int page_block_size = !paged_KV ? 
1 : kcache.size(1); + TORCH_CHECK(!paged_KV || page_block_size % 256 == 0, "Paged KV cache block size must be divisible by 256"); + const int seqlen_k = !paged_KV ? kcache.size(1) : max_num_blocks_per_seq * page_block_size; + const int num_heads_k = kcache.size(2); + const int batch_size_c = !paged_KV ? kcache.size(0) : batch_size; + TORCH_CHECK(batch_size > 0, "batch size must be positive"); + TORCH_CHECK(head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256"); + TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query"); + // Guard against mistaken setting of gqa flag + if (num_heads == num_heads_k) { use_gqa_packing = false; } + + // causal=true is the same as causal=false in this case + if (seqlen_q == 1 && !alibi_slopes_.has_value()) { is_causal = false; } + if (is_causal) { window_size_right = 0; } + + // Faster to transpose q from (b, 1, (nheads_kv ngroups), d) to (b, ngroups, nheads_kv, d) in this case + // H/t Daniel Haziza + const int seqlenq_ngroups_swapped = + seqlen_q == 1 && num_heads > num_heads_k && window_size_left < 0 && + window_size_right < 0 && head_size_og % 8 == 0 && + !alibi_slopes_.has_value() && !use_gqa_packing; + if (seqlenq_ngroups_swapped) { + const int ngroups = num_heads / num_heads_k; + q = q.reshape({batch_size, num_heads_k, ngroups, head_size_og}).transpose(1, 2); + seqlen_q = ngroups; + num_heads = num_heads_k; + } + + if (window_size_left >= seqlen_k) { window_size_left = -1; } + if (window_size_right >= seqlen_k) { window_size_right = -1; } + + CHECK_SHAPE(q, batch_size, seqlen_q, num_heads, head_size_og); + if (!paged_KV) { + CHECK_SHAPE(kcache, batch_size_c, seqlen_k, num_heads_k, head_size_og); + CHECK_SHAPE(vcache, batch_size_c, seqlen_k, num_heads_k, head_size_og); + } else { + CHECK_SHAPE(kcache, num_blocks, page_block_size, num_heads_k, head_size_og); + CHECK_SHAPE(vcache, num_blocks, page_block_size, num_heads_k, head_size_og); + CHECK_SHAPE(block_table, batch_size, max_num_blocks_per_seq); + } + + at::Tensor q_padded, kcache_padded, vcache_padded; + if (head_size_og % 8 != 0) { + q_padded = torch::nn::functional::pad(q, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + kcache_padded = torch::nn::functional::pad(kcache, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + vcache_padded = torch::nn::functional::pad(vcache, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + } else { + q_padded = q; + kcache_padded = kcache; + vcache_padded = vcache; + } + + at::Tensor out; + if (out_.has_value()) { + out = out_.value(); + // TORCH_CHECK(out.dtype() == q_dtype, "Output must have the same dtype as inputs"); + TORCH_CHECK(q_dtype == at::ScalarType::Float8_e4m3fn + ? (out.dtype() == at::kBFloat16) + : (out.dtype() == q_dtype), + "Output must have the same dtype as input dtype if dtype is " + "not fp8, or fp16 for fp8 input."); + CHECK_DEVICE(out); + TORCH_CHECK(out.stride(-1) == 1, "Output tensor must have contiguous last dimension"); + CHECK_SHAPE(out, batch_size, seqlen_q, num_heads, head_size_og); + if (head_size_og % 8 != 0) { out = torch::empty_like(q_padded); } + } else { + if (q_dtype == at::ScalarType::Float8_e4m3fn) { + out = torch::empty_like(q_padded, at::kBFloat16); + } + else + out = torch::empty_like(q_padded); + } + + auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; + const int head_size = round_multiple(head_size_og, 8); + const int head_size_rounded = head_size <= 192 ? 
round_multiple(head_size, 32) : 256; + const int seqlen_q_rounded = round_multiple(seqlen_q, 128); + const int seqlen_k_rounded = round_multiple(seqlen_k, 128); + + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)q.get_device()}; + + auto opts = q.options(); + + auto softmax_lse = torch::empty({batch_size, num_heads, seqlen_q}, opts.dtype(at::kFloat)); + + Flash_fwd_params params; + set_params_fprop(params, + batch_size, batch_size_c, + seqlen_q, seqlen_k, + seqlen_q_rounded, seqlen_k_rounded, + num_heads, num_heads_k, + head_size, head_size_rounded, + q_padded, kcache_padded, vcache_padded, out, + /*cu_seqlens_q_d=*/nullptr, + /*cu_seqlens_k_d=*/nullptr, + /*seqused_q=*/nullptr, + /*seqused_k=*/nullptr, + /*p_ptr=*/nullptr, + softmax_lse.data_ptr(), + /*p_dropout=*/0.f, + softmax_scale, + window_size_left, + window_size_right + ); + + at::Tensor descale_q, descale_k, descale_v; + if(q_dtype == at::ScalarType::Float8_e4m3fn) { + if (descale_q_.has_value()) { + descale_q = descale_q_.value(); + CHECK_DEVICE(descale_q); + CHECK_SHAPE(descale_q, 1); + } else { descale_q = torch::ones({1}, opts.dtype(at::kFloat)); } + if (descale_k_.has_value()) { + descale_k = descale_k_.value(); + CHECK_DEVICE(descale_k); + CHECK_SHAPE(descale_k, 1); + } else { descale_k = torch::ones({1}, opts.dtype(at::kFloat)); } + if (descale_v_.has_value()) { + descale_v = descale_v_.value(); + CHECK_DEVICE(descale_v); + CHECK_SHAPE(descale_v, 1); + } else { descale_v = torch::ones({1}, opts.dtype(at::kFloat)); } + params.descale_q_ptr = descale_q.data_ptr(); + params.descale_k_ptr = descale_k.data_ptr(); + params.descale_v_ptr = descale_v.data_ptr(); + } else { + params.descale_q_ptr = nullptr; + params.descale_k_ptr = nullptr; + params.descale_v_ptr = nullptr; + } + + params.is_kv_cache = true; + + params.use_gqa_packing = use_gqa_packing; + + at::Tensor k, v, k_padded, v_padded; + if (k_.has_value()) { + TORCH_CHECK(v_.has_value(), "If key is supplied, value must also be passed in"); + TORCH_CHECK(seqlens_k_.has_value(), "If key is supplied, seqlens_k must also be passed in"); + TORCH_CHECK(seqlen_q <= seqlen_k, "If key is supplied, it must have seqlen <= the seqlen of the KV cache"); + k = k_.value(); + v = v_.value(); + TORCH_CHECK(k.dtype() == q_dtype, "Key must have the same dtype as query"); + TORCH_CHECK(v.dtype() == q_dtype, "Value must have the same dtype as query"); + CHECK_DEVICE(k); CHECK_DEVICE(v); + TORCH_CHECK(k.stride(-1) == 1, "Key tensor must have contiguous last dimension"); + TORCH_CHECK(v.stride(-1) == 1, "Value tensor must have contiguous last dimension"); + int seqlen_knew = k.size(1); + CHECK_SHAPE(k, batch_size, seqlen_knew, num_heads_k, head_size_og); + CHECK_SHAPE(v, batch_size, seqlen_knew, num_heads_k, head_size_og); + if (head_size_og % 8 != 0) { + k_padded = torch::nn::functional::pad(k, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + v_padded = torch::nn::functional::pad(v, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + } else { + k_padded = k; + v_padded = v; + } + params.seqlen_knew = seqlen_knew; + params.knew_ptr = k_padded.data_ptr(); + params.vnew_ptr = v_padded.data_ptr(); + // All stride are in elements, not bytes. 
+ params.knew_batch_stride = k_padded.stride(0); + params.vnew_batch_stride = v_padded.stride(0); + params.knew_row_stride = k_padded.stride(-3); + params.vnew_row_stride = v_padded.stride(-3); + params.knew_head_stride = k_padded.stride(-2); + params.vnew_head_stride = v_padded.stride(-2); + } + + if (seqlens_k_.has_value()) { + auto seqlens_k = seqlens_k_.value(); + TORCH_CHECK(seqlens_k.dtype() == torch::kInt32, "seqlens_k must have dtype int32"); + CHECK_DEVICE(seqlens_k); + CHECK_CONTIGUOUS(seqlens_k); + CHECK_SHAPE(seqlens_k, batch_size); + params.seqused_k = static_cast(seqlens_k.data_ptr()); + } + if (leftpad_k_.has_value()) { + TORCH_CHECK(!paged_KV, "We don't support Paged KV and leftpad_k running at the same time yet"); + auto leftpad_k = leftpad_k_.value(); + TORCH_CHECK(leftpad_k.dtype() == torch::kInt32, "leftpad_k must have dtype int32"); + CHECK_DEVICE(leftpad_k); + CHECK_CONTIGUOUS(leftpad_k); + CHECK_SHAPE(leftpad_k, batch_size); + TORCH_CHECK(false, "Left Padding K is not supported"); + //params.leftpad_k = static_cast(leftpad_k.data_ptr()); + } + + if (rotary_cos_.has_value()) { + TORCH_CHECK(k_.has_value(), "If rotary cos/sin are provided, new key / value to be appended to KV cache must also be provided"); + auto rotary_cos = rotary_cos_.value(); + CHECK_DEVICE(rotary_cos); + params.rotary_dim = rotary_cos.size(1) * 2; + TORCH_CHECK(params.rotary_dim <= head_size, "rotary_dim must be <= headdim"); + TORCH_CHECK(params.rotary_dim % 16 == 0, "Only rotary dimensions divisible by 16 are currently supported"); + const int seqlen_ro = rotary_cos.size(0); + TORCH_CHECK(seqlen_ro >= seqlen_k, "cos/sin seqlen must be at least the seqlen of KV cache"); + CHECK_SHAPE(rotary_cos, seqlen_ro, params.rotary_dim / 2); + CHECK_CONTIGUOUS(rotary_cos); + TORCH_CHECK(rotary_cos.scalar_type() == q_dtype, "rotary_cos must have the same dtype as query"); + + TORCH_CHECK(rotary_sin_.has_value(), "If rotary cos is provided, rotary sin must also be provided"); + auto rotary_sin = rotary_sin_.value(); + CHECK_DEVICE(rotary_sin); + CHECK_SHAPE(rotary_sin, seqlen_ro, params.rotary_dim / 2); + CHECK_CONTIGUOUS(rotary_sin); + TORCH_CHECK(rotary_sin.scalar_type() == q_dtype, "rotary_cos must have the same dtype as query"); + params.rotary_cos_ptr = rotary_cos.data_ptr(); + params.rotary_sin_ptr = rotary_sin.data_ptr(); + params.is_rotary_interleaved = is_rotary_interleaved; + } else { + params.rotary_dim = 0; + } + + if (cache_batch_idx_.has_value()) { + auto cache_batch_idx = cache_batch_idx_.value(); + CHECK_DEVICE(cache_batch_idx); + CHECK_CONTIGUOUS(cache_batch_idx); + TORCH_CHECK(cache_batch_idx.scalar_type() == torch::kInt32, "cache_batch_idx must have dtype int32"); + params.cache_batch_idx = reinterpret_cast(cache_batch_idx.data_ptr()); + } + + // Keep references to these tensors to extend their lifetime + at::Tensor softmax_lse_accum, out_accum; + std::tie(softmax_lse_accum, out_accum) = set_params_splitkv( + params, batch_size, num_heads, num_heads_k, head_size, max_seqlen_k_hint, seqlen_q, + head_size_rounded, /*dropout*/ 0.f, num_splits, dprops, use_gqa_packing, is_causal, opts); + + auto tile_count_semaphore = is_causal || params.is_local || params.num_splits != 1 + ? 
torch::zeros({1}, opts.dtype(torch::kInt32)) + : torch::empty({1}, opts.dtype(torch::kInt32)); + params.tile_count_semaphore = tile_count_semaphore.data_ptr(); + + if (paged_KV) { + params.block_table = block_table.data_ptr(); + params.block_table_batch_stride = block_table.stride(0); + } + params.page_block_size = page_block_size; + + TORCH_CHECK(!alibi_slopes_.has_value(), "Alibi Slopes are not supported yet"); + //set_params_alibi(params, alibi_slopes_, batch_size, num_heads); + + auto stream = at::cuda::getCurrentCUDAStream().stream(); + // Only split kernel supports appending to KV cache, or indexing to the cache with cache_batch_idx, + // or paged KV cache + //run_mha_fwd(params, stream, /*force_split_kernel=*/k_.has_value() || cache_batch_idx_.has_value() || paged_KV); + run_mha_fwd(params, stream); + + if (head_size_og % 8 != 0) { + out = out.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + if (out_.has_value()) { out_.value().copy_(out); } + if (k_.has_value()) { + // It's expensive to copy the KV cache here for the case where head size not divisible by 8, + // but we don't expect to get this case in practice. This is just so that the code works for that case. + kcache.copy_(kcache_padded.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)})); + vcache.copy_(vcache_padded.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)})); + } + } + + if (seqlenq_ngroups_swapped) { + out = out.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, head_size_og}); + softmax_lse = softmax_lse.reshape({batch_size, num_heads_k * seqlen_q, 1}); + } + + return {out, softmax_lse}; +} + PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.doc() = "FlashAttention"; @@ -1029,4 +1556,5 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("bwd", &mha_bwd, "Backward pass"); m.def("varlen_fwd", &mha_varlen_fwd, "Forward pass (variable length)"); m.def("varlen_bwd", &mha_varlen_bwd, "Varlen backward pass"); + m.def("fwd_kvcache", &mha_fwd_kvcache, "Forward pass, with KV-cache"); } diff --git a/hopper/flash_attn_interface.py b/hopper/flash_attn_interface.py index db2bd1a11..e1e4ffd30 100644 --- a/hopper/flash_attn_interface.py +++ b/hopper/flash_attn_interface.py @@ -14,7 +14,7 @@ def maybe_contiguous(x): return x.contiguous() if x is not None and x.stride(-1) != 1 else x -def _flash_attn_forward(q, k, v, softmax_scale, causal, window_size, descale_q = None, descale_k = None, descale_v = None): +def _flash_attn_forward(q, k, v, softmax_scale, causal, window_size, descale_q = None, descale_k = None, descale_v = None, gqa_parallel=False): q, k, v = [maybe_contiguous(x) for x in (q, k, v)] out, q, k, v, out_padded, softmax_lse, S_dmask = flashattn_hopper_cuda.fwd( q, @@ -28,6 +28,7 @@ def _flash_attn_forward(q, k, v, softmax_scale, causal, window_size, descale_q = causal, window_size[0], window_size[1], + gqa_parallel ) return out, q, k, v, out_padded, softmax_lse, S_dmask @@ -175,6 +176,7 @@ def forward( descale_q=None, descale_k=None, descale_v=None, + gqa_parallel=False, ): if softmax_scale is None: softmax_scale = q.shape[-1] ** (-0.5) @@ -187,13 +189,15 @@ def forward( window_size, descale_q=descale_q, descale_k=descale_k, - descale_v=descale_v, + descale_v=descale_v, + gqa_parallel=gqa_parallel, ) ctx.save_for_backward(q, k, v, out_padded, softmax_lse) ctx.softmax_scale = softmax_scale ctx.causal = causal ctx.window_size = window_size ctx.deterministic = deterministic + ctx.gqa_parallel = gqa_parallel return out, softmax_lse 
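The gqa_parallel flag threaded through _flash_attn_forward and FlashAttnFunc.forward above maps to use_gqa_packing on the C++ side, where it is silently disabled whenever the query and key/value head counts match. A minimal caller-side sketch follows (assumptions: an SM90 GPU, the module importable as flash_attn_interface, flash_attn_func returning (out, softmax_lse) as the forward above does, and arbitrary GQA shapes and dtype chosen for illustration):

import torch
import flash_attn_interface

# GQA layout: 32 query heads sharing 8 KV heads, head dim 128, bf16, on a Hopper GPU
q = torch.randn(2, 4096, 32, 128, device="cuda", dtype=torch.bfloat16)
k = torch.randn(2, 4096, 8, 128, device="cuda", dtype=torch.bfloat16)
v = torch.randn(2, 4096, 8, 128, device="cuda", dtype=torch.bfloat16)

# gqa_parallel=True requests the GQA-packed forward kernels; when the query and
# KV head counts are equal, the C++ guard falls back to the regular path.
out, softmax_lse = flash_attn_interface.flash_attn_func(
    q, k, v, causal=True, gqa_parallel=True
)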
@staticmethod @@ -218,7 +222,7 @@ def backward(ctx, dout, *args): dq = dq[..., : dout.shape[-1]] # We could have padded the head dimension dk = dk[..., : dout.shape[-1]] dv = dv[..., : dout.shape[-1]] - return dq, dk, dv, None, None, None, None, None, None, None + return dq, dk, dv, None, None, None, None, None, None, None, None class FlashAttnVarlenFunc(torch.autograd.Function): @@ -309,6 +313,7 @@ def flash_attn_func( descale_q=None, descale_k=None, descale_v=None, + gqa_parallel=False, ): """dropout_p should be set to 0.0 during evaluation Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads @@ -372,6 +377,7 @@ def flash_attn_func( descale_q, descale_k, descale_v, + gqa_parallel ) @@ -445,3 +451,180 @@ def flash_attn_varlen_func( seqused_q, seqused_k, ) + + +def flash_attn_with_kvcache( + q, + k_cache, + v_cache, + # k=None, + # v=None, + # rotary_cos=None, + # rotary_sin=None, + cache_seqlens: Optional[Union[(int, torch.Tensor)]] = None, + cache_batch_idx: Optional[torch.Tensor] = None, + # cache_leftpad: Optional[torch.Tensor] = None, + # block_table: Optional[torch.Tensor] = None, + softmax_scale=None, + causal=False, + window_size=(-1, -1), # -1 means infinite context window + # softcap=0.0, # 0.0 means deactivated + # rotary_interleaved=True, + # alibi_slopes=None, + num_splits=0, + return_softmax_lse=False, + gqa_parallel=None, + max_seqlen_k_hint=None, + descale_q=None, + descale_k=None, + descale_v=None, +): + """ + NOTE: The KV cache API for FlashAttention-3 is a work in progress. We reproduce the description + from the FlashAttention-2 method of the same name below. + + If k and v are not None, k_cache and v_cache will be updated *inplace* with the new values from + k and v. This is useful for incremental decoding: you can pass in the cached keys/values from + the previous step, and update them with the new keys/values from the current step, and do + attention with the updated cache, all in 1 kernel. + + If you pass in k / v, you must make sure that the cache is large enough to hold the new values. + For example, the KV cache could be pre-allocated with the max sequence length, and you can use + cache_seqlens to keep track of the current sequence lengths of each sequence in the batch. + + Also apply rotary embedding if rotary_cos and rotary_sin are passed in. The key @k will be + rotated by rotary_cos and rotary_sin at indices cache_seqlens, cache_seqlens + 1, etc. + If causal or local (i.e., window_size != (-1, -1)), the query @q will be rotated by rotary_cos + and rotary_sin at indices cache_seqlens, cache_seqlens + 1, etc. + If not causal and not local, the query @q will be rotated by rotary_cos and rotary_sin at + indices cache_seqlens only (i.e. we consider all tokens in @q to be at position cache_seqlens). + + See tests/test_flash_attn.py::test_flash_attn_kvcache for examples of how to use this function. + + Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads + than Q. Note that the number of heads in Q must be divisible by the number of heads in KV. + For example, if Q has 6 heads and K, V have 2 heads, head 0, 1, 2 of Q will attention to head + 0 of K, V, and head 3, 4, 5 of Q will attention to head 1 of K, V. + + If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix. 
+ For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is: + 1 1 1 1 0 + 1 1 1 1 1 + If seqlen_q = 5 and seqlen_k = 2, the causal mask is: + 0 0 + 0 0 + 0 0 + 1 0 + 1 1 + If the row of the mask is all zero, the output will be zero. + + If window_size != (-1, -1), implements sliding window local attention. Query at position i + will only attend to keys between + [i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]] inclusive. + + Note: Does not support backward pass. + + Arguments: + q: (batch_size, seqlen, nheads, headdim) + k_cache: (batch_size_cache, seqlen_cache, nheads_k, headdim) if there's no block_table, + or (num_blocks, page_block_size, nheads_k, headdim) if there's a block_table (i.e. paged KV cache) + page_block_size must be a multiple of 256. + v_cache: (batch_size_cache, seqlen_cache, nheads_k, headdim) if there's no block_table, + or (num_blocks, page_block_size, nheads_k, headdim) if there's a block_table (i.e. paged KV cache) + k [optional]: (batch_size, seqlen_new, nheads_k, headdim). If not None, we concatenate + k with k_cache, starting at the indices specified by cache_seqlens. + v [optional]: (batch_size, seqlen_new, nheads_k, headdim). Similar to k. + rotary_cos [optional]: (seqlen_ro, rotary_dim / 2). If not None, we apply rotary embedding + to k and q. Only applicable if k and v are passed in. rotary_dim must be divisible by 16. + rotary_sin [optional]: (seqlen_ro, rotary_dim / 2). Similar to rotary_cos. + cache_seqlens: int, or (batch_size,), dtype torch.int32. The sequence lengths of the + KV cache. + cache_batch_idx: (batch_size,), dtype torch.int32. The indices used to index into the KV cache. + If None, we assume that the batch indices are [0, 1, 2, ..., batch_size - 1]. + If the indices are not distinct, and k and v are provided, the values updated in the cache + might come from any of the duplicate indices. + cache_leftpad: (batch_size,), dtype torch.int32. The index that the KV cache starts. If None, assume 0. + block_table [optional]: (batch_size, max_num_blocks_per_seq), dtype torch.int32. + softmax_scale: float. The scaling of QK^T before applying softmax. + Default to 1 / sqrt(headdim). + causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling). + window_size: (left, right). If not (-1, -1), implements sliding window local attention. + softcap: float. Anything > 0 activates softcapping attention. + rotary_interleaved: bool. Only applicable if rotary_cos and rotary_sin are passed in. + If True, rotary embedding will combine dimensions 0 & 1, 2 & 3, etc. If False, + rotary embedding will combine dimensions 0 & rotary_dim / 2, 1 & rotary_dim / 2 + 1 + (i.e. GPT-NeoX style). + alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of + (-alibi_slope * |i + seqlen_k - seqlen_q - j|) + is added to the attention score of query i and key j. + num_splits: int. If > 1, split the key/value into this many chunks along the sequence. + If num_splits == 1, we don't split the key/value. If num_splits == 0, we use a heuristic + to automatically determine the number of splits. + Don't change this unless you know what you are doing. + return_softmax_lse: bool. Whether to return the logsumexp of the attention scores. + + Return: + out: (batch_size, seqlen, nheads, headdim). + softmax_lse [optional, if return_softmax_lse=True]: (batch_size, nheads, seqlen). The + logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax + normalization factor). 
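+
+        Example (illustrative single-token decode sketch; the shapes, dtype, and cache size
+        below are arbitrary choices for a Hopper GPU, not requirements of this function):
+            # 2 sequences, 32 query heads over 8 KV heads, head dim 128, bf16 cache
+            q = torch.randn(2, 1, 32, 128, device="cuda", dtype=torch.bfloat16)
+            k_cache = torch.zeros(2, 4096, 8, 128, device="cuda", dtype=torch.bfloat16)
+            v_cache = torch.zeros(2, 4096, 8, 128, device="cuda", dtype=torch.bfloat16)
+            # number of tokens already present in the cache for each sequence
+            cache_seqlens = torch.full((2,), 1000, dtype=torch.int32, device="cuda")
+            out = flash_attn_with_kvcache(
+                q, k_cache, v_cache, cache_seqlens=cache_seqlens, causal=True
+            )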
+ """ + + # unimplemented kwargs + k=None + v=None + rotary_cos=None + rotary_sin=None + cache_leftpad=None + block_table=None + softcap=0.0 + rotary_interleaved=True + alibi_slopes=None + + assert k_cache.stride(-1) == 1, "k_cache must have contiguous last dimension" + assert v_cache.stride(-1) == 1, "v_cache must have contiguous last dimension" + q, k, v = [maybe_contiguous(x) for x in (q, k, v)] + if softmax_scale is None: + softmax_scale = q.shape[-1] ** (-0.5) + if cache_seqlens is not None and isinstance(cache_seqlens, int): + cache_seqlens = torch.full( + (k_cache.shape[0],), cache_seqlens, dtype=torch.int32, device=k_cache.device + ) + cache_seqlens = maybe_contiguous(cache_seqlens) + cache_batch_idx = maybe_contiguous(cache_batch_idx) + # block_table = maybe_contiguous(block_table) + if gqa_parallel is None: + gqa_parallel = True if q.shape[1] <= 64 else False + # not in gqa/mqa setup + if q.shape[2] == k_cache.shape[2]: + gqa_parallel = False + if max_seqlen_k_hint is None: + max_seqlen_k_hint = k_cache.shape[1] + out, softmax_lse = flashattn_hopper_cuda.fwd_kvcache( + q, + k_cache, + v_cache, + k, + v, + cache_seqlens, + rotary_cos, + rotary_sin, + cache_batch_idx, + cache_leftpad, + block_table, + alibi_slopes, + None, + softmax_scale, + descale_q, + descale_k, + descale_v, + causal, + window_size[0], + window_size[1], + softcap, + rotary_interleaved, + num_splits, + max_seqlen_k_hint, + gqa_parallel + ) + return (out, softmax_lse) if return_softmax_lse else out diff --git a/hopper/flash_fwd_hdim128_bf16_gqa16_sm90.cu b/hopper/flash_fwd_hdim128_bf16_gqa16_sm90.cu new file mode 100644 index 000000000..d839721b1 --- /dev/null +++ b/hopper/flash_fwd_hdim128_bf16_gqa16_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim128_bf16_gqa2_sm90.cu b/hopper/flash_fwd_hdim128_bf16_gqa2_sm90.cu new file mode 100644 index 000000000..85d328151 --- /dev/null +++ b/hopper/flash_fwd_hdim128_bf16_gqa2_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim128_bf16_gqa32_sm90.cu b/hopper/flash_fwd_hdim128_bf16_gqa32_sm90.cu new file mode 100644 index 000000000..4bf5525c7 --- /dev/null +++ b/hopper/flash_fwd_hdim128_bf16_gqa32_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. 
+ +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim128_bf16_gqa4_sm90.cu b/hopper/flash_fwd_hdim128_bf16_gqa4_sm90.cu new file mode 100644 index 000000000..486c762ff --- /dev/null +++ b/hopper/flash_fwd_hdim128_bf16_gqa4_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim128_bf16_gqa8_sm90.cu b/hopper/flash_fwd_hdim128_bf16_gqa8_sm90.cu new file mode 100644 index 000000000..157081389 --- /dev/null +++ b/hopper/flash_fwd_hdim128_bf16_gqa8_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim128_e4m3_gqa16_sm90.cu b/hopper/flash_fwd_hdim128_e4m3_gqa16_sm90.cu new file mode 100644 index 000000000..45ce0357d --- /dev/null +++ b/hopper/flash_fwd_hdim128_e4m3_gqa16_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128_fp8_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim128_e4m3_gqa2_sm90.cu b/hopper/flash_fwd_hdim128_e4m3_gqa2_sm90.cu new file mode 100644 index 000000000..1941fe4a2 --- /dev/null +++ b/hopper/flash_fwd_hdim128_e4m3_gqa2_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128_fp8_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim128_e4m3_gqa32_sm90.cu b/hopper/flash_fwd_hdim128_e4m3_gqa32_sm90.cu new file mode 100644 index 000000000..c3c2d5e2f --- /dev/null +++ b/hopper/flash_fwd_hdim128_e4m3_gqa32_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128_fp8_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim128_e4m3_gqa4_sm90.cu b/hopper/flash_fwd_hdim128_e4m3_gqa4_sm90.cu new file mode 100644 index 000000000..834109070 --- /dev/null +++ b/hopper/flash_fwd_hdim128_e4m3_gqa4_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. 
+// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128_fp8_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim128_e4m3_gqa8_sm90.cu b/hopper/flash_fwd_hdim128_e4m3_gqa8_sm90.cu new file mode 100644 index 000000000..98cdac676 --- /dev/null +++ b/hopper/flash_fwd_hdim128_e4m3_gqa8_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128_fp8_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim128_fp16_gqa16_sm90.cu b/hopper/flash_fwd_hdim128_fp16_gqa16_sm90.cu new file mode 100644 index 000000000..988041bf6 --- /dev/null +++ b/hopper/flash_fwd_hdim128_fp16_gqa16_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim128_fp16_gqa2_sm90.cu b/hopper/flash_fwd_hdim128_fp16_gqa2_sm90.cu new file mode 100644 index 000000000..92936c1d7 --- /dev/null +++ b/hopper/flash_fwd_hdim128_fp16_gqa2_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim128_fp16_gqa32_sm90.cu b/hopper/flash_fwd_hdim128_fp16_gqa32_sm90.cu new file mode 100644 index 000000000..103931349 --- /dev/null +++ b/hopper/flash_fwd_hdim128_fp16_gqa32_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim128_fp16_gqa4_sm90.cu b/hopper/flash_fwd_hdim128_fp16_gqa4_sm90.cu new file mode 100644 index 000000000..2d369fcb3 --- /dev/null +++ b/hopper/flash_fwd_hdim128_fp16_gqa4_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim128_fp16_gqa8_sm90.cu b/hopper/flash_fwd_hdim128_fp16_gqa8_sm90.cu new file mode 100644 index 000000000..e556921af --- /dev/null +++ b/hopper/flash_fwd_hdim128_fp16_gqa8_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. 
+// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim256_bf16_gqa16_sm90.cu b/hopper/flash_fwd_hdim256_bf16_gqa16_sm90.cu new file mode 100644 index 000000000..2c9c35652 --- /dev/null +++ b/hopper/flash_fwd_hdim256_bf16_gqa16_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim256_bf16_gqa2_sm90.cu b/hopper/flash_fwd_hdim256_bf16_gqa2_sm90.cu new file mode 100644 index 000000000..5e72b41c4 --- /dev/null +++ b/hopper/flash_fwd_hdim256_bf16_gqa2_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim256_bf16_gqa32_sm90.cu b/hopper/flash_fwd_hdim256_bf16_gqa32_sm90.cu new file mode 100644 index 000000000..90ae2162a --- /dev/null +++ b/hopper/flash_fwd_hdim256_bf16_gqa32_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim256_bf16_gqa4_sm90.cu b/hopper/flash_fwd_hdim256_bf16_gqa4_sm90.cu new file mode 100644 index 000000000..b7c6345b2 --- /dev/null +++ b/hopper/flash_fwd_hdim256_bf16_gqa4_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim256_bf16_gqa8_sm90.cu b/hopper/flash_fwd_hdim256_bf16_gqa8_sm90.cu new file mode 100644 index 000000000..566760319 --- /dev/null +++ b/hopper/flash_fwd_hdim256_bf16_gqa8_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim256_e4m3_gqa16_sm90.cu b/hopper/flash_fwd_hdim256_e4m3_gqa16_sm90.cu new file mode 100644 index 000000000..9c0f7d626 --- /dev/null +++ b/hopper/flash_fwd_hdim256_e4m3_gqa16_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. 
+// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256_fp8_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim256_e4m3_gqa2_sm90.cu b/hopper/flash_fwd_hdim256_e4m3_gqa2_sm90.cu new file mode 100644 index 000000000..c41ac3d4e --- /dev/null +++ b/hopper/flash_fwd_hdim256_e4m3_gqa2_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256_fp8_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim256_e4m3_gqa32_sm90.cu b/hopper/flash_fwd_hdim256_e4m3_gqa32_sm90.cu new file mode 100644 index 000000000..b486e1a39 --- /dev/null +++ b/hopper/flash_fwd_hdim256_e4m3_gqa32_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256_fp8_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim256_e4m3_gqa4_sm90.cu b/hopper/flash_fwd_hdim256_e4m3_gqa4_sm90.cu new file mode 100644 index 000000000..2b9701786 --- /dev/null +++ b/hopper/flash_fwd_hdim256_e4m3_gqa4_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256_fp8_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim256_e4m3_gqa8_sm90.cu b/hopper/flash_fwd_hdim256_e4m3_gqa8_sm90.cu new file mode 100644 index 000000000..ebe0f92ca --- /dev/null +++ b/hopper/flash_fwd_hdim256_e4m3_gqa8_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256_fp8_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim256_fp16_gqa16_sm90.cu b/hopper/flash_fwd_hdim256_fp16_gqa16_sm90.cu new file mode 100644 index 000000000..91fc6200e --- /dev/null +++ b/hopper/flash_fwd_hdim256_fp16_gqa16_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. 
+ +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim256_fp16_gqa2_sm90.cu b/hopper/flash_fwd_hdim256_fp16_gqa2_sm90.cu new file mode 100644 index 000000000..21a81044a --- /dev/null +++ b/hopper/flash_fwd_hdim256_fp16_gqa2_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim256_fp16_gqa32_sm90.cu b/hopper/flash_fwd_hdim256_fp16_gqa32_sm90.cu new file mode 100644 index 000000000..502a66281 --- /dev/null +++ b/hopper/flash_fwd_hdim256_fp16_gqa32_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim256_fp16_gqa4_sm90.cu b/hopper/flash_fwd_hdim256_fp16_gqa4_sm90.cu new file mode 100644 index 000000000..e6dc49dc6 --- /dev/null +++ b/hopper/flash_fwd_hdim256_fp16_gqa4_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim256_fp16_gqa8_sm90.cu b/hopper/flash_fwd_hdim256_fp16_gqa8_sm90.cu new file mode 100644 index 000000000..046c9e304 --- /dev/null +++ b/hopper/flash_fwd_hdim256_fp16_gqa8_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim64_bf16_gqa16_sm90.cu b/hopper/flash_fwd_hdim64_bf16_gqa16_sm90.cu new file mode 100644 index 000000000..0381c601e --- /dev/null +++ b/hopper/flash_fwd_hdim64_bf16_gqa16_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim64_bf16_gqa2_sm90.cu b/hopper/flash_fwd_hdim64_bf16_gqa2_sm90.cu new file mode 100644 index 000000000..6be1d9c58 --- /dev/null +++ b/hopper/flash_fwd_hdim64_bf16_gqa2_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. 
+ +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim64_bf16_gqa32_sm90.cu b/hopper/flash_fwd_hdim64_bf16_gqa32_sm90.cu new file mode 100644 index 000000000..154efcac5 --- /dev/null +++ b/hopper/flash_fwd_hdim64_bf16_gqa32_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim64_bf16_gqa4_sm90.cu b/hopper/flash_fwd_hdim64_bf16_gqa4_sm90.cu new file mode 100644 index 000000000..b8fe56a32 --- /dev/null +++ b/hopper/flash_fwd_hdim64_bf16_gqa4_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim64_bf16_gqa8_sm90.cu b/hopper/flash_fwd_hdim64_bf16_gqa8_sm90.cu new file mode 100644 index 000000000..cda356c26 --- /dev/null +++ b/hopper/flash_fwd_hdim64_bf16_gqa8_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim64_e4m3_gqa16_sm90.cu b/hopper/flash_fwd_hdim64_e4m3_gqa16_sm90.cu new file mode 100644 index 000000000..74e61967a --- /dev/null +++ b/hopper/flash_fwd_hdim64_e4m3_gqa16_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64_fp8_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim64_e4m3_gqa2_sm90.cu b/hopper/flash_fwd_hdim64_e4m3_gqa2_sm90.cu new file mode 100644 index 000000000..ff8213c05 --- /dev/null +++ b/hopper/flash_fwd_hdim64_e4m3_gqa2_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64_fp8_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim64_e4m3_gqa32_sm90.cu b/hopper/flash_fwd_hdim64_e4m3_gqa32_sm90.cu new file mode 100644 index 000000000..22ce8ed06 --- /dev/null +++ b/hopper/flash_fwd_hdim64_e4m3_gqa32_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. 
+ +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64_fp8_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim64_e4m3_gqa4_sm90.cu b/hopper/flash_fwd_hdim64_e4m3_gqa4_sm90.cu new file mode 100644 index 000000000..b0f09e780 --- /dev/null +++ b/hopper/flash_fwd_hdim64_e4m3_gqa4_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64_fp8_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim64_e4m3_gqa8_sm90.cu b/hopper/flash_fwd_hdim64_e4m3_gqa8_sm90.cu new file mode 100644 index 000000000..16775723d --- /dev/null +++ b/hopper/flash_fwd_hdim64_e4m3_gqa8_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64_fp8_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim64_fp16_gqa16_sm90.cu b/hopper/flash_fwd_hdim64_fp16_gqa16_sm90.cu new file mode 100644 index 000000000..cbe5159d1 --- /dev/null +++ b/hopper/flash_fwd_hdim64_fp16_gqa16_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim64_fp16_gqa2_sm90.cu b/hopper/flash_fwd_hdim64_fp16_gqa2_sm90.cu new file mode 100644 index 000000000..f18c68b23 --- /dev/null +++ b/hopper/flash_fwd_hdim64_fp16_gqa2_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim64_fp16_gqa32_sm90.cu b/hopper/flash_fwd_hdim64_fp16_gqa32_sm90.cu new file mode 100644 index 000000000..a4cf2813d --- /dev/null +++ b/hopper/flash_fwd_hdim64_fp16_gqa32_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim64_fp16_gqa4_sm90.cu b/hopper/flash_fwd_hdim64_fp16_gqa4_sm90.cu new file mode 100644 index 000000000..8e9932dbd --- /dev/null +++ b/hopper/flash_fwd_hdim64_fp16_gqa4_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. 
+ +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64_gqa(params, stream); +} diff --git a/hopper/flash_fwd_hdim64_fp16_gqa8_sm90.cu b/hopper/flash_fwd_hdim64_fp16_gqa8_sm90.cu new file mode 100644 index 000000000..79cbce7d0 --- /dev/null +++ b/hopper/flash_fwd_hdim64_fp16_gqa8_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_gqa_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64_gqa(params, stream); +} diff --git a/hopper/flash_fwd_kernel.h b/hopper/flash_fwd_kernel.h index 9517c5e0c..4c5a109ad 100644 --- a/hopper/flash_fwd_kernel.h +++ b/hopper/flash_fwd_kernel.h @@ -24,31 +24,31 @@ namespace flash { using namespace cute; -template +template __global__ void __launch_bounds__(Ktraits::kNWarps * cutlass::NumThreadsPerWarp, 1) - compute_attn_ws(CUTE_GRID_CONSTANT typename CollectiveMainloopFwd::Params const mainloop_params, - CUTE_GRID_CONSTANT typename CollectiveEpilogueFwd::Params const epilogue_params, + compute_attn_ws(CUTE_GRID_CONSTANT typename CollectiveMainloopFwd::Params const mainloop_params, + CUTE_GRID_CONSTANT typename CollectiveEpilogueFwd::Params const epilogue_params, CUTE_GRID_CONSTANT typename TileScheduler::Params const scheduler_params, - Seqlen_traits seqlen_traits_q, Seqlen_traits seqlen_traits_k + Seqlen_traits_Q seqlen_traits_q, Seqlen_traits seqlen_traits_k ) { using Element = typename Ktraits::Element; - using ElementAccum = typename Ktraits::ElementAccum; - using SoftType = ElementAccum; using TileShape_MNK = typename Ktraits::TileShape_MNK; using ClusterShape = typename Ktraits::ClusterShape_MNK; static_assert(Ktraits::Is_WS); static constexpr bool Is_WS = Ktraits::Is_WS; + static constexpr bool No_smem_O = Ktraits::No_smem_O; static constexpr int NumMmaThreads = size(typename Ktraits::TiledMma0{}); static constexpr int NumCopyThreads = !Is_WS ? 
0 : cutlass::NumThreadsPerWarpGroup; static constexpr int kBlockM = Ktraits::kBlockM; + static constexpr int kBlockH = Ktraits::kBlockH; // static constexpr int kBlockN = Ktraits::kBlockN; - // constexpr int kHeadDim = Ktraits::kHeadDim; + // static constexpr int kHeadDim = Ktraits::kHeadDim; - using CollectiveMainloop = CollectiveMainloopFwd; - using CollectiveEpilogue = CollectiveEpilogueFwd; + using CollectiveMainloop = CollectiveMainloopFwd; + using CollectiveEpilogue = CollectiveEpilogueFwd; using MainloopPipeline = typename Ktraits::MainloopPipeline; using PipelineParams = typename MainloopPipeline::Params; @@ -80,7 +80,7 @@ __global__ void __launch_bounds__(Ktraits::kNWarps * cutlass::NumThreadsPerWarp, if (warp_idx == 0 && lane_predicate) { shared_storage.barrier_Q.init(1 /*numThreads*/); - shared_storage.barrier_O.init(size(ClusterShape{}) /*numThreads*/); + if constexpr (!No_smem_O) { shared_storage.barrier_O.init(size(ClusterShape{}) /*numThreads*/); } } // We're counting on pipeline_k to call cutlass::arch::fence_barrier_init(); MainloopPipeline pipeline_k(shared_storage.pipeline_k, pipeline_params, ClusterShape{}); @@ -97,10 +97,10 @@ __global__ void __launch_bounds__(Ktraits::kNWarps * cutlass::NumThreadsPerWarp, __syncthreads(); } - static_assert(Ktraits::kNWarps == 12 || Ktraits::kNWarps == 16); + // static_assert(Ktraits::kNWarps == 12 || Ktraits::kNWarps == 16); + static_assert(Ktraits::kNWarps == 8 || Ktraits::kNWarps == 12 || Ktraits::kNWarps == 16); if (warp_group_idx == 0) { // Producer cutlass::arch::warpgroup_reg_dealloc(); - // cutlass::arch::warpgroup_reg_dealloc<56>(); int warp_idx_in_warpgroup = __shfl_sync(0xffffffff, (threadIdx.x / 32) % 4, 0); if (warp_idx_in_warpgroup == 0) { // Load Q, K, V @@ -114,32 +114,37 @@ __global__ void __launch_bounds__(Ktraits::kNWarps * cutlass::NumThreadsPerWarp, work_tile_info.is_valid(scheduler_params); work_tile_info = scheduler.template get_next_work(scheduler_params, work_tile_info)) { auto block_coord = work_tile_info.get_block_coord(scheduler_params); - auto [m_block, bidh, bidb] = block_coord; + auto [m_block, n_split_idx, bidh, bidb] = block_coord; seqlen_traits_q.init(bidb); seqlen_traits_k.init(bidb); - if (m_block * kBlockM >= seqlen_traits_q.actual_seq_len) { - continue; + if constexpr(seqlen_traits_q.UseVarSeqLen) { + // NOTE: to support in future with gqa packed layouts, changed kBlockM to kBlockM/kBlockH + if (m_block * (kBlockM/kBlockH) >= seqlen_traits_q.actual_seq_len) { + continue; + } } - const int n_block_max = collective_mainloop.get_n_block_max( - mainloop_params, m_block, seqlen_traits_q, seqlen_traits_k); - const int n_block_min = collective_mainloop.get_n_block_min( - mainloop_params, m_block, seqlen_traits_q, seqlen_traits_k); - if ((Is_causal || Is_local || seqlen_traits_k.kUseVarSeqLen) && n_block_max <= n_block_min) { - scheduler.prefetch_next_work(scheduler_params, work_tile_info); - scheduler.broadcast_next_work(work_tile_info); - continue; + int n_block_min = 0, n_block_max; + collective_mainloop.get_n_block_min_max( + mainloop_params, m_block, n_split_idx, seqlen_traits_q, seqlen_traits_k, + n_block_min, n_block_max); + if constexpr (Is_causal || Is_local || seqlen_traits_k.UseVarSeqLen || Ktraits::Is_split) { + if(n_block_max <= n_block_min) { + scheduler.prefetch_next_work(scheduler_params, work_tile_info); + scheduler.broadcast_next_work(work_tile_info); + continue; + } } - collective_mainloop.load(mainloop_params, pipeline_k, pipeline_v, smem_pipe_write_k, smem_pipe_write_v, - 
shared_storage, scheduler, scheduler_params, work_tile_info, block_coord, work_idx, - seqlen_traits_q, seqlen_traits_k); + collective_mainloop.load( + mainloop_params, pipeline_k, pipeline_v, smem_pipe_write_k, smem_pipe_write_v, + shared_storage, scheduler, scheduler_params, work_tile_info, block_coord, work_idx, + seqlen_traits_q, seqlen_traits_k, n_block_min, n_block_max); ++work_idx; } collective_mainloop.load_tail(pipeline_k, pipeline_v, smem_pipe_write_k, smem_pipe_write_v); } } else { // Consumer - cutlass::arch::warpgroup_reg_alloc(); - // cutlass::arch::warpgroup_reg_alloc(); + cutlass::arch::warpgroup_reg_alloc(); TileScheduler scheduler(&shared_storage.tile_count_semaphore); // Initialize matmul objects. @@ -162,28 +167,41 @@ __global__ void __launch_bounds__(Ktraits::kNWarps * cutlass::NumThreadsPerWarp, flash::Softmax<2 * (2 * kBlockM / NumMmaThreads)> softmax(mainloop_params.softmax_scale_log2); auto block_coord = work_tile_info.get_block_coord(scheduler_params); - auto [m_block, bidh, bidb] = block_coord; + auto [m_block, n_split_idx, bidh, bidb] = block_coord; seqlen_traits_q.init(bidb); seqlen_traits_k.init(bidb); - if (m_block * kBlockM >= seqlen_traits_q.actual_seq_len) { - continue; - } - const int n_block_max = collective_mainloop.get_n_block_max( - mainloop_params, m_block, seqlen_traits_q, seqlen_traits_k); - const int n_block_min = collective_mainloop.get_n_block_min( - mainloop_params, m_block, seqlen_traits_q, seqlen_traits_k); - if ((Is_causal || Is_local || seqlen_traits_k.kUseVarSeqLen) && n_block_max <= n_block_min) { // We exit early and write 0 to gO and -inf to gLSE. - collective_epilogue.store_zero(epilogue_params, shared_storage, threadIdx.x - NumCopyThreads, block_coord, seqlen_traits_q); - continue; + if constexpr(seqlen_traits_q.UseVarSeqLen) { + // NOTE: to support in future with gqa packed layouts, changed kBlockM to kBlockM/kBlockH + if (m_block * (kBlockM/kBlockH) >= seqlen_traits_q.actual_seq_len) { + continue; + } } - - collective_mainloop.mma(mainloop_params, pipeline_k, pipeline_v, smem_pipe_read_k, smem_pipe_read_v, - tOrO, softmax, n_block_max, n_block_min, threadIdx.x - NumCopyThreads, work_idx, m_block, shared_storage, - seqlen_traits_q, seqlen_traits_k); - // tOrO, softmax, n_block_max, threadIdx.x - NumCopyThreads + (work_idx >> 30), work_idx, shared_storage); - collective_epilogue.store(epilogue_params, tOrO, softmax.row_sum, shared_storage, tiled_mma1, - threadIdx.x - NumCopyThreads, block_coord, seqlen_traits_q); + int n_block_max, n_block_min = 0; + collective_mainloop.get_n_block_min_max( + mainloop_params, m_block, n_split_idx, seqlen_traits_q, seqlen_traits_k, + n_block_min, n_block_max); + if constexpr (Is_causal || Is_local || seqlen_traits_k.UseVarSeqLen || Ktraits::Is_split) { + if(n_block_max <= n_block_min) { // We exit early and write 0 to gO and -inf to gLSE. 
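// Editor's note: the early-exit path above ("write 0 to gO and -inf to gLSE") now appears in two
// flavors, store_zero for the plain layout and store_zero_gqa for the packed-GQA layout. The
// reason -inf is the right LSE sentinel: partial results are later reduced with log-sum-exp
// weights, and exp(-inf) == 0, so a tile whose key range is empty (n_block_max <= n_block_min
// under causal/local masking or split partitioning) contributes nothing. Minimal standalone
// illustration; combine_two() is a hypothetical stand-in for the real combine kernel.
#include <cmath>
#include <cstdio>
#include <limits>

struct Partial { float lse; float o; };   // one row, one output element, for illustration

Partial combine_two(Partial a, Partial b) {
    float m = std::fmax(a.lse, b.lse);
    if (std::isinf(m) && m < 0) return {m, 0.f};               // both splits were empty
    float wa = std::exp(a.lse - m), wb = std::exp(b.lse - m);
    float lse = m + std::log(wa + wb);
    return {lse, (wa * a.o + wb * b.o) / (wa + wb)};
}

int main() {
    Partial real  = {1.7f, 0.42f};                                   // a split that saw keys
    Partial empty = {-std::numeric_limits<float>::infinity(), 0.f};  // a store_zero'd split
    Partial c = combine_two(real, empty);
    std::printf("lse=%f o=%f\n", c.lse, c.o);   // identical to `real`: the empty split vanishes
    return 0;
}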
+ if constexpr(!Seqlen_traits_Q::UseGQAPacking) { + collective_epilogue.store_zero(epilogue_params, shared_storage, threadIdx.x - NumCopyThreads, + block_coord, seqlen_traits_q); + } else { + collective_epilogue.store_zero_gqa(epilogue_params, shared_storage, threadIdx.x - NumCopyThreads, + block_coord, seqlen_traits_q, mainloop_params.qhead_per_khead_divmod); + } + continue; + } + } + + collective_mainloop.mma( + mainloop_params, pipeline_k, pipeline_v, smem_pipe_read_k, smem_pipe_read_v, + tOrO, softmax, n_block_min, n_block_max, threadIdx.x - NumCopyThreads, work_idx, + m_block, shared_storage, seqlen_traits_q, seqlen_traits_k); + // tOrO, softmax, n_block_max, threadIdx.x - NumCopyThreads + (work_idx >> 30), work_idx, shared_storage); + collective_epilogue.store( + epilogue_params, tOrO, softmax.row_sum, shared_storage, tiled_mma1, + threadIdx.x - NumCopyThreads, block_coord, seqlen_traits_q, mainloop_params.qhead_per_khead_divmod); ++work_idx; } @@ -192,35 +210,34 @@ __global__ void __launch_bounds__(Ktraits::kNWarps * cutlass::NumThreadsPerWarp, } -template +template __global__ void __launch_bounds__(Ktraits::kNWarps * cutlass::NumThreadsPerWarp, 1) - compute_attn_ws_fp8(CUTE_GRID_CONSTANT typename CollectiveMainloopFwd::Params const mainloop_params, - CUTE_GRID_CONSTANT typename CollectiveEpilogueFwd::Params const epilogue_params, + compute_attn_ws_fp8(CUTE_GRID_CONSTANT typename CollectiveMainloopFwd::Params const mainloop_params, + CUTE_GRID_CONSTANT typename CollectiveEpilogueFwd::Params const epilogue_params, CUTE_GRID_CONSTANT typename TileScheduler::Params const scheduler_params, - Seqlen_traits seqlen_traits_q, Seqlen_traits seqlen_traits_k + Seqlen_traits_Q seqlen_traits_q, Seqlen_traits seqlen_traits_k ) { using Element = typename Ktraits::Element; static_assert(cutlass::sizeof_bits_v == 8); - using ElementAccum = typename Ktraits::ElementAccum; - using SoftType = ElementAccum; using TileShape_MNK = typename Ktraits::TileShape_MNK; using ClusterShape = typename Ktraits::ClusterShape_MNK; static_assert(Ktraits::Is_WS); static constexpr bool Is_WS = Ktraits::Is_WS; - static constexpr bool kUseVarSeqLen = Seqlen_traits::kUseVarSeqLen; + static constexpr bool No_smem_O = Ktraits::No_smem_O; static constexpr int NumMmaThreads = size(typename Ktraits::TiledMma0{}); static constexpr int NumCopyThreads = !Is_WS ? 
0 : cutlass::NumThreadsPerWarpGroup; static constexpr int kBlockM = Ktraits::kBlockM; + static constexpr int kBlockH = Ktraits::kBlockH; // static constexpr int kBlockN = Ktraits::kBlockN; // static constexpr int kHeadDim = Ktraits::kHeadDim; - static constexpr bool Delay_V_release = Is_causal && Ktraits::kHeadDim == 128; + static constexpr bool Delay_V_release = Is_causal && Ktraits::kHeadDim == 128 && Ktraits::kNWarps != 8; static constexpr bool Use_max_offset = true; - using CollectiveMainloop = CollectiveMainloopFwd; - using CollectiveEpilogue = CollectiveEpilogueFwd; + using CollectiveMainloop = CollectiveMainloopFwd; + using CollectiveEpilogue = CollectiveEpilogueFwd; using MainloopPipeline = typename Ktraits::MainloopPipeline; using MainloopPipelineVt = typename Ktraits::MainloopPipelineNoTMA; @@ -260,7 +277,7 @@ __global__ void __launch_bounds__(Ktraits::kNWarps * cutlass::NumThreadsPerWarp, if (warp_idx == 0 && lane_predicate) { shared_storage.barrier_Q.init(1 /*numThreads*/); - shared_storage.barrier_O.init(size(ClusterShape{}) /*numThreads*/); + if constexpr (!No_smem_O) { shared_storage.barrier_O.init(size(ClusterShape{}) /*numThreads*/); } } // We're counting on pipeline_k to call cutlass::arch::fence_barrier_init(); MainloopPipeline pipeline_k(shared_storage.pipeline_k, pipeline_params, ClusterShape{}); @@ -277,6 +294,7 @@ __global__ void __launch_bounds__(Ktraits::kNWarps * cutlass::NumThreadsPerWarp, float descale_v = *mainloop_params.descale_v_ptr; shared_storage.softmax_scale_qk_log2 = mainloop_params.softmax_scale_log2 * descale_q * descale_k; shared_storage.descale_v = descale_v; + shared_storage.seqlen_init_k = seqlen_traits_k.UseVarSeqLen || bool(seqlen_traits_k.seq_used); // We need this to guarantee that the Pipeline init is visible to all producers and consumer blocks in the Cluster if constexpr (size(ClusterShape{}) > 1) { @@ -286,10 +304,10 @@ __global__ void __launch_bounds__(Ktraits::kNWarps * cutlass::NumThreadsPerWarp, __syncthreads(); } - static_assert(Ktraits::kNWarps == 12 || Ktraits::kNWarps == 16); + static_assert(Ktraits::kNWarps == 8 || Ktraits::kNWarps == 12 || Ktraits::kNWarps == 16); if (warp_group_idx == 0) { // Producer - cutlass::arch::warpgroup_reg_dealloc(); - + cutlass::arch::warpgroup_reg_dealloc(); + PipelineState smem_pipe_write = cutlass::make_producer_start_state(); PipelineState smem_pipe_read, smem_pipe_release; @@ -300,19 +318,22 @@ __global__ void __launch_bounds__(Ktraits::kNWarps * cutlass::NumThreadsPerWarp, work_tile_info.is_valid(scheduler_params); work_tile_info = scheduler.template get_next_work(scheduler_params, work_tile_info)) { auto block_coord = work_tile_info.get_block_coord(scheduler_params); - auto [m_block, bidh, bidb] = block_coord; + auto [m_block, n_split_idx, bidh, bidb] = block_coord; - if constexpr(kUseVarSeqLen) { - seqlen_traits_q.init(bidb); - seqlen_traits_k.init(bidb); - if (m_block * kBlockM >= seqlen_traits_q.actual_seq_len) { + if constexpr (seqlen_traits_q.UseVarSeqLen) { seqlen_traits_q.init(bidb); } + if (shared_storage.seqlen_init_k) { seqlen_traits_k.init_no_guard(bidb); } + if constexpr(seqlen_traits_q.UseVarSeqLen) { + // NOTE: to support in future with gqa packed layout, changed kBlockM to kBlockM/kBlockH + if (m_block * (kBlockM/kBlockH) >= seqlen_traits_q.actual_seq_len) { continue; } } - int n_block_max = collective_mainloop.get_n_block_max( - mainloop_params, m_block, seqlen_traits_q, seqlen_traits_k); - if constexpr(Is_causal) { - if(n_block_max <= 0) { + int n_block_min = 0, n_block_max; + 
collective_mainloop.get_n_block_min_max( + mainloop_params, m_block, n_split_idx, seqlen_traits_q, seqlen_traits_k, + n_block_min, n_block_max); + if constexpr (Is_causal || Is_local ||seqlen_traits_k.UseVarSeqLen || Ktraits::Is_split) { + if(n_block_max <= n_block_min) { scheduler.prefetch_next_work(scheduler_params, work_tile_info); scheduler.broadcast_next_work(work_tile_info); // need to sync producer warpgroup @@ -321,10 +342,9 @@ __global__ void __launch_bounds__(Ktraits::kNWarps * cutlass::NumThreadsPerWarp, } } collective_mainloop.load_fp8( - mainloop_params, pipeline_k, pipeline_v, pipeline_vt, - smem_pipe_write, smem_pipe_read, shared_storage, - scheduler, scheduler_params, work_tile_info, block_coord, work_idx, - seqlen_traits_q, seqlen_traits_k); + mainloop_params, pipeline_k, pipeline_v, pipeline_vt, smem_pipe_write, smem_pipe_read, + shared_storage, scheduler, scheduler_params, work_tile_info, block_coord, work_idx, + seqlen_traits_q, seqlen_traits_k, n_block_min, n_block_max); ++work_idx; // don't need to sync producer warpgroup here // if constexpr (Is_causal) { @@ -332,7 +352,7 @@ __global__ void __launch_bounds__(Ktraits::kNWarps * cutlass::NumThreadsPerWarp, } collective_mainloop.load_tail_one_write(pipeline_k, pipeline_v, smem_pipe_write); } else { // Consumer - cutlass::arch::warpgroup_reg_alloc(); + cutlass::arch::warpgroup_reg_alloc(); TileScheduler scheduler(&shared_storage.tile_count_semaphore); // Initialize matmul objects. @@ -344,6 +364,7 @@ __global__ void __launch_bounds__(Ktraits::kNWarps * cutlass::NumThreadsPerWarp, scheduler.init_consumer(); int work_idx = 0; + CUTLASS_PRAGMA_NO_UNROLL for (auto work_tile_info = scheduler.get_initial_work(); work_tile_info.is_valid(scheduler_params); @@ -353,37 +374,42 @@ __global__ void __launch_bounds__(Ktraits::kNWarps * cutlass::NumThreadsPerWarp, flash::Softmax<2 * (2 * kBlockM / NumMmaThreads), Use_max_offset> softmax(shared_storage.softmax_scale_qk_log2); auto block_coord = work_tile_info.get_block_coord(scheduler_params); - auto [m_block, bidh, bidb] = block_coord; + auto [m_block, n_split_idx, bidh, bidb] = block_coord; - if constexpr(kUseVarSeqLen) { - seqlen_traits_q.init(bidb); - seqlen_traits_k.init(bidb); - if (m_block * kBlockM >= seqlen_traits_q.actual_seq_len) { + if constexpr (seqlen_traits_q.UseVarSeqLen) { seqlen_traits_q.init(bidb); } + if (shared_storage.seqlen_init_k) { seqlen_traits_k.init_no_guard(bidb); } + if constexpr(seqlen_traits_q.UseVarSeqLen) { + // NOTE: to support in future with gqa packed layout, changed kBlockM to kBlockM/kBlockH + if (m_block * (kBlockM/kBlockH) >= seqlen_traits_q.actual_seq_len) { continue; } } - int n_block_max = collective_mainloop.get_n_block_max( - mainloop_params, m_block, seqlen_traits_q, seqlen_traits_k); - if constexpr(Is_causal) { - if(n_block_max <= 0) { // We exit early and write 0 to gO and -inf to gLSE. - collective_epilogue.store_zero(epilogue_params, shared_storage, threadIdx.x - NumCopyThreads, block_coord, seqlen_traits_q); + int n_block_max, n_block_min = 0; + collective_mainloop.get_n_block_min_max( + mainloop_params, m_block, n_split_idx, seqlen_traits_q, seqlen_traits_k, + n_block_min, n_block_max); + if constexpr (Is_causal || Is_local || seqlen_traits_k.UseVarSeqLen || Ktraits::Is_split) { + if(n_block_max <= n_block_min) { // We exit early and write 0 to gO and -inf to gLSE. 
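// Editor's note: both the 16-bit and fp8 kernels now ask the mainloop for an
// [n_block_min, n_block_max) range instead of only a maximum, because a split-KV tile owns just
// its slice of the key blocks and causal/local masking can further shrink (or empty) that slice,
// which is what triggers the early-exit branch here. Standalone restatement of the
// get_n_block_min_max() logic added to the mainloop later in this diff (causal case only; the
// local-window clamp on n_block_min is omitted for brevity).
#include <algorithm>

constexpr int ceil_div(int a, int b) { return (a + b - 1) / b; }

// kBlockM_div_H = kBlockM / kBlockH: query rows covered by one m_block once kBlockH query heads
// are packed into the M tile.
void get_n_block_min_max(int seqlen_q, int seqlen_k, int m_block, int n_split_idx,
                         int num_splits, int kBlockN, int kBlockM_div_H, bool is_causal,
                         int &n_block_min, int &n_block_max) {
    n_block_min = 0;
    n_block_max = ceil_div(seqlen_k, kBlockN);
    if (num_splits > 1) {   // each split owns a contiguous chunk of key blocks
        int n_blocks_per_split = ceil_div(n_block_max, num_splits);
        n_block_min = n_split_idx * n_blocks_per_split;
        n_block_max = std::min(n_block_max, (n_split_idx + 1) * n_blocks_per_split);
    }
    if (is_causal) {        // rows of this m_block may only attend up to the diagonal
        n_block_max = std::min(
            n_block_max,
            ceil_div((m_block + 1) * kBlockM_div_H + seqlen_k - seqlen_q, kBlockN));
    }
    // If n_block_max <= n_block_min, the consumer writes O = 0, LSE = -inf and moves on.
}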
+ if constexpr(!Seqlen_traits_Q::UseGQAPacking) { + collective_epilogue.store_zero(epilogue_params, shared_storage, threadIdx.x - NumCopyThreads, + block_coord, seqlen_traits_q); + } else { + collective_epilogue.store_zero_gqa(epilogue_params, shared_storage, threadIdx.x - NumCopyThreads, + block_coord, seqlen_traits_q, mainloop_params.qhead_per_khead_divmod); + } continue; } } collective_mainloop.mma_fp8( mainloop_params, pipeline_k, pipeline_vt, smem_pipe_read, smem_pipe_release, - tOrO, softmax, n_block_max, - threadIdx.x - NumCopyThreads, work_idx, m_block, - shared_storage, seqlen_traits_q, seqlen_traits_k); - - #ifndef NO_FP8_COLUMN_PERMUTE - collective_epilogue.store_fp8(epilogue_params, tOrO, softmax.row_sum, shared_storage, tiled_mma1, - threadIdx.x - NumCopyThreads, block_coord, seqlen_traits_q); - #else - collective_epilogue.store(epilogue_params, tOrO, softmax.row_sum, shared_storage, tiled_mma1, - threadIdx.x - NumCopyThreads, block_coord, seqlen_traits_q); - #endif + tOrO, softmax, n_block_min, n_block_max, threadIdx.x - NumCopyThreads, work_idx, m_block, + shared_storage, seqlen_traits_q, seqlen_traits_k); + + collective_epilogue.store( + epilogue_params, tOrO, softmax.row_sum, shared_storage, tiled_mma1, + threadIdx.x - NumCopyThreads, block_coord, seqlen_traits_q, mainloop_params.qhead_per_khead_divmod); + ++work_idx; } collective_epilogue.store_tail(); diff --git a/hopper/flash_fwd_launch_template.h b/hopper/flash_fwd_launch_template.h index 0c0790e4b..46363b18f 100644 --- a/hopper/flash_fwd_launch_template.h +++ b/hopper/flash_fwd_launch_template.h @@ -16,46 +16,55 @@ #include "kernel_traits.h" #include "seq_len.h" #include "utils.h" +#include "combine.h" - -template +template void run_flash_fwd(Flash_fwd_params ¶ms, cudaStream_t stream) { static_assert(!(Is_causal && Is_local), "Is_causal and Is_local cannot be true at the same time."); using Element = typename Kernel_traits::Element; + using ElementAccum = typename Kernel_traits::ElementAccum; using OutputType = typename Kernel_traits::OutputType; using TileShape_MNK = typename Kernel_traits::TileShape_MNK; using ClusterShape = typename Kernel_traits::ClusterShape_MNK; - // print(typename Kernel_traits::SmemLayoutVt{}); printf("\n"); print(typename Kernel_traits::SmemLayoutVt_tmp{}); - using CollectiveMainloop = flash::CollectiveMainloopFwd; - using CollectiveEpilogue = flash::CollectiveEpilogueFwd; + constexpr static bool Is_split = Kernel_traits::Is_split; + static_assert(Seqlen_traits_Q::UseGQAPacking == (Kernel_traits::kBlockH > 1), "If kBlockH > 1, use gqa packed layouts"); + static_assert(!(Is_split && Seqlen_traits::UseVarSeqLen), "Split KV not yet supported for variable seqlen."); + + using CollectiveMainloop = flash::CollectiveMainloopFwd; + using CollectiveEpilogue = flash::CollectiveEpilogueFwd; using Scheduler = std::conditional_t< - Seqlen_traits::kUseVarSeqLen || Is_local, + Seqlen_traits::UseVarSeqLen, flash::SingleTileScheduler, - std::conditional_t + std::conditional_t, + flash::DynamicPersistentTileScheduler< + Kernel_traits::kNThreads - cutlass::NumThreadsPerWarpGroup, + Kernel_traits::NumProducerThreads, + Is_split + > >>; // using Scheduler = flash::SingleTileScheduler; - Seqlen_traits seqlen_traits_q( + Seqlen_traits_Q seqlen_traits_q( params.total_q, params.seqlen_q, params.cu_seqlens_q, params.seqused_q); Seqlen_traits seqlen_traits_k( params.total_k, params.seqlen_k, params.cu_seqlens_k, params.seqused_k); + typename CollectiveMainloop::Params mainloop_params = 
CollectiveMainloop::to_underlying_arguments({ - static_cast(params.q_ptr), + static_cast(params.q_ptr), seqlen_traits_q.get_gmem_layout( - params.seqlen_q, params.d, params.h, params.b, + params.seqlen_q, params.d, params.h_k, params.b, params.h_h_k_ratio, params.q_row_stride, params.q_head_stride, params.q_batch_stride ), // layout_Q static_cast(params.k_ptr), seqlen_traits_k.get_gmem_layout( - params.seqlen_k, params.d, params.h_k, params.b, + params.seqlen_k, params.d, params.h_k, params.b_k, params.k_row_stride, params.k_head_stride, params.k_batch_stride ), // layout_K static_cast(params.v_ptr), seqlen_traits_k.get_gmem_layout( - params.seqlen_k, params.d, params.h_k, params.b, + params.seqlen_k, params.d, params.h_k, params.b_k, params.v_row_stride, params.v_head_stride, params.v_batch_stride ), // layout_V params.scale_softmax_log2, @@ -63,32 +72,53 @@ void run_flash_fwd(Flash_fwd_params ¶ms, cudaStream_t stream) { params.descale_k_ptr, params.descale_v_ptr, params.window_size_left, - params.window_size_right - }); - typename CollectiveEpilogue::Params epilogue_params = - CollectiveEpilogue::to_underlying_arguments({ - static_cast(params.o_ptr), - seqlen_traits_q.get_gmem_layout( - params.seqlen_q, params.d, params.h, params.b, - params.o_row_stride, params.o_head_stride, params.o_batch_stride - ), // layout_O - static_cast(params.softmax_lse_ptr), - seqlen_traits_q.get_lse_gmem_layout( - params.seqlen_q, params.h, params.b - ) // layout_LSE + params.window_size_right, + ceil_div(params.h_h_k_ratio, Kernel_traits::kBlockH), + params.cache_batch_idx, + Is_split ? params.num_splits : 1 }); + typename CollectiveEpilogue::Params epilogue_params = [&] { + if constexpr(!Is_split) { + return CollectiveEpilogue::to_underlying_arguments({ + static_cast(params.o_ptr), + seqlen_traits_q.get_gmem_layout( + params.seqlen_q, params.d, params.h_k, params.b, params.h_h_k_ratio, + params.o_row_stride, params.o_head_stride, params.o_batch_stride + ), // layout_O + static_cast(params.softmax_lse_ptr), + seqlen_traits_q.get_lse_gmem_layout( + params.seqlen_q, params.h, params.b + ) // layout_LSE + }); + } else { + return CollectiveEpilogue::to_underlying_arguments({ + static_cast(params.oaccum_ptr), + seqlen_traits_q.get_oaccum_gmem_layout( + params.seqlen_q, params.d, params.h_k, params.b, params.h_h_k_ratio, params.num_splits, + params.oaccum_row_stride, params.oaccum_head_stride, params.oaccum_batch_stride, + params.oaccum_split_stride + ), // layout_O + static_cast(params.softmax_lseaccum_ptr), + seqlen_traits_q.get_lseaccum_gmem_layout( + params.seqlen_q, params.h, params.b, params.num_splits + ), // layout_LSE + }); + } + }(); - int num_blocks_m = cutlass::ceil_div(params.seqlen_q, Kernel_traits::kBlockM); - num_blocks_m = cutlass::ceil_div(num_blocks_m, size<0>(ClusterShape{})) * size<0>(ClusterShape{}); - typename Scheduler::Arguments scheduler_args = {num_blocks_m, params.h, params.b, params.tile_count_semaphore}; - typename Scheduler::Params scheduler_params = Scheduler::to_underlying_arguments(scheduler_args); + int num_blocks_m = cutlass::ceil_div(params.seqlen_q, Kernel_traits::kBlockM/Kernel_traits::kBlockH); + num_blocks_m = cutlass::ceil_div(num_blocks_m, size<0>(ClusterShape{})) * size<0>(ClusterShape{}); + int num_blocks_h = params.h_k * ceil_div(params.h_h_k_ratio, Kernel_traits::kBlockH); + typename Scheduler::Arguments scheduler_args = + {num_blocks_m, Is_split ? 
params.num_splits : 1, num_blocks_h, params.b, params.tile_count_semaphore}; + typename Scheduler::Params scheduler_params = Scheduler::to_underlying_arguments(scheduler_args); // Get the ptr to kernel function. void *kernel; if constexpr(cutlass::sizeof_bits_v == 8) - kernel = (void *)flash::compute_attn_ws_fp8; + kernel = (void *)flash::compute_attn_ws_fp8; else - kernel = (void *)flash::compute_attn_ws; + kernel = (void *)flash::compute_attn_ws; int smem_size = sizeof(typename Kernel_traits::SharedStorage); // int smem_size_q = sizeof(decltype((typename Kernel_traits::SharedStorage{}).smem_q)); // int smem_size_k = sizeof(decltype((typename Kernel_traits::SharedStorage{}).smem_k)); @@ -106,148 +136,407 @@ void run_flash_fwd(Flash_fwd_params ¶ms, cudaStream_t stream) { dim3 grid_dims = Scheduler::get_grid_dim(scheduler_args, multiprocessor_count); static constexpr int ctaSize = Kernel_traits::kNWarps * 32; dim3 block_dims(ctaSize); - dim3 cluster_dims(size<0>(ClusterShape{}), size<1>(ClusterShape{}), size<2>(ClusterShape{})); - cutlass::ClusterLaunchParams launch_params{grid_dims, block_dims, cluster_dims, smem_size, stream}; - cutlass::launch_kernel_on_cluster( - launch_params, kernel, mainloop_params, epilogue_params, - scheduler_params, seqlen_traits_q, seqlen_traits_k); + if constexpr(size(ClusterShape{}) > 1) { + dim3 cluster_dims(size<0>(ClusterShape{}), size<1>(ClusterShape{}), size<2>(ClusterShape{})); + cutlass::ClusterLaunchParams launch_params{grid_dims, block_dims, cluster_dims, smem_size, stream}; + cutlass::launch_kernel_on_cluster( + launch_params, kernel, mainloop_params, epilogue_params, + scheduler_params, seqlen_traits_q, seqlen_traits_k); + } else { + if constexpr(cutlass::sizeof_bits_v == 8) { + flash::compute_attn_ws_fp8 + <<>> + (mainloop_params, epilogue_params, scheduler_params, seqlen_traits_q, seqlen_traits_k); + } else { + flash::compute_attn_ws + <<>> + (mainloop_params, epilogue_params, scheduler_params, seqlen_traits_q, seqlen_traits_k); + } + + } CHECK_CUDA_KERNEL_LAUNCH(); + + if constexpr (Is_split) { + using FinalOutputType = typename Kernel_traits::FinalOutputType; + static_assert(is_same_v, "Assume OutputType of main kernel is float."); + static_assert(is_same_v, "ElementAccum must be float."); + // We want kBlockM to be as small as possible for more parallelism. + // With 128 threads we can load 512 elements at a time, so if headdim is divisible by 128, kBlockM = 4. + // If headdim is divisible by 64, then we set kBlockM = 8, etc. + constexpr static int kHeadDim = Kernel_traits::kHeadDim; + constexpr static int kBlockM = kHeadDim % 128 == 0 ? 4 : (kHeadDim % 64 == 0 ? 
8 : 16); + constexpr static bool Is_even_K = true; // always true for our current setting + void *kernel_combine; + int smem_size_combine; + NUM_SPLITS_SWITCH(params.num_splits, kLogMaxSplits, [&] { + constexpr static int kMaxSplits = 1 << kLogMaxSplits; + kernel_combine = (void *) flash::combine_attn_seqk_parallel< + FinalOutputType, ElementAccum, kHeadDim, kBlockM, kLogMaxSplits, Is_even_K, Flash_fwd_params>; + smem_size_combine = sizeof( + flash::SharedStorageLSE, Int>, Shape>>); + }); + if (smem_size_combine >= 48 * 1024) { + CHECK_CUDA(cudaFuncSetAttribute(kernel_combine, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size_combine)); + } + dim3 grid_combine((params.b * params.h * params.seqlen_q + kBlockM - 1) / kBlockM); + dim3 block_dims_combine(128); + dim3 cluster_dims_combine(1, 1, 1); + cutlass::ClusterLaunchParams launch_params_combine{ + grid_combine, block_dims_combine, cluster_dims_combine, smem_size_combine, stream}; + cutlass::launch_kernel_on_cluster(launch_params_combine, kernel_combine, params); + CHECK_CUDA_KERNEL_LAUNCH(); + } } template void run_mha_fwd_hdim64(Flash_fwd_params ¶ms, cudaStream_t stream) { constexpr static int Headdim = 64; + constexpr static bool UseCluster = false; + BOOL_SWITCH(params.is_causal, Is_causal, [&] { - BOOL_SWITCH(params.is_local, Is_local, [&] { - SEQLEN_SWITCH(params.cu_seqlens_q, Seqlen_traits, [&] { + BOOL_SWITCH(params.is_local, Is_local, [&] { + MMA_3WG_SWITCH(params.seqlen_q, kNumMmaWGs, [&] { + SEQLEN_SWITCH(params.cu_seqlens_q, Seqlen_traits, [&] { + BOOL_SWITCH(params.num_splits > 1, Is_split, [&] { + // BOOL_SWITCH(cutlass::ceil_div(params.seqlen_q, 192) % 2 == 0 && !Is_causal && !Is_local && !Is_split + // && kNumMmaWGs == 3 && !Seqlen_traits::UseVarSeqLen, UseCluster, [&] { run_flash_fwd< - Flash_fwd_kernel_traits, - Is_causal, Is_local && !Is_causal, Seqlen_traits + Flash_fwd_kernel_traits, + Is_causal, + Is_local && !Is_causal, + Seqlen_traits >(params, stream); + // }); }); + }); }); + }); }); } template void run_mha_fwd_hdim128(Flash_fwd_params ¶ms, cudaStream_t stream) { constexpr static int Headdim = 128; - BOOL_SWITCH(params.is_causal, Is_causal, [&] { + + MMA_2WG_SWITCH(params.seqlen_q, kNumMmaWGs, [&] { + BOOL_SWITCH(params.is_causal, Is_causal, [&] { BOOL_SWITCH(params.is_local, Is_local, [&] { - SEQLEN_SWITCH(params.cu_seqlens_q, Seqlen_traits, [&] { - // Only use Cluster if number of tiles along seqlen_q is even and not Is_causal - BOOL_SWITCH(cutlass::ceil_div(params.seqlen_q, 128) % 2 == 0 && !Is_causal && !Is_local && !Seqlen_traits::kUseVarSeqLen, UseCluster, [&] { - run_flash_fwd< - Flash_fwd_kernel_traits, - Is_causal, Is_local && !Is_causal, Seqlen_traits - >(params, stream); - }); + SEQLEN_SWITCH(params.cu_seqlens_q, Seqlen_traits, [&] { + BOOL_SWITCH(params.num_splits > 1, Is_split, [&] { + // Only use Cluster if number of tiles along seqlen_q is even + // and not Is_causal, Is_split, or varseqlen + BOOL_SWITCH(cutlass::ceil_div(params.seqlen_q, 128) % 2 == 0 && !Is_causal && !Is_local && !Is_split + && kNumMmaWGs == 2 && !Seqlen_traits::UseVarSeqLen, UseCluster, [&] { + run_flash_fwd< + Flash_fwd_kernel_traits, + Is_causal, + Is_local && !Is_causal, + Seqlen_traits + >(params, stream); + }); }); + }); }); + }); }); } + + template void run_mha_fwd_hdim256(Flash_fwd_params ¶ms, cudaStream_t stream) { constexpr static int Headdim = 256; - BOOL_SWITCH(params.is_causal, Is_causal, [&] { + + MMA_2WG_SWITCH(params.seqlen_q, kNumMmaWGs, [&] { + BOOL_SWITCH(params.is_causal, Is_causal, [&] { 
BOOL_SWITCH(params.is_local, Is_local, [&] { - SEQLEN_SWITCH(params.cu_seqlens_q, Seqlen_traits, [&] { - // Only use Cluster if number of tiles along seqlen_q is even - BOOL_SWITCH(cutlass::ceil_div(params.seqlen_q, 128) % 2 == 0 && !Is_causal && !Is_local && !Seqlen_traits::kUseVarSeqLen, UseCluster, [&] { - run_flash_fwd< - Flash_fwd_kernel_traits, - Is_causal, Is_local && !Is_causal, Seqlen_traits - >(params, stream); - }); + SEQLEN_SWITCH(params.cu_seqlens_q, Seqlen_traits, [&] { + BOOL_SWITCH(params.num_splits > 1, Is_split, [&] { + // Only use Cluster if number of tiles along seqlen_q is even + // and not Is_causal, Is_split, or varseqlen + BOOL_SWITCH(cutlass::ceil_div(params.seqlen_q, 128) % 2 == 0 && !Is_causal && !Is_local && !Is_split + && kNumMmaWGs == 2 && !Seqlen_traits::UseVarSeqLen, UseCluster, [&] { + run_flash_fwd< + Flash_fwd_kernel_traits, + Is_causal, + Is_local && !Is_causal, + Seqlen_traits + >(params, stream); + }); }); + }); }); + }); }); } template void run_mha_fwd_hdim64_fp8(Flash_fwd_params ¶ms, cudaStream_t stream) { constexpr static int Headdim = 64; - constexpr static int kBlockM = 192; constexpr static int kBlockN = 128; - constexpr static int kNWarps = 4 + kBlockM/16; - constexpr static int kStages = 4; + constexpr static int kStages = 4; + // constexpr static bool UseCluster = false; + // constexpr static int kBlockM = 192; + // constexpr static int kNWarps = 4 + kBlockM/16; using Seqlen_traits = flash::FixedSeqLenTraits; - if(params.is_causal) { - run_flash_fwd, /*Is_causal=*/true, /*Is_local=*/false, Seqlen_traits>(params, stream); - } else { - BOOL_SWITCH(cutlass::ceil_div(params.seqlen_q, kBlockM) % 2 == 0, UseCluster, [&] { - run_flash_fwd, /*Is_causal=*/false, /*Is_local=*/false, Seqlen_traits>(params, stream); + + MMA_3WG_SWITCH(params.seqlen_q, kNumMmaWGs, [&] { + BOOL_SWITCH(params.is_causal, Is_causal, [&] { + BOOL_SWITCH(params.is_local, Is_local, [&] { + BOOL_SWITCH(params.num_splits > 1, Is_split, [&] { + BOOL_SWITCH(cutlass::ceil_div(params.seqlen_q, 192) % 2 == 0 && !Is_causal && !Is_local && !Is_split + && kNumMmaWGs == 3, UseCluster, [&] { + run_flash_fwd< + Flash_fwd_kernel_traits_fp8, + Is_causal, + Is_local && !Is_causal, + Seqlen_traits + >(params, stream); + }); + }); }); - } - // BOOL_SWITCH(params.is_causal, Is_causal, [&] { - // SEQLEN_SWITCH(params.cu_seqlens_q, Seqlen_traits, [&] { - // Only use Cluster if number of tiles along seqlen_q is even - // BOOL_SWITCH(cutlass::ceil_div(params.seqlen_q, kBlockM) % 2 == 0 && !Is_causal && - // !Seqlen_traits::kUseVarSeqLen, UseCluster, [&] { - // run_flash_fwd, Is_causal, Seqlen_traits>(params, stream); - // }); - // }); - // }); + }); + }); } template void run_mha_fwd_hdim128_fp8(Flash_fwd_params ¶ms, cudaStream_t stream) { constexpr static int Headdim = 128; - constexpr static int kBlockM = 128; constexpr static int kBlockN = 256; - constexpr static int kNWarps = 4 + kBlockM/16; constexpr static int kStages = 2; + // constexpr static int kBlockM = 128; + // constexpr static int kNWarps = 4 + kBlockM/16; using Seqlen_traits = flash::FixedSeqLenTraits; - if(params.is_causal) { - run_flash_fwd, /*Is_causal=*/true, /*Is_local=*/false, Seqlen_traits>(params, stream); - } else { - BOOL_SWITCH(cutlass::ceil_div(params.seqlen_q, kBlockM) % 2 == 0, UseCluster, [&] { - run_flash_fwd, /*Is_causal=*/false, /*Is_local=*/false, Seqlen_traits>(params, stream); + + MMA_2WG_SWITCH(params.seqlen_q, kNumMmaWGs, [&] { + BOOL_SWITCH(params.is_causal, Is_causal, [&] { + BOOL_SWITCH(params.is_local, Is_local, 
[&] { + BOOL_SWITCH(params.num_splits > 1, Is_split, [&] { + BOOL_SWITCH(cutlass::ceil_div(params.seqlen_q, 128) % 2 == 0 && !Is_causal && !Is_local && !Is_split + && kNumMmaWGs == 2, UseCluster, [&] { + run_flash_fwd< + Flash_fwd_kernel_traits_fp8, + Is_causal, + Is_local && !Is_causal, + Seqlen_traits + >(params, stream); + }); + }); }); - } - // BOOL_SWITCH(params.is_causal, Is_causal, [&] { - // SEQLEN_SWITCH(params.cu_seqlens_q, Seqlen_traits, [&] { - // Only use Cluster if number of tiles along seqlen_q is even - // BOOL_SWITCH(cutlass::ceil_div(params.seqlen_q, kBlockM) % 2 == 0 && !Is_causal && - // !Seqlen_traits::kUseVarSeqLen, UseCluster, [&] { - // run_flash_fwd, Is_causal, Seqlen_traits>(params, stream); - // }); - // }); - // }); + }); + }); } template void run_mha_fwd_hdim256_fp8(Flash_fwd_params ¶ms, cudaStream_t stream) { constexpr static int Headdim = 256; - constexpr static int kBlockM = 128; constexpr static int kBlockN = 128; - constexpr static int kNWarps = 4 + kBlockM/16; constexpr static int kStages = 2; + // constexpr static int kBlockM = 128; + // constexpr static int kNWarps = 4 + kBlockM/16; using Seqlen_traits = flash::FixedSeqLenTraits; - if(params.is_causal) { - run_flash_fwd, /*Is_causal=*/true, /*Is_local=*/false, Seqlen_traits>(params, stream); - } else { - BOOL_SWITCH(cutlass::ceil_div(params.seqlen_q, kBlockM) % 2 == 0, UseCluster, [&] { - run_flash_fwd, /*Is_causal=*/false, /*Is_local=*/false, Seqlen_traits>(params, stream); + + MMA_2WG_SWITCH(params.seqlen_q, kNumMmaWGs, [&] { + BOOL_SWITCH(params.is_causal, Is_causal, [&] { + BOOL_SWITCH(params.is_local, Is_local, [&] { + BOOL_SWITCH(params.num_splits > 1, Is_split, [&] { + BOOL_SWITCH(cutlass::ceil_div(params.seqlen_q, 128) % 2 == 0 && !Is_causal && !Is_local && !Is_split + && kNumMmaWGs == 2, UseCluster, [&] { + run_flash_fwd< + Flash_fwd_kernel_traits_fp8, + Is_causal, + Is_local && !Is_causal, + Seqlen_traits + >(params, stream); + }); + }); }); - } - // BOOL_SWITCH(params.is_causal, Is_causal, [&] { - // SEQLEN_SWITCH(params.cu_seqlens_q, Seqlen_traits, [&] { - // Only use Cluster if number of tiles along seqlen_q is even - // BOOL_SWITCH(cutlass::ceil_div(params.seqlen_q, kBlockM) % 2 == 0 && !Is_causal && - // !Seqlen_traits::kUseVarSeqLen, UseCluster, [&] { - // run_flash_fwd, Is_causal, Seqlen_traits>(params, stream); - // }); - // }); - // }); + }); + }); +} + +/* +** GQA methods +*/ + +template +void run_mha_fwd_hdim64_gqa(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int Headdim = 64; + constexpr static bool UseCluster = false; + using Seqlen_traits = flash::FixedSeqLenTraits; + using Seqlen_traits_Q = flash::FixedGQASeqLenTraits; + + MMA_3WG_SWITCH(kBlockH * params.seqlen_q, kNumMmaWGs, [&] { + BOOL_SWITCH(params.is_causal, Is_causal, [&] { + BOOL_SWITCH(params.is_local, Is_local, [&] { + BOOL_SWITCH(params.num_splits > 1, Is_split, [&] { + // BOOL_SWITCH(cutlass::ceil_div(params.seqlen_q, 192/kBlockH) % 2 == 0 && !Is_causal && !Is_local && !Is_split + // && kNumMmaWGs == 3, UseCluster, [&] { + run_flash_fwd< + Flash_fwd_kernel_traits, + Is_causal, + Is_local && !Is_causal, + Seqlen_traits, + Seqlen_traits_Q + >(params, stream); + // }); + }); + }); + }); + }); +} + +template +void run_mha_fwd_hdim128_gqa(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int Headdim = 128; + constexpr static bool UseCluster = false; + using Seqlen_traits = flash::FixedSeqLenTraits; + using Seqlen_traits_Q = flash::FixedGQASeqLenTraits; + + MMA_2WG_SWITCH(kBlockH * 
params.seqlen_q, kNumMmaWGs, [&] { + BOOL_SWITCH(params.is_causal, Is_causal, [&] { + BOOL_SWITCH(params.is_local, Is_local, [&] { + BOOL_SWITCH(params.num_splits > 1, Is_split, [&] { + // BOOL_SWITCH(cutlass::ceil_div(params.seqlen_q, 128/kBlockH) % 2 == 0 && !Is_causal && !Is_local && !Is_split + // && kNumMmaWGs == 2, UseCluster, [&] { + run_flash_fwd< + Flash_fwd_kernel_traits, + Is_causal, + Is_local && !Is_causal, + Seqlen_traits, + Seqlen_traits_Q + >(params, stream); + // }); + }); + }); + }); + }); +} + +template +void run_mha_fwd_hdim256_gqa(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int Headdim = 256; + constexpr static bool UseCluster = false; + using Seqlen_traits = flash::FixedSeqLenTraits; + using Seqlen_traits_Q = flash::FixedGQASeqLenTraits; + + MMA_2WG_SWITCH(kBlockH * params.seqlen_q, kNumMmaWGs, [&] { + BOOL_SWITCH(params.is_causal, Is_causal, [&] { + BOOL_SWITCH(params.is_local, Is_local, [&] { + BOOL_SWITCH(params.num_splits > 1, Is_split, [&] { + // BOOL_SWITCH(cutlass::ceil_div(params.seqlen_q, 128/kBlockH) % 2 == 0 && !Is_causal && !Is_local && !Is_split + // && kNumMmaWGs == 2, UseCluster, [&] { + run_flash_fwd< + Flash_fwd_kernel_traits, + Is_causal, + Is_local && !Is_causal, + Seqlen_traits, + Seqlen_traits_Q + >(params, stream); + // }); + }); + }); + }); + }); +} + +template +void run_mha_fwd_hdim64_fp8_gqa(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int Headdim = 64; + constexpr static int kBlockN = 128; + constexpr static int kStages = 4; + constexpr static bool UseCluster = false; + using Seqlen_traits = flash::FixedSeqLenTraits; + using Seqlen_traits_Q = flash::FixedGQASeqLenTraits; + + MMA_3WG_SWITCH(kBlockH * params.seqlen_q, kNumMmaWGs, [&] { + BOOL_SWITCH(params.is_causal, Is_causal, [&] { + BOOL_SWITCH(params.is_local, Is_local, [&] { + BOOL_SWITCH(params.num_splits > 1, Is_split, [&] { + // BOOL_SWITCH(cutlass::ceil_div(params.seqlen_q, 192/kBlockH) % 2 == 0 && !Is_causal && !Is_local && !Is_split + // && kNumMmaWGs == 3, UseCluster, [&] { + run_flash_fwd< + Flash_fwd_kernel_traits_fp8, + Is_causal, + Is_local && !Is_causal, + Seqlen_traits, + Seqlen_traits_Q + >(params, stream); + // }); + }); + }); + }); + }); +} + +template +void run_mha_fwd_hdim128_fp8_gqa(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int Headdim = 128; + constexpr static int kBlockN = 256; + constexpr static int kStages = 2; + constexpr static bool UseCluster = false; + using Seqlen_traits = flash::FixedSeqLenTraits; + using Seqlen_traits_Q = flash::FixedGQASeqLenTraits; + + MMA_2WG_SWITCH(kBlockH * params.seqlen_q, kNumMmaWGs, [&] { + BOOL_SWITCH(params.is_causal, Is_causal, [&] { + BOOL_SWITCH(params.is_local, Is_local, [&] { + BOOL_SWITCH(params.num_splits > 1, Is_split, [&] { + // BOOL_SWITCH(cutlass::ceil_div(params.seqlen_q, 128/kBlockH) % 2 == 0 && !Is_causal && !Is_local && !Is_split + // && kNumMmaWGs == 2, UseCluster, [&] { + run_flash_fwd< + Flash_fwd_kernel_traits_fp8, + Is_causal, + Is_local && !Is_causal, + Seqlen_traits, + Seqlen_traits_Q + >(params, stream); + // }); + }); + }); + }); + }); +} + +template +void run_mha_fwd_hdim256_fp8_gqa(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int Headdim = 256; + constexpr static int kBlockN = 128; + constexpr static int kStages = 2; + constexpr static bool UseCluster = false; + using Seqlen_traits = flash::FixedSeqLenTraits; + using Seqlen_traits_Q = flash::FixedGQASeqLenTraits; + + MMA_2WG_SWITCH(kBlockH * params.seqlen_q, kNumMmaWGs, [&] { + 
BOOL_SWITCH(params.is_causal, Is_causal, [&] { + BOOL_SWITCH(params.is_local, Is_local, [&] { + BOOL_SWITCH(params.num_splits > 1, Is_split, [&] { + // BOOL_SWITCH(cutlass::ceil_div(params.seqlen_q, 128/kBlockH) % 2 == 0 && !Is_causal && !Is_local && !Is_split + // && kNumMmaWGs == 2, UseCluster, [&] { + run_flash_fwd< + Flash_fwd_kernel_traits_fp8, + Is_causal, + Is_local && !Is_causal, + Seqlen_traits, + Seqlen_traits_Q + >(params, stream); + // }); + }); + }); + }); + }); } diff --git a/hopper/kernel_traits.h b/hopper/kernel_traits.h index 684833ab8..e9210731d 100644 --- a/hopper/kernel_traits.h +++ b/hopper/kernel_traits.h @@ -33,6 +33,42 @@ struct SharedStorageQKVO { }; }; +// Use if Oaccum is too large for SharedStorageQKVO +template +struct SharedStorageQKVOaccum { + cute::array_aligned> smem_q; + union { + struct { + cute::array_aligned> smem_k; + cute::array_aligned> smem_v; + }; + cute::array_aligned> smem_o; + }; + struct { + cutlass::arch::ClusterTransactionBarrier barrier_Q; + cutlass::arch::ClusterBarrier barrier_O; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_k; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_v; + int tile_count_semaphore; + }; +}; + +// SharedStorage struct with no smem for O +template +struct SharedStorageQKV { + cute::array_aligned> smem_q; + cute::array_aligned> smem_k; + cute::array_aligned> smem_v; + struct { + cutlass::arch::ClusterTransactionBarrier barrier_Q; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_k; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_v; + int tile_count_semaphore; + }; +}; + template struct SharedStorageQKVOVt { @@ -54,16 +90,67 @@ struct SharedStorageQKVOVt { int tile_count_semaphore; float softmax_scale_qk_log2; float descale_v; + bool seqlen_init_k; + }; +}; + +// Use if Oaccum is too large for SharedStorageQKVOVt +template +struct SharedStorageQKVOVtaccum { + struct { + cute::array_aligned> smem_q; + cute::array_aligned> smem_k; + union { + struct { + cute::array_aligned> smem_v; + cute::array_aligned> smem_v_out; + }; + cute::array_aligned> smem_o; + }; + }; + struct { + cutlass::arch::ClusterTransactionBarrier barrier_Q; + cutlass::arch::ClusterBarrier barrier_O; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_k; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_v; + typename cutlass::PipelineAsync::SharedStorage pipeline_vt; + int tile_count_semaphore; + float softmax_scale_qk_log2; + float descale_v; + bool seqlen_init_k; + }; +}; + +template +struct SharedStorageQKVVt { + struct { + cute::array_aligned> smem_q; + cute::array_aligned> smem_k; + cute::array_aligned> smem_v; + cute::array_aligned> smem_v_out; + }; + struct { + cutlass::arch::ClusterTransactionBarrier barrier_Q; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_k; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_v; + typename cutlass::PipelineAsync::SharedStorage pipeline_vt; + int tile_count_semaphore; + float softmax_scale_qk_log2; + float descale_v; + bool seqlen_init_k; }; }; // If Share_Q_K_smem is true, that forces Is_Q_in_regs to be true template + int kClusterM_ = 1, typename elem_type=cutlass::half_t, bool Is_split_=false, int kBlockH_ = 1> struct Flash_fwd_kernel_traits { using Element = elem_type; using ElementAccum = float; - using OutputType = elem_type; + using FinalOutputType = elem_type; + using OutputType = std::conditional_t; using index_t = int64_t; // The number of threads. 
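// Editor's note: when Is_split is enabled, the per-split output is accumulated in float (Oaccum),
// which is too large to deserve its own shared-memory buffer on top of Q/K/V. The new
// SharedStorageQKVOaccum struct above therefore overlays the O accumulator on the K/V staging
// buffers with a union (and SharedStorageQKV drops smem for O entirely when No_smem_O is set),
// mirroring the existing smem_v/smem_o aliasing in SharedStorageQKVO. Standalone sketch of the
// overlay with plain arrays instead of cute::array_aligned; the tile extents are placeholders.
#include <cstdint>

template <int kBlockM, int kBlockN, int kHeadDim, int kStages>
struct SharedStorageSketch {
    struct KV {   // K/V staging buffers, multi-buffered across kStages
        alignas(128) std::uint16_t smem_k[kStages * kBlockN * kHeadDim];
        alignas(128) std::uint16_t smem_v[kStages * kBlockN * kHeadDim];
    };
    alignas(128) std::uint16_t smem_q[kBlockM * kHeadDim];    // 16-bit Q tile
    union {   // footprint is max(K + V staging, Oaccum), not their sum
        KV kv;
        alignas(128) float smem_o_accum[kBlockM * kHeadDim];  // float O accumulator for split-KV,
                                                              // reused after K/V staging is drained
    };
};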
@@ -72,14 +159,16 @@ struct Flash_fwd_kernel_traits { static constexpr int NumProducerThreads = cutlass::NumThreadsPerWarp; static constexpr bool Is_Q_in_regs = Is_Q_in_regs_; - static_assert(kNWarps_ == 4 || kNWarps_ == 8 || kNWarps_ == 12 || kNWarps_ == 16); - static constexpr bool Is_WS = kNWarps_ >= 12; + static_assert(kNWarps_ == 8 || kNWarps_ == 12 || kNWarps_ == 16); + static constexpr bool Is_WS = true; static_assert(!(Is_WS && Is_Q_in_regs), "Warp-specialization does not support Q in registers"); static constexpr int kBlockM = kBlockM_; static constexpr int kBlockN = kBlockN_; + static constexpr int kBlockH = kBlockH_; static constexpr int kHeadDim = kHeadDim_; static_assert(kHeadDim % 32 == 0); + static_assert(kBlockM % kBlockH == 0); using TileShape_MNK = Shape, Int, Int>; static constexpr int kClusterM = kClusterM_; @@ -87,6 +176,9 @@ struct Flash_fwd_kernel_traits { static constexpr int kStages = kStages_; + static constexpr bool Is_split = Is_split_; + static constexpr bool No_smem_O = Is_split; + using AtomLayoutMNK = Layout, _1, _1>>; using TiledMma0 = decltype(cute::make_tiled_mma( std::conditional_t< @@ -104,6 +196,14 @@ struct Flash_fwd_kernel_traits { decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); using SmemLayoutQ = decltype(tile_to_shape(SmemLayoutAtomQ{}, select<0, 2>(TileShape_MNK{}))); + // for gmem -> smem Q copy + using FactoringLayoutQ = Layout, Int, Int>, + Stride, _1, Int>>; + using TileShapeQCopy = std::conditional_t<(kBlockH > 1), + decltype(shape(FactoringLayoutQ{})), decltype(select<0, 2>(TileShape_MNK{}))>; + using SmemLayoutQCopy = std::conditional_t<(kBlockH > 1), + decltype(composition(SmemLayoutQ{}, FactoringLayoutQ{})), SmemLayoutQ>; + using SmemLayoutAtomK = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); using SmemLayoutK = @@ -122,15 +222,20 @@ struct Flash_fwd_kernel_traits { make_ordered_layout( make_shape(get<2>(TileShape_MNK{}), get<1>(TileShape_MNK{}), Int{}), Step<_2, _1, _3>{}))); - + using SmemLayoutAtomO = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); using SmemLayoutO = decltype(tile_to_shape(SmemLayoutAtomO{}, select<0, 2>(TileShape_MNK{}))); + // for smem -> gmem O copy + using TileShapeOCopy = TileShapeQCopy; + using SmemLayoutOCopy = std::conditional_t<(kBlockH > 1), + decltype(composition(SmemLayoutO{}, FactoringLayoutQ{})), SmemLayoutO>; using SmemCopyAtomQ = Copy_Atom; - using SharedStorage = SharedStorageQKVO; + using SharedStorage = std::conditional_t, + SharedStorageQKV>; using MainloopPipeline = typename cutlass::PipelineTmaAsync; using MainloopPipelineNoTMA = typename cutlass::PipelineAsync; @@ -141,13 +246,19 @@ struct Flash_fwd_kernel_traits { // Traits struct for fp8 kernel with in-kernel transpose template + int kClusterM_ = 1, typename elem_type=cutlass::float_e4m3_t, bool Is_split_ = false, int kBlockH_ = 1> struct Flash_fwd_kernel_traits_fp8 { using Element = elem_type; static_assert(cutlass::sizeof_bits_v == 8); using ElementAccum = float; - using OutputType = cutlass::half_t; - using index_t = int64_t; + using FinalOutputType = cutlass::bfloat16_t; + using OutputType = std::conditional_t; + using index_t = int64_t; + + static constexpr bool Is_split = Is_split_; + static constexpr bool No_smem_O = false; + // NOTE: not using smem for epilogue degrades perf substantially. 
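// Editor's note: kBlockH is the number of query heads, all sharing one KV head, packed into the
// M dimension of a tile; FactoringLayoutQ / SmemLayoutQCopy above appear to express exactly that
// view of a (kBlockM, kHeadDim) tile as (kBlockM/kBlockH, kBlockH, kHeadDim), matching the
// "(M/H, H, K)" comment in the mainloop. The payoff is that one K/V tile load serves kBlockH
// query heads. Grid sizing, restated from run_flash_fwd earlier in this diff (clusterM stands in
// for size<0>(ClusterShape{}); the concrete numbers are an illustrative decode configuration).
#include <cstdio>

constexpr int ceil_div(int a, int b) { return (a + b - 1) / b; }

int main() {
    const int seqlen_q = 1, h = 64, h_k = 8;          // e.g. one decode step, 64 Q heads, 8 KV heads
    const int kBlockM = 192, kBlockH = 8, clusterM = 1;
    const int h_h_k_ratio = h / h_k;                   // query heads per KV head

    int num_blocks_m = ceil_div(seqlen_q, kBlockM / kBlockH);
    num_blocks_m = ceil_div(num_blocks_m, clusterM) * clusterM;
    const int num_blocks_h = h_k * ceil_div(h_h_k_ratio, kBlockH);

    // 1 m-block x 8 head-blocks here, instead of 1 x 64 without GQA packing.
    std::printf("m-blocks=%d head-blocks=%d\n", num_blocks_m, num_blocks_h);
    return 0;
}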
+ // static constexpr bool No_smem_O = Is_split; // The number of threads. static constexpr int kNWarps = kNWarps_; @@ -155,14 +266,16 @@ struct Flash_fwd_kernel_traits_fp8 { static constexpr int NumProducerThreads = cutlass::NumThreadsPerWarpGroup; static constexpr bool Is_Q_in_regs = Is_Q_in_regs_; - static_assert(kNWarps_ == 12 || kNWarps_ == 16); + static_assert(kNWarps_ == 8 || kNWarps_ == 12 || kNWarps_ == 16); static constexpr bool Is_WS = true; static_assert(!Is_Q_in_regs, "Warp-specialization does not support Q in registers"); static constexpr int kBlockM = kBlockM_; static constexpr int kBlockN = kBlockN_; + static constexpr int kBlockH = kBlockH_; static constexpr int kHeadDim = kHeadDim_; static_assert(kHeadDim % 32 == 0); + static_assert(kBlockM % kBlockH == 0); using TileShape_MNK = Shape, Int, Int>; static constexpr int kClusterM = kClusterM_; @@ -171,6 +284,9 @@ struct Flash_fwd_kernel_traits_fp8 { static constexpr int kStages = kStages_; static_assert(kStages > 1); + // Use this to save enough smem when writing out in float precision. + static constexpr bool VO_union_all = Is_split && (kBlockM != 64) && (kHeadDim == 256); + using AtomLayoutMNK = Layout, _1, _1>>; using TiledMma0 = decltype(cute::make_tiled_mma( cute::GMMA::ss_op_selector(), @@ -184,6 +300,14 @@ struct Flash_fwd_kernel_traits_fp8 { decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); using SmemLayoutQ = decltype(tile_to_shape(SmemLayoutAtomQ{}, select<0, 2>(TileShape_MNK{}))); + // for gmem -> smem Q copy + using FactoringLayoutQ = Layout, Int, Int>, + Stride, _1, Int>>; + using TileShapeQCopy = std::conditional_t<(kBlockH > 1), + decltype(shape(FactoringLayoutQ{})), decltype(select<0, 2>(TileShape_MNK{}))>; + using SmemLayoutQCopy = std::conditional_t<(kBlockH > 1), + decltype(composition(SmemLayoutQ{}, FactoringLayoutQ{})), SmemLayoutQ>; + using SmemLayoutAtomK = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); using SmemLayoutK = @@ -226,6 +350,10 @@ struct Flash_fwd_kernel_traits_fp8 { using SmemLayoutAtomO = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); using SmemLayoutO = decltype(tile_to_shape(SmemLayoutAtomO{}, select<0, 2>(TileShape_MNK{}))); + // for smem -> gmem O copy + using TileShapeOCopy = TileShapeQCopy; + using SmemLayoutOCopy = std::conditional_t<(kBlockH > 1), + decltype(composition(SmemLayoutO{}, FactoringLayoutQ{})), SmemLayoutO>; // used for rmem -> smem O copy in fp8 kernel to undo column permutation using ThreadLayoutrO = Layout, _4, _1>, @@ -240,8 +368,11 @@ struct Flash_fwd_kernel_traits_fp8 { using SmemCopyAtomQ = Copy_Atom; - using SharedStorage = SharedStorageQKVOVt; + using SharedStorage = std::conditional_t, + SharedStorageQKVOVtaccum>, + SharedStorageQKVVt>; using MainloopPipeline = typename cutlass::PipelineTmaAsync; using MainloopPipelineNoTMA = typename cutlass::PipelineAsync; diff --git a/hopper/mainloop_fwd_sm90_tma_gmma_ws.hpp b/hopper/mainloop_fwd_sm90_tma_gmma_ws.hpp index 111421580..cf8a8fa7c 100644 --- a/hopper/mainloop_fwd_sm90_tma_gmma_ws.hpp +++ b/hopper/mainloop_fwd_sm90_tma_gmma_ws.hpp @@ -79,7 +79,7 @@ struct SmemTransposeFp8_64x64 { } }; -template +template struct CollectiveMainloopFwd { using Element = typename Ktraits::Element; @@ -87,12 +87,19 @@ struct CollectiveMainloopFwd { using ClusterShape = typename Ktraits::ClusterShape_MNK; static constexpr int kStages = 
Ktraits::kStages; - static constexpr int kHeadDim = Ktraits::kHeadDim; + static constexpr int kHeadDim = Ktraits::kHeadDim; + // static constexpr int kBlockM = Ktraits::kBlockM; + // static constexpr int kBlockN = Ktraits::kBlockN; + // static constexpr int kBlockH = Ktraits::kBlockH; + static constexpr bool Is_split = Ktraits::Is_split; + static constexpr bool No_smem_O = Ktraits::No_smem_O; using GmemTiledCopyQ = cute::SM90_TMA_LOAD; using GmemTiledCopyKV = decltype(cutlass::gemm::collective::detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape{}))); using SmemLayoutQ = typename Ktraits::SmemLayoutQ; + using SmemLayoutQCopy = typename Ktraits::SmemLayoutQCopy; + using TileShapeQCopy = typename Ktraits::TileShapeQCopy; using SmemLayoutK = typename Ktraits::SmemLayoutK; using SmemLayoutV = typename Ktraits::SmemLayoutV; using SmemLayoutVt = typename Ktraits::SmemLayoutVt; @@ -101,11 +108,11 @@ struct CollectiveMainloopFwd { GmemTiledCopyQ{}, make_tensor( make_gmem_ptr(static_cast(nullptr)), - repeat_like(typename Seqlen_traits::StrideT{}, int32_t(0)), - typename Seqlen_traits::StrideT{} + repeat_like(typename Seqlen_traits_Q::StrideT{}, int32_t(0)), + typename Seqlen_traits_Q::StrideT{} ), - SmemLayoutQ{}, - select<0, 2>(TileShape_MNK{}), + SmemLayoutQCopy{}, + TileShapeQCopy{}, _1{})); // no mcast for Q using TMA_K = decltype(make_tma_copy( @@ -142,14 +149,13 @@ struct CollectiveMainloopFwd { static constexpr uint32_t TmaTransactionBytesK = static_cast(size(take<0, 2>(SmemLayoutK{})) * cutlass::sizeof_bits_v / 8); // static constexpr bool UseSchedulerBarrier = kHeadDim <= 128; - static constexpr bool UseSchedulerBarrier = - cutlass::sizeof_bits_v == 8 ? kHeadDim >= 128 - : kHeadDim <= 128; + static constexpr bool UseSchedulerBarrier = Ktraits::kNWarps >= 12 && + (cutlass::sizeof_bits_v == 8 ? 
kHeadDim >= 128 : kHeadDim <= 128); // Host side kernel arguments struct Arguments { Element const* ptr_Q; - typename Seqlen_traits::LayoutT layout_Q; + typename Seqlen_traits_Q::LayoutT layout_Q; Element const* ptr_K; typename Seqlen_traits::LayoutT layout_K; Element const* ptr_V; @@ -160,11 +166,14 @@ struct CollectiveMainloopFwd { float const* descale_v_ptr; int window_size_left; int window_size_right; + int const qhead_per_khead; + int const* cache_batch_idx; + int const num_splits; }; // Device side kernel params struct Params { - typename Seqlen_traits::LayoutT layout_Q; + typename Seqlen_traits_Q::LayoutT layout_Q; typename Seqlen_traits::LayoutT layout_K; typename Seqlen_traits::LayoutT layout_V; cutlass::FastDivmod qhead_per_khead_divmod; @@ -177,6 +186,8 @@ struct CollectiveMainloopFwd { float const* descale_v_ptr; int window_size_left; int window_size_right; + int const* cache_batch_idx; + cutlass::FastDivmod num_splits_divmod; }; @@ -186,8 +197,8 @@ struct CollectiveMainloopFwd { TMA_Q tma_load_Q = make_tma_copy( GmemTiledCopyQ{}, mQ, - SmemLayoutQ{}, - select<0, 2>(TileShape_MNK{}), + SmemLayoutQCopy{}, + TileShapeQCopy{}, _1{}); // no mcast for Q Tensor mK = make_tensor(make_gmem_ptr(args.ptr_K), args.layout_K); TMA_K tma_load_K = make_tma_copy( @@ -204,11 +215,13 @@ struct CollectiveMainloopFwd { select<1, 2>(TileShape_MNK{}), size<0>(ClusterShape{})); // mcast along M mode for this N load, if any return {args.layout_Q, args.layout_K, args.layout_V, - cutlass::FastDivmod(cute::ceil_div(get<2>(args.layout_Q.shape()), get<2>(args.layout_K.shape()))), + cutlass::FastDivmod(args.qhead_per_khead), tma_load_Q, tma_load_K, tma_load_V, args.softmax_scale_log2, args.descale_q_ptr, args.descale_k_ptr, args.descale_v_ptr, - args.window_size_left, args.window_size_right}; + args.window_size_left, args.window_size_right, + args.cache_batch_idx, + cutlass::FastDivmod(args.num_splits)}; } /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance @@ -220,41 +233,60 @@ struct CollectiveMainloopFwd { } CUTLASS_DEVICE - int get_n_block_max( - Params const& mainloop_params, int m_block, - const Seqlen_traits& seqlen_traits_q, - const Seqlen_traits& seqlen_traits_k + void get_n_block_min_max( + Params const& mainloop_params, + int m_block, + int n_split_idx, + const Seqlen_traits_Q& seqlen_traits_q, + const Seqlen_traits& seqlen_traits_k, + int& n_block_min, + int& n_block_max ) { - static constexpr int kBlockM = get<0>(TileShape_MNK{}); - static constexpr int kBlockN = get<1>(TileShape_MNK{}); - int const seqlen_q = Seqlen_traits::kUseVarSeqLen ? seqlen_traits_q.actual_seq_len : shape<0>(mainloop_params.layout_Q); - int const seqlen_k = Seqlen_traits::kUseVarSeqLen ? 
seqlen_traits_k.actual_seq_len : shape<0>(mainloop_params.layout_K); - int n_block_max = cute::ceil_div(seqlen_k, kBlockN); - if constexpr (Is_causal || Is_local) { + // static constexpr int kBlockM = get<0>(TileShape_MNK{}); + static constexpr int kBlockN = get<1>(TileShape_MNK{}); + static constexpr int kBlockM_div_H = get<0>(TileShape_MNK{})/Ktraits::kBlockH; + int const seqlen_q = seqlen_traits_q.actual_seq_len; + int const seqlen_k = seqlen_traits_k.actual_seq_len; + n_block_max = cute::ceil_div(seqlen_k, kBlockN); + + if constexpr(Is_split) { + int const n_blocks_per_split + = mainloop_params.num_splits_divmod.divide(n_block_max + int(mainloop_params.num_splits_divmod) - 1); + n_block_min = n_split_idx * n_blocks_per_split; + n_block_max = std::min(n_block_max, (n_split_idx + 1) * n_blocks_per_split); + } + + if constexpr (Is_causal) { n_block_max = std::min( n_block_max, - cute::ceil_div((m_block + 1) * kBlockM + seqlen_k - seqlen_q + mainloop_params.window_size_right, kBlockN)); + cute::ceil_div((m_block + 1) * kBlockM_div_H + seqlen_k - seqlen_q, kBlockN)); + } else if constexpr (Is_local) { + n_block_max = std::min( + n_block_max, + cute::ceil_div((m_block + 1) * kBlockM_div_H + seqlen_k - seqlen_q + mainloop_params.window_size_right, kBlockN)); + n_block_min = std::max( + n_block_min, + (m_block * kBlockM_div_H + seqlen_k - seqlen_q - mainloop_params.window_size_left) / kBlockN); } - return n_block_max; } CUTLASS_DEVICE - int get_n_block_min( - Params const& mainloop_params, int m_block, - const Seqlen_traits& seqlen_traits_q, - const Seqlen_traits& seqlen_traits_k + void get_n_block_max( + Params const& mainloop_params, + int m_block, + const Seqlen_traits_Q& seqlen_traits_q, + const Seqlen_traits& seqlen_traits_k, + int& n_block_max ) { - static constexpr int kBlockM = get<0>(TileShape_MNK{}); - static constexpr int kBlockN = get<1>(TileShape_MNK{}); - int const seqlen_q = Seqlen_traits::kUseVarSeqLen ? seqlen_traits_q.actual_seq_len : shape<0>(mainloop_params.layout_Q); - int const seqlen_k = Seqlen_traits::kUseVarSeqLen ? 
seqlen_traits_k.actual_seq_len : shape<0>(mainloop_params.layout_K); - if constexpr (!Is_local) { - return 0; - } else { - return std::max( - 0, - (m_block * kBlockM + seqlen_k - seqlen_q - mainloop_params.window_size_left) / kBlockN - ); + // static constexpr int kBlockM = get<0>(TileShape_MNK{}); + static constexpr int kBlockN = get<1>(TileShape_MNK{}); + static constexpr int kBlockM_div_H = get<0>(TileShape_MNK{})/Ktraits::kBlockH; + int const seqlen_q = seqlen_traits_q.actual_seq_len; + int const seqlen_k = seqlen_traits_k.actual_seq_len; + n_block_max = cute::ceil_div(seqlen_k, kBlockN); + if constexpr (Is_causal) { + n_block_max = std::min(n_block_max, + cute::ceil_div((m_block + 1) * kBlockM_div_H + seqlen_k - seqlen_q, kBlockN)); } } @@ -269,13 +301,15 @@ struct CollectiveMainloopFwd { Scheduler& scheduler, typename Scheduler::Params const& scheduler_params, typename Scheduler::WorkTileInfo& work_tile_info, - cute::tuple block_coord, + cute::tuple block_coord, int work_idx, - const Seqlen_traits& seqlen_traits_q, - const Seqlen_traits& seqlen_traits_k + const Seqlen_traits_Q& seqlen_traits_q, + const Seqlen_traits& seqlen_traits_k, + int n_block_min, + int n_block_max ) { - Tensor sQ = make_tensor(make_smem_ptr(shared_storage.smem_q.data()), SmemLayoutQ{}); + Tensor sQ = make_tensor(make_smem_ptr(shared_storage.smem_q.data()), SmemLayoutQCopy{}); Tensor sK = make_tensor(make_smem_ptr(shared_storage.smem_k.data()), SmemLayoutK{}); Tensor sV = make_tensor(make_smem_ptr(shared_storage.smem_v.data()), SmemLayoutV{}); @@ -283,19 +317,30 @@ struct CollectiveMainloopFwd { Tensor mK = mainloop_params.tma_load_K.get_tma_tensor(mainloop_params.layout_K.shape()); Tensor mV = mainloop_params.tma_load_V.get_tma_tensor(mainloop_params.layout_V.shape()); - auto [m_block, bidh, bidb] = block_coord; - int bidh_kv = mainloop_params.qhead_per_khead_divmod.divide(bidh); + auto [m_block, n_split_idx, bidh, bidb] = block_coord; + const int bidb_cache = mainloop_params.cache_batch_idx == nullptr ? 
bidb : mainloop_params.cache_batch_idx[bidb]; + const int bidh_kv = mainloop_params.qhead_per_khead_divmod.divide(bidh); // Prepare the TMA loads uint32_t block_rank_in_cluster = cute::block_rank_in_cluster(); constexpr uint32_t cluster_shape_x = get<0>(ClusterShape()); uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x}; - Tensor gQ = seqlen_traits_q.get_local_tile_tensor( - mQ, select<0, 2>(TileShape_MNK{}), bidh, bidb)(_, _, m_block); // (M, K) + Tensor gQ = [&] { + // Need this inside lambda to capture structured binding + auto [m_block, n_split_idx, bidh, bidb] = block_coord; + if constexpr(Seqlen_traits_Q::UseGQAPacking) { + return seqlen_traits_q.get_local_tile_tensor( + mQ, TileShapeQCopy{}, bidh_kv, bidb) + (_, _, _, m_block, bidh % int(mainloop_params.qhead_per_khead_divmod)); // (M/H, H, K) + } else { + return seqlen_traits_q.get_local_tile_tensor( + mQ, TileShapeQCopy{}, bidh, bidb)(_, _, m_block); // (M, K) + } + }(); Tensor gK = seqlen_traits_k.get_local_tile_tensor( - mK, select<1, 2>(TileShape_MNK{}), bidh_kv, bidb); // (N, K, _) + mK, select<1, 2>(TileShape_MNK{}), bidh_kv, bidb_cache); // (N, K, _) Tensor gV = seqlen_traits_k.get_local_tile_tensor( - mV, select<1, 2>(TileShape_MNK{}), bidh_kv, bidb); // (N, K, _) + mV, select<1, 2>(TileShape_MNK{}), bidh_kv, bidb_cache); // (N, K, _) Tensor sQ_x = make_tensor(sQ.data(), make_layout(sQ.layout(), Layout<_1>{})); Tensor gQ_x = make_tensor(gQ.data(), make_layout(gQ.layout(), Layout<_1>{})); @@ -314,8 +359,6 @@ struct CollectiveMainloopFwd { } } - const int n_block_min = get_n_block_min(mainloop_params, m_block, seqlen_traits_q, seqlen_traits_k); - const int n_block_max = get_n_block_max(mainloop_params, m_block, seqlen_traits_q, seqlen_traits_k); int n_block = n_block_max - 1; int lane_predicate = cute::elect_one_sync(); @@ -337,8 +380,7 @@ struct CollectiveMainloopFwd { // Wait for warp 1 to signal that smem_v are ready and V can be copied from gmem // Need ClusterBarrier, not just NamedBarrier. Otherwise we might have CTA 0 finishing the // TMA store on O first, call TMA multicast load on V, before CTA 1 can finishing TMA store on O. 
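For readers skimming the patch: the new split-KV path in `get_n_block_min_max` above first carves the KV block range `[0, ceil(seqlen_k / kBlockN))` into `num_splits` contiguous slices, and only then tightens the range with the causal bound, which now uses `kBlockM / kBlockH` sequence positions per tile because of GQA packing. A minimal Python mirror of that index math, for illustration only (the function and argument names here are not part of the kernel):

```python
# Pure-Python sketch of the KV-block range computed per (m_block, n_split_idx) tile
# in get_n_block_min_max above; illustrative only, not the device code.
def ceil_div(a, b):
    return (a + b - 1) // b

def n_block_range(seqlen_q, seqlen_k, m_block, n_split_idx, num_splits,
                  kBlockM_div_H, kBlockN, causal=False):
    n_block_min = 0
    n_block_max = ceil_div(seqlen_k, kBlockN)
    # Split-KV: each of the num_splits workers owns a contiguous slice of KV blocks.
    n_blocks_per_split = ceil_div(n_block_max, num_splits)
    n_block_min = n_split_idx * n_blocks_per_split
    n_block_max = min(n_block_max, (n_split_idx + 1) * n_blocks_per_split)
    if causal:
        # With GQA packing, kBlockH query heads share each tile row, so the causal
        # bound is computed with kBlockM / kBlockH sequence positions per M block.
        n_block_max = min(
            n_block_max,
            ceil_div((m_block + 1) * kBlockM_div_H + seqlen_k - seqlen_q, kBlockN),
        )
    return n_block_min, n_block_max

# Example: seqlen_k = 4096, kBlockN = 128 -> 32 KV blocks. With num_splits = 4,
# split 0 processes blocks [0, 8) and split 3 processes blocks [24, 32).
assert n_block_range(1, 4096, 0, 3, 4, 64, 128) == (24, 32)
```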
- shared_storage.barrier_O.wait((work_idx + 1) % 2); - + if constexpr (!No_smem_O) { shared_storage.barrier_O.wait((work_idx + 1) % 2); } if (lane_predicate) { // CUTLASS_PRAGMA_NO_UNROLL #pragma unroll 2 @@ -353,6 +395,7 @@ struct CollectiveMainloopFwd { ++smem_pipe_write_v; } } + scheduler.prefetch_next_work(scheduler_params, work_tile_info); if (lane_predicate) { pipeline_v.producer_acquire(smem_pipe_write_v); @@ -361,6 +404,7 @@ struct CollectiveMainloopFwd { ++smem_pipe_write_v; } scheduler.broadcast_next_work(work_tile_info); + } template @@ -375,16 +419,18 @@ struct CollectiveMainloopFwd { Scheduler& scheduler, typename Scheduler::Params const& scheduler_params, typename Scheduler::WorkTileInfo& work_tile_info, - cute::tuple block_coord, + cute::tuple block_coord, int work_idx, - const Seqlen_traits& seqlen_traits_q, - const Seqlen_traits& seqlen_traits_k + const Seqlen_traits_Q& seqlen_traits_q, + const Seqlen_traits& seqlen_traits_k, + int n_block_min, + int n_block_max ) { using SmemLayoutTransposeV = typename Ktraits::SmemLayoutTransposeV; using SmemLayoutTransposeVt = typename Ktraits::SmemLayoutTransposeVt; - Tensor sQ = make_tensor(make_smem_ptr(shared_storage.smem_q.data()), SmemLayoutQ{}); + Tensor sQ = make_tensor(make_smem_ptr(shared_storage.smem_q.data()), SmemLayoutQCopy{}); Tensor sK = make_tensor(make_smem_ptr(shared_storage.smem_k.data()), SmemLayoutK{}); Tensor sV = make_tensor(make_smem_ptr(shared_storage.smem_v.data()), SmemLayoutV{}); @@ -408,19 +454,30 @@ struct CollectiveMainloopFwd { Tensor mK = mainloop_params.tma_load_K.get_tma_tensor(mainloop_params.layout_K.shape()); Tensor mV = mainloop_params.tma_load_V.get_tma_tensor(mainloop_params.layout_V.shape()); - auto [m_block, bidh, bidb] = block_coord; - int bidh_kv = mainloop_params.qhead_per_khead_divmod.divide(bidh); + auto [m_block, split_idx, bidh, bidb] = block_coord; + const int bidb_cache = mainloop_params.cache_batch_idx == nullptr ? 
bidb : mainloop_params.cache_batch_idx[bidb]; + const int bidh_kv = mainloop_params.qhead_per_khead_divmod.divide(bidh); // Prepare the TMA loads uint32_t block_rank_in_cluster = cute::block_rank_in_cluster(); constexpr uint32_t cluster_shape_x = get<0>(ClusterShape()); uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x}; - Tensor gQ = seqlen_traits_q.get_local_tile_tensor( - mQ, select<0, 2>(TileShape_MNK{}), bidh, bidb)(_, _, m_block); // (M, K) + Tensor gQ = [&] { + // Need this inside lambda to capture structured binding + auto [m_block, n_split_idx, bidh, bidb] = block_coord; + if constexpr(Seqlen_traits_Q::UseGQAPacking) { + return seqlen_traits_q.get_local_tile_tensor( + mQ, TileShapeQCopy{}, bidh_kv, bidb) + (_, _, _, m_block, bidh % int(mainloop_params.qhead_per_khead_divmod)); // (M/H, H, K) + } else { + return seqlen_traits_q.get_local_tile_tensor( + mQ, TileShapeQCopy{}, bidh, bidb)(_, _, m_block); // (M, K) + } + }(); Tensor gK = seqlen_traits_k.get_local_tile_tensor( - mK, select<1, 2>(TileShape_MNK{}), bidh_kv, bidb); // (N, K, _) + mK, select<1, 2>(TileShape_MNK{}), bidh_kv, bidb_cache); // (N, K, _) Tensor gV = seqlen_traits_k.get_local_tile_tensor( - mV, select<1, 2>(TileShape_MNK{}), bidh_kv, bidb); // (N, K, _) + mV, select<1, 2>(TileShape_MNK{}), bidh_kv, bidb_cache); // (N, K, _) Tensor sQ_x = make_tensor(sQ.data(), make_layout(sQ.layout(), Layout<_1>{})); Tensor gQ_x = make_tensor(gQ.data(), make_layout(gQ.layout(), Layout<_1>{})); @@ -439,7 +496,6 @@ struct CollectiveMainloopFwd { } } - int n_block_max = get_n_block_max(mainloop_params, m_block, seqlen_traits_q, seqlen_traits_k); int n_block = n_block_max - 1; int lane_predicate = cute::elect_one_sync(); @@ -454,142 +510,62 @@ struct CollectiveMainloopFwd { // for fp8, change from NumThreadsPerWarp to NumThreadsPerWarpGroup cutlass::arch::NamedBarrier::sync(NumMmaThreads + cutlass::NumThreadsPerWarpGroup, static_cast(FwdNamedBarriers::QueryEmpty) /*id*/); - if constexpr(Is_causal) { - if (warp_idx_in_warpgroup == 0 && lane_predicate) { - shared_storage.barrier_Q.arrive_and_expect_tx(TmaTransactionBytesQ); - copy(mainloop_params.tma_load_Q.with(reinterpret_cast(shared_storage.barrier_Q), 0 /*mcast_mask*/), tQgQ, tQsQ); + if (warp_idx_in_warpgroup == 0 && lane_predicate) { + shared_storage.barrier_Q.arrive_and_expect_tx(TmaTransactionBytesQ); + copy(mainloop_params.tma_load_Q.with(reinterpret_cast(shared_storage.barrier_Q), 0 /*mcast_mask*/), tQgQ, tQsQ); + if constexpr(!Ktraits::VO_union_all) { pipeline_v.producer_acquire(smem_pipe_write); copy(mainloop_params.tma_load_V.with(*pipeline_v.producer_get_barrier(smem_pipe_write), mcast_mask_kv), tVgV(_, n_block), tVsV(_, smem_pipe_write.index())); } - shared_storage.barrier_O.wait((work_idx + 1) % 2); - - CUTLASS_PRAGMA_UNROLL - for (int iter = 0; iter < kStages && n_block > 0; ++iter, --n_block) { - pipeline_v.consumer_wait(smem_pipe_read); - // pipeline_vt.producer_acquire(smem_pipe_write); - do_transpose_V(smem_pipe_read.index()); - pipeline_vt.producer_commit(smem_pipe_write); - pipeline_v.consumer_release(smem_pipe_read); - - ++smem_pipe_write; - ++smem_pipe_read; - - if (warp_idx_in_warpgroup == 0 && lane_predicate) { - pipeline_k.producer_acquire(smem_pipe_write); - copy(mainloop_params.tma_load_K.with(*pipeline_k.producer_get_barrier(smem_pipe_write), mcast_mask_kv), - tKgK(_, n_block-1), tKsK(_, smem_pipe_write.index())); - pipeline_v.producer_acquire(smem_pipe_write); - 
copy(mainloop_params.tma_load_V.with(*pipeline_v.producer_get_barrier(smem_pipe_write), mcast_mask_kv), - tVgV(_, n_block-1), tVsV(_, smem_pipe_write.index())); - } - } - - #pragma unroll 2 - for (; n_block > 0; --n_block) { - pipeline_v.consumer_wait(smem_pipe_read); - pipeline_vt.producer_acquire(smem_pipe_write); - do_transpose_V(smem_pipe_read.index()); - pipeline_vt.producer_commit(smem_pipe_write); - pipeline_v.consumer_release(smem_pipe_read); - - ++smem_pipe_write; - ++smem_pipe_read; - - if (warp_idx_in_warpgroup == 0 && lane_predicate) { - pipeline_k.producer_acquire(smem_pipe_write); - copy(mainloop_params.tma_load_K.with(*pipeline_k.producer_get_barrier(smem_pipe_write), mcast_mask_kv), - tKgK(_, n_block-1), tKsK(_, smem_pipe_write.index())); - pipeline_v.producer_acquire(smem_pipe_write); - copy(mainloop_params.tma_load_V.with(*pipeline_v.producer_get_barrier(smem_pipe_write), mcast_mask_kv), - tVgV(_, n_block-1), tVsV(_, smem_pipe_write.index())); - } - } - - scheduler.prefetch_next_work(scheduler_params, work_tile_info); - scheduler.broadcast_next_work(work_tile_info); - - pipeline_v.consumer_wait(smem_pipe_read); - if (n_block_max > kStages) - pipeline_vt.producer_acquire(smem_pipe_write); - do_transpose_V(smem_pipe_read.index()); - pipeline_vt.producer_commit(smem_pipe_write); - pipeline_v.consumer_release(smem_pipe_read); + } + // With fp8 kernel, smem_o is in union with smem_v_out, + // except for split kernel + hdim 256, + // so could use NamedBarrier instead of ClusterBarrier. + // But, this doesn't appear to have any benefit. + if constexpr (!No_smem_O) { shared_storage.barrier_O.wait((work_idx + 1) % 2); } - ++smem_pipe_write; - ++smem_pipe_read; - } else { + if constexpr(Ktraits::VO_union_all) { if (warp_idx_in_warpgroup == 0 && lane_predicate) { - shared_storage.barrier_Q.arrive_and_expect_tx(TmaTransactionBytesQ); - copy(mainloop_params.tma_load_Q.with(reinterpret_cast(shared_storage.barrier_Q), 0 /*mcast_mask*/), tQgQ, tQsQ); pipeline_v.producer_acquire(smem_pipe_write); copy(mainloop_params.tma_load_V.with(*pipeline_v.producer_get_barrier(smem_pipe_write), mcast_mask_kv), - tVgV(_, n_block), tVsV(_, smem_pipe_write.index())); + tVgV(_, n_block), tVsV(_, smem_pipe_write.index())); } - // With fp8 kernel, smem_o is in union with smem_v_out, - // so could use NamedBarrier instead of ClusterBarrier. - // But, this doesn't appear to have any benefit. 
- shared_storage.barrier_O.wait((work_idx + 1) % 2); - + } + + #pragma unroll 2 + for (; n_block > n_block_min; --n_block) { pipeline_v.consumer_wait(smem_pipe_read); - // pipeline_vt.producer_acquire(smem_pipe_write); + pipeline_vt.producer_acquire(smem_pipe_write); do_transpose_V(smem_pipe_read.index()); pipeline_vt.producer_commit(smem_pipe_write); pipeline_v.consumer_release(smem_pipe_read); ++smem_pipe_write; ++smem_pipe_read; - --n_block; - - constexpr int extra_iterations = kStages - 1; - CUTLASS_PRAGMA_UNROLL - for (int iter = 0; iter < extra_iterations && n_block >= 0; ++iter) { - if (warp_idx_in_warpgroup == 0 && lane_predicate) { - pipeline_k.producer_acquire(smem_pipe_write); - copy(mainloop_params.tma_load_K.with(*pipeline_k.producer_get_barrier(smem_pipe_write), mcast_mask_kv), - tKgK(_, n_block), tKsK(_, smem_pipe_write.index())); - pipeline_v.producer_acquire(smem_pipe_write); - copy(mainloop_params.tma_load_V.with(*pipeline_v.producer_get_barrier(smem_pipe_write), mcast_mask_kv), - tVgV(_, n_block), tVsV(_, smem_pipe_write.index())); - } - - pipeline_v.consumer_wait(smem_pipe_read); - // pipeline_vt.producer_acquire(smem_pipe_write); - do_transpose_V(smem_pipe_read.index()); - pipeline_vt.producer_commit(smem_pipe_write); - pipeline_v.consumer_release(smem_pipe_read); - - ++smem_pipe_write; - ++smem_pipe_read; - --n_block; - } + + if (warp_idx_in_warpgroup == 0 && lane_predicate) { + pipeline_k.producer_acquire(smem_pipe_write); + copy(mainloop_params.tma_load_K.with(*pipeline_k.producer_get_barrier(smem_pipe_write), mcast_mask_kv), + tKgK(_, n_block-1), tKsK(_, smem_pipe_write.index())); + pipeline_v.producer_acquire(smem_pipe_write); + copy(mainloop_params.tma_load_V.with(*pipeline_v.producer_get_barrier(smem_pipe_write), mcast_mask_kv), + tVgV(_, n_block-1), tVsV(_, smem_pipe_write.index())); + } + } - // CUTLASS_PRAGMA_NO_UNROLL - #pragma unroll 2 - for (; n_block >= 0; --n_block) { - - if (warp_idx_in_warpgroup == 0 && lane_predicate) { - pipeline_k.producer_acquire(smem_pipe_write); - copy(mainloop_params.tma_load_K.with(*pipeline_k.producer_get_barrier(smem_pipe_write), mcast_mask_kv), - tKgK(_, n_block), tKsK(_, smem_pipe_write.index())); - pipeline_v.producer_acquire(smem_pipe_write); - copy(mainloop_params.tma_load_V.with(*pipeline_v.producer_get_barrier(smem_pipe_write), mcast_mask_kv), - tVgV(_, n_block), tVsV(_, smem_pipe_write.index())); - } - - pipeline_v.consumer_wait(smem_pipe_read); - pipeline_vt.producer_acquire(smem_pipe_write); - do_transpose_V(smem_pipe_read.index()); - pipeline_vt.producer_commit(smem_pipe_write); - pipeline_v.consumer_release(smem_pipe_read); - - ++smem_pipe_write; - ++smem_pipe_read; - } - // scheduler.prefetch_next_work(scheduler_params, work_tile_info); - // scheduler.broadcast_next_work(work_tile_info); - } + scheduler.prefetch_next_work(scheduler_params, work_tile_info); + scheduler.broadcast_next_work(work_tile_info); + + pipeline_v.consumer_wait(smem_pipe_read); + pipeline_vt.producer_acquire(smem_pipe_write); + do_transpose_V(smem_pipe_read.index()); + pipeline_vt.producer_commit(smem_pipe_write); + pipeline_v.consumer_release(smem_pipe_read); + + ++smem_pipe_write; + ++smem_pipe_read; } /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster @@ -635,13 +611,16 @@ struct CollectiveMainloopFwd { CUTLASS_DEVICE void warp_scheduler_barrier_arrive() { - if constexpr (!UseSchedulerBarrier) { return; } - static_assert(NumMmaThreads == 2 * cutlass::NumThreadsPerWarpGroup || NumMmaThreads == 3 * 
cutlass::NumThreadsPerWarpGroup); - if constexpr (NumMmaThreads == 2 * cutlass::NumThreadsPerWarpGroup) { - cutlass::arch::NamedBarrier::arrive(NumMmaThreads, static_cast(FwdNamedBarriers::WarpSchedulerWG1) - 1 + (3 - cutlass::canonical_warp_group_idx()) /*id*/); + if constexpr (!UseSchedulerBarrier) { + return; } else { - cutlass::arch::NamedBarrier::arrive(NumMmaThreads, static_cast(FwdNamedBarriers::WarpSchedulerWG1) - 1 + (cutlass::canonical_warp_group_idx() <= 2 ? cutlass::canonical_warp_group_idx() + 1 : cutlass::canonical_warp_group_idx() + 1 - 3) /*id*/); - cutlass::arch::NamedBarrier::arrive(NumMmaThreads, static_cast(FwdNamedBarriers::WarpSchedulerWG1) - 1 + (cutlass::canonical_warp_group_idx() <= 1 ? cutlass::canonical_warp_group_idx() + 2 : cutlass::canonical_warp_group_idx() + 2 - 3) /*id*/); + static_assert(NumMmaThreads == 2 * cutlass::NumThreadsPerWarpGroup || NumMmaThreads == 3 * cutlass::NumThreadsPerWarpGroup); + if constexpr (NumMmaThreads == 2 * cutlass::NumThreadsPerWarpGroup) { + cutlass::arch::NamedBarrier::arrive(NumMmaThreads, static_cast(FwdNamedBarriers::WarpSchedulerWG1) - 1 + (3 - cutlass::canonical_warp_group_idx()) /*id*/); + } else { + cutlass::arch::NamedBarrier::arrive(NumMmaThreads, static_cast(FwdNamedBarriers::WarpSchedulerWG1) - 1 + (cutlass::canonical_warp_group_idx() <= 2 ? cutlass::canonical_warp_group_idx() + 1 : cutlass::canonical_warp_group_idx() + 1 - 3) /*id*/); + cutlass::arch::NamedBarrier::arrive(NumMmaThreads, static_cast(FwdNamedBarriers::WarpSchedulerWG1) - 1 + (cutlass::canonical_warp_group_idx() <= 1 ? cutlass::canonical_warp_group_idx() + 2 : cutlass::canonical_warp_group_idx() + 2 - 3) /*id*/); + } } } @@ -649,17 +628,19 @@ struct CollectiveMainloopFwd { mma_init() { // Tell producer (warp 0) that smem_q is ready cutlass::arch::NamedBarrier::arrive(NumMmaThreads + Ktraits::NumProducerThreads, static_cast(FwdNamedBarriers::QueryEmpty) /*id*/); - if constexpr (!UseSchedulerBarrier) { return; } - static_assert(NumMmaThreads == 2 * cutlass::NumThreadsPerWarpGroup || NumMmaThreads == 3 * cutlass::NumThreadsPerWarpGroup); - if (cutlass::canonical_warp_group_idx() > 1) { - cutlass::arch::NamedBarrier::arrive(NumMmaThreads, static_cast(FwdNamedBarriers::WarpSchedulerWG1) - 1 + 1 /*id*/); - } - if constexpr (NumMmaThreads == 3 * cutlass::NumThreadsPerWarpGroup) { - if (cutlass::canonical_warp_group_idx() > 2) { - cutlass::arch::NamedBarrier::arrive(NumMmaThreads, static_cast(FwdNamedBarriers::WarpSchedulerWG1) - 1 + 2 /*id*/); + if constexpr (!UseSchedulerBarrier) { + return; + } else { + static_assert(NumMmaThreads == 2 * cutlass::NumThreadsPerWarpGroup || NumMmaThreads == 3 * cutlass::NumThreadsPerWarpGroup); + if (cutlass::canonical_warp_group_idx() > 1) { + cutlass::arch::NamedBarrier::arrive(NumMmaThreads, static_cast(FwdNamedBarriers::WarpSchedulerWG1) - 1 + 1 /*id*/); + } + if constexpr (NumMmaThreads == 3 * cutlass::NumThreadsPerWarpGroup) { + if (cutlass::canonical_warp_group_idx() > 2) { + cutlass::arch::NamedBarrier::arrive(NumMmaThreads, static_cast(FwdNamedBarriers::WarpSchedulerWG1) - 1 + 2 /*id*/); + } } } - } template @@ -671,19 +652,20 @@ struct CollectiveMainloopFwd { PipelineState& smem_pipe_read_v, FrgTensorO& tOrO, Softmax& softmax, - int n_block_count, int n_block_min, + int n_block_max, int thread_idx, int work_idx, int m_block, SharedStorage& shared_storage, - const Seqlen_traits& seqlen_traits_q, + const Seqlen_traits_Q& seqlen_traits_q, const Seqlen_traits& seqlen_traits_k ) { static_assert(is_rmem::value, "O tensor 
must be rmem resident."); - static constexpr int kBlockM = get<0>(TileShape_MNK{}); static constexpr int kBlockN = get<1>(TileShape_MNK{}); + static constexpr int kBlockH = Ktraits::kBlockH; + static constexpr int kBlockM_div_H = get<0>(TileShape_MNK{}) / kBlockH; Tensor sQ = make_tensor(make_smem_ptr(shared_storage.smem_q.data()), SmemLayoutQ{}); Tensor sK = make_tensor(make_smem_ptr(shared_storage.smem_k.data()), SmemLayoutK{}); @@ -709,24 +691,26 @@ struct CollectiveMainloopFwd { tiled_mma1.accumulate_ = GMMA::ScaleOut::Zero; int const seqlen_q = seqlen_traits_q.actual_seq_len; int const seqlen_k = seqlen_traits_k.actual_seq_len; - int n_block = n_block_count - 1; + int n_block = n_block_max - 1; cutlass::ConsumerToken barrier_token = static_cast(shared_storage.barrier_Q.try_wait(work_idx % 2)); if (barrier_token == cutlass::BarrierStatus::WaitAgain) { shared_storage.barrier_Q.wait(work_idx % 2); } Tensor tSrS = partition_fragment_C(tiled_mma0, select<0, 1>(TileShape_MNK{})); + consumer_wait(pipeline_k, smem_pipe_read_k); warp_scheduler_barrier_sync(); flash::gemm(tiled_mma0, tSrQ, tSrK(_, _, _, smem_pipe_read_k.index()), tSrS); warp_scheduler_barrier_arrive(); - - if (work_idx != 0) { - int lane_predicate = cute::elect_one_sync(); - if (cutlass::canonical_warp_idx_sync() == Ktraits::kNWarps - 1 && lane_predicate) { - tma_store_wait<0>(); - #pragma unroll - for (uint32_t cta_id = 0; cta_id < size(ClusterShape{}); ++cta_id) { - shared_storage.barrier_O.arrive(cta_id, lane_predicate); + if constexpr (!No_smem_O) { + if (work_idx != 0) { + int lane_predicate = cute::elect_one_sync(); + if (cutlass::canonical_warp_idx_sync() == Ktraits::kNWarps - 1 && lane_predicate) { + tma_store_wait<0>(); + #pragma unroll + for (uint32_t cta_id = 0; cta_id < size(ClusterShape{}); ++cta_id) { + shared_storage.barrier_O.arrive(cta_id, lane_predicate); + } } } } @@ -735,15 +719,16 @@ struct CollectiveMainloopFwd { ++smem_pipe_read_k; auto col_limit_right = [&](int row, int n_block) { - return std::min( - seqlen_k - n_block * kBlockN, - row + 1 + seqlen_k - n_block * kBlockN - seqlen_q + m_block * kBlockM + mainloop_params.window_size_right - ); + int col_limit_base = row + 1 + seqlen_k - n_block * kBlockN - seqlen_q + m_block * kBlockM_div_H; + if constexpr(Is_local) + return col_limit_base + mainloop_params.window_size_right; + else + return col_limit_base; }; auto col_limit_left = [&](int row, int n_block) { return std::max( 0, - row + seqlen_k - n_block * kBlockN - seqlen_q + m_block * kBlockM - mainloop_params.window_size_left + row + seqlen_k - n_block * kBlockN - seqlen_q + m_block * kBlockM_div_H - mainloop_params.window_size_left ); }; { @@ -757,14 +742,15 @@ struct CollectiveMainloopFwd { // using std::min is faster than doing col >= limit0 or col >= limit1 // Need to cast get<1>(tScS(i)) to (signed) int since by default it's unsigned, and the // right hand side can be negative and might be converted to a very large unsigned integer. 
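The masking change just above replaces the single causal column limit with a right/left pair and divides the tile row by `kBlockH`, so GQA-packed rows map back to real sequence positions. A rough Python restatement of the predicate applied on the causal/local paths for the boundary KV block (the helper name and flat arguments are made up for the sketch; later masking steps in the kernel drop the `seqlen_k` bound):

```python
# Sketch of the per-element score mask used on the boundary KV block above; illustrative only.
def score_is_masked(row_in_tile, col_in_tile, m_block, n_block,
                    seqlen_q, seqlen_k, kBlockM_div_H, kBlockN, kBlockH,
                    is_local=False, window_left=0, window_right=0):
    # kBlockH consecutive tile rows hold different query heads at the same sequence
    # position, so the logical row index is row_in_tile // kBlockH.
    row = row_in_tile // kBlockH
    base = row + 1 + seqlen_k - n_block * kBlockN - seqlen_q + m_block * kBlockM_div_H
    col_limit_right = base + window_right if is_local else base
    # Mask columns past the end of the sequence or past the causal/local right limit.
    if col_in_tile >= min(seqlen_k - n_block * kBlockN, col_limit_right):
        return True
    if is_local:
        # Left edge of the sliding window.
        col_limit_left = max(0, base - 1 - window_left)
        if col_in_tile < col_limit_left:
            return True
    return False
```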
- if (int(get<1>(tScS(i))) >= col_limit_right(int(get<0>(tScS(i))), n_block)) { + int row = int(get<0>(tScS(i))) / kBlockH; + if (int(get<1>(tScS(i))) >= std::min(seqlen_k - n_block * kBlockN, col_limit_right(row, n_block))) { tSrS(i) = -INFINITY; - } else if constexpr (Is_local) { - if (int(get<1>(tScS(i))) < col_limit_left(int(get<0>(tScS(i))), n_block)) { + } else if constexpr(Is_local) { + if (int(get<1>(tScS(i))) < col_limit_left(row, n_block)) { tSrS(i) = -INFINITY; } } - } + } } } @@ -774,7 +760,7 @@ struct CollectiveMainloopFwd { Tensor scores_scale = make_fragment_like(softmax.row_max); clear(scores_scale); - constexpr int n_masking_steps = (!Is_causal) ? 1 : cute::ceil_div(kBlockM, kBlockN) + 1; + constexpr int n_masking_steps = !Is_causal ? 1 : cute::ceil_div(kBlockM_div_H, kBlockN) + 1; // Only go through these if Is_causal, since n_masking_steps = 1 when !Is_causal #pragma unroll for (int masking_step = 0; masking_step < n_masking_steps - 1 && n_block > n_block_min; ++masking_step, --n_block) { @@ -792,7 +778,8 @@ struct CollectiveMainloopFwd { Tensor tScS = threadMma0.partition_C(cS); #pragma unroll for (int i = 0; i < size(tSrS); ++i) { - if (int(get<1>(tScS(i))) >= col_limit_right(int(get<0>(tScS(i))), n_block - 1)) { + int row = int(get<0>(tScS(i))) / kBlockH; + if (int(get<1>(tScS(i))) >= col_limit_right(row, n_block - 1)) { tSrS(i) = -INFINITY; } } @@ -823,9 +810,10 @@ struct CollectiveMainloopFwd { Tensor tScS = threadMma0.partition_C(cS); #pragma unroll for (int i = 0; i < size(tSrS); ++i) { + int row = int(get<0>(tScS(i))) / kBlockH; if ( - int(get<1>(tScS(i))) >= col_limit_right(int(get<0>(tScS(i))), n_block - 1) || - int(get<1>(tScS(i))) < col_limit_left(int(get<0>(tScS(i))), n_block - 1) + int(get<1>(tScS(i))) >= col_limit_right(row, n_block - 1) || + int(get<1>(tScS(i))) < col_limit_left(row, n_block - 1) ) { tSrS(i) = -INFINITY; } @@ -847,7 +835,7 @@ struct CollectiveMainloopFwd { softmax.rescale_o(tOrO, scores_scale); consumer_wait(pipeline_v, smem_pipe_read_v); flash::gemm(tiled_mma1, tOrP, tOrV(_, _, _, smem_pipe_read_v.index()), tOrO); - cute::copy(softmax.template finalize(tSrS), scores_scale); + cute::copy(softmax.template finalize(tSrS), scores_scale); warpgroup_wait<0>(); pipeline_v.consumer_release(smem_pipe_read_v); // release V, otherwise producers will hang ++smem_pipe_read_v; @@ -864,18 +852,21 @@ struct CollectiveMainloopFwd { PipelineState& smem_pipe_release, FrgTensorO& tOrO, Softmax& softmax, - int n_block_count, + int n_block_min, + int n_block_max, int thread_idx, int work_idx, int m_block, SharedStorage& shared_storage, - const Seqlen_traits& seqlen_traits_q, + const Seqlen_traits_Q& seqlen_traits_q, const Seqlen_traits& seqlen_traits_k ) { static_assert(is_rmem::value, "O tensor must be rmem resident."); - static constexpr int kBlockM = get<0>(TileShape_MNK{}); + // static constexpr int kBlockM = get<0>(TileShape_MNK{}); static constexpr int kBlockN = get<1>(TileShape_MNK{}); + static constexpr int kBlockH = Ktraits::kBlockH; + static constexpr int kBlockM_div_H = get<0>(TileShape_MNK{}) / kBlockH; Tensor sQ = make_tensor(make_smem_ptr(shared_storage.smem_q.data()), SmemLayoutQ{}); Tensor sK = make_tensor(make_smem_ptr(shared_storage.smem_k.data()), SmemLayoutK{}); @@ -898,10 +889,9 @@ struct CollectiveMainloopFwd { }; tiled_mma1.accumulate_ = GMMA::ScaleOut::Zero; - // workaround for fp8 only perf regression pending change to seqlen traits class - int const seqlen_q = Seqlen_traits::kUseVarSeqLen ? 
seqlen_traits_q.actual_seq_len : shape<0>(mainloop_params.layout_Q); - int const seqlen_k = Seqlen_traits::kUseVarSeqLen ? seqlen_traits_k.actual_seq_len : shape<0>(mainloop_params.layout_K); - int n_block = n_block_count - 1; + int const seqlen_q = seqlen_traits_q.actual_seq_len; + int const seqlen_k = seqlen_traits_k.actual_seq_len; + int n_block = n_block_max - 1; cutlass::ConsumerToken barrier_token = static_cast(shared_storage.barrier_Q.try_wait(work_idx % 2)); if (barrier_token == cutlass::BarrierStatus::WaitAgain) { shared_storage.barrier_Q.wait(work_idx % 2); } @@ -911,34 +901,50 @@ struct CollectiveMainloopFwd { consumer_wait(pipeline_k, smem_pipe_read); warp_scheduler_barrier_sync(); flash::gemm(tiled_mma0, tSrQ, tSrK(_, _, _, smem_pipe_read.index()), tSrS); - if (work_idx != 0) { - int lane_predicate = cute::elect_one_sync(); - if (cutlass::canonical_warp_idx_sync() == Ktraits::kNWarps - 1 && lane_predicate) { - tma_store_wait<0>(); - #pragma unroll - for (uint32_t cta_id = 0; cta_id < size(ClusterShape{}); ++cta_id) { - shared_storage.barrier_O.arrive(cta_id, lane_predicate); - } - } + if constexpr (!No_smem_O) { + if (work_idx != 0) { + int lane_predicate = cute::elect_one_sync(); + if (cutlass::canonical_warp_idx_sync() == Ktraits::kNWarps - 1 && lane_predicate) { + tma_store_wait<0>(); + #pragma unroll + for (uint32_t cta_id = 0; cta_id < size(ClusterShape{}); ++cta_id) { + shared_storage.barrier_O.arrive(cta_id, lane_predicate); + } + } + } } warpgroup_wait<0>(); warp_scheduler_barrier_arrive(); pipeline_k.consumer_release(smem_pipe_read); - auto col_limit_causal = [&](int row, int n_block) { - return row + 1 + seqlen_k - n_block * kBlockN - seqlen_q + m_block * kBlockM; + auto col_limit_right = [&](int row, int n_block) { + int col_limit_base = row + 1 + seqlen_k - n_block * kBlockN - seqlen_q + m_block * kBlockM_div_H; + if constexpr(Is_local) + return col_limit_base + mainloop_params.window_size_right; + else + return col_limit_base; + }; + auto col_limit_left = [&](int row, int n_block) { + return std::max( + 0, + row + seqlen_k - n_block * kBlockN - seqlen_q + m_block * kBlockM_div_H - mainloop_params.window_size_left + ); }; { Tensor cS = cute::make_identity_tensor(select<0, 1>(TileShape_MNK{})); Tensor tScS = threadMma0.partition_C(cS); #pragma unroll for (int i = 0; i < size(tSrS); ++i) { - if constexpr (!Is_causal) { // Just masking based on col + if constexpr (!Is_causal && !Is_local) { // Just masking based on col if (int(get<1>(tScS(i))) >= int(seqlen_k - n_block * kBlockN)) { tSrS(i) = -INFINITY; } } else { // mask based on both row and col - if (int(get<1>(tScS(i))) >= std::min(seqlen_k - n_block * kBlockN, - col_limit_causal(int(get<0>(tScS(i))), n_block))) { + int row = int(get<0>(tScS(i))) / kBlockH; + if (int(get<1>(tScS(i))) >= std::min(seqlen_k - n_block * kBlockN, col_limit_right(row, n_block))) { tSrS(i) = -INFINITY; + } else if constexpr(Is_local) { + if (int(get<1>(tScS(i))) < col_limit_left(row, n_block)) { + tSrS(i) = -INFINITY; + } } } } @@ -957,11 +963,11 @@ struct CollectiveMainloopFwd { ++smem_pipe_read; --n_block; - constexpr int extra_iterations = !Is_causal ? kStages - 1 : cute::ceil_div(kBlockM, kBlockN); + constexpr int extra_iterations = !Is_causal ? 
kStages - 1 : cute::ceil_div(kBlockM_div_H, kBlockN); if constexpr(Is_causal) { - CUTLASS_PRAGMA_UNROLL - for (int iter = 0; iter < extra_iterations && n_block >= 0; ++iter, --n_block) { + CUTLASS_PRAGMA_UNROLL + for (int iter = 0; iter < extra_iterations && n_block >= n_block_min; ++iter, --n_block) { Tensor tSrS = partition_fragment_C(tiled_mma0, select<0, 1>(TileShape_MNK{})); consumer_wait(pipeline_k, smem_pipe_read); warp_scheduler_barrier_sync(); @@ -971,7 +977,8 @@ struct CollectiveMainloopFwd { Tensor tScS = threadMma0.partition_C(cS); #pragma unroll for (int i = 0; i < size(tSrS); ++i) { - if (int(get<1>(tScS(i))) >= col_limit_causal(int(get<0>(tScS(i))), n_block)) { + int row = int(get<0>(tScS(i))) / kBlockH; + if (int(get<1>(tScS(i))) >= col_limit_right(row, n_block)) { tSrS(i) = -INFINITY; } } @@ -991,12 +998,12 @@ struct CollectiveMainloopFwd { permute_regs_A_to_C(tOrP); flash::gemm(tiled_mma1, tOrP, tOrV(_, _, _, smem_pipe_read.index()), tOrO); - if constexpr(!Delay_V_release) { pipeline_vt.consumer_release(smem_pipe_read); } + if constexpr(!Delay_V_release) { pipeline_vt.consumer_release(smem_pipe_read); } ++smem_pipe_read; } - } else { + } else if constexpr(!Is_local) { CUTLASS_PRAGMA_UNROLL - for (int iter = 0; iter < extra_iterations && n_block >= 0; ++iter, --n_block) { + for (int iter = 0; iter < extra_iterations && n_block >= n_block_min; ++iter, --n_block) { Tensor tSrS = partition_fragment_C(tiled_mma0, select<0, 1>(TileShape_MNK{})); consumer_wait(pipeline_k, smem_pipe_read); if constexpr(Delay_V_release) { @@ -1009,9 +1016,9 @@ struct CollectiveMainloopFwd { if constexpr(!Delay_V_release) { pipeline_k.consumer_release(smem_pipe_read); } else { consumer_wait(pipeline_vt, smem_pipe_read); } - cute::copy(softmax.template max(tSrS), scores_scale); + cute::copy(softmax.template max(tSrS), scores_scale); softmax.rescale_o(tOrO, scores_scale); - softmax.template online_softmax(tSrS); + softmax.template online_softmax(tSrS); Tensor tOrP = make_tensor(convert_type(tSrS).data(), convert_layout_acc_Aregs_fp8(tSrS.layout())); permute_regs_A_to_C(tOrP); @@ -1026,17 +1033,33 @@ struct CollectiveMainloopFwd { if constexpr(Delay_V_release) { warp_scheduler_barrier_sync(); CUTLASS_PRAGMA_NO_UNROLL - for (; n_block >= 0; --n_block) { + for (; n_block >= n_block_min; --n_block) { Tensor tSrS = partition_fragment_C(tiled_mma0, select<0, 1>(TileShape_MNK{})); consumer_wait(pipeline_k, smem_pipe_read); - flash::gemm(tiled_mma0, tSrQ, tSrK(_, _, _, smem_pipe_read.index()), tSrS); + flash::gemm(tiled_mma0, tSrQ, tSrK(_, _, _, smem_pipe_read.index()), tSrS); + + if constexpr(Is_local) { + Tensor cS = cute::make_identity_tensor(select<0, 1>(TileShape_MNK{})); + Tensor tScS = threadMma0.partition_C(cS); + #pragma unroll + for (int i = 0; i < size(tSrS); ++i) { + int row = int(get<0>(tScS(i))) / kBlockH; + if ( + int(get<1>(tScS(i))) >= col_limit_right(row, n_block) || + int(get<1>(tScS(i))) < col_limit_left(row, n_block) + ) { + tSrS(i) = -INFINITY; + } + } + } + warp_scheduler_barrier_arrive(); pipeline_k.consumer_release(smem_pipe_read); pipeline_vt.consumer_release(smem_pipe_release); - cute::copy(softmax.template max(tSrS), scores_scale); + cute::copy(softmax.template max(tSrS), scores_scale); softmax.rescale_o(tOrO, scores_scale); - softmax.template online_softmax(tSrS); + softmax.template online_softmax(tSrS); Tensor tOrP = make_tensor(convert_type(tSrS).data(), convert_layout_acc_Aregs_fp8(tSrS.layout())); permute_regs_A_to_C(tOrP); @@ -1052,17 +1075,33 @@ struct CollectiveMainloopFwd 
{ } else { if constexpr (kHeadDim == 128) { warp_scheduler_barrier_sync(); } CUTLASS_PRAGMA_NO_UNROLL - for (; n_block >= 0; --n_block) { + for (; n_block >= n_block_min; --n_block) { Tensor tSrS = partition_fragment_C(tiled_mma0, select<0, 1>(TileShape_MNK{})); consumer_wait(pipeline_k, smem_pipe_read); if constexpr (kHeadDim == 256) { warp_scheduler_barrier_sync(); } flash::gemm(tiled_mma0, tSrQ, tSrK(_, _, _, smem_pipe_read.index()), tSrS); + + if constexpr(Is_local) { + Tensor cS = cute::make_identity_tensor(select<0, 1>(TileShape_MNK{})); + Tensor tScS = threadMma0.partition_C(cS); + #pragma unroll + for (int i = 0; i < size(tSrS); ++i) { + int row = int(get<0>(tScS(i))) / kBlockH; + if ( + int(get<1>(tScS(i))) >= col_limit_right(row, n_block) || + int(get<1>(tScS(i))) < col_limit_left(row, n_block) + ) { + tSrS(i) = -INFINITY; + } + } + } + warp_scheduler_barrier_arrive(); pipeline_k.consumer_release(smem_pipe_read); - cute::copy(softmax.template max(tSrS), scores_scale); + cute::copy(softmax.template max(tSrS), scores_scale); softmax.rescale_o(tOrO, scores_scale); - softmax.template online_softmax(tSrS); + softmax.template online_softmax(tSrS); Tensor tOrP = make_tensor(convert_type(tSrS).data(), convert_layout_acc_Aregs_fp8(tSrS.layout())); permute_regs_A_to_C(tOrP); @@ -1075,7 +1114,7 @@ struct CollectiveMainloopFwd { if constexpr (kHeadDim == 128) { warp_scheduler_barrier_arrive(); } } cutlass::arch::NamedBarrier::arrive(NumMmaThreads + cutlass::NumThreadsPerWarpGroup, static_cast(FwdNamedBarriers::QueryEmpty) /*id*/); - cute::copy(softmax.template finalize(tSrS, shared_storage.descale_v), scores_scale); + cute::copy(softmax.template finalize(tSrS, shared_storage.descale_v), scores_scale); softmax.rescale_o(tOrO, scores_scale); return; } diff --git a/hopper/seq_len.h b/hopper/seq_len.h index 76c4d08a3..63c18679f 100644 --- a/hopper/seq_len.h +++ b/hopper/seq_len.h @@ -4,6 +4,9 @@ #pragma once +#include +#include + #include #include @@ -11,8 +14,11 @@ namespace flash { static constexpr int kMaxTileSize = 128; -template class SeqLenTraits { +template class SeqLenTraits { public: + static_assert(!(UseVarSeqLen_ && UseGQAPacking_), + "Variable sequence length with GQA parallelization not implemented yet."); + // Total number of queries / keys. Unpadded. int sum_s = 0; // seq len offsets. @@ -23,17 +29,26 @@ template class SeqLenTraits { int actual_seq_len = -1; // Whether this is for fixed-seq-len or var-seq-len. 
- static constexpr bool kUseVarSeqLen = UseVarSeqLen; + static constexpr bool UseVarSeqLen = UseVarSeqLen_; + static constexpr bool UseGQAPacking = UseGQAPacking_; using ShapeT = std::conditional_t< UseVarSeqLen, - cute::Shape, - cute::Shape + cute::Shape, + std::conditional_t< + UseGQAPacking, + cute::Shape, + cute::Shape + > >; using StrideT = std::conditional_t< UseVarSeqLen, cute::Shape, - cute::Shape + std::conditional_t< + UseGQAPacking, + cute::Shape, + cute::Shape + > >; using LayoutT = cute::Layout; @@ -49,33 +64,102 @@ template class SeqLenTraits { >; using LayoutLseT = cute::Layout; + // Not used for varseqlen + using ShapeOAccumT = std::conditional_t< + UseGQAPacking, + cute::Shape, + cute::Shape + >; + using StrideOAccumT = std::conditional_t< + UseGQAPacking, + cute::Shape, + cute::Shape + >; + using LayoutOAccumT = cute::Layout; + + using ShapeLseAccumT = cute::Shape; + using StrideLseAccumT = cute::Shape; + using LayoutLseAccumT = cute::Layout; + CUTLASS_HOST SeqLenTraits() {} CUTLASS_HOST SeqLenTraits( int sum_s, int max_seq_len, int *cu_seq_len = nullptr, int *seq_used = nullptr): sum_s(sum_s), cu_seq_len(cu_seq_len), seq_used(seq_used), actual_seq_len(max_seq_len) {} + CUTLASS_DEVICE void init(int bidb) { + // TODO: add leftpad, seqlen_new for kv cache support + if (seq_used) { + actual_seq_len = seq_used[bidb]; + } + } + + CUTLASS_DEVICE void init_no_guard(int bidb) { + actual_seq_len = seq_used[bidb]; + } + // Returns the layout of a tensor in MKHB format in global memory. // padded: only useful for var-seq-len for dq_accum and softmax_d. CUTLASS_HOST_DEVICE auto get_gmem_layout( int m, int k, int h, int b, int64_t m_stride, int64_t h_stride, int64_t b_stride, bool padded = false) const { - static_assert(!UseVarSeqLen, "Default implementation is for FixedSeqLen."); + static_assert(!UseVarSeqLen, "Specialize default implementation for VarSeqLen."); + // static_assert(!UseGQAPacking, "Specialize default implementation for UseGQAPacking."); return make_layout(make_shape(m, k, h, b), make_stride(m_stride, cute::_1{}, h_stride, b_stride)); } // Returns the layout of a tensor in MKHB format in global memory. // padded: only useful for var-seq-len for dq_accum and softmax_d. + // Overload that separates h into h_k and h/h_k. + CUTLASS_HOST_DEVICE auto get_gmem_layout( + int m, int k, int h_k, int b, int h_h_k_ratio, + int64_t m_stride, int64_t h_stride, int64_t b_stride, + bool padded = false) const { + static_assert(!UseVarSeqLen, "Specialize default implementation for VarSeqLen."); + static_assert(!UseGQAPacking, "Specialize default implementation for UseGQAPacking."); + return make_layout(make_shape(m, k, h_k * h_h_k_ratio, b), + make_stride(m_stride, cute::_1{}, h_stride, b_stride)); + } + + // Returns the layout of a tensor in MKHBT format in global memory, + // where T is number of splits. + CUTLASS_HOST_DEVICE auto get_oaccum_gmem_layout( + int m, int k, int h, int b, int num_splits, + int64_t m_stride, int64_t h_stride, int64_t b_stride, int64_t split_stride, + bool padded = false) const { + return make_layout(make_shape(m, k, h, b, num_splits), + make_stride(m_stride, cute::_1{}, h_stride, b_stride, split_stride)); + } + + // Returns the layout of a tensor in MKHBT format in global memory, + // where T is number of splits. + // Overload that separates h into h_k and h/h_k. 
+ CUTLASS_HOST_DEVICE auto get_oaccum_gmem_layout( + int m, int k, int h_k, int b, int h_h_k_ratio, int num_splits, + int64_t m_stride, int64_t h_stride, int64_t b_stride, int64_t split_stride, + bool padded = false) const { + return make_layout(make_shape(m, k, h_k * h_h_k_ratio, b, num_splits), + make_stride(m_stride, cute::_1{}, h_stride, b_stride, split_stride)); + } + + // Returns the layout of lse tensor in BHM format in global memory. + // padded: only useful for var-seq-len for dq_accum and softmax_d. CUTLASS_HOST_DEVICE auto get_lse_gmem_layout( int m, int h, int b, bool padded = false) const { - static_assert(!UseVarSeqLen, "Default implementation is for FixedSeqLen."); + static_assert(!UseVarSeqLen, "Specialize default implementation for VarSeqLen."); return make_layout(make_shape(b, h, m), make_stride(int64_t(h * m), int64_t(m), cute::_1())); } - CUTLASS_DEVICE void init(int bidb) {} + // Returns the layout of lse tensor in TBHM format in global memory, + // where T is number of splits. + CUTLASS_HOST_DEVICE auto get_lseaccum_gmem_layout( + int m, int h, int b, int num_splits, bool padded = false) const { + return make_layout(make_shape(num_splits, b, h, m), + make_stride(int64_t(b * h * m), int64_t(h * m), int64_t(m), cute::_1())); + } template CUTLASS_DEVICE auto get_local_tile_tensor( @@ -86,18 +170,57 @@ template class SeqLenTraits { return g_tensor; } - template + template CUTLASS_DEVICE auto get_lse_local_tile_tensor( const MTensor &m_tensor, const Shape &tile_shape, - int bidh, int bidb, bool padded = false) const { - auto g_tensor = local_tile(m_tensor(bidb, bidh, _), tile_shape, make_coord(_)); - return g_tensor; + int bidh, int bidb, int n_split_idx, bool padded = false) const { + // m_tensor has shape (B, H, M) or (splits, B, H, M) + // Expect tile shape (bM) + // Returns g_tensor of shape = (bM, ceil_div(M,bM)) + if constexpr(!Is_split) { + auto g_tensor = local_tile(m_tensor(bidb, bidh, _), tile_shape, make_coord(_)); + return g_tensor; + } else { + auto g_tensor = local_tile(m_tensor(n_split_idx, bidb, bidh, _), tile_shape, make_coord(_)); + return g_tensor; + } } + + template + CUTLASS_DEVICE auto get_o_local_tile_tensor( + const MTensor &m_tensor, const Shape &tile_shape, + int bidh, int bidb, int split_idx, bool padded = false) const { + // static_assert(!UseVarSeqLen, "Don't use get_o_local_tile_tensor with VarSeqLen."); + // m_tensor has shape (M, K, H, B) or (M, K, H, B, splits) + // Expect tile shape (bM, K) + // Returns g_tensor of shape = (bM, K, ceil_div(M,bM)) + if constexpr(!Is_split) { + auto g_tensor = local_tile( + m_tensor(_, _, bidh, bidb), tile_shape, make_coord(_, _0{})); + return g_tensor; + } else { + auto g_tensor = local_tile( + m_tensor(_, _, bidh, bidb, split_idx), tile_shape, make_coord(_, _0{})); + return g_tensor; + } + } + }; -using FixedSeqLenTraits = SeqLenTraits; +using FixedSeqLenTraits = SeqLenTraits; +using VarSeqLenTraits = SeqLenTraits; +using FixedGQASeqLenTraits = SeqLenTraits; + +template <> +CUTLASS_DEVICE void VarSeqLenTraits::init(int bidb) { + actual_seq_len = + seq_used ? seq_used[bidb] : (cu_seq_len[bidb + 1] - cu_seq_len[bidb]); +} -using VarSeqLenTraits = SeqLenTraits; +template <> +CUTLASS_DEVICE void FixedGQASeqLenTraits::init(int bidb) { + // no op +} // Returns the static layout of a var-seq-len tensor in global memory based on // max_seq_len and max_batch_size. 
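The new `FixedGQASeqLenTraits` (`SeqLenTraits<false, true>`) does not rearrange memory; as far as I can tell from the layouts above, it re-views the usual query layout so that the `h/h_k` query heads sharing one KV head sit next to each other and can be tiled together (the `(M, H/H_K, K, H_K, B)` shape). A small PyTorch illustration of the equivalent host-side regrouping, with made-up shapes:

```python
import torch

# Illustrative shapes only; not taken from the patch.
batch, seqlen, nheads_q, nheads_kv, headdim = 2, 8, 8, 2, 64
h_ratio = nheads_q // nheads_kv   # h / h_k: query heads per KV head (GQA group size)

q = torch.randn(batch, seqlen, nheads_q, headdim)

# Group query heads by the KV head they attend with. Query head h uses KV head
# h // h_ratio, so the heads of one group are already contiguous and a view suffices.
q_grouped = q.view(batch, seqlen, nheads_kv, h_ratio, headdim)

# A GQA-packed tile then covers kBlockH query heads of a single KV head, which is why
# an M tile of kBlockM rows spans only kBlockM / kBlockH distinct sequence positions.
kv_head = 1
assert torch.equal(
    q_grouped[:, :, kv_head],
    q[:, :, kv_head * h_ratio:(kv_head + 1) * h_ratio],
)
```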
@@ -113,6 +236,16 @@ CUTLASS_HOST_DEVICE auto VarSeqLenTraits::get_gmem_layout( make_stride(m_stride, cute::_1{}, h_stride)); } +template <> +CUTLASS_HOST_DEVICE auto VarSeqLenTraits::get_gmem_layout( + int m, int k, int h_k, int b, int h_h_k_ratio, + int64_t m_stride, int64_t h_stride, int64_t b_stride, + bool padded) const { + return make_layout( + make_shape(sum_s + (padded ? kMaxTileSize * b : 0), k, h_k * h_h_k_ratio), + make_stride(m_stride, cute::_1{}, h_stride)); +} + // padded: only useful for var-seq-len for dq_accum and softmax_d. // When padded is True, use B_M + kMaxTileSize * B as the total B_M. template <> @@ -123,12 +256,6 @@ CUTLASS_HOST_DEVICE auto VarSeqLenTraits::get_lse_gmem_layout( make_stride(int64_t(sum_s + (padded ? kMaxTileSize * b : 0)), cute::_1())); } -template <> -CUTLASS_DEVICE void VarSeqLenTraits::init(int bidb) { - actual_seq_len = - seq_used ? seq_used[bidb] : (cu_seq_len[bidb + 1] - cu_seq_len[bidb]); -} - template <> template CUTLASS_DEVICE auto VarSeqLenTraits::get_local_tile_tensor( @@ -148,11 +275,33 @@ CUTLASS_DEVICE auto VarSeqLenTraits::get_local_tile_tensor( return g_tensor; } +// TODO: restructure to not duplicate code template <> -template +template +CUTLASS_DEVICE auto VarSeqLenTraits::get_o_local_tile_tensor( + const MTensor &m_tensor, const Shape &tile_shape, + int bidh, int bidb, int n_split_idx, bool padded) const { + static_assert(!Is_split, "Don't currently support split kv kernel with VarSeqLenTraits"); + auto g_offset = local_tile( + m_tensor(_, _, bidh), + cute::make_shape(1, get<1>(tile_shape)), + make_coord(cu_seq_len[bidb] + (padded ? kMaxTileSize * bidb : 0), _0{})); + auto g_sequence = make_tensor( + g_offset.data(), + make_layout( + cute::make_shape(actual_seq_len, get<1>(tile_shape)), + g_offset.stride() + )); + auto g_tensor = local_tile(g_sequence, tile_shape, make_coord(_, _0{})); + return g_tensor; +} + +template <> +template CUTLASS_DEVICE auto VarSeqLenTraits::get_lse_local_tile_tensor( const MTensor &m_tensor, const Shape &tile_shape, - int bidh, int bidb, bool padded) const { + int bidh, int bidb, int n_split_idx, bool padded) const { + static_assert(!Is_split, "Don't currently support split kv kernel with VarSeqLenTraits"); auto g_offset = local_tile( m_tensor(bidh, _), cute::make_shape(_1{}), make_coord(cu_seq_len[bidb] + (padded ? kMaxTileSize * bidb : 0))); @@ -163,6 +312,60 @@ CUTLASS_DEVICE auto VarSeqLenTraits::get_lse_local_tile_tensor( return g_tensor; } +// Returns layout of QO tensor in (M,H/HK,K,HK,B) format in global memory. +template <> +CUTLASS_HOST_DEVICE auto FixedGQASeqLenTraits::get_gmem_layout( + int m, int k, int h_k, int b, int h_h_k_ratio, + int64_t m_stride, int64_t h_stride, int64_t b_stride, bool padded) const { + return make_layout(make_shape(m, h_h_k_ratio, k, h_k, b), + make_stride(m_stride, h_stride, cute::_1{}, + h_stride * h_h_k_ratio, b_stride)); +} + +// Returns layout of Oaccum tensor in (M,H/HK,K,HK,B,T) format in global memory. 
+template <> +CUTLASS_HOST_DEVICE auto FixedGQASeqLenTraits::get_oaccum_gmem_layout( + int m, int k, int h_k, int b, int h_h_k_ratio, int num_splits, + int64_t m_stride, int64_t h_stride, int64_t b_stride, int64_t split_stride, + bool padded) const { + return make_layout(make_shape(m, h_h_k_ratio, k, h_k, b, num_splits), + make_stride(m_stride, h_stride, cute::_1{}, + h_stride * h_h_k_ratio, b_stride, + split_stride)); +} + +template <> +template +CUTLASS_DEVICE auto FixedGQASeqLenTraits::get_local_tile_tensor( + const MTensor &m_tensor, const Shape &tile_shape, + int bidh_kv, int bidb, bool padded) const { + // m_tensor has shape (M, H/H_K, K, H_K, B) + // Expect tile_shape (bM/bH, bH, K) + // Returns g_tensor of shape (bM/bH, bH, K, ceil_div(M,bM/bH), ceil_div(H/H_K,bH)) + auto g_tensor = local_tile( + m_tensor(_, _, _, bidh_kv, bidb), tile_shape, make_coord(_, _, _0{})); + return g_tensor; +} + +template <> +template +CUTLASS_DEVICE auto FixedGQASeqLenTraits::get_o_local_tile_tensor( + const MTensor &m_tensor, const Shape &tile_shape, + int bidh_kv, int bidb, int split_idx, bool padded) const { + // m_tensor has shape (M, H/H_K, K, H_K, B) or (M, H/H_K, K, H_K, B, splits) + // Expect tile_shape (bM/bH, bH, K) + // Returns g_tensor of shape (bM/bH, bH, K, ceil_div(M,bM/bH), ceil_div(H/H_K,bH)) + if constexpr(!Is_split) { + auto g_tensor = local_tile( + m_tensor(_, _, _, bidh_kv, bidb), tile_shape, make_coord(_, _, _0{})); + return g_tensor; + } else { + auto g_tensor = local_tile( + m_tensor(_, _, _, bidh_kv, bidb, split_idx), tile_shape, make_coord(_, _, _0{})); + return g_tensor; + } +} + //////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace flash diff --git a/hopper/setup.py b/hopper/setup.py index 029455a3e..f9f3cfd25 100644 --- a/hopper/setup.py +++ b/hopper/setup.py @@ -125,7 +125,52 @@ def append_nvcc_threads(nvcc_extra_args): "flash_bwd_hdim128_bf16_sm90.cu", "flash_fwd_hdim64_e4m3_sm90.cu", "flash_fwd_hdim128_e4m3_sm90.cu", - "flash_fwd_hdim256_e4m3_sm90.cu" + "flash_fwd_hdim256_e4m3_sm90.cu", + "flash_fwd_hdim64_fp16_gqa2_sm90.cu", + "flash_fwd_hdim64_fp16_gqa4_sm90.cu", + "flash_fwd_hdim64_fp16_gqa8_sm90.cu", + "flash_fwd_hdim64_fp16_gqa16_sm90.cu", + "flash_fwd_hdim64_fp16_gqa32_sm90.cu", + "flash_fwd_hdim128_fp16_gqa2_sm90.cu", + "flash_fwd_hdim128_fp16_gqa4_sm90.cu", + "flash_fwd_hdim128_fp16_gqa8_sm90.cu", + "flash_fwd_hdim128_fp16_gqa16_sm90.cu", + "flash_fwd_hdim128_fp16_gqa32_sm90.cu", + "flash_fwd_hdim256_fp16_gqa2_sm90.cu", + "flash_fwd_hdim256_fp16_gqa4_sm90.cu", + "flash_fwd_hdim256_fp16_gqa8_sm90.cu", + "flash_fwd_hdim256_fp16_gqa16_sm90.cu", + "flash_fwd_hdim256_fp16_gqa32_sm90.cu", + "flash_fwd_hdim64_bf16_gqa2_sm90.cu", + "flash_fwd_hdim64_bf16_gqa4_sm90.cu", + "flash_fwd_hdim64_bf16_gqa8_sm90.cu", + "flash_fwd_hdim64_bf16_gqa16_sm90.cu", + "flash_fwd_hdim64_bf16_gqa32_sm90.cu", + "flash_fwd_hdim128_bf16_gqa2_sm90.cu", + "flash_fwd_hdim128_bf16_gqa4_sm90.cu", + "flash_fwd_hdim128_bf16_gqa8_sm90.cu", + "flash_fwd_hdim128_bf16_gqa16_sm90.cu", + "flash_fwd_hdim128_bf16_gqa32_sm90.cu", + "flash_fwd_hdim256_bf16_gqa2_sm90.cu", + "flash_fwd_hdim256_bf16_gqa4_sm90.cu", + "flash_fwd_hdim256_bf16_gqa8_sm90.cu", + "flash_fwd_hdim256_bf16_gqa16_sm90.cu", + "flash_fwd_hdim256_bf16_gqa32_sm90.cu", + "flash_fwd_hdim64_e4m3_gqa2_sm90.cu", + "flash_fwd_hdim64_e4m3_gqa4_sm90.cu", + "flash_fwd_hdim64_e4m3_gqa8_sm90.cu", + "flash_fwd_hdim64_e4m3_gqa16_sm90.cu", + "flash_fwd_hdim64_e4m3_gqa32_sm90.cu", + 
"flash_fwd_hdim128_e4m3_gqa2_sm90.cu", + "flash_fwd_hdim128_e4m3_gqa4_sm90.cu", + "flash_fwd_hdim128_e4m3_gqa8_sm90.cu", + "flash_fwd_hdim128_e4m3_gqa16_sm90.cu", + "flash_fwd_hdim128_e4m3_gqa32_sm90.cu", + "flash_fwd_hdim256_e4m3_gqa2_sm90.cu", + "flash_fwd_hdim256_e4m3_gqa4_sm90.cu", + "flash_fwd_hdim256_e4m3_gqa8_sm90.cu", + "flash_fwd_hdim256_e4m3_gqa16_sm90.cu", + "flash_fwd_hdim256_e4m3_gqa32_sm90.cu", ] nvcc_flags = [ "-O3", diff --git a/hopper/static_switch.h b/hopper/static_switch.h index d9ec62224..57dcb8c5f 100644 --- a/hopper/static_switch.h +++ b/hopper/static_switch.h @@ -27,41 +27,30 @@ } \ }() -#define PREC_SWITCH(PRECTYPE, ...) \ +#define PREC_SWITCH(PRECTYPE, NAME, ...) \ [&] { \ - if (PRECTYPE == 1) { \ - using kPrecType = cutlass::half_t; \ - constexpr static bool kSoftFp16 = false; \ - constexpr static bool kHybrid = false; \ + if (PRECTYPE == 3) { \ + using NAME = cutlass::float_e4m3_t; \ return __VA_ARGS__(); \ } else if (PRECTYPE == 2) { \ - using kPrecType = cutlass::float_e4m3_t; \ - constexpr static bool kSoftFp16 = false; \ - constexpr static bool kHybrid = false; \ + using NAME = cutlass::bfloat16_t; \ return __VA_ARGS__(); \ - } else if (PRECTYPE == 3) { \ - using kPrecType = cutlass::float_e4m3_t; \ - constexpr static bool kSoftFp16 = false; \ - constexpr static bool kHybrid = true; \ - return __VA_ARGS__(); \ - } else if (PRECTYPE == 4) { \ - using kPrecType = cutlass::float_e4m3_t; \ - constexpr static bool kSoftFp16 = true; \ - constexpr static bool kHybrid = false; \ + } else { \ + using NAME = cutlass::half_t; \ return __VA_ARGS__(); \ } \ }() -#define HEADDIM_SWITCH(HEADDIM, ...) \ +#define HEADDIM_SWITCH(HEADDIM, CONST_NAME, ...) \ [&] { \ if (HEADDIM == 64) { \ - constexpr static int kHeadSize = 64; \ + constexpr static int CONST_NAME = 64; \ return __VA_ARGS__(); \ } else if (HEADDIM == 128) { \ - constexpr static int kHeadSize = 128; \ + constexpr static int CONST_NAME = 128; \ return __VA_ARGS__(); \ - } else if (HEADDIM == 256) { \ - constexpr static int kHeadSize = 256; \ + } else { \ + constexpr static int CONST_NAME = 256; \ return __VA_ARGS__(); \ } \ }() @@ -76,4 +65,94 @@ using NAME = flash::FixedSeqLenTraits; \ return __VA_ARGS__(); \ } \ - }() + }() + +#define SEQLEN_SWITCH_FWD(VAR_SEQ_LEN_Q, SEQ_USED_K, NAME_Q, NAME_K, ...) \ + [&] { \ + bool useVarSeqLenQ = VAR_SEQ_LEN_Q; \ + bool useSeqUsedK = SEQ_USED_K; \ + if (useVarSeqLenQ) { \ + using NAME_Q = flash::VarSeqLenTraits; \ + using NAME_K = flash::VarSeqLenTraits; \ + return __VA_ARGS__(); \ + } else if (useSeqUsedK) { \ + using NAME_Q = flash::FixedSeqLenTraits; \ + using NAME_K = flash::FixedSeqLenTraitsDynamic; \ + return __VA_ARGS__(); \ + } else { \ + using NAME_Q = flash::FixedSeqLenTraits; \ + using NAME_K = flash::FixedSeqLenTraits; \ + return __VA_ARGS__(); \ + } \ + }() + +#define QUERYHEAD_SWITCH(QUERYHEADS, CONST_NAME, ...) \ + [&] { \ + if (QUERYHEADS <= 2) { \ + constexpr static int CONST_NAME = 2; \ + return __VA_ARGS__(); \ + } else if (QUERYHEADS <= 4) { \ + constexpr static int CONST_NAME = 4; \ + return __VA_ARGS__(); \ + } else if (QUERYHEADS <= 8) { \ + constexpr static int CONST_NAME = 8; \ + return __VA_ARGS__(); \ + } else if (QUERYHEADS <= 16) { \ + constexpr static int CONST_NAME = 16; \ + return __VA_ARGS__(); \ + } else { \ + constexpr static int CONST_NAME = 32; \ + return __VA_ARGS__(); \ + } \ + }() + +#define MMA_3WG_SWITCH(QLEN, CONST_NAME, ...) 
\ + [&] { \ + if (QLEN <= 64) { \ + constexpr static int CONST_NAME = 1; \ + return __VA_ARGS__(); \ + } else if (QLEN <= 128) { \ + constexpr static int CONST_NAME = 2; \ + return __VA_ARGS__(); \ + } else { \ + constexpr static int CONST_NAME = 3; \ + return __VA_ARGS__(); \ + } \ + }() + +#define MMA_2WG_SWITCH(QLEN, CONST_NAME, ...) \ + [&] { \ + if (QLEN <= 64) { \ + constexpr static int CONST_NAME = 1; \ + return __VA_ARGS__(); \ + } else { \ + constexpr static int CONST_NAME = 2; \ + return __VA_ARGS__(); \ + } \ + }() + +#define NUM_SPLITS_SWITCH(NUM_SPLITS, LOG_MAX_SPLITS, ...) \ + [&] { \ + if (NUM_SPLITS <= 2) { \ + constexpr static int LOG_MAX_SPLITS = 1; \ + return __VA_ARGS__(); \ + } else if (NUM_SPLITS <= 4) { \ + constexpr static int LOG_MAX_SPLITS = 2; \ + return __VA_ARGS__(); \ + } else if (NUM_SPLITS <= 8) { \ + constexpr static int LOG_MAX_SPLITS = 3; \ + return __VA_ARGS__(); \ + } else if (NUM_SPLITS <= 16) { \ + constexpr static int LOG_MAX_SPLITS = 4; \ + return __VA_ARGS__(); \ + } else if (NUM_SPLITS <= 32) { \ + constexpr static int LOG_MAX_SPLITS = 5; \ + return __VA_ARGS__(); \ + } else if (NUM_SPLITS <= 64) { \ + constexpr static int LOG_MAX_SPLITS = 6; \ + return __VA_ARGS__(); \ + } else { \ + constexpr static int LOG_MAX_SPLITS = 7; \ + return __VA_ARGS__(); \ + } \ + }() diff --git a/hopper/test_attn_kvcache.py b/hopper/test_attn_kvcache.py new file mode 100644 index 000000000..726d44ce3 --- /dev/null +++ b/hopper/test_attn_kvcache.py @@ -0,0 +1,486 @@ +import pytest +from einops import rearrange, repeat +import torch +import flash_attn +import flash_attn_interface +import itertools +import math +import time + +def construct_local_mask( + seqlen_q, + seqlen_k, + window_size=(-1, -1), # -1 means infinite window size + query_padding_mask=None, + key_padding_mask=None, + device=None, + key_leftpad=None, +): + row_idx = rearrange(torch.arange(seqlen_q, device=device, dtype=torch.long), "s -> s 1") + col_idx = torch.arange(seqlen_k, device=device, dtype=torch.long) + if key_leftpad is not None: + key_leftpad = rearrange(key_leftpad, "b -> b 1 1 1") + col_idx = repeat(col_idx, "s -> b 1 1 s", b=key_leftpad.shape[0]) + col_idx = torch.where(col_idx >= key_leftpad, col_idx - key_leftpad, 2**32) + sk = ( + seqlen_k + if key_padding_mask is None + else rearrange(key_padding_mask.sum(-1), "b -> b 1 1 1") + ) + sq = ( + seqlen_q + if query_padding_mask is None + else rearrange(query_padding_mask.sum(-1), "b -> b 1 1 1") + ) + if window_size[0] < 0: + return col_idx > row_idx + sk - sq + window_size[1] + else: + sk = torch.full_like(col_idx, seqlen_k) if key_padding_mask is None else sk + return torch.logical_or( + col_idx > torch.minimum(row_idx + sk - sq + window_size[1], sk), + col_idx < row_idx + sk - sq - window_size[0], + ) + + +def attention_ref( + q, + k, + v, + query_padding_mask=None, + key_padding_mask=None, + attn_bias=None, + dropout_p=0.0, + dropout_mask=None, + causal=False, + window_size=(-1, -1), # -1 means infinite window size + softcap=0.0, + upcast=True, + reorder_ops=False, + key_leftpad=None, +): + """ + Arguments: + q: (batch_size, seqlen_q, nheads, head_dim) + k: (batch_size, seqlen_k, nheads_k, head_dim) + v: (batch_size, seqlen_k, nheads_k, head_dim) + query_padding_mask: (batch_size, seqlen_q) + key_padding_mask: (batch_size, seqlen_k) + attn_bias: broadcastable to (batch_size, nheads, seqlen_q, seqlen_k) + dropout_p: float + dropout_mask: (batch_size, nheads, seqlen_q, seqlen_k) + causal: whether to apply causal masking + window_size: 
(int, int), left and right window size + upcast: whether to cast all inputs to fp32, do all computation in fp32, then cast + output back to fp16/bf16. + reorder_ops: whether to change the order of operations (scaling k instead of scaling q, etc.) + without changing the math. This is to estimate the numerical error from operation + reordering. + Output: + output: (batch_size, seqlen_q, nheads, head_dim) + attention: (batch_size, nheads, seqlen_q, seqlen_k), softmax after dropout + """ + if causal: + window_size = (window_size[0], 0) + dtype_og = q.dtype + if upcast: + q, k, v = q.float(), k.float(), v.float() + seqlen_q, seqlen_k = q.shape[1], k.shape[1] + k = repeat(k, "b s h d -> b s (h g) d", g=q.shape[2] // k.shape[2]) + v = repeat(v, "b s h d -> b s (h g) d", g=q.shape[2] // v.shape[2]) + d = q.shape[-1] + if not reorder_ops: + scores = torch.einsum("bthd,bshd->bhts", q / math.sqrt(d), k) + else: + scores = torch.einsum("bthd,bshd->bhts", q, k / math.sqrt(d)) + if softcap > 0: + scores = scores / softcap + scores = scores.tanh() + scores = scores * softcap + if key_padding_mask is not None: + scores.masked_fill_(rearrange(~key_padding_mask, "b s -> b 1 1 s"), float("-inf")) + if window_size[0] >= 0 or window_size[1] >= 0: + local_mask = construct_local_mask( + seqlen_q, + seqlen_k, + window_size, + query_padding_mask, + key_padding_mask, + q.device, + key_leftpad=key_leftpad, + ) + scores.masked_fill_(local_mask, float("-inf")) + if attn_bias is not None: + scores = scores + attn_bias + attention = torch.softmax(scores, dim=-1).to(v.dtype) + # Some rows might be completely masked out so we fill them with zero instead of NaN + if window_size[0] >= 0 or window_size[1] >= 0: + attention = attention.masked_fill(torch.all(local_mask, dim=-1, keepdim=True), 0.0) + # We want to mask here so that the attention matrix doesn't have any NaNs + # Otherwise we'll get NaN in dV + if query_padding_mask is not None: + attention = attention.masked_fill(rearrange(~query_padding_mask, "b s -> b 1 s 1"), 0.0) + dropout_scaling = 1.0 / (1 - dropout_p) + # attention_drop = attention.masked_fill(~dropout_mask, 0.0) * dropout_scaling + # output = torch.einsum('bhts,bshd->bthd', attention_drop , v) + if dropout_mask is not None: + attention_drop = attention.masked_fill(~dropout_mask, 0.0) + else: + attention_drop = attention + output = torch.einsum("bhts,bshd->bthd", attention_drop, v * dropout_scaling) + if query_padding_mask is not None: + output.masked_fill_(rearrange(~query_padding_mask, "b s -> b s 1 1"), 0.0) + return output.to(dtype=dtype_og), attention.to(dtype=dtype_og) + + +@pytest.mark.parametrize("causal", [True, False]) +@pytest.mark.parametrize("num_requests", [1, 4]) +@pytest.mark.parametrize("query_seqlen", [1, 8, 120]) +@pytest.mark.parametrize("context_seqlen", [1024, 3131, 4224]) +@pytest.mark.parametrize("headdim", [64, 128, 256]) +@pytest.mark.parametrize("gqa_parallel", [False, True]) +@pytest.mark.parametrize( + "nheads_kv, gqa_ratio", + [ + (1, 1), + (2, 5), + (3, 3), + (1, 32), + (5, 7), + (8, 1), + (1, 16), + (12, 4), + (8, 2), + ], +) +def test_flash_attn_kvcache_nosplit(nheads_kv, gqa_ratio, num_requests, query_seqlen, context_seqlen, headdim, causal, gqa_parallel): + device = "cuda" + num_caches = num_requests + cache_seqlen = context_seqlen + nheads_q = nheads_kv * gqa_ratio + + k_cache = torch.randn( + (num_caches, cache_seqlen, nheads_kv, headdim), device="cuda", dtype=torch.bfloat16 + ) + v_cache = torch.randn( + (num_caches, cache_seqlen, nheads_kv, headdim), device="cuda", 
dtype=torch.bfloat16 + ) + q = torch.randn((num_requests, query_seqlen, nheads_q, headdim), device="cuda", dtype=torch.bfloat16) + # cache_idxs = torch.randperm(num_caches, dtype=torch.int32, device="cuda")[:num_requests] + cache_seqlens = torch.tensor([context_seqlen] * num_requests, dtype=torch.int32, device="cuda") + torch.cuda.synchronize() + + out_ref, _ = attention_ref( + q, + k_cache, + v_cache, + causal=causal, + ) + + out_fa3, lse_fa3 = flash_attn_interface.flash_attn_with_kvcache( + q=q, + k_cache=k_cache, + v_cache=v_cache, + cache_seqlens=cache_seqlens, + # cache_batch_idx=cache_idxs, + causal=causal, + num_splits=1, + return_softmax_lse=True, + gqa_parallel=gqa_parallel + ) + + + torch.cuda.synchronize() + assert ((out_ref - out_fa3).abs().max().item() <= 4e-3) + assert ((out_ref - out_fa3).abs().mean().item() <= 2e-4) + + +@pytest.mark.parametrize("causal", [True, False]) +@pytest.mark.parametrize("num_requests", [1, 3]) +@pytest.mark.parametrize("query_seqlen", [1, 8, 120]) +@pytest.mark.parametrize("context_seqlen", [1600, 4000, 5555]) +@pytest.mark.parametrize("headdim", [64, 128, 256]) +@pytest.mark.parametrize("gqa_parallel", [True, False]) +@pytest.mark.parametrize( + "nheads_kv, gqa_ratio", + [ + (1, 1), + (2, 5), + (3, 3), + (1, 32), + (5, 7), + (8, 1), + (1, 16), + (12, 4), + (8, 2), + ], +) +def test_flash_attn_kvcache_nosplit_fp8(nheads_kv, gqa_ratio, num_requests, query_seqlen, context_seqlen, headdim, causal, gqa_parallel): + device = "cuda" + num_caches = num_requests + cache_seqlen = context_seqlen + nheads_q = nheads_kv * gqa_ratio + + k_cache = torch.randn( + (num_caches, cache_seqlen, nheads_kv, headdim), device="cuda", dtype=torch.bfloat16 + ) + v_cache = torch.randn( + (num_caches, cache_seqlen, nheads_kv, headdim), device="cuda", dtype=torch.bfloat16 + ) + q = torch.randn((num_requests, query_seqlen, nheads_q, headdim), device="cuda", dtype=torch.bfloat16) + q = q.to(torch.float8_e4m3fn) + k_cache = k_cache.to(torch.float8_e4m3fn) + v_cache = v_cache.to(torch.float8_e4m3fn) + # cache_idxs = torch.randperm(num_caches, dtype=torch.int32, device="cuda")[:num_requests] + cache_seqlens = torch.tensor([context_seqlen] * num_requests, dtype=torch.int32, device="cuda") + torch.cuda.synchronize() + + out_ref, _ = attention_ref( + q, + k_cache, + v_cache, + causal=causal, + ) + + descale_q = torch.tensor([1.0], dtype=torch.float32, device='cuda') + descale_k = torch.tensor([1.0], dtype=torch.float32, device='cuda') + descale_v = torch.tensor([1.0], dtype=torch.float32, device='cuda') + out_fa3, lse_fa3 = flash_attn_interface.flash_attn_with_kvcache( + q=q, + k_cache=k_cache, + v_cache=v_cache, + cache_seqlens=cache_seqlens, + # cache_batch_idx=cache_idxs, + causal=causal, + num_splits=1, + return_softmax_lse=True, + gqa_parallel=gqa_parallel, + descale_q=descale_q, descale_k=descale_k, descale_v=descale_v + ) + + + torch.cuda.synchronize() + assert ((out_ref - out_fa3).abs().max().item() <= 4e-2) + assert ((out_ref - out_fa3).abs().mean().item() <= 2e-3) + +@pytest.mark.parametrize("dtype", [torch.bfloat16]) +@pytest.mark.parametrize("use_heuristic_only", [True]) +# @pytest.mark.parametrize("use_heuristic_only", [False]) +@pytest.mark.parametrize("causal", [True, False]) +# @pytest.mark.parametrize("num_requests", [1, 4, 16]) +@pytest.mark.parametrize("num_requests", [1, 3]) +# @pytest.mark.parametrize("query_seqlen", [1, 16, 32, 128]) +@pytest.mark.parametrize("query_seqlen", [1, 8, 25]) +# @pytest.mark.parametrize("context_seqlen", [4096, 16384, 65536]) 
+@pytest.mark.parametrize("context_seqlen", [1600, 4000, 5555]) +@pytest.mark.parametrize("headdim", [64, 128, 256]) +@pytest.mark.parametrize("cache_seqlen_rand", [True, False]) +@pytest.mark.parametrize("gqa_parallel", [True, False]) +@pytest.mark.parametrize( + "nheads_kv, gqa_ratio", + [ + (1, 1), + (4, 1), + (2, 2), + (3, 3), + (4, 4), + (2, 5), + (3, 9), + (1, 16), + (1, 32), + ], +) +def test_flash_attn_kvcache_output(nheads_kv, gqa_ratio, num_requests, query_seqlen, context_seqlen, headdim, causal, use_heuristic_only, cache_seqlen_rand, gqa_parallel, dtype): + device = "cuda" + num_caches = 16 + if context_seqlen <= 65536: + cache_seqlen = 65536 + else: + cache_seqlen = context_seqlen + nheads_q = nheads_kv * gqa_ratio + if use_heuristic_only: + max_splits = 1 + else: + max_splits = 128 + + k_cache = torch.randn( + (num_caches, cache_seqlen, nheads_kv, headdim), device="cuda", dtype=torch.bfloat16 + ) + v_cache = torch.randn( + (num_caches, cache_seqlen, nheads_kv, headdim), device="cuda", dtype=torch.bfloat16 + ) + q = torch.randn((num_requests, query_seqlen, nheads_q, headdim), device="cuda", dtype=torch.bfloat16) + + q = q.to(dtype) + k_cache = k_cache.to(dtype) + v_cache = v_cache.to(dtype) + cache_idxs = torch.randperm(num_caches, dtype=torch.int32, device="cuda")[:num_requests] + cache_seqlens = torch.randint(1, context_seqlen-1, (num_requests,), dtype=torch.int32).to(device) if cache_seqlen_rand else torch.tensor([context_seqlen] * num_requests, dtype=torch.int32, device="cuda") + torch.cuda.synchronize() + + out_ref, lse_ref = flash_attn_interface.flash_attn_with_kvcache( + q=q, + k_cache=k_cache, + v_cache=v_cache, + cache_seqlens=cache_seqlens, + cache_batch_idx=cache_idxs, + causal=causal, + num_splits=1, + return_softmax_lse=True, + gqa_parallel=False + ) + + # i=0 case is with num splits heuristic + for i in range(0, max_splits+1): + out_fa3, lse_fa3 = flash_attn_interface.flash_attn_with_kvcache( + q=q, + k_cache=k_cache, + v_cache=v_cache, + cache_seqlens=cache_seqlens, + cache_batch_idx=cache_idxs, + causal=causal, + num_splits=i, + return_softmax_lse=True, + gqa_parallel=gqa_parallel, + max_seqlen_k_hint=context_seqlen + ) + + torch.cuda.synchronize() + print ('output-ref', i, out_ref) + print ('output-fa3',i, out_fa3) + print ('output-max-diff', i, context_seqlen, (out_ref - out_fa3).abs().max().item()) + print ('output-mean-diff',i, context_seqlen, (out_ref - out_fa3).abs().mean().item()) + print ('lse-max-diff',i, context_seqlen, (lse_ref - lse_fa3).abs().max().item()) + print ('lse-mean-diff',i, context_seqlen, (lse_ref - lse_fa3).abs().mean().item()) + + if cache_seqlen_rand: + assert ((out_ref - out_fa3).abs().max().item() <= 1e-2) + assert ((out_ref - out_fa3).abs().mean().item() <= 1e-3) + else: + assert ((out_ref - out_fa3).abs().max().item() <= 2e-3) + assert ((out_ref - out_fa3).abs().mean().item() <= 1e-4) + lse_max_ref = lse_ref.abs().max().item() + lse_mean_ref = lse_ref.abs().mean().item() + lse_max_fa3 = lse_fa3.abs().max().item() + lse_mean_fa3 = lse_fa3.abs().mean().item() + lse_max_diff = (lse_ref - lse_fa3).abs().max().item() + lse_mean_diff = (lse_ref - lse_fa3).abs().mean().item() + assert ((lse_max_ref == math.inf and lse_max_fa3 == math.inf) or lse_max_diff <= 1e-3) + assert ((lse_mean_ref == math.inf and lse_mean_fa3 == math.inf) or lse_mean_diff <= 1e-4) + + + +@pytest.mark.parametrize("dtype", [torch.float8_e4m3fn]) +@pytest.mark.parametrize("use_heuristic_only", [True]) +# @pytest.mark.parametrize("use_heuristic_only", [False]) 
+@pytest.mark.parametrize("causal", [True, False]) +# @pytest.mark.parametrize("num_requests", [1, 4, 16]) +@pytest.mark.parametrize("num_requests", [1, 3]) +# @pytest.mark.parametrize("query_seqlen", [1, 16, 32, 128]) +@pytest.mark.parametrize("query_seqlen", [1, 8, 25]) +# @pytest.mark.parametrize("context_seqlen", [4096, 16384, 65536]) +@pytest.mark.parametrize("context_seqlen", [1600, 4000, 5555]) +@pytest.mark.parametrize("headdim", [64, 128, 256]) +@pytest.mark.parametrize("cache_seqlen_rand", [True, False]) +@pytest.mark.parametrize("gqa_parallel", [True, False]) +@pytest.mark.parametrize( + "nheads_kv, gqa_ratio", + [ + (1, 1), + (4, 1), + (2, 2), + (3, 3), + (4, 4), + (2, 5), + (3, 9), + (1, 16), + (1, 32), + ], +) +def test_flash_attn_kvcache_output_fp8(nheads_kv, gqa_ratio, num_requests, query_seqlen, context_seqlen, headdim, causal, use_heuristic_only, cache_seqlen_rand, gqa_parallel, dtype): + device = "cuda" + num_caches = 16 + if context_seqlen <= 65536: + cache_seqlen = 65536 + else: + cache_seqlen = context_seqlen + nheads_q = nheads_kv * gqa_ratio + if use_heuristic_only: + max_splits = 1 + else: + max_splits = 128 + + k_cache = torch.randn( + (num_caches, cache_seqlen, nheads_kv, headdim), device="cuda", dtype=torch.bfloat16 + ) + v_cache = torch.randn( + (num_caches, cache_seqlen, nheads_kv, headdim), device="cuda", dtype=torch.bfloat16 + ) + q = torch.randn((num_requests, query_seqlen, nheads_q, headdim), device="cuda", dtype=torch.bfloat16) + + q = q.to(dtype) + k_cache = k_cache.to(dtype) + v_cache = v_cache.to(dtype) + cache_idxs = torch.randperm(num_caches, dtype=torch.int32, device="cuda")[:num_requests] + cache_seqlens = torch.randint(1, context_seqlen-1, (num_requests,), dtype=torch.int32).to(device) if cache_seqlen_rand else torch.tensor([context_seqlen] * num_requests, dtype=torch.int32, device="cuda") + torch.cuda.synchronize() + + + descale_q = torch.tensor([1.0], dtype=torch.float32, device='cuda') + descale_k = torch.tensor([1.0], dtype=torch.float32, device='cuda') + descale_v = torch.tensor([1.0], dtype=torch.float32, device='cuda') + + out_ref, lse_ref = flash_attn_interface.flash_attn_with_kvcache( + q=q, + k_cache=k_cache, + v_cache=v_cache, + cache_seqlens=cache_seqlens, + cache_batch_idx=cache_idxs, + causal=causal, + num_splits=1, + return_softmax_lse=True, + gqa_parallel=False, + descale_q=descale_q, descale_k=descale_k, descale_v=descale_v + ) + + # i=0 case is with num splits heuristic + for i in range(0, max_splits+1): + out_fa3, lse_fa3 = flash_attn_interface.flash_attn_with_kvcache( + q=q, + k_cache=k_cache, + v_cache=v_cache, + cache_seqlens=cache_seqlens, + cache_batch_idx=cache_idxs, + causal=causal, + num_splits=i, + return_softmax_lse=True, + gqa_parallel=gqa_parallel, + max_seqlen_k_hint=context_seqlen, + descale_q=descale_q, descale_k=descale_k, descale_v=descale_v + ) + + torch.cuda.synchronize() + print ('output-ref', i, out_ref) + print ('output-fa3',i, out_fa3) + print ('output-max-diff', i, context_seqlen, (out_ref - out_fa3).abs().max().item()) + print ('output-mean-diff',i, context_seqlen, (out_ref - out_fa3).abs().mean().item()) + print ('lse-max-diff',i, context_seqlen, (lse_ref - lse_fa3).abs().max().item()) + print ('lse-mean-diff',i, context_seqlen, (lse_ref - lse_fa3).abs().mean().item()) + + if cache_seqlen_rand: + assert ((out_ref - out_fa3).abs().max().item() <= 1e-1) + assert ((out_ref - out_fa3).abs().mean().item() <= 1e-2) + else: + assert ((out_ref - out_fa3).abs().max().item() <= 2e-2) + assert ((out_ref - 
out_fa3).abs().mean().item() <= 2e-3) + lse_max_ref = lse_ref.abs().max().item() + lse_mean_ref = lse_ref.abs().mean().item() + lse_max_fa3 = lse_fa3.abs().max().item() + lse_mean_fa3 = lse_fa3.abs().mean().item() + lse_max_diff = (lse_ref - lse_fa3).abs().max().item() + lse_mean_diff = (lse_ref - lse_fa3).abs().mean().item() + assert ((lse_max_ref == math.inf and lse_max_fa3 == math.inf) or lse_max_diff <= 1e-3) + assert ((lse_mean_ref == math.inf and lse_mean_fa3 == math.inf) or lse_mean_diff <= 1e-4) + + +if __name__ == "__main__": + main() diff --git a/hopper/test_flash_attn.py b/hopper/test_flash_attn.py index dc8b35baf..bc3f6e5a7 100644 --- a/hopper/test_flash_attn.py +++ b/hopper/test_flash_attn.py @@ -23,24 +23,153 @@ def print_diffs(out, out_ref): print(f"==== diff ==== {idx}, test: {e_o}, ref: {e_o_ref}") +@pytest.mark.parametrize("dtype", [torch.float8_e4m3fn]) +@pytest.mark.parametrize("mha_type", ["mha", "mqa", "gqa"]) +@pytest.mark.parametrize("causal", [False, True]) +@pytest.mark.parametrize("local", [False, True]) +@pytest.mark.parametrize("deterministic", [True]) +@pytest.mark.parametrize("gqa_parallel", [False, True]) +@pytest.mark.parametrize("d", [64, 128, 256]) +# @pytest.mark.parametrize("descale", [1.0]) +@pytest.mark.parametrize("descale", [1.0, 2.0, 3.0]) +@pytest.mark.parametrize( + "seqlen_q,seqlen_k", + [ + (1, 1), + (64, 128), + (128, 128), + (256, 256), + (113, 203), + (128, 217), + (113, 211), + (108, 256), + (256, 512), + (384, 256), + (640, 128), + (512, 256), + (1024, 1024), + (1023, 1024), + (1024, 1023), + (4096, 4096), + (4224, 4224), + ], +) +def test_flash_attn_output_fp8( + seqlen_q, seqlen_k, d, causal, local, deterministic, mha_type, dtype, descale, gqa_parallel +): + device = "cuda" + dtype_init = torch.bfloat16 + print(dtype) + print('causal',causal) + print('local',local) + print('gqa_parallel',gqa_parallel) + # set seed + torch.random.manual_seed(42) + # batch_size = 40 + # nheads = 16 + batch_size = 4 + nheads = 6 + nheads_kv = 6 if mha_type == "mha" else (2 if mha_type == "gqa" else 1) + # nheads_kv = 1 + # batch_size = 9 + # nheads = 6 + window_size = (-1, -1) if not local else torch.randint(0, seqlen_k, (2,)) + q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype_init, requires_grad=True) + k = torch.randn(batch_size, seqlen_k, nheads_kv, d, device=device, dtype=dtype_init, requires_grad=True) + v = torch.randn(batch_size, seqlen_k, nheads_kv, d, device=device, dtype=dtype_init, requires_grad=True) + + q = q.to(dtype) + k = k.to(dtype) + v = v.to(dtype) + + softmax_scale = q.shape[-1] ** (-0.5) + descale_q = torch.tensor([descale], dtype=torch.float32, device='cuda') + descale_k = torch.tensor([descale], dtype=torch.float32, device='cuda') + descale_v = torch.tensor([descale], dtype=torch.float32, device='cuda') + + out, lse = flash_attn_func(q, k, v, causal=causal, window_size=window_size, deterministic=deterministic, gqa_parallel=gqa_parallel, + descale_q=descale_q, descale_k=descale_k, descale_v=descale_v) + + q = q.to(dtype_init) + k = k.to(dtype_init) + v = v.to(dtype_init) + + descale_q = descale_q.to(dtype_init) + descale_k = descale_k.to(dtype_init) + descale_v = descale_v.to(dtype_init) + q = q * descale_q + k = k * descale_k + v = v * descale_v + + out_ref, attn_ref = attention_ref( + q, + k, + v, + None, + None, + causal=causal, + window_size=window_size, + ) + out_pt, attn_pt = attention_ref( + q, + k, + v, + None, + None, + causal=causal, + window_size=window_size, + upcast=False, + reorder_ops=True, + ) 
+ + # qk = torch.einsum('bshd,bthd->bhst', q, k).float() + # m = qk.amax(-1, keepdim=True) + # s_tmp = torch.exp((qk - m) / math.sqrt(d)) + # exp_sum = s_tmp.sum(-1) + # qk = torch.einsum('bthd,bshd->bhts', q.float() / math.sqrt(d), k.float()) + # lse_ref = torch.logsumexp(qk, dim=-1) + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}") + print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}") + + # if not causal: + # print(f"LSE max diff: {(lse - lse_ref).abs().max().item()}") + # breakpoint() + + # dS = torch.einsum('bthd,bshd->bhts', g.float(), v.float()) + # P = torch.softmax(qk, -1) + # dP = P * (dS - do_o.unsqueeze(1)) + # dQ = torch.einsum('bhts,bshd->bthd', dP, k.float()) + # dV = torch.einsum('bhts,bthd->bshd', P, g.float()) + # dK = torch.einsum('bhts,bthd->bshd', dP, q.float()) + # breakpoint() + + # assert (out - out_ref).abs().max().item() <= 4 * (out_pt - out_ref).abs().max().item() + 1e-2 + atol = 4 * (out_pt - out_ref).abs().max().item() + 1e-2 + torch.testing.assert_close(out, out_ref, rtol=1e-2, atol=atol, check_dtype=False) + + @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) -# @pytest.mark.parametrize("dtype", [torch.float16]) # @pytest.mark.parametrize("dtype", [torch.float8_e4m3fn]) @pytest.mark.parametrize("mha_type", ["mha", "mqa", "gqa"]) # @pytest.mark.parametrize("mha_type", ["mha"]) @pytest.mark.parametrize("causal", [False, True]) -# @pytest.mark.parametrize("causal", [True]) +# @pytest.mark.parametrize("causal", [False]) @pytest.mark.parametrize("local", [False, True]) # @pytest.mark.parametrize("local", [True]) @pytest.mark.parametrize("deterministic", [False, True]) # @pytest.mark.parametrize("deterministic", [True]) +@pytest.mark.parametrize("gqa_parallel", [False, True]) +# @pytest.mark.parametrize("gqa_parallel", [False]) # @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256]) # @pytest.mark.parametrize('d', [32, 40, 64, 80, 96, 128, 160, 192]) # @pytest.mark.parametrize('d', [32, 64, 96, 128, 160, 192]) # @pytest.mark.parametrize('d', [56, 80]) # @pytest.mark.parametrize("d", [64, 128, 256]) # @pytest.mark.parametrize("d", [64, 96, 128]) -# @pytest.mark.parametrize("d", [256]) +# @pytest.mark.parametrize("d", [64]) @pytest.mark.parametrize("d", [64, 128, 256]) @pytest.mark.parametrize("descale", [1.0]) # @pytest.mark.parametrize("descale", [1.0, 2.0, 3.0, 4.0]) @@ -48,7 +177,6 @@ def print_diffs(out, out_ref): "seqlen_q,seqlen_k", [ (1, 1), - # (257, 1), (64, 128), (128, 128), (256, 256), @@ -64,26 +192,30 @@ def print_diffs(out, out_ref): (1023, 1024), (1024, 1023), (4096, 4096), + (4224, 4224), ], ) # @pytest.mark.parametrize('seqlen_q,seqlen_k', [(128, 128)]) def test_flash_attn_output( - seqlen_q, seqlen_k, d, causal, local, deterministic, mha_type, dtype, descale + seqlen_q, seqlen_k, d, causal, local, deterministic, mha_type, dtype, descale, gqa_parallel ): device = "cuda" if(dtype == torch.float8_e4m3fn): - dtype_init = torch.float16 + dtype_init = torch.bfloat16 else: dtype_init = dtype print(dtype) + print('causal',causal) + print('local',local) + print('gqa_parallel',gqa_parallel) # set seed - torch.random.manual_seed(0) + torch.random.manual_seed(42) # batch_size = 40 # nheads = 16 batch_size = 4 nheads = 6 nheads_kv = 6 if mha_type == "mha" else (2 if mha_type == "gqa" else 1) - # nheads_kv = 2 + # nheads_kv = 1 # batch_size = 9 # nheads = 6 
window_size = (-1, -1) if not local else torch.randint(0, seqlen_k, (2,)) @@ -99,12 +231,12 @@ def test_flash_attn_output( descale_q = torch.tensor([descale], dtype=torch.float32, device='cuda') descale_k = torch.tensor([descale], dtype=torch.float32, device='cuda') descale_v = torch.tensor([descale], dtype=torch.float32, device='cuda') + if(dtype != torch.float8_e4m3fn): - out, lse = flash_attn_func(q, k, v, causal=causal, window_size=window_size, deterministic=deterministic) + out, lse = flash_attn_func(q, k, v, causal=causal, window_size=window_size, deterministic=deterministic, gqa_parallel=gqa_parallel) else: - out, q, k, v, out_padded, lse, S_dmask = _flash_attn_forward( - q, k, v, softmax_scale, causal, descale_q=descale_q, descale_k=descale_k, descale_v=descale_v - ) + out, lse = flash_attn_func(q, k, v, causal=causal, window_size=window_size, deterministic=deterministic, gqa_parallel=gqa_parallel, + descale_q=descale_q, descale_k=descale_k, descale_v=descale_v) q = q.to(dtype_init) k = k.to(dtype_init) @@ -189,7 +321,7 @@ def test_flash_attn_output( assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item() + 3e-5 else: # just test correctness of fp8 kernel w/o further quantization techniques - assert (out - out_ref).abs().max().item() <= 40 * (out_pt - out_ref).abs().max().item() + assert (out - out_ref).abs().max().item() <= 4 * (out_pt - out_ref).abs().max().item() + 2e-2 if d <= 128 and dtype != torch.float8_e4m3fn: assert (dq - dq_ref).abs().max().item() <= 2 * (dq_pt - dq_ref).abs().max().item() + 3e-5 diff --git a/hopper/test_kvcache.py b/hopper/test_kvcache.py new file mode 100644 index 000000000..7764bd5d5 --- /dev/null +++ b/hopper/test_kvcache.py @@ -0,0 +1,234 @@ +import torch +#from flash_attn_interface import flash_attn_func, flash_attn_varlen_func, flash_attn_with_kvcache +import flash_attn_interface as fa3 +import flash_attn as fa2 +import torch.utils.benchmark as benchmark +import time + +import argparse +import math + +parser = argparse.ArgumentParser(description='Process some integers.') +parser.add_argument('--causal', action='store_true') +parser.add_argument('--splits', type=int, default=1) +parser.add_argument('--repeats', type=int, default=10) +parser.add_argument('--validate', action='store_true') +parser.add_argument('--gqa', action='store_true') + +args = parser.parse_args() + +def benchmark_fa_kv_old(fn, repeats=10, desc='', verbose=True, **kwinputs): + """Use Pytorch Benchmark on the forward pass of an arbitrary function.""" + if verbose: + print(desc, '- Forward pass') + t = benchmark.Timer( + stmt='fn(**kwinputs)', + globals={'fn': fn, 'kwinputs': kwinputs}, + num_threads=torch.get_num_threads(), + ) + m = t.timeit(repeats) + if verbose: + print(desc, m) + return t, m + +def benchmark_fa_kv(fn, repeats=10, *args, **kwargs): + # warmup + for _ in range(5): + fn(*args, **kwargs) + niters = repeats + torch.cuda.synchronize() + start = time.time() + for _ in range(niters): + fn(*args, **kwargs) + torch.cuda.synchronize() + end = time.time() + return (end - start) / niters + +def main(): + # *SAMPLE CONFIG* + # Model arch params: + nheads_q = 64 + nheads_kv = 8 + headdim = 128 + #dtype = torch.bfloat16 + dtype = torch.float16 + + # Cache settings: + num_caches = 8 + cache_seqlen = 1024 * 16 + + # Batching settings + ntokens = 1024 + max_queries_per_batch = 4 + small_request_ntokens = 16 + + # Input settings + query_seqlens = [900, 12, 1] + num_queries = len(query_seqlens) + # Need to add empty queries to fill out 
`max_queries_per_batch` + num_padding_queries = max_queries_per_batch - num_queries + context_seqlens = [4096, 5120*2, 6145*2] + #context_seqlens = [4096, 5120*2, 6152*2] + + # Validation + assert sum(query_seqlens) <= ntokens + assert all(s < small_request_ntokens for s in query_seqlens[1:]) + assert num_queries <= max_queries_per_batch + assert all(s < cache_seqlen for s in context_seqlens) + + torch.manual_seed(5434) + + # Allocate some tensors + k_cache = torch.randn( + (num_caches, cache_seqlen, nheads_kv, headdim), device="cuda", dtype=dtype + ) + v_cache = torch.randn( + (num_caches, cache_seqlen, nheads_kv, headdim), device="cuda", dtype=dtype + ) + + q_buf_large = torch.randn( + (1, ntokens, nheads_q, headdim), device="cuda", dtype=dtype + ) + cache_seqlen_large = torch.tensor( + [context_seqlens[0]], dtype=torch.int32, device="cuda" + ) + cache_idx_large = torch.tensor([1], dtype=torch.int32, device="cuda") + + q_buf_small = torch.randn( + (max_queries_per_batch - 1, small_request_ntokens, nheads_q, headdim), + device="cuda", + dtype=dtype, + ) + cache_seqlens_small = torch.tensor( + context_seqlens[1:] + [0] * num_padding_queries, dtype=torch.int32, device="cuda" + ) + cache_idxs_small = torch.randperm(num_caches, dtype=torch.int32, device="cuda")[ + : max_queries_per_batch - 1 + ] + + if args.validate: + # Call flash attn + # First for the single full-sized query + out0, lse0 = fa3.flash_attn_with_kvcache( + q=q_buf_large, + k_cache=k_cache, + v_cache=v_cache, + cache_seqlens=cache_seqlen_large, + cache_batch_idx=cache_idx_large, + causal=bool(args.causal), + num_splits=args.splits, + return_softmax_lse=True, + #num_splits=1 + ) + + # Second for n-1 small queries + out1_split1, lse1_split1 = fa3.flash_attn_with_kvcache( + q=q_buf_small, + k_cache=k_cache, + v_cache=v_cache, + cache_seqlens=cache_seqlens_small, + cache_batch_idx=cache_idxs_small, + causal=bool(args.causal), + num_splits=1, + gqa_decoding=bool(args.gqa), + return_softmax_lse=True, + ) + + # Second for n-1 small queries + out1, lse1 = fa3.flash_attn_with_kvcache( + q=q_buf_small, + k_cache=k_cache, + v_cache=v_cache, + cache_seqlens=cache_seqlens_small, + cache_batch_idx=cache_idxs_small, + causal=bool(args.causal), + num_splits=args.splits, + gqa_decoding=bool(args.gqa), + return_softmax_lse=True, + ) + + # Call flash attn + # First for the single full-sized query + out2 = fa2.flash_attn_with_kvcache( + q=q_buf_large, + k_cache=k_cache, + v_cache=v_cache, + cache_seqlens=cache_seqlen_large, + cache_batch_idx=cache_idx_large, + causal=bool(args.causal), + num_splits=args.splits, + ) + + print ('big') + print ('diff-max', (out0 - out2).abs().max().item(), cache_seqlens_small) + print ('diff-mean', (out0 - out2).abs().mean().item()) + + + # Second for n-1 small queries + out3, lse_fa2 = fa2.flash_attn_with_kvcache( + q=q_buf_small, + k_cache=k_cache, + v_cache=v_cache, + cache_seqlens=cache_seqlens_small, + cache_batch_idx=cache_idxs_small, + causal=bool(args.causal), + num_splits=args.splits, + return_softmax_lse=True, + #num_splits=1 + ) + + print ('small') #, out1) + print ('lse', lse1, lse_fa2, (lse1 - lse_fa2).abs(), out1.shape) + print ('lse-dif-max', (lse1 - lse_fa2).abs().max().item()) + print ('diff-max', (out1 - out3).abs().max().item()) + print ('diff-mean', (out1 - out3).abs().mean().item()) + + + print ('fa3', args.repeats) + time_fa3_big = benchmark_fa_kv(fa3.flash_attn_with_kvcache, repeats=args.repeats, + q=q_buf_large, + k_cache=k_cache, + v_cache=v_cache, + cache_seqlens=cache_seqlen_large, + 
cache_batch_idx=cache_idx_large, + causal=bool(args.causal), + num_splits=args.splits, + ) + + time_fa3_small = benchmark_fa_kv(fa3.flash_attn_with_kvcache, repeats=args.repeats, + q=q_buf_small, + k_cache=k_cache, + v_cache=v_cache, + cache_seqlens=cache_seqlens_small, + cache_batch_idx=cache_idxs_small, + causal=bool(args.causal), + num_splits=args.splits, + ) + + print ('fa2 ') + + time_fa2_big = benchmark_fa_kv(fa2.flash_attn_with_kvcache, repeats=args.repeats, + q=q_buf_large, + k_cache=k_cache, + v_cache=v_cache, + cache_seqlens=cache_seqlen_large, + cache_batch_idx=cache_idx_large, + causal=bool(args.causal), + num_splits=args.splits + ) + + time_fa2_small = benchmark_fa_kv(fa2.flash_attn_with_kvcache, repeats=args.repeats, + q=q_buf_small, + k_cache=k_cache, + v_cache=v_cache, + cache_seqlens=cache_seqlens_small, + cache_batch_idx=cache_idxs_small, + causal=bool(args.causal), + num_splits=args.splits + ) + + print ('big (split, fa3, fa2, ratio):', args.splits, time_fa3_big * 1000000, time_fa2_big * 1000000, time_fa3_big / time_fa2_big) + print ('small (split, fa3, fa2, ratio):', args.splits, time_fa3_small * 1000000, time_fa2_small * 1000000, time_fa3_small / time_fa2_small) + +if __name__ == "__main__": + main() diff --git a/hopper/tile_scheduler.hpp b/hopper/tile_scheduler.hpp index 2fbb417e4..5cc136f72 100644 --- a/hopper/tile_scheduler.hpp +++ b/hopper/tile_scheduler.hpp @@ -19,7 +19,7 @@ struct SingleTileScheduler { // Host side kernel arguments struct Arguments { - int const num_blocks_m, num_head, num_batch; + int const num_blocks_m, num_splits, num_head, num_batch; int* const tile_count_semaphore = nullptr; }; @@ -49,9 +49,9 @@ struct SingleTileScheduler { } CUTLASS_DEVICE - cute::tuple + cute::tuple get_block_coord(Params const& params) const { - return {M_idx, H_idx, B_idx}; + return {M_idx, 1, H_idx, B_idx}; } }; @@ -88,26 +88,31 @@ struct SingleTileScheduler { /////////////////////////////////////////////////////////////////////////////// +template class StaticPersistentTileScheduler { public: // Host side kernel arguments struct Arguments { - int const num_blocks_m, num_head, num_batch; + int const num_blocks_m, num_splits, num_head, num_batch; int* const tile_count_semaphore = nullptr; }; // Device side kernel params struct Params { - int total_blocks; - cutlass::FastDivmod m_block_divmod, head_divmod; + int const total_blocks; + cutlass::FastDivmod const m_block_divmod, split_divmod, head_divmod; }; static Params to_underlying_arguments(Arguments const& args) { - return {args.num_blocks_m * args.num_head * args.num_batch, - cutlass::FastDivmod(args.num_blocks_m), cutlass::FastDivmod(args.num_head)}; + // return {args.num_blocks_m * args.num_head * args.num_batch, + // cutlass::FastDivmod(args.num_blocks_m), cutlass::FastDivmod(args.num_head)}; + return {args.num_blocks_m * args.num_splits * args.num_head * args.num_batch, + cutlass::FastDivmod(args.num_blocks_m), + cutlass::FastDivmod(args.num_splits), + cutlass::FastDivmod(args.num_head)}; } static dim3 @@ -125,11 +130,19 @@ class StaticPersistentTileScheduler { } CUTLASS_DEVICE - cute::tuple + cute::tuple get_block_coord(Params const& params) const { - int m_block, bidh, bidb; - bidb = params.head_divmod.divmod(bidh, params.m_block_divmod.divmod(m_block, tile_idx)); - return {m_block, bidh, bidb}; + int m_block, split_idx, bidh, bidb; + if constexpr(!Is_split) { + bidb = params.head_divmod.divmod(bidh, + params.m_block_divmod.divmod(m_block, tile_idx)); + return {m_block, 1, bidh, bidb}; + } else { + bidb = 
params.head_divmod.divmod(bidh, + params.split_divmod.divmod(split_idx, + params.m_block_divmod.divmod(m_block, tile_idx))); + return {m_block, split_idx, bidh, bidb}; + } } }; @@ -164,7 +177,9 @@ class StaticPersistentTileScheduler { }; -template +template class DynamicPersistentTileScheduler { protected: @@ -174,21 +189,26 @@ class DynamicPersistentTileScheduler { // Host side kernel arguments struct Arguments { - int const num_blocks_m, num_head, num_batch; + int const num_blocks_m, num_splits, num_head, num_batch; int* const tile_count_semaphore; }; // Device side kernel params struct Params { - int const total_blocks; - cutlass::FastDivmod const m_block_divmod, head_divmod; + int const total_blocks; + cutlass::FastDivmod const m_block_divmod, split_divmod, head_divmod; int* const tile_count_semaphore; }; static Params to_underlying_arguments(Arguments const& args) { - return {args.num_blocks_m * args.num_head * args.num_batch, - cutlass::FastDivmod(args.num_blocks_m), cutlass::FastDivmod(args.num_head), + // return {args.num_blocks_m * args.num_head * args.num_batch, + // cutlass::FastDivmod(args.num_blocks_m), cutlass::FastDivmod(args.num_head), + // args.tile_count_semaphore}; + return {args.num_blocks_m * args.num_splits * args.num_head * args.num_batch, + cutlass::FastDivmod(args.num_blocks_m), + cutlass::FastDivmod(args.num_splits), + cutlass::FastDivmod(args.num_head), args.tile_count_semaphore}; } @@ -207,11 +227,19 @@ class DynamicPersistentTileScheduler { } CUTLASS_DEVICE - cute::tuple + cute::tuple get_block_coord(Params const& params) const { - int m_block, bidh, bidb; - bidb = params.head_divmod.divmod(bidh, params.m_block_divmod.divmod(m_block, tile_idx)); - return {m_block, bidh, bidb}; + int m_block, split_idx, bidh, bidb; + if constexpr(!Is_split) { + bidb = params.head_divmod.divmod(bidh, + params.m_block_divmod.divmod(m_block, tile_idx)); + return {m_block, 1, bidh, bidb}; + } else { + bidb = params.head_divmod.divmod(bidh, + params.split_divmod.divmod(split_idx, + params.m_block_divmod.divmod(m_block, tile_idx))); + return {m_block, split_idx, bidh, bidb}; + } } }; diff --git a/hopper/utils.h b/hopper/utils.h index aaf0712ad..8232c7fc4 100644 --- a/hopper/utils.h +++ b/hopper/utils.h @@ -261,16 +261,16 @@ __forceinline__ __device__ void copy(TiledCopy tiled_copy, Tensor __forceinline__ __device__ void write_tma( ElemO* O, const TMACopyO& tma_store_O, const LayoutO& layout_O, const TileShapeO& tile_shape_O, - const SMemO& sO, int m_block, int bidh, int bidb, + const SMemO& sO, int m_block, int bidh, int bidb, int n_split_idx, const SeqLenTraits& seqlen_traits_o, int write_warp_idx) { Tensor mO = tma_store_O.get_tma_tensor(layout_O.shape()); - Tensor gO = seqlen_traits_o.get_local_tile_tensor( - mO, tile_shape_O, bidh, bidb + Tensor gO = seqlen_traits_o.get_o_local_tile_tensor( + mO, tile_shape_O, bidh, bidb, n_split_idx )(_, _, m_block); // (M, K) auto block_tma_O = tma_store_O.get_slice(_0{}); Tensor tOgO = block_tma_O.partition_D(gO); // (TMA, TMA_M, TMA_K) @@ -286,6 +286,94 @@ __forceinline__ __device__ void write_tma( // tma_store_wait<0>(); } +// Epilogue that copies RMEM -> GMEM directly for GQA enabled. 
+// Reports as uncoalesced stores by the profiler +template +__forceinline__ __device__ void write_rmem_to_gmem( + TensorO &tOrO, OutputType *O, const LayoutO& layout_O, TileShapeO tile_shape_O, + int m_block, int h_block, int bidh, int bidh_kv, int bidb, int n_split_idx, + TiledMma& tiled_mma, const SeqLenTraits& seqlen_traits_o, int thread_idx) { + static_assert(is_same_v, "rmem dtype must be float"); + Tensor mO = make_tensor(make_gmem_ptr(O), layout_O); + Tensor gO = [&] { + if constexpr(Use_gqa_layout) { + return seqlen_traits_o.get_o_local_tile_tensor( + mO, tile_shape_O, bidh_kv, bidb, n_split_idx + )(_, _, _, m_block, h_block); // (bM/bH, bH, K) + } else { + return seqlen_traits_o.get_o_local_tile_tensor( + mO, tile_shape_O, bidh, bidb, n_split_idx + )(_, _, m_block); // (bM, bK) + } + }(); + auto thread_mma = tiled_mma.get_thread_slice(thread_idx); + auto tile_shape_mnk = cute::tile_shape(tiled_mma); + Tensor cO = cute::make_identity_tensor(select<0, 1>(tile_shape_mnk)); + Tensor tOcO = thread_mma.partition_C(cO); + // tOcO has shape ((2, 2, V), MMA_M, MMA_N), we only take only the row indices. + Tensor tOcO_row = tOcO(make_coord(_0{}, _, _0{}), _, _0{}); + // reshape from ((2, 2, V), MMA_M, MMA_N) to (nrow=(2, MMA_M), ncol=(2, V, MMA_N)) + Tensor tOrO_rowcol = make_tensor(tOrO.data(), flash::convert_layout_acc_rowcol(tOrO.layout())); + const int m_bound = seqlen_traits_o.actual_seq_len - m_block * size<0>(gO); + // hardcoded col_idx to circumvent reg spilling with counting tensor + const int col_start_idx = !Column_permute_fp8 ? 2 * (thread_idx % 4) : 4 * (thread_idx % 4); + + if constexpr (Use_gqa_layout) { + static constexpr int kBlockH = size<1>(gO); + const int h_bound = shape<1>(layout_O) - h_block * kBlockH; + #pragma unroll + for(int nrow = 0; nrow < size<0>(tOrO_rowcol); ++nrow) { + const int row = int(get<0>(tOcO_row(nrow))); + const int h_local = row % kBlockH; + const int m_local = row / kBlockH; + if(h_local < h_bound && m_local < m_bound) { + if constexpr(!Column_permute_fp8) { + Tensor tOrO_nrow_float2 = recast(tOrO_rowcol(nrow, _)); + #pragma unroll + for (int ncol = 0; ncol < size<1>(tOrO_rowcol)/2; ++ncol) { + *reinterpret_cast(&(gO(m_local, h_local, col_start_idx + 8 * ncol))) = + tOrO_nrow_float2(ncol); + } + } else { + Tensor tOrO_nrow = tOrO_rowcol(nrow, _); + #pragma unroll + for (int ncol = 0; ncol < size<1>(tOrO_rowcol); ncol += 4) { + gO(m_local, h_local, col_start_idx + 4 * ncol) = tOrO_nrow(ncol); + gO(m_local, h_local, col_start_idx + 4 * ncol + 2) = tOrO_nrow(ncol + 1); + gO(m_local, h_local, col_start_idx + 4 * ncol + 1) = tOrO_nrow(ncol + 2); + gO(m_local, h_local, col_start_idx + 4 * ncol + 3) = tOrO_nrow(ncol + 3); + } + } + } + } + } else { + #pragma unroll + for(int nrow = 0; nrow < size<0>(tOrO_rowcol); ++nrow) { + const int row = int(get<0>(tOcO_row(nrow))); + if(row < m_bound) { + if constexpr(!Column_permute_fp8) { + Tensor tOrO_nrow_float2 = recast(tOrO_rowcol(nrow, _)); + #pragma unroll + for (int ncol = 0; ncol < size<1>(tOrO_rowcol)/2; ++ncol) { + *reinterpret_cast(&(gO(row, col_start_idx + 8 * ncol))) = + tOrO_nrow_float2(ncol); + } + } else { + Tensor tOrO_nrow = tOrO_rowcol(nrow, _); + #pragma unroll + for (int ncol = 0; ncol < size<1>(tOrO_rowcol); ncol += 4) { + gO(row, col_start_idx + 4 * ncol) = tOrO_nrow(ncol); + gO(row, col_start_idx + 4 * ncol + 2) = tOrO_nrow(ncol + 1); + gO(row, col_start_idx + 4 * ncol + 1) = tOrO_nrow(ncol + 2); + gO(row, col_start_idx + 4 * ncol + 3) = tOrO_nrow(ncol + 3); + } + } + } + } + } +} + template 
__forceinline__ __device__ void write_tiled( @@ -333,17 +421,24 @@ __forceinline__ __device__ void write_tiled( } } -template + typename TileShapeO, typename SMemO, typename SeqLenTraits, class TensorO, typename TiledMma> __forceinline__ __device__ void write_O( ElemO* O, const TMACopyO& tma_copy_O, const TiledCopyO& tiled_copy_O, const LayoutO& layout_O, const TileShapeO& tile_shape_O, - const SMemO& sO, int m_block, int bidh, int bidb, - const SeqLenTraits& seqlen_traits_o, int write_warp_idx) { - if constexpr (IsTMACopy) { - write_tma(O, tma_copy_O, layout_O, tile_shape_O, sO, m_block, bidh, bidb, seqlen_traits_o, write_warp_idx); + const SMemO& sO, int m_block, int bidh, int bidb, int n_split_idx, + const SeqLenTraits& seqlen_traits_o, int write_warp_idx, TiledMma & tiledMma1, TensorO & tOrO) { + + if constexpr (IsRegToGmem) { + static_assert(Is_split, "use write_rmem_to_gmem with split kv kernel only"); + write_rmem_to_gmem(tOrO, O, layout_O, tile_shape_O, m_block, bidh, bidb, n_split_idx, + tiledMma1, seqlen_traits_o, threadIdx.x - NumCopyThreads); + } else if constexpr (IsTMACopy) { + write_tma(O, tma_copy_O, layout_O, tile_shape_O, sO, m_block, bidh, bidb, + n_split_idx, seqlen_traits_o, write_warp_idx); } else { + static_assert(!Is_split, "Don't use write_tiled with split kv kernel"); write_tiled(O, tiled_copy_O, layout_O, tile_shape_O, sO, m_block, bidh, bidb, seqlen_traits_o); } }
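For orientation, a minimal decode-style sketch (not part of this patch) of the split-KV / GQA-parallel path that the new tests above exercise. Keyword names and tensor shapes follow the calls in hopper/test_attn_kvcache.py; num_splits=0 selects the split heuristic, as noted by the "i=0 case is with num splits heuristic" comment in test_flash_attn_kvcache_output. The concrete sizes below are illustrative, and the snippet assumes flash_attn_interface is built for Hopper and a CUDA device is available.

import torch
import flash_attn_interface

# KV-cache decode: 4 requests, 1 query token each, GQA with 32 query heads over 8 KV heads.
num_caches, cache_seqlen, nheads_kv, nheads_q, headdim = 8, 4096, 8, 32, 128
k_cache = torch.randn(num_caches, cache_seqlen, nheads_kv, headdim, device="cuda", dtype=torch.bfloat16)
v_cache = torch.randn(num_caches, cache_seqlen, nheads_kv, headdim, device="cuda", dtype=torch.bfloat16)
q = torch.randn(4, 1, nheads_q, headdim, device="cuda", dtype=torch.bfloat16)
cache_seqlens = torch.tensor([4096, 1024, 2048, 300], dtype=torch.int32, device="cuda")  # valid KV length per request
cache_idxs = torch.randperm(num_caches, dtype=torch.int32, device="cuda")[:4]            # cache slot read by each request

out, lse = flash_attn_interface.flash_attn_with_kvcache(
    q=q,
    k_cache=k_cache,
    v_cache=v_cache,
    cache_seqlens=cache_seqlens,
    cache_batch_idx=cache_idxs,
    causal=True,
    num_splits=0,            # 0 = let the heuristic choose the number of KV splits
    gqa_parallel=True,       # exercise the GQA-parallel path added in this patch
    max_seqlen_k_hint=4096,  # hint for the split heuristic, as passed in the tests above
    return_softmax_lse=True,
)
# out has shape (4, 1, nheads_q, headdim); lse is the softmax log-sum-exp returned
# because return_softmax_lse=True, matching what the tests compare against the num_splits=1 reference.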