FA3 kvcache + split kv + gqa parallelization #1236

Merged: 106 commits, Oct 15, 2024
Changes from 1 commit
Commits (106)
9dbd114
Adding the flash3 kv cache API. Just compiling for now.
ganeshcolfax Aug 15, 2024
7ee8ee4
start extending seqlen traits for kv cache
jayhshah Aug 15, 2024
3fdf7ee
added cache_batch_idx.
ganeshcolfax Aug 15, 2024
84e31c2
adding python interface.
ganeshcolfax Aug 15, 2024
e16053a
add test_kvcache.py.
ganeshcolfax Aug 15, 2024
be0e36d
enable use of actual seqlen for kv cache
jayhshah Aug 16, 2024
38ad0ac
add new param to handle cache_batch_size
jayhshah Aug 16, 2024
57de4da
add semaphore for kv cache causal
jayhshah Aug 16, 2024
435f86d
add comparison with fa2.
ganeshcolfax Aug 16, 2024
74f160b
change template parameter for SeqLenTraits for ease of further extension
jayhshah Aug 19, 2024
13bad55
modify seqlentraits for gqa parallelism
jayhshah Aug 19, 2024
ccf5b9b
modify Ktraits for decoding QO layouts
jayhshah Aug 20, 2024
fc8f704
decouple types of seqlen traits q and k
jayhshah Aug 20, 2024
d2f049c
change logic of Q loads for gqa parallelization
jayhshah Aug 20, 2024
c6311e4
fix o strides
jayhshah Aug 20, 2024
535b827
complete gqa parallel changes for non-causal
jayhshah Aug 22, 2024
5704a1f
fix some errors
jayhshah Aug 22, 2024
64a9cfb
add causal logic
jayhshah Aug 26, 2024
a06f1f9
add to kv cache api
jayhshah Aug 26, 2024
0c4cea9
add in lse writeout and store zero
jayhshah Sep 4, 2024
0a1a0c2
refactor for split kv
jayhshah Sep 5, 2024
1135dbd
re-enable fp16/bf16 fwd
jayhshah Sep 5, 2024
68ff3f7
add 1 mma warpgroup option, enable splitkv for hdim 256
jayhshah Sep 6, 2024
23bf5b0
fix bug with finalize for split kv
jayhshah Sep 12, 2024
ac19795
delete unused files
jayhshah Sep 17, 2024
1a5e40a
add hid=64.
ganeshcolfax Sep 13, 2024
c75c243
change flash api for rebase
jayhshah Sep 18, 2024
e9db102
avoid redundant compilation with combine kernel by only including nee…
jayhshah Sep 19, 2024
9250969
change Element to OutputType for template param in combine kernel. On…
jayhshah Sep 19, 2024
ec0130f
fix wrong tile size for hdim 64
jayhshah Sep 19, 2024
68b4bb9
revert OutputType change
jayhshah Sep 19, 2024
9c97808
changes for correct lse write out for splits=1 and splits > 1 case.
ganeshcolfax Sep 19, 2024
ecc5c49
added num_split_heuristics.
ganeshcolfax Sep 20, 2024
f3e5bd4
update parameters
jayhshah Sep 20, 2024
78736b4
remove unused code
jayhshah Sep 20, 2024
e8c7b2e
add num_split_heuristics.
ganeshcolfax Sep 20, 2024
75a6ce2
adding block_n and block_m for different headdim.
ganeshcolfax Sep 21, 2024
cf5bd5c
initialize semaphore when num splits != 1
jayhshah Sep 21, 2024
ac96c37
change combine kernel to condition on lse=-INF
jayhshah Sep 23, 2024
099ca28
add gqa decoding logic.
ganeshcolfax Sep 21, 2024
f53703b
add split kv heuristic modifications
jayhshah Sep 25, 2024
a30863f
recent version.
ganeshcolfax Sep 24, 2024
ffa48eb
more refactoring.
ganeshcolfax Sep 25, 2024
4a77193
update test script to use heuristic
jayhshah Sep 25, 2024
3615696
Add some more cases to test script and raise thresholds a bit for max…
jayhshah Sep 25, 2024
24b4b4f
add reference from python.
ganeshcolfax Sep 25, 2024
c516d63
Adding another test case.
ganeshcolfax Sep 25, 2024
9a4941c
add variable seqlen case.
ganeshcolfax Sep 26, 2024
70ff847
all cases passed.
ganeshcolfax Sep 26, 2024
e36e004
change fp8 code path to allow for split kernel and kv cache without p…
jayhshah Sep 27, 2024
cd55fb3
set correct tolerance limit
ganeshcolfax Sep 26, 2024
2472e5e
add 'in principle' fp8 kv cache support
jayhshah Sep 28, 2024
b5cac6d
rebase with Is_local disabled temporarily
jayhshah Sep 30, 2024
6111666
consolidate nblock min max methods
jayhshah Sep 30, 2024
be481ca
add Is_local back in
jayhshah Sep 30, 2024
81d4024
prune unused code
jayhshah Sep 30, 2024
64a0a91
enable Is_local with fp8
jayhshah Sep 30, 2024
0f560b7
update composable kernel
jayhshah Sep 30, 2024
cffef15
separate out fp8 in test_flash_attn
jayhshah Sep 30, 2024
2b840ef
fix the test case and re-factor too.
ganeshcolfax Sep 30, 2024
5df67d2
Merge branch 'fa3-kvcache-gqa' of github.com:Dao-AILab/flash-attentio…
ganeshcolfax Sep 30, 2024
aa45d75
dont write out zero for split kernel, only lse=-inf
jayhshah Sep 30, 2024
33f20a3
Merge branch 'fa3-kvcache-gqa' of github.com:Dao-AILab/flash-attentio…
jayhshah Sep 30, 2024
f77d9f7
fix composable kernel issue again
jayhshah Sep 30, 2024
5e3864f
change default output type of fp8 kernel to bf16
jayhshah Oct 1, 2024
16eb1e5
remove deprecated fp8 code
jayhshah Oct 1, 2024
7940377
correct indent
jayhshah Oct 1, 2024
31c71e0
add log max splits based on num splits to static switch
jayhshah Oct 1, 2024
6bb1092
change seq len class per discussion
jayhshah Oct 1, 2024
0085f04
add fp8 test case.
ganeshcolfax Oct 2, 2024
eaf8898
fix submodule
jayhshah Oct 2, 2024
a44596f
re-commiting.
ganeshcolfax Oct 2, 2024
c0c58ee
Revert "re-commiting."
ganeshcolfax Oct 2, 2024
b8f9dc2
change fp8 tolerances to be smaller
jayhshah Oct 2, 2024
49f1849
lower rtol for fp8 a bit
jayhshah Oct 2, 2024
bb230b8
separate gqa compilation
jayhshah Oct 2, 2024
03200a7
removed old gqa cu files and unified methods
jayhshah Oct 2, 2024
930c8ca
reorg mma code for less redundancy
jayhshah Oct 3, 2024
bc4b872
add crude hdim 64 heuristic
jayhshah Oct 3, 2024
fff4b5c
add split kv benchmark script
jayhshah Oct 4, 2024
aa0e699
move descale tensor declarations outside of conditional
jayhshah Oct 4, 2024
785d978
fix bug with fp8 q layout
jayhshah Oct 7, 2024
8fbefa8
adding rmem to gmem. (Not validating yet).
ganeshcolfax Oct 8, 2024
f0b4946
changes to use tiledcopy (still not passing).
ganeshcolfax Oct 8, 2024
8f45a8c
tests passing now for non-gqa impl
jayhshah Oct 9, 2024
4a4dbd2
move IsRegToGmem
jayhshah Oct 9, 2024
a075e76
handle gqa_parallel with rmem-to-gmem. Not validating yet.
ganeshcolfax Oct 10, 2024
dc2c952
compiles and builds. Not validating yet.
ganeshcolfax Oct 10, 2024
e49cb5f
passes except for hdim=256.
ganeshcolfax Oct 10, 2024
d437d3d
remove smem usage for when rmem -> gmem epilogue is used
jayhshah Oct 11, 2024
ab5d336
better writeout logic with vectorization
jayhshah Oct 12, 2024
7169b23
unify rmem -> gmem methods
jayhshah Oct 12, 2024
551b91f
uniform notation
jayhshah Oct 12, 2024
eb9c0ee
add rmem -> gmem for fp8
jayhshah Oct 14, 2024
b0f067e
revert epi change for fp8 due to measured perf regression
jayhshah Oct 14, 2024
35f3542
refactor names
jayhshah Oct 14, 2024
8374e1f
remove test code
jayhshah Oct 14, 2024
1ecf821
remove constexpr checks for actual seqlen in mainloop
jayhshah Oct 14, 2024
7c1473e
remove Is_batch_dynamic from seqlen traits and handle fp8 perf regres…
jayhshah Oct 15, 2024
c06cc0b
change cu_seqlens_k to seqused_k for kv cache api
jayhshah Oct 15, 2024
a7cce59
adjust tolerances in test script for kv cache
jayhshah Oct 15, 2024
8efb953
remove commented out code
jayhshah Oct 15, 2024
b3d60fa
prune more dead code
jayhshah Oct 15, 2024
50cb90a
comment out unimplemented kwargs from flash_attn_with_kvcache
jayhshah Oct 15, 2024
dec7dee
fix integer sign compare warning
jayhshah Oct 15, 2024
9b6cba1
remove some debug code
jayhshah Oct 15, 2024
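
Several of the commits above (the lse write-out, "fix bug with finalize for split kv", "change combine kernel to condition on lse=-INF") concern the split-KV combine step. The following is a minimal sketch of the underlying arithmetic only, not the PR's combine kernel, and every name in it (combine_splits, out_partial, lse_partial) is invented for illustration. Each split is assumed to produce a locally normalized partial output plus the log-sum-exp of its attention scores, and the partial results are merged per query row and head as follows.

#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>

// Sketch only: merge per-split partial attention results for one (query row, head).
// out_partial[s] is split s's locally normalized output, lse_partial[s] its log-sum-exp.
void combine_splits(const std::vector<std::vector<float>>& out_partial,  // [num_splits][head_dim]
                    const std::vector<float>& lse_partial,               // [num_splits]
                    std::vector<float>& out, float& lse) {
    const float neg_inf = -std::numeric_limits<float>::infinity();
    const int num_splits = static_cast<int>(lse_partial.size());
    const int head_dim = static_cast<int>(out_partial[0].size());
    // Global log-sum-exp over the per-split LSEs, computed in a numerically stable way.
    float lse_max = neg_inf;
    for (float l : lse_partial) lse_max = std::max(lse_max, l);
    float sum = 0.f;
    for (float l : lse_partial) {
        if (l != neg_inf) sum += std::exp(l - lse_max);
    }
    lse = (sum > 0.f) ? lse_max + std::log(sum) : neg_inf;
    // Rescale and accumulate. Splits whose KV range was entirely masked carry
    // lse = -inf and are skipped instead of contributing zeros.
    out.assign(head_dim, 0.f);
    for (int s = 0; s < num_splits; ++s) {
        if (lse_partial[s] == neg_inf) continue;
        const float scale = std::exp(lse_partial[s] - lse);
        for (int d = 0; d < head_dim; ++d) out[d] += scale * out_partial[s][d];
    }
}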
prune more dead code
jayhshah committed Oct 15, 2024
commit b3d60fa3a56ff58d3c2b1f27177e990573e3621d
19 changes: 8 additions & 11 deletions hopper/epilogue_fwd_sm90_tma.hpp
@@ -112,7 +112,6 @@ struct CollectiveEpilogueFwd {
Stride<_4, _32, _1, _0>>;
using ValueLayoutrO = Layout<Shape<_1, _2, Shape<_2, _2>, Int<kHeadDim/16>>,
Stride<_0, _2, Stride<_4, _1>, _8>>;
// using AccessTyperO = std::conditional_t<cutlass::sizeof_bits_v<Element> == 16, uint16_t, uint32_t>;
using TiledCopyrO = decltype(make_tiled_copy(Copy_Atom<UniversalCopy<Element>, Element>{},
ThreadLayoutrO{}, ValueLayoutrO{}));
using TiledCopyShaperO = Shape<_8, Int<kBlockM/8>, _16, Int<kHeadDim/16>>;
@@ -248,22 +247,20 @@ struct CollectiveEpilogueFwd {
}
}
}

int write_warp_idx = kNWarps - 1;
if constexpr(!No_smem_O) {
if (cutlass::canonical_warp_idx_sync() == write_warp_idx) {
cutlass::arch::NamedBarrier::sync(
NumMmaThreads + cutlass::NumThreadsPerWarp,
cutlass::arch::ReservedNamedBarriers::EpilogueBarrier
);
}
}

if constexpr (No_smem_O) {
flash::write_rmem_to_gmem<Seqlen_traits::UseGQAPacking, epi_column_permute>(
tOrO_out, epilogue_params.ptr_O, epilogue_params.layout_O, TileShapeOCopy{},
m_block, h_block, bidh, bidh_kv, bidb, n_split_idx,
tiled_mma, seqlen_traits_q, thread_idx);
} else {
int write_warp_idx = kNWarps - 1;
if (cutlass::canonical_warp_idx_sync() == write_warp_idx) {
cutlass::arch::NamedBarrier::sync(
NumMmaThreads + cutlass::NumThreadsPerWarp,
cutlass::arch::ReservedNamedBarriers::EpilogueBarrier
);
}
TiledCopyO gmem_tiled_copy_O;
Tensor sO_out = make_tensor(make_smem_ptr(shared_storage.smem_o.data()), SmemLayoutOCopy{});
if constexpr(!Seqlen_traits::UseGQAPacking) {
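
Note on the hunk above: the epilogue now keeps the named-barrier sync and the shared-memory TiledCopyO path only in the else branch, while the No_smem_O path writes the accumulator from registers straight to global memory via flash::write_rmem_to_gmem. Below is a rough sketch of that register-to-gmem idea, assuming a deliberately simplified per-thread fragment layout rather than the real CuTe tiled copy.

// Illustrative only: each thread stores its accumulator fragment directly into
// its slots of the CTA's output tile in global memory, so no shared-memory
// staging buffer (and no epilogue barrier) is required on this path.
template <int kFragSize>
__device__ void write_rmem_to_gmem_sketch(const float (&acc)[kFragSize],
                                          float* o_tile,       // base of this CTA's output tile
                                          int row, int col,    // first element owned by this thread
                                          int row_stride) {
    #pragma unroll
    for (int i = 0; i < kFragSize; ++i) {
        // A real MMA fragment is strided across rows and columns; a contiguous
        // run per thread is assumed here purely for readability.
        o_tile[row * row_stride + col + i] = acc[i];
    }
}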
7 changes: 1 addition & 6 deletions hopper/flash_fwd_kernel.h
@@ -33,8 +33,6 @@ __global__ void __launch_bounds__(Ktraits::kNWarps * cutlass::NumThreadsPerWarp,
) {

using Element = typename Ktraits::Element;
using ElementAccum = typename Ktraits::ElementAccum;
using SoftType = ElementAccum;
using TileShape_MNK = typename Ktraits::TileShape_MNK;
using ClusterShape = typename Ktraits::ClusterShape_MNK;

@@ -47,7 +45,7 @@ __global__ void __launch_bounds__(Ktraits::kNWarps * cutlass::NumThreadsPerWarp,
static constexpr int kBlockM = Ktraits::kBlockM;
static constexpr int kBlockH = Ktraits::kBlockH;
// static constexpr int kBlockN = Ktraits::kBlockN;
// constexpr int kHeadDim = Ktraits::kHeadDim;
// static constexpr int kHeadDim = Ktraits::kHeadDim;

using CollectiveMainloop = CollectiveMainloopFwd<Ktraits, Is_causal, Is_local, Seqlen_traits, Seqlen_traits_Q>;
using CollectiveEpilogue = CollectiveEpilogueFwd<Ktraits, Seqlen_traits_Q>;
@@ -222,15 +220,12 @@ __global__ void __launch_bounds__(Ktraits::kNWarps * cutlass::NumThreadsPerWarp,

using Element = typename Ktraits::Element;
static_assert(cutlass::sizeof_bits_v<Element> == 8);
using ElementAccum = typename Ktraits::ElementAccum;
using SoftType = ElementAccum;
using TileShape_MNK = typename Ktraits::TileShape_MNK;
using ClusterShape = typename Ktraits::ClusterShape_MNK;

static_assert(Ktraits::Is_WS);
static constexpr bool Is_WS = Ktraits::Is_WS;
static constexpr bool No_smem_O = Ktraits::No_smem_O;
// static constexpr bool UseVarSeqLen = Seqlen_traits::UseVarSeqLen;

static constexpr int NumMmaThreads = size(typename Ktraits::TiledMma0{});
static constexpr int NumCopyThreads = !Is_WS ? 0 : cutlass::NumThreadsPerWarpGroup;
6 changes: 2 additions & 4 deletions hopper/flash_fwd_launch_template.h
@@ -108,11 +108,9 @@ void run_flash_fwd(Flash_fwd_params &params, cudaStream_t stream) {

int num_blocks_m = cutlass::ceil_div(params.seqlen_q, Kernel_traits::kBlockM/Kernel_traits::kBlockH);
num_blocks_m = cutlass::ceil_div(num_blocks_m, size<0>(ClusterShape{})) * size<0>(ClusterShape{});
int num_grid_heads = params.h_k * ceil_div(params.h_h_k_ratio, Kernel_traits::kBlockH);

// std::cout << "num blocks m = " << num_blocks_m << " num grid heads" << num_grid_heads << std::endl;
int num_blocks_h = params.h_k * ceil_div(params.h_h_k_ratio, Kernel_traits::kBlockH);
typename Scheduler::Arguments scheduler_args =
{num_blocks_m, Is_split ? params.num_splits : 1, num_grid_heads, params.b, params.tile_count_semaphore};
{num_blocks_m, Is_split ? params.num_splits : 1, num_blocks_h, params.b, params.tile_count_semaphore};
typename Scheduler::Params scheduler_params = Scheduler::to_underlying_arguments(scheduler_args);

// Get the ptr to kernel function.
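
For reference, the scheduler arguments above decompose the work grid along M blocks, KV splits, head blocks, and batch. A host-side sketch of that arithmetic follows; decode_grid and its example values are illustrative and not part of the PR, and the cluster-shape rounding of num_blocks_m is omitted.

#include <cstdio>

constexpr int ceil_div(int a, int b) { return (a + b - 1) / b; }

struct GridShape { int blocks_m, splits, blocks_h, batch; };

// Mirrors num_blocks_m / num_blocks_h above: with GQA packing, kBlockH query
// heads that share one KV head are folded into the M dimension of a tile.
GridShape decode_grid(int seqlen_q, int h, int h_k, int b,
                      int kBlockM, int kBlockH, int num_splits, bool is_split) {
    int blocks_m = ceil_div(seqlen_q, kBlockM / kBlockH);
    int blocks_h = h_k * ceil_div(h / h_k, kBlockH);   // h / h_k is params.h_h_k_ratio
    return {blocks_m, is_split ? num_splits : 1, blocks_h, b};
}

int main() {
    // Example: decode (seqlen_q = 1), 32 query heads, 4 KV heads, kBlockH = 8.
    GridShape g = decode_grid(1, 32, 4, /*b=*/2, /*kBlockM=*/128, /*kBlockH=*/8,
                              /*num_splits=*/4, /*is_split=*/true);
    std::printf("%d x %d x %d x %d tiles\n", g.blocks_m, g.splits, g.blocks_h, g.batch);
}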
14 changes: 5 additions & 9 deletions hopper/mainloop_fwd_sm90_tma_gmma_ws.hpp
@@ -86,11 +86,11 @@ struct CollectiveMainloopFwd {
using TileShape_MNK = typename Ktraits::TileShape_MNK;
using ClusterShape = typename Ktraits::ClusterShape_MNK;

static constexpr int kStages = Ktraits::kStages;
static constexpr int kHeadDim = Ktraits::kHeadDim;
// static constexpr int kBlockM = Ktraits::kBlockM;
// static constexpr int kBlockN = Ktraits::kBlockN;
// static constexpr int kBlockH = Ktraits::kBlockH;
static constexpr int kStages = Ktraits::kStages;
static constexpr int kHeadDim = Ktraits::kHeadDim;
// static constexpr int kBlockM = Ktraits::kBlockM;
// static constexpr int kBlockN = Ktraits::kBlockN;
// static constexpr int kBlockH = Ktraits::kBlockH;
static constexpr bool Is_split = Ktraits::Is_split;
static constexpr bool No_smem_O = Ktraits::No_smem_O;

@@ -250,7 +250,6 @@ struct CollectiveMainloopFwd {
n_block_max = cute::ceil_div(seqlen_k, kBlockN);

if constexpr(Is_split) {
// int const num_n_blocks = ceil_div(seqlen_k, kBlockN);
int const n_blocks_per_split
= mainloop_params.num_splits_divmod.divide(n_block_max + int(mainloop_params.num_splits_divmod) - 1);
n_block_min = n_split_idx * n_blocks_per_split;
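
To make the split arithmetic above concrete, assuming (as the surrounding code suggests) that each split then walks the contiguous range [n_block_min, n_block_min + n_blocks_per_split) clipped to the overall n_block_max, a small worked example:

#include <algorithm>
#include <cstdio>

int main() {
    const int seqlen_k = 4096, kBlockN = 128, num_splits = 3;
    const int total_blocks = (seqlen_k + kBlockN - 1) / kBlockN;                  // 32 KV blocks
    const int n_blocks_per_split = (total_blocks + num_splits - 1) / num_splits;  // ceil(32 / 3) = 11
    for (int split = 0; split < num_splits; ++split) {
        const int n_block_min = split * n_blocks_per_split;
        const int n_block_max = std::min(total_blocks, n_block_min + n_blocks_per_split);
        std::printf("split %d covers KV blocks [%d, %d)\n", split, n_block_min, n_block_max);
    }
}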
@@ -360,7 +359,6 @@ struct CollectiveMainloopFwd {
}
}

// int n_block_max = get_n_block_max(mainloop_params, m_block, seqlen_traits_q, seqlen_traits_k);
int n_block = n_block_max - 1;

int lane_predicate = cute::elect_one_sync();
@@ -498,7 +496,6 @@ struct CollectiveMainloopFwd {
}
}

// int n_block_max = get_n_block_max(mainloop_params, m_block, seqlen_traits_q, seqlen_traits_k);
int n_block = n_block_max - 1;

int lane_predicate = cute::elect_one_sync();
@@ -763,7 +760,6 @@ struct CollectiveMainloopFwd {
Tensor scores_scale = make_fragment_like(softmax.row_max);
clear(scores_scale);

// TODO: modify this for split kv to eliminate superfluous masking steps
constexpr int n_masking_steps = !Is_causal ? 1 : cute::ceil_div(kBlockM_div_H, kBlockN) + 1;
// Only go through these if Is_causal, since n_masking_steps = 1 when !Is_causal
#pragma unroll
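
One more note on the masking-step count above: for causal attention, only the KV blocks that straddle the diagonal of a query tile need element-wise masking, and after GQA packing a tile appears to span kBlockM / kBlockH rows of the query sequence, which gives the bound ceil_div(kBlockM_div_H, kBlockN) + 1. A tiny check of that formula, with purely illustrative sizes:

#include <cstdio>

constexpr int ceil_div(int a, int b) { return (a + b - 1) / b; }

int main() {
    const int kBlockM_div_H = 64;   // query rows per tile after GQA packing (assumed example size)
    const int kBlockN = 128;        // keys per KV block (assumed example size)
    const bool is_causal = true;
    const int n_masking_steps = !is_causal ? 1 : ceil_div(kBlockM_div_H, kBlockN) + 1;
    std::printf("n_masking_steps = %d\n", n_masking_steps);   // prints 2 for these sizes
}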