Tensor memory data path pattern matching
zasdfgbnm committed Mar 5, 2025
1 parent f246e8e commit aae038f
Showing 2 changed files with 186 additions and 12 deletions.
184 changes: 179 additions & 5 deletions csrc/device_lower/analysis/tensor_memory.cpp
@@ -8,10 +8,15 @@

#include <device_lower/analysis/tensor_memory.h>
#include <device_lower/lower2device.h>
#include <expr_simplifier.h>
#include <fusion.h>
#include <ir/all_nodes.h>
#include <options.h>
#include <scheduler/tools/abstract_tensor.h>
#include <type.h>
#include <utils.h>

#include <ranges>
#include <unordered_set>
#include <utility>
#include <vector>
@@ -193,7 +198,38 @@ TMemAlllocationInfo computeTMemAlllocationInfo(Fusion* fusion) {
std::pair<
std::unordered_map<TensorView*, TMemRegisterDataPath>,
std::unordered_map<TensorView*, TMemRegisterDataPath>>
computeTMemLdStDataPath(Fusion* fusion) {
computeTMemLdStDataPath(Fusion* fusion, const TMemAlllocationInfo& allocation) {
// This function uses simplifyExpr extensively. If we have disabled expression
// simplification in order to help inspect generated kernels, then we will get
// incorrect results here. Instead, we ensure it is enabled using this guard.
DisableOptionsGuard dog;
DisableOptionsGuard::getCurOptions().unset(DisableOption::ExprSimplify);
// In the CUDA programming model, each CTA has TIDx, TIDy, and TIDz.
// Unfortunately, the mapping of these TIDs to hardware concepts like warp and
// warp group is not clear and depends on the kernel launch configuration.
// Here, we try not to assume anything like "TIDx must be a multiple of 32",
// but still, we must be able to validate and pattern match the data access
// of the tensor memory load/store.
const auto& pdim_map = GpuLower::current()->parallelDimensionMap();
// Get the TID parallel types that we are interested in. We ignore parallel
// types that are not used in the kernel, as well as the ones that have size 1.
std::vector<ParallelType> tid_ptypes;
for (auto pt : {
ParallelType::TIDz,
ParallelType::TIDy,
ParallelType::TIDx,
}) {
Val* size = pdim_map.getRaw(pt);
if (size == nullptr) {
continue;
}
Val* size_is_one =
simplifyExpr(SimplifyingIrBuilder::eqExpr(size, fusion->oneVal()));
if (size_is_one->isTrue()) {
continue;
}
tid_ptypes.push_back(pt);
}
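// For example, if the kernel is launched with TIDx = 128 and TIDy = 2 while
// TIDz is unused, then tid_ptypes will be {TIDy, TIDx}, kept in
// outer-to-inner order.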
// For all expressions in the fusion, find the data path
using DPMap = std::unordered_map<TensorView*, TMemRegisterDataPath>;
DPMap load_data_path;
@@ -214,6 +250,132 @@ computeTMemLdStDataPath(Fusion* fusion) {
} else {
continue;
}
const auto& loop_domain = ir_utils::getTvOutput(ldst)->getLoopDomain();
const auto& tmem_tv_info = allocation.getTVInfo(tmem_tv);
auto& id_graph = GpuLower::current()->tensorIndexer().traversalGraph();
ValGroups lane_allocation_valgroups =
id_graph.toGroups(tmem_tv_info.lane_allocation);

NVF_CHECK(
!tid_ptypes.empty(),
"Invalid data access pattern in TMem load/store: ",
"TMem load/store must be warp-collective, but CTA size is one.");

// We need to construct a ValGroup that represents "warp" for this
// expression from the consumer's loop domain. Naively speaking, it is just
// split(TIDz * TIDy * TIDx, 32).inner, where TIDz, TIDy and TIDx are the
// IterDomains in the loop domain that have such parallelization. But
// unfortunately, in reality, it is not that simple. NVFuser allows
// parallelizing IterDomains in an inexact way: for example, if the
// kernel's parallel dimension size for TIDx is 64, then the IterDomain
// being parallelized with TIDx does not have to be exactly 64. This
// inexactness is especially common in warp-specialized kernels. If, for
// example, the TIDx-parallelized IterDomain in the loop domain is not
// exact, then split(TIDz * TIDy * TIDx, 32).inner may not be the warp.
// To handle this, we need to introduce a concept called "contiguity of
// parallel types in the loop domain". We can represent warp as
// split(TIDz * TIDy * TIDx, 32).inner if and only if TIDz and TIDy
// are contiguous. If TIDz is not contiguous but TIDy is, then warp would
// be split(TIDy * TIDx, 32).inner. If neither TIDz nor TIDy is contiguous,
// then warp would be split(TIDx, 32).inner.

// Get the contiguity of tid_ptypes in the loop domain.
// The contiguity of each item in tid_ptypes is defined as follows:
// - The innermost item in tid_ptypes is always contiguous.
// - The item at index i is contiguous if the item at index i+1 is
//   exact (its extent in the loop domain is the same as the parallel
//   dimension size of the kernel).
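// For example, suppose the kernel's parallel dimension sizes are TIDz = 2,
// TIDy = 2, and TIDx = 128, and the loop domain parallelizes TIDz and TIDy
// exactly (extents 2 and 2) but TIDx inexactly (extent 96, as may happen in
// a warp-specialized kernel). Then TIDx is contiguous (it is the innermost
// item), TIDz is contiguous (TIDy is exact), but TIDy is not contiguous
// (TIDx is inexact). TIDy therefore can not be merged with TIDx, and the
// warp can only be represented as split(TIDx, 32).inner.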
std::vector<bool> contiguity;
contiguity.reserve(tid_ptypes.size());
bool prev_exact = true;
for (ParallelType pt : std::views::reverse(tid_ptypes)) {
contiguity.push_back(prev_exact);
// Update prev_exact
if (pdim_map.isExact(pt)) {
// If the parallel dimension map says exact, then all IDs with this
// parallel type have the same extent, so we can skip the equality check
// below.
prev_exact = true;
continue;
}
// If the parallel dimension map does not say exact, then pt could still
// be exact in this loop domain if the corresponding ID's extent is the
// same as the parallel dimension size of the kernel.
Val* pt_extent = pdim_map.getRaw(pt);
auto pt_in_loop_domain_it = std::find_if(
loop_domain.begin(), loop_domain.end(), [pt](IterDomain* id) {
return id->getParallelType() == pt;
});
if (pt_in_loop_domain_it == loop_domain.end()) {
prev_exact = false;
continue;
}
IterDomain* pt_in_loop_domain = *pt_in_loop_domain_it;
Val* extent_in_loop_domain = pt_in_loop_domain->extent();
// If we cannot symbolically prove that the extents are the same, then
// we assume that they are not the same.
prev_exact = simplifyExpr(SimplifyingIrBuilder::eqExpr(
pt_extent, extent_in_loop_domain))
->isTrue();
}
std::reverse(contiguity.begin(), contiguity.end());

// Grab the ValGroup of each parallel type from the loop domain and store it
// in an AbstractTensor.
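// Contiguity is propagated through AbstractTensor transformations as
// follows: a group may only be merged with the group on its inner side when
// it is contiguous, and the merged group takes the inner group's contiguity;
// the outer output of a split is marked contiguous, while the inner output
// keeps the original contiguity. Swizzles are never expected here.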
struct Contiguity {
bool contiguity;
static Contiguity merge(Contiguity x, Contiguity y) {
NVF_ERROR(x.contiguity);
return {y.contiguity};
}
static std::pair<Contiguity, Contiguity> split(Contiguity x) {
return {{true}, x};
}
static std::pair<Contiguity, Contiguity> swizzle(
Contiguity x,
Contiguity y) {
NVF_THROW("Should not reach here");
}
};
AbstractTensorWithInfo<Contiguity> pdims;
for (auto [i, pt] : enumerate(tid_ptypes)) {
auto id_it = std::find_if(
loop_domain.begin(), loop_domain.end(), [pt](IterDomain* id) {
return id->getParallelType() == pt;
});
if (id_it == loop_domain.end()) {
continue;
}
IterDomain* id = *id_it;
const ValGroup& val_group = id_graph.toGroup(id);
pdims.pushBack(
ValGroupAndItsGraph{val_group, &id_graph}, Contiguity{contiguity[i]});
}

// Merge contiguous parallel types
for (int64_t index = 0; index < (int64_t)pdims.size() - 1;) {
if (pdims.info(index).contiguity) {
pdims.merge(index);
} else {
index++;
}
}

// The extent of the innermost merged parallel type must be a multiple of 32,
// otherwise the expr won't be warp-collective.
Val* inner_extent = pdims.back()
.as<ValGroupAndItsGraph>()
.group->front()
->as<IterDomain>()
->extent();
Val* inner_extent_is_multiple_of_32 = SimplifyingIrBuilder::eqExpr(
SimplifyingIrBuilder::modExpr(
inner_extent, IrBuilder::create<Val>(32, DataType::Index)),
fusion->zeroVal());
GpuLower::current()->validate(
inner_extent_is_multiple_of_32,
"Invalid data access pattern in TMem load/store: ",
"TMem load/store must be warp-collective, but the innermost extent is not a multiple of 32.");

// Start pattern matching:
// fail_reasons will be used to store the reasons why the pattern does
@@ -223,11 +385,23 @@
// Pattern match 32x32b
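// The expression uses the 32x32b path when the warp ValGroup constructed
// below (the inner output of splitting the merged parallel types by 32)
// accesses the TMem lane allocation linearly. The stride of that linear
// access must then be 1, i.e. consecutive threads of a warp access
// consecutive lanes; otherwise it is an error.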
if (!matched) {
std::string reason_32x32b = "";
if (true) { // TODO: Implement the pattern matching
AbstractTensorWithInfo<Contiguity> t = pdims;
t.split(-1, 32);
const ValGroup& warp = t.back().as<ValGroupAndItsGraph>().group;
Val* stride = lower_utils::proveLinearAndGetStride(
id_graph, warp, lane_allocation_valgroups);
if (stride == nullptr) {
reason_32x32b =
"Not 32x32b because warps are not linearly accessing the lane allocation.";
fail_reasons.push_back(std::move(reason_32x32b));
} else {
GpuLower::current()->validate(
SimplifyingIrBuilder::eqExpr(stride, fusion->oneVal()),
"Invalid data access pattern in TMem load/store: ",
"Warp linearly accessing lanes, but not with stride 1.");
matched = true;
(*target)[tmem_tv] = TMemRegisterDataPath::Path32x32b;
continue;
}
fail_reasons.push_back(std::move(reason_32x32b));
}
// TODO: Pattern match 16x64b
if (!matched) {
@@ -274,7 +448,7 @@ TensorMemoryInfo computeTMemInfo(Fusion* fusion) {
TensorMemoryInfo result;
result.allocation = computeTMemAlllocationInfo(fusion);
std::tie(result.load_data_path, result.store_data_path) =
computeTMemLdStDataPath(fusion);
computeTMemLdStDataPath(fusion, result.allocation);
return result;
}

14 changes: 7 additions & 7 deletions doc/dev/tmem.md
@@ -485,7 +485,6 @@ With the above restrictions in mind, let's take a look at a few examples of how
NOT to schedule TMem load and store:<!-- */ //-->\
```cpp
TEST_F(TMemTutorialC, NotWarpCollective) {
NOT_IMPLEMENTED
Fusion fusion;
FusionGuard fg(&fusion);

@@ -507,16 +506,17 @@

EXPECT_THAT(
[&]() { KernelExecutor().compile(&fusion); },
::testing::ThrowsMessage<nvfError>(
::testing::HasSubstr("TMem load/store must be warp collective.")));
::testing::ThrowsMessage<nvfError>(::testing::HasSubstr(
"Invalid data access pattern in TMem load/store: "
"TMem load/store must be warp-collective, "
"but the innermost extent is not a multiple of 32.")));
} /*
```
The above example is invalid because there are only 16 threads in the kernel.
Warp collective operations require at least a whole warp to run.<!-- */ //-->\
```cpp
TEST_F(TMemTutorialC, NotContiguous) {
NOT_IMPLEMENTED
Fusion fusion;
FusionGuard fg(&fusion);
@@ -540,7 +540,8 @@ TEST_F(TMemTutorialC, NotContiguous) {
EXPECT_THAT(
[&]() { KernelExecutor().compile(&fusion); },
::testing::ThrowsMessage<nvfError>(::testing::HasSubstr(
"Invalid data access pattern in TMem load/store.")));
"Invalid data access pattern in TMem load/store: "
"Warp linearly accessing lanes, but not with stride 1.")));
} /*
```

@@ -552,7 +553,6 @@ patterns requires the warp to access a contiguous 32 or 16 lanes of data
.<!-- */ //-->\
```cpp
TEST_F(TMemTutorialC, OneLane) {
NOT_IMPLEMENTED
Fusion fusion;
FusionGuard fg(&fusion);

Expand All @@ -576,7 +576,7 @@ TEST_F(TMemTutorialC, OneLane) {
EXPECT_THAT(
[&]() { KernelExecutor().compile(&fusion); },
::testing::ThrowsMessage<nvfError>(::testing::HasSubstr(
"Invalid data access pattern in TMem load/store.")));
"Invalid data access pattern in TMem load/store:")));
} /*
```
