Check that warps are only accessing the subpartition of TMem that they can access #4016

Open · wants to merge 6 commits into base: tmem-pm
23 changes: 22 additions & 1 deletion csrc/device_lower/analysis/tensor_memory.cpp
@@ -437,7 +437,28 @@ computeTMemLdStDataPath(Fusion* fusion, const TMemAlllocationInfo& allocation) {
       }
       NVF_THROW(error.str());
     }
-    // TODO: Validate that we are accessing the correct sub-partition
+    // Validate that warps are accessing the correct sub-partition
+    AbstractTensorWithInfo<Contiguity> t = pdims;
+    t.split(-1, 32);
+    t.split(-2, 4);
+    Val* warp_group_stride = lower_utils::proveLinearAndGetStride(
+        id_graph,
+        t[-2].as<ValGroupAndItsGraph>().group,
+        lane_allocation_valgroups);
+    NVF_ERROR(
+        warp_group_stride != nullptr,
+        "Invalid data access pattern in TMem load/store: ",
+        "Warps are not accessing the correct sub-partition.");
+    // The stride must be either 0 or 32; 32 is the most common case.
+    // 0 is a special value indicating that there is only one warp.
+    GpuLower::current()->validate(
+        SimplifyingIrBuilder::logicalOrExpr(
+            SimplifyingIrBuilder::eqExpr(
+                warp_group_stride, IrBuilder::create<Val>(32)),
+            SimplifyingIrBuilder::eqExpr(
+                warp_group_stride, IrBuilder::create<Val>(0))),
+        "Invalid data access pattern in TMem load/store: ",
+        "Warps are not accessing the correct sub-partition.");
   }
   return {std::move(load_data_path), std::move(store_data_path)};
 }
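For intuition, here is a minimal standalone sketch of the invariant this new check enforces. This is hypothetical illustration code, not part of nvFuser: after the lane dimension is split into (warp-in-group, lane-in-warp) = (4, 32), proving that the warp-in-group dimension has stride 32 (or 0, when only one warp participates) over the lane allocation domain means warp w of a warp group touches only lanes [32w, 32w + 32), i.e. its own TMem sub-partition.

```cpp
// Standalone illustration (not nvFuser code) of the sub-partition rule:
// with a warp-group stride of 32, warp w of a warp group reaches only
// TMem lanes [32*w, 32*w + 32), i.e. its own sub-partition.
#include <cassert>
#include <cstdint>

// Lane reached by thread `tid` of a 128-thread warp group, given the
// stride of the warp-in-group dimension over the lane allocation domain.
int64_t laneOf(int64_t tid, int64_t warp_group_stride) {
  int64_t warp_in_group = (tid / 32) % 4; // 4 warps per warp group
  int64_t lane_in_warp = tid % 32;        // 32 lanes per warp
  return warp_in_group * warp_group_stride + lane_in_warp;
}

int main() {
  for (int64_t tid = 0; tid < 128; ++tid) {
    // Stride 32: each warp stays inside its own 32-lane sub-partition.
    assert(laneOf(tid, 32) / 32 == (tid / 32) % 4);
    // Stride 0 only makes sense when a single warp is active; then all
    // accesses fall into sub-partition 0.
    if (tid < 32) {
      assert(laneOf(tid, 0) / 32 == 0);
    }
  }
  return 0;
}
```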
9 changes: 4 additions & 5 deletions csrc/device_lower/utils.cpp
@zasdfgbnm (Collaborator, Author) commented on Mar 6, 2025:

This file is updated to handle an edge case where we have transformations like:

split(1, 4) -> (1, 4)

For this case, we still consider it linear because only 1 of the 4 elements is the valid part, and by definition, an extent-1 domain is always linear w.r.t. anything.
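A worked toy example of this edge case may help (a standalone sketch of my own, not nvFuser code): splitting an extent-1 domain by 4 yields extents (1, 4), but only the single point (outer=0, inner=0) is in bounds, so the valid part has extent 1 and is trivially linear. On this reading, a fully cancelled projection can now become an empty composition, whose extent is the empty product 1 and whose stride is reported as zeroVal(), which is also why the NVF_ERROR(!dq.empty()) assertions below are dropped.

```cpp
// Toy illustration (not nvFuser code) of split(1, 4) -> (1, 4): only one
// of the 4 inner positions is in bounds, so the valid part has extent 1,
// and an extent-1 domain is linear w.r.t. anything (its index is always 0).
#include <cassert>
#include <cstdint>

int main() {
  const int64_t original_extent = 1;
  const int64_t factor = 4;
  const int64_t outer_extent = (original_extent + factor - 1) / factor; // 1
  int64_t valid = 0;
  for (int64_t outer = 0; outer < outer_extent; ++outer) {
    for (int64_t inner = 0; inner < factor; ++inner) {
      if (outer * factor + inner < original_extent) {
        ++valid; // only (outer=0, inner=0) survives the bound check
      }
    }
  }
  assert(valid == 1); // the valid part has extent 1 -> trivially linear
  return 0;
}
```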

@@ -1298,7 +1298,7 @@ Val* extent(const Composition<Projection>& comp) {
   return std::accumulate(
       comp.begin(),
       comp.end(),
-      static_cast<Val*>(nullptr),
+      FusionGuard::getCurFusion()->oneVal(),
       [](Val* acc, const auto& g) {
         return SimplifyingIrBuilder::mulExpr(acc, extent(g));
       });
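A hedged aside on why the seed changes from nullptr to oneVal() (my reading of the diff, sketched with plain integers rather than nvFuser Vals): the extent of a composition is the product of its parts' extents, and seeding the fold with the multiplicative identity 1 makes the empty composition report extent 1 instead of a null value.

```cpp
// Plain-integer sketch (not nvFuser code) of the accumulate-seed change:
// seeding with 1, the multiplicative identity, gives an empty composition
// the conventional empty-product extent of 1.
#include <cassert>
#include <cstdint>
#include <numeric>
#include <vector>

int64_t extent(const std::vector<int64_t>& part_extents) {
  return std::accumulate(
      part_extents.begin(),
      part_extents.end(),
      int64_t{1}, // analogous to FusionGuard::getCurFusion()->oneVal()
      [](int64_t acc, int64_t e) { return acc * e; });
}

int main() {
  assert(extent({}) == 1);      // empty composition: extent is now defined
  assert(extent({4, 8}) == 32); // non-empty case is unchanged
  return 0;
}
```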
@@ -1483,8 +1483,6 @@ Projection propagate(
     const ExprGroup& eg,
     Direction direction) {
   // Just recursively propagate subtree.
-  auto from = fromGroups(id_graph, eg, direction);
-  auto to = toGroups(id_graph, eg, direction);
   auto propagated = propagate(*part.what, id_graph, eg, direction);
   if (!propagated.hasValue()) {
     return {};
@@ -1618,6 +1616,9 @@ Val* proveLinearAndGetStrideAfterPropagation(
 Val* proveLinearAndGetStrideAfterPropagation(
     const Composition<Projection>& comp,
     const ValGroups& domain) {
+  if (comp.empty()) {
+    return FusionGuard::getCurFusion()->zeroVal();
+  }
   auto it = search(domain, comp);
   if (it == domain.end()) {
     return nullptr;
@@ -1717,7 +1718,6 @@ PartOf<Projection> cancelCommonFactors(const PartOf<Projection>& part) {
   if (new_inner_extent->isOne()) {
     new_inner_extent = nullptr;
   }
-  NVF_ERROR(!dq.empty());
   if (dq.size() == 1) {
     return PartOf<Projection>{
         std::make_shared<Projection>(dq.front()),
@@ -1806,7 +1806,6 @@ PartOf<Projection> trimRedundant(const PartOf<Projection>& part) {
   while (count < (int64_t)dq.size()) {
     dq.pop_front();
   }
-  NVF_ERROR(!dq.empty());
   if (dq.size() == 1) {
     return PartOf<Projection>{
         std::make_shared<Projection>(dq.front()),
8 changes: 4 additions & 4 deletions doc/dev/tmem.md
@@ -586,7 +586,6 @@ columns of the tensor memory, while all the specified patterns require the warp
 to access contiguous 32 or 16 lanes of data.<!-- */ //-->\
 ```cpp
 TEST_F(TMemTutorialC, WrongSubpartition) {
-  NOT_IMPLEMENTED
   Fusion fusion;
   FusionGuard fg(&fusion);
 
@@ -610,7 +609,8 @@ TEST_F(TMemTutorialC, WrongSubpartition) {
   EXPECT_THAT(
       [&]() { KernelExecutor().compile(&fusion); },
       ::testing::ThrowsMessage<nvfError>(::testing::HasSubstr(
-          "Invalid data access pattern in TMem load/store.")));
+          "Invalid data access pattern in TMem load/store: "
+          "Warps are not accessing the correct sub-partition.")));
 } /*
 ```
 
@@ -621,7 +621,6 @@ However, warp 0 can only access subpartition 0, and warp 1 can only access
 subpartition 1.<!-- */ //-->\
 ```cpp
 TEST_F(TMemTutorialC, WrongSubpartition2) {
-  NOT_IMPLEMENTED
   Fusion fusion;
   FusionGuard fg(&fusion);
 
@@ -645,7 +644,8 @@ TEST_F(TMemTutorialC, WrongSubpartition2) {
   EXPECT_THAT(
       [&]() { KernelExecutor().compile(&fusion); },
       ::testing::ThrowsMessage<nvfError>(::testing::HasSubstr(
-          "Invalid data access pattern in TMem load/store.")));
+          "Invalid data access pattern in TMem load/store: "
+          "Warps are not accessing the correct sub-partition.")));
 } /*
 ```
 