diff --git a/lib/Dialect/TritonGPU/Transforms/Pipeliner/MatmulLoopPipeline.cpp b/lib/Dialect/TritonGPU/Transforms/Pipeliner/MatmulLoopPipeline.cpp
index e946735e2374..24d7102281ee 100644
--- a/lib/Dialect/TritonGPU/Transforms/Pipeliner/MatmulLoopPipeline.cpp
+++ b/lib/Dialect/TritonGPU/Transforms/Pipeliner/MatmulLoopPipeline.cpp
@@ -450,37 +450,6 @@ assignMemoryLayouts(llvm::SmallVector<std::tuple<Operation *, int, Operation *>>
         // If we can't agree on a shared encoding skip pipelinig the load.
         if (incompatible)
           continue;
-
-        // HACK: Triton LLVM codegen has a bug where local_loads from #shared to
-        // #mma layout can lead to invalid code if the loaded shape is smaller
-        // than the mma tile (e.g. loading a 128x1 tensor for an MMAv2 dot with
-        // tile {16,8} is bad because 1 < 8). To work around this, don't
-        // pipeline such loads.
-        //
-        // The codegen bug is caught by an assertion, so if you think you've
-        // fixed it, feel free to delete this code and see if the assert still
-        // fails. :)
-        if (!loadInfo.sharedEncoding) {
-          if (auto dotEnc = dyn_cast<ttg::NvidiaMmaEncodingAttr>(
-                  dot.getResult().getType().getEncoding())) {
-            auto loadTy = cast<RankedTensorType>(op->getResultTypes()[0]);
-            auto mmaInstrShape = dotEnc.getInstrShape();
-            if (loadTy.getRank() < mmaInstrShape.size())
-              continue;
-            bool ok = true;
-            for (int i = 0; i < mmaInstrShape.size(); i++) {
-              if (loadTy.getShape()[loadTy.getRank() - mmaInstrShape.size() +
-                                    i] < mmaInstrShape[i]) {
-                ok = false;
-                break;
-              }
-            }
-            // If this load might trigger the bug, don't do the fallback logic
-            // below, which might allow the load to be pipelined.
-            if (!ok)
-              continue;
-          }
-        }
       }
     } else if (auto loadOp = dyn_cast<tt::LoadOp>(use)) {
       // The use of this loadOp is another loadOp. If the use is not in the
diff --git a/test/TritonGPU/loop-pipeline.mlir b/test/TritonGPU/loop-pipeline.mlir
index 3d215a635da3..028f14d4f369 100644
--- a/test/TritonGPU/loop-pipeline.mlir
+++ b/test/TritonGPU/loop-pipeline.mlir
@@ -1450,7 +1450,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 :
 // -----
 
 // COMMON-LABEL: @dont_pipeline_128x1
-// COMMON-NOT: local_load{{.*}}128x1
+// AMD-NOT: local_load{{.*}}128x1
+// CHECK: local_load{{.*}}128x1
 #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}>
 #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}>
 module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} {