diff --git a/lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp
index b090670d955c..d1c92bc80345 100644
--- a/lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp
+++ b/lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp
@@ -22,10 +22,6 @@ void lowerDistributedToShared(
     std::pair<size_t, Type> *const llvmOpCount = nullptr) {
   auto srcTy = cast<RankedTensorType>(src.getType());
   auto dstTy = cast<MemDescType>(dst.getType());
-  auto outOrd = mlir::cast<SharedEncodingAttr>(dstTy.getEncoding()).getOrder();
-  assert(srcTy.getShape().size() <= 2 ||
-         (srcTy.getShape().size() == 3 && outOrd[2] == 0) &&
-             "Unexpected rank of ConvertLayout(blocked->shared)");
   auto elemTy = typeConverter->convertType(srcTy.getElementType());
 
   auto smemBase = smemObj.getBase();
diff --git a/lib/Dialect/TritonGPU/Transforms/Utility.cpp b/lib/Dialect/TritonGPU/Transforms/Utility.cpp
index fa8ec2b926eb..3ae8e8bbcde3 100644
--- a/lib/Dialect/TritonGPU/Transforms/Utility.cpp
+++ b/lib/Dialect/TritonGPU/Transforms/Utility.cpp
@@ -563,7 +563,8 @@ bool canFoldIntoConversion(Operation *op, Attribute targetEncoding) {
   }
 
   return isa<triton::gpu::ConvertLayoutOp, arith::ConstantOp,
-             triton::gpu::LocalAllocOp, triton::gpu::LocalLoadOp>(op);
+             triton::gpu::LocalAllocOp, triton::gpu::LocalLoadOp,
+             triton::gpu::LocalStoreOp>(op);
 }
 
 scf::ForOp replaceForOpWithNewSignature(
diff --git a/test/TritonGPU/combine.mlir b/test/TritonGPU/combine.mlir
index 2ec11a24f197..b78754a6b878 100644
--- a/test/TritonGPU/combine.mlir
+++ b/test/TritonGPU/combine.mlir
@@ -2685,3 +2685,19 @@ module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-war
     tt.return
   }
 }
+
+// -----
+
+#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1, 1, 1, 4], threadsPerWarp = [2, 1, 16, 1, 1], warpsPerCTA = [1, 1, 2, 2, 1], order = [4, 0, 1, 2, 3]}>
+#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1, 1, 1, 4], threadsPerWarp = [1, 1, 32, 1, 1], warpsPerCTA = [1, 1, 1, 1, 4], order = [4, 3, 2, 1, 0]}>
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1, 1, 1, 4], threadsPerWarp = [2, 1, 16, 1, 1], warpsPerCTA = [1, 2, 2, 1, 1], order = [4, 0, 3, 2, 1]}>
+#shared = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [4, 0, 1, 2, 3], hasLeadingOffset = false}>
+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:100", "triton_gpu.threads-per-warp" = 32 : i32} {
+  // CHECK-NOT: convert_layout
+  tt.func public @lift_convert_to_local_load(%arg0 : !tt.memdesc<2x1x32x4x4xi8, #shared, #triton_gpu.shared_memory, mutable>) -> tensor<2x4x32x1x4xi8, #blocked2> {
+    %1 = triton_gpu.local_load %arg0 : !tt.memdesc<2x1x32x4x4xi8, #shared, #triton_gpu.shared_memory, mutable> -> tensor<2x1x32x4x4xi8, #blocked>
+    %2 = tt.trans %1 {order = array<i32: 0, 3, 2, 1, 4>} : tensor<2x1x32x4x4xi8, #blocked> -> tensor<2x4x32x1x4xi8, #blocked1>
+    %3 = triton_gpu.convert_layout %2 : tensor<2x4x32x1x4xi8, #blocked1> -> tensor<2x4x32x1x4xi8, #blocked2>
+    tt.return %3 : tensor<2x4x32x1x4xi8, #blocked2>
+  }
+}
diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/LoadStoreOpToLLVM.cpp
index cc52507121b5..91887c68df22 100644
--- a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/LoadStoreOpToLLVM.cpp
+++ b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/LoadStoreOpToLLVM.cpp
@@ -913,9 +913,6 @@ struct AsyncCopyGlobalToLocalOpConversion
     assert((isa<BlockedEncodingAttr, SliceEncodingAttr>(srcLayout) &&
             "Unexpected srcLayout in AsyncCopyGlobalToLocalOpConversion"));
     auto resSharedLayout = cast<SharedEncodingAttr>(dstTy.getEncoding());
-    auto srcShape = srcTy.getShape();
-    assert((srcShape.size() <= 3) &&
-           "insert_slice_async: Unexpected rank of %src");
 
     Value llDst = adaptor.getResult();
     Value llSrc = adaptor.getSrc();