From 54d18fec7d79a42fa8ff79dd11dd89e384857d59 Mon Sep 17 00:00:00 2001
From: Kunwar Grover
Date: Tue, 6 Aug 2024 15:06:38 +0000
Subject: [PATCH 1/5] [VectorDistribution] Configure contraction layouts at
 linalg level

---
 .../Common/GPU/AMDGPUDistributeContract.cpp   |  11 +-
 .../Codegen/Common/GPU/GPUVectorAlloc.cpp     |  62 ++---
 .../Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp   |  13 +-
 .../Codegen/Dialect/GPU/IR/IREEGPUAttrs.h     |   1 +
 .../Codegen/Dialect/GPU/IR/IREEGPUAttrs.td    |   2 +-
 .../Transforms/VectorizeIREEVectorExtOps.cpp  |   2 +
 .../LLVMGPU/AMDGPUChainedMatmulPass.cpp       |  12 +-
 .../iree/compiler/Codegen/LLVMGPU/BUILD.bazel |   2 +
 .../compiler/Codegen/LLVMGPU/CMakeLists.txt   |   2 +
 .../LLVMGPU/LLVMGPUCastTypeToFitMMA.cpp       |   9 +-
 .../LLVMGPU/LLVMGPUConfigureTensorLayouts.cpp | 145 ++++++++++
 .../LLVMGPU/LLVMGPUConfigureVectorLayouts.cpp |  66 +----
 .../iree/compiler/Codegen/LLVMGPU/Passes.cpp  |  11 +-
 .../iree/compiler/Codegen/LLVMGPU/Passes.td   |   5 +
 .../Utils/AMDGPUDistributionPatterns.cpp      |   8 +-
 .../compiler/Codegen/LLVMGPU/test/BUILD.bazel |   1 +
 .../Codegen/LLVMGPU/test/CMakeLists.txt       |   1 +
 .../LLVMGPU/test/cast_type_to_fit_mma.mlir    |  11 +-
 .../LLVMGPU/test/configure_tensor_layout.mlir | 152 +++++++++++
 .../LLVMGPU/test/configure_vector_layout.mlir | 255 ------------------
 .../compiler/Codegen/Utils/VectorOpUtils.cpp  |  35 ++-
 .../compiler/Codegen/Utils/VectorOpUtils.h    |   4 +-
 22 files changed, 427 insertions(+), 383 deletions(-)
 create mode 100644 compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureTensorLayouts.cpp
 create mode 100644 compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_tensor_layout.mlir

diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/AMDGPUDistributeContract.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/AMDGPUDistributeContract.cpp
index a52ead0f5f0f..362f0c1694e5 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/AMDGPUDistributeContract.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/AMDGPUDistributeContract.cpp
@@ -28,6 +28,14 @@ struct DistributeContract final : OpDistributionPattern<vector::ContractionOp> {
   LogicalResult matchAndRewrite(vector::ContractionOp contractOp,
                                 DistributionSignature &signature,
                                 PatternRewriter &rewriter) const override {
+    // Infer the contract kind so that we know how to correlate M/N/K dims.
+    auto maybeOpDetail = VectorContractOpInfo::inferFromIndexingMaps(
+        contractOp.getIndexingMapsArray());
+    if (failed(maybeOpDetail)) {
+      return rewriter.notifyMatchFailure(contractOp, "invalid contraction");
+    }
+    VectorContractOpInfo opDetail = maybeOpDetail.value();
+
     auto resultType = dyn_cast<VectorType>(contractOp.getResultType());
     if (!resultType) {
       return rewriter.notifyMatchFailure(
@@ -65,9 +73,6 @@ struct DistributeContract final : OpDistributionPattern<vector::ContractionOp> {
           contractOp, "missing iree.amdgpu.mma intrinsic attribute");
     }
 
-    // Infer the contract kind so that we know know to correlate M/N/K dims.
- VectorContractOpInfo opDetail(contractOp); - SmallVector distShape = resultLayout.getDistributedShape(); LLVM_DEBUG({ llvm::dbgs() << "distributed shape: ["; diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUVectorAlloc.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUVectorAlloc.cpp index 127b436c8885..3a915c9d11a4 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUVectorAlloc.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUVectorAlloc.cpp @@ -5,6 +5,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include "iree/compiler/Codegen/Common/GPU/Passes.h" +#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.h" +#include "iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtDialect.h" #include "iree/compiler/Codegen/Utils/GPUUtils.h" #include "iree/compiler/Codegen/Utils/LinalgOpInfo.h" #include "llvm/Support/Debug.h" @@ -80,12 +82,7 @@ static FailureOr allocateTensorForVector(OpBuilder &b, Location loc, Value copied = b.create(loc, vector, allocTensorOp, indices, inBounds) .getResult(); - // Create a marker for bufferization to keep this tensor in place. This - // prevents read/write forwarding of the transfers used to do the copy. - return b - .create(copied.getLoc(), - copied, copied) - ->getResult(0); + return copied; } static Value readVectorFromTensor(OpBuilder &b, VectorType vectorType, @@ -104,46 +101,45 @@ struct GPUVectorAllocPass final void runOnOperation() override { FunctionOpInterface funcOp = getOperation(); - SmallVector opsToPromote; - funcOp.walk([&](vector::ContractionOp op) { - // Today we only do promotion for certain contractions. - if (contractOpFilter(op)) + SmallVector opsToPromote; + funcOp.walk([&](IREE::VectorExt::ToLayoutOp op) { + if (op->hasAttr("shared_memory_conversion")) { opsToPromote.push_back(op); + } }); - for (vector::ContractionOp contractOp : opsToPromote) { - OpBuilder builder(contractOp); + + for (IREE::VectorExt::ToLayoutOp op : opsToPromote) { + OpBuilder builder(op); // HACK: Until proper barrier placement is handled later we have to // synchronize explicitly in this pass. // Synchronize before the write to shared memory to avoid stepping over - // reads in the previous iteration of a loop. - builder.create(contractOp->getLoc()); + // reads in the previous iteration of a loop. We set this barrier + // at the start of this block. + builder.setInsertionPointToStart(op->getBlock()); + builder.create(op->getLoc()); // Promote both of the input operands, excluding the accumulator. - OpOperand &lhs = contractOp.getLhsMutable(); - FailureOr lhsRet = - allocateTensorForVector(builder, contractOp->getLoc(), lhs.get()); - if (failed(lhsRet)) { - return signalPassFailure(); - } - - OpOperand &rhs = contractOp.getRhsMutable(); - FailureOr rhsRet = - allocateTensorForVector(builder, contractOp->getLoc(), rhs.get()); - if (failed(rhsRet)) { + builder.setInsertionPoint(op); + OpOperand &operand = op.getInputMutable(); + FailureOr ret = + allocateTensorForVector(builder, op->getLoc(), operand.get()); + if (failed(ret)) { return signalPassFailure(); } // Synchronize after the write to shared memory before we read from it. 
- builder.create(contractOp->getLoc()); - - Value lhsVec = - readVectorFromTensor(builder, contractOp.getLhsType(), *lhsRet); - Value rhsVec = - readVectorFromTensor(builder, contractOp.getRhsType(), *rhsRet); - lhs.set(lhsVec); - rhs.set(rhsVec); + auto synced = + builder.create(op->getLoc(), *ret); + + VectorType inputTy = cast(op.getType()); + Value read = readVectorFromTensor(builder, inputTy, synced.getResult(0)); + operand.set(read); + + // Remove the shared_memory_conversion attribute from the to_layout + // operation. + op->removeAttr("shared_memory_conversion"); } } }; diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp index 831bdfe1ac1c..56558603b820 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp @@ -977,8 +977,14 @@ NestedLayoutAttr createNestedLayout(MLIRContext *context, int64_t rank, FailureOr> -MMAScheduleAttr::getContractionLayout(vector::ContractionOp contractOp) const { - VectorContractOpInfo opInfo(contractOp); +MMAScheduleAttr::getContractionLayout(linalg::GenericOp contractOp) const { + auto maybeOpInfo = VectorContractOpInfo::inferFromIndexingMaps( + contractOp.getIndexingMapsArray()); + if (failed(maybeOpInfo)) { + return failure(); + } + VectorContractOpInfo opInfo = maybeOpInfo.value(); + LLVM_DEBUG({ llvm::errs() << "Getting mma layouts for:\n" << contractOp << "\n"; llvm::errs() << "For schedule: " << *this << "\n"; @@ -991,8 +997,7 @@ MMAScheduleAttr::getContractionLayout(vector::ContractionOp contractOp) const { auto mmaAttr = llvm::cast(getIntrinsic()); MLIRContext *context = getContext(); - SmallVector bounds; - contractOp.getIterationBounds(bounds); + SmallVector bounds = contractOp.getStaticLoopRanges(); int64_t batchCount = opInfo.getBatchCount(); if (batchCount == 1 && bounds[0] != 1) { LLVM_DEBUG({ llvm::errs() << "non-unit batch dimension\n"; }); diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h index 295aaf0158c1..8e6a9e15a716 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h @@ -11,6 +11,7 @@ #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.h" #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.h" #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUInterfaces.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h" #include "mlir/Dialect/Utils/StructuredOpsUtils.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td index 8222117213b5..00829c087742 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td @@ -251,7 +251,7 @@ def IREEGPU_MmaScheduleAttr : AttrDef { ::mlir::FailureOr<::std::tuple> - getContractionLayout(::mlir::vector::ContractionOp contractOp) const; + getContractionLayout(::mlir::linalg::GenericOp contractOp) const; }]; } diff --git a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/Transforms/VectorizeIREEVectorExtOps.cpp b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/Transforms/VectorizeIREEVectorExtOps.cpp index 
84cb05260830..00c664c932a1 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/Transforms/VectorizeIREEVectorExtOps.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/Transforms/VectorizeIREEVectorExtOps.cpp @@ -49,6 +49,8 @@ struct VectorizeToLayoutOpPattern final // Create the toLayout operation but with vector types instead. auto newLayoutOp = rewriter.create( loc, newInput.getType(), newInput, toLayoutOp.getLayout()); + // Set attributes. + newLayoutOp->setAttrs(toLayoutOp->getAttrs()); // Create the write back to a tensor. int64_t rank = inputTy.getRank(); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/AMDGPUChainedMatmulPass.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/AMDGPUChainedMatmulPass.cpp index 6a1646e5c4ac..00c5028896cf 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/AMDGPUChainedMatmulPass.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/AMDGPUChainedMatmulPass.cpp @@ -69,6 +69,14 @@ struct AMDGPUPrepareForChainedMatmulPass final registry.insert(); } + VectorContractOpInfo getOpInfo(vector::ContractionOp contract) const { + auto maybeOpInfo = VectorContractOpInfo::inferFromIndexingMaps( + contract.getIndexingMapsArray()); + assert(succeeded(maybeOpInfo) && + "contraction info for vector.contract should always be valid"); + return maybeOpInfo.value(); + } + VectorValue swapDims(RewriterBase &rewriter, VectorValue val, int64_t dimA, int64_t dimB) const { ArrayRef shape = val.getType().getShape(); @@ -109,7 +117,7 @@ struct AMDGPUPrepareForChainedMatmulPass final /// simply swap the operands without transposing them. void swapOperandsAndTranspose(RewriterBase &rewriter, vector::ContractionOp contractOp) const { - VectorContractOpInfo opInfo(contractOp); + VectorContractOpInfo opInfo = getOpInfo(contractOp); auto [lhsM, rhsN] = opInfo.getOperandMNIndex(); auto [lhsK, rhsK] = opInfo.getOperandKIndex(); auto [accM, accN] = opInfo.getResultMNIndex(); @@ -177,7 +185,7 @@ struct AMDGPUPrepareForChainedMatmulPass final bool isOperandSwapInvariant(vector::ContractionOp contractOp) const { // Check if the innermost m, n, k dimensions are in the order: // lhs: (m, k), rhs: (n, k) - VectorContractOpInfo opInfo(contractOp); + VectorContractOpInfo opInfo = getOpInfo(contractOp); auto [lhsM, rhsN] = opInfo.getOperandMNIndex(); auto [lhsK, rhsK] = opInfo.getOperandKIndex(); bool isLhsTransposed = lhsM > lhsK; diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel index 35bafc739f14..c6fefa699e2d 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel @@ -93,6 +93,7 @@ iree_compiler_cc_library( "KernelConfig.cpp", "LLVMGPUCastAddressSpaceFunction.cpp", "LLVMGPUCastTypeToFitMMA.cpp", + "LLVMGPUConfigureTensorLayouts.cpp", "LLVMGPUConfigureVectorLayouts.cpp", "LLVMGPULowerExecutableTarget.cpp", "LLVMGPUPackSharedMemoryAlloc.cpp", @@ -133,6 +134,7 @@ iree_compiler_cc_library( "//compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils:ConfigUtils", "//compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms:GPUTransforms", "//compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR:IREEVectorExtDialect", + "//compiler/src/iree/compiler/Codegen/Dialect/VectorExt/Transforms:VectorExtTransforms", "//compiler/src/iree/compiler/Codegen/Interfaces:PartitionableLoopsInterface", "//compiler/src/iree/compiler/Codegen/Interfaces:UKernelOpInterface", 
"//compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions:LLVMGPUExtensions", diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt index 0f8b40b34bc5..aaee76a02ea3 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt @@ -78,6 +78,7 @@ iree_cc_library( "KernelConfig.cpp" "LLVMGPUCastAddressSpaceFunction.cpp" "LLVMGPUCastTypeToFitMMA.cpp" + "LLVMGPUConfigureTensorLayouts.cpp" "LLVMGPUConfigureVectorLayouts.cpp" "LLVMGPULowerExecutableTarget.cpp" "LLVMGPUPackSharedMemoryAlloc.cpp" @@ -175,6 +176,7 @@ iree_cc_library( iree::compiler::Codegen::Dialect::GPU::TargetUtils::ConfigUtils iree::compiler::Codegen::Dialect::GPU::Transforms::GPUTransforms iree::compiler::Codegen::Dialect::VectorExt::IR::IREEVectorExtDialect + iree::compiler::Codegen::Dialect::VectorExt::Transforms::VectorExtTransforms iree::compiler::Codegen::Interfaces::PartitionableLoopsInterface iree::compiler::Codegen::Interfaces::UKernelOpInterface iree::compiler::Codegen::LLVMGPU::TransformExtensions::LLVMGPUExtensions diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUCastTypeToFitMMA.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUCastTypeToFitMMA.cpp index 013745ef072e..359c6ffa0fcd 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUCastTypeToFitMMA.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUCastTypeToFitMMA.cpp @@ -30,7 +30,12 @@ struct UpcastContractOutput final : OpRewritePattern { LogicalResult matchAndRewrite(vector::ContractionOp contractOp, PatternRewriter &rewriter) const override { - VectorContractOpInfo opInfo(contractOp); + auto maybeOpInfo = VectorContractOpInfo::inferFromIndexingMaps( + contractOp.getIndexingMapsArray()); + if (failed(maybeOpInfo)) { + return rewriter.notifyMatchFailure(contractOp, "not a contraction"); + } + VectorContractOpInfo opInfo = maybeOpInfo.value(); auto srcCType = dyn_cast(contractOp.getAccType()); if (!srcCType) { @@ -68,6 +73,8 @@ struct UpcastContractOutput final : OpRewritePattern { auto newContractOp = rewriter.create( loc, contractOp.getLhs(), contractOp.getRhs(), extOp, contractOp.getIndexingMaps(), contractOp.getIteratorTypes()); + newContractOp->setDiscardableAttrs( + contractOp->getDiscardableAttrDictionary()); rewriter.replaceOpWithNewOp(contractOp, srcCType, newContractOp); return success(); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureTensorLayouts.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureTensorLayouts.cpp new file mode 100644 index 000000000000..6becfe281fbf --- /dev/null +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureTensorLayouts.cpp @@ -0,0 +1,145 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
+#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
+#include "iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtDialect.h"
+#include "iree/compiler/Codegen/LLVMGPU/Passes.h"
+#include "mlir/Analysis/SliceAnalysis.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/TypeUtilities.h"
+
+#define DEBUG_TYPE "iree-llvmgpu-configure-vector-layouts"
+
+namespace mlir::iree_compiler {
+
+#define GEN_PASS_DEF_LLVMGPUCONFIGURETENSORLAYOUTSPASS
+#include "iree/compiler/Codegen/LLVMGPU/Passes.h.inc"
+
+namespace {
+
+LogicalResult setContractionAnchor(IREE::GPU::MMAScheduleAttr schedule,
+                                   RewriterBase &rewriter,
+                                   linalg::GenericOp contract) {
+  // TODO: Add SIMT fallback.
+  if (!schedule) {
+    return contract->emitError("missing mma schedule for contraction");
+  }
+
+  // This function should only be called on a contraction op.
+  assert(linalg::isaContractionOpInterface(contract) &&
+         "cannot set contraction anchor on non contraction op");
+
+  auto layouts = schedule.getContractionLayout(contract);
+  if (failed(layouts)) {
+    return contract->emitError("cannot get concrete layout for contraction");
+  }
+
+  auto [aLayout, bLayout, cLayout] = *layouts;
+  Location loc = contract.getLoc();
+
+  Value lhs = contract.getOperand(0);
+  Value rhs = contract.getOperand(1);
+  Value acc = contract.getOperand(2);
+
+  // Set layouts for lhs, rhs and acc.
+  rewriter.setInsertionPoint(contract);
+  auto layoutedLhs = rewriter.create<IREE::VectorExt::ToLayoutOp>(
+      loc, lhs.getType(), lhs, aLayout);
+  auto layoutedRhs = rewriter.create<IREE::VectorExt::ToLayoutOp>(
+      loc, rhs.getType(), rhs, bLayout);
+  auto layoutedAcc = rewriter.create<IREE::VectorExt::ToLayoutOp>(
+      loc, acc.getType(), acc, cLayout);
+
+  // Promote matmul lhs and rhs.
+  // TODO: We should read this from the lowering_config on the operation.
+  // TODO: This is a hack until layout analysis is improved. The layout analysis
+  // should decide where to put these shared memory conversions.
+  layoutedLhs->setAttr("shared_memory_conversion", rewriter.getUnitAttr());
+  layoutedRhs->setAttr("shared_memory_conversion", rewriter.getUnitAttr());
+
+  contract->setOperand(0, layoutedLhs.getResult());
+  contract->setOperand(1, layoutedRhs.getResult());
+  contract->setOperand(2, layoutedAcc.getResult());
+
+  // Set layout for result.
+  rewriter.setInsertionPointAfter(contract);
+  auto toLayout = rewriter.create<IREE::VectorExt::ToLayoutOp>(
+      loc, contract.getResult(0).getType(), contract.getResult(0), cLayout);
+  rewriter.replaceAllUsesExcept(contract.getResult(0), toLayout.getResult(),
+                                toLayout);
+
+  return success();
+}
+
+struct LLVMGPUConfigureTensorLayoutsPass final
+    : impl::LLVMGPUConfigureTensorLayoutsPassBase<
+          LLVMGPUConfigureTensorLayoutsPass> {
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<IREE::VectorExt::IREEVectorExtDialect>();
+    registry.insert<vector::VectorDialect>();
+  }
+
+  void runOnOperation() override {
+    auto func = getOperation();
+
+    std::array<int64_t, 3> workgroupSize;
+    if (func->hasAttr("workgroup_size")) {
+      auto tmpSizes =
+          llvm::cast<ArrayAttr>(func->getAttr("workgroup_size")).getValue();
+      for (auto [i, size] : llvm::enumerate(tmpSizes)) {
+        workgroupSize[i] = llvm::cast<IntegerAttr>(size).getInt();
+      }
+    } else {
+      std::optional<SmallVector<int64_t>> maybeWorkgroupSize =
+          getWorkgroupSize(func);
+      if (!maybeWorkgroupSize) {
+        func->emitOpError()
+            << "unable to query workgroup_size information from entry point";
+        return signalPassFailure();
+      }
+      for (auto [index, value] : llvm::enumerate(maybeWorkgroupSize.value())) {
+        workgroupSize[index] = value;
+      }
+      for (auto index : llvm::seq<size_t>(maybeWorkgroupSize->size(), 3)) {
+        workgroupSize[index] = 1;
+      }
+    }
+
+    llvm::StringLiteral scheduleAttrName =
+        IREE::GPU::MMAScheduleAttr::getMnemonic();
+    auto scheduleAttr =
+        func->getAttrOfType<IREE::GPU::MMAScheduleAttr>(scheduleAttrName);
+    if (!scheduleAttr) {
+      DictionaryAttr configDict = getTranslationInfo(func).getConfiguration();
+      scheduleAttr = dyn_cast_or_null<IREE::GPU::MMAScheduleAttr>(
+          configDict.get(scheduleAttrName));
+    }
+
+    // Vector layout option setter aimed at contractions. For now, layout
+    // setting for other problems like reductions is TODO.
+    SmallVector<linalg::GenericOp> contracts;
+
+    func->walk([&](linalg::GenericOp linalgOp) {
+      if (linalg::isaContractionOpInterface(linalgOp)) {
+        contracts.push_back(linalgOp);
+      }
+    });
+
+    IRRewriter rewriter(func);
+
+    for (linalg::GenericOp contract : contracts) {
+      if (failed(setContractionAnchor(scheduleAttr, rewriter, contract))) {
+        return signalPassFailure();
+      }
+    }
+  }
+};
+} // namespace
+
+} // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureVectorLayouts.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureVectorLayouts.cpp
index b37d25af33e6..b1d83c417179 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureVectorLayouts.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureVectorLayouts.cpp
@@ -31,56 +31,6 @@ namespace mlir::iree_compiler {
 
 namespace {
 
-// Sets an anchoring layout for the given contraction op. Looks for a
-// supported mma type from the cached list of mma types and populates the
-// necessary distribution pattern for those contractions.
-LogicalResult setContractionAnchor(IREE::GPU::MMAScheduleAttr schedule,
-                                   RewriterBase &rewriter,
-                                   vector::ContractionOp contract) {
-  // TODO: Add SIMT fallback.
-  if (!schedule) {
-    return contract->emitError("missing mma schedule for contraction");
-  }
-
-  auto layouts = schedule.getContractionLayout(contract);
-  if (failed(layouts)) {
-    return contract->emitError("cannot get concrete layout for contraction");
-  }
-
-  auto [aLayout, bLayout, cLayout] = *layouts;
-  Location loc = contract.getLoc();
-
-  // Set layouts for lhs, rhs and acc.
- rewriter.setInsertionPoint(contract); - Value layoutedLhs = rewriter.create( - loc, contract.getLhsType(), contract.getLhs(), aLayout); - Value layoutedRhs = rewriter.create( - loc, contract.getRhsType(), contract.getRhs(), bLayout); - Value layoutedAcc = rewriter.create( - loc, contract.getAccType(), contract.getAcc(), cLayout); - contract->setOperand(0, layoutedLhs); - contract->setOperand(1, layoutedRhs); - contract->setOperand(2, layoutedAcc); - - // Set layout for result. - rewriter.setInsertionPointAfter(contract); - auto toLayout = rewriter.create( - loc, contract.getResultType(), contract.getResult(), cLayout); - rewriter.replaceAllUsesExcept(contract, toLayout.getResult(), toLayout); - - // Set intrinsic kind. - contract->setAttr("iree.amdgpu.mma", schedule.getIntrinsic()); - - LLVM_DEBUG({ - llvm::dbgs() << "chosen a layout: " << aLayout << "\n"; - llvm::dbgs() << "chosen b layout: " << bLayout << "\n"; - llvm::dbgs() << "chosen c layout: " << cLayout << "\n"; - llvm::dbgs() << "anchor set on contract: " << contract << "\n"; - }); - - return success(); -} - // Sets a layout anchor for reads from global memory. // The layout this generates is approximately the following: // @@ -333,16 +283,10 @@ struct LLVMGPUConfigureVectorLayoutsPass final // should receive layouts. Layout setting for other problems like reductions // is TODO. SmallVector reads; - SmallVector contracts; func->walk([&](Operation *op) { - llvm::TypeSwitch(op) - .Case([&](vector::TransferReadOp transfer) { - reads.push_back(transfer); - }) - .Case([&](vector::ContractionOp contract) { - contracts.push_back(contract); - }); + llvm::TypeSwitch(op).Case( + [&](vector::TransferReadOp transfer) { reads.push_back(transfer); }); }); IRRewriter rewriter(func); @@ -352,12 +296,6 @@ struct LLVMGPUConfigureVectorLayoutsPass final return signalPassFailure(); } } - - for (vector::ContractionOp contract : contracts) { - if (failed(setContractionAnchor(scheduleAttr, rewriter, contract))) { - return signalPassFailure(); - } - } } }; } // namespace diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp index 7c71f8675717..dab959aa51d9 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp @@ -14,6 +14,7 @@ #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h" #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h" #include "iree/compiler/Codegen/Dialect/GPU/Transforms/Passes.h" +#include "iree/compiler/Codegen/Dialect/VectorExt/Transforms/Passes.h" #include "iree/compiler/Codegen/LLVMGPU/Passes.h" #include "iree/compiler/Codegen/LLVMGPU/ROCDLPasses.h" #include "iree/compiler/Codegen/Utils/GPUUtils.h" @@ -239,6 +240,8 @@ static void tileAndBufferize(OpPassManager &funcPassManager) { static void addGPUVectorizationPasses(OpPassManager &funcPassManager) { funcPassManager.addPass(createDecomposeConvolutionToLowerDimOpsPass()); funcPassManager.addPass(IREE::LinalgExt::createDecomposeIm2colPass()); + funcPassManager.addPass( + IREE::VectorExt::createVectorizeIREEVectorExtOpsPass()); // Vectorize. GenericVectorizationPassOptions options; options.vectorizePadding = true; @@ -819,8 +822,9 @@ void addGPUVectorDistributePassPipeline(OpPassManager &funcPassManager, // be safely dropped. This additionally allows vectorization of convolution to // `vector.contract` as filter dimensions are expected to be tiled to 1 by // this point. 
+ funcPassManager.addPass(createLinalgGeneralizeNamedOpsPass()); if (!usePadToModelSharedMemcpy) { - funcPassManager.addPass(createLinalgGeneralizeNamedOpsPass()); + // Folding unit dims gets confused with padding. LinalgFoldUnitExtentDimsPassOptions options; options.useRankReducingSlices = true; funcPassManager.addPass(mlir::createLinalgFoldUnitExtentDimsPass(options)); @@ -829,12 +833,17 @@ void addGPUVectorDistributePassPipeline(OpPassManager &funcPassManager, } funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass()); + // Set anchors at tensor level for vector distribution later. + funcPassManager.addPass(createLLVMGPUConfigureTensorLayoutsPass()); // Linalg -> Vector addGPUVectorizationPasses(funcPassManager); // Allocate tensors for copies to shared memory. funcPassManager.addPass(createGPUVectorAllocPass()); + funcPassManager.addPass(createCanonicalizerPass()); + funcPassManager.addPass(createCSEPass()); + funcPassManager.addPass(createGPUCombineValueBarriersPass()); // Tensor -> Memref addVectorBufferizePasses(funcPassManager); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.td b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.td index 33df5d26beba..ef51a6a9a883 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.td +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.td @@ -77,6 +77,11 @@ def LLVMGPUCastTypeToFitMMAPass : InterfacePass<"iree-llvmgpu-cast-type-to-fit-m "to target GPU MMA intrinsics"; } +def LLVMGPUConfigureTensorLayoutsPass : + InterfacePass<"iree-llvmgpu-configure-tensor-layouts", "mlir::FunctionOpInterface"> { + let summary = "Pass to set layouts on tensors for later vector distribution"; +} + def LLVMGPUConfigureVectorLayoutsPass : InterfacePass<"iree-llvmgpu-configure-vector-layouts", "mlir::FunctionOpInterface"> { let summary = "Pass to set layouts for vector distribution"; diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/AMDGPUDistributionPatterns.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/AMDGPUDistributionPatterns.cpp index 986a389e788c..48de6cdaeef8 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/AMDGPUDistributionPatterns.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/AMDGPUDistributionPatterns.cpp @@ -25,6 +25,13 @@ struct DistributeContractions final LogicalResult matchAndRewrite(vector::ContractionOp contractOp, DistributionSignature &signature, PatternRewriter &rewriter) const override { + auto maybeOpInfo = VectorContractOpInfo::inferFromIndexingMaps( + contractOp.getIndexingMapsArray()); + if (failed(maybeOpInfo)) { + return rewriter.notifyMatchFailure(contractOp, "invalid contraction"); + } + VectorContractOpInfo opInfo = maybeOpInfo.value(); + VectorValue result = dyn_cast(contractOp.getResult()); if (!result) { return rewriter.notifyMatchFailure(contractOp, @@ -67,7 +74,6 @@ struct DistributeContractions final Value vector = rewriter.create( loc, vectorType, rewriter.getZeroAttr(vectorType)); - VectorContractOpInfo opInfo(contractOp); auto [lhsK, rhsK] = opInfo.getOperandKIndex(); std::optional kBatch = layouts[LHS].getBatchDim(lhsK); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel index 3ed0f3692fb0..f46541233da1 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel @@ -74,6 +74,7 @@ iree_lit_test_suite( "transpose_pipeline_test.mlir", "ukernel_pipeline_transform.mlir", 
"configure_vector_layout.mlir", + "configure_tensor_layout.mlir", "vector_lowering.mlir", "vector_to_gpu.mlir", "winograd_pipeline_test.mlir", diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt index 692ce93d2916..264ed672012d 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt @@ -23,6 +23,7 @@ iree_lit_test_suite( "cast_type_to_fit_mma.mlir" "config_matvec.mlir" "config_winograd.mlir" + "configure_tensor_layout.mlir" "configure_vector_layout.mlir" "conv_pipeline_test_cuda.mlir" "conv_pipeline_test_rocm.mlir" diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/cast_type_to_fit_mma.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/cast_type_to_fit_mma.mlir index f8c43416b95f..4f97eb6d4163 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/cast_type_to_fit_mma.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/cast_type_to_fit_mma.mlir @@ -18,7 +18,8 @@ func.func @mfma_matmul_96x64x16_mm(%lhs: vector<96x16xf16>, %rhs: vector<16x64xf // CHECK: %[[MM:.+]] = vector.contract // CHECK-SAME: indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>] // CHECK-SAME iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind -// CHECK-SAME: %[[A]], %[[B]], %[[EXT]] : vector<96x16xf16>, vector<16x64xf16> into vector<96x64xf32> +// CHECK-SAME: %[[A]], %[[B]], %[[EXT]] +// CHECK-SAME: vector<96x16xf16>, vector<16x64xf16> into vector<96x64xf32> // CHECK: %[[TRUNC:.+]] = arith.truncf %[[MM]] : vector<96x64xf32> to vector<96x64xf16> // CHECK: return %[[TRUNC]] : vector<96x64xf16> @@ -60,7 +61,7 @@ func.func @mfma_matmul_96x64x16_mm_cannot_downcast(%lhs: vector<96x16xf16>, %rhs // CHECK-LABEL: func.func @mfma_matmul_96x64x16_mm_cannot_downcast // CHECK-NOT: arith.extf // CHECK: vector.contract -// CHECK-SAME: %{{.+}}, %{{.+}}, %{{.+}} : vector<96x16xf16>, vector<16x64xf16> into vector<96x64xf64> +// CHECK-SAME: vector<96x16xf16>, vector<16x64xf16> into vector<96x64xf64> // CHECK-NOT: arith.truncf // ----- @@ -83,7 +84,8 @@ func.func @wmma_matmul_48x32x32_mm(%lhs: vector<48x32xf16>, %rhs: vector<32x32xf // CHECK: %[[MM:.+]] = vector.contract // CHECK-SAME: indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>] // CHECK-SAME iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind -// CHECK-SAME: %[[A]], %[[B]], %[[EXT]] : vector<48x32xf16>, vector<32x32xf16> into vector<48x32xf32> +// CHECK-SAME: %[[A]], %[[B]], %[[EXT]] +// CHECK-SAME: vector<48x32xf16>, vector<32x32xf16> into vector<48x32xf32> // CHECK: %[[TRUNC:.+]] = arith.truncf %[[MM]] : vector<48x32xf32> to vector<48x32xf16> // CHECK: return %[[TRUNC]] : vector<48x32xf16> @@ -111,6 +113,7 @@ func.func @transform_dialect_mfma_matmul_96x64x16(%lhs: vector<96x16xf16>, %rhs: // CHECK: %[[MM:.+]] = vector.contract // CHECK-SAME: indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>] // CHECK-SAME iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind -// CHECK-SAME: %[[A]], %[[B]], %[[EXT]] : vector<96x16xf16>, vector<16x64xf16> into vector<96x64xf32> +// CHECK-SAME: %[[A]], %[[B]], %[[EXT]] +// CHECK-SAME: vector<96x16xf16>, vector<16x64xf16> into vector<96x64xf32> // 
CHECK: %[[TRUNC:.+]] = arith.truncf %[[MM]] : vector<96x64xf32> to vector<96x64xf16> // CHECK: return %[[TRUNC]] : vector<96x64xf16> diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_tensor_layout.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_tensor_layout.mlir new file mode 100644 index 000000000000..06cd515dc0ff --- /dev/null +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_tensor_layout.mlir @@ -0,0 +1,152 @@ +// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(func.func(iree-llvmgpu-configure-tensor-layouts, canonicalize, cse))' %s | FileCheck %s + +#translation = #iree_codegen.translation_info, + subgroup_m_count = 1, + subgroup_n_count = 1>}> + +#maps = [ + affine_map<(m, n, k) -> (m, k)>, + affine_map<(m, n, k) -> (n, k)>, + affine_map<(m, n, k) -> (m, n)> +] + +#traits = { + indexing_maps = #maps, + iterator_types = ["parallel", "parallel", "reduction"] +} + +func.func @matmul_96x64x16(%lhs: tensor<96x16xf16>, + %rhs: tensor<64x16xf16>, + %init: tensor<96x64xf32>) + -> tensor<96x64xf32> + attributes { translation_info = #translation } { + %out = linalg.generic #traits + ins(%lhs, %rhs: tensor<96x16xf16>, tensor<64x16xf16>) + outs(%init: tensor<96x64xf32>) { + ^bb0(%in: f16, %in_1: f16, %out: f32): + %ex = arith.extf %in : f16 to f32 + %ex_1 = arith.extf %in_1 : f16 to f32 + %mul = arith.mulf %ex, %ex_1 : f32 + %sum = arith.addf %out, %mul : f32 + linalg.yield %sum : f32 + } -> tensor<96x64xf32> + return %out : tensor<96x64xf32> +} + +// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout +// CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout +// CHECK-DAG: #[[$NESTED2:.+]] = #iree_vector_ext.nested_layout + +// CHECK-LABEL: func.func @matmul_96x64x16 + +// CHECK-DAG: %[[LHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED]] {shared_memory_conversion} +// CHECK-DAG: %[[RHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED1]] {shared_memory_conversion} +// CHECK-DAG: %[[ACC:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED2]] +// CHECK: linalg.generic +// CHECK-SAME: ins(%[[LHS]], %[[RHS]] +// CHECK-SAME: outs(%[[ACC]] + +// ----- + +#translation = #iree_codegen.translation_info, + subgroup_m_count = 4, + subgroup_n_count = 1>}> + +#maps = [ + affine_map<(m, n, k) -> (m, k)>, + affine_map<(m, n, k) -> (n, k)>, + affine_map<(m, n, k) -> (m, n)> +] + +#traits = { + indexing_maps = #maps, + iterator_types = ["parallel", "parallel", "reduction"] +} + +func.func @matmul_128x64x16_multi_subgroup(%lhs: tensor<128x16xf16>, + %rhs: tensor<64x16xf16>, + %init: tensor<128x64xf32>) + -> tensor<128x64xf32> + attributes { translation_info = #translation } { + %out = linalg.generic #traits + ins(%lhs, %rhs: tensor<128x16xf16>, tensor<64x16xf16>) + outs(%init: tensor<128x64xf32>) { + ^bb0(%in: f16, %in_1: f16, %out: f32): + %ex = arith.extf %in : f16 to f32 + %ex_1 = arith.extf %in_1 : f16 to f32 + %mul = arith.mulf %ex, %ex_1 : f32 + %sum = arith.addf %out, %mul : f32 + linalg.yield %sum : f32 + } -> tensor<128x64xf32> + return %out : tensor<128x64xf32> +} + +// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout, + subgroup_m_count = 2, + subgroup_n_count = 2>}> + +#maps = [ + affine_map<(bm, bn, m, n, k) -> (bm, m, k)>, + affine_map<(bm, bn, m, n, k) -> (bn, n, k)>, + affine_map<(bm, bn, m, n, k) -> (bm, m, bn, n)> +] + +#traits = { + indexing_maps = #maps, + iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"] +} + +func.func @packed_matmul_128x128x128(%lhs: 
tensor<8x16x16xf16>, + %rhs: tensor<8x16x16xf16>, + %init: tensor<8x16x8x16xf32>) + -> tensor<8x16x8x16xf32> + attributes { translation_info = #translation } { + %out = linalg.generic #traits + ins(%lhs, %rhs: tensor<8x16x16xf16>, tensor<8x16x16xf16>) + outs(%init: tensor<8x16x8x16xf32>) { + ^bb0(%in: f16, %in_1: f16, %out: f32): + %ex = arith.extf %in : f16 to f32 + %ex_1 = arith.extf %in_1 : f16 to f32 + %mul = arith.mulf %ex, %ex_1 : f32 + %sum = arith.addf %out, %mul : f32 + linalg.yield %sum : f32 + } -> tensor<8x16x8x16xf32> + return %out : tensor<8x16x8x16xf32> +} + + +// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout +// CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout +// CHECK-DAG: #[[$NESTED2:.+]] = #iree_vector_ext.nested_layout +// CHECK-LABEL: func.func @packed_matmul_128x128x128 + +// CHECK-DAG: %[[LHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED]] {shared_memory_conversion} +// CHECK-DAG: %[[RHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED1]] {shared_memory_conversion} +// CHECK-DAG: %[[ACC:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED2]] +// CHECK: linalg.generic +// CHECK-SAME: ins(%[[LHS]], %[[RHS]] +// CHECK-SAME: outs(%[[ACC]] diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_vector_layout.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_vector_layout.mlir index 2f12b1595f0e..6e16d2f65b44 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_vector_layout.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_vector_layout.mlir @@ -1,107 +1,5 @@ // RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(func.func(iree-llvmgpu-configure-vector-layouts, canonicalize, cse))' %s | FileCheck %s -#translation = #iree_codegen.translation_info, subgroup_m_count = 1, subgroup_n_count = 1>}> - -// Since CHECK-SAME doesnt work with CHECK-DAG, we cannot have prettier tests. 
- -// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout -// CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout -// CHECK-DAG: #[[$NESTED2:.+]] = #iree_vector_ext.nested_layout - -// CHECK-LABEL: func.func @mfma_matmul_96x64x16_mm -func.func @mfma_matmul_96x64x16_mm(%lhs: vector<96x16xf16>, %rhs: vector<16x64xf16>, %init: vector<96x64xf32>) -> vector<96x64xf32> attributes { translation_info = #translation } { - // CHECK-DAG: %[[LHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED]] - // CHECK-DAG: %[[RHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED1]] - // CHECK-DAG: %[[ACC:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED2]] - // CHECK: vector.contract - // CHECK-SAME: %[[LHS]], %[[RHS]], %[[ACC]] - %0 = vector.contract { - indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], - iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} - %lhs, %rhs, %init : vector<96x16xf16>, vector<16x64xf16> into vector<96x64xf32> - return %0 : vector<96x64xf32> -} - -// ----- - -#translation = #iree_codegen.translation_info, subgroup_m_count = 1, subgroup_n_count = 1>}> - -// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout -// CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout -// CHECK-DAG: #[[$NESTED2:.+]] = #iree_vector_ext.nested_layout - -// CHECK-LABEL: func.func @mfma_matmul_96x64x16_mmt -func.func @mfma_matmul_96x64x16_mmt(%lhs: vector<96x16xf16>, %rhs: vector<64x16xf16>, %init: vector<96x64xf32>) -> vector<96x64xf32> attributes { translation_info = #translation } { - // CHECK-DAG: %[[LHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED]] - // CHECK-DAG: %[[RHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED1]] - // CHECK-DAG: %[[ACC:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED2]] - // CHECK: vector.contract - // CHECK-SAME: %[[LHS]], %[[RHS]], %[[ACC]] - %0 = vector.contract { - indexing_maps = [affine_map<(m, n, k) -> (m, k)>, affine_map<(m, n, k) -> (n, k)>, affine_map<(m, n, d2) -> (m, n)>], - iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} - %lhs, %rhs, %init : vector<96x16xf16>, vector<64x16xf16> into vector<96x64xf32> - return %0 : vector<96x64xf32> -} - -// ----- - -#translation = #iree_codegen.translation_info, subgroup_m_count = 1, subgroup_n_count = 1>}> - -// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout -// CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout -// CHECK-DAG: #[[$NESTED2:.+]] = #iree_vector_ext.nested_layout - -// CHECK-LABEL: func.func @mfma_matmul_96x64x16_mmtt -func.func @mfma_matmul_96x64x16_mmtt(%lhs: vector<96x16xf16>, %rhs: vector<64x16xf16>, %init: vector<64x96xf32>) -> vector<64x96xf32> attributes { translation_info = #translation } { - // CHECK-DAG: %[[LHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED]] - // CHECK-DAG: %[[RHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED1]] - // CHECK-DAG: %[[ACC:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED2]] - // CHECK: vector.contract - // CHECK-SAME: %[[LHS]], %[[RHS]], %[[ACC]] - %0 = vector.contract { - indexing_maps = [affine_map<(m, n, k) -> (m, k)>, affine_map<(m, n, k) -> (n, k)>, affine_map<(m, n, k) -> (n, m)>], - iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} - %lhs, %rhs, %init : vector<96x16xf16>, vector<64x16xf16> into vector<64x96xf32> - return %0 : vector<64x96xf32> -} - -// ----- - -#translation = 
#iree_codegen.translation_info, subgroup_m_count = 2, subgroup_n_count = 1>}> - -// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout, %rhs: vector<16x64xf16>, %init: vector<192x64xf32>) -> vector<192x64xf32> attributes { translation_info = #translation } { - // CHECK-DAG: %[[LHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED]] - // CHECK-DAG: %[[RHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED1]] - // CHECK-DAG: %[[ACC:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED2]] - // CHECK: vector.contract - // CHECK-SAME: %[[LHS]], %[[RHS]], %[[ACC]] - %0 = vector.contract { - indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], - iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} - %lhs, %rhs, %init : vector<192x16xf16>, vector<16x64xf16> into vector<192x64xf32> - return %0 : vector<192x64xf32> -} - -// ----- - #translation = #iree_codegen.translation_info, // ----- -#translation = #iree_codegen.translation_info, subgroup_m_count = 1, subgroup_n_count = 1>}> - -// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout -// CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout -// CHECK-DAG: #[[$NESTED2:.+]] = #iree_vector_ext.nested_layout - -// CHECK-LABEL: func.func @wmma_matmul_48x32x32_mm -func.func @wmma_matmul_48x32x32_mm(%lhs: vector<48x32xf16>, %rhs: vector<32x32xf16>, %init: vector<48x32xf32>) -> vector<48x32xf32> attributes { translation_info = #translation } { - // CHECK-DAG: %[[LHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED]] - // CHECK-DAG: %[[RHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED1]] - // CHECK-DAG: %[[ACC:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED2]] - // CHECK: vector.contract - // CHECK-SAME: %[[LHS]], %[[RHS]], %[[ACC]] - %0 = vector.contract { - indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], - iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} - %lhs, %rhs, %init : vector<48x32xf16>, vector<32x32xf16> into vector<48x32xf32> - return %0 : vector<48x32xf32> -} - -// ----- - -#translation = #iree_codegen.translation_info, subgroup_m_count = 1, subgroup_n_count = 1>}> - -// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout -// CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout -// CHECK-DAG: #[[$NESTED2:.+]] = #iree_vector_ext.nested_layout - -// CHECK-LABEL: func.func @wmma_matmul_48x32x32_mmt -func.func @wmma_matmul_48x32x32_mmt(%lhs: vector<48x32xf16>, %rhs: vector<32x32xf16>, %init: vector<48x32xf32>) -> vector<48x32xf32> attributes { translation_info = #translation } { - // CHECK-DAG: %[[LHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED]] - // CHECK-DAG: %[[RHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED1]] - // CHECK-DAG: %[[ACC:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED2]] - // CHECK: vector.contract - // CHECK-SAME: %[[LHS]], %[[RHS]], %[[ACC]] - %0 = vector.contract { - indexing_maps = [affine_map<(m, n, k) -> (m, k)>, affine_map<(m, n, k) -> (n, k)>, affine_map<(m, n, d2) -> (m, n)>], - iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} - %lhs, %rhs, %init : vector<48x32xf16>, vector<32x32xf16> into vector<48x32xf32> - return %0 : vector<48x32xf32> -} - -// ----- - -#translation = #iree_codegen.translation_info, subgroup_m_count = 2, subgroup_n_count = 1>}> - -// CHECK-DAG: #[[$NESTED:.+]] = 
#iree_vector_ext.nested_layout -// CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout -// CHECK-DAG: #[[$NESTED2:.+]] = #iree_vector_ext.nested_layout - -// CHECK-LABEL: func.func @matmul_192x64x16_mmt_multi_m -func.func @matmul_192x64x16_mmt_multi_m(%lhs: vector<2x64x16xf16>, %rhs: vector<16x64xf16>, %init: vector<2x64x64xf32>) -> vector<2x64x64xf32> attributes { translation_info = #translation } { - // CHECK-DAG: %[[LHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED]] - // CHECK-DAG: %[[RHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED1]] - // CHECK-DAG: %[[ACC:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED2]] - // CHECK: vector.contract - // CHECK-SAME: %[[LHS]], %[[RHS]], %[[ACC]] - %0 = vector.contract { - indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], - iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind} - %lhs, %rhs, %init : vector<2x64x16xf16>, vector<16x64xf16> into vector<2x64x64xf32> - return %0 : vector<2x64x64xf32> -} - -// ----- - -#translation = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1>}> - -// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout -// CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout -// CHECK-DAG: #[[$NESTED2:.+]] = #iree_vector_ext.nested_layout - -// CHECK-LABEL: func.func @matmul_192x64x16_mmt_multi_split_m -func.func @matmul_192x64x16_mmt_multi_split_m(%lhs: vector<2x64x16xf16>, %rhs: vector<16x64xf16>, %init: vector<2x64x64xf32>) -> vector<2x64x64xf32> attributes { translation_info = #translation } { - // CHECK-DAG: %[[LHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED]] - // CHECK-DAG: %[[RHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED1]] - // CHECK-DAG: %[[ACC:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED2]] - // CHECK: vector.contract - // CHECK-SAME: %[[LHS]], %[[RHS]], %[[ACC]] - %0 = vector.contract { - indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], - iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind} - %lhs, %rhs, %init : vector<2x64x16xf16>, vector<16x64xf16> into vector<2x64x64xf32> - return %0 : vector<2x64x64xf32> -} - -// ----- - -#translation = #iree_codegen.translation_info, subgroup_m_count = 2, subgroup_n_count = 2>, workgroup_size = [128, 2, 1]}> - -// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout, %rhs: vector<2x16x64xf16>, %init: vector<4x2x64x64xf32>) -> vector<4x2x64x64xf32> attributes { translation_info = #translation } { - // CHECK-DAG: %[[LHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED]] - // CHECK-DAG: %[[RHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED1]] - // CHECK-DAG: %[[ACC:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED2]] - // CHECK: vector.contract - // CHECK-SAME: %[[LHS]], %[[RHS]], %[[ACC]] - %0 = vector.contract { - indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], - iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"], kind = #vector.kind} - %lhs, %rhs, %init : vector<4x64x16xf16>, vector<2x16x64xf16> into vector<4x2x64x64xf32> - return %0 : vector<4x2x64x64xf32> -} - -// ----- - #translation = #iree_codegen.translation_info, 
memref<128x128xf16, #gpu.address_space> return } - -// ----- - -// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout -// CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout -// CHECK-DAG: #[[$NESTED2:.+]] = #iree_vector_ext.nested_layout - -#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> -#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> -#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> -#translation = #iree_codegen.translation_info, subgroup_m_count = 2, subgroup_n_count = 2>}> -// CHECK-LABEL: func.func @batch_matmul_unit_batch -func.func @batch_matmul_unit_batch(%arg0: vector<1x64x64xf16>, %arg1: vector<1x64x128xf16>, %arg2: vector<1x64x128xf32>) -> vector<1x64x128xf32> attributes {translation_info = #translation} { - // CHECK-DAG: %[[LHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED]] - // CHECK-DAG: %[[RHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED1]] - // CHECK-DAG: %[[ACC:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED2]] - // CHECK: vector.contract - // CHECK-SAME: %[[LHS]], %[[RHS]], %[[ACC]] - %0 = vector.contract { - indexing_maps = [#map, #map1, #map2], - iterator_types = ["parallel", "parallel", "parallel", "reduction"], - kind = #vector.kind} - %arg0, %arg1, %arg2 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf32> - return %0 : vector<1x64x128xf32> -} diff --git a/compiler/src/iree/compiler/Codegen/Utils/VectorOpUtils.cpp b/compiler/src/iree/compiler/Codegen/Utils/VectorOpUtils.cpp index 0bf721ce738a..b6b30b519fe9 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/VectorOpUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Utils/VectorOpUtils.cpp @@ -26,24 +26,35 @@ std::pair VectorContractOpInfo::getResultMNIndex() const { return std::make_pair(outMDims.back(), outNDims.back()); } -VectorContractOpInfo::VectorContractOpInfo(vector::ContractionOp op) { - contractionDims = *linalg::inferContractionDims(op.getIndexingMapsArray()); - - SmallVector maps = op.getIndexingMapsArray(); - MLIRContext *ctx = op.getContext(); +FailureOr +VectorContractOpInfo::inferFromIndexingMaps(ArrayRef maps) { + auto maybeContractionDims = linalg::inferContractionDims(maps); + if (failed(maybeContractionDims)) { + return failure(); + } + auto contractionDims = maybeContractionDims.value(); + MLIRContext *ctx = maps[0].getContext(); + VectorContractOpInfo opInfo; for (auto m : contractionDims.m) { - lhsMDims.push_back(*maps[0].getResultPosition(getAffineDimExpr(m, ctx))); - outMDims.push_back(*maps[2].getResultPosition(getAffineDimExpr(m, ctx))); + opInfo.lhsMDims.push_back( + *maps[0].getResultPosition(getAffineDimExpr(m, ctx))); + opInfo.outMDims.push_back( + *maps[2].getResultPosition(getAffineDimExpr(m, ctx))); } for (auto n : contractionDims.n) { - rhsNDims.push_back(*maps[1].getResultPosition(getAffineDimExpr(n, ctx))); - outNDims.push_back(*maps[2].getResultPosition(getAffineDimExpr(n, ctx))); + opInfo.rhsNDims.push_back( + *maps[1].getResultPosition(getAffineDimExpr(n, ctx))); + opInfo.outNDims.push_back( + *maps[2].getResultPosition(getAffineDimExpr(n, ctx))); } - int64_t k = contractionDims.k.back(); - lhsKDim = *maps[0].getResultPosition(getAffineDimExpr(k, ctx)); - rhsKDim = *maps[1].getResultPosition(getAffineDimExpr(k, ctx)); + opInfo.lhsKDim = *maps[0].getResultPosition(getAffineDimExpr(k, ctx)); + opInfo.rhsKDim = *maps[1].getResultPosition(getAffineDimExpr(k, ctx)); + + opInfo.contractionDims = contractionDims; + + return opInfo; } } // namespace mlir::iree_compiler diff --git 
a/compiler/src/iree/compiler/Codegen/Utils/VectorOpUtils.h b/compiler/src/iree/compiler/Codegen/Utils/VectorOpUtils.h index f8e0cf9ac763..bbe5d26106ff 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/VectorOpUtils.h +++ b/compiler/src/iree/compiler/Codegen/Utils/VectorOpUtils.h @@ -5,14 +5,14 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h" -#include "mlir/Dialect/Vector/IR/VectorOps.h" namespace mlir::iree_compiler { /// A class for querying information about a contract op. class VectorContractOpInfo { public: - explicit VectorContractOpInfo(vector::ContractionOp op); + static FailureOr + inferFromIndexingMaps(ArrayRef maps); // Returns the (LHS M, RHS N) dimension index pair. std::pair getOperandMNIndex() const; From f1c44e3a2a0e6a9e383bd51b9c30e25384668f3e Mon Sep 17 00:00:00 2001 From: Kunwar Grover Date: Thu, 22 Aug 2024 17:08:39 +0000 Subject: [PATCH 2/5] Update tests --- .../compiler/Codegen/Common/GPU/Passes.td | 5 +- .../Common/GPU/test/gpu_vector_alloc.mlir | 55 +++---- .../LLVMGPU/test/configure_vector_layout.mlir | 137 ++---------------- 3 files changed, 35 insertions(+), 162 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td index f08455138f76..50b85bd4d89c 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td @@ -208,7 +208,10 @@ def GPUVectorAllocPass : let summary = "Pass to create allocations for contraction inputs to copy " "to GPU shared memory"; let dependentDialects = [ - "::mlir::gpu::GPUDialect", "::mlir::bufferization::BufferizationDialect" + "::mlir::gpu::GPUDialect", + "::mlir::vector::VectorDialect", + "::mlir::bufferization::BufferizationDialect", + "::mlir::iree_compiler::IREE::GPU::IREEGPUDialect", ]; } diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_alloc.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_alloc.mlir index 5d005b861e31..07a82f727dea 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_alloc.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_alloc.mlir @@ -1,41 +1,26 @@ // RUN: iree-opt %s --split-input-file --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-vector-alloc))" | FileCheck %s -func.func @matmul_256x256x256(%lhs: tensor<16x256xf16>, - %rhs: tensor<256x16xf16>, - %out: tensor<16x16xf32>) -> tensor<16x16xf32> { - %cst = arith.constant 0.000000e+00 : f16 - %cst_0 = arith.constant dense<0.000000e+00> : vector<16x16xf32> - %c32 = arith.constant 32 : index - %c256 = arith.constant 256 : index - %c0 = arith.constant 0 : index - %8 = scf.for %arg0 = %c0 to %c256 step %c32 iter_args(%arg1 = %cst_0) -> (vector<16x16xf32>) { - %10 = vector.transfer_read %lhs[%c0, %arg0], %cst {in_bounds = [true, true]} : tensor<16x256xf16>, vector<16x32xf16> - %11 = vector.transfer_read %rhs[%arg0, %c0], %cst {in_bounds = [true, true]} : tensor<256x16xf16>, vector<32x16xf16> - %12 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} %10, %11, %arg1 : vector<16x32xf16>, vector<32x16xf16> into vector<16x16xf32> - scf.yield %12 : vector<16x16xf32> - } - %9 = vector.transfer_write %8, %out[%c0, %c0] {in_bounds = [true, true]} : vector<16x16xf32>, tensor<16x16xf32> 
- return %9 : tensor<16x16xf32> -} - +#layout = #iree_vector_ext.nested_layout< + subgroups_per_workgroup = [1, 1], + batches_per_subgroup = [1, 1], + outers_per_batch = [1, 1], + threads_per_outer = [4, 16], + elements_per_thread = [4, 1], -// CHECK-LABEL: func.func @matmul_256x256x256 -// CHECK: scf.for {{.*}} -> (vector<16x16xf32>) { -// CHECK-DAG: %[[A:.*]] = vector.transfer_read %{{.*}} : tensor<16x256xf16>, vector<16x32xf16> -// CHECK-DAG: %[[B:.*]] = vector.transfer_read %{{.*}} : tensor<256x16xf16>, vector<32x16xf16> -// CHECK: gpu.barrier + subgroup_strides = [1, 1], + thread_strides = [0, 0] +> -// LHS copy. -// CHECK: %[[PA:.*]] = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<16x32xf16, #gpu.address_space> -// CHECK: %[[LWRITE:.+]] = vector.transfer_write %[[A]], %[[PA]]{{.*}} : vector<16x32xf16>, tensor<16x32xf16, #gpu.address_space> -// CHECK: %[[LCOPY:.+]] = bufferization.materialize_in_destination %[[LWRITE]] in %[[LWRITE]] +func.func @test(%vector: vector<16x16xf16>) -> vector<16x16xf16> { + %out = iree_vector_ext.to_layout %vector to #layout {shared_memory_conversion} : vector<16x16xf16> + return %out : vector<16x16xf16> +} -// RHS copy. -// CHECK: %[[PB:.*]] = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<32x16xf16, #gpu.address_space> -// CHECK: %[[RWRITE:.+]] = vector.transfer_write %[[B]], %[[PB]]{{.*}} : vector<32x16xf16>, tensor<32x16xf16, #gpu.address_space> -// CHECK: %[[RCOPY:.+]] = bufferization.materialize_in_destination %[[RWRITE]] in %[[RWRITE]] -// CHECK: gpu.barrier -// CHECK: %[[LHS:.+]] = vector.transfer_read %[[LCOPY]]{{.*}} : tensor<16x32xf16, #gpu.address_space>, vector<16x32xf16> -// CHECK: %[[RHS:.+]] = vector.transfer_read %[[RCOPY]]{{.*}} : tensor<32x16xf16, #gpu.address_space>, vector<32x16xf16> -// CHECK: %12 = vector.contract {{.*}} %[[LHS]], %[[RHS]], %{{.*}} +// CHECK-LABEL: func.func @test +// CHECK: gpu.barrier +// CHECK: %[[ALLOC:.+]] = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<16x16xf16, #gpu.address_space> +// CHECK: %[[WRITE:.+]] = vector.transfer_write %{{.*}}, %[[ALLOC]] +// CHECK: %[[BAR:.+]] = iree_gpu.value_barrier %[[WRITE]] +// CHECK: %[[READ:.+]] = vector.transfer_read %[[BAR]] +// CHECK: %[[OUT:.+]] = iree_vector_ext.to_layout %[[READ]] diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_vector_layout.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_vector_layout.mlir index 6e16d2f65b44..cdd77558284c 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_vector_layout.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_vector_layout.mlir @@ -1,55 +1,5 @@ // RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(func.func(iree-llvmgpu-configure-vector-layouts, canonicalize, cse))' %s | FileCheck %s -#translation = #iree_codegen.translation_info, subgroup_m_count = 1, subgroup_n_count = 1>}> - -// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout -// CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout - -// CHECK-LABEL: func.func @matmul_16x16x256_read -func.func @matmul_16x16x256_read(%lhs: memref<16x256xf16, strided<[256, 1], offset: ?>, #hal.descriptor_type>, - %rhs: memref<256x16xf16, strided<[256, 1], offset: ?>, #hal.descriptor_type>, - %out: memref<16x16xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type>) - attributes { translation_info = #translation } { - %alloc = memref.alloc() : memref<32x16xf16, #gpu.address_space> - %alloc_0 = 
memref.alloc() : memref<16x32xf16, #gpu.address_space> - %cst = arith.constant 0.000000e+00 : f16 - %cst_1 = arith.constant dense<0.000000e+00> : vector<16x16xf32> - %c32 = arith.constant 32 : index - %c256 = arith.constant 256 : index - %c0 = arith.constant 0 : index - %5 = scf.for %arg0 = %c0 to %c256 step %c32 iter_args(%arg1 = %cst_1) -> (vector<16x16xf32>) { - %6 = vector.transfer_read %lhs[%c0, %arg0], %cst {in_bounds = [true, true]} : memref<16x256xf16, strided<[256, 1], offset: ?>, #hal.descriptor_type>, vector<16x32xf16> - %7 = vector.transfer_read %rhs[%arg0, %c0], %cst {in_bounds = [true, true]} : memref<256x16xf16, strided<[256, 1], offset: ?>, #hal.descriptor_type>, vector<32x16xf16> - // CHECK: %[[READ0:.+]] = vector.transfer_read - // CHECK: to_layout %[[READ0]] to #[[$NESTED]] - // CHECK: %[[READ1:.+]] = vector.transfer_read - // CHECK: to_layout %[[READ1]] to #[[$NESTED1]] - vector.transfer_write %6, %alloc_0[%c0, %c0] {in_bounds = [true, true]} : vector<16x32xf16>, memref<16x32xf16, #gpu.address_space> - gpu.barrier - vector.transfer_write %7, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<32x16xf16>, memref<32x16xf16, #gpu.address_space> - gpu.barrier - %8 = vector.transfer_read %alloc_0[%c0, %c0], %cst {in_bounds = [true, true]} : memref<16x32xf16, #gpu.address_space>, vector<16x32xf16> - %9 = vector.transfer_read %alloc[%c0, %c0], %cst {in_bounds = [true, true]} : memref<32x16xf16, #gpu.address_space>, vector<32x16xf16> - // CHECK-DAG: %[[LHS:.+]] = iree_vector_ext.to_layout - // CHECK-DAG: %[[RHS:.+]] = iree_vector_ext.to_layout - // CHECK-DAG: %[[ACC:.+]] = iree_vector_ext.to_layout - // CHECK: vector.contract - // CHECK-SAME: %[[LHS]], %[[RHS]], %[[ACC]] - %10 = vector.contract { - indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], - iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} - %8, %9, %arg1 : vector<16x32xf16>, vector<32x16xf16> into vector<16x16xf32> - scf.yield %10 : vector<16x16xf32> - } - vector.transfer_write %5, %out[%c0, %c0] {in_bounds = [true, true]} : vector<16x16xf32>, memref<16x16xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type> - memref.dealloc %alloc_0 : memref<16x32xf16, #gpu.address_space> - memref.dealloc %alloc : memref<32x16xf16, #gpu.address_space> - return -} - // ----- #translation = #iree_codegen.translation_info // CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout -// CHECK-LABEL: func.func @matmul_16x16x256_read_permute -func.func @matmul_16x16x256_read_permute(%lhs: memref<16x256xf16, strided<[256, 1], offset: ?>, #hal.descriptor_type>, +// CHECK-LABEL: func.func @transfer_read_permute +func.func @transfer_read_permute(%lhs: memref<16x256xf16, strided<[256, 1], offset: ?>, #hal.descriptor_type>, %rhs: memref<16x256xf16, strided<[256, 1], offset: ?>, #hal.descriptor_type>, %out: memref<16x16xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type>) attributes { translation_info = #translation } { + %alloc = memref.alloc() : memref<32x16xf16, #gpu.address_space> %alloc_0 = memref.alloc() : memref<16x32xf16, #gpu.address_space> %cst = arith.constant 0.000000e+00 : f16 @@ -72,34 +23,14 @@ func.func @matmul_16x16x256_read_permute(%lhs: memref<16x256xf16, strided<[256, %c32 = arith.constant 32 : index %c256 = arith.constant 256 : index %c0 = arith.constant 0 : index - %init_acc = vector.transfer_read %out[%c0, %c0], %cst_f32 {in_bounds = [true, true]} - : memref<16x16xf32, strided<[256, 1], offset: ?>, 
#hal.descriptor_type>, vector<16x16xf32> - // CHECK: scf.for - %5 = scf.for %arg0 = %c0 to %c256 step %c32 iter_args(%arg1 = %init_acc) -> (vector<16x16xf32>) { - %6 = vector.transfer_read %lhs[%c0, %arg0], %cst {in_bounds = [true, true]} : memref<16x256xf16, strided<[256, 1], offset: ?>, #hal.descriptor_type>, vector<16x32xf16> - %7 = vector.transfer_read %rhs[%arg0, %c0], %cst {in_bounds = [true, true], permutation_map = affine_map<(d0, d1) -> (d1, d0)>} : memref<16x256xf16, strided<[256, 1], offset: ?>, #hal.descriptor_type>, vector<32x16xf16> - // CHECK: %[[READ0:.+]] = vector.transfer_read - // CHECK: to_layout %[[READ0]] to #[[$NESTED]] - // CHECK: %[[READ1:.+]] = vector.transfer_read - // CHECK: to_layout %[[READ1]] to #[[$NESTED1]] - vector.transfer_write %6, %alloc_0[%c0, %c0] {in_bounds = [true, true]} : vector<16x32xf16>, memref<16x32xf16, #gpu.address_space> - gpu.barrier - vector.transfer_write %7, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<32x16xf16>, memref<32x16xf16, #gpu.address_space> - gpu.barrier - %8 = vector.transfer_read %alloc_0[%c0, %c0], %cst {in_bounds = [true, true]} : memref<16x32xf16, #gpu.address_space>, vector<16x32xf16> - %9 = vector.transfer_read %alloc[%c0, %c0], %cst {in_bounds = [true, true]} : memref<32x16xf16, #gpu.address_space>, vector<32x16xf16> - // CHECK-DAG: %[[LHS:.+]] = iree_vector_ext.to_layout - // CHECK-DAG: %[[RHS:.+]] = iree_vector_ext.to_layout - // CHECK-DAG: %[[ACC:.+]] = iree_vector_ext.to_layout - // CHECK: vector.contract - // CHECK-SAME: %[[LHS]], %[[RHS]], %[[ACC]] - %10 = vector.contract { - indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], - iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} - %8, %9, %arg1 : vector<16x32xf16>, vector<32x16xf16> into vector<16x16xf32> - scf.yield %10 : vector<16x16xf32> - } - vector.transfer_write %5, %out[%c0, %c0] {in_bounds = [true, true]} : vector<16x16xf32>, memref<16x16xf32, strided<[256, 1], offset: ?>, #hal.descriptor_type> + %6 = vector.transfer_read %lhs[%c0, %c0], %cst {in_bounds = [true, true]} : memref<16x256xf16, strided<[256, 1], offset: ?>, #hal.descriptor_type>, vector<16x32xf16> + %7 = vector.transfer_read %rhs[%c0, %c0], %cst {in_bounds = [true, true], permutation_map = affine_map<(d0, d1) -> (d1, d0)>} : memref<16x256xf16, strided<[256, 1], offset: ?>, #hal.descriptor_type>, vector<32x16xf16> + // CHECK: %[[READ0:.+]] = vector.transfer_read + // CHECK: to_layout %[[READ0]] to #[[$NESTED]] + // CHECK: %[[READ1:.+]] = vector.transfer_read + // CHECK: to_layout %[[READ1]] to #[[$NESTED1]] + vector.transfer_write %6, %alloc_0[%c0, %c0] {in_bounds = [true, true]} : vector<16x32xf16>, memref<16x32xf16, #gpu.address_space> + vector.transfer_write %7, %alloc[%c0, %c0] {in_bounds = [true, true]} : vector<32x16xf16>, memref<32x16xf16, #gpu.address_space> memref.dealloc %alloc_0 : memref<16x32xf16, #gpu.address_space> memref.dealloc %alloc : memref<32x16xf16, #gpu.address_space> return @@ -107,52 +38,6 @@ func.func @matmul_16x16x256_read_permute(%lhs: memref<16x256xf16, strided<[256, // ----- -#translation = #iree_codegen.translation_info, subgroup_m_count = 1, subgroup_n_count = 1>}> - -// We don't really care what layout we assign here, just that the only anchor -// we set is on the contraction. 
- -// CHECK-LABEL: func.func @matmul_16x16x256_fused -func.func @matmul_16x16x256_fused(%lhs: memref<16x32xf16>, - %rhs: memref<32x16xf16>, - %bias: memref<16x16xf32>, - %out: memref<16x16xf32>) - attributes { translation_info = #translation } { - %cst = arith.constant 0.000000e+00 : f16 - %cst_f32 = arith.constant 0.000000e+00 : f32 - %c32 = arith.constant 32 : index - %c256 = arith.constant 256 : index - %c0 = arith.constant 0 : index - %acc = vector.transfer_read %out[%c0, %c0], %cst_f32 {in_bounds = [true, true]} : memref<16x16xf32>, vector<16x16xf32> - %8 = vector.transfer_read %lhs[%c0, %c0], %cst {in_bounds = [true, true]} : memref<16x32xf16>, vector<16x32xf16> - %9 = vector.transfer_read %rhs[%c0, %c0], %cst {in_bounds = [true, true]} : memref<32x16xf16>, vector<32x16xf16> - // CHECK-DAG: %[[READA:.+]] = vector.transfer_read - // CHECK-DAG: %[[READB:.+]] = vector.transfer_read - // CHECK-DAG: %[[READC:.+]] = vector.transfer_read - // CHECK-NOT: to_layout %[[READA]] - // CHECK-NOT: to_layout %[[READB]] - // CHECK-NOT: to_layout %[[READC]] - - // CHECK-DAG: %[[LHS:.+]] = iree_vector_ext.to_layout - // CHECK-DAG: %[[RHS:.+]] = iree_vector_ext.to_layout - // CHECK-DAG: %[[ACC:.+]] = iree_vector_ext.to_layout - // CHECK: vector.contract - // CHECK-SAME: %[[LHS]], %[[RHS]], %[[ACC]] - %10 = vector.contract { - indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], - iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} - %8, %9, %acc : vector<16x32xf16>, vector<32x16xf16> into vector<16x16xf32> - %11 = vector.transfer_read %bias[%c0, %c0], %cst_f32 {in_bounds = [true, true]} : memref<16x16xf32>, vector<16x16xf32> - %12 = arith.addf %10, %11 : vector<16x16xf32> - vector.transfer_write %12, %out[%c0, %c0] {in_bounds = [true, true]} : vector<16x16xf32>, memref<16x16xf32> - return -} - -// ----- - #translation = #iree_codegen.translation_info Date: Fri, 23 Aug 2024 14:35:19 +0000 Subject: [PATCH 3/5] Address comments --- .../Codegen/Common/GPU/GPUVectorAlloc.cpp | 4 +- .../Codegen/Common/VectorLayoutAnalysis.cpp | 4 +- .../Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp | 7 +- .../Codegen/Dialect/GPU/IR/IREEGPUAttrs.td | 2 +- .../Dialect/VectorExt/IR/VectorExtAttrs.td | 1 - .../Dialect/VectorExt/IR/VectorExtOps.td | 17 ++++- .../Transforms/VectorizeIREEVectorExtOps.cpp | 5 +- .../LLVMGPU/LLVMGPUConfigureTensorLayouts.cpp | 76 ++++++++----------- .../LLVMGPU/LLVMGPUConfigureVectorLayouts.cpp | 2 +- .../iree/compiler/Codegen/LLVMGPU/Passes.cpp | 1 - .../TransformExtensions/LLVMGPUExtensions.cpp | 16 ++-- .../LLVMGPU/test/configure_tensor_layout.mlir | 55 +++++++++++++- 12 files changed, 123 insertions(+), 67 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUVectorAlloc.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUVectorAlloc.cpp index 3a915c9d11a4..3eae712f02f9 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUVectorAlloc.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUVectorAlloc.cpp @@ -103,7 +103,7 @@ struct GPUVectorAllocPass final SmallVector opsToPromote; funcOp.walk([&](IREE::VectorExt::ToLayoutOp op) { - if (op->hasAttr("shared_memory_conversion")) { + if (op.getSharedMemoryConversion()) { opsToPromote.push_back(op); } }); @@ -139,7 +139,7 @@ struct GPUVectorAllocPass final // Remove the shared_memory_conversion attribute from the to_layout // operation. 
-      op->removeAttr("shared_memory_conversion");
+      op.setSharedMemoryConversion(false);
     }
   }
 };
 
diff --git a/compiler/src/iree/compiler/Codegen/Common/VectorLayoutAnalysis.cpp b/compiler/src/iree/compiler/Codegen/Common/VectorLayoutAnalysis.cpp
index 8abc49e4afbf..03ee0011627b 100644
--- a/compiler/src/iree/compiler/Codegen/Common/VectorLayoutAnalysis.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/VectorLayoutAnalysis.cpp
@@ -214,8 +214,8 @@ ChangeResult DistributionLayout::resolveWithPossibleConflict(
   Value input = opOperand.get();
   // Create a resolution operation. This conflict should be handeled later by
   // someone else, not this analysis.
-  Operation *resolveOp = builder.create<IREE::VectorExt::ToLayoutOp>(
-      input.getLoc(), input.getType(), input, rhs);
+  Operation *resolveOp =
+      builder.create<IREE::VectorExt::ToLayoutOp>(input.getLoc(), input, rhs);
   Value resolvedValue = resolveOp->getResult(0);
   opOperand.set(resolvedValue);
 
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
index 56558603b820..36b0419638eb 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
@@ -977,7 +977,7 @@ NestedLayoutAttr createNestedLayout(MLIRContext *context, int64_t rank,
 FailureOr<std::tuple<VectorExt::VectorLayoutInterface,
                      VectorExt::VectorLayoutInterface,
                      VectorExt::VectorLayoutInterface>>
-MMAScheduleAttr::getContractionLayout(linalg::GenericOp contractOp) const {
+MMAScheduleAttr::getContractionLayout(linalg::LinalgOp contractOp) const {
   auto maybeOpInfo = VectorContractOpInfo::inferFromIndexingMaps(
       contractOp.getIndexingMapsArray());
   if (failed(maybeOpInfo)) {
@@ -998,6 +998,11 @@ MMAScheduleAttr::getContractionLayout(linalg::GenericOp contractOp) const {
   MLIRContext *context = getContext();
 
   SmallVector<int64_t> bounds = contractOp.getStaticLoopRanges();
+  if (llvm::any_of(bounds,
+                   [](int64_t x) { return x == ShapedType::kDynamic; })) {
+    return failure();
+  }
+
   int64_t batchCount = opInfo.getBatchCount();
   if (batchCount == 1 && bounds[0] != 1) {
     LLVM_DEBUG({ llvm::errs() << "non-unit batch dimension\n"; });
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td
index 00829c087742..76b8f6bc025e 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td
@@ -251,7 +251,7 @@ def IREEGPU_MmaScheduleAttr : AttrDef {
    ::mlir::FailureOr<::std::tuple<VectorExt::VectorLayoutInterface,
                                   VectorExt::VectorLayoutInterface,
                                   VectorExt::VectorLayoutInterface>>
-    getContractionLayout(::mlir::linalg::GenericOp contractOp) const;
+    getContractionLayout(::mlir::linalg::LinalgOp contractOp) const;
   }];
 }
 
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtAttrs.td b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtAttrs.td
index 400a6eaf85a4..9b60503a6334 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtAttrs.td
+++ b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtAttrs.td
@@ -298,5 +298,4 @@ def NestedLayoutAttr : IREEVectorExt_Attr<"NestedLayout",
   let genVerifyDecl = 1;
 }
 
-
 #endif // IREE_DIALECT_VECTOREXT_ATTRS
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtOps.td b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtOps.td
index bb5c63424867..06c9b975b564 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtOps.td
+++ b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtOps.td
@@ -33,14 +33,29 @@ def IREEVectorExt_ToLayoutOp : IREEVectorExt_PureOp<"to_layout", [
   let description = [{
     The layout conversion operator takes a shaped value and a layout and
     transforms the value to have that layout.
+
+    If the "shared_memory_conversion" attribute is set, then this layout
+    change has to be materialized through shared memory.
   }];
   let arguments = (ins
     AnyShaped:$input,
-    VectorLayoutInterface:$layout
+    VectorLayoutInterface:$layout,
+    DefaultValuedAttr:$shared_memory_conversion
   );
   let results = (outs
     AnyShaped:$output
   );
+  let builders = [
+    OpBuilder<(ins "Value":$input,
+                   "VectorLayoutInterface":$layout,
+                   CArg<"bool", "false">:$shared_memory_conversion), [{
+      if (shared_memory_conversion) {
+        build($_builder, $_state, input.getType(), input, layout, UnitAttr::get(input.getContext()));
+      } else{
+        build($_builder, $_state, input.getType(), input, layout);
+      }
+    }]>
+  ];
   let extraClassDeclaration = [{
     bool hasTensorSemantics() {
       return isa<RankedTensorType>(getOutput().getType());
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/Transforms/VectorizeIREEVectorExtOps.cpp b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/Transforms/VectorizeIREEVectorExtOps.cpp
index 00c664c932a1..e2c5c0c47bd5 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/Transforms/VectorizeIREEVectorExtOps.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/Transforms/VectorizeIREEVectorExtOps.cpp
@@ -48,9 +48,8 @@ struct VectorizeToLayoutOpPattern final
 
     // Create the toLayout operation but with vector types instead.
     auto newLayoutOp = rewriter.create<IREE::VectorExt::ToLayoutOp>(
-        loc, newInput.getType(), newInput, toLayoutOp.getLayout());
-    // Set attributes.
-    newLayoutOp->setAttrs(toLayoutOp->getAttrs());
+        loc, newInput, toLayoutOp.getLayout(),
+        toLayoutOp.getSharedMemoryConversion());
 
     // Create the write back to a tensor.
     int64_t rank = inputTy.getRank();
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureTensorLayouts.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureTensorLayouts.cpp
index 6becfe281fbf..fdaa546585b1 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureTensorLayouts.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureTensorLayouts.cpp
@@ -25,7 +25,7 @@ namespace {
 
 LogicalResult setContractionAnchor(IREE::GPU::MMAScheduleAttr schedule,
                                    RewriterBase &rewriter,
-                                   linalg::GenericOp contract) {
+                                   linalg::LinalgOp contract) {
   // TODO: Add SIMT fallback.
   if (!schedule) {
     return contract->emitError("missing mma schedule for contraction");
@@ -43,25 +43,25 @@ LogicalResult setContractionAnchor(IREE::GPU::MMAScheduleAttr schedule,
   auto [aLayout, bLayout, cLayout] = *layouts;
   Location loc = contract.getLoc();
 
-  Value lhs = contract.getOperand(0);
-  Value rhs = contract.getOperand(1);
-  Value acc = contract.getOperand(2);
+  Value lhs = contract->getOperand(0);
+  Value rhs = contract->getOperand(1);
+  Value acc = contract->getOperand(2);
 
   // Set layouts for lhs, rhs and acc.
   rewriter.setInsertionPoint(contract);
-  auto layoutedLhs = rewriter.create<IREE::VectorExt::ToLayoutOp>(
-      loc, lhs.getType(), lhs, aLayout);
-  auto layoutedRhs = rewriter.create<IREE::VectorExt::ToLayoutOp>(
-      loc, rhs.getType(), rhs, bLayout);
-  auto layoutedAcc = rewriter.create<IREE::VectorExt::ToLayoutOp>(
-      loc, acc.getType(), acc, cLayout);
+  auto layoutedLhs =
+      rewriter.create<IREE::VectorExt::ToLayoutOp>(loc, lhs, aLayout);
+  auto layoutedRhs =
+      rewriter.create<IREE::VectorExt::ToLayoutOp>(loc, rhs, bLayout);
+  auto layoutedAcc =
+      rewriter.create<IREE::VectorExt::ToLayoutOp>(loc, acc, cLayout);
 
   // Promote matmul lhs and rhs.
   // TODO: We should read this from the lowering_config on the operation.
   // TODO: This is a hack until layout analysis is improved. The layout analysis
  // should decide where to put these shared memory conversions.
-  layoutedLhs->setAttr("shared_memory_conversion", rewriter.getUnitAttr());
-  layoutedRhs->setAttr("shared_memory_conversion", rewriter.getUnitAttr());
+  layoutedLhs.setSharedMemoryConversion(true);
+  layoutedRhs.setSharedMemoryConversion(true);
 
   contract->setOperand(0, layoutedLhs.getResult());
   contract->setOperand(1, layoutedRhs.getResult());
@@ -70,8 +70,8 @@ LogicalResult setContractionAnchor(IREE::GPU::MMAScheduleAttr schedule,
   // Set layout for result.
   rewriter.setInsertionPointAfter(contract);
   auto toLayout = rewriter.create<IREE::VectorExt::ToLayoutOp>(
-      loc, contract.getResult(0).getType(), contract.getResult(0), cLayout);
-  rewriter.replaceAllUsesExcept(contract.getResult(0), toLayout.getResult(),
+      loc, contract->getResult(0), cLayout);
+  rewriter.replaceAllUsesExcept(contract->getResult(0), toLayout.getResult(),
                                 toLayout);
 
   return success();
@@ -89,43 +89,31 @@ struct LLVMGPUConfigureTensorLayoutsPass final
     auto func = getOperation();
 
     std::array workgroupSize;
-    if (func->hasAttr("workgroup_size")) {
-      auto tmpSizes =
-          llvm::cast(func->getAttr("workgroup_size")).getValue();
-      for (auto [i, size] : llvm::enumerate(tmpSizes)) {
-        workgroupSize[i] = llvm::cast(size).getInt();
-      }
-    } else {
-      std::optional> maybeWorkgroupSize =
-          getWorkgroupSize(func);
-      if (!maybeWorkgroupSize) {
-        func->emitOpError()
-            << "unable to query workgroup_size information from entry point";
-        return signalPassFailure();
-      }
-      for (auto [index, value] : llvm::enumerate(maybeWorkgroupSize.value())) {
-        workgroupSize[index] = value;
-      }
-      for (auto index : llvm::seq(maybeWorkgroupSize->size(), 3)) {
-        workgroupSize[index] = 1;
-      }
+    std::optional> maybeWorkgroupSize =
+        getWorkgroupSize(func);
+    if (!maybeWorkgroupSize) {
+      func->emitOpError()
+          << "unable to query workgroup_size information from entry point";
+      return signalPassFailure();
+    }
+    for (auto [index, value] : llvm::enumerate(maybeWorkgroupSize.value())) {
+      workgroupSize[index] = value;
+    }
+    for (auto index : llvm::seq(maybeWorkgroupSize->size(), 3)) {
+      workgroupSize[index] = 1;
     }
 
     llvm::StringLiteral scheduleAttrName =
         IREE::GPU::MMAScheduleAttr::getMnemonic();
-    auto scheduleAttr =
-        func->getAttrOfType<IREE::GPU::MMAScheduleAttr>(scheduleAttrName);
-    if (!scheduleAttr) {
-      DictionaryAttr configDict = getTranslationInfo(func).getConfiguration();
-      scheduleAttr = dyn_cast_or_null<IREE::GPU::MMAScheduleAttr>(
-          configDict.get(scheduleAttrName));
-    }
+    DictionaryAttr configDict = getTranslationInfo(func).getConfiguration();
+    auto scheduleAttr = dyn_cast_or_null<IREE::GPU::MMAScheduleAttr>(
+        configDict.get(scheduleAttrName));
 
     // Vector layout option setter aimed at contractions. For now, layout
     // setting for other problems like reductions is TODO.
-    SmallVector<linalg::GenericOp> contracts;
+    SmallVector<linalg::LinalgOp> contracts;
 
-    func->walk([&](linalg::GenericOp linalgOp) {
+    func->walk([&](linalg::LinalgOp linalgOp) {
       if (linalg::isaContractionOpInterface(linalgOp)) {
         contracts.push_back(linalgOp);
       }
@@ -133,7 +121,7 @@ struct LLVMGPUConfigureTensorLayoutsPass final
 
     IRRewriter rewriter(func);
 
-    for (linalg::GenericOp contract : contracts) {
+    for (linalg::LinalgOp contract : contracts) {
       if (failed(setContractionAnchor(scheduleAttr, rewriter, contract))) {
         return signalPassFailure();
       }
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureVectorLayouts.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureVectorLayouts.cpp
index b1d83c417179..006f3a2fbbda 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureVectorLayouts.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureVectorLayouts.cpp
@@ -225,7 +225,7 @@ LogicalResult setTransferReadAnchor(ArrayRef<int64_t> workgroupSize,
   Location loc = transfer.getLoc();
   rewriter.setInsertionPointAfter(transfer);
   auto toLayout = rewriter.create<IREE::VectorExt::ToLayoutOp>(
-      loc, transfer.getResult().getType(), transfer.getResult(), layout);
+      loc, transfer.getResult(), layout);
   rewriter.replaceAllUsesExcept(transfer, toLayout.getResult(), toLayout);
 
   return success();
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
index dab959aa51d9..080fdd81bcfc 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -824,7 +824,6 @@ void addGPUVectorDistributePassPipeline(OpPassManager &funcPassManager,
   // this point.
   funcPassManager.addPass(createLinalgGeneralizeNamedOpsPass());
   if (!usePadToModelSharedMemcpy) {
-    // Folding unit dims gets confused with padding.
     LinalgFoldUnitExtentDimsPassOptions options;
     options.useRankReducingSlices = true;
     funcPassManager.addPass(mlir::createLinalgFoldUnitExtentDimsPass(options));
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp
index f720f07d938a..93eee368616d 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp
@@ -1670,20 +1670,20 @@ transform_dialect::SetContractionLayoutAttributes::apply(
     Operation *parentOp = operand.getDefiningOp();
     if (!parentOp || (parentOp->getNumResults() != 1))
       continue;
-    Value resolvedOperand = rewriter.create<IREE::VectorExt::ToLayoutOp>(
-        loc, operand.getType(), operand, readLayout);
+    Value resolvedOperand =
+        rewriter.create<IREE::VectorExt::ToLayoutOp>(loc, operand, readLayout);
     contract.setOperand(operandIndices[i], resolvedOperand);
   }
 }
 
  // Set layout anchors.
  rewriter.setInsertionPoint(contract);
-  Value newLhs = rewriter.create<IREE::VectorExt::ToLayoutOp>(
-      loc, contract.getLhsType(), contract.getLhs(), aLayout);
-  Value newRhs = rewriter.create<IREE::VectorExt::ToLayoutOp>(
-      loc, contract.getRhsType(), contract.getRhs(), bLayout);
-  Value newAcc = rewriter.create<IREE::VectorExt::ToLayoutOp>(
-      loc, contract.getAccType(), contract.getAcc(), cLayout);
+  Value newLhs =
+      rewriter.create<IREE::VectorExt::ToLayoutOp>(loc, contract.getLhs(), aLayout);
+  Value newRhs =
+      rewriter.create<IREE::VectorExt::ToLayoutOp>(loc, contract.getRhs(), bLayout);
+  Value newAcc =
+      rewriter.create<IREE::VectorExt::ToLayoutOp>(loc, contract.getAcc(), cLayout);
   contract.setOperand(0, newLhs);
   contract.setOperand(1, newRhs);
   contract.setOperand(2, newAcc);
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_tensor_layout.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_tensor_layout.mlir
index 06cd515dc0ff..2e41cb02fbfb 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_tensor_layout.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_tensor_layout.mlir
@@ -18,7 +18,7 @@
   iterator_types = ["parallel", "parallel", "reduction"]
 }
 
-func.func @matmul_96x64x16(%lhs: tensor<96x16xf16>,
+func.func @matmul_96x64x16_mfma(%lhs: tensor<96x16xf16>,
                     %rhs: tensor<64x16xf16>,
                     %init: tensor<96x64xf32>)
                     -> tensor<96x64xf32>
@@ -40,7 +40,58 @@ func.func @matmul_96x64x16(%lhs: tensor<96x16xf16>,
 // CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout
 // CHECK-DAG: #[[$NESTED2:.+]] = #iree_vector_ext.nested_layout
 
-// CHECK-LABEL: func.func @matmul_96x64x16
+// CHECK-LABEL: func.func @matmul_96x64x16_mfma
+
+// CHECK-DAG: %[[LHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED]] {shared_memory_conversion}
+// CHECK-DAG: %[[RHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED1]] {shared_memory_conversion}
+// CHECK-DAG: %[[ACC:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED2]]
+// CHECK: linalg.generic
+// CHECK-SAME: ins(%[[LHS]], %[[RHS]]
+// CHECK-SAME: outs(%[[ACC]]
+
+// -----
+
+#translation = #iree_codegen.translation_info,
+                                              subgroup_m_count = 1,
+                                              subgroup_n_count = 1>}>
+
+#maps = [
+  affine_map<(m, n, k) -> (m, k)>,
+  affine_map<(m, n, k) -> (n, k)>,
+  affine_map<(m, n, k) -> (m, n)>
+]
+
+#traits = {
+  indexing_maps = #maps,
+  iterator_types = ["parallel", "parallel", "reduction"]
+}
+
+func.func @matmul_96x64x16_wmma(%lhs: tensor<96x16xf16>,
+                    %rhs: tensor<64x16xf16>,
+                    %init: tensor<96x64xf32>)
+                    -> tensor<96x64xf32>
+                    attributes { translation_info = #translation } {
+  %out = linalg.generic #traits
+         ins(%lhs, %rhs: tensor<96x16xf16>, tensor<64x16xf16>)
+         outs(%init: tensor<96x64xf32>) {
+  ^bb0(%in: f16, %in_1: f16, %out: f32):
+    %ex = arith.extf %in : f16 to f32
+    %ex_1 = arith.extf %in_1 : f16 to f32
+    %mul = arith.mulf %ex, %ex_1 : f32
+    %sum = arith.addf %out, %mul : f32
+    linalg.yield %sum : f32
+  } -> tensor<96x64xf32>
+  return %out : tensor<96x64xf32>
+}
+
+// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout
+// CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout
+// CHECK-DAG: #[[$NESTED2:.+]] = #iree_vector_ext.nested_layout
+
+// CHECK-LABEL: func.func @matmul_96x64x16_wmma
 
 // CHECK-DAG: %[[LHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED]] {shared_memory_conversion}
 // CHECK-DAG: %[[RHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED1]] {shared_memory_conversion}

From fea5dd4e42b25d63e03154a709192bd88b382e65 Mon Sep 17 00:00:00 2001
From: Kunwar Grover
Date: Mon, 26 Aug 2024 11:49:33 +0000
Subject: [PATCH 4/5] Add logic to verify vector.contract distribution

---
 .../Common/GPU/AMDGPUDistributeContract.cpp   | 65 +++++++++++++++++++
 .../gpu_nested_layout_contract_amdgpu.mlir    |  2 +-
 .../Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp   |  6 +-
 3 files changed, 69 insertions(+), 4 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/AMDGPUDistributeContract.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/AMDGPUDistributeContract.cpp
index 362f0c1694e5..aeab85f52be6 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/AMDGPUDistributeContract.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/AMDGPUDistributeContract.cpp
@@ -21,6 +21,57 @@ namespace {
 using namespace mlir::iree_compiler::IREE::VectorExt;
 using VectorValue = TypedValue<VectorType>;
 
+static LogicalResult isSubgroupLayoutCompatible(
+    IREE::GPU::MMAAttr::SingleSubgroupLayout subgroupLayout,
+    NestedLayoutAttr layout, int64_t dim1, int64_t dim2) {
+  SmallVector<int64_t> element = {layout.getElementsPerThread()[dim1],
+                                  layout.getElementsPerThread()[dim2]};
+  SmallVector<int64_t> thread = {layout.getThreadsPerOuter()[dim1],
+                                 layout.getThreadsPerOuter()[dim2]};
+  SmallVector<int64_t> tstrides = {layout.getThreadStrides()[dim1],
+                                   layout.getThreadStrides()[dim2]};
+  SmallVector<int64_t> outer = {layout.getOutersPerBatch()[dim1],
+                                layout.getOutersPerBatch()[dim2]};
+
+  if (subgroupLayout.element != element) {
+    return failure();
+  }
+  if (subgroupLayout.thread != thread) {
+    return failure();
+  }
+  if (subgroupLayout.tstrides != tstrides) {
+    return failure();
+  }
+  if (subgroupLayout.outer != outer) {
+    return failure();
+  }
+
+  return success();
+}
+
+static LogicalResult isIntrinsicLayoutCompatible(VectorContractOpInfo &opInfo,
+                                                 IREE::GPU::MMAAttr intrinsic,
+                                                 NestedLayoutAttr lhsLayout,
+                                                 NestedLayoutAttr rhsLayout,
+                                                 NestedLayoutAttr accLayout) {
+  auto [lhsM, rhsN] = opInfo.getOperandMNIndex();
+  auto [lhsK, rhsK] = opInfo.getOperandKIndex();
+  auto [accM, accN] = opInfo.getResultMNIndex();
+  if (failed(isSubgroupLayoutCompatible(intrinsic.getASingleSubgroupLayout(),
+                                        lhsLayout, lhsM, lhsK))) {
+    return failure();
+  }
+  if (failed(isSubgroupLayoutCompatible(intrinsic.getBSingleSubgroupLayout(),
+                                        rhsLayout, rhsK, rhsN))) {
+    return failure();
+  }
+  if (failed(isSubgroupLayoutCompatible(intrinsic.getCSingleSubgroupLayout(),
+                                        accLayout, accM, accN))) {
+    return failure();
+  }
+  return success();
+}
+
 /// Distributes `vector.contract` ops with nested layouts.
 struct DistributeContract final : OpDistributionPattern<vector::ContractionOp> {
   using OpDistributionPattern::OpDistributionPattern;
@@ -63,6 +114,12 @@ struct DistributeContract final : OpDistributionPattern<vector::ContractionOp> {
       return rewriter.notifyMatchFailure(
           contractOp, "missing nested layout for contraction rhs");
     }
+    NestedLayoutAttr accLayout =
+        dyn_cast<NestedLayoutAttr>(signature[resultValue]);
+    if (!accLayout) {
+      return rewriter.notifyMatchFailure(
+          contractOp, "missing nested layout for contraction acc");
+    }
 
     // We assume there is an decision made before regarding which mfma intrinsic
     // to use and it is attached as an attribute to this contract op.
@@ -73,6 +130,14 @@ struct DistributeContract final : OpDistributionPattern<vector::ContractionOp> {
       contractOp, "missing iree.amdgpu.mma intrinsic attribute");
     }
 
+    // Check if the given intrinsic can be distributed with the given
+    // layouts.
+ if (failed(isIntrinsicLayoutCompatible(opDetail, mmaAttr, lhsLayout, + rhsLayout, accLayout))) { + return rewriter.notifyMatchFailure( + contractOp, "the intrinsic does not match the expected layouts"); + } + SmallVector distShape = resultLayout.getDistributedShape(); LLVM_DEBUG({ llvm::dbgs() << "distributed shape: ["; diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir index 6f13d1788867..bf2b3512468c 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir @@ -455,7 +455,7 @@ builtin.module attributes { transform.with_named_sequence } { elements_per_thread = [1, 4], subgroup_strides = [1, 1], - thread_strides = [32, 1] + thread_strides = [1, 32] > // C: shape = 32x64, layout = layoutC diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp index 36b0419638eb..3ac237960236 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp @@ -566,7 +566,7 @@ MMAAttr::SingleSubgroupLayout MMAAttr::getASingleSubgroupLayout() const { } case MMAIntrinsic::WMMA_F32_16x16x16_F16: case MMAIntrinsic::WMMA_F16_16x16x16_F16: { - return {/*outer=*/{1, 1}, /*thread=*/{16, 1}, /*strides=*/{1, 16}, + return {/*outer=*/{1, 1}, /*thread=*/{16, 1}, /*strides=*/{1, 0}, /*element=*/{1, 16}}; } } @@ -598,7 +598,7 @@ MMAAttr::SingleSubgroupLayout MMAAttr::getBSingleSubgroupLayout() const { } case MMAIntrinsic::WMMA_F32_16x16x16_F16: case MMAIntrinsic::WMMA_F16_16x16x16_F16: { - return {/*outer=*/{1, 1}, /*thread=*/{1, 16}, /*strides=*/{16, 1}, + return {/*outer=*/{1, 1}, /*thread=*/{1, 16}, /*strides=*/{0, 1}, /*element=*/{16, 1}}; } } @@ -624,7 +624,7 @@ MMAAttr::SingleSubgroupLayout MMAAttr::getCSingleSubgroupLayout() const { /*element=*/{1, 1}}; } case MMAIntrinsic::WMMA_F16_16x16x16_F16: { - return {/*outer=*/{16, 1}, /*thread=*/{1, 16}, /*strides=*/{16, 1}, + return {/*outer=*/{16, 1}, /*thread=*/{1, 16}, /*strides=*/{0, 1}, /*element=*/{1, 1}}; } } From 4cd00d0351aa630751c850cda7ae68b30ae5ad08 Mon Sep 17 00:00:00 2001 From: Kunwar Grover Date: Mon, 26 Aug 2024 11:54:30 +0000 Subject: [PATCH 5/5] Remove redundant workgroup size calculation --- .../LLVMGPU/LLVMGPUConfigureTensorLayouts.cpp | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureTensorLayouts.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureTensorLayouts.cpp index fdaa546585b1..082781ca41d2 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureTensorLayouts.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureTensorLayouts.cpp @@ -88,21 +88,6 @@ struct LLVMGPUConfigureTensorLayoutsPass final void runOnOperation() override { auto func = getOperation(); - std::array workgroupSize; - std::optional> maybeWorkgroupSize = - getWorkgroupSize(func); - if (!maybeWorkgroupSize) { - func->emitOpError() - << "unable to query workgroup_size information from entry point"; - return signalPassFailure(); - } - for (auto [index, value] : llvm::enumerate(maybeWorkgroupSize.value())) { - workgroupSize[index] = value; - } - for (auto index : llvm::seq(maybeWorkgroupSize->size(), 3)) { 
- workgroupSize[index] = 1; - } - llvm::StringLiteral scheduleAttrName = IREE::GPU::MMAScheduleAttr::getMnemonic(); DictionaryAttr configDict = getTranslationInfo(func).getConfiguration();