diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/AMDGPUDistributeContract.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/AMDGPUDistributeContract.cpp
index a52ead0f5f0f..362f0c1694e5 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/AMDGPUDistributeContract.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/AMDGPUDistributeContract.cpp
@@ -28,6 +28,14 @@ struct DistributeContract final : OpDistributionPattern<vector::ContractionOp> {
   LogicalResult matchAndRewrite(vector::ContractionOp contractOp,
                                 DistributionSignature &signature,
                                 PatternRewriter &rewriter) const override {
+    // Infer the contract kind so that we know how to correlate M/N/K dims.
+    auto maybeOpDetail = VectorContractOpInfo::inferFromIndexingMaps(
+        contractOp.getIndexingMapsArray());
+    if (failed(maybeOpDetail)) {
+      return rewriter.notifyMatchFailure(contractOp, "invalid contraction");
+    }
+    VectorContractOpInfo opDetail = maybeOpDetail.value();
+
     auto resultType = dyn_cast<VectorType>(contractOp.getResultType());
     if (!resultType) {
       return rewriter.notifyMatchFailure(
@@ -65,9 +73,6 @@ struct DistributeContract final : OpDistributionPattern<vector::ContractionOp> {
           contractOp, "missing iree.amdgpu.mma intrinsic attribute");
     }
 
-    // Infer the contract kind so that we know know to correlate M/N/K dims.
-    VectorContractOpInfo opDetail(contractOp);
-
     SmallVector<int64_t> distShape = resultLayout.getDistributedShape();
     LLVM_DEBUG({
       llvm::dbgs() << "distributed shape: [";
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUVectorAlloc.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUVectorAlloc.cpp
index 127b436c8885..5578c1d4ebd0 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUVectorAlloc.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUVectorAlloc.cpp
@@ -5,6 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 #include "iree/compiler/Codegen/Common/GPU/Passes.h"
+#include "iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtDialect.h"
 #include "iree/compiler/Codegen/Utils/GPUUtils.h"
 #include "iree/compiler/Codegen/Utils/LinalgOpInfo.h"
 #include "llvm/Support/Debug.h"
@@ -104,46 +105,41 @@ struct GPUVectorAllocPass final
   void runOnOperation() override {
     FunctionOpInterface funcOp = getOperation();
 
-    SmallVector<vector::ContractionOp> opsToPromote;
-    funcOp.walk([&](vector::ContractionOp op) {
-      // Today we only do promotion for certain contractions.
-      if (contractOpFilter(op))
+    SmallVector<IREE::VectorExt::ToLayoutOp> opsToPromote;
+    funcOp.walk([&](IREE::VectorExt::ToLayoutOp op) {
+      if (op->hasAttr("shared_memory_conversion")) {
         opsToPromote.push_back(op);
+      }
     });
-    for (vector::ContractionOp contractOp : opsToPromote) {
-      OpBuilder builder(contractOp);
+
+    for (IREE::VectorExt::ToLayoutOp op : opsToPromote) {
+      OpBuilder builder(op);
 
       // HACK: Until proper barrier placement is handled later we have to
       // synchronize explicitly in this pass.
       // Synchronize before the write to shared memory to avoid stepping over
       // reads in the previous iteration of a loop.
-      builder.create<gpu::BarrierOp>(contractOp->getLoc());
+      builder.create<gpu::BarrierOp>(op->getLoc());
 
       // Promote both of the input operands, excluding the accumulator.
- OpOperand &lhs = contractOp.getLhsMutable(); - FailureOr lhsRet = - allocateTensorForVector(builder, contractOp->getLoc(), lhs.get()); - if (failed(lhsRet)) { - return signalPassFailure(); - } - - OpOperand &rhs = contractOp.getRhsMutable(); - FailureOr rhsRet = - allocateTensorForVector(builder, contractOp->getLoc(), rhs.get()); - if (failed(rhsRet)) { + OpOperand &operand = op.getInputMutable(); + FailureOr ret = + allocateTensorForVector(builder, op->getLoc(), operand.get()); + if (failed(ret)) { return signalPassFailure(); } // Synchronize after the write to shared memory before we read from it. - builder.create(contractOp->getLoc()); - - Value lhsVec = - readVectorFromTensor(builder, contractOp.getLhsType(), *lhsRet); - Value rhsVec = - readVectorFromTensor(builder, contractOp.getRhsType(), *rhsRet); - lhs.set(lhsVec); - rhs.set(rhsVec); + builder.create(op->getLoc()); + + VectorType inputTy = cast(op.getType()); + Value read = readVectorFromTensor(builder, inputTy, *ret); + operand.set(read); + + // Remove the shared_memory_conversion attribute from the to_layout + // operation. + op->removeAttr("shared_memory_conversion"); } } }; diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp index 4b6cd7b8dd66..c865f99854d3 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp @@ -957,8 +957,14 @@ NestedLayoutAttr createNestedLayout(MLIRContext *context, int64_t rank, FailureOr> -MMAScheduleAttr::getContractionLayout(vector::ContractionOp contractOp) const { - VectorContractOpInfo opInfo(contractOp); +MMAScheduleAttr::getContractionLayout(linalg::GenericOp contractOp) const { + auto maybeOpInfo = VectorContractOpInfo::inferFromIndexingMaps( + contractOp.getIndexingMapsArray()); + if (failed(maybeOpInfo)) { + return failure(); + } + VectorContractOpInfo opInfo = maybeOpInfo.value(); + LLVM_DEBUG({ llvm::errs() << "Getting mma layouts for:\n" << contractOp << "\n"; llvm::errs() << "For schedule: " << *this << "\n"; @@ -971,8 +977,7 @@ MMAScheduleAttr::getContractionLayout(vector::ContractionOp contractOp) const { auto mmaAttr = llvm::cast(getIntrinsic()); MLIRContext *context = getContext(); - SmallVector bounds; - contractOp.getIterationBounds(bounds); + SmallVector bounds = contractOp.getStaticLoopRanges(); int64_t batchCount = opInfo.getBatchCount(); if (batchCount == 1 && bounds[0] != 1) { LLVM_DEBUG({ llvm::errs() << "non-unit batch dimension\n"; }); diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h index 295aaf0158c1..8e6a9e15a716 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h @@ -11,6 +11,7 @@ #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.h" #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.h" #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUInterfaces.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h" #include "mlir/Dialect/Utils/StructuredOpsUtils.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td index 6809bc6f3009..8901dcc5a8af 100644 --- 
a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td @@ -253,7 +253,7 @@ def IREEGPU_MmaScheduleAttr : AttrDef { ::mlir::FailureOr<::std::tuple> - getContractionLayout(::mlir::vector::ContractionOp contractOp) const; + getContractionLayout(::mlir::linalg::GenericOp contractOp) const; }]; } diff --git a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/Transforms/VectorizeIREEVectorExtOps.cpp b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/Transforms/VectorizeIREEVectorExtOps.cpp index 84cb05260830..00c664c932a1 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/Transforms/VectorizeIREEVectorExtOps.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/Transforms/VectorizeIREEVectorExtOps.cpp @@ -49,6 +49,8 @@ struct VectorizeToLayoutOpPattern final // Create the toLayout operation but with vector types instead. auto newLayoutOp = rewriter.create( loc, newInput.getType(), newInput, toLayoutOp.getLayout()); + // Set attributes. + newLayoutOp->setAttrs(toLayoutOp->getAttrs()); // Create the write back to a tensor. int64_t rank = inputTy.getRank(); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/AMDGPUChainedMatmulPass.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/AMDGPUChainedMatmulPass.cpp index d22bd206b053..72d70f3354d0 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/AMDGPUChainedMatmulPass.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/AMDGPUChainedMatmulPass.cpp @@ -66,6 +66,14 @@ struct AMDGPUPrepareForChainedMatmulPass registry.insert(); } + VectorContractOpInfo getOpInfo(vector::ContractionOp contract) const { + auto maybeOpInfo = VectorContractOpInfo::inferFromIndexingMaps( + contract.getIndexingMapsArray()); + assert(succeeded(maybeOpInfo) && + "contraction info for vector.contract should always be valid"); + return maybeOpInfo.value(); + } + VectorValue swapDims(RewriterBase &rewriter, VectorValue val, int64_t dimA, int64_t dimB) const { ArrayRef shape = val.getType().getShape(); @@ -106,7 +114,7 @@ struct AMDGPUPrepareForChainedMatmulPass /// simply swap the operands without transposing them. 
void swapOperandsAndTranspose(RewriterBase &rewriter, vector::ContractionOp contractOp) const { - VectorContractOpInfo opInfo(contractOp); + VectorContractOpInfo opInfo = getOpInfo(contractOp); auto [lhsM, rhsN] = opInfo.getOperandMNIndex(); auto [lhsK, rhsK] = opInfo.getOperandKIndex(); auto [accM, accN] = opInfo.getResultMNIndex(); @@ -174,7 +182,7 @@ struct AMDGPUPrepareForChainedMatmulPass bool isOperandSwapInvariant(vector::ContractionOp contractOp) const { // Check if the innermost m, n, k dimensions are in the order: // lhs: (m, k), rhs: (n, k) - VectorContractOpInfo opInfo(contractOp); + VectorContractOpInfo opInfo = getOpInfo(contractOp); auto [lhsM, rhsN] = opInfo.getOperandMNIndex(); auto [lhsK, rhsK] = opInfo.getOperandKIndex(); bool isLhsTransposed = lhsM > lhsK; diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel index c57e1f585186..83cf5c93a127 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel @@ -95,6 +95,7 @@ iree_compiler_cc_library( "KernelConfig.cpp", "LLVMGPUCastAddressSpaceFunction.cpp", "LLVMGPUCastTypeToFitMMA.cpp", + "LLVMGPUConfigureTensorLayouts.cpp", "LLVMGPUConfigureVectorLayouts.cpp", "LLVMGPULowerExecutableTarget.cpp", "LLVMGPUPackSharedMemoryAlloc.cpp", diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt index 30e722e39307..a08ed4b5c0be 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt @@ -80,6 +80,7 @@ iree_cc_library( "KernelConfig.cpp" "LLVMGPUCastAddressSpaceFunction.cpp" "LLVMGPUCastTypeToFitMMA.cpp" + "LLVMGPUConfigureTensorLayouts.cpp" "LLVMGPUConfigureVectorLayouts.cpp" "LLVMGPULowerExecutableTarget.cpp" "LLVMGPUPackSharedMemoryAlloc.cpp" diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUCastTypeToFitMMA.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUCastTypeToFitMMA.cpp index 621430b7e064..70a99b020ab8 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUCastTypeToFitMMA.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUCastTypeToFitMMA.cpp @@ -28,7 +28,12 @@ struct UpcastContractOutput final : OpRewritePattern { LogicalResult matchAndRewrite(vector::ContractionOp contractOp, PatternRewriter &rewriter) const override { - VectorContractOpInfo opInfo(contractOp); + auto maybeOpInfo = VectorContractOpInfo::inferFromIndexingMaps( + contractOp.getIndexingMapsArray()); + if (failed(maybeOpInfo)) { + return rewriter.notifyMatchFailure(contractOp, "not a contraction"); + } + VectorContractOpInfo opInfo = maybeOpInfo.value(); auto srcCType = dyn_cast(contractOp.getAccType()); if (!srcCType) { @@ -66,6 +71,8 @@ struct UpcastContractOutput final : OpRewritePattern { auto newContractOp = rewriter.create( loc, contractOp.getLhs(), contractOp.getRhs(), extOp, contractOp.getIndexingMaps(), contractOp.getIteratorTypes()); + newContractOp->setDiscardableAttrs( + contractOp->getDiscardableAttrDictionary()); rewriter.replaceOpWithNewOp(contractOp, srcCType, newContractOp); return success(); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureTensorLayouts.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureTensorLayouts.cpp new file mode 100644 index 000000000000..6b929dfc19b4 --- /dev/null +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureTensorLayouts.cpp @@ -0,0 
+1,157 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include
+
+#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
+#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
+#include "iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtDialect.h"
+#include "iree/compiler/Codegen/LLVMGPU/PassDetail.h"
+#include "iree/compiler/Codegen/LLVMGPU/Passes.h"
+#include "iree/compiler/Codegen/Utils/GPUUtils.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "mlir/Analysis/SliceAnalysis.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/TypeUtilities.h"
+
+#define DEBUG_TYPE "iree-llvmgpu-configure-vector-layouts"
+
+namespace mlir::iree_compiler {
+
+namespace {
+
+LogicalResult setContractionAnchor(IREE::GPU::MMAScheduleAttr schedule,
+                                   RewriterBase &rewriter,
+                                   linalg::GenericOp contract) {
+  // TODO: Add SIMT fallback.
+  if (!schedule) {
+    return contract->emitError("missing mma schedule for contraction");
+  }
+
+  // This function should only be called on a contraction op.
+  assert(linalg::isaContractionOpInterface(contract) &&
+         "cannot set contraction anchor on non contraction op");
+
+  auto layouts = schedule.getContractionLayout(contract);
+  if (failed(layouts)) {
+    return contract->emitError("cannot get concrete layout for contraction");
+  }
+
+  auto [aLayout, bLayout, cLayout] = *layouts;
+  Location loc = contract.getLoc();
+
+  Value lhs = contract.getOperand(0);
+  Value rhs = contract.getOperand(1);
+  Value acc = contract.getOperand(2);
+
+  // Set layouts for lhs, rhs and acc.
+  rewriter.setInsertionPoint(contract);
+  auto layoutedLhs = rewriter.create<IREE::VectorExt::ToLayoutOp>(
+      loc, lhs.getType(), lhs, aLayout);
+  auto layoutedRhs = rewriter.create<IREE::VectorExt::ToLayoutOp>(
+      loc, rhs.getType(), rhs, bLayout);
+  auto layoutedAcc = rewriter.create<IREE::VectorExt::ToLayoutOp>(
+      loc, acc.getType(), acc, cLayout);
+
+  // Promote matmul lhs and rhs.
+  // TODO: We should read this from the lowering_config on the operation.
+  // TODO: This is a hack until layout analysis is improved. The layout analysis
+  // should decide where to put these shared memory conversions.
+  layoutedLhs->setAttr("shared_memory_conversion", rewriter.getUnitAttr());
+  layoutedRhs->setAttr("shared_memory_conversion", rewriter.getUnitAttr());
+
+  contract->setOperand(0, layoutedLhs.getResult());
+  contract->setOperand(1, layoutedRhs.getResult());
+  contract->setOperand(2, layoutedAcc.getResult());
+
+  // Set layout for result.
+ rewriter.setInsertionPointAfter(contract); + auto toLayout = rewriter.create( + loc, contract.getResult(0).getType(), contract.getResult(0), cLayout); + rewriter.replaceAllUsesExcept(contract.getResult(0), toLayout.getResult(), + toLayout); + + return success(); +} + +struct LLVMGPUConfigureTensorLayoutsPass + : public LLVMGPUConfigureTensorLayoutsBase< + LLVMGPUConfigureTensorLayoutsPass> { +public: + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + registry.insert(); + } + + void runOnOperation() override { + auto func = getOperation(); + + std::array workgroupSize; + if (func->hasAttr("workgroup_size")) { + auto tmpSizes = + llvm::cast(func->getAttr("workgroup_size")).getValue(); + for (auto [i, size] : llvm::enumerate(tmpSizes)) { + workgroupSize[i] = llvm::cast(size).getInt(); + } + } else { + std::optional> maybeWorkgroupSize = + getWorkgroupSize(func); + if (!maybeWorkgroupSize) { + func->emitOpError() + << "unable to query workgroup_size information from entry point"; + return signalPassFailure(); + } + for (auto [index, value] : llvm::enumerate(maybeWorkgroupSize.value())) { + workgroupSize[index] = value; + } + for (auto index : llvm::seq(maybeWorkgroupSize->size(), 3)) { + workgroupSize[index] = 1; + } + } + + llvm::StringLiteral scheduleAttrName = + IREE::GPU::MMAScheduleAttr::getMnemonic(); + auto scheduleAttr = + func->getAttrOfType(scheduleAttrName); + if (!scheduleAttr) { + DictionaryAttr configDict = getTranslationInfo(func).getConfiguration(); + scheduleAttr = dyn_cast_or_null( + configDict.get(scheduleAttrName)); + } + + // Vector layout option setter aimed at contractions. For now, layout + // setting for other problems like reductions is TODO. + SmallVector contracts; + + func->walk([&](linalg::GenericOp linalgOp) { + if (linalg::isaContractionOpInterface(linalgOp)) { + contracts.push_back(linalgOp); + } + }); + + IRRewriter rewriter(func); + + for (linalg::GenericOp contract : contracts) { + if (failed(setContractionAnchor(scheduleAttr, rewriter, contract))) { + return signalPassFailure(); + } + } + } +}; +} // namespace + +std::unique_ptr> +createLLVMGPUConfigureTensorLayouts() { + return std::make_unique(); +} + +} // namespace mlir::iree_compiler diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureVectorLayouts.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureVectorLayouts.cpp index dee5e89b086b..a826fa26f290 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureVectorLayouts.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureVectorLayouts.cpp @@ -29,56 +29,6 @@ namespace mlir::iree_compiler { namespace { -// Sets an anchoring layout for the given contraction op. Looks for a -// supported mma type from the cached list of mma types and populates the -// necessary distribution pattern for those contractions. -LogicalResult setContractionAnchor(IREE::GPU::MMAScheduleAttr schedule, - RewriterBase &rewriter, - vector::ContractionOp contract) { - // TODO: Add SIMT fallback. - if (!schedule) { - return contract->emitError("missing mma schedule for contraction"); - } - - auto layouts = schedule.getContractionLayout(contract); - if (failed(layouts)) { - return contract->emitError("cannot get concrete layout for contraction"); - } - - auto [aLayout, bLayout, cLayout] = *layouts; - Location loc = contract.getLoc(); - - // Set layouts for lhs, rhs and acc. 
- rewriter.setInsertionPoint(contract); - Value layoutedLhs = rewriter.create( - loc, contract.getLhsType(), contract.getLhs(), aLayout); - Value layoutedRhs = rewriter.create( - loc, contract.getRhsType(), contract.getRhs(), bLayout); - Value layoutedAcc = rewriter.create( - loc, contract.getAccType(), contract.getAcc(), cLayout); - contract->setOperand(0, layoutedLhs); - contract->setOperand(1, layoutedRhs); - contract->setOperand(2, layoutedAcc); - - // Set layout for result. - rewriter.setInsertionPointAfter(contract); - auto toLayout = rewriter.create( - loc, contract.getResultType(), contract.getResult(), cLayout); - rewriter.replaceAllUsesExcept(contract, toLayout.getResult(), toLayout); - - // Set intrinsic kind. - contract->setAttr("iree.amdgpu.mma", schedule.getIntrinsic()); - - LLVM_DEBUG({ - llvm::dbgs() << "chosen a layout: " << aLayout << "\n"; - llvm::dbgs() << "chosen b layout: " << bLayout << "\n"; - llvm::dbgs() << "chosen c layout: " << cLayout << "\n"; - llvm::dbgs() << "anchor set on contract: " << contract << "\n"; - }); - - return success(); -} - // Sets a layout anchor for reads from global memory. // The layout this generates is approximately the following: // @@ -332,16 +282,10 @@ struct LLVMGPUConfigureVectorLayoutsPass // should receive layouts. Layout setting for other problems like reductions // is TODO. SmallVector reads; - SmallVector contracts; func->walk([&](Operation *op) { - llvm::TypeSwitch(op) - .Case([&](vector::TransferReadOp transfer) { - reads.push_back(transfer); - }) - .Case([&](vector::ContractionOp contract) { - contracts.push_back(contract); - }); + llvm::TypeSwitch(op).Case( + [&](vector::TransferReadOp transfer) { reads.push_back(transfer); }); }); IRRewriter rewriter(func); @@ -351,12 +295,6 @@ struct LLVMGPUConfigureVectorLayoutsPass return signalPassFailure(); } } - - for (vector::ContractionOp contract : contracts) { - if (failed(setContractionAnchor(scheduleAttr, rewriter, contract))) { - return signalPassFailure(); - } - } } }; } // namespace diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp index 957813aa8ce9..b79d0820f509 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp @@ -14,6 +14,7 @@ #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h" #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h" #include "iree/compiler/Codegen/Dialect/GPU/Transforms/Passes.h" +#include "iree/compiler/Codegen/Dialect/VectorExt/Transforms/Passes.h" #include "iree/compiler/Codegen/LLVMGPU/Passes.h" #include "iree/compiler/Codegen/LLVMGPU/ROCDLPasses.h" #include "iree/compiler/Codegen/Utils/GPUUtils.h" @@ -247,6 +248,8 @@ static void tileAndBufferize(OpPassManager &funcPassManager) { static void addGPUVectorizationPasses(OpPassManager &funcPassManager) { funcPassManager.addPass(createDecomposeConvolutionToLowerDimOpsPass()); funcPassManager.addPass(IREE::LinalgExt::createDecomposeIm2colPass()); + funcPassManager.addPass( + IREE::VectorExt::createVectorizeIREEVectorExtOpsPass()); // Vectorize. GenericVectorizationPassOptions options; options.vectorizePadding = true; @@ -772,8 +775,9 @@ void addGPUVectorDistributePassPipeline(OpPassManager &funcPassManager, // be safely dropped. This additionally allows vectorization of convolution to // `vector.contract` as filter dimensions are expected to be tiled to 1 by // this point. 
+ funcPassManager.addPass(createLinalgGeneralizeNamedOpsPass()); if (!usePadToModelSharedMemcpy) { - funcPassManager.addPass(createLinalgGeneralizeNamedOpsPass()); + // Folding unit dims gets confused with padding. LinalgFoldUnitExtentDimsPassOptions options; options.useRankReducingSlices = true; funcPassManager.addPass(mlir::createLinalgFoldUnitExtentDimsPass(options)); @@ -782,6 +786,8 @@ void addGPUVectorDistributePassPipeline(OpPassManager &funcPassManager, } funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass()); + // Set anchors at tensor level for vector distribution later. + funcPassManager.addPass(createLLVMGPUConfigureTensorLayouts()); // Linalg -> Vector addGPUVectorizationPasses(funcPassManager); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h index fb8427502278..e6fca1263a19 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h @@ -152,6 +152,10 @@ std::unique_ptr> createLLVMGPUPromoteMatmulToFitMMAPass( LLVMGPUMatmulPadOption option = LLVMGPUMatmulPadOption::ParallelDims); +// Pass to set layouts on tensors for later vector distribution. +std::unique_ptr> +createLLVMGPUConfigureTensorLayouts(); + // Pass to set layouts for vector distribution. std::unique_ptr> createLLVMGPUConfigureVectorLayouts(); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.td b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.td index 8ea7da497989..4fdf26a730ee 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.td +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.td @@ -60,6 +60,12 @@ def LLVMGPUCastTypeToFitMMA : InterfacePass<"iree-llvmgpu-cast-type-to-fit-mma", let constructor = "mlir::iree_compiler::createLLVMGPUCastTypeToFitMMAPass()"; } +def LLVMGPUConfigureTensorLayouts : + InterfacePass<"iree-llvmgpu-configure-tensor-layouts", "mlir::FunctionOpInterface"> { + let summary = "Pass to set layouts on tensors for later vector distribution"; + let constructor = "mlir::iree_compiler::createLLVMGPUConfigureTensorLayouts()"; +} + def LLVMGPUConfigureVectorLayouts : InterfacePass<"iree-llvmgpu-configure-vector-layouts", "mlir::FunctionOpInterface"> { let summary = "Pass to set layouts for vector distribution"; diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/AMDGPUDistributionPatterns.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/AMDGPUDistributionPatterns.cpp index 986a389e788c..48de6cdaeef8 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/AMDGPUDistributionPatterns.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/AMDGPUDistributionPatterns.cpp @@ -25,6 +25,13 @@ struct DistributeContractions final LogicalResult matchAndRewrite(vector::ContractionOp contractOp, DistributionSignature &signature, PatternRewriter &rewriter) const override { + auto maybeOpInfo = VectorContractOpInfo::inferFromIndexingMaps( + contractOp.getIndexingMapsArray()); + if (failed(maybeOpInfo)) { + return rewriter.notifyMatchFailure(contractOp, "invalid contraction"); + } + VectorContractOpInfo opInfo = maybeOpInfo.value(); + VectorValue result = dyn_cast(contractOp.getResult()); if (!result) { return rewriter.notifyMatchFailure(contractOp, @@ -67,7 +74,6 @@ struct DistributeContractions final Value vector = rewriter.create( loc, vectorType, rewriter.getZeroAttr(vectorType)); - VectorContractOpInfo opInfo(contractOp); auto [lhsK, rhsK] = opInfo.getOperandKIndex(); std::optional kBatch = 
layouts[LHS].getBatchDim(lhsK); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel index 67a84393a918..f6f67dbee8a0 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel @@ -74,6 +74,7 @@ iree_lit_test_suite( "transpose_pipeline_test.mlir", "ukernel_pipeline_transform.mlir", "configure_vector_layout.mlir", + "configure_tensor_layout.mlir", "vector_lowering.mlir", "vector_to_gpu.mlir", "winograd_pipeline_test.mlir", diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt index 46366b4d9fa6..362412dfa593 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt @@ -23,6 +23,7 @@ iree_lit_test_suite( "cast_type_to_fit_mma.mlir" "config_matvec.mlir" "config_winograd.mlir" + "configure_tensor_layout.mlir" "configure_vector_layout.mlir" "conv_pipeline_test_cuda.mlir" "conv_pipeline_test_rocm.mlir" diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/cast_type_to_fit_mma.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/cast_type_to_fit_mma.mlir index f8c43416b95f..4f97eb6d4163 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/cast_type_to_fit_mma.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/cast_type_to_fit_mma.mlir @@ -18,7 +18,8 @@ func.func @mfma_matmul_96x64x16_mm(%lhs: vector<96x16xf16>, %rhs: vector<16x64xf // CHECK: %[[MM:.+]] = vector.contract // CHECK-SAME: indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>] // CHECK-SAME iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind -// CHECK-SAME: %[[A]], %[[B]], %[[EXT]] : vector<96x16xf16>, vector<16x64xf16> into vector<96x64xf32> +// CHECK-SAME: %[[A]], %[[B]], %[[EXT]] +// CHECK-SAME: vector<96x16xf16>, vector<16x64xf16> into vector<96x64xf32> // CHECK: %[[TRUNC:.+]] = arith.truncf %[[MM]] : vector<96x64xf32> to vector<96x64xf16> // CHECK: return %[[TRUNC]] : vector<96x64xf16> @@ -60,7 +61,7 @@ func.func @mfma_matmul_96x64x16_mm_cannot_downcast(%lhs: vector<96x16xf16>, %rhs // CHECK-LABEL: func.func @mfma_matmul_96x64x16_mm_cannot_downcast // CHECK-NOT: arith.extf // CHECK: vector.contract -// CHECK-SAME: %{{.+}}, %{{.+}}, %{{.+}} : vector<96x16xf16>, vector<16x64xf16> into vector<96x64xf64> +// CHECK-SAME: vector<96x16xf16>, vector<16x64xf16> into vector<96x64xf64> // CHECK-NOT: arith.truncf // ----- @@ -83,7 +84,8 @@ func.func @wmma_matmul_48x32x32_mm(%lhs: vector<48x32xf16>, %rhs: vector<32x32xf // CHECK: %[[MM:.+]] = vector.contract // CHECK-SAME: indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>] // CHECK-SAME iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind -// CHECK-SAME: %[[A]], %[[B]], %[[EXT]] : vector<48x32xf16>, vector<32x32xf16> into vector<48x32xf32> +// CHECK-SAME: %[[A]], %[[B]], %[[EXT]] +// CHECK-SAME: vector<48x32xf16>, vector<32x32xf16> into vector<48x32xf32> // CHECK: %[[TRUNC:.+]] = arith.truncf %[[MM]] : vector<48x32xf32> to vector<48x32xf16> // CHECK: return %[[TRUNC]] : vector<48x32xf16> @@ -111,6 +113,7 @@ func.func @transform_dialect_mfma_matmul_96x64x16(%lhs: vector<96x16xf16>, %rhs: // CHECK: %[[MM:.+]] = vector.contract // CHECK-SAME: 
indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>] // CHECK-SAME iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind -// CHECK-SAME: %[[A]], %[[B]], %[[EXT]] : vector<96x16xf16>, vector<16x64xf16> into vector<96x64xf32> +// CHECK-SAME: %[[A]], %[[B]], %[[EXT]] +// CHECK-SAME: vector<96x16xf16>, vector<16x64xf16> into vector<96x64xf32> // CHECK: %[[TRUNC:.+]] = arith.truncf %[[MM]] : vector<96x64xf32> to vector<96x64xf16> // CHECK: return %[[TRUNC]] : vector<96x64xf16> diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_tensor_layout.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_tensor_layout.mlir new file mode 100644 index 000000000000..06cd515dc0ff --- /dev/null +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_tensor_layout.mlir @@ -0,0 +1,152 @@ +// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(func.func(iree-llvmgpu-configure-tensor-layouts, canonicalize, cse))' %s | FileCheck %s + +#translation = #iree_codegen.translation_info, + subgroup_m_count = 1, + subgroup_n_count = 1>}> + +#maps = [ + affine_map<(m, n, k) -> (m, k)>, + affine_map<(m, n, k) -> (n, k)>, + affine_map<(m, n, k) -> (m, n)> +] + +#traits = { + indexing_maps = #maps, + iterator_types = ["parallel", "parallel", "reduction"] +} + +func.func @matmul_96x64x16(%lhs: tensor<96x16xf16>, + %rhs: tensor<64x16xf16>, + %init: tensor<96x64xf32>) + -> tensor<96x64xf32> + attributes { translation_info = #translation } { + %out = linalg.generic #traits + ins(%lhs, %rhs: tensor<96x16xf16>, tensor<64x16xf16>) + outs(%init: tensor<96x64xf32>) { + ^bb0(%in: f16, %in_1: f16, %out: f32): + %ex = arith.extf %in : f16 to f32 + %ex_1 = arith.extf %in_1 : f16 to f32 + %mul = arith.mulf %ex, %ex_1 : f32 + %sum = arith.addf %out, %mul : f32 + linalg.yield %sum : f32 + } -> tensor<96x64xf32> + return %out : tensor<96x64xf32> +} + +// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout +// CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout +// CHECK-DAG: #[[$NESTED2:.+]] = #iree_vector_ext.nested_layout + +// CHECK-LABEL: func.func @matmul_96x64x16 + +// CHECK-DAG: %[[LHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED]] {shared_memory_conversion} +// CHECK-DAG: %[[RHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED1]] {shared_memory_conversion} +// CHECK-DAG: %[[ACC:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED2]] +// CHECK: linalg.generic +// CHECK-SAME: ins(%[[LHS]], %[[RHS]] +// CHECK-SAME: outs(%[[ACC]] + +// ----- + +#translation = #iree_codegen.translation_info, + subgroup_m_count = 4, + subgroup_n_count = 1>}> + +#maps = [ + affine_map<(m, n, k) -> (m, k)>, + affine_map<(m, n, k) -> (n, k)>, + affine_map<(m, n, k) -> (m, n)> +] + +#traits = { + indexing_maps = #maps, + iterator_types = ["parallel", "parallel", "reduction"] +} + +func.func @matmul_128x64x16_multi_subgroup(%lhs: tensor<128x16xf16>, + %rhs: tensor<64x16xf16>, + %init: tensor<128x64xf32>) + -> tensor<128x64xf32> + attributes { translation_info = #translation } { + %out = linalg.generic #traits + ins(%lhs, %rhs: tensor<128x16xf16>, tensor<64x16xf16>) + outs(%init: tensor<128x64xf32>) { + ^bb0(%in: f16, %in_1: f16, %out: f32): + %ex = arith.extf %in : f16 to f32 + %ex_1 = arith.extf %in_1 : f16 to f32 + %mul = arith.mulf %ex, %ex_1 : f32 + %sum = arith.addf %out, %mul : f32 + linalg.yield %sum : f32 + } -> tensor<128x64xf32> + return %out : tensor<128x64xf32> +} + 
+// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout, + subgroup_m_count = 2, + subgroup_n_count = 2>}> + +#maps = [ + affine_map<(bm, bn, m, n, k) -> (bm, m, k)>, + affine_map<(bm, bn, m, n, k) -> (bn, n, k)>, + affine_map<(bm, bn, m, n, k) -> (bm, m, bn, n)> +] + +#traits = { + indexing_maps = #maps, + iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"] +} + +func.func @packed_matmul_128x128x128(%lhs: tensor<8x16x16xf16>, + %rhs: tensor<8x16x16xf16>, + %init: tensor<8x16x8x16xf32>) + -> tensor<8x16x8x16xf32> + attributes { translation_info = #translation } { + %out = linalg.generic #traits + ins(%lhs, %rhs: tensor<8x16x16xf16>, tensor<8x16x16xf16>) + outs(%init: tensor<8x16x8x16xf32>) { + ^bb0(%in: f16, %in_1: f16, %out: f32): + %ex = arith.extf %in : f16 to f32 + %ex_1 = arith.extf %in_1 : f16 to f32 + %mul = arith.mulf %ex, %ex_1 : f32 + %sum = arith.addf %out, %mul : f32 + linalg.yield %sum : f32 + } -> tensor<8x16x8x16xf32> + return %out : tensor<8x16x8x16xf32> +} + + +// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout +// CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout +// CHECK-DAG: #[[$NESTED2:.+]] = #iree_vector_ext.nested_layout +// CHECK-LABEL: func.func @packed_matmul_128x128x128 + +// CHECK-DAG: %[[LHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED]] {shared_memory_conversion} +// CHECK-DAG: %[[RHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED1]] {shared_memory_conversion} +// CHECK-DAG: %[[ACC:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED2]] +// CHECK: linalg.generic +// CHECK-SAME: ins(%[[LHS]], %[[RHS]] +// CHECK-SAME: outs(%[[ACC]] diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_vector_layout.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_vector_layout.mlir index 2f12b1595f0e..6e16d2f65b44 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_vector_layout.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_vector_layout.mlir @@ -1,107 +1,5 @@ // RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(func.func(iree-llvmgpu-configure-vector-layouts, canonicalize, cse))' %s | FileCheck %s -#translation = #iree_codegen.translation_info, subgroup_m_count = 1, subgroup_n_count = 1>}> - -// Since CHECK-SAME doesnt work with CHECK-DAG, we cannot have prettier tests. 
- -// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout -// CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout -// CHECK-DAG: #[[$NESTED2:.+]] = #iree_vector_ext.nested_layout - -// CHECK-LABEL: func.func @mfma_matmul_96x64x16_mm -func.func @mfma_matmul_96x64x16_mm(%lhs: vector<96x16xf16>, %rhs: vector<16x64xf16>, %init: vector<96x64xf32>) -> vector<96x64xf32> attributes { translation_info = #translation } { - // CHECK-DAG: %[[LHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED]] - // CHECK-DAG: %[[RHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED1]] - // CHECK-DAG: %[[ACC:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED2]] - // CHECK: vector.contract - // CHECK-SAME: %[[LHS]], %[[RHS]], %[[ACC]] - %0 = vector.contract { - indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], - iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} - %lhs, %rhs, %init : vector<96x16xf16>, vector<16x64xf16> into vector<96x64xf32> - return %0 : vector<96x64xf32> -} - -// ----- - -#translation = #iree_codegen.translation_info, subgroup_m_count = 1, subgroup_n_count = 1>}> - -// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout -// CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout -// CHECK-DAG: #[[$NESTED2:.+]] = #iree_vector_ext.nested_layout - -// CHECK-LABEL: func.func @mfma_matmul_96x64x16_mmt -func.func @mfma_matmul_96x64x16_mmt(%lhs: vector<96x16xf16>, %rhs: vector<64x16xf16>, %init: vector<96x64xf32>) -> vector<96x64xf32> attributes { translation_info = #translation } { - // CHECK-DAG: %[[LHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED]] - // CHECK-DAG: %[[RHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED1]] - // CHECK-DAG: %[[ACC:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED2]] - // CHECK: vector.contract - // CHECK-SAME: %[[LHS]], %[[RHS]], %[[ACC]] - %0 = vector.contract { - indexing_maps = [affine_map<(m, n, k) -> (m, k)>, affine_map<(m, n, k) -> (n, k)>, affine_map<(m, n, d2) -> (m, n)>], - iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} - %lhs, %rhs, %init : vector<96x16xf16>, vector<64x16xf16> into vector<96x64xf32> - return %0 : vector<96x64xf32> -} - -// ----- - -#translation = #iree_codegen.translation_info, subgroup_m_count = 1, subgroup_n_count = 1>}> - -// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout -// CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout -// CHECK-DAG: #[[$NESTED2:.+]] = #iree_vector_ext.nested_layout - -// CHECK-LABEL: func.func @mfma_matmul_96x64x16_mmtt -func.func @mfma_matmul_96x64x16_mmtt(%lhs: vector<96x16xf16>, %rhs: vector<64x16xf16>, %init: vector<64x96xf32>) -> vector<64x96xf32> attributes { translation_info = #translation } { - // CHECK-DAG: %[[LHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED]] - // CHECK-DAG: %[[RHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED1]] - // CHECK-DAG: %[[ACC:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED2]] - // CHECK: vector.contract - // CHECK-SAME: %[[LHS]], %[[RHS]], %[[ACC]] - %0 = vector.contract { - indexing_maps = [affine_map<(m, n, k) -> (m, k)>, affine_map<(m, n, k) -> (n, k)>, affine_map<(m, n, k) -> (n, m)>], - iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} - %lhs, %rhs, %init : vector<96x16xf16>, vector<64x16xf16> into vector<64x96xf32> - return %0 : vector<64x96xf32> -} - -// ----- - -#translation = 
#iree_codegen.translation_info, subgroup_m_count = 2, subgroup_n_count = 1>}> - -// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout, %rhs: vector<16x64xf16>, %init: vector<192x64xf32>) -> vector<192x64xf32> attributes { translation_info = #translation } { - // CHECK-DAG: %[[LHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED]] - // CHECK-DAG: %[[RHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED1]] - // CHECK-DAG: %[[ACC:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED2]] - // CHECK: vector.contract - // CHECK-SAME: %[[LHS]], %[[RHS]], %[[ACC]] - %0 = vector.contract { - indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], - iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} - %lhs, %rhs, %init : vector<192x16xf16>, vector<16x64xf16> into vector<192x64xf32> - return %0 : vector<192x64xf32> -} - -// ----- - #translation = #iree_codegen.translation_info, // ----- -#translation = #iree_codegen.translation_info, subgroup_m_count = 1, subgroup_n_count = 1>}> - -// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout -// CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout -// CHECK-DAG: #[[$NESTED2:.+]] = #iree_vector_ext.nested_layout - -// CHECK-LABEL: func.func @wmma_matmul_48x32x32_mm -func.func @wmma_matmul_48x32x32_mm(%lhs: vector<48x32xf16>, %rhs: vector<32x32xf16>, %init: vector<48x32xf32>) -> vector<48x32xf32> attributes { translation_info = #translation } { - // CHECK-DAG: %[[LHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED]] - // CHECK-DAG: %[[RHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED1]] - // CHECK-DAG: %[[ACC:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED2]] - // CHECK: vector.contract - // CHECK-SAME: %[[LHS]], %[[RHS]], %[[ACC]] - %0 = vector.contract { - indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], - iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} - %lhs, %rhs, %init : vector<48x32xf16>, vector<32x32xf16> into vector<48x32xf32> - return %0 : vector<48x32xf32> -} - -// ----- - -#translation = #iree_codegen.translation_info, subgroup_m_count = 1, subgroup_n_count = 1>}> - -// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout -// CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout -// CHECK-DAG: #[[$NESTED2:.+]] = #iree_vector_ext.nested_layout - -// CHECK-LABEL: func.func @wmma_matmul_48x32x32_mmt -func.func @wmma_matmul_48x32x32_mmt(%lhs: vector<48x32xf16>, %rhs: vector<32x32xf16>, %init: vector<48x32xf32>) -> vector<48x32xf32> attributes { translation_info = #translation } { - // CHECK-DAG: %[[LHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED]] - // CHECK-DAG: %[[RHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED1]] - // CHECK-DAG: %[[ACC:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED2]] - // CHECK: vector.contract - // CHECK-SAME: %[[LHS]], %[[RHS]], %[[ACC]] - %0 = vector.contract { - indexing_maps = [affine_map<(m, n, k) -> (m, k)>, affine_map<(m, n, k) -> (n, k)>, affine_map<(m, n, d2) -> (m, n)>], - iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} - %lhs, %rhs, %init : vector<48x32xf16>, vector<32x32xf16> into vector<48x32xf32> - return %0 : vector<48x32xf32> -} - -// ----- - -#translation = #iree_codegen.translation_info, subgroup_m_count = 2, subgroup_n_count = 1>}> - -// CHECK-DAG: #[[$NESTED:.+]] = 
#iree_vector_ext.nested_layout -// CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout -// CHECK-DAG: #[[$NESTED2:.+]] = #iree_vector_ext.nested_layout - -// CHECK-LABEL: func.func @matmul_192x64x16_mmt_multi_m -func.func @matmul_192x64x16_mmt_multi_m(%lhs: vector<2x64x16xf16>, %rhs: vector<16x64xf16>, %init: vector<2x64x64xf32>) -> vector<2x64x64xf32> attributes { translation_info = #translation } { - // CHECK-DAG: %[[LHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED]] - // CHECK-DAG: %[[RHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED1]] - // CHECK-DAG: %[[ACC:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED2]] - // CHECK: vector.contract - // CHECK-SAME: %[[LHS]], %[[RHS]], %[[ACC]] - %0 = vector.contract { - indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], - iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind} - %lhs, %rhs, %init : vector<2x64x16xf16>, vector<16x64xf16> into vector<2x64x64xf32> - return %0 : vector<2x64x64xf32> -} - -// ----- - -#translation = #iree_codegen.translation_info, subgroup_m_count = 4, subgroup_n_count = 1>}> - -// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout -// CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout -// CHECK-DAG: #[[$NESTED2:.+]] = #iree_vector_ext.nested_layout - -// CHECK-LABEL: func.func @matmul_192x64x16_mmt_multi_split_m -func.func @matmul_192x64x16_mmt_multi_split_m(%lhs: vector<2x64x16xf16>, %rhs: vector<16x64xf16>, %init: vector<2x64x64xf32>) -> vector<2x64x64xf32> attributes { translation_info = #translation } { - // CHECK-DAG: %[[LHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED]] - // CHECK-DAG: %[[RHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED1]] - // CHECK-DAG: %[[ACC:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED2]] - // CHECK: vector.contract - // CHECK-SAME: %[[LHS]], %[[RHS]], %[[ACC]] - %0 = vector.contract { - indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], - iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind} - %lhs, %rhs, %init : vector<2x64x16xf16>, vector<16x64xf16> into vector<2x64x64xf32> - return %0 : vector<2x64x64xf32> -} - -// ----- - -#translation = #iree_codegen.translation_info, subgroup_m_count = 2, subgroup_n_count = 2>, workgroup_size = [128, 2, 1]}> - -// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout, %rhs: vector<2x16x64xf16>, %init: vector<4x2x64x64xf32>) -> vector<4x2x64x64xf32> attributes { translation_info = #translation } { - // CHECK-DAG: %[[LHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED]] - // CHECK-DAG: %[[RHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED1]] - // CHECK-DAG: %[[ACC:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED2]] - // CHECK: vector.contract - // CHECK-SAME: %[[LHS]], %[[RHS]], %[[ACC]] - %0 = vector.contract { - indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], - iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"], kind = #vector.kind} - %lhs, %rhs, %init : vector<4x64x16xf16>, vector<2x16x64xf16> into vector<4x2x64x64xf32> - return %0 : vector<4x2x64x64xf32> -} - -// ----- - #translation = #iree_codegen.translation_info, 
memref<128x128xf16, #gpu.address_space> return } - -// ----- - -// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout -// CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout -// CHECK-DAG: #[[$NESTED2:.+]] = #iree_vector_ext.nested_layout - -#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> -#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> -#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> -#translation = #iree_codegen.translation_info, subgroup_m_count = 2, subgroup_n_count = 2>}> -// CHECK-LABEL: func.func @batch_matmul_unit_batch -func.func @batch_matmul_unit_batch(%arg0: vector<1x64x64xf16>, %arg1: vector<1x64x128xf16>, %arg2: vector<1x64x128xf32>) -> vector<1x64x128xf32> attributes {translation_info = #translation} { - // CHECK-DAG: %[[LHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED]] - // CHECK-DAG: %[[RHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED1]] - // CHECK-DAG: %[[ACC:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED2]] - // CHECK: vector.contract - // CHECK-SAME: %[[LHS]], %[[RHS]], %[[ACC]] - %0 = vector.contract { - indexing_maps = [#map, #map1, #map2], - iterator_types = ["parallel", "parallel", "parallel", "reduction"], - kind = #vector.kind} - %arg0, %arg1, %arg2 : vector<1x64x64xf16>, vector<1x64x128xf16> into vector<1x64x128xf32> - return %0 : vector<1x64x128xf32> -} diff --git a/compiler/src/iree/compiler/Codegen/Utils/VectorOpUtils.cpp b/compiler/src/iree/compiler/Codegen/Utils/VectorOpUtils.cpp index 0bf721ce738a..b6b30b519fe9 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/VectorOpUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Utils/VectorOpUtils.cpp @@ -26,24 +26,35 @@ std::pair VectorContractOpInfo::getResultMNIndex() const { return std::make_pair(outMDims.back(), outNDims.back()); } -VectorContractOpInfo::VectorContractOpInfo(vector::ContractionOp op) { - contractionDims = *linalg::inferContractionDims(op.getIndexingMapsArray()); - - SmallVector maps = op.getIndexingMapsArray(); - MLIRContext *ctx = op.getContext(); +FailureOr +VectorContractOpInfo::inferFromIndexingMaps(ArrayRef maps) { + auto maybeContractionDims = linalg::inferContractionDims(maps); + if (failed(maybeContractionDims)) { + return failure(); + } + auto contractionDims = maybeContractionDims.value(); + MLIRContext *ctx = maps[0].getContext(); + VectorContractOpInfo opInfo; for (auto m : contractionDims.m) { - lhsMDims.push_back(*maps[0].getResultPosition(getAffineDimExpr(m, ctx))); - outMDims.push_back(*maps[2].getResultPosition(getAffineDimExpr(m, ctx))); + opInfo.lhsMDims.push_back( + *maps[0].getResultPosition(getAffineDimExpr(m, ctx))); + opInfo.outMDims.push_back( + *maps[2].getResultPosition(getAffineDimExpr(m, ctx))); } for (auto n : contractionDims.n) { - rhsNDims.push_back(*maps[1].getResultPosition(getAffineDimExpr(n, ctx))); - outNDims.push_back(*maps[2].getResultPosition(getAffineDimExpr(n, ctx))); + opInfo.rhsNDims.push_back( + *maps[1].getResultPosition(getAffineDimExpr(n, ctx))); + opInfo.outNDims.push_back( + *maps[2].getResultPosition(getAffineDimExpr(n, ctx))); } - int64_t k = contractionDims.k.back(); - lhsKDim = *maps[0].getResultPosition(getAffineDimExpr(k, ctx)); - rhsKDim = *maps[1].getResultPosition(getAffineDimExpr(k, ctx)); + opInfo.lhsKDim = *maps[0].getResultPosition(getAffineDimExpr(k, ctx)); + opInfo.rhsKDim = *maps[1].getResultPosition(getAffineDimExpr(k, ctx)); + + opInfo.contractionDims = contractionDims; + + return opInfo; } } // namespace mlir::iree_compiler diff --git 
a/compiler/src/iree/compiler/Codegen/Utils/VectorOpUtils.h b/compiler/src/iree/compiler/Codegen/Utils/VectorOpUtils.h
index f8e0cf9ac763..bbe5d26106ff 100644
--- a/compiler/src/iree/compiler/Codegen/Utils/VectorOpUtils.h
+++ b/compiler/src/iree/compiler/Codegen/Utils/VectorOpUtils.h
@@ -5,14 +5,14 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 #include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h"
-#include "mlir/Dialect/Vector/IR/VectorOps.h"
 
 namespace mlir::iree_compiler {
 
 /// A class for querying information about a contract op.
 class VectorContractOpInfo {
 public:
-  explicit VectorContractOpInfo(vector::ContractionOp op);
+  static FailureOr<VectorContractOpInfo>
+  inferFromIndexingMaps(ArrayRef<AffineMap> maps);
 
   // Returns the (LHS M, RHS N) dimension index pair.
   std::pair<int, int> getOperandMNIndex() const;
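
Note: the following is a minimal caller sketch, not part of the diff. It shows how a call site migrates from the removed `VectorContractOpInfo(contractOp)` constructor to the new fallible `inferFromIndexingMaps` factory, using only calls that appear in the hunks above; `op` and `rewriter` are stand-ins for whatever contraction-like op (e.g. `vector.contract` or a `linalg.generic` contraction) and pattern context the caller already holds.

    // Build M/N/K info from the op's indexing maps; bail out of the pattern
    // if the maps do not describe a contraction.
    FailureOr<VectorContractOpInfo> maybeInfo =
        VectorContractOpInfo::inferFromIndexingMaps(op.getIndexingMapsArray());
    if (failed(maybeInfo))
      return rewriter.notifyMatchFailure(op, "invalid contraction");
    VectorContractOpInfo info = maybeInfo.value();
    // Query operand dimension indices as before.
    auto [lhsM, rhsN] = info.getOperandMNIndex();
    auto [lhsK, rhsK] = info.getOperandKIndex();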