From 8e4f9d40f0eed08c994c134eeafc52cfc61ff1e6 Mon Sep 17 00:00:00 2001 From: Guray Ozen Date: Mon, 4 Sep 2023 18:12:21 +0200 Subject: [PATCH 1/3] [MLIR][NVGPU] Introducing the `nvgpu.mbarrier.group` Type A common practice involves the creation of multiple MBarrier objects for utilization within loops, see an example below. This is particularly valuable in scenarios like software pipelining during matmul code generation, where we need to generate and employ five barriers dynamically within a loop. This work improves the `nvgpu.mbarrier.barrier` type into the `nvgpu.mbarrier.group` type. All MBarrier-related operations now use this type. Consequently, these operations are now capable of managing multiple barriers seamlessly. ``` %barriers = nvgpu.mbarrier.create -> !nvgpu.mbarrier.group, num_barriers = 3> nvgpu.mbarrier.init %barriers[%c0], %num_threads : !nvgpu.mbarrier.group, num_barriers = 3> nvgpu.mbarrier.init %barriers[%c1], %num_threads : !nvgpu.mbarrier.group, num_barriers = 3> nvgpu.mbarrier.init %barriers[%c2], %num_threads : !nvgpu.mbarrier.group, num_barriers = 3> ... 
scf.for %i = %c0 to %n step %c1 { %mbarId = arith.remui %i, 3 : index %isDone = nvgpu.mbarrier.test.wait %barriers[%mbarId], %token : !nvgpu.mbarrier.group, num_barriers = 3>, !tokenType } ``` Differential Revision: https://reviews.llvm.org/D159433 --- .../mlir/Conversion/NVGPUToNVVM/NVGPUToNVVM.h | 6 +- mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td | 62 ++++---- .../Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp | 135 +++++++++--------- .../NVGPU/TransformOps/NVGPUTransformOps.cpp | 39 ++--- .../Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir | 102 +++++++++---- 5 files changed, 201 insertions(+), 143 deletions(-) diff --git a/mlir/include/mlir/Conversion/NVGPUToNVVM/NVGPUToNVVM.h b/mlir/include/mlir/Conversion/NVGPUToNVVM/NVGPUToNVVM.h index 8c5667cd417f0..4b8d5c5fe2a89 100644 --- a/mlir/include/mlir/Conversion/NVGPUToNVVM/NVGPUToNVVM.h +++ b/mlir/include/mlir/Conversion/NVGPUToNVVM/NVGPUToNVVM.h @@ -23,15 +23,15 @@ class Pass; #include "mlir/Conversion/Passes.h.inc" namespace nvgpu { -class MBarrierType; +class MBarrierGroupType; /// Returns the memory space attribute of the mbarrier object. Attribute getMbarrierMemorySpace(MLIRContext *context, - MBarrierType barrierType); + MBarrierGroupType barrierType); /// Return the memref type that can be used to represent an mbarrier object. 
MemRefType getMBarrierMemrefType(MLIRContext *context, - MBarrierType barrierType); + MBarrierGroupType barrierType); } // namespace nvgpu void populateNVGPUToNVVMConversionPatterns(LLVMTypeConverter &converter, diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td index 90381648dac6a..5fcf08c6d3e1d 100644 --- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td +++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td @@ -135,20 +135,26 @@ def NVGPU_DeviceAsyncToken : NVGPU_Type<"DeviceAsyncToken", }]; } -def NVGPU_MBarrier : NVGPU_Type<"MBarrier", "mbarrier.barrier", []> { +def NVGPU_MBarrierGroup : NVGPU_Type<"MBarrierGroup", "mbarrier.group", []> { let summary = "mbarrier barrier type"; let description = [{ - This is the type for a mbarrier object in shared memory that is used - to synchronize a variable number of threads. + This is the type for one or more mbarrier object in shared memory that is + used to synchronize a variable number of threads. - The mbarrier object is 64 bit with 8 byte alignment. The mbarrier object - can be initiated and invalidated. + If `num_barriers` is not set, the number of mbarrier objects is 1. - See for more details: - https://docs.nvidia.com/cuda/parallel-thread-execution/#size-and-alignment-of-mbarrier-object + A mbarrier object is 64 bit with 8 byte alignment. The mbarrier object + can be initiated and invalidated. 
+ + [See for more details in PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#size-and-alignment-of-mbarrier-object) }]; - let parameters = (ins "Attribute":$memorySpace); + let parameters = (ins "Attribute":$memorySpace, DefaultValuedParameter<"unsigned", "1">:$num_barriers); let assemblyFormat = "`<` struct(params) `>`"; + let builders = [ + TypeBuilder<(ins "Attribute":$memorySpace), [{ + return $_get($_ctxt, memorySpace, 1); + }]> + ]; } def NVGPU_MBarrierToken : NVGPU_Type<"MBarrierToken", "mbarrier.token", []> { } @@ -486,7 +492,7 @@ def NVGPU_DeviceAsyncWaitOp : NVGPU_Op<"device_async_wait", []> { def NVGPU_MBarrierCreateOp : NVGPU_Op<"mbarrier.create", []> { let summary = "Creates a `nvgpu.mbarrier` object."; let description = [{ - The Op generates an `mbarrier` object, which is a barrier created in + The Op generates one or more `mbarrier` object, which is a barrier created in shared memory and supports various synchronization behaviors for threads. The `mbarrier` object has the following type and alignment requirements: @@ -498,9 +504,9 @@ def NVGPU_MBarrierCreateOp : NVGPU_Op<"mbarrier.create", []> { ``` }]; let arguments = (ins); - let results = (outs NVGPU_MBarrier:$barrier); + let results = (outs NVGPU_MBarrierGroup:$barriers); let assemblyFormat = [{ - attr-dict `->` type($barrier) + attr-dict `->` type($barriers) }]; } @@ -516,8 +522,8 @@ def NVGPU_MBarrierInitOp : NVGPU_Op<"mbarrier.init", []> { nvgpu.mbarrier.init %barrier, %num_threads : !nvgpu.mbarrier.barrier> ``` }]; - let arguments = (ins NVGPU_MBarrier:$barrier, Index:$count); - let assemblyFormat = "$barrier `,` $count attr-dict `:` type($barrier)"; + let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$count, Index:$mbarId); + let assemblyFormat = "$barriers `[` $mbarId `]` `,` $count attr-dict `:` type($barriers)"; } def NVGPU_MBarrierTestWaitOp : NVGPU_Op<"mbarrier.test.wait", []> { @@ -531,9 +537,9 @@ def NVGPU_MBarrierTestWaitOp : NVGPU_Op<"mbarrier.test.wait", 
[]> { %isComplete = nvgpu.mbarrier.test.wait %barrier, %token : !nvgpu.mbarrier.barrier>, !nvgpu.mbarrier.token ``` }]; - let arguments = (ins NVGPU_MBarrier:$barrier, NVGPU_MBarrierToken:$token); + let arguments = (ins NVGPU_MBarrierGroup:$barriers, NVGPU_MBarrierToken:$token, Index:$mbarId); let results = (outs I1:$waitComplete); - let assemblyFormat = "$barrier `,` $token attr-dict `:` type($barrier) `,` type($token)"; + let assemblyFormat = "$barriers `[` $mbarId `]` `,` $token attr-dict `:` type($barriers) `,` type($token)"; } def NVGPU_MBarrierArriveOp : NVGPU_Op<"mbarrier.arrive", []> { @@ -550,9 +556,9 @@ def NVGPU_MBarrierArriveOp : NVGPU_Op<"mbarrier.arrive", []> { %token = nvgpu.mbarrier.arrive %barrier : !nvgpu.mbarrier.barrier> -> !nvgpu.mbarrier.token ``` }]; - let arguments = (ins NVGPU_MBarrier:$barrier); + let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$mbarId); let results = (outs NVGPU_MBarrierToken:$token); -let assemblyFormat = "$barrier attr-dict `:` type($barrier) `->` type($token)"; +let assemblyFormat = "$barriers `[` $mbarId `]` attr-dict `:` type($barriers) `->` type($token)"; } def NVGPU_MBarrierArriveNoCompleteOp : NVGPU_Op<"mbarrier.arrive.nocomplete", []> { @@ -568,10 +574,10 @@ def NVGPU_MBarrierArriveNoCompleteOp : NVGPU_Op<"mbarrier.arrive.nocomplete", [] %token = nvgpu.mbarrier.arrive.noComplete %barrier, %count : !nvgpu.mbarrier.barrier> -> !nvgpu.mbarrier.token ``` }]; - let arguments = (ins NVGPU_MBarrier:$barrier, + let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$mbarId, Index:$count); let results = (outs NVGPU_MBarrierToken:$token); - let assemblyFormat = "$barrier `,` $count attr-dict `:` type($barrier) `->` type($token)"; + let assemblyFormat = "$barriers `[` $mbarId `]` `,` $count attr-dict `:` type($barriers) `->` type($token)"; } def NVGPU_MBarrierArriveExpectTxOp : NVGPU_Op<"mbarrier.arrive.expect_tx", []> { @@ -591,9 +597,8 @@ def NVGPU_MBarrierArriveExpectTxOp : 
NVGPU_Op<"mbarrier.arrive.expect_tx", []> { nvgpu.mbarrier.arrive.expect_tx %barrier, %ic0 : !nvgpu.mbarrier.barrier> ``` }]; - let arguments = (ins NVGPU_MBarrier:$barrier, - Index:$txcount); - let assemblyFormat = "$barrier `,` $txcount attr-dict `:` type($barrier)"; + let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$txcount, Index:$mbarId); + let assemblyFormat = "$barriers `[` $mbarId `]` `,` $txcount attr-dict `:` type($barriers)"; } def NVGPU_MBarrierTryWaitParityOp : NVGPU_Op<"mbarrier.try_wait.parity", []> { @@ -610,8 +615,8 @@ def NVGPU_MBarrierTryWaitParityOp : NVGPU_Op<"mbarrier.try_wait.parity", []> { ``` }]; - let arguments = (ins NVGPU_MBarrier:$barrier, Index:$phase, Index:$ticks); - let assemblyFormat = "$barrier `,` $phase `,` $ticks attr-dict `:` type($barrier)"; + let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$phase, Index:$ticks, Index:$mbarId); + let assemblyFormat = "$barriers `[` $mbarId `]` `,` $phase `,` $ticks attr-dict `:` type($barriers)"; } def NVGPU_TmaAsyncLoadOp : NVGPU_Op<"tma.async.load", []> { @@ -626,12 +631,13 @@ def NVGPU_TmaAsyncLoadOp : NVGPU_Op<"tma.async.load", []> { The Op uses `$barrier` mbarrier based completion mechanism. 
}]; let arguments = (ins Arg:$dst, - NVGPU_MBarrier:$barrier, + NVGPU_MBarrierGroup:$barriers, NVGPU_TensorMapDescriptor:$tensorMapDescriptor, - Variadic:$coordinates); + Variadic:$coordinates, + Index:$mbarId); let assemblyFormat = [{ - $tensorMapDescriptor `[` $coordinates `]` `,` $barrier `to` $dst - attr-dict `:` type($tensorMapDescriptor) `,` type($barrier) `->` type($dst) + $tensorMapDescriptor `[` $coordinates `]` `,` $barriers `[` $mbarId `]` `to` $dst + attr-dict `:` type($tensorMapDescriptor) `,` type($barriers) `->` type($dst) }]; let hasVerifier = 1; diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp index f74aa05c0c4c4..4d1f6641af6dc 100644 --- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp +++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp @@ -18,8 +18,10 @@ #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h" #include "mlir/Dialect/SCF/Transforms/Patterns.h" +#include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/PatternMatch.h" #include "mlir/IR/TypeUtilities.h" +#include "mlir/IR/Value.h" #include "mlir/Pass/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -218,14 +220,14 @@ static SmallVector unpackOperandVector(RewriterBase &rewriter, } /// Returns whether mbarrier object has shared memory address space. -static bool isMbarrierShared(nvgpu::MBarrierType barrierType) { +static bool isMbarrierShared(nvgpu::MBarrierGroupType barrierType) { return (mlir::nvgpu::NVGPUDialect::isSharedMemoryAddressSpace( barrierType.getMemorySpace())); } /// Returns the memory space attribute of the mbarrier object. 
Attribute nvgpu::getMbarrierMemorySpace(MLIRContext *context, - nvgpu::MBarrierType barrierType) { + nvgpu::MBarrierGroupType barrierType) { Attribute memorySpace = {}; if (isMbarrierShared(barrierType)) { memorySpace = @@ -236,25 +238,13 @@ Attribute nvgpu::getMbarrierMemorySpace(MLIRContext *context, } /// Returns memref type of the mbarrier object. The type is defined in the -/// MBarrierType. +/// MBarrierGroupType. MemRefType nvgpu::getMBarrierMemrefType(MLIRContext *context, - nvgpu::MBarrierType barrierType) { + nvgpu::MBarrierGroupType barrierType) { Attribute memorySpace = nvgpu::getMbarrierMemorySpace(context, barrierType); MemRefLayoutAttrInterface layout; - return MemRefType::get({1}, IntegerType::get(context, 64), layout, - memorySpace); -} - -/// Returns the base pointer of the mbarrier object. -static Value getMbarrierPtr(ConversionPatternRewriter &rewriter, - const LLVMTypeConverter &typeConverter, - TypedValue barrier, - Value barrierMemref) { - MemRefType memrefType = - nvgpu::getMBarrierMemrefType(rewriter.getContext(), barrier.getType()); - MemRefDescriptor memRefDescriptor(barrierMemref); - return memRefDescriptor.bufferPtr(rewriter, barrier.getLoc(), typeConverter, - memrefType); + return MemRefType::get({barrierType.getNumBarriers()}, + IntegerType::get(context, 64), layout, memorySpace); } namespace { @@ -441,7 +431,7 @@ struct ConvertNVGPUToNVVMPass [&](nvgpu::WarpgroupMatrixDescriptorType type) -> Type { return converter.convertType(IntegerType::get(type.getContext(), 64)); }); - converter.addConversion([&](nvgpu::MBarrierType type) -> Type { + converter.addConversion([&](nvgpu::MBarrierGroupType type) -> Type { return converter.convertType( nvgpu::getMBarrierMemrefType(rewriter.getContext(), type)); }); @@ -779,7 +769,7 @@ struct NVGPUMBarrierCreateLowering ConversionPatternRewriter &rewriter) const override { Operation *funcOp = op->getParentOp(); MemRefType barrierType = nvgpu::getMBarrierMemrefType( - rewriter.getContext(), 
op.getBarrier().getType()); + rewriter.getContext(), op.getBarriers().getType()); memref::GlobalOp global; if (auto moduleOp = funcOp->getParentOfType()) @@ -794,21 +784,37 @@ struct NVGPUMBarrierCreateLowering } }; +/// Base class for lowering mbarrier operations to nvvm intrinsics. +template +struct MBarrierBasePattern : public ConvertOpToLLVMPattern { +public: + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + /// Returns the base pointer of the mbarrier object. + Value getMbarrierPtr(Operation *op, nvgpu::MBarrierGroupType mbarType, + Value memrefDesc, Value mbarId, + ConversionPatternRewriter &rewriter) const { + MemRefType mbarrierMemrefType = + nvgpu::getMBarrierMemrefType(rewriter.getContext(), mbarType); + return ConvertToLLVMPattern::getStridedElementPtr( + op->getLoc(), mbarrierMemrefType, memrefDesc, {mbarId}, rewriter); + return memrefDesc; + } +}; + /// Lowers `nvgpu.mbarrier.init` to `nvvm.mbarrier.init` struct NVGPUMBarrierInitLowering - : public ConvertOpToLLVMPattern { - using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + : public MBarrierBasePattern { + using MBarrierBasePattern::MBarrierBasePattern; LogicalResult matchAndRewrite(nvgpu::MBarrierInitOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { + nvgpu::MBarrierGroupType mbarrierType = op.getBarriers().getType(); rewriter.setInsertionPoint(op); - Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(), - op.getBarrier(), adaptor.getBarrier()); - + Value barrier = getMbarrierPtr(op, mbarrierType, adaptor.getBarriers(), + adaptor.getMbarId(), rewriter); Value count = truncToI32(rewriter, op->getLoc(), adaptor.getCount()); - - if (isMbarrierShared(op.getBarrier().getType())) { + if (isMbarrierShared(mbarrierType)) { rewriter.replaceOpWithNewOp(op, barrier, count); } else { @@ -820,16 +826,17 @@ struct NVGPUMBarrierInitLowering /// Lowers `nvgpu.mbarrier.arrive` to `nvvm.mbarrier.arrive` struct NVGPUMBarrierArriveLowering - : public 
ConvertOpToLLVMPattern { - using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + : public MBarrierBasePattern { + using MBarrierBasePattern::MBarrierBasePattern; LogicalResult matchAndRewrite(nvgpu::MBarrierArriveOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(), - op.getBarrier(), adaptor.getBarrier()); + Value barrier = + getMbarrierPtr(op, op.getBarriers().getType(), adaptor.getBarriers(), + adaptor.getMbarId(), rewriter); Type tokenType = getTypeConverter()->convertType( nvgpu::MBarrierTokenType::get(op->getContext())); - if (isMbarrierShared(op.getBarrier().getType())) { + if (isMbarrierShared(op.getBarriers().getType())) { rewriter.replaceOpWithNewOp(op, tokenType, barrier); } else { @@ -843,19 +850,19 @@ struct NVGPUMBarrierArriveLowering /// Lowers `nvgpu.mbarrier.arrive.nocomplete` to /// `nvvm.mbarrier.arrive.nocomplete` struct NVGPUMBarrierArriveNoCompleteLowering - : public ConvertOpToLLVMPattern { - using ConvertOpToLLVMPattern< - nvgpu::MBarrierArriveNoCompleteOp>::ConvertOpToLLVMPattern; - + : public MBarrierBasePattern { + using MBarrierBasePattern< + nvgpu::MBarrierArriveNoCompleteOp>::MBarrierBasePattern; LogicalResult matchAndRewrite(nvgpu::MBarrierArriveNoCompleteOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(), - op.getBarrier(), adaptor.getBarrier()); + Value barrier = + getMbarrierPtr(op, op.getBarriers().getType(), adaptor.getBarriers(), + adaptor.getMbarId(), rewriter); Type tokenType = getTypeConverter()->convertType( nvgpu::MBarrierTokenType::get(op->getContext())); Value count = truncToI32(rewriter, op->getLoc(), adaptor.getCount()); - if (isMbarrierShared(op.getBarrier().getType())) { + if (isMbarrierShared(op.getBarriers().getType())) { rewriter.replaceOpWithNewOp( op, tokenType, barrier, count); } else { @@ -868,17 +875,16 @@ struct 
NVGPUMBarrierArriveNoCompleteLowering /// Lowers `nvgpu.mbarrier.test.wait` to `nvvm.mbarrier.test.wait` struct NVGPUMBarrierTestWaitLowering - : public ConvertOpToLLVMPattern { - using ConvertOpToLLVMPattern< - nvgpu::MBarrierTestWaitOp>::ConvertOpToLLVMPattern; - + : public MBarrierBasePattern { + using MBarrierBasePattern::MBarrierBasePattern; LogicalResult matchAndRewrite(nvgpu::MBarrierTestWaitOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(), - op.getBarrier(), adaptor.getBarrier()); + Value barrier = + getMbarrierPtr(op, op.getBarriers().getType(), adaptor.getBarriers(), + adaptor.getMbarId(), rewriter); Type retType = rewriter.getI1Type(); - if (isMbarrierShared(op.getBarrier().getType())) { + if (isMbarrierShared(op.getBarriers().getType())) { rewriter.replaceOpWithNewOp( op, retType, barrier, adaptor.getToken()); } else { @@ -890,18 +896,18 @@ struct NVGPUMBarrierTestWaitLowering }; struct NVGPUMBarrierArriveExpectTxLowering - : public ConvertOpToLLVMPattern { - using ConvertOpToLLVMPattern< - nvgpu::MBarrierArriveExpectTxOp>::ConvertOpToLLVMPattern; - + : public MBarrierBasePattern { + using MBarrierBasePattern< + nvgpu::MBarrierArriveExpectTxOp>::MBarrierBasePattern; LogicalResult matchAndRewrite(nvgpu::MBarrierArriveExpectTxOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(), - op.getBarrier(), adaptor.getBarrier()); + Value barrier = + getMbarrierPtr(op, op.getBarriers().getType(), adaptor.getBarriers(), + adaptor.getMbarId(), rewriter); Value txcount = truncToI32(rewriter, op->getLoc(), adaptor.getTxcount()); - if (isMbarrierShared(op.getBarrier().getType())) { + if (isMbarrierShared(op.getBarriers().getType())) { rewriter.replaceOpWithNewOp( op, barrier, txcount); return success(); @@ -914,19 +920,19 @@ struct NVGPUMBarrierArriveExpectTxLowering }; struct 
NVGPUMBarrierTryWaitParityLowering - : public ConvertOpToLLVMPattern { - using ConvertOpToLLVMPattern< - nvgpu::MBarrierTryWaitParityOp>::ConvertOpToLLVMPattern; - + : public MBarrierBasePattern { + using MBarrierBasePattern< + nvgpu::MBarrierTryWaitParityOp>::MBarrierBasePattern; LogicalResult matchAndRewrite(nvgpu::MBarrierTryWaitParityOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(), - op.getBarrier(), adaptor.getBarrier()); + Value barrier = + getMbarrierPtr(op, op.getBarriers().getType(), adaptor.getBarriers(), + adaptor.getMbarId(), rewriter); Value ticks = truncToI32(rewriter, op->getLoc(), adaptor.getTicks()); Value phase = truncToI32(rewriter, op->getLoc(), adaptor.getPhase()); - if (isMbarrierShared(op.getBarrier().getType())) { + if (isMbarrierShared(op.getBarriers().getType())) { rewriter.replaceOpWithNewOp( op, barrier, phase, ticks); return success(); @@ -939,16 +945,17 @@ struct NVGPUMBarrierTryWaitParityLowering }; struct NVGPUTmaAsyncLoadOpLowering - : public ConvertOpToLLVMPattern { - using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + : public MBarrierBasePattern { + using MBarrierBasePattern::MBarrierBasePattern; LogicalResult matchAndRewrite(nvgpu::TmaAsyncLoadOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { auto srcMemrefType = cast(op.getDst().getType()); Value dest = getStridedElementPtr(op->getLoc(), srcMemrefType, adaptor.getDst(), {}, rewriter); - Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(), - op.getBarrier(), adaptor.getBarrier()); + Value barrier = + getMbarrierPtr(op, op.getBarriers().getType(), adaptor.getBarriers(), + adaptor.getMbarId(), rewriter); SmallVector coords = adaptor.getCoordinates(); for (auto [index, value] : llvm::enumerate(coords)) { diff --git a/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp b/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp index 
680c21ab74fe0..373cd3caaa69e 100644 --- a/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp +++ b/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp @@ -818,7 +818,7 @@ struct HopperBuilder { HopperBuilder(RewriterBase &rewriter, Location loc) : rewriter(rewriter), loc(loc) {} - TypedValue + TypedValue buildAndInitBarrierInSharedMemory(OpFoldResult numThreads); /// Create tma descriptor op to initiate transfer from global to shared @@ -832,9 +832,9 @@ struct HopperBuilder { OpFoldResult buildTmaAsyncLoad(TypedValue globalDesc, TypedValue sharedMemref, - TypedValue barrier, + TypedValue barrier, SmallVectorImpl &loadOps); - void buildBarrierArriveTx(TypedValue barrier, + void buildBarrierArriveTx(TypedValue barrier, ArrayRef sizes); /// If threadIdx.x == 0 does TMA request + wait, else just wait. @@ -843,9 +843,9 @@ struct HopperBuilder { SmallVector buildPredicateLoadsOnThread0( ArrayRef> globalDescriptors, ArrayRef> sharedMemBuffers, - TypedValue barrier); + TypedValue barrier); - void buildTryWaitParity(TypedValue barrier); + void buildTryWaitParity(TypedValue barrier); RewriterBase &rewriter; Location loc; @@ -854,7 +854,7 @@ struct HopperBuilder { SmallVector HopperBuilder::buildPredicateLoadsOnThread0( ArrayRef> globalDescriptors, ArrayRef> sharedMemBuffers, - TypedValue barrier) { + TypedValue barrier) { SmallVector loadOps; Value zero = rewriter.create(loc, 0); Value tidx = rewriter.create(loc, gpu::Dimension::x); @@ -895,15 +895,18 @@ static Attribute getSharedAddressSpaceAttribute(OpBuilder &b) { // return b.getI64IntegerAttr(static_cast(kSharedMemorySpace)); } -TypedValue +TypedValue HopperBuilder::buildAndInitBarrierInSharedMemory(OpFoldResult numThreads) { auto sharedMemorySpace = getSharedAddressSpaceAttribute(rewriter); Value barrier = rewriter.create( - loc, nvgpu::MBarrierType::get(rewriter.getContext(), sharedMemorySpace)); + loc, + nvgpu::MBarrierGroupType::get(rewriter.getContext(), sharedMemorySpace)); + Value zero = 
rewriter.create(loc, 0); rewriter.create( - loc, barrier, getValueOrCreateConstantIndexOp(rewriter, loc, numThreads)); + loc, barrier, getValueOrCreateConstantIndexOp(rewriter, loc, numThreads), + zero); rewriter.create(loc); - return cast>(barrier); + return cast>(barrier); } TypedValue @@ -938,12 +941,12 @@ HopperBuilder::buildGlobalMemRefDescriptor(TypedValue memref, OpFoldResult HopperBuilder::buildTmaAsyncLoad( TypedValue globalDesc, TypedValue sharedMemref, - TypedValue barrier, + TypedValue barrier, SmallVectorImpl &loadOps) { MLIRContext *ctx = rewriter.getContext(); Value zero = rewriter.create(loc, 0); Operation *loadOp = rewriter.create( - loc, sharedMemref, barrier, globalDesc, ValueRange{zero, zero}); + loc, sharedMemref, barrier, globalDesc, ValueRange{zero, zero}, zero); loadOps.push_back(loadOp); auto mixedSizes = memref::getMixedSizes(rewriter, loc, sharedMemref); SmallVector symbols(mixedSizes.size()); @@ -957,7 +960,7 @@ OpFoldResult HopperBuilder::buildTmaAsyncLoad( } void HopperBuilder::buildBarrierArriveTx( - TypedValue barrier, + TypedValue barrier, ArrayRef mixedSizes) { assert(!mixedSizes.empty() && "expecte non-empty sizes"); MLIRContext *ctx = rewriter.getContext(); @@ -967,19 +970,21 @@ void HopperBuilder::buildBarrierArriveTx( OpFoldResult size = affine::makeComposedFoldedAffineApply(rewriter, loc, sumExpr, mixedSizes); Value sizeVal = getValueOrCreateConstantIndexOp(rewriter, loc, size); - rewriter.create(loc, barrier, sizeVal); + Value zero = rewriter.create(loc, 0); + rewriter.create(loc, barrier, sizeVal, zero); } void HopperBuilder::buildTryWaitParity( - TypedValue barrier) { + TypedValue barrier) { Value parity = rewriter.create(loc, 0); // 10M is an arbitrary, not too small or too big number to specify the number // of ticks before retry. // TODO: hoist this in a default dialect constant. 
Value ticksBeforeRetry = rewriter.create(loc, 10000000); + Value zero = rewriter.create(loc, 0); rewriter.create(loc, barrier, parity, - ticksBeforeRetry); + ticksBeforeRetry, zero); } //===----------------------------------------------------------------------===// @@ -1013,7 +1018,7 @@ SmallVector CopyBuilder::rewrite(ArrayRef copyOps) { ArrayRef{launchOp.getBlockSizeX(), launchOp.getBlockSizeY(), launchOp.getBlockSizeZ()}); - TypedValue barrier = + TypedValue barrier = buildAndInitBarrierInSharedMemory(numThreads); SmallVector> shmems; diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir index f011007e040ce..8c2f8dbbd5ad9 100644 --- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir +++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir @@ -470,28 +470,34 @@ func.func @mma_sp_sync_i8_16864(%arg0: vector<4x4xi8>, return %d : vector<2x2xi32> } -!barrierType = !nvgpu.mbarrier.barrier> +!barrierType = !nvgpu.mbarrier.group> !tokenType = !nvgpu.mbarrier.token // CHECK-LABEL: func @mbarrier func.func @mbarrier() { %num_threads = arith.constant 128 : index + // CHECK: %[[c0:.+]] = arith.constant 0 : index + // CHECK: %[[mid:.+]] = builtin.unrealized_conversion_cast %[[c0]] : index to i64 + %c0 = arith.constant 0 : index // CHECK: %[[barMemref:.+]] = memref.get_global @__mbarrier{{.*}} : memref<1xi64, 3> %barrier = nvgpu.mbarrier.create -> !barrierType // CHECK: %[[barStr:.+]] = builtin.unrealized_conversion_cast %[[barMemref]] : memref<1xi64, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> - // CHECK: %[[barPtr:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[base:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[barPtr:.+]] = llvm.getelementptr %[[base]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 // CHECK: 
nvvm.mbarrier.init.shared %[[barPtr]] - nvgpu.mbarrier.init %barrier, %num_threads : !barrierType + nvgpu.mbarrier.init %barrier[%c0], %num_threads : !barrierType - // CHECK: %[[barPtr2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[base2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[barPtr2:.+]] = llvm.getelementptr %[[base2]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 // CHECK: %[[token:.+]] = nvvm.mbarrier.arrive.shared %[[barPtr2]] - %token = nvgpu.mbarrier.arrive %barrier : !barrierType -> !tokenType + %token = nvgpu.mbarrier.arrive %barrier[%c0] : !barrierType -> !tokenType - // CHECK: %[[barPtr3:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[base3:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[barPtr3:.+]] = llvm.getelementptr %[[base3]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 // CHECK: nvvm.mbarrier.test.wait.shared %[[barPtr3]], %[[token]] - %isDone = nvgpu.mbarrier.test.wait %barrier, %token : !barrierType, !tokenType + %isDone = nvgpu.mbarrier.test.wait %barrier[%c0], %token : !barrierType, !tokenType func.return } @@ -500,63 +506,96 @@ func.func @mbarrier() { func.func @mbarrier_nocomplete() { %num_threads = arith.constant 128 : index %count = arith.constant 12 : index + // CHECK: %[[c0:.+]] = arith.constant 0 : index + // CHECK: %[[mid:.+]] = builtin.unrealized_conversion_cast %[[c0]] : index to i64 + %c0 = arith.constant 0 : index // CHECK: %[[barMemref:.+]] = memref.get_global @__mbarrier{{.*}} : memref<1xi64, 3> %barrier = nvgpu.mbarrier.create -> !barrierType // CHECK: %[[barStr:.+]] = builtin.unrealized_conversion_cast %[[barMemref]] : memref<1xi64, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x 
i64>)> - // CHECK: %[[barPtr:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[base:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[barPtr:.+]] = llvm.getelementptr %[[base]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 // CHECK: nvvm.mbarrier.init.shared %[[barPtr]] - nvgpu.mbarrier.init %barrier, %num_threads : !barrierType + nvgpu.mbarrier.init %barrier[%c0], %num_threads : !barrierType - // CHECK: %[[barPtr2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[base2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[barPtr2:.+]] = llvm.getelementptr %[[base2]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 // CHECK: %[[token:.+]] = nvvm.mbarrier.arrive.nocomplete.shared %[[barPtr2]] - %token = nvgpu.mbarrier.arrive.nocomplete %barrier, %count : !barrierType -> !tokenType + %token = nvgpu.mbarrier.arrive.nocomplete %barrier[%c0], %count : !barrierType -> !tokenType - // CHECK: %[[barPtr3:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[base3:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[barPtr3:.+]] = llvm.getelementptr %[[base3]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 // CHECK: nvvm.mbarrier.test.wait.shared %[[barPtr3]], %[[token]] - %isDone = nvgpu.mbarrier.test.wait %barrier, %token : !barrierType, !tokenType + %isDone = nvgpu.mbarrier.test.wait %barrier[%c0], %token : !barrierType, !tokenType func.return } +// CHECK-LABEL: func @mbarrier_wait +func.func @mbarrier_wait(%barriers : !nvgpu.mbarrier.group, num_barriers = 5>, %token : !tokenType) { + %c0 = arith.constant 0 : index + %c1 = 
arith.constant 1 : index + %n = arith.constant 100 : index + + %numBarriers = arith.constant 5 : index + + scf.for %i = %c0 to %n step %c1 { +// CHECK: %[[c5:.+]] = arith.constant 5 : index +// CHECK: scf.for %[[i:.*]] = +// CHECK: %[[S2:.+]] = arith.remui %[[i]], %[[c5]] : index +// CHECK: %[[S3:.+]] = builtin.unrealized_conversion_cast %[[S2]] : index to i64 +// CHECK: %[[S4:.+]] = llvm.extractvalue %0[1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> +// CHECK: %[[S5:.+]] = llvm.getelementptr %[[S4]][%[[S3]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 + %mbarId = arith.remui %i, %numBarriers : index + %isDone = nvgpu.mbarrier.test.wait %barriers[%mbarId], %token : !nvgpu.mbarrier.group, num_barriers = 5>, !tokenType + } + return +} + // CHECK-LABEL: func @mbarrier_txcount func.func @mbarrier_txcount() { - %num_threads = arith.constant 128 : index + %num_threads = arith.constant 128 : index + // CHECK: %[[c0:.+]] = arith.constant 0 : index + // CHECK: %[[mid:.+]] = builtin.unrealized_conversion_cast %[[c0]] : index to i64 + %c0 = arith.constant 0 : index // CHECK: %[[barMemref:.+]] = memref.get_global @__mbarrier{{.*}} : memref<1xi64, 3> %barrier = nvgpu.mbarrier.create -> !barrierType // CHECK: %[[barStr:.+]] = builtin.unrealized_conversion_cast %[[barMemref]] : memref<1xi64, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> - // CHECK: %[[barPtr:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[base:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[barPtr:.+]] = llvm.getelementptr %[[base]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 // CHECK: nvvm.mbarrier.init.shared %[[barPtr]] - nvgpu.mbarrier.init %barrier, %num_threads : !barrierType + nvgpu.mbarrier.init %barrier[%c0], %num_threads : !barrierType - %c0 = arith.constant 0 : index %tidxreg = 
nvvm.read.ptx.sreg.tid.x : i32 %tidx = arith.index_cast %tidxreg : i32 to index %cnd = arith.cmpi eq, %tidx, %c0 : index scf.if %cnd { %txcount = arith.constant 256 : index - // CHECK: %[[barPtr2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[base2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[barPtr2:.+]] = llvm.getelementptr %[[base2]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 // CHECK: nvvm.mbarrier.arrive.expect_tx.shared %[[barPtr2]] - nvgpu.mbarrier.arrive.expect_tx %barrier, %txcount : !barrierType + nvgpu.mbarrier.arrive.expect_tx %barrier[%c0], %txcount : !barrierType scf.yield } else { %txcount = arith.constant 0 : index - // CHECK: %[[barPtr2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[base2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[barPtr2:.+]] = llvm.getelementptr %[[base2]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 // CHECK: nvvm.mbarrier.arrive.expect_tx.shared %[[barPtr2]] - nvgpu.mbarrier.arrive.expect_tx %barrier, %txcount : !barrierType + nvgpu.mbarrier.arrive.expect_tx %barrier[%c0], %txcount : !barrierType scf.yield } %phase = arith.constant 0 : index %ticks = arith.constant 10000000 : index - // CHECK: %[[barPtr3:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[base3:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[barPtr3:.+]] = llvm.getelementptr %[[base3]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 // CHECK: nvvm.mbarrier.try_wait.parity.shared %[[barPtr3]] - nvgpu.mbarrier.try_wait.parity %barrier, %phase, %ticks : !barrierType + 
nvgpu.mbarrier.try_wait.parity %barrier[%c0], %phase, %ticks : !barrierType func.return } @@ -567,7 +606,7 @@ func.func @mbarrier_txcount() { !tensorMap3d = !nvgpu.tensormap.descriptor, swizzle=swizzle_64b, l2promo = l2promo_64b, oob = zero, interleave = none> !tensorMap4d = !nvgpu.tensormap.descriptor, swizzle=swizzle_128b,l2promo = l2promo_128b,oob = zero, interleave = interleave_16b> !tensorMap5d = !nvgpu.tensormap.descriptor, swizzle=none, l2promo = none, oob = zero, interleave = none> -!mbarrier = !nvgpu.mbarrier.barrier> +!mbarrier = !nvgpu.mbarrier.group> func.func @async_tma_load(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensorMap2d, %tensorMap3d: !tensorMap3d, %tensorMap4d: !tensorMap4d, %tensorMap5d: !tensorMap5d, %buffer1d: memref<128xf32,3>, %buffer2d: memref<32x32xf32,3>, @@ -575,18 +614,19 @@ func.func @async_tma_load(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensorMap2d %buffer4d: memref<2x2x32x32xf32,3>, %buffer5d: memref<2x2x2x32x32xf32,3>, %mbarrier: !mbarrier) { + %c0 = arith.constant 0 : index %crd0 = arith.constant 0 : index %crd1 = arith.constant 0 : index // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}] - nvgpu.tma.async.load %tensorMap1d[%crd0], %mbarrier to %buffer1d : !tensorMap1d, !mbarrier -> memref<128xf32,3> + nvgpu.tma.async.load %tensorMap1d[%crd0], %mbarrier[%c0] to %buffer1d : !tensorMap1d, !mbarrier -> memref<128xf32,3> // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}] - nvgpu.tma.async.load %tensorMap2d[%crd0, %crd1], %mbarrier to %buffer2d : !tensorMap2d, !mbarrier -> memref<32x32xf32,3> + nvgpu.tma.async.load %tensorMap2d[%crd0, %crd1], %mbarrier[%c0] to %buffer2d : !tensorMap2d, !mbarrier -> memref<32x32xf32,3> // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}] - nvgpu.tma.async.load %tensorMap3d[%crd0, %crd1, %crd0], %mbarrier to %buffer3d : !tensorMap3d, 
!mbarrier -> memref<2x32x32xf32,3> + nvgpu.tma.async.load %tensorMap3d[%crd0, %crd1, %crd0], %mbarrier[%c0] to %buffer3d : !tensorMap3d, !mbarrier -> memref<2x32x32xf32,3> // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] - nvgpu.tma.async.load %tensorMap4d[%crd0, %crd1, %crd1, %crd0], %mbarrier to %buffer4d : !tensorMap4d, !mbarrier -> memref<2x2x32x32xf32,3> + nvgpu.tma.async.load %tensorMap4d[%crd0, %crd1, %crd1, %crd0], %mbarrier[%c0] to %buffer4d : !tensorMap4d, !mbarrier -> memref<2x2x32x32xf32,3> // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] - nvgpu.tma.async.load %tensorMap5d[%crd0, %crd1, %crd1, %crd0, %crd0], %mbarrier to %buffer5d : !tensorMap5d, !mbarrier -> memref<2x2x2x32x32xf32,3> + nvgpu.tma.async.load %tensorMap5d[%crd0, %crd1, %crd1, %crd0, %crd0], %mbarrier[%c0] to %buffer5d : !tensorMap5d, !mbarrier -> memref<2x2x2x32x32xf32,3> func.return } @@ -621,12 +661,12 @@ module @mymodule { %rhsShmem3 = memref.subview %rhsShmem2[1,0,0][1, 64, 128][1, 1, 1] : memref<2x64x128xf16,3> to memref<1x64x128xf16, strided<[8192, 128, 1], offset: 8192>, 3> %rhsShmem = memref.subview %rhsShmem3[0,0,0][1, 64, 128][1, 1, 1] : memref<1x64x128xf16, strided<[8192, 128, 1], offset: 8192>, 3> to !shmemrhs // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global - nvgpu.tma.async.load %lhsTensorMap[%c0, %c0], %mbarrier to %lhsShmem : !lhsTensorMap, !barrierType -> !shmemlhs + nvgpu.tma.async.load %lhsTensorMap[%c0, %c0], %mbarrier[%c0] to %lhsShmem : !lhsTensorMap, !barrierType -> !shmemlhs // CHECK: %[[desc:.+]] = llvm.extractvalue %{{.*}}[1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> // CHECK: %[[c8192:.+]] = llvm.mlir.constant(8192 : index) : i64 // CHECK: %[[shmemOfset:.+]] = llvm.getelementptr %[[desc]][%[[c8192]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 // CHECK: 
nvvm.cp.async.bulk.tensor.shared.cluster.global %[[shmemOfset]], %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}] : !llvm.ptr<3>, !llvm.ptr, !llvm.ptr<3>, i32, i32 - nvgpu.tma.async.load %rhsTensorMap[%c0, %c0], %mbarrier to %rhsShmem : !rhsTensorMap, !barrierType -> !shmemrhs + nvgpu.tma.async.load %rhsTensorMap[%c0, %c0], %mbarrier[%c0] to %rhsShmem : !rhsTensorMap, !barrierType -> !shmemrhs return } } From 3d2ec4f8fa6db6b51055541b2abe1b80418412bc Mon Sep 17 00:00:00 2001 From: Guray Ozen Date: Mon, 11 Sep 2023 18:11:31 +0200 Subject: [PATCH 2/3] fix transform dialect test --- mlir/test/Dialect/NVGPU/tmaload-transform.mlir | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mlir/test/Dialect/NVGPU/tmaload-transform.mlir b/mlir/test/Dialect/NVGPU/tmaload-transform.mlir index 646008b64f794..30f8c45709bcd 100644 --- a/mlir/test/Dialect/NVGPU/tmaload-transform.mlir +++ b/mlir/test/Dialect/NVGPU/tmaload-transform.mlir @@ -34,7 +34,7 @@ func.func @main() { %out_1 = memref.get_global @bufferRhsGlobal : memref<8x128xf32, #gpu.address_space> // CHECK: %[[B:.*]] = nvgpu.mbarrier.create -> - // CHECK: nvgpu.mbarrier.init %[[B]], %{{.*}} : + // CHECK: nvgpu.mbarrier.init %[[B]][%{{.*}}], %{{.*}} : // CHECK: gpu.barrier // // CHECK: %[[c0:.*]] = arith.constant 0 : index @@ -44,27 +44,27 @@ func.func @main() { // CHECK: scf.if %[[CMP]] { // // CHECK: %[[c0_7:.*]] = arith.constant 0 : index - // CHECK: nvgpu.tma.async.load %[[D1]][%[[c0_7]], %[[c0_7]]], %[[B]] to %[[G1]] + // CHECK: nvgpu.tma.async.load %[[D1]][%[[c0_7]], %[[c0_7]]], %[[B]][%{{.*}}] to %[[G1]] // CHECK-SAME: : >, // CHECK-SAME: swizzle = none, l2promo = none, oob = zero, interleave = none>, // CHECK-SAME: -> memref<64x8xf32, #gpu.address_space> // // CHECK: %[[c0_8:.*]] = arith.constant 0 : index - // CHECK: nvgpu.tma.async.load %[[D2]][%[[c0_8]], %[[c0_8]]], %[[B]] to %[[G2]] + // CHECK: nvgpu.tma.async.load %[[D2]][%[[c0_8]], %[[c0_8]]], %[[B]][%{{.*}}] to %[[G2]] // CHECK-SAME: : >, // 
CHECK-SAME: swizzle = none, l2promo = none, oob = zero, interleave = none>, // CHECK-SAME: -> memref<8x128xf32, #gpu.address_space> // // CHECK: %[[c6144:.*]] = arith.constant 6144 : index - // CHECK: nvgpu.mbarrier.arrive.expect_tx %[[B]], %[[c6144]] : + // CHECK: nvgpu.mbarrier.arrive.expect_tx %[[B]][%{{.*}}], %[[c6144]] : // CHECK: } else { // CHECK: %[[c0_7:.*]] = arith.constant 0 : index - // CHECK: nvgpu.mbarrier.arrive.expect_tx %[[B]], %[[c0_7]] : + // CHECK: nvgpu.mbarrier.arrive.expect_tx %[[B]][%{{.*}}], %[[c0_7]] : // CHECK: } // // CHECK: %[[c0_6:.*]] = arith.constant 0 : index // CHECK: %[[c10000000:.*]] = arith.constant 10000000 : index - // CHECK: nvgpu.mbarrier.try_wait.parity %[[B]], %[[c0_6]], %[[c10000000]] : + // CHECK: nvgpu.mbarrier.try_wait.parity %[[B]][%{{.*}}], %[[c0_6]], %[[c10000000]] : /// Both copies are matched and end up in the same async group. linalg.copy ins(%memref: memref<64x8xf32>) outs(%out: memref<64x8xf32, #gpu.address_space>) From 11c79f82af37186c429ce9fd634a16ceda10138c Mon Sep 17 00:00:00 2001 From: Guray Ozen Date: Fri, 22 Sep 2023 16:43:59 +0200 Subject: [PATCH 3/3] fix transform dialect code --- mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp b/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp index 373cd3caaa69e..94d7d565ff1a9 100644 --- a/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp +++ b/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp @@ -70,7 +70,7 @@ void transform::ApplyNVGPUToNVVMConversionPatternsOp::populatePatterns( LLVM::LLVMStructType::getLiteral(type.getContext(), structBody); return llvmTypeConverter.convertType(convertedType); }); - llvmTypeConverter.addConversion([&](nvgpu::MBarrierType type) -> Type { + llvmTypeConverter.addConversion([&](nvgpu::MBarrierGroupType type) -> Type { return llvmTypeConverter.convertType( 
getMBarrierMemrefType(type.getContext(), type)); });