[GPUHeuristic] Modify schedule generator to consider distribution of transfer_read layout anchor #17636

Merged · 7 commits · Jun 12, 2024
45 changes: 36 additions & 9 deletions compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp
@@ -30,7 +30,9 @@ static int64_t calculateSharedMemoryUsedInBytes(const GPUMMASchedule &schedule,
}

bool isValidSchedule(const GPUMatmulShapeType &problem,
const GPUMMASchedule &schedule, const bool mustBeAligned) {
const GPUMMASchedule &schedule, const bool mustBeAligned,
const int64_t subgroupSize, const bool transposedLhs,
const bool transposedRhs) {
auto alignedMSize =
mustBeAligned
? problem.mSize
@@ -48,20 +50,43 @@ bool isValidSchedule(const GPUMatmulShapeType &problem,
bool isValidN = (alignedNSize % (schedule.nSize * schedule.nTileCount *
schedule.nWarpCount)) == 0;
bool isValidK = (alignedKSize % (schedule.kSize * schedule.kTileCount)) == 0;
return isValidN && isValidM && isValidK;

// Constraint to ensure wgTileSize is distributable by wgSize,
// such that we can distribute it to its corresponding vector.transfer_read.
const int64_t kMaxVectorLoadBitWidth = 128;
int64_t elemsPerThread =
kMaxVectorLoadBitWidth / problem.bType.getIntOrFloatBitWidth();
int64_t wgThreads = schedule.mWarpCount * schedule.nWarpCount * subgroupSize;

int64_t mWgSize = schedule.mSize * schedule.mTileCount * schedule.mWarpCount;
int64_t nWgSize = schedule.nSize * schedule.nTileCount * schedule.nWarpCount;
int64_t kWgSize = schedule.kSize * schedule.kTileCount;
int64_t innerLhsDimSize = transposedLhs ? mWgSize : kWgSize;
int64_t innerRhsDimSize = transposedRhs ? kWgSize : nWgSize;

bool isDistributableLhs =
(innerLhsDimSize / elemsPerThread) % wgThreads == 0 ||
wgThreads % (innerLhsDimSize / elemsPerThread) == 0;
bool isDistributableRhs =
(innerRhsDimSize / elemsPerThread) % wgThreads == 0 ||
wgThreads % (innerRhsDimSize / elemsPerThread) == 0;

return isValidN && isValidM && isValidK && isDistributableLhs &&
isDistributableRhs;
}

FailureOr<GPUMMASchedule>
fitScheduleInSharedMemory(const GPUMatmulShapeType &problem,
ArrayRef<GPUMatmulShapeType> intrinsics,
GPUMMASchedule schedule,
int64_t sharedMemLimitInBytes, bool mustBeAligned) {
FailureOr<GPUMMASchedule> fitScheduleInSharedMemory(
const GPUMatmulShapeType &problem, ArrayRef<GPUMatmulShapeType> intrinsics,
GPUMMASchedule schedule, int64_t sharedMemLimitInBytes,
int64_t subgroupSize, bool transposedLhs, bool transposedRhs,
bool mustBeAligned) {
int64_t lhsBitwidth =
intrinsics[schedule.index].aType.getIntOrFloatBitWidth();
int64_t rhsBitwidth =
intrinsics[schedule.index].bType.getIntOrFloatBitWidth();

while (!isValidSchedule(problem, schedule, mustBeAligned) ||
while (!isValidSchedule(problem, schedule, mustBeAligned, subgroupSize,
transposedLhs, transposedRhs) ||
calculateSharedMemoryUsedInBytes(schedule, lhsBitwidth, rhsBitwidth) >
sharedMemLimitInBytes) {
LLVM_DEBUG({
@@ -113,6 +138,7 @@ fitScheduleInSharedMemory(const GPUMatmulShapeType &problem,
FailureOr<GPUMMASchedule> deduceMMASchedule(
const GPUMatmulShapeType &problem, ArrayRef<GPUMatmulShapeType> intrinsics,
const GPUMMAHeuristicSeeds &seeds, int64_t sharedMemLimitInBytes,
int64_t subgroupSize, bool transposedLhs, bool transposedRhs,
bool canUpcastAcc, bool mustBeAligned) {
for (auto [index, intrinsic] : llvm::enumerate(intrinsics)) {
if (problem.aType != intrinsic.aType || problem.bType != intrinsic.bType) {
@@ -219,7 +245,8 @@ FailureOr<GPUMMASchedule> deduceMMASchedule(
GPUMMASchedule{index, intrinsic.mSize, intrinsic.nSize, intrinsic.kSize,
mWarpCount, nWarpCount, mTileCount, nTileCount,
kTileCount},
sharedMemLimitInBytes, mustBeAligned);
sharedMemLimitInBytes, subgroupSize, transposedLhs, transposedRhs,
mustBeAligned);
}
return failure();
}
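As a rough illustration of the new check: with 128-bit vector loads and f16 operands, each thread reads 8 elements, so a workgroup tile's contiguous inner dimension must split into a number of loads that either divides or is divided by the workgroup's thread count. The standalone sketch below re-derives that arithmetic; it is not the IREE implementation, and the schedule numbers are hypothetical.

```cpp
// Minimal standalone sketch of the distributability constraint, assuming
// 128-bit wide vector loads. Not the IREE implementation; the schedule
// numbers below are hypothetical.
#include <cstdint>
#include <iostream>

// True if an inner (contiguous) dimension of `innerDimSize` elements can be
// split into full 128-bit loads that divide evenly across `wgThreads` threads.
static bool isDistributable(int64_t innerDimSize, int64_t elemBitWidth,
                            int64_t wgThreads) {
  const int64_t kMaxVectorLoadBitWidth = 128;
  int64_t elemsPerThread = kMaxVectorLoadBitWidth / elemBitWidth; // 8 for f16
  int64_t numLoads = innerDimSize / elemsPerThread;
  return numLoads % wgThreads == 0 || wgThreads % numLoads == 0;
}

int main() {
  // Hypothetical schedule: 4 subgroups of 64 threads -> 256 threads, f16 data.
  int64_t wgThreads = 4 * 64;
  // 128 elements -> 16 loads; 256 % 16 == 0, so the tile is distributable.
  std::cout << isDistributable(/*innerDimSize=*/128, /*elemBitWidth=*/16,
                               wgThreads)
            << "\n";
  // 96 elements -> 12 loads; neither 12 % 256 nor 256 % 12 is 0, so such a
  // schedule would be shrunk and re-checked by fitScheduleInSharedMemory.
  std::cout << isDistributable(/*innerDimSize=*/96, /*elemBitWidth=*/16,
                               wgThreads)
            << "\n";
  return 0;
}
```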
11 changes: 7 additions & 4 deletions compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h
@@ -46,9 +46,12 @@ struct GPUMMASchedule {

/// Returns a schedule for using one of the given MMA |intrinsics| to target the
/// input |problem|. Returns std::nullopt if we cannot find such a schedule.
FailureOr<GPUMMASchedule> deduceMMASchedule(
const GPUMatmulShapeType &problem, ArrayRef<GPUMatmulShapeType> intrinsics,
const GPUMMAHeuristicSeeds &seeds, int64_t sharedMemLimitInBytes,
bool canUpcastAcc = false, bool mustBeAligned = true);
FailureOr<GPUMMASchedule>
deduceMMASchedule(const GPUMatmulShapeType &problem,
ArrayRef<GPUMatmulShapeType> intrinsics,
const GPUMMAHeuristicSeeds &seeds,
int64_t sharedMemLimitInBytes, int64_t subgroupSize,
bool transposedLhs = false, bool transposedRhs = false,
bool canUpcastAcc = false, bool mustBeAligned = true);

} // namespace mlir::iree_compiler
30 changes: 22 additions & 8 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -299,13 +299,14 @@ setConvolutionVectorDistributionConfig(IREE::GPU::TargetAttr target,
int64_t maxSharedMemoryBytes = target.getWgp().getMaxWorkgroupMemoryBytes();

// First try to find a schedule with an exactly matching intrinsic.
FailureOr<GPUMMASchedule> schedule =
deduceMMASchedule(problem, intrinsics, seeds, maxSharedMemoryBytes);
FailureOr<GPUMMASchedule> schedule = deduceMMASchedule(
problem, intrinsics, seeds, maxSharedMemoryBytes, targetSubgroupSize);
if (failed(schedule)) {
// Then try again by allowing upcasting accumulator.
schedule =
deduceMMASchedule(problem, intrinsics, seeds, maxSharedMemoryBytes,
/*canUpcastAcc=*/true);
schedule = deduceMMASchedule(
problem, intrinsics, seeds, maxSharedMemoryBytes, targetSubgroupSize,
/*transposedLhs*/ false, /*transposedRhs*/ false,
/*canUpcastAcc=*/true);
}
if (failed(schedule)) {
return failure();
@@ -465,14 +466,25 @@ setMatmulVectorDistributionConfig(IREE::GPU::TargetAttr target,

LDBG("Matmul Vector Distribution Config");

// First try to find a schedule with an exactly matching intrinsic.
auto pipeline = CodeGenPipeline::LLVMGPUVectorDistribute;
std::optional<GPUMMASchedule> schedule =
deduceMMASchedule(problem, intrinsics, seeds, maxSharedMemoryBytes);

// Infer if lhs or rhs is transposed to help generate better schedule.
SmallVector<AffineMap> maps = op.getIndexingMapsArray();
bool transposedLhs =
kDim !=
llvm::cast<AffineDimExpr>(maps[0].getResults().back()).getPosition();
bool transposedRhs =
nDim !=
llvm::cast<AffineDimExpr>(maps[1].getResults().back()).getPosition();

// First try to find a schedule with an exactly matching intrinsic.
std::optional<GPUMMASchedule> schedule = deduceMMASchedule(
problem, intrinsics, seeds, maxSharedMemoryBytes, targetSubgroupSize);
if (!schedule) {
// Then try again by allowing upcasting accumulator.
schedule =
deduceMMASchedule(problem, intrinsics, seeds, maxSharedMemoryBytes,
targetSubgroupSize, transposedLhs, transposedRhs,
/*canUpcastAcc=*/true);
}

@@ -485,11 +497,13 @@ setMatmulVectorDistributionConfig(IREE::GPU::TargetAttr target,
bool mustBeAligned = false;
schedule =
deduceMMASchedule(problem, intrinsics, seeds, maxSharedMemoryBytes,
targetSubgroupSize, transposedLhs, transposedRhs,
/*canUpcastAcc=*/false, mustBeAligned);
if (!schedule) {
// Then try again by allowing upcasting accumulator.
schedule =
deduceMMASchedule(problem, intrinsics, seeds, maxSharedMemoryBytes,
targetSubgroupSize, transposedLhs, transposedRhs,
/*canUpcastAcc=*/true, mustBeAligned);
}
}
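The transposedLhs/transposedRhs inference above keys off the innermost (last) result of each operand's indexing map: for a standard matmul the LHS's contiguous dimension is K and the RHS's is N, so when the last map result is some other loop dimension the operand is treated as transposed. Below is a plain-integer illustration of that check, modeling an indexing map as just the positions of its results as a stand-in for mlir::AffineMap; it is a sketch, not IREE code.

```cpp
// Plain-integer illustration of the transposed-operand inference, modeling an
// indexing map as just the loop-dimension positions of its results (a
// stand-in for mlir::AffineMap; this is not IREE code).
#include <cstdint>
#include <iostream>
#include <vector>

// An operand is treated as transposed when its innermost (last) tensor
// dimension does not map to the expected contraction dimension.
static bool isTransposed(const std::vector<int64_t> &mapResults,
                         int64_t expectedInnerDim) {
  return mapResults.back() != expectedInnerDim;
}

int main() {
  // Loop dimensions for a plain matmul: m = 0, n = 1, k = 2.
  const int64_t m = 0, n = 1, k = 2;
  // linalg.matmul: A is MxK -> (m, k), B is KxN -> (k, n).
  std::vector<int64_t> lhsMap = {m, k}, rhsMap = {k, n};
  std::cout << isTransposed(lhsMap, k) << " " << isTransposed(rhsMap, n)
            << "\n"; // 0 0: neither operand is transposed.
  // linalg.matmul_transpose_b: B is NxK -> (n, k), so K is the contiguous dim.
  std::vector<int64_t> rhsMapTransposed = {n, k};
  std::cout << isTransposed(rhsMapTransposed, n) << "\n"; // 1: RHS transposed.
  return 0;
}
```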
@@ -1,7 +1,7 @@
// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx940 --iree-codegen-llvmgpu-use-vector-distribution --iree-llvmgpu-enable-prefetch=true \
// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target)))))" %s | FileCheck %s
// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx1100 --iree-codegen-llvmgpu-use-vector-distribution --iree-llvmgpu-enable-prefetch=true \
// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target)))))" %s | FileCheck %s --check-prefix=CDNA3
// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target)))))" %s | FileCheck %s --check-prefix=RDNA3

// TODO: This test is still using the legacy LLVMGPU kernel config. This needs
// to be migrated to the rocdl heuristics, but for now is just physically
@@ -318,22 +318,22 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
}
}

// CDNA3: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [64, 2, 1] subgroup_size = 32
// CDNA3-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>,
// CDNA3-SAME: subgroup_m_count = 2, subgroup_n_count = 2>
// RDNA3: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [64, 2, 1] subgroup_size = 32
// RDNA3-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>,
// RDNA3-SAME: subgroup_m_count = 2, subgroup_n_count = 2>


// CDNA3-LABEL: func.func @matmul_256x256x256_f16_f32
// CDNA3-SAME: translation_info = #[[$TRANSLATION]]
// CDNA3: scf.for {{.*}} = %c0 to %c256 step %c128 iter_args({{.*}}) -> (vector<2x2x8x1x1x1xf32>)
// RDNA3-LABEL: func.func @matmul_256x256x256_f16_f32
// RDNA3-SAME: translation_info = #[[$TRANSLATION]]
// RDNA3: scf.for {{.*}} = %c0 to %c256 step %c128 iter_args({{.*}}) -> (vector<2x2x8x1x1x1xf32>)
// Each subgroup handles 2 * 2 tiles, and for each tile we accumulate 8 times
// along the K dimension. So in total 32 wmma ops.
// CDNA3-COUNT-32: amdgpu.wmma {{.*}} : vector<16xf16>, vector<16xf16>, vector<8xf32>
// CDNA3: scf.yield %{{.+}} : vector<2x2x8x1x1x1xf32>
// RDNA3-COUNT-32: amdgpu.wmma {{.*}} : vector<16xf16>, vector<16xf16>, vector<8xf32>
// RDNA3: scf.yield %{{.+}} : vector<2x2x8x1x1x1xf32>
// Since each subgroup handles 2 * 2 tiles, and for each tile each lane holds 4 values,
// we will have 32 writes. We cannot do contiguous writes since the output columns have
// interleaved thread ids.
// CDNA3-COUNT-32: vector.transfer_write {{.+}} {in_bounds = [true, true]} : vector<1x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
// RDNA3-COUNT-32: vector.transfer_write {{.+}} {in_bounds = [true, true]} : vector<1x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>>

// -----

@@ -408,3 +408,68 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
// CHECK: amdgpu.mfma {{.*}} {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
// CHECK: %[[OUT_GLOBAL_SUB:.+]] = memref.subview %[[OUT_GLOBAL]]
// CHECK: vector.transfer_write %{{.+}}, %[[OUT_GLOBAL_SUB]]

// -----

// This test ensures that the contraction schedules we generate not only work for the contraction itself,
// but are also compatible with the transfer_read layout anchors.
// Currently the transfer_read layout anchors expect WorkgroupSize % (WgTileSize / numelPerThread) == 0;
// this test ensures that this constraint is satisfied.

// NOTE: This test is not exhaustive of all the ways the above condition can break,
// but rather is an example of a matmul shape from a model that broke our compilation heuristic.

#pipeline_layout = #hal.pipeline.layout<
push_constants = 3,
sets = [
<0, bindings = [
<0, storage_buffer, ReadOnly>,
<1, storage_buffer>
]>
]>
hal.executable public @contract_schedule_considering_read_layout {
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
hal.executable.export public @contract_schedule_considering_read_layout ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @contract_schedule_considering_read_layout() {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%3) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x160x1536xf16>>
%7 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%4) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1536x1536xf16>>
%8 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%5) : !flow.dispatch.tensor<writeonly:tensor<2x160x1536xf16>>
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0], sizes = [2, 160, 1536], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x160x1536xf16>> -> tensor<2x160x1536xf16>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0], sizes = [2, 1536, 1536], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1536x1536xf16>> -> tensor<2x1536x1536xf16>
%11 = tensor.empty() : tensor<2x160x1536xf16>
%12 = linalg.fill ins(%cst : f16) outs(%11 : tensor<2x160x1536xf16>) -> tensor<2x160x1536xf16>
%13 = linalg.batch_matmul ins(%9, %10 : tensor<2x160x1536xf16>, tensor<2x1536x1536xf16>) outs(%12 : tensor<2x160x1536xf16>) -> tensor<2x160x1536xf16>
flow.dispatch.tensor.store %13, %8, offsets = [0, 0, 0], sizes = [2, 160, 1536], strides = [1, 1, 1] : tensor<2x160x1536xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x160x1536xf16>>
return
}
}
}
}
// Basic pipeline test to make sure it generates the instructions we expect.

// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64
// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
// CHECK-SAME: subgroup_m_count = 1, subgroup_n_count = 4>

// CHECK-LABEL: func.func @contract_schedule_considering_read_layout()
// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK-DAG: %[[RHS_SHARED:.+]] = memref.alloc() : memref<128x132xf16, #gpu.address_space<workgroup>>
// CHECK-DAG: %[[RHS_SHARED_SUB:.+]] = memref.subview %[[RHS_SHARED]][0, 0] [128, 128] [1, 1]
// CHECK-DAG: %[[LHS_SHARED:.+]] = memref.alloc() : memref<16x132xf16, #gpu.address_space<workgroup>>
// CHECK-DAG: %[[LHS_SHARED_SUB:.+]] = memref.subview %[[LHS_SHARED]][0, 0] [16, 128] [1, 1]
// CHECK: scf.for {{.*}} = %c0 to %c11 step %c1 iter_args(%[[ARG:.+]] = {{.*}}) -> (vector<1x2x1x1x4x1xf16>)
// CHECK-COUNT-16: amdgpu.mfma {{.*}} {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
// CHECK: scf.yield
// CHECK-COUNT-16: amdgpu.mfma
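Back-of-the-envelope numbers for why this shape now passes the constraint, assuming the workgroup tile sizes implied by the shared-memory allocations and the mma_schedule in the CHECK lines above (LHS tile 16x128, RHS tile 128x128, 4 subgroups of 64 threads, f16 operands); this is a rough reading of the test, not generated output.

```cpp
// Back-of-the-envelope check for the batch_matmul test above (f16 operands),
// using the workgroup tile sizes implied by the shared-memory allocations:
// LHS tile 16x128 (MxK), RHS tile 128x128 (KxN), 4 subgroups of 64 threads.
#include <cstdint>
#include <iostream>

int main() {
  const int64_t elemsPerThread = 128 / 16;  // 128-bit loads of f16 -> 8
  const int64_t wgThreads = 4 * 64;         // subgroup_n_count * subgroup size
  const int64_t innerLhsDim = 128;          // contiguous K extent of LHS tile
  const int64_t innerRhsDim = 128;          // contiguous N extent of RHS tile
  auto distributable = [&](int64_t inner) {
    int64_t numLoads = inner / elemsPerThread; // 16 vector loads per row
    return numLoads % wgThreads == 0 || wgThreads % numLoads == 0;
  };
  // Both print 1: 256 threads evenly cover the 16 loads of each tile row.
  std::cout << distributable(innerLhsDim) << " " << distributable(innerRhsDim)
            << "\n";
  return 0;
}
```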
24 changes: 17 additions & 7 deletions compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp
@@ -913,13 +913,6 @@ LogicalResult setCooperativeMatrixConfig(
int64_t sharedMemoryLimitInBytes =
targetEnv.getResourceLimits().getMaxComputeSharedMemorySize();

FailureOr<GPUMMASchedule> schedule =
deduceMMASchedule(problem, intrinsics, seeds, sharedMemoryLimitInBytes);
if (failed(schedule))
return failure();

auto pipeline = CodeGenPipeline::SPIRVCooperativeMatrixVectorize;

std::optional<int64_t> subgroupSize = limits.getSubgroupSize();
// AMD RDNA architectures support both wave32 and wave64 modes. Prefer to use
// wave32 mode for better performance.
@@ -928,6 +921,23 @@
subgroupSize = *minSize;
}

// Infer if lhs or rhs is transposed to help generate better schedule.
SmallVector<AffineMap> maps = op.getIndexingMapsArray();
bool transposedLhs =
kIndex !=
llvm::cast<AffineDimExpr>(maps[0].getResults().back()).getPosition();
bool transposedRhs =
nIndex !=
llvm::cast<AffineDimExpr>(maps[1].getResults().back()).getPosition();

FailureOr<GPUMMASchedule> schedule =
deduceMMASchedule(problem, intrinsics, seeds, sharedMemoryLimitInBytes,
*subgroupSize, transposedLhs, transposedRhs);
if (failed(schedule))
return failure();

auto pipeline = CodeGenPipeline::SPIRVCooperativeMatrixVectorize;

std::array<int64_t, 3> workgroupSize{schedule->nWarpCount * *subgroupSize,
schedule->mWarpCount, 1};
