[GPUHeuristic] Modify schedule generator to consider distribution of transfer_read layout anchor #17636

Merged 7 commits on Jun 12, 2024
35 changes: 25 additions & 10 deletions compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp
@@ -30,7 +30,8 @@ static int64_t calculateSharedMemoryUsedInBytes(const GPUMMASchedule &schedule,
}

bool isValidSchedule(const GPUMatmulShapeType &problem,
const GPUMMASchedule &schedule, const bool mustBeAligned) {
const GPUMMASchedule &schedule, const bool mustBeAligned,
const int64_t subgroupSize) {
auto alignedMSize =
mustBeAligned
? problem.mSize
@@ -48,20 +49,34 @@ bool isValidSchedule(const GPUMatmulShapeType &problem,
bool isValidN = (alignedNSize % (schedule.nSize * schedule.nTileCount *
schedule.nWarpCount)) == 0;
bool isValidK = (alignedKSize % (schedule.kSize * schedule.kTileCount)) == 0;
return isValidN && isValidM && isValidK;

// Constraint to ensure wgTileSize is distributable by wgSize,
// such that we can distribute it to its corresponding vector.transfer_read.
// TODO: Add constraints on the M-dim and K-dim if this pops up in another
// model/example. Currently we are being conservative to preserve performance
// as much as we can.
const int64_t kMaxVectorLoadBitWidth = 128;
int64_t elemsPerThread =
kMaxVectorLoadBitWidth / problem.bType.getIntOrFloatBitWidth();
int64_t wgThreads = schedule.mWarpCount * schedule.nWarpCount * subgroupSize;
int64_t nTileSize =
schedule.nSize * schedule.nTileCount * schedule.nWarpCount;
bool isDistributableN = (nTileSize / elemsPerThread) % wgThreads == 0 ||
wgThreads % (nTileSize / elemsPerThread) == 0;
Contributor:
You commented that this will work for matmul_transpose_b but not for matmul because it depends on which dimension is innermost. Can we add identifiers for which dimension is innermost to GPUMatmulShapeType to inform this heuristic about when to check for this?

Collaborator (Author):
Yeah, we can do that; let me think of a nice way to shuttle this data around. :)

Still no guarantees (though unlikely) that this information won't change somewhere down the line, but definitely better than no heuristics.
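As a rough editorial illustration of the idea discussed above (not part of this PR): GPUMatmulShapeType could carry a marker for which problem dimension is innermost on each operand, and the distributability check could be gated on it. The names MatmulInnerDim, lhsInnerDim, and rhsInnerDim below are assumptions for illustration only, not the actual IREE API.

#include <cstdint>

// Hypothetical sketch; all names here are illustrative assumptions.
enum class MatmulInnerDim { M, N, K };

struct GPUMatmulShapeTypeSketch {
  int64_t mSize, nSize, kSize;
  // Innermost (contiguous) dimension of each operand's transfer_read.
  MatmulInnerDim lhsInnerDim;
  MatmulInnerDim rhsInnerDim;
};

// The heuristic could then decide, based on which dimension is innermost for
// each operand, whether to apply the N-distributability check or an analogous
// check on the M/K dimensions.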


return isValidN && isValidM && isValidK && isDistributableN;
}

FailureOr<GPUMMASchedule>
fitScheduleInSharedMemory(const GPUMatmulShapeType &problem,
ArrayRef<GPUMatmulShapeType> intrinsics,
GPUMMASchedule schedule,
int64_t sharedMemLimitInBytes, bool mustBeAligned) {
FailureOr<GPUMMASchedule> fitScheduleInSharedMemory(
const GPUMatmulShapeType &problem, ArrayRef<GPUMatmulShapeType> intrinsics,
GPUMMASchedule schedule, int64_t sharedMemLimitInBytes,
int64_t subgroupSize, bool mustBeAligned) {
int64_t lhsBitwidth =
intrinsics[schedule.index].aType.getIntOrFloatBitWidth();
int64_t rhsBitwidth =
intrinsics[schedule.index].bType.getIntOrFloatBitWidth();

while (!isValidSchedule(problem, schedule, mustBeAligned) ||
while (!isValidSchedule(problem, schedule, mustBeAligned, subgroupSize) ||
calculateSharedMemoryUsedInBytes(schedule, lhsBitwidth, rhsBitwidth) >
sharedMemLimitInBytes) {
LLVM_DEBUG({
@@ -113,7 +128,7 @@ fitScheduleInSharedMemory(const GPUMatmulShapeType &problem,
FailureOr<GPUMMASchedule> deduceMMASchedule(
const GPUMatmulShapeType &problem, ArrayRef<GPUMatmulShapeType> intrinsics,
const GPUMMAHeuristicSeeds &seeds, int64_t sharedMemLimitInBytes,
bool canUpcastAcc, bool mustBeAligned) {
int64_t subgroupSize, bool canUpcastAcc, bool mustBeAligned) {
for (auto [index, intrinsic] : llvm::enumerate(intrinsics)) {
if (problem.aType != intrinsic.aType || problem.bType != intrinsic.bType) {
continue; // Cannot use this intrinsic for mismatched types
Expand Down Expand Up @@ -219,7 +234,7 @@ FailureOr<GPUMMASchedule> deduceMMASchedule(
GPUMMASchedule{index, intrinsic.mSize, intrinsic.nSize, intrinsic.kSize,
mWarpCount, nWarpCount, mTileCount, nTileCount,
kTileCount},
sharedMemLimitInBytes, mustBeAligned);
sharedMemLimitInBytes, subgroupSize, mustBeAligned);
}
return failure();
}
@@ -49,6 +49,6 @@ struct GPUMMASchedule {
FailureOr<GPUMMASchedule> deduceMMASchedule(
const GPUMatmulShapeType &problem, ArrayRef<GPUMatmulShapeType> intrinsics,
const GPUMMAHeuristicSeeds &seeds, int64_t sharedMemLimitInBytes,
bool canUpcastAcc = false, bool mustBeAligned = true);
int64_t subgroupSize, bool canUpcastAcc = false, bool mustBeAligned = true);

} // namespace mlir::iree_compiler
32 changes: 16 additions & 16 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -299,13 +299,13 @@ setConvolutionVectorDistributionConfig(IREE::GPU::TargetAttr target,
int64_t maxSharedMemoryBytes = target.getWgp().getMaxWorkgroupMemoryBytes();

// First try to find a schedule with an exactly matching intrinsic.
FailureOr<GPUMMASchedule> schedule =
deduceMMASchedule(problem, intrinsics, seeds, maxSharedMemoryBytes);
FailureOr<GPUMMASchedule> schedule = deduceMMASchedule(
problem, intrinsics, seeds, maxSharedMemoryBytes, targetSubgroupSize);
if (failed(schedule)) {
// Then try again by allowing upcasting accumulator.
schedule =
deduceMMASchedule(problem, intrinsics, seeds, maxSharedMemoryBytes,
/*canUpcastAcc=*/true);
schedule = deduceMMASchedule(problem, intrinsics, seeds,
maxSharedMemoryBytes, targetSubgroupSize,
/*canUpcastAcc=*/true);
}
if (failed(schedule)) {
return failure();
@@ -467,13 +467,13 @@ setMatmulVectorDistributionConfig(IREE::GPU::TargetAttr target,

// First try to find a schedule with an exactly matching intrinsic.
auto pipeline = CodeGenPipeline::LLVMGPUVectorDistribute;
std::optional<GPUMMASchedule> schedule =
deduceMMASchedule(problem, intrinsics, seeds, maxSharedMemoryBytes);
std::optional<GPUMMASchedule> schedule = deduceMMASchedule(
problem, intrinsics, seeds, maxSharedMemoryBytes, targetSubgroupSize);
if (!schedule) {
// Then try again by allowing upcasting accumulator.
schedule =
deduceMMASchedule(problem, intrinsics, seeds, maxSharedMemoryBytes,
/*canUpcastAcc=*/true);
schedule = deduceMMASchedule(problem, intrinsics, seeds,
maxSharedMemoryBytes, targetSubgroupSize,
/*canUpcastAcc=*/true);
}

// Only batch_matmul is supported in the LLVMGPUPadAndVectorDistribute
@@ -483,14 +483,14 @@ setMatmulVectorDistributionConfig(IREE::GPU::TargetAttr target,
LDBG("Matmul Pad and Vector Distribute");
pipeline = CodeGenPipeline::LLVMGPUPadAndVectorDistribute;
bool mustBeAligned = false;
schedule =
deduceMMASchedule(problem, intrinsics, seeds, maxSharedMemoryBytes,
/*canUpcastAcc=*/false, mustBeAligned);
schedule = deduceMMASchedule(problem, intrinsics, seeds,
maxSharedMemoryBytes, targetSubgroupSize,
/*canUpcastAcc=*/false, mustBeAligned);
if (!schedule) {
// Then try again by allowing upcasting accumulator.
schedule =
deduceMMASchedule(problem, intrinsics, seeds, maxSharedMemoryBytes,
/*canUpcastAcc=*/true, mustBeAligned);
schedule = deduceMMASchedule(problem, intrinsics, seeds,
maxSharedMemoryBytes, targetSubgroupSize,
/*canUpcastAcc=*/true, mustBeAligned);
}
}
if (!schedule) {
@@ -1,7 +1,7 @@
// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx940 --iree-codegen-llvmgpu-use-vector-distribution --iree-llvmgpu-enable-prefetch=true \
// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target)))))" %s | FileCheck %s
// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx1100 --iree-codegen-llvmgpu-use-vector-distribution --iree-llvmgpu-enable-prefetch=true \
// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target)))))" %s | FileCheck %s --check-prefix=CDNA3
// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target)))))" %s | FileCheck %s --check-prefix=RDNA3

// TODO: This test is still using the legacy LLVMGPU kernel config. This needs
// to be migrated to the rocdl heuristics, but for now is just physically
@@ -318,22 +318,22 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
}
}

// CDNA3: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [64, 2, 1] subgroup_size = 32
// CDNA3-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>,
// CDNA3-SAME: subgroup_m_count = 2, subgroup_n_count = 2>
// RDNA3: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [64, 2, 1] subgroup_size = 32
// RDNA3-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>,
// RDNA3-SAME: subgroup_m_count = 2, subgroup_n_count = 2>


// CDNA3-LABEL: func.func @matmul_256x256x256_f16_f32
// CDNA3-SAME: translation_info = #[[$TRANSLATION]]
// CDNA3: scf.for {{.*}} = %c0 to %c256 step %c128 iter_args({{.*}}) -> (vector<2x2x8x1x1x1xf32>)
// RDNA3-LABEL: func.func @matmul_256x256x256_f16_f32
// RDNA3-SAME: translation_info = #[[$TRANSLATION]]
// RDNA3: scf.for {{.*}} = %c0 to %c256 step %c128 iter_args({{.*}}) -> (vector<2x2x8x1x1x1xf32>)
// Each subgroup handles 2 * 2 tiles, and for each tile we accumulate 8 times
// along the K dimension. So in total 32 wmma ops.
// CDNA3-COUNT-32: amdgpu.wmma {{.*}} : vector<16xf16>, vector<16xf16>, vector<8xf32>
// CDNA3: scf.yield %{{.+}} : vector<2x2x8x1x1x1xf32>
// RDNA3-COUNT-32: amdgpu.wmma {{.*}} : vector<16xf16>, vector<16xf16>, vector<8xf32>
// RDNA3: scf.yield %{{.+}} : vector<2x2x8x1x1x1xf32>
// Since each subgroup handles 2 * 2 tiles, and for each tile each lane holds 4 values,
// we will have 32 writes. We cannot do contiguous writes since the output columns have
// interleaved thread ids.
// CDNA3-COUNT-32: vector.transfer_write {{.+}} {in_bounds = [true, true]} : vector<1x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>>
// RDNA3-COUNT-32: vector.transfer_write {{.+}} {in_bounds = [true, true]} : vector<1x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>>

// -----

@@ -408,3 +408,70 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
// CHECK: amdgpu.mfma {{.*}} {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
// CHECK: %[[OUT_GLOBAL_SUB:.+]] = memref.subview %[[OUT_GLOBAL]]
// CHECK: vector.transfer_write %{{.+}}, %[[OUT_GLOBAL_SUB]]

// -----

// This test ensures that the contraction schedules we generate not only work for the
// contraction itself, but are also compatible with the transfer_read layout anchors.
// Currently the transfer_read layout anchors expect WorkgroupSize % (WgTileSize / numelPerThread) == 0;
// this test ensures that this constraint is satisfied (see the worked example below).

// NOTE: This test is not exhaustive of all the possible ways the above condition can break;
// it is rather an example of a matmul shape from a model that broke our compilation heuristic.
// The current fix is conservative; if similar issues pop up,
// we should add more constraints to our heuristic.
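
// Editorial worked example (the numbers below are inferred from the schedule this test
// checks, and the arithmetic is only illustrative): with f16 operands,
// numelPerThread = 128 / 16 = 8 elements per 128-bit load; with subgroup_m_count = 1,
// subgroup_n_count = 4, and a subgroup size of 64, WorkgroupSize = 1 * 4 * 64 = 256;
// with an N workgroup tile of 16 * 2 * 4 = 128, WgTileSize / numelPerThread = 128 / 8 = 16,
// and 256 % 16 == 0, so the N tile can be evenly distributed to the vector.transfer_read.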

#pipeline_layout = #hal.pipeline.layout<
push_constants = 3,
sets = [
<0, bindings = [
<0, storage_buffer, ReadOnly>,
<1, storage_buffer>
]>
]>
hal.executable public @contract_schedule_considering_read_layout {
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
hal.executable.export public @contract_schedule_considering_read_layout ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @contract_schedule_considering_read_layout() {
%cst = arith.constant 0.000000e+00 : f16
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = arith.index_castui %0 : i32 to index
%4 = arith.index_castui %1 : i32 to index
%5 = arith.index_castui %2 : i32 to index
%6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%3) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x160x1536xf16>>
%7 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%4) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1536x1536xf16>>
%8 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%5) : !flow.dispatch.tensor<writeonly:tensor<2x160x1536xf16>>
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0], sizes = [2, 160, 1536], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x160x1536xf16>> -> tensor<2x160x1536xf16>
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0], sizes = [2, 1536, 1536], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1536x1536xf16>> -> tensor<2x1536x1536xf16>
%11 = tensor.empty() : tensor<2x160x1536xf16>
%12 = linalg.fill ins(%cst : f16) outs(%11 : tensor<2x160x1536xf16>) -> tensor<2x160x1536xf16>
%13 = linalg.batch_matmul ins(%9, %10 : tensor<2x160x1536xf16>, tensor<2x1536x1536xf16>) outs(%12 : tensor<2x160x1536xf16>) -> tensor<2x160x1536xf16>
flow.dispatch.tensor.store %13, %8, offsets = [0, 0, 0], sizes = [2, 160, 1536], strides = [1, 1, 1] : tensor<2x160x1536xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x160x1536xf16>>
return
}
}
}
}
// Basic pipeline test to make sure it generates the instructions we expect.

// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64
// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
// CHECK-SAME: subgroup_m_count = 1, subgroup_n_count = 4>

// CHECK-LABEL: func.func @contract_schedule_considering_read_layout()
// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK-DAG: %[[RHS_SHARED:.+]] = memref.alloc() : memref<128x132xf16, #gpu.address_space<workgroup>>
// CHECK-DAG: %[[RHS_SHARED_SUB:.+]] = memref.subview %[[RHS_SHARED]][0, 0] [128, 128] [1, 1]
// CHECK-DAG: %[[LHS_SHARED:.+]] = memref.alloc() : memref<16x132xf16, #gpu.address_space<workgroup>>
// CHECK-DAG: %[[LHS_SHARED_SUB:.+]] = memref.subview %[[LHS_SHARED]][0, 0] [16, 128] [1, 1]
// CHECK: scf.for {{.*}} = %c0 to %c11 step %c1 iter_args(%[[ARG:.+]] = {{.*}}) -> (vector<1x2x1x1x4x1xf16>)
// CHECK-COUNT-16: amdgpu.mfma {{.*}} {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
// CHECK: scf.yield
// CHECK-COUNT-16: amdgpu.mfma
14 changes: 7 additions & 7 deletions compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp
@@ -913,13 +913,6 @@ LogicalResult setCooperativeMatrixConfig(
int64_t sharedMemoryLimitInBytes =
targetEnv.getResourceLimits().getMaxComputeSharedMemorySize();

FailureOr<GPUMMASchedule> schedule =
deduceMMASchedule(problem, intrinsics, seeds, sharedMemoryLimitInBytes);
if (failed(schedule))
return failure();

auto pipeline = CodeGenPipeline::SPIRVCooperativeMatrixVectorize;

std::optional<int64_t> subgroupSize = limits.getSubgroupSize();
// AMD RDNA architectures supports both wave32 and wave64 modes. Prefer to use
// wave32 mode for better performance.
@@ -928,6 +921,13 @@
subgroupSize = *minSize;
}

FailureOr<GPUMMASchedule> schedule = deduceMMASchedule(
problem, intrinsics, seeds, sharedMemoryLimitInBytes, *subgroupSize);
if (failed(schedule))
return failure();

auto pipeline = CodeGenPipeline::SPIRVCooperativeMatrixVectorize;

std::array<int64_t, 3> workgroupSize{schedule->nWarpCount * *subgroupSize,
schedule->mWarpCount, 1};
