Re-land "[Dispatch] extend CollapseDimensionsPass to more…" (#19461)
Re-lands #19326 after adding a few more matmul specs that were missed. I left
`@match_mmt_8192x640x2560` commented out because its config caused a slight
performance regression, though the overall time spent there was very low.

This reverts commit 7177c29.

---------

Signed-off-by: Ian Wood <[email protected]>
IanWood1 authored Dec 11, 2024
1 parent ea9176a commit e9e8b9f
Showing 4 changed files with 291 additions and 38 deletions.
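
For context, a rough sketch of the rewrite that CollapseDimensionsPass performs (the IR and shapes below are illustrative assumptions, not taken from this change): contiguous parallel dimensions of a linalg op inside a dispatch are folded together, so a 2x1024 batch/M pair becomes a single 2048 M dimension and the op becomes one of the plain transposed-B matmuls that the new @match_mmt_* specs target.

  // Illustrative MLIR only: fold a leading batch dimension into M.
  %lhs_collapsed = tensor.collapse_shape %lhs [[0, 1], [2]]
      : tensor<2x1024x1280xi8> into tensor<2048x1280xi8>
  %out_collapsed = tensor.collapse_shape %out [[0, 1], [2]]
      : tensor<2x1024x10240xi32> into tensor<2048x10240xi32>
  // The collapsed op is an i8 x i8 -> i32 matmul of shape 2048x10240x1280, the
  // form pinned down by @match_mmt_2048x10240x1280 below; without specs for the
  // collapsed shapes those dispatches would fall back to default configurations.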
12 changes: 6 additions & 6 deletions .github/workflows/pkgci_regression_test.yml
@@ -220,7 +220,7 @@ jobs:
--goldentime-rocm-unet-ms 419.0 \
--goldentime-rocm-clip-ms 18.5 \
--goldentime-rocm-vae-ms 337.0 \
- --goldendispatch-rocm-unet 1602 \
+ --goldendispatch-rocm-unet 1598 \
--goldendispatch-rocm-clip 1139 \
--goldendispatch-rocm-vae 246 \
--goldensize-rocm-unet-bytes 2280000 \
@@ -242,17 +242,17 @@ jobs:
--goldentime-rocm-unet-ms 80.0 \
--goldentime-rocm-clip-ms 15.5 \
--goldentime-rocm-vae-ms 80.0 \
- --goldendispatch-rocm-unet 1602 \
+ --goldendispatch-rocm-unet 1598 \
--goldendispatch-rocm-clip 1139 \
--goldendispatch-rocm-vae 246 \
--goldensize-rocm-unet-bytes 2270000 \
--goldensize-rocm-clip-bytes 860000 \
--goldensize-rocm-vae-bytes 840000 \
- --goldentime-rocm-punet-int8-fp16-ms 53 \
- --goldendispatch-rocm-punet-int8-fp16 1424 \
+ --goldentime-rocm-punet-int8-fp16-ms 51 \
+ --goldendispatch-rocm-punet-int8-fp16 1416 \
--goldensize-rocm-punet-int8-fp16-bytes 2560000 \
- --goldentime-rocm-punet-int8-fp8-ms 53 \
- --goldendispatch-rocm-punet-int8-fp8 1704 \
+ --goldentime-rocm-punet-int8-fp8-ms 51 \
+ --goldendispatch-rocm-punet-int8-fp8 1696 \
--goldensize-rocm-punet-int8-fp8-bytes 2800000 \
--rocm-chip gfx942 \
--log-cli-level=info \
@@ -101,6 +101,142 @@ transform.named_sequence @match_attention_f8(%attention: !transform.any_op {tran
// Matmul tuning
//===----------------------------------------------------------------------===//

transform.named_sequence @match_mmt_i8_i8_i32(%root: !transform.any_op {transform.readonly}) -> (!transform.any_op) {
transform.match.operation_name %root ["linalg.generic"] : !transform.any_op
// transform.print %root {name = "Generic"} : !transform.any_op
%ins, %outs = transform.iree.match.cast_compatible_dag_from_root %root {
^bb0(%lhs: tensor<?x?xi8>, %rhs: tensor<?x?xi8>, %out: tensor<?x?xi32>):
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>,
affine_map<(d0, d1, d2) -> (d1, d2)>,
affine_map<(d0, d1, d2) -> (d0, d1)>],
iterator_types = ["parallel", "parallel", "reduction"]}
ins(%lhs, %rhs : tensor<?x?xi8>, tensor<?x?xi8>) outs(%out : tensor<?x?xi32>) {
^bb0(%in: i8, %in_0: i8, %acc: i32):
%18 = arith.extsi %in : i8 to i32
%19 = arith.extsi %in_0 : i8 to i32
%20 = arith.muli %18, %19 : i32
%21 = arith.addi %acc, %20 : i32
linalg.yield %21 : i32
} -> tensor<?x?xi32>
} : (!transform.any_op) -> (!transform.any_value, !transform.any_value)
transform.yield %root : !transform.any_op
}
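// Note (illustrative, not part of the original spec): the tensor<?x?xi8> operands
// above are cast-compatible with static shapes, so this matcher fires on any
// transposed-B i8 matmul written as a linalg.generic with indexing maps
//   (d0, d1, d2) -> (d0, d2), (d1, d2), (d0, d1)
// and an extsi/extsi -> muli -> addi accumulate body, e.g. one with
//   ins(... : tensor<2048x1280xi8>, tensor<10240x1280xi8>)
//   outs(... : tensor<2048x10240xi32>).
// The per-shape sequences below then constrain the exact operand types before
// attaching a compilation_info.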

transform.named_sequence @match_mmt_2048x10240x1280(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) {
%mmt = transform.include @match_mmt_i8_i8_i32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op
%lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value
%rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value
transform.iree.match.cast_compatible_type %lhs = tensor<2048x1280xi8> : !transform.any_value
transform.iree.match.cast_compatible_type %rhs = tensor<10240x1280xi8> : !transform.any_value
%config = transform.param.constant #iree_codegen.compilation_info<
lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1],
mma_kind = #iree_gpu.mma_layout<MFMA_I32_16x16x32_I8>,
subgroup_m_count = 4, subgroup_n_count = 2,
reduction = [0, 0, 128],
workgroup = [128, 320, 0]}>,
translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
workgroup_size = [128, 4, 1] subgroup_size = 64,
{gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true>
}>> -> !transform.any_param
transform.yield %matmul, %config : !transform.any_op, !transform.any_param
}

transform.named_sequence @match_mmt_2048x1280x5120(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) {
%mmt = transform.include @match_mmt_i8_i8_i32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op
%lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value
%rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value
transform.iree.match.cast_compatible_type %lhs = tensor<2048x5120xi8> : !transform.any_value
transform.iree.match.cast_compatible_type %rhs = tensor<1280x5120xi8> : !transform.any_value
%config = transform.param.constant #iree_codegen.compilation_info<
lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1],
mma_kind = #iree_gpu.mma_layout<MFMA_I32_16x16x32_I8>,
subgroup_m_count = 4, subgroup_n_count = 1,
reduction = [0, 0, 256],
workgroup = [128, 80, 0]}>,
translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
workgroup_size = [64, 4, 1] subgroup_size = 64,
{gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true>
}>> -> !transform.any_param
transform.yield %matmul, %config : !transform.any_op, !transform.any_param
}

transform.named_sequence @match_mmt_2048x1280x1280(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) {
%mmt = transform.include @match_mmt_i8_i8_i32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op
%lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value
%rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value
transform.iree.match.cast_compatible_type %lhs = tensor<2048x1280xi8> : !transform.any_value
transform.iree.match.cast_compatible_type %rhs = tensor<1280x1280xi8> : !transform.any_value
%config = transform.param.constant #iree_codegen.compilation_info<
lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1],
mma_kind = #iree_gpu.mma_layout<MFMA_I32_16x16x32_I8>,
subgroup_m_count = 2, subgroup_n_count = 2,
reduction = [0, 0, 128],
workgroup = [64, 160, 0]}>,
translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
workgroup_size = [256, 1, 1] subgroup_size = 64,
{gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true,
reorder_workgroups_strategy = <Transpose>>}>
> -> !transform.any_param
transform.yield %matmul, %config : !transform.any_op, !transform.any_param
}

transform.named_sequence @match_mmt_8192x640x640(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) {
%mmt = transform.include @match_mmt_i8_i8_i32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op
%lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value
%rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value
transform.iree.match.cast_compatible_type %lhs = tensor<8192x640xi8> : !transform.any_value
transform.iree.match.cast_compatible_type %rhs = tensor<640x640xi8> : !transform.any_value
%config = transform.param.constant #iree_codegen.compilation_info<
lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1],
mma_kind = #iree_gpu.mma_layout<MFMA_I32_16x16x32_I8>,
subgroup_m_count = 8, subgroup_n_count = 1,
reduction = [0, 0, 64],
workgroup = [256, 64, 0]}>,
translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
workgroup_size = [512, 1, 1] subgroup_size = 64,
{gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true>}>
> -> !transform.any_param
transform.yield %matmul, %config : !transform.any_op, !transform.any_param
}

transform.named_sequence @match_mmt_8192x5120x640(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) {
%mmt = transform.include @match_mmt_i8_i8_i32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op
%lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value
%rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value
transform.iree.match.cast_compatible_type %lhs = tensor<8192x640xi8> : !transform.any_value
transform.iree.match.cast_compatible_type %rhs = tensor<5120x640xi8> : !transform.any_value
%config = transform.param.constant #iree_codegen.compilation_info<
lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1],
mma_kind = #iree_gpu.mma_layout<MFMA_I32_32x32x16_I8>,
subgroup_m_count = 2, subgroup_n_count = 4,
reduction = [0, 0, 64],
workgroup = [256, 128, 0]}>,
translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
workgroup_size = [512, 1, 1] subgroup_size = 64,
{gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true>}>
> -> !transform.any_param
transform.yield %matmul, %config : !transform.any_op, !transform.any_param
}

transform.named_sequence @match_mmt_8192x640x2560 (%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) {
%mmt = transform.include @match_mmt_i8_i8_i32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op
%lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value
%rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value
transform.iree.match.cast_compatible_type %lhs = tensor<8192x2560xi8> : !transform.any_value
transform.iree.match.cast_compatible_type %rhs = tensor<640x2560xi8> : !transform.any_value
%config = transform.param.constant #iree_codegen.compilation_info<
lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1],
mma_kind = #iree_gpu.mma_layout<MFMA_I32_16x16x32_I8>,
subgroup_m_count = 8, subgroup_n_count = 1,
reduction = [0, 0, 64],
workgroup = [256, 64, 0]}>,
translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
workgroup_size = [512, 1, 1] subgroup_size = 64,
{gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true>}>
> -> !transform.any_param
transform.yield %matmul, %config : !transform.any_op, !transform.any_param
}

//===----------------------------------------------------------------------===//
// Convolution tuning
//===----------------------------------------------------------------------===//
@@ -152,6 +288,65 @@ transform.named_sequence @match_attention_f8(%attention: !transform.any_op {tran
transform.yield %generic, %config : !transform.any_op, !transform.any_param
}

transform.named_sequence @match_broadcast_rhs_mmt_Bx64x1280x2480(%generic: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) {
%mmt = transform.include @match_broadcast_rhs_mmt_i8_i8_i32 failures(propagate) (%generic) : (!transform.any_op) -> !transform.any_op
%lhs = transform.get_operand %generic[0] : (!transform.any_op) -> !transform.any_value
%rhs = transform.get_operand %generic[1] : (!transform.any_op) -> !transform.any_value
transform.iree.match.cast_compatible_type %lhs = tensor<?x64x2480xi8> : !transform.any_value
transform.iree.match.cast_compatible_type %rhs = tensor<1280x2480xi8> : !transform.any_value
%config = transform.param.constant #iree_codegen.compilation_info<
lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1],
mma_kind = #iree_gpu.mma_layout<MFMA_I32_16x16x32_I8>,
subgroup_m_count = 2, subgroup_n_count = 2,
reduction = [0, 0, 0, 128],
workgroup = [1, 64, 160, 0]}>,
translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
workgroup_size = [256, 1, 1] subgroup_size = 64,
{gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true,
reorder_workgroups_strategy = <Transpose>>
}>
> -> !transform.any_param
transform.yield %generic, %config : !transform.any_op, !transform.any_param
}
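// Reading of the Bx* variants (explanatory comment, not part of the original
// spec): the LHS and result keep a dynamic leading batch dimension while the RHS
// is shared across the batch, so the tile vectors gain a leading batch entry:
// workgroup = [1, 64, 160, 0] tiles the batch to 1 and reduction = [0, 0, 0, 128]
// tiles only the K loop. Otherwise these mirror the 2D @match_mmt_* sequences.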

transform.named_sequence @match_broadcast_rhs_mmt_Bx4960x640x640(%generic: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) {
%mmt = transform.include @match_broadcast_rhs_mmt_i8_i8_i32 failures(propagate) (%generic) : (!transform.any_op) -> !transform.any_op
%lhs = transform.get_operand %generic[0] : (!transform.any_op) -> !transform.any_value
%rhs = transform.get_operand %generic[1] : (!transform.any_op) -> !transform.any_value
transform.iree.match.cast_compatible_type %lhs = tensor<?x4960x640xi8> : !transform.any_value
transform.iree.match.cast_compatible_type %rhs = tensor<640x640xi8> : !transform.any_value
%config = transform.param.constant #iree_codegen.compilation_info<
lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1],
mma_kind = #iree_gpu.mma_layout<MFMA_I32_16x16x32_I8>,
subgroup_m_count = 8, subgroup_n_count = 1,
reduction = [0, 0, 0, 64],
workgroup = [1, 256, 64, 0]}>,
translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
workgroup_size = [512, 1, 1] subgroup_size = 64,
{gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true>}>
> -> !transform.any_param
transform.yield %generic, %config : !transform.any_op, !transform.any_param
}

transform.named_sequence @match_broadcast_rhs_mmt_Bx64x640x2480(%generic: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) {
%mmt = transform.include @match_broadcast_rhs_mmt_i8_i8_i32 failures(propagate) (%generic) : (!transform.any_op) -> !transform.any_op
%lhs = transform.get_operand %generic[0] : (!transform.any_op) -> !transform.any_value
%rhs = transform.get_operand %generic[1] : (!transform.any_op) -> !transform.any_value
transform.iree.match.cast_compatible_type %lhs = tensor<?x64x2480xi8> : !transform.any_value
transform.iree.match.cast_compatible_type %rhs = tensor<640x2480xi8> : !transform.any_value
%config = transform.param.constant #iree_codegen.compilation_info<
lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1],
mma_kind = #iree_gpu.mma_layout<MFMA_I32_16x16x32_I8>,
subgroup_m_count = 2, subgroup_n_count = 1,
reduction = [0, 0, 0, 128],
workgroup = [1, 32, 320, 0]}>,
translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
workgroup_size = [128, 1, 1] subgroup_size = 64,
{gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true>}>
> -> !transform.any_param
transform.yield %generic, %config : !transform.any_op, !transform.any_param
}

transform.named_sequence @match_broadcast_rhs_mmt_Bx4096x5120x640(%generic: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) {
%mmt = transform.include @match_broadcast_rhs_mmt_i8_i8_i32 failures(propagate) (%generic) : (!transform.any_op) -> !transform.any_op
%lhs = transform.get_operand %generic[0] : (!transform.any_op) -> !transform.any_value
@@ -352,6 +547,12 @@ transform.named_sequence @match_matmul_like_Bx20x64x64x2048_transposev_i8xi8xi32
// TUNING_MATCH_BEGIN DO NOT REMOVE

// Matmul.
, @match_mmt_2048x10240x1280 -> @apply_op_config
, @match_mmt_2048x1280x5120 -> @apply_op_config
, @match_mmt_2048x1280x1280 -> @apply_op_config
, @match_mmt_8192x640x640 -> @apply_op_config
, @match_mmt_8192x5120x640 -> @apply_op_config
//, @match_mmt_8192x640x2560 -> @apply_op_config

// Convolution.

@@ -363,6 +564,10 @@ transform.named_sequence @match_matmul_like_Bx20x64x64x2048_transposev_i8xi8xi32
// Carried over from SPX.
, @match_broadcast_rhs_mmt_Bx1024x10240x1280 -> @apply_op_config
, @match_broadcast_rhs_mmt_Bx1024x1280x1280 -> @apply_op_config
, @match_broadcast_rhs_mmt_Bx64x1280x2480 -> @apply_op_config
, @match_broadcast_rhs_mmt_Bx4960x640x640 -> @apply_op_config
//, @match_broadcast_rhs_mmt_Bx64x640x2480 -> @apply_op_config


// Contraction.
, @match_matmul_like_Bx20x1024x64x1280_i8xi8xi32 -> @apply_op_config