diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.td b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.td
index 07fafdc308b2..60fe2137a3a2 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.td
+++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.td
@@ -28,24 +28,26 @@ def CPU_DataTiling
 def LLVMGPU_Default
     : I32EnumAttrCase<"LLVMGPUDefault", 100>;
+def LLVMGPU_BaseLowering
+    : I32EnumAttrCase<"LLVMGPUBaseLowering", 101>;
 def LLVMGPU_SimpleDistribute
-    : I32EnumAttrCase<"LLVMGPUDistribute", 101>;
+    : I32EnumAttrCase<"LLVMGPUDistribute", 102>;
 def LLVMGPU_Vectorize
-    : I32EnumAttrCase<"LLVMGPUVectorize", 102>;
+    : I32EnumAttrCase<"LLVMGPUVectorize", 103>;
 def LLVMGPU_MatmulSimt
-    : I32EnumAttrCase<"LLVMGPUMatmulSimt", 103>;
+    : I32EnumAttrCase<"LLVMGPUMatmulSimt", 104>;
 def LLVMGPU_MatmulTensorCore
-    : I32EnumAttrCase<"LLVMGPUMatmulTensorCore", 104>;
+    : I32EnumAttrCase<"LLVMGPUMatmulTensorCore", 105>;
 def LLVMGPU_TransposeSharedMem
-    : I32EnumAttrCase<"LLVMGPUTransposeSharedMem", 105>;
+    : I32EnumAttrCase<"LLVMGPUTransposeSharedMem", 106>;
 def LLVMGPU_WarpReduction
-    : I32EnumAttrCase<"LLVMGPUWarpReduction", 106>;
+    : I32EnumAttrCase<"LLVMGPUWarpReduction", 107>;
 def LLVMGPU_PackUnPack
-    : I32EnumAttrCase<"LLVMGPUPackUnPack", 107>;
+    : I32EnumAttrCase<"LLVMGPUPackUnPack", 108>;
 def LLVMGPU_MatmulTensorCoreMmaSync
-    : I32EnumAttrCase<"LLVMGPUMatmulTensorCoreMmaSync", 108>;
+    : I32EnumAttrCase<"LLVMGPUMatmulTensorCoreMmaSync", 109>;
 def LLVMGPU_VectorDistribute
-    : I32EnumAttrCase<"LLVMGPUVectorDistribute", 109>;
+    : I32EnumAttrCase<"LLVMGPUVectorDistribute", 110>;
 
 def SPIRV_BaseLowering
     : I32EnumAttrCase<"SPIRVBaseLowering", 200>;
@@ -82,8 +84,8 @@ def DispatchLoweringPassPipelineEnum : I32EnumAttr<
       CPU_DataTiling,
 
       // LLVMGPU CodeGen pipelines
-      LLVMGPU_Default, LLVMGPU_SimpleDistribute, LLVMGPU_Vectorize,
-      LLVMGPU_MatmulSimt, LLVMGPU_MatmulTensorCore,
+      LLVMGPU_Default, LLVMGPU_BaseLowering, LLVMGPU_SimpleDistribute,
+      LLVMGPU_Vectorize, LLVMGPU_MatmulSimt, LLVMGPU_MatmulTensorCore,
       LLVMGPU_TransposeSharedMem, LLVMGPU_WarpReduction, LLVMGPU_PackUnPack,
       LLVMGPU_MatmulTensorCoreMmaSync, LLVMGPU_VectorDistribute,
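Note: inserting LLVMGPU_BaseLowering at 101 shifts the integer value of every later LLVMGPU case up by one (SPIRV cases start at 200 and are unaffected). The pipeline is printed and parsed by its case name rather than its number, which is what the translation_info check in the new test below matches on, so textual IR is insensitive to the renumbering. A minimal sketch of the name round-trip, assuming the stringify/symbolize helper names MLIR conventionally generates for an I32EnumAttr (names not verified against the generated header):

    // Round-tripping is by case name; renumbering the underlying integer does
    // not change how the pipeline prints or parses in IR.
    auto pipeline =
        IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUBaseLowering;
    llvm::StringRef name =
        IREE::Codegen::stringifyDispatchLoweringPassPipeline(pipeline);
    // name == "LLVMGPUBaseLowering"
    std::optional<IREE::Codegen::DispatchLoweringPassPipeline> parsed =
        IREE::Codegen::symbolizeDispatchLoweringPassPipeline(name);
    assert(parsed == pipeline);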
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
index 23d1f74baefb..7e46f1e5d2a1 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -1503,6 +1503,10 @@ static void propagateLoweringConfig(Operation *rootOperation,
   }
 }
 
+//===----------------------------------------------------------------------===//
+// Entry Point
+//===----------------------------------------------------------------------===//
+
 LogicalResult initGPULaunchConfig(ModuleOp moduleOp) {
   llvm::StringMap<IREE::HAL::ExecutableExportOp> exportOps =
       getAllEntryPoints(moduleOp);
@@ -1511,6 +1515,31 @@ LogicalResult initGPULaunchConfig(ModuleOp moduleOp) {
     auto exportOp = exportOps.lookup(funcOp.getName());
     if (!exportOp)
       continue;
+
+    if (!getTranslationInfo(funcOp)) {
+      // If no translation info is set, first check whether we already have a
+      // workgroup count set--it's a "contract" to indicate that we should
+      // bypass all tiling and distribution to go down just the most basic
+      // lowering flow.
+      if (Block *body = exportOp.getWorkgroupCountBody()) {
+        auto retOp = cast<IREE::HAL::ReturnOp>(body->getTerminator());
+        // For scalar dispatch cases--using just one thread of one workgroup.
+        auto isOne = [](Value value) { return matchPattern(value, m_One()); };
+        if (llvm::all_of(retOp.getOperands(), isOne)) {
+          std::array<int64_t, 3> workgroupSize = {1, 1, 1};
+          if (failed(setDispatchConfig(funcOp, workgroupSize, std::nullopt)))
+            return failure();
+          auto translationInfo = IREE::Codegen::TranslationInfoAttr::get(
+              funcOp.getContext(),
+              IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUBaseLowering);
+          if (failed(setTranslationInfo(funcOp, translationInfo))) {
+            return failure();
+          }
+          continue;
+        }
+      }
+    }
+
     SmallVector<Operation *> computeOps = getComputeOps(funcOp);
     if (getTranslationInfo(exportOp)) {
       // Currently LLVMGPU requires propagation of user lowering configs.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp
index 0d9392803511..82a98e060ea2 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp
@@ -83,6 +83,9 @@ void LLVMGPULowerExecutableTargetPass::runOnOperation() {
     case IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUDefault:
       addGPUDefaultPassPipeline(pipeline, enableMicrokernels);
       break;
+    case IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUBaseLowering:
+      addGPUBaseLoweringPassPipeline(pipeline);
+      break;
     case IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUDistribute:
       addGPUSimpleDistributePassPipeline(pipeline);
       break;
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
index 9c80f5096e09..57a936c26e01 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -642,6 +642,29 @@ void addGPUDefaultPassPipeline(OpPassManager &pm, bool enableMicrokernels) {
       createRemoveSingleIterationLoopPass());
 }
 
+void addGPUBaseLoweringPassPipeline(OpPassManager &pm) {
+  auto &nestedModulePM = pm.nest<ModuleOp>();
+
+  nestedModulePM.addNestedPass<func::FuncOp>(
+      createConvertToDestinationPassingStylePass(
+          /*useWARForCooperativeMatrixCodegen=*/false));
+  nestedModulePM.addPass(createCanonicalizerPass());
+  nestedModulePM.addPass(createCSEPass());
+
+  addBufferizePasses(nestedModulePM);
+  nestedModulePM.addPass(createCanonicalizerPass());
+  nestedModulePM.addPass(createCSEPass());
+
+  nestedModulePM.addNestedPass<func::FuncOp>(
+      IREE::LinalgExt::createLinalgExtToLoopsPass());
+  nestedModulePM.addNestedPass<func::FuncOp>(createMemrefCopyToLinalgPass());
+  nestedModulePM.addNestedPass<func::FuncOp>(createConvertLinalgToLoopsPass());
+  nestedModulePM.addNestedPass<func::FuncOp>(
+      createRemoveSingleIterationLoopPass());
+  nestedModulePM.addPass(createCanonicalizerPass());
+  nestedModulePM.addPass(createCSEPass());
+}
+
 // Add passes to make the address computation more explicit and optimize them.
 //
 // The idea here is to be less dependent on what the LLVM backend is able to do,
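Note: the pipeline above deliberately performs no tiling or distribution; it converts the dispatch to destination-passing style, bufferizes, lowers LinalgExt and Linalg ops straight to loops, and cleans up with canonicalization/CSE, which is all a 1x1x1 scalar dispatch needs. A minimal sketch of driving it standalone, for illustration only; the helper name is hypothetical, and in-tree the pipeline is reached through the LLVMGPUBaseLowering switch cases in the lower-executable-target passes:

    // Hypothetical standalone driver. Anchors the pass manager on
    // hal.executable.variant, matching how the in-tree passes build their
    // pipelines (OpPassManager pipeline(variantOp.getOperationName())).
    static LogicalResult runBaseLowering(
        IREE::HAL::ExecutableVariantOp variantOp) {
      PassManager passManager(variantOp.getContext(),
                              IREE::HAL::ExecutableVariantOp::getOperationName());
      addGPUBaseLoweringPassPipeline(passManager);
      return passManager.run(variantOp.getOperation());
    }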
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
index 523381643e81..7f84b6ea533e 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
@@ -58,6 +58,9 @@ void addGPUWarpReductionPassPipeline(OpPassManager &pm);
 /// Default pass pipeline on GPU, currently used only for the ukernel path.
 void addGPUDefaultPassPipeline(OpPassManager &pm, bool enableMicrokernels);
 
+/// Pass pipeline to lower IREE HAL executables without tiling and distribution.
+void addGPUBaseLoweringPassPipeline(OpPassManager &pm);
+
 /// Populates passes needed to preprocess and select the translation strategy.
 void buildLLVMGPUCodegenConfigurationPassPipeline(OpPassManager &pm);
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLKernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLKernelConfig.cpp
index e4cf120bf4d8..0d92fc7e9ffd 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLKernelConfig.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLKernelConfig.cpp
@@ -346,6 +346,27 @@ LogicalResult initROCDLLaunchConfig(ModuleOp moduleOp) {
     if (!exportOp)
       continue;
 
+    // First check whether we already have a workgroup count set--it's a
+    // "contract" to indicate that we should bypass all tiling and
+    // distribution to go down just the most basic lowering flow.
+    if (Block *body = exportOp.getWorkgroupCountBody()) {
+      auto retOp = cast<IREE::HAL::ReturnOp>(body->getTerminator());
+      // For scalar dispatch cases--using just one thread of one workgroup.
+      auto isOne = [](Value value) { return matchPattern(value, m_One()); };
+      if (llvm::all_of(retOp.getOperands(), isOne)) {
+        std::array<int64_t, 3> workgroupSize = {1, 1, 1};
+        if (failed(setDispatchConfig(funcOp, workgroupSize, std::nullopt)))
+          return failure();
+        auto translationInfo = IREE::Codegen::TranslationInfoAttr::get(
+            funcOp.getContext(),
+            IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUBaseLowering);
+        if (failed(setTranslationInfo(funcOp, translationInfo))) {
+          return failure();
+        }
+        continue;
+      }
+    }
+
     SmallVector<Operation *> computeOps = getComputeOps(funcOp);
     if (getTranslationInfo(exportOp)) {
       // Currently ROCDL requires propagation of user lowering configs.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLLowerExecutableTarget.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLLowerExecutableTarget.cpp
index 9025284c0bfc..83c7e8867c8b 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLLowerExecutableTarget.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLLowerExecutableTarget.cpp
@@ -48,6 +48,9 @@ class ROCDLLowerExecutableTargetPass
     OpPassManager pipeline(variantOp.getOperationName());
 
     switch (translationInfo.value().getDispatchLoweringPassPipeline()) {
+    case CodeGenPipeline::LLVMGPUBaseLowering:
+      addGPUBaseLoweringPassPipeline(pipeline);
+      break;
     case CodeGenPipeline::LLVMGPUWarpReduction:
       addGPUWarpReductionPassPipeline(pipeline);
       break;
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLSelectLoweringStrategy.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLSelectLoweringStrategy.cpp
index 5046491079d0..5286cc0a6c23 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLSelectLoweringStrategy.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLSelectLoweringStrategy.cpp
@@ -10,6 +10,7 @@
 #include "iree/compiler/Codegen/LLVMGPU/ROCDLPassDetail.h"
 #include "iree/compiler/Codegen/LLVMGPU/ROCDLPasses.h"
 #include "iree/compiler/Dialect/HAL/IR/HALOps.h"
+#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
 
@@ -21,7 +22,11 @@ class ROCDLSelectLoweringStrategyPass
     : public ROCDLSelectLoweringStrategyBase<ROCDLSelectLoweringStrategyPass> {
 public:
   void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<IREE::Codegen::IREECodegenDialect>();
+    // clang-format off
+    registry
+        .insert<IREE::Codegen::IREECodegenDialect,
+                bufferization::BufferizationDialect>();
+    // clang-format on
  }
 
   void runOnOperation() override {
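Note: the workgroup-count "contract" check is now duplicated nearly verbatim between initGPULaunchConfig, initROCDLLaunchConfig above, and the SPIRV configuration below (the BufferizationDialect registration presumably covers the bufferization the new pipeline performs). A follow-up could hoist the recognizer into a shared helper; a sketch distilled from the duplicated code, with a hypothetical helper name:

    // Hypothetical shared helper (not part of this change): true iff the
    // export op's workgroup count region returns (1, 1, 1), i.e. the
    // scalar-dispatch "contract" recognized above.
    static bool isScalarDispatch(IREE::HAL::ExecutableExportOp exportOp) {
      Block *body = exportOp.getWorkgroupCountBody();
      if (!body)
        return false;
      auto retOp = cast<IREE::HAL::ReturnOp>(body->getTerminator());
      return llvm::all_of(retOp.getOperands(), [](Value value) {
        return matchPattern(value, m_One());
      });
    }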
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/BUILD.bazel
index ece13b17239f..4c9570021c90 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/BUILD.bazel
@@ -19,6 +19,7 @@ iree_lit_test_suite(
     srcs = enforce_glob(
        [
            "config_vector_distribute.mlir",
+            "lowering_scalar_dispatch.mlir",
            "pipeline_vector_distribute.mlir",
            "pipeline_warp_reduction.mlir",
        ],
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/CMakeLists.txt
index 8861e4200a58..8727e44015d3 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/CMakeLists.txt
@@ -15,6 +15,7 @@ iree_lit_test_suite(
     lit
   SRCS
     "config_vector_distribute.mlir"
+    "lowering_scalar_dispatch.mlir"
    "pipeline_vector_distribute.mlir"
    "pipeline_warp_reduction.mlir"
  TOOLS
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/lowering_scalar_dispatch.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/lowering_scalar_dispatch.mlir
new file mode 100644
index 000000000000..16dfc3a62c71
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/lowering_scalar_dispatch.mlir
@@ -0,0 +1,43 @@
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-rocdl-select-lowering-strategy, iree-rocdl-lower-executable-target)))' -mlir-print-local-scope %s | FileCheck %s
+
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {target_arch = "gfx90a", ukernels = "none"}>
+
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
+
+hal.executable @scalar_dispatch {
+  hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
+    hal.executable.export public @scalar_dispatch ordinal(0) layout(#pipeline_layout) {
+    ^bb0(%arg0: !hal.device):
+      %c1 = arith.constant 1 : index
+      hal.return %c1, %c1, %c1 : index, index, index
+    }
+    builtin.module {
+      func.func @scalar_dispatch() {
+        %c0 = arith.constant 0 : index
+        %c6364136223846793005_i64 = arith.constant 6364136223846793005 : i64
+        %c1442695040888963407_i64 = arith.constant 1442695040888963407 : i64
+        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<i64>>
+        %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<i64>>
+        %2 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<i64>> -> tensor<i64>
+        %extracted = tensor.extract %2[] : tensor<i64>
+        %3 = arith.muli %extracted, %c6364136223846793005_i64 : i64
+        %4 = arith.addi %3, %c1442695040888963407_i64 : i64
+        %inserted = tensor.insert %4 into %2[] : tensor<i64>
+        flow.dispatch.tensor.store %inserted, %1, offsets = [], sizes = [], strides = [] : tensor<i64> -> !flow.dispatch.tensor<writeonly:tensor<i64>>
+        return
+      }
+    }
+  }
+}
+
+// CHECK-LABEL: hal.executable.export public @scalar_dispatch
+//  CHECK-SAME:   translation_info = #iree_codegen.translation_info<LLVMGPUBaseLowering>
+//  CHECK-SAME:   workgroup_size = [1 : index, 1 : index, 1 : index]
+
+// CHECK: func.func @scalar_dispatch()
+// CHECK:   %[[SPAN0:.+]] = hal.interface.binding.subspan set(0) binding(0)
+// CHECK:   %[[SPAN1:.+]] = hal.interface.binding.subspan set(0) binding(1)
+// CHECK:   memref.load %[[SPAN0]][] : memref<i64, #hal.descriptor_type<storage_buffer>>
+// CHECK:   arith.muli {{.+}} : i64
+// CHECK:   arith.addi {{.+}} : i64
+// CHECK:   memref.store %{{.+}}, %[[SPAN1]][] : memref<i64, #hal.descriptor_type<storage_buffer>>
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp
index f7d084d5c993..7bbd45f925e5 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp
@@ -1689,20 +1689,22 @@ static LogicalResult setSPIRVOpConfig(const spirv::TargetEnv &targetEnv,
 static LogicalResult setConfigForKernel(const spirv::TargetEnv &targetEnv,
                                         IREE::HAL::ExecutableExportOp exportOp,
                                         mlir::FunctionOpInterface funcOp) {
-  // First check whether we already have workgroup count set--it's a "contract"
-  // to indicate that we should bypass all tiling and distribution to go down
-  // just the most basic lowering flow.
-  if (Block *body = exportOp.getWorkgroupCountBody()) {
-    auto retOp = cast<IREE::HAL::ReturnOp>(body->getTerminator());
-    // For scalar dispatch cases--using just one thread of one workgroup.
-    auto isOne = [](Value value) { return matchPattern(value, m_One()); };
-    if (llvm::all_of(retOp.getOperands(), isOne)) {
-      std::array<int64_t, 3> workgroupSize = {1, 1, 1};
-      if (failed(setDispatchConfig(funcOp, workgroupSize, std::nullopt)))
-        return failure();
-      auto translationInfo = IREE::Codegen::TranslationInfoAttr::get(
-          funcOp.getContext(), CodeGenPipeline::SPIRVBaseLowering);
-      return setTranslationInfo(funcOp, translationInfo);
+  if (!getTranslationInfo(funcOp)) {
+    // If no translation info is set, first check whether we already have a
+    // workgroup count set--it's a "contract" to indicate that we should bypass
+    // all tiling and distribution to go down just the most basic lowering
+    // flow.
+    if (Block *body = exportOp.getWorkgroupCountBody()) {
+      auto retOp = cast<IREE::HAL::ReturnOp>(body->getTerminator());
+      // For scalar dispatch cases--using just one thread of one workgroup.
+      auto isOne = [](Value value) { return matchPattern(value, m_One()); };
+      if (llvm::all_of(retOp.getOperands(), isOne)) {
+        std::array<int64_t, 3> workgroupSize = {1, 1, 1};
+        if (failed(setDispatchConfig(funcOp, workgroupSize, std::nullopt)))
+          return failure();
+        auto translationInfo = IREE::Codegen::TranslationInfoAttr::get(
+            funcOp.getContext(), CodeGenPipeline::SPIRVBaseLowering);
+        return setTranslationInfo(funcOp, translationInfo);
+      }
     }
   }
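Note: this SPIRV hunk wraps the pre-existing contract check in `if (!getTranslationInfo(funcOp))`, matching the guard the LLVMGPU path adds above so that explicit translation info always takes precedence; the ROCDL version above currently performs the check unconditionally. The common shape across backends, sketched with the hypothetical isScalarDispatch() helper from the earlier note:

    // Precedence: (1) translation info already on the function wins;
    // (2) else the all-ones workgroup-count contract selects the base
    // pipeline; (3) else per-op heuristics pick a tiled pipeline.
    if (!getTranslationInfo(funcOp) && isScalarDispatch(exportOp)) {
      // One thread of one workgroup; skip tiling/distribution entirely.
      std::array<int64_t, 3> workgroupSize = {1, 1, 1};
      if (failed(setDispatchConfig(funcOp, workgroupSize, std::nullopt)))
        return failure();
      auto translationInfo = IREE::Codegen::TranslationInfoAttr::get(
          funcOp.getContext(), CodeGenPipeline::SPIRVBaseLowering);
      return setTranslationInfo(funcOp, translationInfo);
    }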