From 3b751a4d2797d29422e08327b1a53933448a26fd Mon Sep 17 00:00:00 2001 From: Prashant Kumar Date: Fri, 25 Oct 2024 08:00:21 +0530 Subject: [PATCH] [LLVMCPU] Enable tileDispatchUsingForall as default (#18777) --- .../Common/TileDispatchUsingForall.cpp | 13 +++++---- .../iree/compiler/Codegen/LLVMCPU/Passes.cpp | 28 +++++++------------ .../test/ROCDL/pipeline_tile_and_fuse.mlir | 4 +-- .../Dialect/Stream/Builtins/fill_i64.mlir | 9 +++--- .../Dialect/Stream/Builtins/splat_i64.mlir | 9 +++--- .../DispatchCreation/FormDispatchRegions.cpp | 8 ++++++ .../onnx_ops/onnx_ops_cpu_llvm_sync.json | 7 ----- 7 files changed, 38 insertions(+), 40 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Common/TileDispatchUsingForall.cpp b/compiler/src/iree/compiler/Codegen/Common/TileDispatchUsingForall.cpp index ebbe585bf53e..218b7f5217f1 100644 --- a/compiler/src/iree/compiler/Codegen/Common/TileDispatchUsingForall.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/TileDispatchUsingForall.cpp @@ -202,13 +202,16 @@ static LogicalResult dropUnitDistributedDims(RewriterBase &rewriter, llvm::SmallDenseSet droppedLoops; for (auto [index, lb, ub, step] : llvm::enumerate(mixedLbs, mixedUbs, mixedSteps)) { - if (!isa(lb) || !isa(ub) || !isa(step)) { + + std::optional lbVal = getConstantIntValue(lb); + std::optional ubVal = getConstantIntValue(ub); + std::optional stepVal = getConstantIntValue(step); + + if (!(lbVal && ubVal && stepVal)) { continue; } - int64_t lbVal = getConstantIntValue(lb).value(); - int64_t ubVal = getConstantIntValue(ub).value(); - int64_t stepVal = getConstantIntValue(step).value(); - if (CEILDIV(ubVal - lbVal, stepVal) == 1) { + + if (CEILDIV(ubVal.value() - lbVal.value(), stepVal.value()) == 1) { droppedLoops.insert(index); } } diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp index 0951fbba4273..71b3aec7389f 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp +++ 
b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp @@ -95,7 +95,7 @@ static llvm::cl::opt clEnableVectorContractCustomKernels( static llvm::cl::opt clTileDispatchUsingForall( "iree-llvmcpu-tile-dispatch-using-forall", llvm::cl::desc("Enable tile and distribute to workgroups using scf.forall"), - llvm::cl::init(false)); + llvm::cl::init(true)); // By default, IREE does not enable the Armv9-A streaming SVE mode in the // presence of scalable vectors (even when using `+sme`), as currently there's @@ -111,9 +111,8 @@ static llvm::cl::opt clForceArmStreaming( llvm::cl::init(false)); // TODO: Enable `TileDispatchUsingForall` for every pipeline. -static void addTileAndDistributePasses(OpPassManager &funcPassManager, - bool enableTileDispatchUsingForall) { - if (enableTileDispatchUsingForall || clTileDispatchUsingForall) { +static void addTileAndDistributePasses(OpPassManager &funcPassManager) { + if (clTileDispatchUsingForall) { funcPassManager.addPass( createTileAndDistributeToWorkgroupsUsingForallOpPass()); } else { @@ -346,8 +345,7 @@ void buildLLVMCPUVectorLoweringPipeline( void addCPUBufferOpsTileAndVectorizePipeline( OpPassManager &funcPassManager, TilingConfig &tilingConfig, LLVMCPUPipelineOptions &pipelineOpt) { - addTileAndDistributePasses(funcPassManager, - /*enableTileDispatchUsingForall=*/true); + addTileAndDistributePasses(funcPassManager); // Skip tiling reduction loops because this is expected to apply on copy ops // only. @@ -384,8 +382,7 @@ void addCPUBufferOpsTileAndVectorizePipeline( void addMultiTilingExpertPassPipeline(OpPassManager &funcPassManager, TilingConfig &tilingConfig, LLVMCPUPipelineOptions &pipelineOpt) { - addTileAndDistributePasses(funcPassManager, - /*enableTileDispatchUsingForall=*/true); + addTileAndDistributePasses(funcPassManager); SmallVector allFusableLevels(tilingConfig.getFusableLevels()); // Apply tile and fuse to all the non-distribution fusable levels. 
Skip @@ -464,8 +461,7 @@ void addMultiTilingExpertPassPipeline(OpPassManager &funcPassManager, void addConvTileAndDecomposeExpertPassPipeline( OpPassManager &funcPassManager, TilingConfig &tilingConfig, LLVMCPUPipelineOptions &pipelineOpt) { - addTileAndDistributePasses(funcPassManager, - /*enableTileDispatchUsingForall=*/true); + addTileAndDistributePasses(funcPassManager); // Run LLVMTileAndFuse firstly in case that we have fill + conv + generic // ops. At this stage, we do not apply vectorization. The reduction dim won't @@ -528,8 +524,7 @@ void addConvTileAndDecomposeExpertPassPipeline( void addMmt4dTilingExpertPassPipeline(OpPassManager &funcPassManager, TilingConfig &tilingConfig, LLVMCPUPipelineOptions &pipelineOpt) { - addTileAndDistributePasses(funcPassManager, - /*enableTileDispatchUsingForall=*/true); + addTileAndDistributePasses(funcPassManager); funcPassManager.addPass(createLLVMCPUTileAndFusePass( static_cast(tilingConfig.getVectorCommonParallelLevel()))); @@ -577,8 +572,7 @@ void addMmt4dTilingExpertPassPipeline(OpPassManager &funcPassManager, void addCPUDataTilingPipeline(OpPassManager &funcPassManager, TilingConfig &tilingConfig, LLVMCPUPipelineOptions &pipelineOpt) { - addTileAndDistributePasses(funcPassManager, - /*enableTileDispatchUsingForall=*/true); + addTileAndDistributePasses(funcPassManager); // The below two passes are nop if pack/unpack is not specified in ukernels // attribute. By default, they are disabled. 
@@ -621,8 +615,7 @@ void addCPUDataTilingPipeline(OpPassManager &funcPassManager, void addCPULinalgExtTileAndVectorizePipeline( OpPassManager &funcPassManager, TilingConfig &tilingConfig, LLVMCPUPipelineOptions &pipelineOpt) { - addTileAndDistributePasses(funcPassManager, - /*enableTileDispatchUsingForall=*/false); + addTileAndDistributePasses(funcPassManager); funcPassManager.addPass( createLLVMCPUTilePass(tilingConfig.getVectorCommonParallelLevel())); // TODO: Remove the pass once we have PartialReductionOpInterface implemented @@ -661,8 +654,7 @@ void addCPULinalgExtTileAndVectorizePipeline( } void addCPUDefaultPassPipeline(OpPassManager &funcPassManager) { - addTileAndDistributePasses(funcPassManager, - /*enableTileDispatchUsingForall=*/false); + addTileAndDistributePasses(funcPassManager); addCPUBufferizePasses(funcPassManager); } diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir index 912acf310b26..2ebc85496759 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir @@ -290,7 +290,7 @@ hal.executable private @main { // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C720:.+]] = arith.constant 720 : index // CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK: scf.forall ({{.*}}) in (2, 4, 1, 5) { +// CHECK: scf.forall ({{.*}}) in (2, 4, 5) { // CHECK: %[[LOOP:.+]] = scf.for %[[IV:.+]] = %[[C0]] to %[[C720]] step %[[C2]] {{.*}} -> (vector<1x4x1x4x4x1xf32>) // CHECK: gpu.barrier // CHECK-DAG: %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<8xf16> @@ -307,7 +307,7 @@ hal.executable private @main { // CHECK: %[[LOOP_T:.+]] = vector.transpose %[[LOOP]], [0, 1, 2, 4, 3, 5] : vector<1x4x1x4x4x1xf32> to vector<1x4x1x4x4x1xf32> // CHECK: %[[EXTRACT:.+]] = vector.extract 
%[[LOOP_T]][0] : vector<4x1x4x4x1xf32> from vector<1x4x1x4x4x1xf32> // CHECK: vector.transfer_write %[[EXTRACT]], %[[B2]] -// CHECK: } {mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping]} +// CHECK: } {mapping = [#iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping, #iree_codegen.workgroup_mapping]} // ----- diff --git a/compiler/src/iree/compiler/Dialect/Stream/Builtins/fill_i64.mlir b/compiler/src/iree/compiler/Dialect/Stream/Builtins/fill_i64.mlir index 96e527a20f0f..5d7d686b5bd1 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/Builtins/fill_i64.mlir +++ b/compiler/src/iree/compiler/Dialect/Stream/Builtins/fill_i64.mlir @@ -9,16 +9,17 @@ stream.executable private @__builtin_fill_i64 { stream.executable.export public @__builtin_fill_i64 workgroups(%arg0: index) -> (index, index, index) { - %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0 + %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @__builtin_fill_i64(%value: i64, %count: index, %out_binding: !stream.binding) { %c0 = arith.constant 0 : index - %out = stream.binding.subspan %out_binding[%c0] : !stream.binding -> !flow.dispatch.tensor>{%count} - %0 = tensor.empty(%count) : tensor + %count0 = flow.dispatch.workload.ordinal %count, 0 : index + %out = stream.binding.subspan %out_binding[%c0] : !stream.binding -> !flow.dispatch.tensor>{%count0} + %0 = tensor.empty(%count0) : tensor %1 = linalg.fill ins(%value : i64) outs(%0 : tensor) -> tensor - flow.dispatch.tensor.store %1, %out, offsets = [0], sizes = [%count], strides = [1] : tensor -> !flow.dispatch.tensor>{%count} + flow.dispatch.tensor.store %1, %out, offsets = [0], sizes = [%count0], strides = [1] : tensor -> !flow.dispatch.tensor>{%count0} return } } diff --git a/compiler/src/iree/compiler/Dialect/Stream/Builtins/splat_i64.mlir 
b/compiler/src/iree/compiler/Dialect/Stream/Builtins/splat_i64.mlir index 7d94e51a26d7..4d25d358c7b4 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/Builtins/splat_i64.mlir +++ b/compiler/src/iree/compiler/Dialect/Stream/Builtins/splat_i64.mlir @@ -9,16 +9,17 @@ stream.executable private @__builtin_splat_i64 { stream.executable.export public @__builtin_splat_i64 workgroups(%arg0: index) -> (index, index, index) { - %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0 + %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0 stream.return %x, %y, %z : index, index, index } builtin.module { func.func @__builtin_splat_i64(%value: i64, %count: index, %out_binding: !stream.binding) { %c0 = arith.constant 0 : index - %out = stream.binding.subspan %out_binding[%c0] : !stream.binding -> !flow.dispatch.tensor>{%count} - %0 = tensor.empty(%count) : tensor + %count0 = flow.dispatch.workload.ordinal %count, 0 : index + %out = stream.binding.subspan %out_binding[%c0] : !stream.binding -> !flow.dispatch.tensor>{%count0} + %0 = tensor.empty(%count0) : tensor %1 = linalg.fill ins(%value : i64) outs(%0 : tensor) -> tensor - flow.dispatch.tensor.store %1, %out, offsets = [0], sizes = [%count], strides = [1] : tensor -> !flow.dispatch.tensor>{%count} + flow.dispatch.tensor.store %1, %out, offsets = [0], sizes = [%count0], strides = [1] : tensor -> !flow.dispatch.tensor>{%count0} return } } diff --git a/compiler/src/iree/compiler/DispatchCreation/FormDispatchRegions.cpp b/compiler/src/iree/compiler/DispatchCreation/FormDispatchRegions.cpp index e866022eb9a9..b38b1a593001 100644 --- a/compiler/src/iree/compiler/DispatchCreation/FormDispatchRegions.cpp +++ b/compiler/src/iree/compiler/DispatchCreation/FormDispatchRegions.cpp @@ -547,6 +547,14 @@ isFusableWithConsumer(OpOperand &fusedOperand, return false; } + // TODO: Enable grouped convolution and depth wise pooling fusion. 
+ // Right now, this is going through the default CPU pipeline and not through + // CONVTilingExpert. + if (isa(producer)) { + return false; + } + auto producerFusionOp = dyn_cast(producer); auto consumerFusionOp = diff --git a/tests/external/iree-test-suites/onnx_ops/onnx_ops_cpu_llvm_sync.json b/tests/external/iree-test-suites/onnx_ops/onnx_ops_cpu_llvm_sync.json index a025431d7af4..f8ca790fe5b1 100644 --- a/tests/external/iree-test-suites/onnx_ops/onnx_ops_cpu_llvm_sync.json +++ b/tests/external/iree-test-suites/onnx_ops/onnx_ops_cpu_llvm_sync.json @@ -392,13 +392,6 @@ "onnx/node/generated/test_softsign_example", "onnx/node/generated/test_stft", "onnx/node/generated/test_stft_with_window", - "onnx/node/generated/test_tfidfvectorizer_tf_batch_onlybigrams_skip0", - "onnx/node/generated/test_tfidfvectorizer_tf_batch_onlybigrams_skip5", - "onnx/node/generated/test_tfidfvectorizer_tf_batch_uniandbigrams_skip5", - "onnx/node/generated/test_tfidfvectorizer_tf_only_bigrams_skip0", - "onnx/node/generated/test_tfidfvectorizer_tf_onlybigrams_levelempty", - "onnx/node/generated/test_tfidfvectorizer_tf_onlybigrams_skip5", - "onnx/node/generated/test_tfidfvectorizer_tf_uniandbigrams_skip5", "onnx/node/generated/test_training_dropout", "onnx/node/generated/test_training_dropout_default", "onnx/node/generated/test_training_dropout_default_mask",