iree-org · qedawkins · Aug 6, 2024 · Jun 11, 2024 · Aug 6, 2024 · Aug 6, 2024
@@ -111,6 +111,24 @@ SmallVector<Value> getTileSizes(OpBuilder &b, Operation *op, unsigned level);
 /// Sets the lowering configuration, overwriting existing attribute values.
 void setLoweringConfig(Operation *op, Attribute config);
 
+/// Attaches `config` to `op` as its lowering configuration, then builds a
+/// translation info attribute from `passPipeline`, a default-constructed
+/// `SymbolRefAttr`, `workgroupSize`, `subgroupSize` and `pipelineConfig`, and
+/// sets it on `entryPointFn`. Returns the result of `setTranslationInfo`.
+inline LogicalResult setOpConfigAndEntryPointFnTranslation(
+    mlir::FunctionOpInterface entryPointFn, Operation *op,
+    IREE::Codegen::LoweringConfigAttrInterface config,
+    IREE::Codegen::DispatchLoweringPassPipeline passPipeline,
+    ArrayRef<int64_t> workgroupSize = {},
+    std::optional<int64_t> subgroupSize = {},
+    DictionaryAttr pipelineConfig = DictionaryAttr()) {
+  // The lowering config lives on the op itself; the remaining arguments are
+  // bundled into the translation info that is set on `entryPointFn`.
+  setLoweringConfig(op, config);
+  return setTranslationInfo(
+      entryPointFn,
+      IREE::Codegen::TranslationInfoAttr::get(
+          entryPointFn.getContext(), passPipeline, SymbolRefAttr(),
+          workgroupSize, subgroupSize, pipelineConfig));
+}
+
 /// Convenience function that sets the lowering configuration on the operation
 /// and translation info on the entry point op for the common case of specifying
 /// tile sizes to use for the operation, and pass pipeline to use for the
@@ -126,11 +144,9 @@ inline LogicalResult setOpConfigAndEntryPointFnTranslation(
   MLIRContext *context = entryPointFn.getContext();
   auto config = IREE::Codegen::LoweringConfigAttr::get(context, tileSizes,
                                                        scalableTileFlags);
-  setLoweringConfig(op, config);
-  auto translationInfo = IREE::Codegen::TranslationInfoAttr::get(
-      entryPointFn.getContext(), passPipeline, SymbolRefAttr(), workgroupSize,
-      subgroupSize, pipelineConfig);
-  return setTranslationInfo(entryPointFn, translationInfo);
+  return setOpConfigAndEntryPointFnTranslation(entryPointFn, op, config,
+                                               passPipeline, workgroupSize,
+                                               subgroupSize, pipelineConfig);
 }
 
 /// Overload of setOpConfigAndEntryPointFnTranslation() for the "no scalable

@@ -46,12 +46,14 @@ iree_td_library(
 iree_compiler_cc_library(
     name = "IREEGPUDialect",
     srcs = [
+        "DerivedConfigUtils.cpp",
         "IREEGPUAttrs.cpp",
         "IREEGPUDialect.cpp",
         "IREEGPUInterfaces.cpp",
         "IREEGPUOps.cpp",
     ],
     hdrs = [
+        "DerivedConfigUtils.h",
         "IREEGPUAttrs.h",
         "IREEGPUDialect.h",
         "IREEGPUEnums.h",
@@ -77,9 +79,9 @@ iree_compiler_cc_library(
         ":IREEGPUInterfaces",
         ":IREEGPUOpsGen",
         "//compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR:IREECodegenDialect",
-        "//compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils:ConfigUtils",
         "//compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR:IREEVectorExtDialect",
         "//compiler/src/iree/compiler/Codegen/Utils:VectorOpUtils",
+        "//compiler/src/iree/compiler/Dialect/LinalgExt/IR",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:AMDGPUDialect",
         "@llvm-project//mlir:AffineDialect",

@@ -14,6 +14,7 @@ iree_cc_library(
   NAME
     IREEGPUDialect
   HDRS
+    "DerivedConfigUtils.h"
     "IREEGPUAttrs.h"
     "IREEGPUDialect.h"
     "IREEGPUEnums.h"
@@ -31,6 +32,7 @@ iree_cc_library(
     "IREEGPUOps.cpp.inc"
     "IREEGPUOps.h.inc"
   SRCS
+    "DerivedConfigUtils.cpp"
     "IREEGPUAttrs.cpp"
     "IREEGPUDialect.cpp"
     "IREEGPUInterfaces.cpp"
@@ -56,9 +58,9 @@ iree_cc_library(
     MLIRVectorDialect
     MLIRVectorInterfaces
     iree::compiler::Codegen::Dialect::Codegen::IR::IREECodegenDialect
-    iree::compiler::Codegen::Dialect::GPU::TargetUtils::ConfigUtils
     iree::compiler::Codegen::Dialect::VectorExt::IR::IREEVectorExtDialect
     iree::compiler::Codegen::Utils::VectorOpUtils
+    iree::compiler::Dialect::LinalgExt::IR
   PUBLIC
 )
 

@@ -0,0 +1,111 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/compiler/Codegen/Dialect/GPU/IR/DerivedConfigUtils.h"
+#include <numeric>
+
+#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
+#include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.h"
+#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/TypeUtilities.h"
+
+namespace mlir::iree_compiler::IREE::GPU {
+
+// Bit budget from which the derive*OpThreadTileSizes helpers in this file
+// compute their preferred vector size: this value divided by the bit width of
+// the result element type.
+static constexpr int64_t kPreferredCopyNumBits = 128;
+
+/// Computes per-thread tile sizes for a statically shaped iteration space.
+///
+/// `loopRanges` holds the loop trip counts, `numThreads` the number of threads
+/// to spread them over, and `vectorSize` the preferred number of elements per
+/// thread along the innermost loop. The innermost tile size is `vectorSize`,
+/// halved until it divides the per-thread element count and fits inside the
+/// innermost loop. The threads left over are then assigned to the outer
+/// loops, innermost first; entries still at 0 received no tile size.
+///
+/// Returns an empty vector when no tile sizes can be derived: no loops at
+/// all, a dynamic or non-positive trip count, a non-positive `numThreads` or
+/// `vectorSize`, a total trip count that is not a multiple of `numThreads`, or
+/// an innermost loop that alone needs more tiles than there are threads while
+/// outer loops remain to be distributed.
+SmallVector<int64_t>
+getThreadTileSizesFromLoopRanges(SmallVector<int64_t> loopRanges,
+                                 int64_t numThreads, int64_t vectorSize) {
+  // A rank-0 iteration space has no innermost loop to vectorize, and a
+  // non-positive thread count or vector size would be used as a divisor below.
+  if (loopRanges.empty() || numThreads <= 0 || vectorSize <= 0) {
+    return {};
+  }
+
+  // TODO: We shouldn't need this check, however loop fusion currently requires
+  // loop trip counts to be identical, meaning we need to use a num_threads
+  // variant of tiling. Remove this and simply return the preferred vector size
+  // once loop fusion can resolve the forall properly.
+  // Zero-extent (or otherwise non-positive) loops are rejected as well: there
+  // is nothing to distribute and the trip counts are used as divisors below.
+  if (llvm::any_of(loopRanges, [](int64_t s) {
+        return ShapedType::isDynamic(s) || s <= 0;
+      })) {
+    return {};
+  }
+
+  // Seed with an int64_t: std::accumulate uses the type of the initial value
+  // as its accumulator, so a plain `1` would truncate the product to int.
+  int64_t flatNumTrips =
+      std::accumulate(loopRanges.begin(), loopRanges.end(), int64_t{1},
+                      std::multiplies<int64_t>());
+  if (flatNumTrips % numThreads != 0) {
+    return {};
+  }
+  int64_t maxVectorSize = flatNumTrips / numThreads;
+
+  // Shrink the vector until it both divides the per-thread element count and
+  // fits in the innermost loop. The latter keeps the innermost tile count
+  // below from being zero. The loop stops at 1 at the latest because every
+  // trip count is at least 1 here.
+  int64_t innerRange = loopRanges.back();
+  while (maxVectorSize % vectorSize != 0 || vectorSize > innerRange) {
+    vectorSize /= 2;
+  }
+
+  SmallVector<int64_t> tileSizes(loopRanges.size(), 0);
+  tileSizes.back() = vectorSize;
+  int64_t residualNumThreads = numThreads / (innerRange / vectorSize);
+  // If the innermost loop alone already needs more tiles than there are
+  // threads, no threads are left for the outer loops and the division in the
+  // loop below would be by zero.
+  if (residualNumThreads == 0 && loopRanges.size() > 1) {
+    return {};
+  }
+  for (int64_t i = static_cast<int64_t>(tileSizes.size()) - 2; i >= 0; --i) {
+    if (loopRanges[i] >= residualNumThreads) {
+      // This loop absorbs all remaining threads; outer loops keep 0.
+      tileSizes[i] = loopRanges[i] / residualNumThreads;
+      residualNumThreads = 1;
+      break;
+    }
+    // The loop is smaller than the remaining thread count: one iteration per
+    // thread, and carry the rest of the threads outwards.
+    tileSizes[i] = 1;
+    residualNumThreads /= loopRanges[i];
+  }
+
+  return tileSizes;
+}
+
+/// Derives per-thread tile sizes for a single-result linalg op with pure
+/// tensor semantics, distributing its static loop ranges over `numThreads`.
+/// The preferred vector size is `kPreferredCopyNumBits` divided by the bit
+/// width of the result element type, and at least 1.
+///
+/// Returns an empty vector when the op does not have pure tensor semantics,
+/// has a result count other than one, has a result element type that is not
+/// an integer or float (or has zero width), or when
+/// `getThreadTileSizesFromLoopRanges` cannot derive tile sizes.
+SmallVector<int64_t> deriveLinalgOpThreadTileSizes(linalg::LinalgOp linalgOp,
+                                                   int64_t numThreads) {
+  if (!linalgOp.hasPureTensorSemantics()) {
+    return {};
+  }
+  // TODO: Support multi-result
+  if (linalgOp->getNumResults() != 1) {
+    return {};
+  }
+  // getIntOrFloatBitWidth() is only valid on integer and float types, so bail
+  // out on anything else (e.g. index) instead of tripping its assertion.
+  Type elementType = getElementTypeOrSelf(linalgOp->getResultTypes()[0]);
+  if (!elementType.isIntOrFloat()) {
+    return {};
+  }
+  int64_t bitWidth = elementType.getIntOrFloatBitWidth();
+  if (bitWidth == 0) {
+    return {};
+  }
+  // Elements wider than the preferred copy width would yield a vector size of
+  // zero; fall back to one element per vector in that case.
+  int64_t vectorSize = kPreferredCopyNumBits / bitWidth;
+  if (vectorSize == 0) {
+    vectorSize = 1;
+  }
+  SmallVector<int64_t> loopRanges = linalgOp.getStaticLoopRanges();
+  return getThreadTileSizesFromLoopRanges(loopRanges, numThreads, vectorSize);
+}
+
+/// Derives per-thread tile sizes for an im2col op with pure tensor semantics,
+/// distributing the shape of its output type over `numThreads`. The preferred
+/// vector size is `kPreferredCopyNumBits` divided by the bit width of the
+/// result element type, and at least 1.
+///
+/// Returns an empty vector when the op does not have pure tensor semantics,
+/// has a result element type that is not an integer or float (or has zero
+/// width), or when `getThreadTileSizesFromLoopRanges` cannot derive tile
+/// sizes.
+SmallVector<int64_t>
+deriveIm2colOpThreadTileSizes(IREE::LinalgExt::Im2colOp im2colOp,
+                              int64_t numThreads) {
+  if (!im2colOp.hasPureTensorSemantics()) {
+    return {};
+  }
+  // getIntOrFloatBitWidth() is only valid on integer and float types, so bail
+  // out on anything else (e.g. index) instead of tripping its assertion.
+  Type elementType = getElementTypeOrSelf(im2colOp->getResultTypes()[0]);
+  if (!elementType.isIntOrFloat()) {
+    return {};
+  }
+  int64_t bitWidth = elementType.getIntOrFloatBitWidth();
+  if (bitWidth == 0) {
+    return {};
+  }
+  // Elements wider than the preferred copy width would yield a vector size of
+  // zero; fall back to one element per vector in that case.
+  int64_t vectorSize = kPreferredCopyNumBits / bitWidth;
+  if (vectorSize == 0) {
+    vectorSize = 1;
+  }
+  // TODO(Max191): Add `getStaticLoopRanges` to TilingInterface, and use it
+  // here instead of `im2colOp.getOutputType().getShape()`. Then we can also
+  // get rid of the specialization for Im2colOp vs LinalgOp and just use
+  // TilingInterface ops.
+  SmallVector<int64_t> loopRanges(im2colOp.getOutputType().getShape());
+  return getThreadTileSizesFromLoopRanges(loopRanges, numThreads, vectorSize);
+}
+
+/// Derives per-thread tile sizes for `op` from the workgroup size of its
+/// parent function. The thread count is the product of all workgroup size
+/// entries. Linalg ops and `IREE::LinalgExt::Im2colOp` are dispatched to their
+/// dedicated helpers; any other op yields an empty vector.
+///
+/// Also returns an empty vector when `op` has no parent function or when
+/// `getWorkgroupSize` yields no workgroup size for it.
+SmallVector<int64_t> deriveThreadTileSizes(Operation *op) {
+  // NOTE(review): `getWorkgroupSize` is defined elsewhere; presumably it
+  // expects a non-null function, so guard against ops without one.
+  auto funcOp = op->getParentOfType<FunctionOpInterface>();
+  if (!funcOp) {
+    return {};
+  }
+  std::optional<SmallVector<int64_t>> workgroupSize = getWorkgroupSize(funcOp);
+  if (!workgroupSize) {
+    return {};
+  }
+  // Seed with an int64_t: std::accumulate uses the type of the initial value
+  // as its accumulator, so a plain `1` would accumulate in int.
+  int64_t numThreads =
+      std::accumulate(workgroupSize->begin(), workgroupSize->end(),
+                      int64_t{1}, std::multiplies<int64_t>());
+  return TypeSwitch<Operation *, SmallVector<int64_t>>(op)
+      .Case([&](linalg::LinalgOp linalgOp) -> SmallVector<int64_t> {
+        return deriveLinalgOpThreadTileSizes(linalgOp, numThreads);
+      })
+      .Case([&](IREE::LinalgExt::Im2colOp im2colOp) -> SmallVector<int64_t> {
+        return deriveIm2colOpThreadTileSizes(im2colOp, numThreads);
+      })
+      .Default([](Operation *op) -> SmallVector<int64_t> { return {}; });
+}
+
+} // namespace mlir::iree_compiler::IREE::GPU
@@ -0,0 +1,18 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_COMPILER_CODEGEN_DIALECT_GPU_IR_DERIVEDCONFIGUTILS_H_
+#define IREE_COMPILER_CODEGEN_DIALECT_GPU_IR_DERIVEDCONFIGUTILS_H_
+
+#include "mlir/IR/Operation.h"
+
+namespace mlir::iree_compiler::IREE::GPU {
+
+/// Derives per-thread tile sizes for `op` from the workgroup size of its
+/// parent function. Only linalg ops and `IREE::LinalgExt::Im2colOp` are
+/// handled. An empty vector is returned for any other op, when
+/// `getWorkgroupSize` yields nothing for the parent function, or when no tile
+/// sizes could be derived.
+SmallVector<int64_t> deriveThreadTileSizes(Operation *op);
+
+} // namespace mlir::iree_compiler::IREE::GPU
+
+#endif // IREE_COMPILER_CODEGEN_DIALECT_GPU_IR_DERIVEDCONFIGUTILS_H_
@@ -7,10 +7,11 @@
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
 #include <numeric>
 
+#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenInterfaces.h"
+#include "iree/compiler/Codegen/Dialect/GPU/IR/DerivedConfigUtils.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUInterfaces.h"
-#include "iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h"
 #include "iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtDialect.h"
 #include "iree/compiler/Codegen/Utils/VectorOpUtils.h"
 #include "llvm/ADT/STLExtras.h"

@@ -21,8 +21,9 @@ iree_compiler_cc_library(
         "ConfigUtils.h",
     ],
     deps = [
+        "//compiler/src/iree/compiler/Codegen/Common/GPU:GPUHeuristics",
         "//compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR:IREECodegenDialect",
-        "//compiler/src/iree/compiler/Dialect/LinalgExt/IR",
+        "//compiler/src/iree/compiler/Codegen/Dialect/GPU/IR:IREEGPUDialect",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:FunctionInterfaces",
         "@llvm-project//mlir:IR",

@@ -23,8 +23,9 @@ iree_cc_library(
     MLIRIR
     MLIRLinalgDialect
     MLIRSupport
+    iree::compiler::Codegen::Common::GPU::GPUHeuristics
     iree::compiler::Codegen::Dialect::Codegen::IR::IREECodegenDialect
-    iree::compiler::Dialect::LinalgExt::IR
+    iree::compiler::Codegen::Dialect::GPU::IR::IREEGPUDialect
   PUBLIC
 )