# Port InsertGpuAlloc, SetSPIRVCapabilities & SetAbiAttrPass from main #310
# InsertGpuAllocs Pass

The InsertGpuAllocs pass, as the name suggests, inserts GPU allocs in the IR. memref.alloc is an operation in the memref dialect that can be used to allocate memory on the host side or on the device side. The MLIR IR is a mix of host and device code. To distinguish between host-side and device-side memory allocations, we convert every memref.alloc that refers to device (GPU) side memory allocations and references into gpu.alloc, which is an operation of the upstream GPU dialect. This distinction helps when lowering to LLVM, so that the appropriate memory allocation operation is called at runtime.
The pass traverses all memref load/store operations inside the gpu.launch op in the IR and checks their aliases and defining ops. If the defining op is a memref.alloc op, the pass replaces it with a gpu.alloc op, because all operations under the gpu.launch op are device-side computations and will execute on the device.
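The traversal described above can be sketched as a simplified model. This is an illustrative Python sketch, not the actual pass (which is written in C++ against the MLIR API); the dict-based "ops" and the `insert_gpu_allocs` helper are assumptions made purely for illustration:

```python
# Simplified, illustrative model of the InsertGpuAllocs rewrite.
# MLIR operations are modeled as plain dicts; this is NOT the MLIR C++ API.

def insert_gpu_allocs(func_ops, launch_region_ops):
    """Rewrite memref.alloc ops whose results are accessed by load/store
    operations inside a gpu.launch region into gpu.alloc ops."""
    # Collect every memref value accessed by device-side load/store ops.
    device_values = set()
    for op in launch_region_ops:
        if op["name"] in ("memref.load", "memref.store"):
            device_values.add(op["memref"])

    # Rewrite the defining op of each device-accessed memref.
    for op in func_ops:
        if op["name"] == "memref.alloc" and op["result"] in device_values:
            op["name"] = "gpu.alloc"
            op["attrs"] = {"gpu.alloc_shared": True}
    return func_ops

# Toy IR mirroring the example below: three host allocs, all used on device.
host_ops = [
    {"name": "memref.alloc", "result": "%0"},
    {"name": "memref.alloc", "result": "%1"},
    {"name": "memref.alloc", "result": "%2"},
]
device_ops = [
    {"name": "memref.load", "memref": "%0"},
    {"name": "memref.load", "memref": "%1"},
    {"name": "memref.store", "memref": "%2"},
]
rewritten = insert_gpu_allocs(host_ops, device_ops)
print([op["name"] for op in rewritten])  # ['gpu.alloc', 'gpu.alloc', 'gpu.alloc']
```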
# Example

```
// -----// IR Dump Before {anonymous}::InsertGPUAllocs //----- //
func.func @main() {
  %0 = memref.alloc() : memref<8xf32>
  %1 = memref.alloc() : memref<8xf32>
  %2 = memref.alloc() : memref<8xf32>
  ...
  gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %c8, %arg7 = %c1, %arg8 = %c1) threads(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) {
    %7 = gpu.block_id x
    %8 = memref.load %0[%7] : memref<8xf32>
    %9 = memref.load %1[%7] : memref<8xf32>
    %10 = arith.addf %8, %9 : f32
    memref.store %10, %2[%7] : memref<8xf32>
    gpu.terminator
  }
  %6 = memref.cast %2 : memref<8xf32> to memref<*xf32>
  call @printMemrefF32(%6) : (memref<*xf32>) -> ()
  return
}
```
The Pass will change the IR to:
```
// -----// IR Dump After {anonymous}::InsertGPUAllocs //----- //
func.func @main() {
  %memref = gpu.alloc () {gpu.alloc_shared} : memref<8xf32>
  %memref_2 = gpu.alloc () {gpu.alloc_shared} : memref<8xf32>
  %memref_3 = gpu.alloc () {gpu.alloc_shared} : memref<8xf32>
  ...
  gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %c8, %arg7 = %c1, %arg8 = %c1) threads(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) {
    %4 = gpu.block_id x
    %5 = memref.load %memref[%4] : memref<8xf32>
    %6 = memref.load %memref_2[%4] : memref<8xf32>
    %7 = arith.addf %5, %6 : f32
    memref.store %7, %memref_3[%4] : memref<8xf32>
    gpu.terminator
  }
  %3 = memref.cast %memref_3 : memref<8xf32> to memref<*xf32>
  call @printMemrefF32(%3) : (memref<*xf32>) -> ()
  return
}
```
> Review comment: Examples are great. Can you provide a smaller example? Much of this is not really needed to show what's happening and is rather distracting.
>
> Review comment: Are there any ways to remove or reduce these lines?
>
> Reply: removed

As shown in the example above, the memref.allocs in the IR refer to device buffer allocations, and hence they are replaced with gpu.alloc from the GPU dialect.
## Limitations of this pass

1. This pass supports only memref::AllocOp, and not its variants such as memref::AllocaOp, memref::AllocaScopeOp & AllocaScopeReturnOp.
2. This pass needs to be run before the GpuKernelOutlining pass, since it operates on gpu.launch blocks and not on gpu.launch_func.
3. This pass covers only static shapes and shapes with unknown dims but known rank.

Note: We plan to address these limitations in incremental future PRs.
## Reason for this Custom Pass

Upstream does not have a pass which does these conversions. Our goal is to contribute this pass upstream, as we think it will be useful to the MLIR community.
# SetSPIRVAbiAttribute Pass

The SetSPIRVAbiAttribute pass adds a kernel attribute called spv.entry_point_abi to the kernel function. SPIR-V programs themselves are not enough for running workloads on a GPU; a companion host application is needed to manage the resources referenced by the SPIR-V programs and dispatch the workload. It is also quite possible that those two programs are written in different front-end languages. Hence the need to add the entry point ABI.
spv.entry_point_abi is a struct attribute that should be attached to the entry function. Some of the lowering passes expect this attribute in order to perform the lowering.

> Review comment: I do not understand how this can be useful if all it does is adding a constant attribute. If it gets added to all entry functions, how does it make a difference. I guess I am missing something fundamental.
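The pass's behavior (attach the attribute to kernel functions that do not already carry it) can be sketched as a simplified model. This is an illustrative Python sketch with assumed dict-based "functions", not the real C++ pass or the MLIR attribute API:

```python
# Simplified, illustrative model of SetSPIRVAbiAttribute:
# attach spv.entry_point_abi to every GPU kernel function, if missing.
# The real pass builds a spirv EntryPointABI attribute via the MLIR C++
# API; here the empty struct attribute #spv.entry_point_abi<> is
# modeled as an empty dict.

def set_spirv_abi_attribute(gpu_module):
    for func in gpu_module["funcs"]:
        if func.get("kernel") and "spv.entry_point_abi" not in func["attrs"]:
            func["attrs"]["spv.entry_point_abi"] = {}
    return gpu_module

# Toy module mirroring the example below: one kernel function.
module = {"funcs": [{"name": "main_kernel", "kernel": True, "attrs": {}}]}
set_spirv_abi_attribute(module)
print("spv.entry_point_abi" in module["funcs"][0]["attrs"])  # True
```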
# Example

```
// -----// IR Dump Before {anonymous}::SetSPIRVAbiAttribute () //----- //
gpu.module @main_kernel {
  gpu.func @main_kernel(%arg0: memref<8xf32>, %arg1: memref<8xf32>, %arg2: memref<8xf32>) kernel {
    cf.br ^bb1
  ^bb1:  // pred: ^bb0
    %0 = gpu.block_id x
    %1 = memref.load %arg0[%0] : memref<8xf32>
    %2 = memref.load %arg1[%0] : memref<8xf32>
    %3 = arith.addf %1, %2 : f32
    memref.store %3, %arg2[%0] : memref<8xf32>
    gpu.return
  }
}
```
The Pass will change the IR to:
```
// -----// IR Dump After {anonymous}::SetSPIRVAbiAttribute () //----- //
gpu.module @main_kernel {
  gpu.func @main_kernel(%arg0: memref<8xf32>, %arg1: memref<8xf32>, %arg2: memref<8xf32>) kernel attributes {spv.entry_point_abi = #spv.entry_point_abi<>} {
    cf.br ^bb1
  ^bb1:  // pred: ^bb0
    %0 = gpu.block_id x
    %1 = memref.load %arg0[%0] : memref<8xf32>
    %2 = memref.load %arg1[%0] : memref<8xf32>
    %3 = arith.addf %1, %2 : f32
    memref.store %3, %arg2[%0] : memref<8xf32>
    gpu.return
  }
}
```
As shown in the example above, the spv.entry_point_abi kernel attribute is added after the pass runs.
## Reason for this Custom Pass

Upstream does not have a pass which does these conversions. This is a very small pass, so we can perhaps keep it as a custom pass rather than upstreaming it.
# SetSPIRVCapabilities Pass

SPIR-V aims to support multiple execution environments, and these environments affect the availability of certain SPIR-V features. SPIR-V compilation should therefore take the execution environment into consideration, so that we generate SPIR-V modules valid for the target environment. This is conveyed by the spv.target_env attribute. The SetSPIRVCapabilities pass adds these various capabilities for SPIR-V execution.

A #spv.vce (spirv::VerCapExtAttr) attribute has the following fields:
1. The target SPIR-V version.
2. A list of SPIR-V capabilities for the target. Capabilities are specific features supported by the target architecture. For example, the VectorAnyIntel capability means the target architecture can handle vectors of any length from 2 to 2^64-1. A SPIR-V module needs to declare the capabilities it uses so that the client API consuming the module knows which features are required, and may decide to accept or reject the module based on whether it supports them. It also allows a validator to check that the module uses only its declared capabilities.
3. A list of SPIR-V extensions for the target. The SPIR-V specification allows multiple vendors or parties to simultaneously extend it for their needs; this field lists the extensions supported by the target architecture. An extension indicates the availability of one or more capabilities (features such as types, ops, or enum cases).
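The way a client API or validator can consume the declared capabilities and extensions above can be sketched as follows. This is an illustrative Python model under the assumption that acceptance is a simple subset check; it is not upstream MLIR or SPIR-V tooling code, and the `module_is_acceptable` helper is hypothetical:

```python
# Illustrative model of validating a SPIR-V module against a target
# environment (#spv.target_env): the module is acceptable only if the
# target supports every capability and extension the module declares.

def module_is_acceptable(module_caps, module_exts, target_env):
    caps_ok = set(module_caps) <= set(target_env["capabilities"])
    exts_ok = set(module_exts) <= set(target_env["extensions"])
    return caps_ok and exts_ok

# Target environment mirroring (a subset of) the example below.
target_env = {
    "version": "v1.0",
    "capabilities": ["Kernel", "Addresses", "Int64", "AtomicFloat32AddEXT"],
    "extensions": ["SPV_EXT_shader_atomic_float_add"],
}

print(module_is_acceptable(["Kernel"], [], target_env))             # True
print(module_is_acceptable(["Kernel", "Float64"], [], target_env))  # False
```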
# Example

```
// -----// IR Dump Before {anonymous}::SetSPIRVCapabilitiesPass () //----- //
module attributes {gpu.container_module} {
  func.func @main() {
    %c8 = arith.constant 8 : index
    %c1 = arith.constant 1 : index
    %cst = arith.constant 2.200000e+00 : f32
    %cst_0 = arith.constant 1.100000e+00 : f32
    %cst_1 = arith.constant 0.000000e+00 : f32
    %memref = gpu.alloc () {gpu.alloc_shared} : memref<8xf32>
    %memref_2 = gpu.alloc () {gpu.alloc_shared} : memref<8xf32>
    %memref_3 = gpu.alloc () {gpu.alloc_shared} : memref<8xf32>
    %0 = memref.cast %memref : memref<8xf32> to memref<?xf32>
    %1 = memref.cast %memref_2 : memref<8xf32> to memref<?xf32>
    %2 = memref.cast %memref_3 : memref<8xf32> to memref<?xf32>
    call @fillResource1DFloat(%0, %cst_0) : (memref<?xf32>, f32) -> ()
    call @fillResource1DFloat(%1, %cst) : (memref<?xf32>, f32) -> ()
    call @fillResource1DFloat(%2, %cst_1) : (memref<?xf32>, f32) -> ()
    gpu.launch_func @main_kernel::@main_kernel blocks in (%c8, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8xf32>, %memref_2 : memref<8xf32>, %memref_3 : memref<8xf32>)
    %3 = memref.cast %memref_3 : memref<8xf32> to memref<*xf32>
    call @printMemrefF32(%3) : (memref<*xf32>) -> ()
    return
  }
```
The Pass will change the IR to:
```
// -----// IR Dump After {anonymous}::SetSPIRVCapabilitiesPass () //----- //
module attributes {gpu.container_module, spv.target_env = #spv.target_env<#spv.vce<v1.0, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR], [SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume]>, #spv.resource_limits<>>} {
  func.func @main() {
    %c8 = arith.constant 8 : index
    %c1 = arith.constant 1 : index
    %cst = arith.constant 2.200000e+00 : f32
    %cst_0 = arith.constant 1.100000e+00 : f32
    %cst_1 = arith.constant 0.000000e+00 : f32
    %memref = gpu.alloc () {gpu.alloc_shared} : memref<8xf32>
    %memref_2 = gpu.alloc () {gpu.alloc_shared} : memref<8xf32>
    %memref_3 = gpu.alloc () {gpu.alloc_shared} : memref<8xf32>
    %0 = memref.cast %memref : memref<8xf32> to memref<?xf32>
    %1 = memref.cast %memref_2 : memref<8xf32> to memref<?xf32>
    %2 = memref.cast %memref_3 : memref<8xf32> to memref<?xf32>
    call @fillResource1DFloat(%0, %cst_0) : (memref<?xf32>, f32) -> ()
    call @fillResource1DFloat(%1, %cst) : (memref<?xf32>, f32) -> ()
    call @fillResource1DFloat(%2, %cst_1) : (memref<?xf32>, f32) -> ()
    gpu.launch_func @main_kernel::@main_kernel blocks in (%c8, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8xf32>, %memref_2 : memref<8xf32>, %memref_3 : memref<8xf32>)
    %3 = memref.cast %memref_3 : memref<8xf32> to memref<*xf32>
    call @printMemrefF32(%3) : (memref<*xf32>) -> ()
    return
  }
```
As shown in the example above, the pass adds the SPIR-V capabilities as an attribute on the module.
## Reason for this Custom Pass

Upstream does not have a pass which does these conversions. This pass adds a lot of things specific to Intel GPUs, so we can perhaps keep it as a custom pass rather than upstreaming it.

> Review comment: I suggest moving this last line into the file with the implementation. This is a good example of the kind of high-level description of the flow of the implementation which I suggest to have in all files. A "user" of this pass does not care about such details, but the reader of the code greatly benefits from it.
>
> Review comment: All the limitations should be documented here. We don't want users to find them out during the debugging process. Also, for this sentence, I wonder how the upstream Vulkan runner works, since upstream MLIR doesn't have this pass.
>
> Reply: done