tests

iree-org · Aug 9, 2024 · 3e9a393 · 3e9a393
1 parent c72f055
commit 3e9a393
Show file tree

Hide file tree

Showing 4 changed files with 154 additions and 255 deletions.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
@@ -74,6 +74,7 @@ iree_lit_test_suite(
             "transpose_pipeline_test.mlir",
             "ukernel_pipeline_transform.mlir",
             "configure_vector_layout.mlir",
+            "configure_tensor_layout.mlir",
             "vector_lowering.mlir",
             "vector_to_gpu.mlir",
             "winograd_pipeline_test.mlir",

diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
@@ -23,6 +23,7 @@ iree_lit_test_suite(
     "cast_type_to_fit_mma.mlir"
     "config_matvec.mlir"
     "config_winograd.mlir"
+    "configure_tensor_layout.mlir"
     "configure_vector_layout.mlir"
     "conv_pipeline_test_cuda.mlir"
     "conv_pipeline_test_rocm.mlir"

diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_tensor_layout.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_tensor_layout.mlir
@@ -0,0 +1,152 @@
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(func.func(iree-llvmgpu-configure-tensor-layouts, canonicalize, cse))' %s | FileCheck %s
+
+#translation = #iree_codegen.translation_info<LLVMGPUVectorDistribute
+                                              workgroup_size = [64, 1, 1]
+                                              subgroup_size = 64,
+      {mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_32x32x8_F16>,
+                                             subgroup_m_count = 1,
+                                             subgroup_n_count = 1>}>
+
+#maps = [
+  affine_map<(m, n, k) -> (m, k)>,
+  affine_map<(m, n, k) -> (n, k)>,
+  affine_map<(m, n, k) -> (m, n)>
+]
+
+#traits = {
+  indexing_maps = #maps,
+  iterator_types = ["parallel", "parallel", "reduction"]
+}
+
+func.func @matmul_96x64x16(%lhs: tensor<96x16xf16>,
+                           %rhs: tensor<64x16xf16>,
+                           %init: tensor<96x64xf32>)
+                           -> tensor<96x64xf32>
+                           attributes { translation_info = #translation } {
+  %out = linalg.generic #traits
+                        ins(%lhs, %rhs: tensor<96x16xf16>, tensor<64x16xf16>)
+                        outs(%init: tensor<96x64xf32>) {
+    ^bb0(%in: f16, %in_1: f16, %out: f32):
+      %ex   = arith.extf %in   : f16 to f32
+      %ex_1 = arith.extf %in_1 : f16 to f32
+      %mul  = arith.mulf %ex, %ex_1 : f32
+      %sum  = arith.addf %out, %mul : f32
+      linalg.yield %sum : f32
+  } -> tensor<96x64xf32>
+  return %out : tensor<96x64xf32>
+}
+
+// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 1], batches_per_subgroup = [3, 2], outers_per_batch = [1, 1], threads_per_outer = [32, 2], elements_per_thread = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 32]>
+// CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 1], batches_per_subgroup = [2, 2], outers_per_batch = [1, 1], threads_per_outer = [32, 2], elements_per_thread = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 32]>
+// CHECK-DAG: #[[$NESTED2:.+]] = #iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 1], batches_per_subgroup = [3, 2], outers_per_batch = [4, 1], threads_per_outer = [2, 32], elements_per_thread = [4, 1], subgroup_strides = [0, 0], thread_strides = [32, 1]>
+
+// CHECK-LABEL: func.func @matmul_96x64x16
+
+// CHECK-DAG: %[[LHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED]] {shared_memory_conversion}
+// CHECK-DAG: %[[RHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED1]] {shared_memory_conversion}
+// CHECK-DAG: %[[ACC:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED2]]
+// CHECK: linalg.generic
+// CHECK-SAME: ins(%[[LHS]], %[[RHS]]
+// CHECK-SAME: outs(%[[ACC]]
+
+// -----
+
+#translation = #iree_codegen.translation_info<LLVMGPUVectorDistribute
+                                              workgroup_size = [64, 1, 1]
+                                              subgroup_size = 64,
+      {mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
+                                             subgroup_m_count = 4,
+                                             subgroup_n_count = 1>}>
+
+#maps = [
+  affine_map<(m, n, k) -> (m, k)>,
+  affine_map<(m, n, k) -> (n, k)>,
+  affine_map<(m, n, k) -> (m, n)>
+]
+
+#traits = {
+  indexing_maps = #maps,
+  iterator_types = ["parallel", "parallel", "reduction"]
+}
+
+func.func @matmul_128x64x16_multi_subgroup(%lhs: tensor<128x16xf16>,
+                                          %rhs: tensor<64x16xf16>,
+                                          %init: tensor<128x64xf32>)
+                                          -> tensor<128x64xf32>
+                           attributes { translation_info = #translation } {
+  %out = linalg.generic #traits
+                        ins(%lhs, %rhs: tensor<128x16xf16>, tensor<64x16xf16>)
+                        outs(%init: tensor<128x64xf32>) {
+    ^bb0(%in: f16, %in_1: f16, %out: f32):
+      %ex   = arith.extf %in   : f16 to f32
+      %ex_1 = arith.extf %in_1 : f16 to f32
+      %mul  = arith.mulf %ex, %ex_1 : f32
+      %sum  = arith.addf %out, %mul : f32
+      linalg.yield %sum : f32
+  } -> tensor<128x64xf32>
+  return %out : tensor<128x64xf32>
+}
+
+// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout<subgroups_per_workgroup = [4, 1]
+// CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 1]
+// CHECK-DAG: #[[$NESTED2:.+]] = #iree_vector_ext.nested_layout<subgroups_per_workgroup = [4, 1]
+
+// CHECK-LABEL: func.func @matmul_128x64x16_multi_subgroup
+
+// CHECK-DAG: %[[LHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED]] {shared_memory_conversion}
+// CHECK-DAG: %[[RHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED1]] {shared_memory_conversion}
+// CHECK-DAG: %[[ACC:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED2]]
+// CHECK: linalg.generic
+// CHECK-SAME: ins(%[[LHS]], %[[RHS]]
+// CHECK-SAME: outs(%[[ACC]]
+
+// -----
+
+#translation = #iree_codegen.translation_info<LLVMGPUVectorDistribute
+                                              workgroup_size = [64, 1, 1]
+                                              subgroup_size = 64,
+      {mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
+                                             subgroup_m_count = 2,
+                                             subgroup_n_count = 2>}>
+
+#maps = [
+  affine_map<(bm, bn, m, n, k) -> (bm, m, k)>,
+  affine_map<(bm, bn, m, n, k) -> (bn, n, k)>,
+  affine_map<(bm, bn, m, n, k) -> (bm, m, bn, n)>
+]
+
+#traits = {
+  indexing_maps = #maps,
+  iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]
+}
+
+func.func @packed_matmul_128x128x128(%lhs: tensor<8x16x16xf16>,
+                                     %rhs: tensor<8x16x16xf16>,
+                                     %init: tensor<8x16x8x16xf32>)
+                                     -> tensor<8x16x8x16xf32>
+                           attributes { translation_info = #translation } {
+  %out = linalg.generic #traits
+                        ins(%lhs, %rhs: tensor<8x16x16xf16>, tensor<8x16x16xf16>)
+                        outs(%init: tensor<8x16x8x16xf32>) {
+    ^bb0(%in: f16, %in_1: f16, %out: f32):
+      %ex   = arith.extf %in   : f16 to f32
+      %ex_1 = arith.extf %in_1 : f16 to f32
+      %mul  = arith.mulf %ex, %ex_1 : f32
+      %sum  = arith.addf %out, %mul : f32
+      linalg.yield %sum : f32
+  } -> tensor<8x16x8x16xf32>
+  return %out : tensor<8x16x8x16xf32>
+}
+
+
+// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout<subgroups_per_workgroup = [2, 1, 1], batches_per_subgroup = [4, 1, 1], outers_per_batch = [1, 1, 1], threads_per_outer = [1, 16, 4], elements_per_thread = [1, 1, 4], subgroup_strides = [2, 0, 0], thread_strides = [0, 1, 16]>
+// CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout<subgroups_per_workgroup = [2, 1, 1], batches_per_subgroup = [4, 1, 1], outers_per_batch = [1, 1, 1], threads_per_outer = [1, 16, 4], elements_per_thread = [1, 1, 4], subgroup_strides = [1, 0, 0], thread_strides = [0, 1, 16]>
+// CHECK-DAG: #[[$NESTED2:.+]] = #iree_vector_ext.nested_layout<subgroups_per_workgroup = [2, 1, 2, 1], batches_per_subgroup = [4, 1, 4, 1], outers_per_batch = [1, 1, 1, 1], threads_per_outer = [1, 4, 1, 16], elements_per_thread = [1, 4, 1, 1], subgroup_strides = [2, 0, 1, 0], thread_strides = [0, 16, 0, 1]>
+// CHECK-LABEL: func.func @packed_matmul_128x128x128
+
+// CHECK-DAG: %[[LHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED]] {shared_memory_conversion}
+// CHECK-DAG: %[[RHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED1]] {shared_memory_conversion}
+// CHECK-DAG: %[[ACC:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED2]]
+// CHECK: linalg.generic
+// CHECK-SAME: ins(%[[LHS]], %[[RHS]]
+// CHECK-SAME: outs(%[[ACC]]