Commit: tests
Groverkss committed Aug 9, 2024
1 parent c72f055 commit 3e9a393
Showing 4 changed files with 154 additions and 255 deletions.
@@ -74,6 +74,7 @@ iree_lit_test_suite(
             "transpose_pipeline_test.mlir",
             "ukernel_pipeline_transform.mlir",
             "configure_vector_layout.mlir",
+            "configure_tensor_layout.mlir",
             "vector_lowering.mlir",
             "vector_to_gpu.mlir",
             "winograd_pipeline_test.mlir",
@@ -23,6 +23,7 @@ iree_lit_test_suite(
    "cast_type_to_fit_mma.mlir"
    "config_matvec.mlir"
    "config_winograd.mlir"
+    "configure_tensor_layout.mlir"
    "configure_vector_layout.mlir"
    "conv_pipeline_test_cuda.mlir"
    "conv_pipeline_test_rocm.mlir"
@@ -0,0 +1,152 @@
// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(func.func(iree-llvmgpu-configure-tensor-layouts, canonicalize, cse))' %s | FileCheck %s

#translation = #iree_codegen.translation_info<LLVMGPUVectorDistribute
                                              workgroup_size = [64, 1, 1]
                                              subgroup_size = 64,
      {mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_32x32x8_F16>,
                                             subgroup_m_count = 1,
                                             subgroup_n_count = 1>}>

#maps = [
  affine_map<(m, n, k) -> (m, k)>,
  affine_map<(m, n, k) -> (n, k)>,
  affine_map<(m, n, k) -> (m, n)>
]

#traits = {
  indexing_maps = #maps,
  iterator_types = ["parallel", "parallel", "reduction"]
}
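
// These maps and traits spell out C[m, n] += extf(A[m, k]) * extf(B[n, k]):
// an f16 -> f32 matmul whose RHS is indexed (n, k), i.e. transposed-B.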

func.func @matmul_96x64x16(%lhs: tensor<96x16xf16>,
                           %rhs: tensor<64x16xf16>,
                           %init: tensor<96x64xf32>)
    -> tensor<96x64xf32>
    attributes { translation_info = #translation } {
  %out = linalg.generic #traits
      ins(%lhs, %rhs: tensor<96x16xf16>, tensor<64x16xf16>)
      outs(%init: tensor<96x64xf32>) {
    ^bb0(%in: f16, %in_1: f16, %out: f32):
      %ex = arith.extf %in : f16 to f32
      %ex_1 = arith.extf %in_1 : f16 to f32
      %mul = arith.mulf %ex, %ex_1 : f32
      %sum = arith.addf %out, %mul : f32
      linalg.yield %sum : f32
  } -> tensor<96x64xf32>
  return %out : tensor<96x64xf32>
}

// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 1], batches_per_subgroup = [3, 2], outers_per_batch = [1, 1], threads_per_outer = [32, 2], elements_per_thread = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 32]>
// CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 1], batches_per_subgroup = [2, 2], outers_per_batch = [1, 1], threads_per_outer = [32, 2], elements_per_thread = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 32]>
// CHECK-DAG: #[[$NESTED2:.+]] = #iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 1], batches_per_subgroup = [3, 2], outers_per_batch = [4, 1], threads_per_outer = [2, 32], elements_per_thread = [4, 1], subgroup_strides = [0, 0], thread_strides = [32, 1]>

// CHECK-LABEL: func.func @matmul_96x64x16

// CHECK-DAG: %[[LHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED]] {shared_memory_conversion}
// CHECK-DAG: %[[RHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED1]] {shared_memory_conversion}
// CHECK-DAG: %[[ACC:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED2]]
// CHECK: linalg.generic
// CHECK-SAME: ins(%[[LHS]], %[[RHS]]
// CHECK-SAME: outs(%[[ACC]]
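
// Each dimension of a nested layout multiplies out as
// subgroups * batches * outers * threads * elements, so the layouts above
// recover the operand shapes: LHS (1*3*1*32*1) x (1*2*1*2*4) = 96x16,
// RHS (1*2*1*32*1) x (1*2*1*2*4) = 64x16, and
// ACC (1*3*4*2*4) x (1*2*1*32*1) = 96x64.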

// -----

#translation = #iree_codegen.translation_info<LLVMGPUVectorDistribute
                                              workgroup_size = [64, 1, 1]
                                              subgroup_size = 64,
      {mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
                                             subgroup_m_count = 4,
                                             subgroup_n_count = 1>}>

#maps = [
  affine_map<(m, n, k) -> (m, k)>,
  affine_map<(m, n, k) -> (n, k)>,
  affine_map<(m, n, k) -> (m, n)>
]

#traits = {
  indexing_maps = #maps,
  iterator_types = ["parallel", "parallel", "reduction"]
}

func.func @matmul_128x64x16_multi_subgroup(%lhs: tensor<128x16xf16>,
                                           %rhs: tensor<64x16xf16>,
                                           %init: tensor<128x64xf32>)
    -> tensor<128x64xf32>
    attributes { translation_info = #translation } {
  %out = linalg.generic #traits
      ins(%lhs, %rhs: tensor<128x16xf16>, tensor<64x16xf16>)
      outs(%init: tensor<128x64xf32>) {
    ^bb0(%in: f16, %in_1: f16, %out: f32):
      %ex = arith.extf %in : f16 to f32
      %ex_1 = arith.extf %in_1 : f16 to f32
      %mul = arith.mulf %ex, %ex_1 : f32
      %sum = arith.addf %out, %mul : f32
      linalg.yield %sum : f32
  } -> tensor<128x64xf32>
  return %out : tensor<128x64xf32>
}

// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout<subgroups_per_workgroup = [4, 1]
// CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout<subgroups_per_workgroup = [1, 1]
// CHECK-DAG: #[[$NESTED2:.+]] = #iree_vector_ext.nested_layout<subgroups_per_workgroup = [4, 1]

// CHECK-LABEL: func.func @matmul_128x64x16_multi_subgroup

// CHECK-DAG: %[[LHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED]] {shared_memory_conversion}
// CHECK-DAG: %[[RHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED1]] {shared_memory_conversion}
// CHECK-DAG: %[[ACC:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED2]]
// CHECK: linalg.generic
// CHECK-SAME: ins(%[[LHS]], %[[RHS]]
// CHECK-SAME: outs(%[[ACC]]
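
// With subgroup_m_count = 4, the M dimension of the LHS and ACC is split
// across four subgroups (128 = 4 * 32 rows each), while the RHS keeps
// subgroups_per_workgroup = [1, 1] and is shared by all of them.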

// -----

#translation = #iree_codegen.translation_info<LLVMGPUVectorDistribute
                                              workgroup_size = [64, 1, 1]
                                              subgroup_size = 64,
      {mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
                                             subgroup_m_count = 2,
                                             subgroup_n_count = 2>}>

#maps = [
  affine_map<(bm, bn, m, n, k) -> (bm, m, k)>,
  affine_map<(bm, bn, m, n, k) -> (bn, n, k)>,
  affine_map<(bm, bn, m, n, k) -> (bm, m, bn, n)>
]

#traits = {
  indexing_maps = #maps,
  iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]
}

func.func @packed_matmul_128x128x128(%lhs: tensor<8x16x16xf16>,
                                     %rhs: tensor<8x16x16xf16>,
                                     %init: tensor<8x16x8x16xf32>)
    -> tensor<8x16x8x16xf32>
    attributes { translation_info = #translation } {
  %out = linalg.generic #traits
      ins(%lhs, %rhs: tensor<8x16x16xf16>, tensor<8x16x16xf16>)
      outs(%init: tensor<8x16x8x16xf32>) {
    ^bb0(%in: f16, %in_1: f16, %out: f32):
      %ex = arith.extf %in : f16 to f32
      %ex_1 = arith.extf %in_1 : f16 to f32
      %mul = arith.mulf %ex, %ex_1 : f32
      %sum = arith.addf %out, %mul : f32
      linalg.yield %sum : f32
  } -> tensor<8x16x8x16xf32>
  return %out : tensor<8x16x8x16xf32>
}


// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout<subgroups_per_workgroup = [2, 1, 1], batches_per_subgroup = [4, 1, 1], outers_per_batch = [1, 1, 1], threads_per_outer = [1, 16, 4], elements_per_thread = [1, 1, 4], subgroup_strides = [2, 0, 0], thread_strides = [0, 1, 16]>
// CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout<subgroups_per_workgroup = [2, 1, 1], batches_per_subgroup = [4, 1, 1], outers_per_batch = [1, 1, 1], threads_per_outer = [1, 16, 4], elements_per_thread = [1, 1, 4], subgroup_strides = [1, 0, 0], thread_strides = [0, 1, 16]>
// CHECK-DAG: #[[$NESTED2:.+]] = #iree_vector_ext.nested_layout<subgroups_per_workgroup = [2, 1, 2, 1], batches_per_subgroup = [4, 1, 4, 1], outers_per_batch = [1, 1, 1, 1], threads_per_outer = [1, 4, 1, 16], elements_per_thread = [1, 4, 1, 1], subgroup_strides = [2, 0, 1, 0], thread_strides = [0, 16, 0, 1]>
// CHECK-LABEL: func.func @packed_matmul_128x128x128

// CHECK-DAG: %[[LHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED]] {shared_memory_conversion}
// CHECK-DAG: %[[RHS:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED1]] {shared_memory_conversion}
// CHECK-DAG: %[[ACC:.+]] = iree_vector_ext.to_layout %{{.*}} to #[[$NESTED2]]
// CHECK: linalg.generic
// CHECK-SAME: ins(%[[LHS]], %[[RHS]]
// CHECK-SAME: outs(%[[ACC]]
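
// The packed ACC layout spreads the two block dimensions over a 2x2 subgroup
// grid: subgroup_strides = [2, 0, 1, 0] walks the M-blocks with stride 2 and
// the N-blocks with stride 1, and the per-dimension products
// (2*4) x (4*4) x (2*4) x (1*16) = 8x16x8x16 match tensor<8x16x8x16xf32>.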