Commit
[Codegen] Add vector transfer + slice foldings in GenericVectorization (#17613)

Vectorizing a `linalg.copy` op can result in a sequence of
```
%extract = tensor.extract_slice %source
%read = vector.transfer_read %extract
%write = vector.transfer_write %read, %dest
%insert = tensor.insert_slice %write into %dest
```
Because the transfer_write is an identity copy of the transfer_read, the
`vector.transfer_write` folder collapses this sequence into
```
%extract = tensor.extract_slice %source
%insert = tensor.insert_slice %extract into %dest
```
To preserve the vector transfers, this PR adds folding patterns for
vector transfer ops acting on insert/extract slice ops. These patterns
fold the `tensor.insert_slice` into the `vector.transfer_write` and the
`tensor.extract_slice` into the `vector.transfer_read`, so the vector
transfers themselves survive.
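
As a rough illustration (a hand-written sketch, not taken from the PR;
the indices `%off0`/`%off1` and padding value `%pad` are hypothetical),
the patterns rewrite the transfers to act directly on the original
tensors, absorbing the slice offsets into the transfer indices:
```
%read = vector.transfer_read %source[%off0, %off1], %pad
%write = vector.transfer_write %read, %dest[%off0, %off1]
```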

This folding is turned off for the vector distribution pipeline (via the
new `early-subset-transfer-folding` pass option) because it causes
distribution to fail in some cases.

This PR also removes `Codegen/LLVMGPU/test/conv_pipeline_test_rocm.mlir`,
completing a TODO to delete the test once some undesired extra buffers
had been eliminated.

---------

Signed-off-by: Max Dawkins <[email protected]>
Max191 authored Jul 19, 2024
1 parent 3e3d9da commit 8b83425
Showing 11 changed files with 100 additions and 114 deletions.
@@ -322,6 +322,8 @@ class GenericVectorizationPass
this->generateContract.setValue(options.generateContract);
this->foldCastIntoContract.setValue(options.foldCastIntoContract);
this->maxVectorSize.setValue(options.maxVectorSize);
this->earlySubsetTransferFolding.setValue(
options.earlySubsetTransferFolding);
}

void getDependentDialects(DialectRegistry &registry) const override {
@@ -384,8 +386,17 @@ void GenericVectorizationPass::runOnOperation() {
};

{
// Canonicalize mask related ops before we lower them.
// Canonicalize mask related ops before we lower them. Also run patterns
// for vector transfers on tensor subset ops, since they can be folded if
// not handled here.
RewritePatternSet maskCanonPatterns(funcOp.getContext());
if (earlySubsetTransferFolding) {
// It is important to add these vector transfer on tensor subset patterns
// in the first greedy pattern rewrite, since transfer foldings can remove
// vectorized reads and writes by folding them into tensor ops.
tensor::populateFoldTensorSubsetIntoVectorTransferPatterns(
maskCanonPatterns);
}
vector::CreateMaskOp::getCanonicalizationPatterns(maskCanonPatterns,
funcOp.getContext());
vector::ConstantMaskOp::getCanonicalizationPatterns(maskCanonPatterns,
2 changes: 2 additions & 0 deletions compiler/src/iree/compiler/Codegen/Common/Passes.h
@@ -174,6 +174,8 @@ struct GenericVectorizationPassOptions
bool foldCastIntoContract = false;
// Max vector size allowed to avoid creating large vectors.
int64_t maxVectorSize = std::numeric_limits<int64_t>::max();
// Enable early folding of tensor subset ops into vector transfer ops.
bool earlySubsetTransferFolding = true;
};
/// Creates a pass to perform vectorization on LinAlg and tensor ops.
std::unique_ptr<InterfacePass<FunctionOpInterface>>
4 changes: 3 additions & 1 deletion compiler/src/iree/compiler/Codegen/Common/Passes.td
@@ -288,7 +288,9 @@ def GenericVectorization :
"Enable folding casting ops into vector.contract.">,
Option<"maxVectorSize", "max-vector-size", "int64_t",
/*default=*/"2147483647",
"Max vector size allowed to avoid creating large vectors.">
"Max vector size allowed to avoid creating large vectors.">,
Option<"earlySubsetTransferFolding", "early-subset-transfer-folding", "bool",/*default=*/"true",
"Enable early folding of tensor subset ops into vector transfer ops.">
];
let constructor =
"mlir::iree_compiler::createGenericVectorizationPass()";
@@ -64,12 +64,12 @@ func.func @single_static_pack_infer_vector_size(%arg0: tensor<101x201xi8>, %arg1
// CHECK-LABEL: func.func @single_static_pack_infer_vector_size
// CHECK: tensor.pack

// CHECK-MASK: #[[$MAP0:.+]] = affine_map<(d0) -> (-d0 + 13, 2)>
// CHECK-MASK: #[[$MAP1:.+]] = affine_map<(d0) -> (-d0 + 51, 4)>
// CHECK-MASK: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 * 2)>
// CHECK-MASK: #[[$MAP3:.+]] = affine_map<(d0, d1) -> (d1 * -2 + 101, d0 * 2)>
// CHECK-MASK: #[[$MAP4:.+]] = affine_map<(d0) -> (d0 * 16)>
// CHECK-MASK: #[[$MAP5:.+]] = affine_map<(d0, d1) -> (d1 * -16 + 201, d0 * 16)>
// CHECK-MASK-DAG: #[[$MAP0:.+]] = affine_map<(d0) -> (-d0 + 13, 2)>
// CHECK-MASK-DAG: #[[$MAP1:.+]] = affine_map<(d0) -> (-d0 + 51, 4)>
// CHECK-MASK-DAG: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 * 2)>
// CHECK-MASK-DAG: #[[$MAP3:.+]] = affine_map<(d0, d1) -> (d1 * -2 + 101, d0 * 2)>
// CHECK-MASK-DAG: #[[$MAP4:.+]] = affine_map<(d0) -> (d0 * 16)>
// CHECK-MASK-DAG: #[[$MAP5:.+]] = affine_map<(d0, d1) -> (d1 * -16 + 201, d0 * 16)>
// CHECK-MASK-LABEL: func.func @single_static_pack_infer_vector_size
// CHECK-MASK-SAME: %[[SRC:[a-zA-Z0-9]+]]
// CHECK-MASK: %[[C0:.+]] = arith.constant 0 : i8
@@ -79,9 +79,8 @@ func.func @single_static_pack_infer_vector_size(%arg0: tensor<101x201xi8>, %arg1
// CHECK-MASK: %[[WRITE_SZ1:.+]] = affine.min #[[$MAP1]]
// CHECK-MASK: %[[READ_SZ0:.+]] = affine.min #[[$MAP3]]
// CHECK-MASK: %[[READ_SZ1:.+]] = affine.min #[[$MAP5]]
// CHECK-MASK: %[[SLICE:.+]] = tensor.extract_slice %[[SRC]][{{.+}}] [%[[READ_SZ0]], %[[READ_SZ1]]]
// CHECK-MASK: %[[READ_MASK:.+]] = vector.create_mask %[[READ_SZ0]], %[[READ_SZ1]] : vector<8x32xi1>
// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SLICE]][%{{.+}}], %[[C0]], %[[READ_MASK]]
// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SRC]][%{{.+}}], %[[C0]], %[[READ_MASK]]
// CHECK-MASK: %[[CAST:.+]] = vector.shape_cast %[[READ]] : vector<8x32xi8> to vector<4x2x2x16xi8>
// CHECK-MASK: %[[TRANSP:.+]] = vector.transpose %[[CAST]], [2, 0, 3, 1]
// CHECK-MASK: %[[EMPTY:.+]] = tensor.empty(%[[WRITE_SZ0]], %[[WRITE_SZ1]]) : tensor<?x?x16x2xi8>
@@ -130,12 +129,12 @@ func.func @single_dynamic_pack_infer_vector_size(%arg0: tensor<?x?xi8>, %arg1: t
// CHECK-LABEL: func.func @single_dynamic_pack_infer_vector_size
// CHECK: tensor.pack

// CHECK-MASK: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)>
// CHECK-MASK: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 4)>
// CHECK-MASK: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 * 2)>
// CHECK-MASK: #[[$MAP3:.+]] = affine_map<(d0, d1)[s0] -> (d1 * -2 + s0, d0 * 2)>
// CHECK-MASK: #[[$MAP4:.+]] = affine_map<(d0) -> (d0 * 16)>
// CHECK-MASK: #[[$MAP5:.+]] = affine_map<(d0, d1)[s0] -> (d1 * -16 + s0, d0 * 16)>
// CHECK-MASK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)>
// CHECK-MASK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 4)>
// CHECK-MASK-DAG: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 * 2)>
// CHECK-MASK-DAG: #[[$MAP3:.+]] = affine_map<(d0, d1)[s0] -> (d1 * -2 + s0, d0 * 2)>
// CHECK-MASK-DAG: #[[$MAP4:.+]] = affine_map<(d0) -> (d0 * 16)>
// CHECK-MASK-DAG: #[[$MAP5:.+]] = affine_map<(d0, d1)[s0] -> (d1 * -16 + s0, d0 * 16)>
// CHECK-MASK-LABEL: func.func @single_dynamic_pack_infer_vector_size
// CHECK-MASK-SAME: %[[SRC:[a-zA-Z0-9]+]]
// CHECK-MASK: %[[C0:.+]] = arith.constant 0 : i8
@@ -145,9 +144,8 @@ func.func @single_dynamic_pack_infer_vector_size(%arg0: tensor<?x?xi8>, %arg1: t
// CHECK-MASK: %[[WRITE_SZ1:.+]] = affine.min #[[$MAP1]]
// CHECK-MASK: %[[READ_SZ0:.+]] = affine.min #[[$MAP3]]
// CHECK-MASK: %[[READ_SZ1:.+]] = affine.min #[[$MAP5]]
// CHECK-MASK: %[[SLICE:.+]] = tensor.extract_slice %[[SRC]][{{.+}}] [%[[READ_SZ0]], %[[READ_SZ1]]]
// CHECK-MASK: %[[READ_MASK:.+]] = vector.create_mask %[[READ_SZ0]], %[[READ_SZ1]] : vector<8x32xi1>
// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SLICE]][%{{.+}}], %[[C0]], %[[READ_MASK]]
// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SRC]][%{{.+}}], %[[C0]], %[[READ_MASK]]
// CHECK-MASK: %[[CAST:.+]] = vector.shape_cast %[[READ]] : vector<8x32xi8> to vector<4x2x2x16xi8>
// CHECK-MASK: %[[TRANSP:.+]] = vector.transpose %[[CAST]], [2, 0, 3, 1]
// CHECK-MASK: %[[EMPTY:.+]] = tensor.empty(%[[WRITE_SZ0]], %[[WRITE_SZ1]]) : tensor<?x?x16x2xi8>
@@ -204,13 +202,13 @@ func.func @generic_pack_infer_vector_size(%arg0: tensor<?x32x128xf32>) -> tensor
}
return %3 : tensor<32x?x64x16x2xbf16>
}
// CHECK-MASK: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)>
// CHECK-MASK: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (4, -d0 + s0 ceildiv 16)>
// CHECK-MASK: #[[$MAP2:.+]] = affine_map<(d0) -> (-d0 + 64, 6)>
// CHECK-MASK: #[[$MAP3:.+]] = affine_map<(d0, d1) -> (d1 * -2 + 128, d0 * 2)>
// CHECK-MASK: #[[$MAP4:.+]] = affine_map<(d0, d1)[s0] -> (d1 * -16 + s0, d0 * 16)>
// CHECK-MASK: #[[$MAP5:.+]] = affine_map<(d0) -> (d0 * 16)>
// CHECK-MASK: #[[$MAP6:.+]] = affine_map<(d0) -> (d0 * 2)>
// CHECK-MASK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)>
// CHECK-MASK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (4, -d0 + s0 ceildiv 16)>
// CHECK-MASK-DAG: #[[$MAP2:.+]] = affine_map<(d0) -> (-d0 + 64, 6)>
// CHECK-MASK-DAG: #[[$MAP3:.+]] = affine_map<(d0, d1) -> (d1 * -2 + 128, d0 * 2)>
// CHECK-MASK-DAG: #[[$MAP4:.+]] = affine_map<(d0, d1)[s0] -> (d1 * -16 + s0, d0 * 16)>
// CHECK-MASK-DAG: #[[$MAP5:.+]] = affine_map<(d0) -> (d0 * 16)>
// CHECK-MASK-DAG: #[[$MAP6:.+]] = affine_map<(d0) -> (d0 * 2)>
// CHECK-MASK-LABEL: func.func @generic_pack_infer_vector_size
// CHECK-MASK-SAME: %[[SRC:[a-zA-Z0-9]+]]
// CHECK-MASK-DAG: %[[C0_BF16:.+]] = arith.constant 0.000000e+00 : bf16
@@ -229,9 +227,8 @@ func.func @generic_pack_infer_vector_size(%arg0: tensor<?x32x128xf32>) -> tensor
// CHECK-MASK-DAG: %[[SRC_SZ0:.+]] = affine.min #[[$MAP4]]
// CHECK-MASK-DAG: %[[SRC_SZ2:.+]] = affine.min #[[$MAP3]]
// CHECK-MASK-DAG: %[[ITER_SLICE:.+]] = tensor.extract_slice %[[GENERIC_EMPTY]]
// CHECK-MASK-DAG: %[[SRC_SLICE:.+]] = tensor.extract_slice %[[SRC]][{{.+}}] [%[[SRC_SZ0]], 2, %[[SRC_SZ2]]]
// CHECK-MASK-DAG: %[[READ_MASK:.+]] = vector.create_mask %[[SRC_SZ0]], %[[C2]], %[[SRC_SZ2]] : vector<64x2x12xi1>
// CHECK-MASK: %[[GENERIC_READ:.+]] = vector.transfer_read %[[SRC_SLICE]]{{.+}} %[[READ_MASK]]
// CHECK-MASK: %[[GENERIC_READ:.+]] = vector.transfer_read %[[SRC]]{{.+}} %[[READ_MASK]]
// CHECK-MASK-DAG: %[[WRITE_MASK:.+]] = vector.create_mask %[[C2]], %[[SRC_SZ2]], %[[SRC_SZ0]] : vector<2x12x64xi1>
// CHECK-MASK: %[[TRUNC:.+]] = arith.truncf %[[GENERIC_READ]]
// CHECK-MASK: %[[TRANSP:.+]] = vector.transpose %[[TRUNC]], [1, 2, 0]
@@ -278,10 +275,10 @@ func.func @single_dynamic_unpack_infer_vector_size(%arg0: tensor<?x?x16x16xf32>,
}
return %0 : tensor<?x?xf32>
}
// CHECK-MASK: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (16, -d0 + s0)>
// CHECK-MASK: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (32, -d0 + s0)>
// CHECK-MASK: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 floordiv 16)>
// CHECK-MASK: #[[$MAP3:.+]] = affine_map<(d0) -> (d0 ceildiv 16)>
// CHECK-MASK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (16, -d0 + s0)>
// CHECK-MASK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (32, -d0 + s0)>
// CHECK-MASK-DAG: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 floordiv 16)>
// CHECK-MASK-DAG: #[[$MAP3:.+]] = affine_map<(d0) -> (d0 ceildiv 16)>
// CHECK-MASK-LABEL: func.func @single_dynamic_unpack_infer_vector_size
// CHECK-MASK-SAME: %[[SRC:[a-zA-Z0-9]+]]
// CHECK-MASK-DAG: %[[C0:.+]] = arith.constant 0 : index
@@ -292,9 +289,8 @@ func.func @single_dynamic_unpack_infer_vector_size(%arg0: tensor<?x?x16x16xf32>,
// CHECK-MASK-DAG: %[[DEST_SZ0:.+]] = affine.min #[[$MAP0]]
// CHECK-MASK-DAG: %[[DEST_SZ1:.+]] = affine.min #[[$MAP1]]
// CHECK-MASK-DAG: %[[SRC_SZ1:.+]] = affine.apply #[[$MAP3]]
// CHECK-MASK: %[[SRC_SLICE:.+]] = tensor.extract_slice %[[SRC]]
// CHECK-MASK: %[[READ_MASK:.+]] = vector.create_mask %[[C1]], %[[SRC_SZ1]], %[[C16]], %[[C16]] : vector<1x2x16x16xi1>
// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SRC_SLICE]]{{.+}}, %[[READ_MASK]]
// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SRC]]{{.+}}, %[[READ_MASK]]
// CHECK-MASK: %[[TRANSP:.+]] = vector.transpose %[[READ]], [0, 2, 1, 3]
// CHECK-MASK: %[[SHAPE_CAST:.+]] = vector.shape_cast %[[TRANSP]] : vector<1x16x2x16xf32> to vector<16x32xf32>
// CHECK-MASK: %[[EMPTY:.+]] = tensor.empty(%[[DEST_SZ0]], %[[DEST_SZ1]]) : tensor<?x?xf32>
@@ -338,10 +334,10 @@ func.func @generic_unpack_infer_vector_size(%arg0: tensor<?x?x16x16xf32>, %arg1:
}
return %0 : tensor<?x?xf32>
}
// CHECK-MASK: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 16)>
// CHECK-MASK: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 32)>
// CHECK-MASK: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 floordiv 16)>
// CHECK-MASK: #[[$MAP3:.+]] = affine_map<(d0) -> (d0 ceildiv 16)>
// CHECK-MASK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 16)>
// CHECK-MASK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 32)>
// CHECK-MASK-DAG: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 floordiv 16)>
// CHECK-MASK-DAG: #[[$MAP3:.+]] = affine_map<(d0) -> (d0 ceildiv 16)>
// CHECK-MASK-LABEL: func.func @generic_unpack_infer_vector_size
// CHECK-MASK-SAME: %[[SRC:[a-zA-Z0-9]+]]
// CHECK-MASK-DAG: %[[C0:.+]] = arith.constant 0 : index
@@ -352,9 +348,8 @@ func.func @generic_unpack_infer_vector_size(%arg0: tensor<?x?x16x16xf32>, %arg1:
// CHECK-MASK-DAG: %[[DEST_SZ0:.+]] = affine.min #[[$MAP0]]
// CHECK-MASK-DAG: %[[DEST_SZ1:.+]] = affine.min #[[$MAP1]]
// CHECK-MASK-DAG: %[[SRC_SZ1:.+]] = affine.apply #[[$MAP3]]
// CHECK-MASK: %[[SRC_SLICE:.+]] = tensor.extract_slice %[[SRC]]
// CHECK-MASK: %[[READ_MASK:.+]] = vector.create_mask %[[C1]], %[[SRC_SZ1]], %[[C16]], %[[C16]] : vector<1x2x16x16xi1>
// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SRC_SLICE]]{{.+}}, %[[READ_MASK]]
// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SRC]]{{.+}}, %[[READ_MASK]]
// CHECK-MASK: %[[TRANSP:.+]] = vector.transpose %[[READ]], [0, 2, 1, 3]
// CHECK-MASK: %[[SHAPE_CAST:.+]] = vector.shape_cast %[[TRANSP]] : vector<1x16x2x16xf32> to vector<16x32xf32>
// CHECK-MASK: %[[EMPTY:.+]] = tensor.empty(%[[DEST_SZ0]], %[[DEST_SZ1]]) : tensor<?x?xf32>
@@ -404,4 +399,46 @@ func.func @dynamic_fill_with_scalable_tiling_infer_vector_size(%arg0: tensor<1x6
// CHECK-MASK: scf.for
// CHECK-MASK: scf.for
// CHECK-MASK: scf.for
// CHECK-MASK: vector.transfer_write %[[CST]], {{.*}} {in_bounds = [true, true, true, true]} : vector<1x1x4x[4]xf32>, tensor<1x1x4x?xf32>
// CHECK-MASK: vector.transfer_write %[[CST]], {{.*}} {in_bounds = [true, true, true, true]} : vector<1x1x4x[4]xf32>, tensor<1x67x120x176xf32>

// -----

#map = affine_map<(d0)[s0] -> (-d0 + s0, 16)>
#map1 = affine_map<(d0)[s0] -> (-d0 + s0, 32)>
func.func @tiled_linalg_copy(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>) -> tensor<?x?xf32> {
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%dim = tensor.dim %arg1, %c0 : tensor<?x?xf32>
%dim_0 = tensor.dim %arg1, %c1 : tensor<?x?xf32>
%0 = scf.for %arg3 = %c0 to %dim step %c16 iter_args(%arg4 = %arg1) -> (tensor<?x?xf32>) {
%1 = scf.for %arg5 = %c0 to %dim_0 step %c32 iter_args(%arg6 = %arg4) -> (tensor<?x?xf32>) {
%2 = affine.min #map(%arg3)[%dim]
%3 = affine.min #map1(%arg5)[%dim_0]
%extracted_slice_0 = tensor.extract_slice %arg0[%arg3, %arg5] [%2, %3] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg1[%arg3, %arg5] [%2, %3] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
%copy = linalg.copy ins(%extracted_slice_0 : tensor<?x?xf32>) outs(%extracted_slice_1 : tensor<?x?xf32>) -> tensor<?x?xf32>
%inserted_slice = tensor.insert_slice %copy into %arg6[%arg3, %arg5] [%2, %3] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
scf.yield %inserted_slice : tensor<?x?xf32>
}
scf.yield %1 : tensor<?x?xf32>
}
return %0 : tensor<?x?xf32>
}
// CHECK-MASK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 16)>
// CHECK-MASK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 32)>
// CHECK-MASK-LABEL: func.func @tiled_linalg_copy
// CHECK-MASK-SAME: %[[SRC:[a-zA-Z0-9]+]]: tensor<?x?xf32>, %[[DST:[a-zA-Z0-9]+]]
// CHECK-MASK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-MASK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK-MASK-DAG: %[[C16:.+]] = arith.constant 16 : index
// CHECK-MASK-DAG: %[[C32:.+]] = arith.constant 32 : index
// CHECK-MASK: scf.for %[[IV0:.+]] = %[[C0]]
// CHECK-MASK: scf.for %[[IV1:.+]] = %[[C0]] {{.*}} iter_args(%[[ITER_ARG:.+]] = {{.*}})
// CHECK-MASK-DAG: %[[DST_SZ0:.+]] = affine.min #[[$MAP0]]
// CHECK-MASK-DAG: %[[DST_SZ1:.+]] = affine.min #[[$MAP1]]
// CHECK-MASK: %[[DST_SLICE:.+]] = tensor.extract_slice %[[DST]][%[[IV0]], %[[IV1]]] [%[[DST_SZ0]], %[[DST_SZ1]]] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
// CHECK-MASK: %[[MASK:.+]] = vector.create_mask %[[DST_SZ0]], %[[DST_SZ1]] : vector<16x32xi1>
// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SRC]][%[[IV0]], %[[IV1]]],{{.*}} %[[MASK]]{{.*}} : tensor<?x?xf32>, vector<16x32xf32>
// CHECK-MASK: vector.transfer_write %[[READ]], %[[DST_SLICE]]{{.+}}, %[[MASK]]
@@ -33,13 +33,11 @@ module {
// CHECK: scf.for
// CHECK: scf.for
// CHECK: scf.for
// CHECK: %[[OUTPUT_SLICE:.+]] = memref.subview %[[OUTPUT_SUBVIEW]]
// CHECK: %[[RESULT_VEC:.+]] = scf.if %{{.+}} -> (vector<4xf32>) {
// CHECK: %[[VEC_LOAD:.+]] = vector.load %[[INPUT_SUBVIEW]]
// CHECK: scf.yield %[[VEC_LOAD]]
// CHECK: }
// CHECK: %[[DROP_UNIT_OUTPUT_SLICE:.+]] = memref.subview %[[OUTPUT_SLICE]]
// CHECK: vector.store %[[RESULT_VEC]], %[[DROP_UNIT_OUTPUT_SLICE]]
// CHECK: vector.store %[[RESULT_VEC]], %[[OUTPUT_SUBVIEW]]

// -----
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
@@ -20,7 +20,6 @@
// CHECK: scf.for {{.*}} iter_args(%[[OUT_TENSOR:.*]] = {{.*}}) -> (tensor<1024x1024xf32>) {
// CHECK-NEXT: scf.for {{.*}} iter_args(%[[OUT_TENSOR_1:.*]] = %[[OUT_TENSOR]]) -> (tensor<1024x1024xf32>) {
// CHECK-NEXT: %[[OUT_SLICE:.*]] = tensor.extract_slice %[[OUT_TENSOR_1]]{{.*}} : tensor<1024x1024xf32> to tensor<8x?xf32>
// CHECK-NEXT: %[[OUT_SLICE_1:.*]] = tensor.extract_slice %[[OUT_SLICE]]{{.*}} : tensor<8x?xf32> to tensor<8x?xf32>
// CHECK-NEXT: %[[OUT_VEC:.*]] = vector.transfer_read %[[OUT_TENSOR_1]]{{.*}} : tensor<1024x1024xf32>, vector<8x[16]xf32>
// CHECK-NEXT: %[[INNER_LOOP:.*]] = scf.for {{.*}} iter_args(%[[RES:.*]] = %[[OUT_VEC]]) -> (vector<8x[16]xf32>) {
// CHECK-NEXT: %[[LHS:.*]] = vector.transfer_read {{.*}} : tensor<1024x1024xf32>, vector<8x1xf32>
@@ -30,9 +29,8 @@
// CHECK-SAME: %[[LHS]], %[[RHS]], %[[RES]] : vector<8x1xf32>, vector<1x[16]xf32> into vector<8x[16]xf32>
// CHECK-NEXT: scf.yield %[[CONTRACT]] : vector<8x[16]xf32>
// CHECK-NEXT: }
// CHECK-NEXT: %[[OUT_WRITE:.*]] = vector.transfer_write %[[INNER_LOOP]], %[[OUT_SLICE_1]]{{.*}} {{.*}} : vector<8x[16]xf32>, tensor<8x?xf32>
// CHECK-NEXT: %[[INSERT_SLICE:.*]] = tensor.insert_slice %[[OUT_WRITE]] into %[[OUT_SLICE]]{{.*}} : tensor<8x?xf32> into tensor<8x?xf32>
// CHECK-NEXT: tensor.insert_slice %[[INSERT_SLICE]] into %[[OUT_TENSOR_1]]{{.*}} : tensor<8x?xf32> into tensor<1024x1024xf32>
// CHECK-NEXT: %[[OUT_WRITE:.*]] = vector.transfer_write %[[INNER_LOOP]], %[[OUT_SLICE]]{{.*}} {{.*}} : vector<8x[16]xf32>, tensor<8x?xf32>
// CHECK-NEXT: %[[INSERT_SLICE:.*]] = tensor.insert_slice %[[OUT_WRITE]] into %[[OUT_TENSOR_1]]{{.*}} : tensor<8x?xf32> into tensor<1024x1024xf32>

func.func @pipeline() {
%c1 = arith.constant 1 : index
