Skip to content

Commit

Permalink
Reapply "[Codegen] Add vector transfer + slice foldings in GenericVec…
Browse files Browse the repository at this point in the history
…torization (iree-org#17613)" (iree-org#17997)

This reverts commit 10877f6.
  • Loading branch information
Max191 committed Aug 27, 2024
1 parent 271a14b commit 89915b7
Show file tree
Hide file tree
Showing 10 changed files with 114 additions and 79 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -388,8 +388,17 @@ void GenericVectorizationPass::runOnOperation() {
}

{
// Canonicalize mask related ops before we lower them. Also run patterns
// for vector transfers on tensor subset ops, since they can be folded if
// not handled here.
RewritePatternSet maskCanonPatterns(funcOp.getContext());
if (earlySubsetTransferFolding) {
// It is important to add these vector transfer on tensor subset patterns
// in the first greedy pattern rewrite, since transfer foldings can remove
// vectorized reads and writes by folding them into tensor ops.
tensor::populateFoldTensorSubsetIntoVectorTransferPatterns(
maskCanonPatterns);
}
vector::CreateMaskOp::getCanonicalizationPatterns(maskCanonPatterns,
funcOp.getContext());
vector::ConstantMaskOp::getCanonicalizationPatterns(maskCanonPatterns,
Expand Down
41 changes: 41 additions & 0 deletions compiler/src/iree/compiler/Codegen/Common/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,48 @@ createConvertToDestinationPassingStylePass(
std::unique_ptr<InterfacePass<FunctionOpInterface>>
createDecomposePackUnPackOpsPass(bool tileOuterToOne);

<<<<<<< HEAD
std::unique_ptr<Pass> createDecomposeSoftmaxPass(bool useFusion);
=======
/// Fuses tensor.pad ops into their consumer ops' tiled loop nests.
std::unique_ptr<InterfacePass<FunctionOpInterface>>
createFuseTensorPadWithConsumerPass();

struct GenericVectorizationPassOptions {
bool enableVectorMasking = false;
// Controls whether the op lowering configuration (if present) should be used
// to specify the masked vector sizes.
bool useConfiguredVectorSizes = true;
bool vectorizePadding = false;
bool vectorizeGatherAccesses = false;
// The flag controls whether it touches the structure generated from tiling,
// which affects later steps like bufferization and vector hoisting.
bool enableCleanup = true;
// Enable conversion for reduction ops to contraction ops.
bool generateContract = true;
// Enable folding casting ops into contraction ops. Note that the resulting
// mixed-type contraction ops are only handled by certain backends.
bool foldCastIntoContract = false;
// Max vector size allowed to avoid creating large vectors.
int64_t maxVectorSize = std::numeric_limits<int64_t>::max();
// Enable early folding of tensor subset ops into vector transfer ops.
bool earlySubsetTransferFolding = true;
};
/// Creates a pass to perform vectorization on LinAlg and tensor ops.
std::unique_ptr<InterfacePass<FunctionOpInterface>>
createGenericVectorizationPass();
std::unique_ptr<InterfacePass<FunctionOpInterface>>
createGenericVectorizationPass(const GenericVectorizationPassOptions &options);

std::unique_ptr<InterfacePass<FunctionOpInterface>>
createOptimizeTensorInsertExtractSlicesPass();

std::unique_ptr<InterfacePass<FunctionOpInterface>>
createHoistStaticallyBoundAllocationsPass();

std::unique_ptr<InterfacePass<FunctionOpInterface>>
createHoistUnrolledVectorExtractInsertSlicePass();
>>>>>>> 2778c202e7 (Reapply "[Codegen] Add vector transfer + slice foldings in GenericVectorization (#17613)" (#17997))

/// Pass to perform linalg on tensor bufferization. The function passed into
/// the pass through the `allocationFn` argument is invoked whenever a new
Expand Down
4 changes: 3 additions & 1 deletion compiler/src/iree/compiler/Codegen/Common/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,9 @@ def GenericVectorizationPass :
"Enable folding casting ops into vector.contract.">,
Option<"maxVectorSize", "max-vector-size", "int64_t",
/*default=*/"2147483647",
"Max vector size allowed to avoid creating large vectors.">
"Max vector size allowed to avoid creating large vectors.">,
Option<"earlySubsetTransferFolding", "early-subset-transfer-folding", "bool",/*default=*/"true",
"Enable early folding of tensor subset ops into vector transfer ops.">
];
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,12 +64,12 @@ func.func @single_static_pack_infer_vector_size(%arg0: tensor<101x201xi8>, %arg1
// CHECK-LABEL: func.func @single_static_pack_infer_vector_size
// CHECK: tensor.pack

// CHECK-MASK: #[[$MAP0:.+]] = affine_map<(d0) -> (-d0 + 13, 2)>
// CHECK-MASK: #[[$MAP1:.+]] = affine_map<(d0) -> (-d0 + 51, 4)>
// CHECK-MASK: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 * 2)>
// CHECK-MASK: #[[$MAP3:.+]] = affine_map<(d0, d1) -> (d1 * -2 + 101, d0 * 2)>
// CHECK-MASK: #[[$MAP4:.+]] = affine_map<(d0) -> (d0 * 16)>
// CHECK-MASK: #[[$MAP5:.+]] = affine_map<(d0, d1) -> (d1 * -16 + 201, d0 * 16)>
// CHECK-MASK-DAG: #[[$MAP0:.+]] = affine_map<(d0) -> (-d0 + 13, 2)>
// CHECK-MASK-DAG: #[[$MAP1:.+]] = affine_map<(d0) -> (-d0 + 51, 4)>
// CHECK-MASK-DAG: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 * 2)>
// CHECK-MASK-DAG: #[[$MAP3:.+]] = affine_map<(d0, d1) -> (d1 * -2 + 101, d0 * 2)>
// CHECK-MASK-DAG: #[[$MAP4:.+]] = affine_map<(d0) -> (d0 * 16)>
// CHECK-MASK-DAG: #[[$MAP5:.+]] = affine_map<(d0, d1) -> (d1 * -16 + 201, d0 * 16)>
// CHECK-MASK-LABEL: func.func @single_static_pack_infer_vector_size
// CHECK-MASK-SAME: %[[SRC:[a-zA-Z0-9]+]]
// CHECK-MASK: %[[C0:.+]] = arith.constant 0 : i8
Expand All @@ -79,9 +79,8 @@ func.func @single_static_pack_infer_vector_size(%arg0: tensor<101x201xi8>, %arg1
// CHECK-MASK: %[[WRITE_SZ1:.+]] = affine.min #[[$MAP1]]
// CHECK-MASK: %[[READ_SZ0:.+]] = affine.min #[[$MAP3]]
// CHECK-MASK: %[[READ_SZ1:.+]] = affine.min #[[$MAP5]]
// CHECK-MASK: %[[SLICE:.+]] = tensor.extract_slice %[[SRC]][{{.+}}] [%[[READ_SZ0]], %[[READ_SZ1]]]
// CHECK-MASK: %[[READ_MASK:.+]] = vector.create_mask %[[READ_SZ0]], %[[READ_SZ1]] : vector<8x32xi1>
// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SLICE]][%{{.+}}], %[[C0]], %[[READ_MASK]]
// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SRC]][%{{.+}}], %[[C0]], %[[READ_MASK]]
// CHECK-MASK: %[[CAST:.+]] = vector.shape_cast %[[READ]] : vector<8x32xi8> to vector<4x2x2x16xi8>
// CHECK-MASK: %[[TRANSP:.+]] = vector.transpose %[[CAST]], [2, 0, 3, 1]
// CHECK-MASK: %[[EMPTY:.+]] = tensor.empty(%[[WRITE_SZ0]], %[[WRITE_SZ1]]) : tensor<?x?x16x2xi8>
Expand Down Expand Up @@ -130,12 +129,12 @@ func.func @single_dynamic_pack_infer_vector_size(%arg0: tensor<?x?xi8>, %arg1: t
// CHECK-LABEL: func.func @single_dynamic_pack_infer_vector_size
// CHECK: tensor.pack

// CHECK-MASK: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)>
// CHECK-MASK: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 4)>
// CHECK-MASK: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 * 2)>
// CHECK-MASK: #[[$MAP3:.+]] = affine_map<(d0, d1)[s0] -> (d1 * -2 + s0, d0 * 2)>
// CHECK-MASK: #[[$MAP4:.+]] = affine_map<(d0) -> (d0 * 16)>
// CHECK-MASK: #[[$MAP5:.+]] = affine_map<(d0, d1)[s0] -> (d1 * -16 + s0, d0 * 16)>
// CHECK-MASK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)>
// CHECK-MASK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 4)>
// CHECK-MASK-DAG: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 * 2)>
// CHECK-MASK-DAG: #[[$MAP3:.+]] = affine_map<(d0, d1)[s0] -> (d1 * -2 + s0, d0 * 2)>
// CHECK-MASK-DAG: #[[$MAP4:.+]] = affine_map<(d0) -> (d0 * 16)>
// CHECK-MASK-DAG: #[[$MAP5:.+]] = affine_map<(d0, d1)[s0] -> (d1 * -16 + s0, d0 * 16)>
// CHECK-MASK-LABEL: func.func @single_dynamic_pack_infer_vector_size
// CHECK-MASK-SAME: %[[SRC:[a-zA-Z0-9]+]]
// CHECK-MASK: %[[C0:.+]] = arith.constant 0 : i8
Expand All @@ -145,9 +144,8 @@ func.func @single_dynamic_pack_infer_vector_size(%arg0: tensor<?x?xi8>, %arg1: t
// CHECK-MASK: %[[WRITE_SZ1:.+]] = affine.min #[[$MAP1]]
// CHECK-MASK: %[[READ_SZ0:.+]] = affine.min #[[$MAP3]]
// CHECK-MASK: %[[READ_SZ1:.+]] = affine.min #[[$MAP5]]
// CHECK-MASK: %[[SLICE:.+]] = tensor.extract_slice %[[SRC]][{{.+}}] [%[[READ_SZ0]], %[[READ_SZ1]]]
// CHECK-MASK: %[[READ_MASK:.+]] = vector.create_mask %[[READ_SZ0]], %[[READ_SZ1]] : vector<8x32xi1>
// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SLICE]][%{{.+}}], %[[C0]], %[[READ_MASK]]
// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SRC]][%{{.+}}], %[[C0]], %[[READ_MASK]]
// CHECK-MASK: %[[CAST:.+]] = vector.shape_cast %[[READ]] : vector<8x32xi8> to vector<4x2x2x16xi8>
// CHECK-MASK: %[[TRANSP:.+]] = vector.transpose %[[CAST]], [2, 0, 3, 1]
// CHECK-MASK: %[[EMPTY:.+]] = tensor.empty(%[[WRITE_SZ0]], %[[WRITE_SZ1]]) : tensor<?x?x16x2xi8>
Expand Down Expand Up @@ -204,13 +202,13 @@ func.func @generic_pack_infer_vector_size(%arg0: tensor<?x32x128xf32>) -> tensor
}
return %3 : tensor<32x?x64x16x2xbf16>
}
// CHECK-MASK: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)>
// CHECK-MASK: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (4, -d0 + s0 ceildiv 16)>
// CHECK-MASK: #[[$MAP2:.+]] = affine_map<(d0) -> (-d0 + 64, 6)>
// CHECK-MASK: #[[$MAP3:.+]] = affine_map<(d0, d1) -> (d1 * -2 + 128, d0 * 2)>
// CHECK-MASK: #[[$MAP4:.+]] = affine_map<(d0, d1)[s0] -> (d1 * -16 + s0, d0 * 16)>
// CHECK-MASK: #[[$MAP5:.+]] = affine_map<(d0) -> (d0 * 16)>
// CHECK-MASK: #[[$MAP6:.+]] = affine_map<(d0) -> (d0 * 2)>
// CHECK-MASK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)>
// CHECK-MASK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (4, -d0 + s0 ceildiv 16)>
// CHECK-MASK-DAG: #[[$MAP2:.+]] = affine_map<(d0) -> (-d0 + 64, 6)>
// CHECK-MASK-DAG: #[[$MAP3:.+]] = affine_map<(d0, d1) -> (d1 * -2 + 128, d0 * 2)>
// CHECK-MASK-DAG: #[[$MAP4:.+]] = affine_map<(d0, d1)[s0] -> (d1 * -16 + s0, d0 * 16)>
// CHECK-MASK-DAG: #[[$MAP5:.+]] = affine_map<(d0) -> (d0 * 16)>
// CHECK-MASK-DAG: #[[$MAP6:.+]] = affine_map<(d0) -> (d0 * 2)>
// CHECK-MASK-LABEL: func.func @generic_pack_infer_vector_size
// CHECK-MASK-SAME: %[[SRC:[a-zA-Z0-9]+]]
// CHECK-MASK-DAG: %[[C0_BF16:.+]] = arith.constant 0.000000e+00 : bf16
Expand All @@ -229,9 +227,8 @@ func.func @generic_pack_infer_vector_size(%arg0: tensor<?x32x128xf32>) -> tensor
// CHECK-MASK-DAG: %[[SRC_SZ0:.+]] = affine.min #[[$MAP4]]
// CHECK-MASK-DAG: %[[SRC_SZ2:.+]] = affine.min #[[$MAP3]]
// CHECK-MASK-DAG: %[[ITER_SLICE:.+]] = tensor.extract_slice %[[GENERIC_EMPTY]]
// CHECK-MASK-DAG: %[[SRC_SLICE:.+]] = tensor.extract_slice %[[SRC]][{{.+}}] [%[[SRC_SZ0]], 2, %[[SRC_SZ2]]]
// CHECK-MASK-DAG: %[[READ_MASK:.+]] = vector.create_mask %[[SRC_SZ0]], %[[C2]], %[[SRC_SZ2]] : vector<64x2x12xi1>
// CHECK-MASK: %[[GENERIC_READ:.+]] = vector.transfer_read %[[SRC_SLICE]]{{.+}} %[[READ_MASK]]
// CHECK-MASK: %[[GENERIC_READ:.+]] = vector.transfer_read %[[SRC]]{{.+}} %[[READ_MASK]]
// CHECK-MASK-DAG: %[[WRITE_MASK:.+]] = vector.create_mask %[[C2]], %[[SRC_SZ2]], %[[SRC_SZ0]] : vector<2x12x64xi1>
// CHECK-MASK: %[[TRUNC:.+]] = arith.truncf %[[GENERIC_READ]]
// CHECK-MASK: %[[TRANSP:.+]] = vector.transpose %[[TRUNC]], [1, 2, 0]
Expand Down Expand Up @@ -278,10 +275,10 @@ func.func @single_dynamic_unpack_infer_vector_size(%arg0: tensor<?x?x16x16xf32>,
}
return %0 : tensor<?x?xf32>
}
// CHECK-MASK: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (16, -d0 + s0)>
// CHECK-MASK: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (32, -d0 + s0)>
// CHECK-MASK: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 floordiv 16)>
// CHECK-MASK: #[[$MAP3:.+]] = affine_map<(d0) -> (d0 ceildiv 16)>
// CHECK-MASK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (16, -d0 + s0)>
// CHECK-MASK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (32, -d0 + s0)>
// CHECK-MASK-DAG: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 floordiv 16)>
// CHECK-MASK-DAG: #[[$MAP3:.+]] = affine_map<(d0) -> (d0 ceildiv 16)>
// CHECK-MASK-LABEL: func.func @single_dynamic_unpack_infer_vector_size
// CHECK-MASK-SAME: %[[SRC:[a-zA-Z0-9]+]]
// CHECK-MASK-DAG: %[[C0:.+]] = arith.constant 0 : index
Expand All @@ -292,9 +289,8 @@ func.func @single_dynamic_unpack_infer_vector_size(%arg0: tensor<?x?x16x16xf32>,
// CHECK-MASK-DAG: %[[DEST_SZ0:.+]] = affine.min #[[$MAP0]]
// CHECK-MASK-DAG: %[[DEST_SZ1:.+]] = affine.min #[[$MAP1]]
// CHECK-MASK-DAG: %[[SRC_SZ1:.+]] = affine.apply #[[$MAP3]]
// CHECK-MASK: %[[SRC_SLICE:.+]] = tensor.extract_slice %[[SRC]]
// CHECK-MASK: %[[READ_MASK:.+]] = vector.create_mask %[[C1]], %[[SRC_SZ1]], %[[C16]], %[[C16]] : vector<1x2x16x16xi1>
// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SRC_SLICE]]{{.+}}, %[[READ_MASK]]
// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SRC]]{{.+}}, %[[READ_MASK]]
// CHECK-MASK: %[[TRANSP:.+]] = vector.transpose %[[READ]], [0, 2, 1, 3]
// CHECK-MASK: %[[SHAPE_CAST:.+]] = vector.shape_cast %[[TRANSP]] : vector<1x16x2x16xf32> to vector<16x32xf32>
// CHECK-MASK: %[[EMPTY:.+]] = tensor.empty(%[[DEST_SZ0]], %[[DEST_SZ1]]) : tensor<?x?xf32>
Expand Down Expand Up @@ -338,10 +334,10 @@ func.func @generic_unpack_infer_vector_size(%arg0: tensor<?x?x16x16xf32>, %arg1:
}
return %0 : tensor<?x?xf32>
}
// CHECK-MASK: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 16)>
// CHECK-MASK: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 32)>
// CHECK-MASK: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 floordiv 16)>
// CHECK-MASK: #[[$MAP3:.+]] = affine_map<(d0) -> (d0 ceildiv 16)>
// CHECK-MASK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 16)>
// CHECK-MASK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 32)>
// CHECK-MASK-DAG: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 floordiv 16)>
// CHECK-MASK-DAG: #[[$MAP3:.+]] = affine_map<(d0) -> (d0 ceildiv 16)>
// CHECK-MASK-LABEL: func.func @generic_unpack_infer_vector_size
// CHECK-MASK-SAME: %[[SRC:[a-zA-Z0-9]+]]
// CHECK-MASK-DAG: %[[C0:.+]] = arith.constant 0 : index
Expand All @@ -352,9 +348,8 @@ func.func @generic_unpack_infer_vector_size(%arg0: tensor<?x?x16x16xf32>, %arg1:
// CHECK-MASK-DAG: %[[DEST_SZ0:.+]] = affine.min #[[$MAP0]]
// CHECK-MASK-DAG: %[[DEST_SZ1:.+]] = affine.min #[[$MAP1]]
// CHECK-MASK-DAG: %[[SRC_SZ1:.+]] = affine.apply #[[$MAP3]]
// CHECK-MASK: %[[SRC_SLICE:.+]] = tensor.extract_slice %[[SRC]]
// CHECK-MASK: %[[READ_MASK:.+]] = vector.create_mask %[[C1]], %[[SRC_SZ1]], %[[C16]], %[[C16]] : vector<1x2x16x16xi1>
// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SRC_SLICE]]{{.+}}, %[[READ_MASK]]
// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SRC]]{{.+}}, %[[READ_MASK]]
// CHECK-MASK: %[[TRANSP:.+]] = vector.transpose %[[READ]], [0, 2, 1, 3]
// CHECK-MASK: %[[SHAPE_CAST:.+]] = vector.shape_cast %[[TRANSP]] : vector<1x16x2x16xf32> to vector<16x32xf32>
// CHECK-MASK: %[[EMPTY:.+]] = tensor.empty(%[[DEST_SZ0]], %[[DEST_SZ1]]) : tensor<?x?xf32>
Expand Down Expand Up @@ -404,39 +399,32 @@ func.func @dynamic_fill_with_scalable_tiling_infer_vector_size(%arg0: tensor<1x6
// CHECK-MASK: scf.for
// CHECK-MASK: scf.for
// CHECK-MASK: scf.for
// CHECK-MASK: vector.transfer_write %[[CST]], {{.*}} {in_bounds = [true, true, true, true]} : vector<1x1x4x[4]xf32>, tensor<1x1x4x?xf32>
// CHECK-MASK: vector.transfer_write %[[CST]], {{.*}} {in_bounds = [true, true, true, true]} : vector<1x1x4x[4]xf32>, tensor<1x67x120x176xf32>

// -----

#aarch64_sve = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve", target_triple = "aarch64-none-elf"}>

func.func @dynamic_fill_with_scalable_tiling_infer_remainder_vector_size(%arg0: tensor<1x67x120x100xf32>) -> tensor<1x67x120x100xf32>
attributes {hal.executable.target = #aarch64_sve}
{
%c0 = arith.constant 0 : index
#map = affine_map<(d0)[s0] -> (-d0 + s0, 16)>
#map1 = affine_map<(d0)[s0] -> (-d0 + s0, 32)>
func.func @tiled_linalg_copy(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>) -> tensor<?x?xf32> {
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c100 = arith.constant 100 : index
%c67 = arith.constant 67 : index
%c120 = arith.constant 120 : index
%cst = arith.constant 0.000000e+00 : f32
%vscale = vector.vscale
%c4_vscale = arith.muli %vscale, %c4 : index
%0 = scf.for %arg1 = %c0 to %c67 step %c1 iter_args(%arg2 = %arg0) -> (tensor<1x67x120x100xf32>) {
%1 = scf.for %arg3 = %c0 to %c120 step %c4 iter_args(%arg4 = %arg2) -> (tensor<1x67x120x100xf32>) {
%rem_start = affine.apply affine_map<()[s0] -> (-(100 mod s0) + 100)>()[%c4_vscale]
%3 = scf.for %arg5 = %rem_start to %c100 step %c4_vscale iter_args(%arg6 = %arg4) -> (tensor<1x67x120x100xf32>) {
%rem_elts = affine.apply affine_map<(d0) -> (-d0 + 100)>(%arg5)
%extracted_slice = tensor.extract_slice %arg6[0, %arg1, %arg3, %arg5] [1, 1, 4, %rem_elts] [1, 1, 1, 1] : tensor<1x67x120x100xf32> to tensor<1x1x4x?xf32>
%4 = linalg.fill ins(%cst : f32) outs(%extracted_slice : tensor<1x1x4x?xf32>) -> tensor<1x1x4x?xf32>
%inserted_slice = tensor.insert_slice %4 into %arg6[0, %arg1, %arg3, %arg5] [1, 1, 4, %rem_elts] [1, 1, 1, 1] : tensor<1x1x4x?xf32> into tensor<1x67x120x100xf32>
scf.yield %inserted_slice : tensor<1x67x120x100xf32>
}
scf.yield %3 : tensor<1x67x120x100xf32>
%c0 = arith.constant 0 : index
%dim = tensor.dim %arg1, %c0 : tensor<?x?xf32>
%dim_0 = tensor.dim %arg1, %c1 : tensor<?x?xf32>
%0 = scf.for %arg3 = %c0 to %dim step %c16 iter_args(%arg4 = %arg1) -> (tensor<?x?xf32>) {
%1 = scf.for %arg5 = %c0 to %dim_0 step %c32 iter_args(%arg6 = %arg4) -> (tensor<?x?xf32>) {
%2 = affine.min #map(%arg3)[%dim]
%3 = affine.min #map1(%arg5)[%dim_0]
%extracted_slice_0 = tensor.extract_slice %arg0[%arg3, %arg5] [%2, %3] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg1[%arg3, %arg5] [%2, %3] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
%copy = linalg.copy ins(%extracted_slice_0 : tensor<?x?xf32>) outs(%extracted_slice_1 : tensor<?x?xf32>) -> tensor<?x?xf32>
%inserted_slice = tensor.insert_slice %copy into %arg6[%arg3, %arg5] [%2, %3] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
scf.yield %inserted_slice : tensor<?x?xf32>
}
scf.yield %1 : tensor<1x67x120x100xf32>
scf.yield %1 : tensor<?x?xf32>
}
return %0 : tensor<1x67x120x100xf32>
return %0 : tensor<?x?xf32>
}

// CHECK-MASK-LABEL: func.func @dynamic_fill_with_scalable_tiling_infer_remainder_vector_size
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,11 @@ func.func @pad_only_dispatch() attributes {hal.executable.target = #executable_t
// CHECK: scf.for
// CHECK: scf.for
// CHECK: scf.for
// CHECK: %[[OUTPUT_SLICE:.+]] = memref.subview %[[OUTPUT_SUBVIEW]]
// CHECK: %[[RESULT_VEC:.+]] = scf.if %{{.+}} -> (vector<4xf32>) {
// CHECK: %[[VEC_LOAD:.+]] = vector.load %[[INPUT_SUBVIEW]]
// CHECK: scf.yield %[[VEC_LOAD]]
// CHECK: }
// CHECK: %[[DROP_UNIT_OUTPUT_SLICE:.+]] = memref.subview %[[OUTPUT_SLICE]]
// CHECK: vector.store %[[RESULT_VEC]], %[[DROP_UNIT_OUTPUT_SLICE]]
// CHECK: vector.store %[[RESULT_VEC]], %[[OUTPUT_SUBVIEW]]

// -----

Expand Down
Loading

0 comments on commit 89915b7

Please sign in to comment.