From 2cf9303942048fe249c1ae14d6e49a7373e3f693 Mon Sep 17 00:00:00 2001
From: Ahmed Taei <ataei@google.com>
Date: Fri, 24 Jul 2020 17:11:11 -0700
Subject: [PATCH] Detect and lower depthwise conv to linalg.generic

---
 integrations/tensorflow/e2e/BUILD             |  2 -
 .../tensorflow/e2e/depth_conv_test.py         | 41 ++++++++-
 .../HLOToLinalg/HLOToLinalgOnBuffers.cpp      | 94 ++++++++++++++++++-
 .../Conversion/HLOToLinalg/test/conv.mlir     | 43 ++++++++-
 4 files changed, 172 insertions(+), 8 deletions(-)

diff --git a/integrations/tensorflow/e2e/BUILD b/integrations/tensorflow/e2e/BUILD
index a3346e455cc4..3fd9219d55d1 100644
--- a/integrations/tensorflow/e2e/BUILD
+++ b/integrations/tensorflow/e2e/BUILD
@@ -67,7 +67,6 @@ VMLA_FAILING = [
 # keep sorted
 LLVM_FAILING = [
     "broadcasting_test.py",
-    "depth_conv_test.py",
     "dynamic_mlp_relu_test.py",
     "dynamic_mlp_test.py",
     "fill_test.py",  # TODO(jennik): Get this test working on IREE.
@@ -82,7 +81,6 @@ LLVM_FAILING = [
 VULKAN_FAILING = [
     "broadcasting_test.py",
     "control_flow_test.py",
-    "depth_conv_test.py",
     "dynamic_mlp_relu_test.py",
     "dynamic_mlp_test.py",
     "fill_test.py",  # TODO(jennik): Get this test working on IREE.
diff --git a/integrations/tensorflow/e2e/depth_conv_test.py b/integrations/tensorflow/e2e/depth_conv_test.py
index 9d88beeb30b8..c807d917d52a 100644
--- a/integrations/tensorflow/e2e/depth_conv_test.py
+++ b/integrations/tensorflow/e2e/depth_conv_test.py
@@ -34,11 +34,29 @@ def conv2d_2452x2223_valid(self, img, kernel):
       tf.TensorSpec([2, 4, 5, 2], tf.float32),
       tf.TensorSpec([2, 4, 2, 3], tf.float32),
   ])
-  def conv2d_2452x2223_same(self, img, kernel):
+  def conv2d_2452x2423_same(self, img, kernel):
     return tf.nn.depthwise_conv2d(
         img, kernel, [1, 1, 1, 1], "SAME", name="result")
 
+  @tf.function(input_signature=[
+      tf.TensorSpec([2, 4, 5, 2], tf.float32),
+      tf.TensorSpec([2, 4, 2, 3], tf.float32),
+  ])
+  def conv2d_2452x2423_valid_stride_2(self, img, kernel):
+    return tf.nn.depthwise_conv2d(
+        img, kernel, [1, 2, 2, 1], "VALID", name="result")
+
+  @tf.function(input_signature=[
+      tf.TensorSpec([2, 4, 5, 2], tf.float32),
+      tf.TensorSpec([2, 4, 2, 3], tf.float32),
+  ])
+  def conv2d_2452x2423_same_stride_2(self, img, kernel):
+    return tf.nn.depthwise_conv2d(
+        img, kernel, [1, 2, 2, 1], "SAME", name="result")
+
 
 @tf_test_utils.compile_module(Conv2dModule)
 class ConvTest(tf_test_utils.TracedModuleTestCase):
@@ -56,10 +74,29 @@ def test_batched_feature_unpadded_same(self):
     def batched_feature_unpadded_same(module):
       i = tf_utils.ndarange([2, 4, 5, 2])
       k = tf_utils.ndarange([2, 4, 2, 3])
-      module.conv2d_2452x2223_same(i, k)
+      module.conv2d_2452x2423_same(i, k)
 
     self.compare_backends(batched_feature_unpadded_same)
 
+  def test_batched_feature_unpadded_stride_2(self):
+
+    def batched_feature_unpadded_stride_2(module):
+      i = tf_utils.ndarange([2, 4, 5, 2])
+      k = tf_utils.ndarange([2, 4, 2, 3])
+      module.conv2d_2452x2423_valid_stride_2(i, k)
+
+    self.compare_backends(batched_feature_unpadded_stride_2)
+
+  def test_batched_feature_padded_same_stride_2(self):
+
+    def batched_feature_padded_same_stride_2(module):
+      i = tf_utils.ndarange([2, 4, 5, 2])
+      k = tf_utils.ndarange([2, 4, 2, 3])
+      module.conv2d_2452x2423_same_stride_2(i, k)
+
+    self.compare_backends(batched_feature_padded_same_stride_2)
+
 
 if __name__ == "__main__":
   if hasattr(tf, "enable_v2_behavior"):
diff --git a/iree/compiler/Conversion/HLOToLinalg/HLOToLinalgOnBuffers.cpp b/iree/compiler/Conversion/HLOToLinalg/HLOToLinalgOnBuffers.cpp
index 3356d03d186f..3c0968ed4f6e 100644
--- a/iree/compiler/Conversion/HLOToLinalg/HLOToLinalgOnBuffers.cpp
+++ b/iree/compiler/Conversion/HLOToLinalg/HLOToLinalgOnBuffers.cpp
@@ -384,9 +384,97 @@ LogicalResult ConvOpConversion::apply(
     rewriter.notifyMatchFailure(op, "failed to zero fill result buffer");
     return failure();
   }
-  rewriter.create<linalg::ConvOp>(op.getLoc(), inputBuffers[1],
-                                  inputBuffers[0], resultBuffers[0],
-                                  stridesArg, dilationArg, padding);
+  // Depthwise convolution path: a grouped convolution where the group count
+  // equals the size of the kernel's input feature (channel) dimension.
+  auto filterShape = op.rhs().getType().cast<ShapedType>().getShape();
+  const int64_t kernelInputFeatureDim =
+      op.dimension_numbers().kernel_input_feature_dimension().getInt();
+  if (op.feature_group_count().getZExtValue() > 1u &&
+      op.feature_group_count().getZExtValue() ==
+          static_cast<uint64_t>(filterShape[kernelInputFeatureDim])) {
+    // Lower depthwise convolution to a linalg.generic op. Using the grouped
+    // convolution formulation, for an n-dimensional input x and filter w the
+    // depthwise convolution computes:
+    //   y[n, d1, ..., dn, ci * channelMultiplier + co] =
+    //     sum(k1, ..., kn,
+    //         x[n, d1 * stride1 + k1, d2 * stride2 + k2, ...,
+    //           dn * striden + kn, ci] * w[k1, ..., kn, ci, co])
+    // where channelMultiplier is the size of the kernel's output feature
+    // dimension.
+
+    // TODO(ataei): Support dilation.
+    if (llvm::any_of(dilation, [](Attribute attr) {
+          return (attr.cast<IntegerAttr>().getInt() != 1);
+        })) {
+      return failure();
+    }
+
+    SmallVector<AffineExpr, 4> inputExprs;
+    SmallVector<AffineExpr, 4> filterExprs;
+    SmallVector<AffineExpr, 4> outputExprs;
+
+    const auto spatialDims =
+        llvm::size(op.dimension_numbers().input_spatial_dimensions());
+    const int d1Index = 1;
+    const int coIndex = d1Index + spatialDims;
+    const int ciIndex = coIndex + 1;
+    const int k1Index = ciIndex + 1;
+    // n, d1 * stride1 + k1, d2 * stride2 + k2, ..., dn * striden + kn, ci
+    inputExprs.push_back(rewriter.getAffineDimExpr(0));
+    for (int i = 0; i < spatialDims; ++i) {
+      if (op.window_stridesAttr()) {
+        auto stride = op.window_stridesAttr().getValue<APInt>(i);
+        inputExprs.push_back(rewriter.getAffineDimExpr(d1Index + i) *
+                                 stride.getZExtValue() +
+                             rewriter.getAffineDimExpr(k1Index + i));
+      } else {
+        inputExprs.push_back(rewriter.getAffineDimExpr(d1Index + i) +
+                             rewriter.getAffineDimExpr(k1Index + i));
+      }
+    }
+    inputExprs.push_back(rewriter.getAffineDimExpr(ciIndex));
+    // k1, k2, ..., kn, ci, co
+    for (int i = 0; i < spatialDims; ++i) {
+      filterExprs.push_back(rewriter.getAffineDimExpr(k1Index + i));
+    }
+    filterExprs.push_back(rewriter.getAffineDimExpr(ciIndex));
+    filterExprs.push_back(rewriter.getAffineDimExpr(coIndex));
+
+    // n, d1, d2, ..., dn, ci * channelMultiplier + co
+    const int64_t channelMultiplier = filterShape[
+        op.dimension_numbers().kernel_output_feature_dimension().getInt()];
+    outputExprs.push_back(rewriter.getAffineDimExpr(0));
+    for (int i = 0; i < spatialDims; ++i) {
+      outputExprs.push_back(rewriter.getAffineDimExpr(d1Index + i));
+    }
+    outputExprs.push_back(
+        rewriter.getAffineDimExpr(ciIndex) * channelMultiplier +
+        rewriter.getAffineDimExpr(coIndex));
+
+    // nloops = |d| + |k| + |{n, ci, co}|
+    int nloops = spatialDims * 2 + 3;
+    SmallVector<AffineMap, 3> indexingMaps;
+    indexingMaps.emplace_back(AffineMap::get(
+        nloops, /*symbolCount=*/0, inputExprs, rewriter.getContext()));
+    indexingMaps.emplace_back(AffineMap::get(
+        nloops, /*symbolCount=*/0, filterExprs, rewriter.getContext()));
+    indexingMaps.emplace_back(AffineMap::get(
+        nloops, /*symbolCount=*/0, outputExprs, rewriter.getContext()));
+
+    Location loc = op.getLoc();
+    SmallVector<Value, 3> linalgOpArgs = {inputBuffers[0], inputBuffers[1],
+                                          resultBuffers[0]};
+
+    SmallVector<StringRef, 8> loopAttributeTypes(spatialDims + 3, "parallel");
+    loopAttributeTypes.append(spatialDims, "reduction");
+    rewriter.create<linalg::GenericOp>(
+        loc, ArrayRef<Type>{}, linalgOpArgs,
+        2,  // args_in
+        1,  // args_out
+        indexingMaps, loopAttributeTypes,
+        [&](OpBuilder &nestedBuilder, Location nestedLoc, ValueRange args) {
+          Value mul =
+              nestedBuilder.create<MulFOp>(nestedLoc, args[0], args[1]);
+          Value add = nestedBuilder.create<AddFOp>(nestedLoc, mul, args[2]);
+          nestedBuilder.create<linalg::YieldOp>(nestedLoc, add);
+        });
+  } else {
+    rewriter.create<linalg::ConvOp>(op.getLoc(), inputBuffers[1],
+                                    inputBuffers[0], resultBuffers[0],
+                                    stridesArg, dilationArg, padding);
+  }
   return success();
 }
 
diff --git a/iree/compiler/Conversion/HLOToLinalg/test/conv.mlir b/iree/compiler/Conversion/HLOToLinalg/test/conv.mlir
index 9845ed91b1e2..2a065e823794 100644
--- a/iree/compiler/Conversion/HLOToLinalg/test/conv.mlir
+++ b/iree/compiler/Conversion/HLOToLinalg/test/conv.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt -iree-codegen-hlo-to-linalg-on-buffers %s | IreeFileCheck %s
+// RUN: iree-opt -split-input-file -iree-codegen-hlo-to-linalg-on-buffers %s | IreeFileCheck %s
 
 module {
   // CHECK: func @conv
@@ -37,3 +37,44 @@ module {
     hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write"
   }
 }
+
+// -----
+
+module {
+  func @depthwise_conv() {
+    %c0 = constant 0 : index
+    %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<2x4x5x2xf32>
+    %1 = hal.interface.load.tensor @legacy_io::@arg1, offset = %c0 : tensor<2x2x2x3xf32>
+    %2 = "mhlo.convolution"(%0, %1) {
+      batch_group_count = 1 : i64,
+      dimension_numbers = {
+        input_batch_dimension = 0 : i64,
+        input_feature_dimension = 3 : i64,
+        input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>,
+        kernel_input_feature_dimension = 2 : i64,
+        kernel_output_feature_dimension = 3 : i64,
+        kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>,
+        output_batch_dimension = 0 : i64,
+        output_feature_dimension = 3 : i64,
+        output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>
+      },
+      feature_group_count = 2 : i64,
+      padding = dense<0> : tensor<2x2xi64>,
+      rhs_dilation = dense<1> : tensor<2xi64>,
+      window_strides = dense<1> : tensor<2xi64>} : (tensor<2x4x5x2xf32>, tensor<2x2x2x3xf32>) -> tensor<2x3x4x6xf32>
+    hal.interface.store.tensor %2, @legacy_io::@ret0, offset = %c0 : tensor<2x3x4x6xf32>
+    return
+  }
+  hal.interface @legacy_io attributes {sym_visibility = "private"} {
+    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
+    hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
+    hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write"
+  }
+}
+// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d5, d2 + d6, d4)>
+// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d5, d6, d4, d3)>
+// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4 * 3 + d3)>
+// CHECK: linalg.generic {args_in = 2 : i64, args_out = 1 : i64, indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]
+// CHECK: mulf
+// CHECK: addf
+// CHECK: memref<2x4x5x2xf32>, memref<2x2x2x3xf32>, memref<2x3x4x6xf32>
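
For reference, the computation the new `linalg.generic` indexing maps encode can be checked against a small standalone NumPy model. The sketch below is not part of the patch and the helper name is illustrative; it assumes the TF depthwise filter layout `[kh, kw, ci, channel_multiplier]` and VALID padding, matching the test cases above:

```python
import numpy as np

def depthwise_conv2d_reference(x, f, strides=(1, 1)):
  """Reference for tf.nn.depthwise_conv2d with VALID padding.

  Implements y[n, d1, d2, ci * cm + co] =
      sum_{k1, k2} x[n, d1 * s1 + k1, d2 * s2 + k2, ci] * f[k1, k2, ci, co],
  the same formula the lowering encodes in its indexing maps.
  """
  n, h, w, ci = x.shape
  kh, kw, _, cm = f.shape
  s1, s2 = strides
  ho, wo = (h - kh) // s1 + 1, (w - kw) // s2 + 1
  y = np.zeros((n, ho, wo, ci * cm), dtype=x.dtype)
  for k1 in range(kh):
    for k2 in range(kw):
      for co in range(cm):
        # One filter tap per (spatial offset, multiplier): multiply the
        # strided input window by f[k1, k2, :, co] and accumulate into the
        # interleaved output channels ci * cm + co.
        window = x[:, k1:k1 + ho * s1:s1, k2:k2 + wo * s2:s2, :]
        y[..., co::cm] += window * f[k1, k2, :, co]
  return y

# Shapes from conv2d_2452x2223_valid: a [2, 4, 5, 2] image and [2, 2, 2, 3]
# filter produce a [2, 3, 4, 6] output, matching tensor<2x3x4x6xf32> above.
x = np.random.rand(2, 4, 5, 2).astype(np.float32)
f = np.random.rand(2, 2, 2, 3).astype(np.float32)
assert depthwise_conv2d_reference(x, f).shape == (2, 3, 4, 6)
```

The `ci * cm + co` output indexing is exactly the `d4 * 3 + d3` affine expression the FileCheck test expects: `d4` is the input channel, `d3` the multiplier index, and the channel multiplier is 3.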