
add count_include_pad argument
Hao Jin committed May 22, 2018
1 parent 022f238 commit 3f37699
Showing 6 changed files with 126 additions and 44 deletions.
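Background on the new argument: with average pooling and nonzero padding, the divisor for each window can either count the padded positions (count_include_pad=True, the previously hard-coded behavior) or only the positions that fall inside the input (count_include_pad=False). A minimal host-side sketch of the two conventions on a 1-D input; this is illustrative standalone code, not part of the commit:

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// 1-D average pooling over a zero-padded input, with both divisor
// conventions. kernel_w/stride_w/pad_w mirror the operator parameters.
std::vector<float> avg_pool_1d(const std::vector<float>& in, int kernel_w,
                               int stride_w, int pad_w, bool count_include_pad) {
  const int width = static_cast<int>(in.size());
  const int pooled_width = (width + 2 * pad_w - kernel_w) / stride_w + 1;
  std::vector<float> out(pooled_width);
  for (int pw = 0; pw < pooled_width; ++pw) {
    int wstart = pw * stride_w - pad_w;
    int wend = std::min(wstart + kernel_w, width + pad_w);
    int pool_size = wend - wstart;  // counts padded positions (old behavior)
    wstart = std::max(wstart, 0);
    wend = std::min(wend, width);
    if (!count_include_pad) pool_size = wend - wstart;  // real elements only
    float sum = 0.f;
    for (int w = wstart; w < wend; ++w) sum += in[w];
    out[pw] = sum / pool_size;
  }
  return out;
}

int main() {
  const std::vector<float> x = {1.f, 2.f, 3.f, 4.f};
  // kernel=2, stride=2, pad=1: the first window covers one pad cell and x[0].
  const auto incl = avg_pool_1d(x, 2, 2, 1, true);   // first output: (0+1)/2 = 0.5
  const auto excl = avg_pool_1d(x, 2, 2, 1, false);  // first output: 1/1 = 1.0
  std::printf("include_pad: %.2f  exclude_pad: %.2f\n", incl[0], excl[0]);
}
```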
1 change: 1 addition & 0 deletions cpp-package/scripts/OpWrapperGenerator.py
@@ -77,6 +77,7 @@ def GetConvertEnumVariableToString(self, variable=''):

class Arg:
typeDict = {'boolean':'bool',\
'boolean or None':'dmlc::optional<bool>',\
'Shape(tuple)':'Shape',\
'Symbol':'Symbol',\
'NDArray':'Symbol',\
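The new 'boolean or None' entry maps the parameter to dmlc::optional&lt;bool&gt; in the generated C++ wrappers. A small sketch of why a tri-state type is used, with std::optional&lt;bool&gt; standing in for dmlc::optional&lt;bool&gt; and a hypothetical resolver function; the unset fallback shown here matches the include-padding default in the MKLDNN change below:

```cpp
#include <iostream>
#include <optional>

// std::optional<bool> stands in for dmlc::optional<bool>: the parameter has
// three states (unset, true, false), so a backend can distinguish "user said
// nothing" from an explicit choice and keep its historical default.
bool resolve_count_include_pad(std::optional<bool> param) {
  if (param.has_value()) return param.value();  // explicit user choice
  return true;  // unset: keep the old include-padding behavior (assumption)
}

int main() {
  std::cout << resolve_count_include_pad(std::nullopt) << ' '  // 1 (default)
            << resolve_count_include_pad(true) << ' '          // 1
            << resolve_count_include_pad(false) << '\n';       // 0
}
```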
@@ -190,7 +190,7 @@ private[mxnet] object SymbolImplMacros {
case "long" | "long(non-negative)" => "Long"
case "double" | "doubleorNone" => "Double"
case "string" => "String"
case "boolean" => "Boolean"
case "boolean" => "BooleanorNone"
case "tupleof<float>" | "tupleof<double>" | "ptr" | "" => "Any"
case default => throw new IllegalArgumentException(
s"Invalid type for args: $default, $argType")
6 changes: 5 additions & 1 deletion src/operator/nn/mkldnn/mkldnn_pooling.cc
@@ -121,7 +121,11 @@ mkldnn::algorithm GetMKLDNNPoolAlgo(const PoolingParam &param) {
return mkldnn::algorithm::pooling_max;
break;
case pool_enum::kAvgPooling:
return mkldnn::algorithm::pooling_avg_include_padding;
if (param.count_include_pad.has_value() && !param.count_include_pad.value()) {
return mkldnn::algorithm::pooling_avg_exclude_padding;
} else {
return mkldnn::algorithm::pooling_avg_include_padding;
}
break;
default:
LOG(FATAL) << "MKLDNN Pooling: Unknown pooling method.";
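The branch above picks the exclude-padding variant only when count_include_pad is explicitly false; unset or true keeps the old include-padding algorithm. The same decision as a standalone sketch, with std::optional and a local enum standing in for dmlc::optional and mkldnn::algorithm:

```cpp
#include <cstdio>
#include <optional>

enum class PoolAlgo { avg_include_padding, avg_exclude_padding };

// Mirrors the branch in GetMKLDNNPoolAlgo: only an explicit
// count_include_pad=false selects the exclude-padding variant.
PoolAlgo avg_algo(std::optional<bool> count_include_pad) {
  if (count_include_pad.has_value() && !count_include_pad.value())
    return PoolAlgo::avg_exclude_padding;
  return PoolAlgo::avg_include_padding;
}

int main() {
  std::printf("%d %d %d\n",
              avg_algo(std::nullopt) == PoolAlgo::avg_exclude_padding,  // 0
              avg_algo(true) == PoolAlgo::avg_exclude_padding,          // 0
              avg_algo(false) == PoolAlgo::avg_exclude_padding);        // 1
}
```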
73 changes: 56 additions & 17 deletions src/operator/nn/pool.cuh
@@ -214,16 +214,19 @@ template <typename DType, int p = 1>
__global__ void pool_sum_1d_gpu_kernel(const int nthreads, const DType* in_data, const int channels,
const int width, const int pooled_width, const int kernel_w,
const int stride_w, const int pad_w, DType* out_data,
const bool getAvg = false) {
const bool getAvg = false, const bool count_include_pad = true) {
CUDA_KERNEL_LOOP(index, nthreads) {
const int pw = index % pooled_width;
const int c = (index / pooled_width) % channels;
const int n = index / pooled_width / channels;
int wstart = pw * stride_w - pad_w;
int wend = min(wstart + kernel_w, width + pad_w);
const int pool_size = (getAvg? (wend - wstart) : 1);
int pool_size = (getAvg? (wend - wstart) : 1);
wstart = max(wstart, 0);
wend = min(wend, width);
if (getAvg && !count_include_pad) {
pool_size = (wend - wstart);
}
DType sum = 0;
const DType* out_slice = in_data + (n * channels + c) * width;
for (int w = wstart; w < wend; ++w) {
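A subtlety worth noting in this hunk: in include-pad mode, pool_size is taken from the padded window before wstart and wend are clamped to the input extent, so excluding padding requires recomputing it after the clamp. The 2-D and 3-D kernels below repeat the same pattern. A tiny standalone check of the edge-window arithmetic, with illustrative values:

```cpp
#include <algorithm>
#include <cstdio>

// Left-edge window from pool_sum_1d_gpu_kernel: width=4, kernel_w=3, pad_w=1,
// first output (pw = 0), so the window starts one cell into the padding.
int main() {
  const int width = 4, kernel_w = 3, pad_w = 1;
  int wstart = -pad_w;                                     // pw*stride - pad = -1
  int wend = std::min(wstart + kernel_w, width + pad_w);   // 2
  const int include_pad_size = wend - wstart;              // 3: counts the pad cell
  wstart = std::max(wstart, 0);                            // clamp to input: 0
  wend = std::min(wend, width);                            // 2
  const int exclude_pad_size = wend - wstart;              // 2: real cells only
  std::printf("include=%d exclude=%d\n", include_pad_size, exclude_pad_size);
}
```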
@@ -244,7 +247,8 @@ __global__ void pool_sum_2d_gpu_kernel(const int nthreads, const DType* in_data,
const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w,
const int pad_h, const int pad_w, DType* out_data,
const bool getAvg = false) {
const bool getAvg = false,
const bool count_include_pad = true) {
CUDA_KERNEL_LOOP(index, nthreads) {
const int pw = index % pooled_width;
const int ph = (index / pooled_width) % pooled_height;
@@ -254,11 +258,14 @@ __global__ void pool_sum_2d_gpu_kernel(const int nthreads, const DType* in_data,
int wstart = pw * stride_w - pad_w;
int hend = min(hstart + kernel_h, height + pad_h);
int wend = min(wstart + kernel_w, width + pad_w);
const int pool_size = (getAvg? (hend - hstart) * (wend - wstart) : 1);
int pool_size = (getAvg? (hend - hstart) * (wend - wstart) : 1);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
hend = min(hend, height);
wend = min(wend, width);
if (getAvg && !count_include_pad) {
pool_size = (hend - hstart) * (wend - wstart);
}
DType sum = 0;
const DType* out_slice = in_data + (n * channels + c) * height * width;
for (int h = hstart; h < hend; ++h) {
Expand All @@ -282,7 +289,8 @@ __global__ void pool_sum_3d_gpu_kernel(const int nthreads, const DType* in_data,
const int kernel_h, const int kernel_w,
const int stride_d, const int stride_h, const int stride_w,
const int pad_d, const int pad_h, const int pad_w,
DType* out_data, const bool getAvg = false) {
DType* out_data, const bool getAvg = false,
const bool count_include_pad = true) {
CUDA_KERNEL_LOOP(index, nthreads) {
const int pw = index % pooled_width;
const int ph = (index / pooled_width) % pooled_height;
@@ -295,13 +303,16 @@ __global__ void pool_sum_3d_gpu_kernel(const int nthreads, const DType* in_data,
int dend = min(dstart + kernel_d, depth + pad_d);
int hend = min(hstart + kernel_h, height + pad_h);
int wend = min(wstart + kernel_w, width + pad_w);
const int pool_size = (getAvg? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1);
int pool_size = (getAvg? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1);
dstart = max(dstart, 0);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
dend = min(dend, depth);
hend = min(hend, height);
wend = min(wend, width);
if (getAvg && !count_include_pad) {
pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
}
DType sum = 0;
const DType* out_slice = in_data + (n * channels + c) * depth * height * width;
for (int d = dstart; d < dend; ++d) {
@@ -487,7 +498,8 @@ __global__ void unpool_sum_1d_gpu_kernel(const int nthreads, const DType* out_gr
const int channels, const int width,
const int pooled_width, const int kernel_w,
const int stride_w, const int pad_w, DType* in_grad,
const bool isAvg = false) {
const bool isAvg = false,
const bool count_include_pad = true) {
// index is the input image index in NCW
CUDA_KERNEL_LOOP(index, nthreads) {
// find out the local index
@@ -507,6 +519,11 @@ __global__ void unpool_sum_1d_gpu_kernel(const int nthreads, const DType* out_gr
int wstart = pw * stride_w - pad_w;
int wend = min(wstart + kernel_w, width + pad_w);
int pool_size = (isAvg? (wend - wstart) : 1);
if (isAvg && !count_include_pad) {
wstart = max(wstart, 0);
wend = min(wend, width);
pool_size = (wend - wstart);
}
gradient +=
lp_grad<DType, p>::Map(out_grad_slice[pw], in_data[index], out_data_slice[pw]) / pool_size;
}
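In the backward (unpool) kernels, each input position accumulates out_grad / pool_size from every window covering it, so the divisor must match the one used in the forward pass; with count_include_pad=False that means re-clipping the window before dividing. A host-side sketch of the 1-D case, written as a scatter over output windows rather than the kernel's gather over input positions; hypothetical code with the same divisor logic:

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// 1-D average-pool backward. Each output gradient is spread evenly over the
// window that produced it; the divisor matches the forward pool_size.
std::vector<float> avg_unpool_1d(const std::vector<float>& out_grad, int width,
                                 int kernel_w, int stride_w, int pad_w,
                                 bool count_include_pad) {
  std::vector<float> in_grad(width, 0.f);
  const int pooled_width = static_cast<int>(out_grad.size());
  for (int pw = 0; pw < pooled_width; ++pw) {
    int wstart = pw * stride_w - pad_w;
    int wend = std::min(wstart + kernel_w, width + pad_w);
    int pool_size = wend - wstart;  // include-pad divisor
    wstart = std::max(wstart, 0);
    wend = std::min(wend, width);
    if (!count_include_pad) pool_size = wend - wstart;
    for (int w = wstart; w < wend; ++w)
      in_grad[w] += out_grad[pw] / pool_size;
  }
  return in_grad;
}

int main() {
  // Gradient of ones through avg pooling (kernel=2, stride=2, pad=1, width=4).
  const auto g = avg_unpool_1d({1.f, 1.f, 1.f}, 4, 2, 2, 1 == 0);
  // Expected: g[0] = 1.0 (divisor 1 at the left edge), g[1] = g[2] = 0.5, g[3] = 1.0
  for (float v : g) std::printf("%.2f ", v);
  std::printf("\n");
}
```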
@@ -528,7 +545,8 @@ __global__ void unpool_sum_2d_gpu_kernel(const int nthreads, const DType* out_gr
const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w,
const int pad_h, const int pad_w, DType* in_grad,
const bool isAvg = false) {
const bool isAvg = false,
const bool count_include_pad = true) {
// index is the input image index in NCHW
CUDA_KERNEL_LOOP(index, nthreads) {
// find out the local index
@@ -555,6 +573,13 @@ __global__ void unpool_sum_2d_gpu_kernel(const int nthreads, const DType* out_gr
int wend = min(wstart + kernel_w, width + pad_w);
int pool_size = (isAvg? (hend - hstart) * (wend - wstart) : 1);
int out_index = ph * pooled_width + pw;
if (isAvg && !count_include_pad) {
hstart = max(hstart, 0);
wstart = max(wstart, 0);
hend = min(hend, height);
wend = min(wend, width);
pool_size = (hend - hstart) * (wend - wstart);
}
gradient +=
lp_grad<DType, p>::Map(out_grad_slice[out_index],
in_data[index],
@@ -580,7 +605,8 @@ __global__ void unpool_sum_3d_gpu_kernel(const int nthreads, const DType* out_gr
const int kernel_d, const int kernel_h,
const int kernel_w, const int stride_d, const int stride_h,
const int stride_w, const int pad_d, const int pad_h,
const int pad_w, DType* in_grad, const bool isAvg = false) {
const int pad_w, DType* in_grad, const bool isAvg = false,
const bool count_include_pad = true) {
// index is the input image index in NCDHW
CUDA_KERNEL_LOOP(index, nthreads) {
// find out the local index
@@ -613,6 +639,15 @@ __global__ void unpool_sum_3d_gpu_kernel(const int nthreads, const DType* out_gr
int wend = min(wstart + kernel_w, width + pad_w);
int pool_size = (isAvg? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1);
int out_index = (pd * pooled_height + ph) * pooled_width + pw;
if (isAvg && !count_include_pad) {
dstart = max(dstart, 0);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
dend = min(dend, depth);
hend = min(hend, height);
wend = min(wend, width);
pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
}
gradient += lp_grad<DType, p>::Map(out_grad_slice[out_index],
in_data[index],
out_data_slice[out_index]) / pool_size;
@@ -643,7 +678,7 @@ template<typename DType, int p>
inline void pool(mshadow::Stream<gpu>* s, const DType* in_data, const TShape& ishape,
const TShape& oshape, const TShape& kernel, const TShape& pad,
const TShape& stride, const int pool_type, OpReqType req_type,
DType* out_data) {
DType* out_data, const bool count_include_pad) {
CHECK_EQ(req_type, kWriteTo) << "Only support req=kWriteTo in pooling operations";
using namespace mxnet_op;
if (kernel.ndim() == 1) {
@@ -659,7 +694,8 @@ inline void pool(mshadow::Stream<gpu>* s, const DType* in_data, const TShape& is
pool_sum_1d_gpu_kernel<<<cuda_get_num_blocks(oshape.Size()), mshadow::cuda::kBaseThreadNum,
0, mshadow::Stream<gpu>::GetStream(s)>>>(
oshape.Size(), in_data, ishape[1], ishape[2], oshape[2],
kernel[0], stride[0], pad[0], out_data, true);
kernel[0], stride[0], pad[0], out_data,
true, count_include_pad);
MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_1d_gpu_kernel);
} else if (pool_enum::kSumPooling == pool_type) {
// NOLINT_NEXT_LINE(whitespace/operators)
@@ -693,7 +729,8 @@ inline void pool(mshadow::Stream<gpu>* s, const DType* in_data, const TShape& is
0, mshadow::Stream<gpu>::GetStream(s)>>>(
oshape.Size(), in_data, ishape[1], ishape[2], ishape[3],
oshape[2], oshape[3], kernel[0], kernel[1],
stride[0], stride[1], pad[0], pad[1], out_data, true);
stride[0], stride[1], pad[0], pad[1], out_data,
true, count_include_pad);
MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_2d_gpu_kernel);
} else if (pool_enum::kSumPooling == pool_type) {
// NOLINT_NEXT_LINE(whitespace/operators)
@@ -731,7 +768,7 @@ inline void pool(mshadow::Stream<gpu>* s, const DType* in_data, const TShape& is
oshape.Size(), in_data, ishape[1], ishape[2], ishape[3],
ishape[4], oshape[2], oshape[3], oshape[4], kernel[0],
kernel[1], kernel[2], stride[0], stride[1], stride[2],
pad[0], pad[1], pad[2], out_data, true);
pad[0], pad[1], pad[2], out_data, true, count_include_pad);
MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_3d_gpu_kernel);
} else if (pool_enum::kSumPooling == pool_type) {
// NOLINT_NEXT_LINE(whitespace/operators)
@@ -777,7 +814,8 @@ template<typename DType, int p>
inline void unpool(mshadow::Stream<gpu>* s, const DType* out_grad, const DType* in_data,
const DType* out_data, const TShape& ishape, const TShape& oshape,
const TShape& kernel, const TShape& pad, const TShape& stride,
const int pool_type, OpReqType req_type, DType* in_grad) {
const int pool_type, OpReqType req_type, DType* in_grad,
const bool count_include_pad) {
if (mxnet::kNullOp == req_type) return;
if (mxnet::kAddTo != req_type) {
mxnet_op::Kernel<mxnet_op::set_zero, gpu>::Launch(s, ishape.Size(), in_grad);
@@ -798,7 +836,7 @@ inline void unpool(mshadow::Stream<gpu>* s, const DType* out_grad, const DType*
0, mshadow::Stream<gpu>::GetStream(s)>>>(
ishape.Size(), out_grad, in_data, out_data,
ishape[1], ishape[2], oshape[2], kernel[0],
stride[0], pad[0], in_grad, true);
stride[0], pad[0], in_grad, true, count_include_pad);
MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_1d_gpu_kernel);
} else if (pool_enum::kSumPooling == pool_type) {
// NOLINT_NEXT_LINE(whitespace/operators)
@@ -836,7 +874,8 @@ inline void unpool(mshadow::Stream<gpu>* s, const DType* out_grad, const DType*
ishape.Size(), out_grad, in_data, out_data,
ishape[1], ishape[2], ishape[3],
oshape[2], oshape[3], kernel[0], kernel[1],
stride[0], stride[1], pad[0], pad[1], in_grad, true);
stride[0], stride[1], pad[0], pad[1], in_grad,
true, count_include_pad);
MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_2d_gpu_kernel);
} else if (pool_enum::kSumPooling == pool_type) {
// NOLINT_NEXT_LINE(whitespace/operators)
@@ -878,7 +917,7 @@ inline void unpool(mshadow::Stream<gpu>* s, const DType* out_grad, const DType*
ishape[1], ishape[2], ishape[3], ishape[4],
oshape[2], oshape[3], oshape[4], kernel[0], kernel[1],
kernel[2], stride[0], stride[1], stride[2], pad[0], pad[1],
pad[2], in_grad, true);
pad[2], in_grad, true, count_include_pad);
MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_3d_gpu_kernel);
} else if (pool_enum::kSumPooling == pool_type) {
// NOLINT_NEXT_LINE(whitespace/operators)
